├── vllm ├── log ├── core │ ├── __init__.py │ ├── root.code-workspace │ └── policy.py ├── engine │ ├── __init__.py │ └── ray_utils.py ├── worker │ └── __init__.py ├── entrypoints │ ├── __init__.py │ ├── openai │ │ └── __init__.py │ └── api_server.py ├── model_executor │ ├── layers │ │ ├── __init__.py │ │ ├── layernorm.py │ │ ├── quantized_linear │ │ │ ├── __init__.py │ │ │ └── awq.py │ │ ├── activation.py │ │ └── simulator.py │ ├── parallel_utils │ │ ├── __init__.py │ │ ├── README.md │ │ ├── communication_op.py │ │ └── utils.py │ ├── __init__.py │ ├── utils.py │ ├── quantization_utils │ │ ├── __init__.py │ │ ├── awq.py │ │ └── base.py │ ├── models │ │ └── __init__.py │ ├── input_metadata.py │ └── model_loader.py ├── transformers_utils │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── qwen.py │ │ ├── baichuan.py │ │ ├── aquila.py │ │ ├── mistral.py │ │ ├── mpt.py │ │ └── falcon.py │ └── config.py ├── __init__.py ├── logger.py ├── block.py ├── utils.py └── outputs.py ├── MANIFEST.in ├── docs ├── requirements-docs.txt ├── source │ ├── assets │ │ ├── figures │ │ │ ├── perf_a100_n1_dark.png │ │ │ ├── perf_a100_n3_dark.png │ │ │ ├── perf_a10g_n1_dark.png │ │ │ ├── perf_a10g_n3_dark.png │ │ │ ├── perf_a100_n1_light.png │ │ │ ├── perf_a100_n3_light.png │ │ │ ├── perf_a10g_n1_light.png │ │ │ └── perf_a10g_n3_light.png │ │ └── logos │ │ │ ├── vllm-logo-text-dark.png │ │ │ ├── vllm-logo-only-light.png │ │ │ └── vllm-logo-text-light.png │ ├── serving │ │ ├── deploying_with_triton.rst │ │ ├── distributed_serving.rst │ │ └── run_on_sky.rst │ ├── getting_started │ │ ├── installation.rst │ │ └── quickstart.rst │ ├── index.rst │ ├── conf.py │ └── models │ │ ├── supported_models.rst │ │ └── adding_model.rst ├── README.md ├── Makefile └── make.bat ├── benchmarks ├── visualizations │ ├── model_size_plots.pdf │ ├── waste_vs_heuristic.pdf │ ├── plot_over_heuristic.pdf │ ├── Plots Over Model Sizes.pdf │ ├── discard_vs_preserve_swap_vs_heuristic_vs_waste.pdf │ └── Makefile ├── README.md ├── launch_tgi_server.sh └── benchmark_latency.py ├── csrc ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ └── attention_utils.cuh ├── cuda_utils.cpp ├── layernorm.cpp ├── cuda_utils_kernels.cu ├── quantization.cpp ├── pos_encoding.cpp ├── dispatch_utils.h ├── activation.cpp ├── attention.cpp ├── reduction_utils.cuh ├── cache.cpp ├── layernorm_kernels.cu ├── quantization │ └── awq │ │ └── dequantize.cuh ├── activation_kernels.cu └── pos_encoding_kernels.cu ├── pyproject.toml ├── mypy.ini ├── requirements.txt ├── .readthedocs.yaml ├── examples ├── test.py ├── openai_completion_client.py ├── offline_inference.py ├── openai_chatcompletion_client.py ├── gradio_webserver.py ├── llm_engine_example.py ├── api_client.py ├── react_vllm_impl.py ├── test_pause.py └── test_ref_outputs.py ├── exps └── README.md ├── tests ├── kernels │ ├── conftest.py │ ├── test_layernorm.py │ ├── test_activation.py │ ├── test_cache.py │ └── test_pos_encoding.py ├── models │ └── test_models.py ├── samplers │ └── test_beam_search.py ├── async_engine │ ├── api_server_async_engine.py │ ├── test_async_llm_engine.py │ ├── test_request_tracker.py │ └── test_api_server.py ├── engine │ └── test_detokenize.py └── distributed │ └── test_comm_ops.py ├── README.md └── .gitignore /vllm/log: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/core/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements.txt 3 | 4 | recursive-include csrc * 5 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | -------------------------------------------------------------------------------- /vllm/core/root.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../.." 
5 | } 6 | ] 7 | } -------------------------------------------------------------------------------- /benchmarks/visualizations/model_size_plots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/benchmarks/visualizations/model_size_plots.pdf -------------------------------------------------------------------------------- /benchmarks/visualizations/waste_vs_heuristic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/benchmarks/visualizations/waste_vs_heuristic.pdf -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n1_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a100_n1_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n3_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a100_n3_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n1_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a10g_n1_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n3_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a10g_n3_dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /benchmarks/visualizations/plot_over_heuristic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/benchmarks/visualizations/plot_over_heuristic.pdf -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n1_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a100_n1_light.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n3_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a100_n3_light.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n1_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a10g_n1_light.png 
-------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n3_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a10g_n3_light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /benchmarks/visualizations/Plots Over Model Sizes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/benchmarks/visualizations/Plots Over Model Sizes.pdf -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | -------------------------------------------------------------------------------- /benchmarks/visualizations/discard_vs_preserve_swap_vs_heuristic_vs_waste.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/benchmarks/visualizations/discard_vs_preserve_swap_vs_heuristic_vs_waste.pdf -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "ninja", 4 | "packaging", 5 | "setuptools", 6 | "torch >= 2.0.0", 7 | "wheel", 8 | ] 9 | build-backend = "setuptools.build_meta" 10 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/README.md: -------------------------------------------------------------------------------- 1 | The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference. -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.8 3 | 4 | ignore_missing_imports = True 5 | 6 | files = vllm 7 | # TODO(woosuk): Include the code from Megatron and HuggingFace. 
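# Note: `exclude` takes a single regular expression, so the `|` in the pattern below lets either directory prefix match and both trees are skipped during type checking.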
8 | exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/ 9 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.input_metadata import InputMetadata 2 | from vllm.model_executor.model_loader import get_model 3 | from vllm.model_executor.utils import set_random_seed 4 | 5 | __all__ = [ 6 | "InputMetadata", 7 | "get_model", 8 | "set_random_seed", 9 | ] 10 | -------------------------------------------------------------------------------- /csrc/cuda_utils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int get_device_attribute( 4 | int attribute, 5 | int device_id); 6 | 7 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 8 | m.def( 9 | "get_device_attribute", 10 | &get_device_attribute, 11 | "Gets the specified device attribute."); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /benchmarks/visualizations/Makefile: -------------------------------------------------------------------------------- 1 | make_pdf: 2 | jupyter nbconvert "Plot Over Model Sizes".ipynb --to=pdf --TemplateExporter.exclude_input=True --output "Plots Over Model Sizes".pdf 3 | 4 | notebook: 5 | cp $(file).ipynb '$(title).ipynb' 6 | jupyter nbconvert '$(title).ipynb' --to=pdf --TemplateExporter.exclude_input=True --output=$(file) 7 | rm '$(title).ipynb' 8 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | import random 3 | 4 | import numpy as np 5 | import torch 6 | 7 | 8 | def set_random_seed(seed: int) -> None: 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | torch.manual_seed(seed) 12 | if torch.cuda.is_available(): 13 | torch.cuda.manual_seed_all(seed) 14 | 15 | -------------------------------------------------------------------------------- /csrc/layernorm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void rms_norm( 4 | torch::Tensor& out, 5 | torch::Tensor& input, 6 | torch::Tensor& weight, 7 | float epsilon); 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def( 11 | "rms_norm", 12 | &rms_norm, 13 | "Apply Root Mean Square (RMS) Normalization to the input tensor."); 14 | } 15 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 
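If port 8000 is already in use, `http.server` also accepts an alternative port as a positional argument (the port below is only an example):

```bash
python -m http.server 3000 -d build/html/
```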
20 | -------------------------------------------------------------------------------- /csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- 1 | int get_device_attribute( 2 | int attribute, 3 | int device_id) 4 | { 5 | int device, value; 6 | if (device_id < 0) { 7 | cudaGetDevice(&device); 8 | } 9 | else { 10 | device = device_id; 11 | } 12 | cudaDeviceGetAttribute(&value, static_cast(attribute), device); 13 | return value; 14 | } 15 | -------------------------------------------------------------------------------- /csrc/quantization.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor awq_gemm( 4 | torch::Tensor _in_feats, 5 | torch::Tensor _kernel, 6 | torch::Tensor _scaling_factors, 7 | torch::Tensor _zeros, 8 | int split_k_iters); 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def( 12 | "awq_gemm", 13 | &awq_gemm, 14 | "Quantized GEMM for AWQ"); 15 | } 16 | -------------------------------------------------------------------------------- /csrc/pos_encoding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void rotary_embedding( 4 | torch::Tensor& positions, 5 | torch::Tensor& query, 6 | torch::Tensor& key, 7 | int head_size, 8 | torch::Tensor& cos_sin_cache, 9 | bool is_neox); 10 | 11 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 12 | m.def( 13 | "rotary_embedding", 14 | &rotary_embedding, 15 | "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); 16 | } 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ninja # For faster builds. 2 | psutil 3 | ray >= 2.5.1 4 | pandas # Required for Ray data. 5 | pyarrow # Required for Ray data. 6 | sentencepiece # Required for LLaMA tokenizer. 7 | numpy 8 | torch >= 2.0.0 9 | transformers >= 4.33.1 # Required for Code Llama. 10 | xformers >= 0.0.22 11 | fastapi 12 | uvicorn[standard] 13 | pydantic < 2 # Required for OpenAI server. 14 | gurobipy 15 | rich 16 | deepspeed == 0.12.3 17 | deepspeed-kernels -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:0.8 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 
7 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: 16 | - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /csrc/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #include 6 | 7 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 8 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 9 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 10 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 11 | 12 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ 13 | AT_DISPATCH_SWITCH( \ 14 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 15 | -------------------------------------------------------------------------------- /vllm/model_executor/quantization_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from vllm.model_executor.quantization_utils.awq import AWQConfig 4 | from vllm.model_executor.quantization_utils.base import QuantizationConfig 5 | 6 | _QUANTIZATION_REGISTRY = { 7 | "awq": AWQConfig, 8 | } 9 | 10 | 11 | def get_quant_class(quantization: str) -> Type[QuantizationConfig]: 12 | if quantization not in _QUANTIZATION_REGISTRY: 13 | raise ValueError(f"Invalid quantization method: {quantization}") 14 | return _QUANTIZATION_REGISTRY[quantization] 15 | 16 | 17 | __all__ = [ 18 | "QuantizationConfig", 19 | "get_quant_class", 20 | ] 21 | -------------------------------------------------------------------------------- /csrc/activation.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void silu_and_mul( 4 | torch::Tensor& out, 5 | torch::Tensor& input); 6 | 7 | void gelu_new( 8 | torch::Tensor& out, 9 | torch::Tensor& input); 10 | 11 | void gelu_fast( 12 | torch::Tensor& out, 13 | torch::Tensor& input); 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def( 17 | "silu_and_mul", 18 | &silu_and_mul, 19 | "Activation function used in SwiGLU."); 20 | m.def( 21 | "gelu_new", 22 | &gelu_new, 23 | "GELU implementation used in GPT-2."); 24 | m.def( 25 | "gelu_fast", 26 | &gelu_fast, 27 | "Approximate GELU implementation."); 28 | } 29 | -------------------------------------------------------------------------------- /examples/test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | 4 | async def async_foo(): 5 | print("async_foo started") 6 | await asyncio.sleep(1) 7 | print("async_foo done") 8 | 9 | 10 | async def main(): 11 | for _ in range(2): 12 | asyncio.ensure_future(async_foo()) # fire and forget async_foo() 13 | 14 | # btw, you can also create tasks inside non-async funcs 15 | 16 | print('Do some actions 1') 17 
| await asyncio.sleep(1) 18 | print('Do some actions 2') 19 | await asyncio.sleep(1) 20 | print('Do some actions 3') 21 | 22 | 23 | if __name__ == '__main__': 24 | loop = asyncio.get_event_loop() 25 | loop.run_until_complete(main()) -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Completion API 14 | stream = False 15 | completion = openai.Completion.create( 16 | model=model, 17 | prompt="A robot may not injure a human being", 18 | echo=False, 19 | n=2, 20 | stream=stream, 21 | logprobs=3) 22 | 23 | print("Completion results:") 24 | if stream: 25 | for c in completion: 26 | print(c) 27 | else: 28 | print(completion) 29 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
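# For example, `make html SPHINXOPTS="-W"` forwards -W to sphinx-build so that warnings are treated as errors.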
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /csrc/attention.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void single_query_cached_kv_attention( 5 | torch::Tensor& out, 6 | torch::Tensor& query, 7 | torch::Tensor& key_cache, 8 | torch::Tensor& value_cache, 9 | torch::Tensor& head_mapping, 10 | float scale, 11 | torch::Tensor& block_tables, 12 | torch::Tensor& context_lens, 13 | int block_size, 14 | int max_context_len, 15 | const c10::optional& alibi_slopes); 16 | 17 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 18 | m.def( 19 | "single_query_cached_kv_attention", 20 | &single_query_cached_kv_attention, 21 | "Compute the attention between an input query and the cached key/value tensors"); 22 | } 23 | -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.engine.ray_utils import initialize_cluster 7 | from vllm.entrypoints.llm import LLM 8 | from vllm.outputs import CompletionOutput, RequestOutput 9 | from vllm.sampling_params import SamplingParams 10 | 11 | __version__ = "0.2.0" 12 | 13 | __all__ = [ 14 | "LLM", 15 | "SamplingParams", 16 | "RequestOutput", 17 | "CompletionOutput", 18 | "LLMEngine", 19 | "EngineArgs", 20 | "AsyncLLMEngine", 21 | "AsyncEngineArgs", 22 | "initialize_cluster", 23 | ] 24 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.configs.mpt import MPTConfig 2 | from vllm.transformers_utils.configs.baichuan import BaiChuanConfig 3 | from vllm.transformers_utils.configs.aquila import AquilaConfig 4 | from vllm.transformers_utils.configs.qwen import QWenConfig 5 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and 6 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the 7 | # `FalconConfig` class from the official HuggingFace transformers library. 8 | from vllm.transformers_utils.configs.falcon import RWConfig 9 | from vllm.transformers_utils.configs.mistral import MistralConfig 10 | 11 | __all__ = [ 12 | "MPTConfig", 13 | "BaiChuanConfig", 14 | "AquilaConfig", 15 | "QWenConfig", 16 | "RWConfig", 17 | "MistralConfig", 18 | ] 19 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. 
The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /exps/README.md: -------------------------------------------------------------------------------- 1 | # This is the reproduce instructions for paper "INFERCEPT: Efficient Intercept Support for Large-Language Model Inferencing" 2 | 3 | ## Dataset 4 | Download our 6-augment mixture workload from google drive and place it under `exps` filder. 5 | 6 | ## Profiler 7 | The profiler is still under refactoring. The current benchmark script will set profiling variables to ones used in the paper. 8 | 9 | ## Run Benchmark 10 | ```bash 11 | # after installing InferCept 12 | bash bench.sh 13 | ``` 14 | 1. Results will be available at `exps/results`. 15 | 2. Each data point will run for 30min, please manage your GPU cluster wisely. 16 | 3. Please do not schedule two swap-involved run concurrently as we assume exclusive access to the PCIE bendwidth. -------------------------------------------------------------------------------- /vllm/model_executor/layers/layernorm.py: -------------------------------------------------------------------------------- 1 | """Custom normalization layers.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import layernorm_ops 6 | 7 | 8 | class RMSNorm(nn.Module): 9 | """Root mean square normalization. 10 | 11 | Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. 12 | Refer to https://arxiv.org/abs/1910.07467 13 | """ 14 | 15 | def __init__( 16 | self, 17 | hidden_size: int, 18 | eps: float = 1e-6, 19 | ) -> None: 20 | super().__init__() 21 | self.weight = nn.Parameter(torch.ones(hidden_size)) 22 | self.variance_epsilon = eps 23 | 24 | def forward(self, x: torch.Tensor) -> torch.Tensor: 25 | out = torch.empty_like(x) 26 | layernorm_ops.rms_norm( 27 | out, 28 | x, 29 | self.weight.data, 30 | self.variance_epsilon, 31 | ) 32 | return out 33 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/openai_chatcompletion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Chat completion API 14 | chat_completion = openai.ChatCompletion.create( 15 | model=model, 16 | messages=[{ 17 | "role": "system", 18 | "content": "You are a helpful assistant." 19 | }, { 20 | "role": "user", 21 | "content": "Who won the world series in 2020?" 22 | }, { 23 | "role": 24 | "assistant", 25 | "content": 26 | "The Los Angeles Dodgers won the World Series in 2020." 27 | }, { 28 | "role": "user", 29 | "content": "Where was it played?" 30 | }]) 31 | 32 | print("Chat completion results:") 33 | print(chat_completion) 34 | -------------------------------------------------------------------------------- /docs/source/getting_started/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============ 5 | 6 | vLLM is a Python library that also contains pre-compiled C++ and CUDA (11.8) binaries. 7 | 8 | Requirements 9 | ------------ 10 | 11 | * OS: Linux 12 | * Python: 3.8 -- 3.11 13 | * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, etc.) 14 | 15 | Install with pip 16 | ---------------- 17 | 18 | You can install vLLM using pip: 19 | 20 | .. code-block:: console 21 | 22 | $ # (Optional) Create a new conda environment. 23 | $ conda create -n myenv python=3.8 -y 24 | $ conda activate myenv 25 | 26 | $ # Install vLLM. 27 | $ pip install vllm 28 | 29 | 30 | .. _build_from_source: 31 | 32 | Build from source 33 | ----------------- 34 | 35 | You can also build and install vLLM from source: 36 | 37 | .. code-block:: console 38 | 39 | $ git clone https://github.com/vllm-project/vllm.git 40 | $ cd vllm 41 | $ pip install -e . # This may take 5-10 minutes. 42 | 43 | .. tip:: 44 | If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. 45 | 46 | .. code-block:: console 47 | 48 | $ # Pull the Docker image with CUDA 11.8. 49 | $ # Use `--ipc=host` to make sure the shared memory is large enough. 
50 | $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:22.12-py3 51 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import pytest 4 | import torch 5 | 6 | 7 | def create_kv_caches( 8 | num_blocks: int, 9 | block_size: int, 10 | num_layers: int, 11 | num_heads: int, 12 | head_size: int, 13 | dtype: torch.dtype, 14 | seed: int, 15 | ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: 16 | torch.random.manual_seed(seed) 17 | torch.cuda.manual_seed(seed) 18 | 19 | scale = head_size**-0.5 20 | x = 16 // torch.tensor([], dtype=dtype).element_size() 21 | key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) 22 | key_caches = [] 23 | for _ in range(num_layers): 24 | key_cache = torch.empty(size=key_cache_shape, 25 | dtype=dtype, 26 | device='cuda') 27 | key_cache.uniform_(-scale, scale) 28 | key_caches.append(key_cache) 29 | 30 | value_cache_shape = (num_blocks, num_heads, head_size, block_size) 31 | value_caches = [] 32 | for _ in range(num_layers): 33 | value_cache = torch.empty(size=value_cache_shape, 34 | dtype=dtype, 35 | device='cuda') 36 | value_cache.uniform_(-scale, scale) 37 | value_caches.append(value_cache) 38 | return key_caches, value_caches 39 | 40 | 41 | @pytest.fixture() 42 | def kv_cache_factory(): 43 | return create_kv_caches 44 | -------------------------------------------------------------------------------- /vllm/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.models.aquila import AquilaForCausalLM 2 | from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM, 3 | BaichuanForCausalLM) 4 | from vllm.model_executor.models.bloom import BloomForCausalLM 5 | from vllm.model_executor.models.falcon import FalconForCausalLM 6 | from vllm.model_executor.models.gpt2 import GPT2LMHeadModel 7 | from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM 8 | from vllm.model_executor.models.gpt_j import GPTJForCausalLM 9 | from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM 10 | from vllm.model_executor.models.internlm import InternLMForCausalLM 11 | from vllm.model_executor.models.llama import LlamaForCausalLM 12 | from vllm.model_executor.models.mpt import MPTForCausalLM 13 | from vllm.model_executor.models.opt import OPTForCausalLM 14 | from vllm.model_executor.models.qwen import QWenLMHeadModel 15 | from vllm.model_executor.models.mistral import MistralForCausalLM 16 | 17 | __all__ = [ 18 | "AquilaForCausalLM", 19 | "BaiChuanForCausalLM", 20 | "BaichuanForCausalLM", 21 | "BloomForCausalLM", 22 | "FalconForCausalLM", 23 | "GPT2LMHeadModel", 24 | "GPTBigCodeForCausalLM", 25 | "GPTJForCausalLM", 26 | "GPTNeoXForCausalLM", 27 | "InternLMForCausalLM", 28 | "LlamaForCausalLM", 29 | "MPTForCausalLM", 30 | "OPTForCausalLM", 31 | "QWenLMHeadModel", 32 | "MistralForCausalLM", 33 | ] 34 | -------------------------------------------------------------------------------- /tests/models/test_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/models/test_models.py --forked`. 
4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "facebook/opt-125m", 9 | "gpt2", 10 | "bigcode/tiny_starcoder_py", 11 | "EleutherAI/gpt-j-6b", 12 | "EleutherAI/pythia-70m", 13 | "bigscience/bloom-560m", 14 | "mosaicml/mpt-7b", 15 | "tiiuae/falcon-7b", 16 | "meta-llama/Llama-2-7b-hf", 17 | ] 18 | 19 | 20 | @pytest.mark.parametrize("model", MODELS) 21 | @pytest.mark.parametrize("dtype", ["half"]) 22 | @pytest.mark.parametrize("max_tokens", [128]) 23 | def test_models( 24 | hf_runner, 25 | vllm_runner, 26 | example_prompts, 27 | model: str, 28 | dtype: str, 29 | max_tokens: int, 30 | ) -> None: 31 | hf_model = hf_runner(model, dtype=dtype) 32 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 33 | del hf_model 34 | 35 | vllm_model = vllm_runner(model, dtype=dtype) 36 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, hf_output_str = hf_outputs[i] 41 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 42 | assert hf_output_str == vllm_output_str, ( 43 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 44 | assert hf_output_ids == vllm_output_ids, ( 45 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |

3 | INFERCEPT: Efficient Intercept Support for Augmented Large Language Model 4 | Inference 5 |

6 | 7 | 8 | 9 | 10 | 11 | This repo contains implementation of InferCept. Please refer to our paper for more details. 12 | --- 13 | ## Instructions 14 | To install InferCept to your environment: 15 | ```bash 16 | # After cloning the repo 17 | cd infercept/ 18 | pip install -e . 19 | ``` 20 | 21 | To enable the serving system to hook on augmentation calls, register your aug-stop token in `vllm/utils.py`. You can register multiple keys at once: 22 | 23 | ```python 24 | def get_api_stop_strings() -> List[str]: 25 | return ["", ""] 26 | ``` 27 | 28 | To reproduce paper results, check `exps` folder. 29 | ## Citation 30 | 31 | If you use InferCept for your research, please cite our paper: 32 | ```bibtex 33 | @inproceedings{ 34 | abhyankar2024infer, 35 | title={INFERCEPT: Efficient Intercept Support for Augmented Large Language Model 36 | Inference}, 37 | author={Reyna Abhyankar and Zijian He and Vikranth Srivatsa and Hao Zhang and Yiying Zhang}, 38 | booktitle={Forty-first International Conference on Machine Learning}, 39 | year={2024}, 40 | month=Jul, 41 | address={Vienna, Austria}, 42 | } 43 | ``` 44 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantized_linear/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.quantized_linear.awq import ( 2 | AWQColumnParallelLinear, AWQRowParallelLinear) 3 | from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear, 4 | RowParallelLinear) 5 | 6 | _QUANTIZED_LINEAR_REGISTRY = { 7 | "awq": (AWQColumnParallelLinear, AWQRowParallelLinear), 8 | } 9 | 10 | 11 | class ParallelLinear: 12 | 13 | @classmethod 14 | def column(cls, *args, **kwargs) -> ColumnParallelLinear: 15 | quant_config = kwargs.get("quant_config", None) 16 | if quant_config is None: 17 | return ColumnParallelLinear(*args, **kwargs) 18 | 19 | name = quant_config.get_name() 20 | if name not in _QUANTIZED_LINEAR_REGISTRY: 21 | raise ValueError(f"No quantized linear is found for {name}") 22 | 23 | quant_linear_cls = _QUANTIZED_LINEAR_REGISTRY[name][0] 24 | return quant_linear_cls(*args, **kwargs) 25 | 26 | @classmethod 27 | def row(cls, *args, **kwargs) -> RowParallelLinear: 28 | quant_config = kwargs.get("quant_config", None) 29 | if quant_config is None: 30 | return RowParallelLinear(*args, **kwargs) 31 | 32 | name = quant_config.get_name() 33 | if name not in _QUANTIZED_LINEAR_REGISTRY: 34 | raise ValueError(f"No quantized linear is found for {name}") 35 | 36 | quant_linear_cls = _QUANTIZED_LINEAR_REGISTRY[name][1] 37 | return quant_linear_cls(*args, **kwargs) 38 | -------------------------------------------------------------------------------- /vllm/core/policy.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from vllm.sequence import SequenceGroup 4 | 5 | 6 | class Policy: 7 | 8 | def get_priority( 9 | self, 10 | now: float, 11 | seq_group: SequenceGroup, 12 | ) -> float: 13 | raise NotImplementedError 14 | 15 | def sort_by_priority( 16 | self, 17 | now: float, 18 | seq_groups: List[SequenceGroup], 19 | ) -> List[SequenceGroup]: 20 | return sorted( 21 | seq_groups, 22 | key=lambda seq_group: self.get_priority(now, seq_group), 23 | reverse=True, 24 | ) 25 | 26 | 27 | class FCFS(Policy): 28 | 29 | def get_priority( 30 | self, 31 | now: float, 32 | seq_group: SequenceGroup, 33 | ) -> float: 34 | return now - seq_group.arrival_time 35 | 36 | class 
Chunked_FCFS(Policy): 37 | 38 | def get_priority( 39 | self, 40 | now: float, 41 | seq_group: SequenceGroup, 42 | ) -> Tuple[int, float]: 43 | return -seq_group.get_seqs()[0].data.logical_query_len, now - seq_group.arrival_time 44 | 45 | class LongestRemainingAPIFirst(Policy): 46 | 47 | def get_priority( 48 | self, 49 | now: float, 50 | seq_group: SequenceGroup, 51 | ) -> float: 52 | return seq_group.api_remaining_time(now) 53 | 54 | class PolicyFactory: 55 | 56 | _POLICY_REGISTRY = { 57 | 'fcfs': FCFS, 58 | 'c-fcfs': Chunked_FCFS, 59 | 'lra': LongestRemainingAPIFirst, 60 | } 61 | 62 | @classmethod 63 | def get_policy(cls, policy_name: str, **kwargs) -> Policy: 64 | return cls._POLICY_REGISTRY[policy_name](**kwargs) 65 | -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using beam search. 2 | 3 | Run `pytest tests/samplers/test_beam_search.py --forked`. 4 | """ 5 | import pytest 6 | 7 | # FIXME(zhuohan): The test can not pass if we: 8 | # 1. Increase max_tokens to 256. 9 | # 2. Increase beam_width to 8. 10 | # 3. Use the model "huggyllama/llama-7b". 11 | MAX_TOKENS = [128] 12 | BEAM_WIDTHS = [4] 13 | MODELS = ["facebook/opt-125m"] 14 | 15 | 16 | @pytest.mark.parametrize("model", MODELS) 17 | @pytest.mark.parametrize("dtype", ["half"]) 18 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS) 19 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) 20 | def test_beam_search_single_input( 21 | hf_runner, 22 | vllm_runner, 23 | example_prompts, 24 | model: str, 25 | dtype: str, 26 | max_tokens: int, 27 | beam_width: int, 28 | ) -> None: 29 | hf_model = hf_runner(model, dtype=dtype) 30 | hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, 31 | max_tokens) 32 | del hf_model 33 | 34 | vllm_model = vllm_runner(model, dtype=dtype) 35 | vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, 36 | max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, _ = hf_outputs[i] 41 | vllm_output_ids, _ = vllm_outputs[i] 42 | assert len(hf_output_ids) == len(vllm_output_ids) 43 | for j in range(len(hf_output_ids)): 44 | assert hf_output_ids[j] == vllm_output_ids[j], ( 45 | f"Test{i} output{j}:\nHF: {hf_output_ids}\n" 46 | f"vLLM: {vllm_output_ids}") 47 | -------------------------------------------------------------------------------- /docs/source/serving/distributed_serving.rst: -------------------------------------------------------------------------------- 1 | .. _distributed_serving: 2 | 3 | Distributed Inference and Serving 4 | ================================= 5 | 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with `Ray `_. To run distributed inference, install Ray with: 7 | 8 | .. code-block:: console 9 | 10 | $ pip install ray 11 | 12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: 13 | 14 | .. code-block:: python 15 | 16 | from vllm import LLM 17 | llm = LLM("facebook/opt-13b", tensor_parallel_size=4) 18 | output = llm.generate("San Franciso is a") 19 | 20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. 
For example, to run API server on 4 GPUs: 21 | 22 | .. code-block:: console 23 | 24 | $ python -m vllm.entrypoints.api_server \ 25 | $ --model facebook/opt-13b \ 26 | $ --tensor-parallel-size 4 27 | 28 | To scale vLLM beyond a single machine, start a `Ray runtime `_ via CLI before running vLLM: 29 | 30 | .. code-block:: console 31 | 32 | $ # On head node 33 | $ ray start --head 34 | 35 | $ # On worker nodes 36 | $ ray start --address= 37 | 38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. -------------------------------------------------------------------------------- /tests/async_engine/api_server_async_engine.py: -------------------------------------------------------------------------------- 1 | """vllm.entrypoints.api_server with some extra logging for testing.""" 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | import uvicorn 6 | from fastapi.responses import JSONResponse, Response 7 | 8 | import vllm.entrypoints.api_server 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | 12 | app = vllm.entrypoints.api_server.app 13 | 14 | 15 | class AsyncLLMEngineWithStats(AsyncLLMEngine): 16 | 17 | # pylint: disable=redefined-outer-name 18 | def __init__(self, *args, **kwargs): 19 | super().__init__(*args, **kwargs) 20 | self._num_aborts = 0 21 | 22 | async def abort(self, request_id: str) -> None: 23 | await super().abort(request_id) 24 | self._num_aborts += 1 25 | 26 | def testing_stats(self) -> Dict[str, Any]: 27 | return {"num_aborted_requests": self._num_aborts} 28 | 29 | 30 | @app.get("/stats") 31 | def stats() -> Response: 32 | """Get the statistics of the engine.""" 33 | return JSONResponse(engine.testing_stats()) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--host", type=str, default="localhost") 39 | parser.add_argument("--port", type=int, default=8000) 40 | parser = AsyncEngineArgs.add_cli_args(parser) 41 | args = parser.parse_args() 42 | 43 | engine_args = AsyncEngineArgs.from_cli_args(args) 44 | engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) 45 | vllm.entrypoints.api_server.engine = engine 46 | uvicorn.run( 47 | app, 48 | host=args.host, 49 | port=args.port, 50 | log_level="debug", 51 | timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) 52 | -------------------------------------------------------------------------------- /csrc/reduction_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | namespace vllm { 21 | 22 | template 23 | __inline__ __device__ T warpReduceSum(T val) { 24 | #pragma unroll 25 | for (int mask = 16; mask > 0; mask >>= 1) 26 | val += __shfl_xor_sync(0xffffffff, val, mask, 32); 27 | return val; 28 | } 29 | 30 | /* Calculate the sum of all elements in a block */ 31 | template 32 | __inline__ __device__ T blockReduceSum(T val) { 33 | static __shared__ T shared[32]; 34 | int lane = threadIdx.x & 0x1f; 35 | int wid = threadIdx.x >> 5; 36 | 37 | val = warpReduceSum(val); 38 | 39 | if (lane == 0) 40 | shared[wid] = val; 41 | 42 | __syncthreads(); 43 | 44 | // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent 45 | // blockDim.x is not divided by 32 46 | val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); 47 | val = warpReduceSum(val); 48 | return val; 49 | } 50 | 51 | } // namespace vllm 52 | -------------------------------------------------------------------------------- /vllm/logger.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py 3 | """Logging configuration for vLLM.""" 4 | import logging 5 | import sys 6 | 7 | _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" 8 | _DATE_FORMAT = "%m-%d %H:%M:%S" 9 | 10 | 11 | class NewLineFormatter(logging.Formatter): 12 | """Adds logging prefix to newlines to align multi-line messages.""" 13 | 14 | def __init__(self, fmt, datefmt=None): 15 | logging.Formatter.__init__(self, fmt, datefmt) 16 | 17 | def format(self, record): 18 | msg = logging.Formatter.format(self, record) 19 | if record.message != "": 20 | parts = msg.split(record.message) 21 | msg = msg.replace("\n", "\r\n" + parts[0]) 22 | return msg 23 | 24 | 25 | _root_logger = logging.getLogger("vllm") 26 | _default_handler = None 27 | 28 | 29 | def _setup_logger(): 30 | _root_logger.setLevel(logging.DEBUG) 31 | global _default_handler 32 | if _default_handler is None: 33 | _default_handler = logging.StreamHandler(sys.stdout) 34 | _default_handler.flush = sys.stdout.flush # type: ignore 35 | _default_handler.setLevel(logging.INFO) 36 | _root_logger.addHandler(_default_handler) 37 | fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) 38 | _default_handler.setFormatter(fmt) 39 | # Setting this will avoid the message 40 | # being propagated to the parent logger. 41 | _root_logger.propagate = False 42 | 43 | 44 | # The logger is initialized when the module is imported. 45 | # This is thread-safe as the module is only imported once, 46 | # guaranteed by the Python GIL. 47 | _setup_logger() 48 | 49 | 50 | def init_logger(name: str): 51 | return logging.getLogger(name) 52 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/activation.py: -------------------------------------------------------------------------------- 1 | """Custom activation functions.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import activation_ops 6 | 7 | 8 | class SiluAndMul(nn.Module): 9 | """An activation function for SwiGLU. 10 | 11 | The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2. 
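(That is, the input is split in half along its feature dimension; the first half is passed through SiLU and multiplied elementwise by the second half.)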
12 | 13 | Shapes: 14 | x: (num_tokens, 2 * d) 15 | return: (num_tokens, d) 16 | """ 17 | 18 | def forward(self, x: torch.Tensor) -> torch.Tensor: 19 | num_tokens = x.shape[0] 20 | d = x.shape[1] // 2 21 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 22 | activation_ops.silu_and_mul(out, x) 23 | return out 24 | 25 | 26 | class NewGELU(nn.Module): 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | num_tokens = x.shape[0] 30 | d = x.shape[1] 31 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 32 | activation_ops.gelu_new(out, x) 33 | return out 34 | 35 | 36 | class FastGELU(nn.Module): 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | num_tokens = x.shape[0] 40 | d = x.shape[1] 41 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 42 | activation_ops.gelu_fast(out, x) 43 | return out 44 | 45 | 46 | _ACTIVATION_REGISTRY = { 47 | "gelu": nn.GELU(), 48 | "gelu_fast": FastGELU(), 49 | "gelu_new": NewGELU(), 50 | "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), 51 | "relu": nn.ReLU(), 52 | } 53 | 54 | 55 | def get_act_fn(act_fn: str) -> nn.Module: 56 | """Get an activation function by name.""" 57 | act_fn = act_fn.lower() 58 | if act_fn in _ACTIVATION_REGISTRY: 59 | return _ACTIVATION_REGISTRY[act_fn] 60 | raise ValueError(f"Activation function {act_fn!r} is not supported.") 61 | -------------------------------------------------------------------------------- /examples/gradio_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import gradio as gr 5 | import requests 6 | 7 | 8 | def http_bot(prompt): 9 | headers = {"User-Agent": "vLLM Client"} 10 | pload = { 11 | "prompt": prompt, 12 | "stream": True, 13 | "max_tokens": 128, 14 | } 15 | response = requests.post(args.model_url, 16 | headers=headers, 17 | json=pload, 18 | stream=True) 19 | 20 | for chunk in response.iter_lines(chunk_size=8192, 21 | decode_unicode=False, 22 | delimiter=b"\0"): 23 | if chunk: 24 | data = json.loads(chunk.decode("utf-8")) 25 | output = data["text"][0] 26 | yield output 27 | 28 | 29 | def build_demo(): 30 | with gr.Blocks() as demo: 31 | gr.Markdown("# vLLM text completion demo\n") 32 | inputbox = gr.Textbox(label="Input", 33 | placeholder="Enter text and press ENTER") 34 | outputbox = gr.Textbox(label="Output", 35 | placeholder="Generated result from the model") 36 | inputbox.submit(http_bot, [inputbox], [outputbox]) 37 | return demo 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", type=str, default="localhost") 43 | parser.add_argument("--port", type=int, default=8001) 44 | parser.add_argument("--model-url", 45 | type=str, 46 | default="http://localhost:8000/generate") 47 | args = parser.parse_args() 48 | 49 | demo = build_demo() 50 | demo.queue(concurrency_count=100).launch(server_name=args.host, 51 | server_port=args.port, 52 | share=True) 53 | -------------------------------------------------------------------------------- /examples/llm_engine_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from vllm import EngineArgs, LLMEngine, SamplingParams 4 | 5 | 6 | def main(args: argparse.Namespace): 7 | # Parse the CLI argument and initialize the engine. 8 | engine_args = EngineArgs.from_cli_args(args) 9 | engine = LLMEngine.from_engine_args(engine_args) 10 | 11 | # Test the following prompts. 
12 | test_prompts = [ 13 | ("A robot may not injure a human being", 14 | SamplingParams(temperature=0.0)), 15 | ("To be or not to be,", 16 | SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), 17 | ("What is the meaning of life?", 18 | SamplingParams(n=2, 19 | best_of=5, 20 | temperature=0.8, 21 | top_p=0.95, 22 | frequency_penalty=0.1)), 23 | ("It is only with the heart that one can see rightly", 24 | SamplingParams(n=3, best_of=3, use_beam_search=True, 25 | temperature=0.0)), 26 | ] 27 | 28 | # Run the engine by calling `engine.step()` manually. 29 | request_id = 0 30 | while True: 31 | # To test continuous batching, we add one request at each step. 32 | if test_prompts: 33 | prompt, sampling_params = test_prompts.pop(0) 34 | engine.add_request(str(request_id), prompt, sampling_params) 35 | request_id += 1 36 | 37 | request_outputs = engine.step() 38 | for request_output in request_outputs: 39 | if request_output.finished: 40 | print(request_output) 41 | 42 | if not (engine.has_unfinished_requests() or test_prompts): 43 | break 44 | 45 | 46 | if __name__ == '__main__': 47 | parser = argparse.ArgumentParser( 48 | description='Demo on using the LLMEngine class directly') 49 | parser = EngineArgs.add_cli_args(parser) 50 | args = parser.parse_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/communication_op.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from vllm.model_executor.parallel_utils.parallel_state import ( 4 | get_tensor_model_parallel_world_size, 5 | get_tensor_model_parallel_group, 6 | ) 7 | 8 | 9 | def tensor_model_parallel_all_reduce(input_): 10 | """All-reduce the input tensor across model parallel group. 11 | 12 | Note: This operation is applied in-place on the input tensor. 13 | """ 14 | # Bypass the function if we are using only 1 GPU. 15 | if get_tensor_model_parallel_world_size() == 1: 16 | return input_ 17 | # All-reduce. 18 | torch.distributed.all_reduce(input_, 19 | group=get_tensor_model_parallel_group()) 20 | return input_ 21 | 22 | 23 | def tensor_model_parallel_all_gather(input_, dim=-1): 24 | """All-gather the input tensor across model parallel group.""" 25 | world_size = get_tensor_model_parallel_world_size() 26 | # Bypass the function if we are using only 1 GPU. 27 | if world_size == 1: 28 | return input_ 29 | assert -input_.dim() <= dim < input_.dim(), ( 30 | f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") 31 | if dim < 0: 32 | # Convert negative dim to positive. 33 | dim += input_.dim() 34 | input_size = input_.size() 35 | # Allocate output tensor. 36 | output_tensor = torch.empty((world_size, ) + input_size, 37 | dtype=input_.dtype, 38 | device=input_.device) 39 | # All-gather. 
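# (Each rank contributes a tensor of shape `input_size`; the gather below fills a
# (world_size, *input_size) tensor, and the reshape that follows folds that leading
# dimension into `dim`, so only the gathered dimension grows by world_size.)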
40 | torch.distributed.all_gather_into_tensor(
41 | output_tensor, input_, group=get_tensor_model_parallel_group())
42 | # Reshape
43 | output_tensor = output_tensor.movedim(0, dim)
44 | output_tensor = output_tensor.reshape(input_size[:dim] +
45 | (world_size * input_size[dim], ) +
46 | input_size[dim + 1:])
47 | return output_tensor
48 |
-------------------------------------------------------------------------------- /csrc/attention/attention_generic.cuh: --------------------------------------------------------------------------------
1 | /*
2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
3 | * Copyright (c) 2023, The vLLM team.
4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | #pragma once
19 |
20 | #include <stdint.h>
21 |
22 | namespace vllm {
23 |
24 | // A vector type to store Q, K, V elements.
25 | template<typename T, int VEC_SIZE>
26 | struct Vec {};
27 |
28 | // A vector type to store FP32 accumulators.
29 | template<typename T>
30 | struct FloatVec {};
31 |
32 | // Template vector operations.
33 | template<typename Acc, typename A, typename B>
34 | inline __device__ Acc mul(A a, B b);
35 |
36 | template<typename T>
37 | inline __device__ float sum(T v);
38 |
39 | template<typename T>
40 | inline __device__ float dot(T a, T b) {
41 | return sum(mul<T, T, T>(a, b));
42 | }
43 |
44 | template<typename A, typename T>
45 | inline __device__ float dot(T a, T b) {
46 | return sum(mul<A, T, T>(a, b));
47 | }
48 |
49 | template<typename T>
50 | inline __device__ void zero(T& dst) {
51 | constexpr int WORDS = sizeof(T) / 4;
52 | union {
53 | T raw;
54 | uint32_t words[WORDS];
55 | } tmp;
56 |
57 | #pragma unroll
58 | for (int ii = 0; ii < WORDS; ++ii) {
59 | tmp.words[ii] = 0u;
60 | }
61 | dst = tmp.raw;
62 | }
63 |
64 | } // namespace vllm
65 |
-------------------------------------------------------------------------------- /csrc/cache.cpp: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 |
3 | #include <map>
4 | #include <vector>
5 |
6 | void swap_blocks(
7 | torch::Tensor& src,
8 | torch::Tensor& dst,
9 | const std::map<int64_t, int64_t>& block_mapping);
10 |
11 | // void swap_blocks_new(
12 | // std::vector<torch::Tensor>& key_caches,
13 | // std::vector<torch::Tensor>& value_caches,
14 | // const std::map<int64_t, int64_t>& block_mapping);
15 |
16 | void copy_blocks(
17 | std::vector<torch::Tensor>& key_caches,
18 | std::vector<torch::Tensor>& value_caches,
19 | const std::map<int64_t, std::vector<int64_t>>& block_mapping);
20 |
21 | void reshape_and_cache(
22 | torch::Tensor& key,
23 | torch::Tensor& value,
24 | torch::Tensor& key_cache,
25 | torch::Tensor& value_cache,
26 | torch::Tensor& slot_mapping);
27 |
28 | void new_reshape_and_cache(
29 | torch::Tensor& key,
30 | torch::Tensor& value,
31 | torch::Tensor& key_cache,
32 | torch::Tensor& value_cache,
33 | torch::Tensor& slot_mapping);
34 |
35 | void gather_cached_kv(
36 | torch::Tensor& key,
37 | torch::Tensor& value,
38 | torch::Tensor& key_cache,
39 | torch::Tensor& value_cache,
40 | torch::Tensor& slot_mapping);
41 |
42 |
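// The bindings below expose the cache kernels declared above to Python as a
// C++/CUDA extension module; each m.def registers one op.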
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 43 | m.def( 44 | "swap_blocks", 45 | &swap_blocks, 46 | "Swap in (out) the cache blocks from src to dst"); 47 | // m.def( 48 | // "swap_blocks_new", 49 | // &swap_blocks_new, 50 | // "Swap in (out) the cache blocks from src to dst"); 51 | m.def( 52 | "copy_blocks", 53 | ©_blocks, 54 | "Copy the cache blocks from src to dst"); 55 | m.def( 56 | "reshape_and_cache", 57 | &reshape_and_cache, 58 | "Reshape the key and value tensors and cache them"); 59 | m.def( 60 | "new_reshape_and_cache", 61 | &new_reshape_and_cache, 62 | "Reshape the key and value tensors and cache them"); 63 | m.def( 64 | "gather_cached_kv", 65 | &gather_cached_kv, 66 | "Gather key and value from the cache into contiguous QKV tensors"); 67 | } 68 | -------------------------------------------------------------------------------- /tests/kernels/test_layernorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import layernorm_ops 6 | 7 | DTYPES = [torch.half, torch.bfloat16, torch.float] 8 | HIDDEN_SIZES = [67, 768, 2048, 5120, 8192] # Arbitrary values for testing 9 | NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing 10 | SEEDS = [0] 11 | 12 | 13 | class RefRMSNorm(nn.Module): 14 | 15 | def __init__(self, hidden_size, eps=1e-6): 16 | super().__init__() 17 | weight = torch.empty(hidden_size) 18 | weight.normal_(mean=1.0, std=0.1) 19 | self.weight = nn.Parameter(weight) 20 | self.variance_epsilon = eps 21 | 22 | def forward(self, hidden_states): 23 | input_dtype = hidden_states.dtype 24 | hidden_states = hidden_states.to(torch.float32) 25 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 26 | hidden_states = hidden_states * torch.rsqrt(variance + 27 | self.variance_epsilon) 28 | return self.weight * hidden_states.to(input_dtype) 29 | 30 | 31 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 32 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 33 | @pytest.mark.parametrize("dtype", DTYPES) 34 | @pytest.mark.parametrize("seed", SEEDS) 35 | @torch.inference_mode() 36 | def test_rms_norm( 37 | num_tokens: int, 38 | hidden_size: int, 39 | dtype: torch.dtype, 40 | seed: int, 41 | ) -> None: 42 | torch.random.manual_seed(seed) 43 | torch.cuda.manual_seed(seed) 44 | 45 | scale = float(hidden_size**-0.5) 46 | x = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda") 47 | x.uniform_(-scale, scale) 48 | ref = RefRMSNorm(hidden_size).to(dtype).cuda() 49 | 50 | out = torch.empty_like(x) 51 | layernorm_ops.rms_norm( 52 | out, 53 | x, 54 | ref.weight.data, 55 | ref.variance_epsilon, 56 | ) 57 | ref_out = ref(x) 58 | assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-5) 59 | -------------------------------------------------------------------------------- /csrc/attention/attention_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | #pragma once
19 |
20 | #include "attention_dtypes.h"
21 |
22 | #include <float.h>
23 | #include <type_traits>
24 |
25 | namespace vllm {
26 |
27 | // Q*K^T operation.
28 | template<int THREAD_GROUP_SIZE, typename Vec, int N>
29 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
30 | using A_vec = typename FloatVec<Vec>::Type;
31 | // Compute the parallel products for Q*K^T (treat vector lanes separately).
32 | A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
33 | #pragma unroll
34 | for (int ii = 1; ii < N; ++ii) {
35 | qk_vec = fma(q[ii], k[ii], qk_vec);
36 | }
37 |
38 | // Finalize the reduction across lanes.
39 | float qk = sum(qk_vec);
40 | #pragma unroll
41 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
42 | qk += __shfl_xor_sync(uint32_t(-1), qk, mask);
43 | }
44 | return qk;
45 | }
46 |
47 | template<typename T, int THREAD_GROUP_SIZE>
48 | struct Qk_dot {
49 | template<typename Vec, int N>
50 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
51 | return qk_dot_<THREAD_GROUP_SIZE>(q, k);
52 | }
53 | };
54 |
55 | } // namespace vllm
56 |
-------------------------------------------------------------------------------- /vllm/transformers_utils/config.py: --------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from transformers import AutoConfig, PretrainedConfig
4 |
5 | from vllm.transformers_utils.configs import * # pylint: disable=wildcard-import
6 |
7 | _CONFIG_REGISTRY = {
8 | "mpt": MPTConfig,
9 | "baichuan": BaiChuanConfig,
10 | "aquila": AquilaConfig,
11 | "qwen": QWenConfig,
12 | "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct)
13 | "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct)
14 | }
15 |
16 |
17 | def get_config(model: str,
18 | trust_remote_code: bool,
19 | revision: Optional[str] = None) -> PretrainedConfig:
20 | # NOTE: Because the Mistral model in HF hub does not have
21 | # `configuration_mistral.py`, we cannot use `AutoConfig` to load the
22 | # config. Instead, we use `MistralConfig` directly.
23 | # NOTE: This is a hack. This does not work for local models.
24 | # FIXME: Remove this once the Mistral model is available in the stable
25 | # version of HF transformers.
26 | if "mistral" in model.lower():
27 | return MistralConfig.from_pretrained(model, revision=revision)
28 |
29 | try:
30 | config = AutoConfig.from_pretrained(
31 | model, trust_remote_code=trust_remote_code, revision=revision)
32 | except ValueError as e:
33 | if (not trust_remote_code and
34 | "requires you to execute the configuration file" in str(e)):
35 | err_msg = (
36 | "Failed to load the model config. 
If the model is a custom "
37 | "model not yet available in the HuggingFace transformers "
38 | "library, consider setting `trust_remote_code=True` in LLM "
39 | "or using the `--trust-remote-code` flag in the CLI.")
40 | raise RuntimeError(err_msg) from e
41 | else:
42 | raise e
43 | if config.model_type in _CONFIG_REGISTRY:
44 | config_class = _CONFIG_REGISTRY[config.model_type]
45 | config = config_class.from_pretrained(model, revision=revision)
46 | return config
47 |
-------------------------------------------------------------------------------- /vllm/block.py: --------------------------------------------------------------------------------
1 | """Token blocks."""
2 | from typing import List
3 |
4 | from vllm.utils import Device
5 |
6 | _BLANK_TOKEN_ID = -1
7 |
8 |
9 | class LogicalTokenBlock:
10 | """A block that stores a contiguous chunk of tokens from left to right.
11 |
12 | Logical blocks are used to represent the states of the corresponding
13 | physical blocks in the KV cache.
14 | """
15 |
16 | def __init__(
17 | self,
18 | block_number: int,
19 | block_size: int,
20 | ) -> None:
21 | self.block_number = block_number
22 | self.block_size = block_size
23 |
24 | self.token_ids = [_BLANK_TOKEN_ID] * block_size
25 | self.num_tokens = 0
26 |
27 | def is_empty(self) -> bool:
28 | return self.num_tokens == 0
29 |
30 | def get_num_empty_slots(self) -> int:
31 | return self.block_size - self.num_tokens
32 |
33 | def is_full(self) -> bool:
34 | return self.num_tokens == self.block_size
35 |
36 | def append_tokens(self, token_ids: List[int]) -> None:
37 | assert len(token_ids) <= self.get_num_empty_slots()
38 | curr_idx = self.num_tokens
39 | self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids
40 | self.num_tokens += len(token_ids)
41 |
42 | def get_token_ids(self) -> List[int]:
43 | return self.token_ids[:self.num_tokens]
44 |
45 | def get_last_token_id(self) -> int:
46 | assert self.num_tokens > 0
47 | return self.token_ids[self.num_tokens - 1]
48 |
49 |
50 | class PhysicalTokenBlock:
51 | """Represents the state of a block in the KV cache."""
52 |
53 | def __init__(
54 | self,
55 | device: Device,
56 | block_number: int,
57 | block_size: int,
58 | ) -> None:
59 | self.device = device
60 | self.block_number = block_number
61 | self.block_size = block_size
62 |
63 | self.ref_count = 0
64 |
65 | def __repr__(self) -> str:
66 | return (f'PhysicalTokenBlock(device={self.device}, '
67 | f'block_number={self.block_number}, '
68 | f'ref_count={self.ref_count})')
69 |
-------------------------------------------------------------------------------- /csrc/layernorm_kernels.cu: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 |
4 | #include "dispatch_utils.h"
5 | #include "reduction_utils.cuh"
6 |
7 | namespace vllm {
8 |
9 | // TODO(woosuk): Further optimize this kernel.
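// RMSNorm: out = x * rsqrt(mean(x^2) + epsilon) * weight, computed per token.
// One thread block handles one token; blockReduceSum reduces the per-thread
// partial sums of x^2 over hidden_size across the block.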
10 | template<typename scalar_t>
11 | __global__ void rms_norm_kernel(
12 | scalar_t* __restrict__ out, // [num_tokens, hidden_size]
13 | const scalar_t* __restrict__ input, // [num_tokens, hidden_size]
14 | const scalar_t* __restrict__ weight, // [hidden_size]
15 | const float epsilon,
16 | const int num_tokens,
17 | const int hidden_size) {
18 | __shared__ float s_variance;
19 | float variance = 0.0f;
20 |
21 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
22 | const float x = (float) input[blockIdx.x * hidden_size + idx];
23 | variance += x * x;
24 | }
25 | variance = blockReduceSum<float>(variance);
26 | if (threadIdx.x == 0) {
27 | s_variance = rsqrtf(variance / hidden_size + epsilon);
28 | }
29 | __syncthreads();
30 |
31 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
32 | float x = (float) input[blockIdx.x * hidden_size + idx];
33 | out[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx];
34 | }
35 | }
36 |
37 | } // namespace vllm
38 |
39 | void rms_norm(
40 | torch::Tensor& out, // [num_tokens, hidden_size]
41 | torch::Tensor& input, // [num_tokens, hidden_size]
42 | torch::Tensor& weight, // [hidden_size]
43 | float epsilon) {
44 | int num_tokens = input.size(0);
45 | int hidden_size = input.size(1);
46 |
47 | dim3 grid(num_tokens);
48 | dim3 block(std::min(hidden_size, 1024));
49 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
50 | VLLM_DISPATCH_FLOATING_TYPES(
51 | input.scalar_type(),
52 | "rms_norm_kernel",
53 | [&] {
54 | vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
55 | out.data_ptr<scalar_t>(),
56 | input.data_ptr<scalar_t>(),
57 | weight.data_ptr<scalar_t>(),
58 | epsilon,
59 | num_tokens,
60 | hidden_size);
61 | });
62 | }
63 |
-------------------------------------------------------------------------------- /vllm/transformers_utils/configs/qwen.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba Cloud.
2 | # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE 3 | 4 | from transformers import PretrainedConfig 5 | 6 | 7 | class QWenConfig(PretrainedConfig): 8 | model_type = "qwen" 9 | keys_to_ignore_at_inference = ["past_key_values"] 10 | 11 | def __init__( 12 | self, 13 | vocab_size=151936, 14 | hidden_size=4096, 15 | num_hidden_layers=32, 16 | num_attention_heads=32, 17 | emb_dropout_prob=0.0, 18 | attn_dropout_prob=0.0, 19 | layer_norm_epsilon=1e-6, 20 | initializer_range=0.02, 21 | max_position_embeddings=8192, 22 | scale_attn_weights=True, 23 | use_cache=True, 24 | bf16=False, 25 | fp16=False, 26 | fp32=False, 27 | kv_channels=128, 28 | rotary_pct=1.0, 29 | rotary_emb_base=10000, 30 | use_dynamic_ntk=True, 31 | use_logn_attn=True, 32 | use_flash_attn="auto", 33 | intermediate_size=22016, 34 | no_bias=True, 35 | tie_word_embeddings=False, 36 | **kwargs, 37 | ): 38 | self.vocab_size = vocab_size 39 | self.hidden_size = hidden_size 40 | self.intermediate_size = intermediate_size 41 | self.num_hidden_layers = num_hidden_layers 42 | self.num_attention_heads = num_attention_heads 43 | self.emb_dropout_prob = emb_dropout_prob 44 | self.attn_dropout_prob = attn_dropout_prob 45 | self.layer_norm_epsilon = layer_norm_epsilon 46 | self.initializer_range = initializer_range 47 | self.scale_attn_weights = scale_attn_weights 48 | self.use_cache = use_cache 49 | self.max_position_embeddings = max_position_embeddings 50 | self.bf16 = bf16 51 | self.fp16 = fp16 52 | self.fp32 = fp32 53 | self.kv_channels = kv_channels 54 | self.rotary_pct = rotary_pct 55 | self.rotary_emb_base = rotary_emb_base 56 | self.use_dynamic_ntk = use_dynamic_ntk 57 | self.use_logn_attn = use_logn_attn 58 | self.use_flash_attn = use_flash_attn 59 | self.no_bias = no_bias 60 | super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) 61 | -------------------------------------------------------------------------------- /tests/engine/test_detokenize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import detokenize_incrementally 6 | 7 | TRUTH = [ 8 | # pylint: disable=line-too-long 9 | "Hello here, this is a simple test", 10 | "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 
It is designed to be used in production environments, where inference and serving", 11 | "我很感谢你的热情" 12 | ] 13 | TOKENIZERS = [ 14 | "facebook/opt-125m", 15 | "gpt2", 16 | "bigcode/tiny_starcoder_py", 17 | "EleutherAI/gpt-j-6b", 18 | "EleutherAI/pythia-70m", 19 | "bigscience/bloom-560m", 20 | "mosaicml/mpt-7b", 21 | "tiiuae/falcon-7b", 22 | "meta-llama/Llama-2-7b-hf", 23 | "codellama/CodeLlama-7b-hf", 24 | ] 25 | 26 | 27 | def _run_incremental_decode(tokenizer, all_input_ids, 28 | skip_special_tokens: bool): 29 | decoded_text = "" 30 | offset = 0 31 | token_offset = 0 32 | prev_tokens = None 33 | for i in range(len(all_input_ids)): 34 | new_tokens, text, offset, token_offset = detokenize_incrementally( 35 | tokenizer, 36 | all_input_ids[:i + 1], 37 | prev_tokens, 38 | offset, 39 | token_offset, 40 | skip_special_tokens=skip_special_tokens) 41 | decoded_text += text 42 | if prev_tokens is None: 43 | prev_tokens = new_tokens 44 | else: 45 | prev_tokens += new_tokens 46 | return decoded_text 47 | 48 | 49 | @pytest.mark.parametrize("truth", TRUTH) 50 | @pytest.mark.parametrize("tokenizer_id", TOKENIZERS) 51 | @pytest.mark.parametrize("skip_special_tokens", (True, False)) 52 | def test_decode_streaming(tokenizer_id, truth, skip_special_tokens): 53 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) 54 | all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"] 55 | if skip_special_tokens: 56 | all_input_ids = ([tokenizer.bos_token_id] 57 | if tokenizer.bos_token_id is not None else 58 | []) + all_input_ids + [tokenizer.eos_token_id] 59 | 60 | decoded_text = _run_incremental_decode( 61 | tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens) 62 | 63 | assert decoded_text == truth 64 | -------------------------------------------------------------------------------- /docs/source/serving/run_on_sky.rst: -------------------------------------------------------------------------------- 1 | .. _on_cloud: 2 | 3 | Running on clouds with SkyPilot 4 | =============================== 5 | 6 | .. raw:: html 7 | 8 |

9-10 | [raw HTML omitted: centered image with alt text "vLLM"]

11 |
12 | vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud.
13 |
14 | To install SkyPilot and set up your cloud credentials, run:
15 |
16 | .. code-block:: console
17 |
18 | $ pip install skypilot
19 | $ sky check
20 |
21 | See the vLLM SkyPilot YAML for serving, `serving.yaml `__.
22 |
23 | .. code-block:: yaml
24 |
25 | resources:
26 | accelerators: A100
27 |
28 | envs:
29 | MODEL_NAME: decapoda-research/llama-13b-hf
30 | TOKENIZER: hf-internal-testing/llama-tokenizer
31 |
32 | setup: |
33 | conda create -n vllm python=3.9 -y
34 | conda activate vllm
35 | git clone https://github.com/vllm-project/vllm.git
36 | cd vllm
37 | pip install .
38 | pip install gradio
39 |
40 | run: |
41 | conda activate vllm
42 | echo 'Starting vllm api server...'
43 | python -u -m vllm.entrypoints.api_server \
44 | --model $MODEL_NAME \
45 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
46 | --tokenizer $TOKENIZER 2>&1 | tee api_server.log &
47 | echo 'Waiting for vllm api server to start...'
48 | while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
49 | echo 'Starting gradio server...'
50 | python vllm/examples/gradio_webserver.py
51 |
52 | Start serving the LLaMA-13B model on an A100 GPU:
53 |
54 | .. code-block:: console
55 |
56 | $ sky launch serving.yaml
57 |
58 | Check the output of the command. There will be a shareable Gradio link (shown in the last line of the output below). Open it in your browser to use the LLaMA model for text completion.
59 |
60 | .. code-block:: console
61 |
62 | (task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
63 |
64 | **Optional**: Serve the 65B model instead of the default 13B and use more GPUs:
65 |
66 | .. code-block:: console
67 |
68 | sky launch -c vllm-serve-new -s serving.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf
69 |
70 |
-------------------------------------------------------------------------------- /tests/async_engine/test_async_llm_engine.py: --------------------------------------------------------------------------------
1 | import asyncio
2 | from dataclasses import dataclass
3 |
4 | import pytest
5 |
6 | from vllm.engine.async_llm_engine import AsyncLLMEngine
7 |
8 |
9 | @dataclass
10 | class RequestOutput:
11 | request_id: int
12 | finished: bool = False
13 |
14 |
15 | class MockEngine:
16 |
17 | def __init__(self):
18 | self.step_calls = 0
19 | self.add_request_calls = 0
20 | self.abort_request_calls = 0
21 | self.request_id = None
22 |
23 | async def step_async(self):
24 | self.step_calls += 1
25 | return [RequestOutput(
26 | request_id=self.request_id)] if self.request_id else []
27 |
28 | def generate(self, request_id):
29 | self.request_id = request_id
30 |
31 | def stop_generating(self):
32 | self.request_id = None
33 |
34 | def add_request(self, **kwargs):
35 | del kwargs # Unused
36 | self.add_request_calls += 1
37 |
38 | def abort_request(self, request_id):
39 | del request_id # Unused
40 | self.abort_request_calls += 1
41 |
42 |
43 | class MockAsyncLLMEngine(AsyncLLMEngine):
44 |
45 | def _init_engine(self, *args, **kwargs):
46 | return MockEngine()
47 |
48 |
49 | @pytest.mark.asyncio
50 | async def test_new_requests_event():
51 | engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
52 | engine.start_background_loop()
53 | await asyncio.sleep(0.01)
54 | assert engine.engine.step_calls == 0
55 |
56 | await engine.add_request("1", "", None)
57 | await asyncio.sleep(0.01)
58 | assert 
engine.engine.add_request_calls == 1 59 | assert engine.engine.step_calls == 1 60 | 61 | await engine.add_request("2", "", None) 62 | engine.engine.generate("2") 63 | await asyncio.sleep(0) 64 | assert engine.engine.add_request_calls == 2 65 | assert engine.engine.step_calls == 2 66 | await asyncio.sleep(0) 67 | assert engine.engine.step_calls == 3 68 | engine.engine.stop_generating() 69 | await asyncio.sleep(0) 70 | assert engine.engine.step_calls == 4 71 | await asyncio.sleep(0) 72 | assert engine.engine.step_calls == 4 73 | 74 | await engine.add_request("3", "", None) 75 | await asyncio.sleep(0.01) 76 | assert engine.engine.add_request_calls == 3 77 | assert engine.engine.step_calls == 5 78 | await asyncio.sleep(0.01) 79 | assert engine.engine.add_request_calls == 3 80 | assert engine.engine.step_calls == 5 81 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to vLLM! 2 | ================ 3 | 4 | .. figure:: ./assets/logos/vllm-logo-text-light.png 5 | :width: 60% 6 | :align: center 7 | :alt: vLLM 8 | :class: no-scaled-link 9 | 10 | .. raw:: html 11 | 12 |

13-22 | [raw HTML omitted: centered tagline "Easy, fast, and cheap LLM serving for everyone" and GitHub Star / Watch / Fork buttons]

23 | 24 | 25 | 26 | vLLM is a fast and easy-to-use library for LLM inference and serving. 27 | 28 | vLLM is fast with: 29 | 30 | * State-of-the-art serving throughput 31 | * Efficient management of attention key and value memory with **PagedAttention** 32 | * Continuous batching of incoming requests 33 | * Optimized CUDA kernels 34 | 35 | vLLM is flexible and easy to use with: 36 | 37 | * Seamless integration with popular HuggingFace models 38 | * High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more 39 | * Tensor parallelism support for distributed inference 40 | * Streaming outputs 41 | * OpenAI-compatible API server 42 | 43 | For more information, check out the following: 44 | 45 | * `vLLM announcing blog post `_ (intro to PagedAttention) 46 | * `vLLM paper `_ (SOSP 2023) 47 | * `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency `_ by Cade Daniel et al. 48 | 49 | 50 | 51 | Documentation 52 | ------------- 53 | 54 | .. toctree:: 55 | :maxdepth: 1 56 | :caption: Getting Started 57 | 58 | getting_started/installation 59 | getting_started/quickstart 60 | 61 | .. toctree:: 62 | :maxdepth: 1 63 | :caption: Serving 64 | 65 | serving/distributed_serving 66 | serving/run_on_sky 67 | serving/deploying_with_triton 68 | 69 | .. toctree:: 70 | :maxdepth: 1 71 | :caption: Models 72 | 73 | models/supported_models 74 | models/adding_model 75 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/baichuan.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
20 | 21 | from transformers.configuration_utils import PretrainedConfig 22 | 23 | 24 | class BaiChuanConfig(PretrainedConfig): 25 | model_type = "baichuan" 26 | keys_to_ignore_at_inference = ["past_key_values"] 27 | 28 | def __init__( 29 | self, 30 | vocab_size=64000, 31 | hidden_size=4096, 32 | intermediate_size=11008, 33 | num_hidden_layers=32, 34 | num_attention_heads=32, 35 | hidden_act="silu", 36 | max_position_embeddings=4096, 37 | initializer_range=0.02, 38 | rms_norm_eps=1e-6, 39 | use_cache=True, 40 | pad_token_id=0, 41 | bos_token_id=1, 42 | eos_token_id=2, 43 | tie_word_embeddings=False, 44 | **kwargs, 45 | ): 46 | self.vocab_size = vocab_size 47 | self.max_position_embeddings = max_position_embeddings 48 | self.hidden_size = hidden_size 49 | self.intermediate_size = intermediate_size 50 | self.num_hidden_layers = num_hidden_layers 51 | self.num_attention_heads = num_attention_heads 52 | self.hidden_act = hidden_act 53 | self.initializer_range = initializer_range 54 | self.rms_norm_eps = rms_norm_eps 55 | self.use_cache = use_cache 56 | super().__init__( 57 | pad_token_id=pad_token_id, 58 | bos_token_id=bos_token_id, 59 | eos_token_id=eos_token_id, 60 | tie_word_embeddings=tie_word_embeddings, 61 | **kwargs, 62 | ) 63 | -------------------------------------------------------------------------------- /tests/async_engine/test_request_tracker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.async_llm_engine import RequestTracker 4 | from vllm.outputs import RequestOutput 5 | 6 | 7 | class DummyEvent: 8 | 9 | def __init__(self): 10 | self.flag = False 11 | 12 | def set(self): 13 | self.flag = True 14 | 15 | def clear(self): 16 | self.flag = False 17 | 18 | 19 | def test_request_tracker(): 20 | tracker = RequestTracker() 21 | tracker.new_requests_event = DummyEvent() 22 | stream_1 = tracker.add_request("1") 23 | assert tracker.new_requests_event.flag 24 | new, finished = tracker.get_new_and_finished_requests() 25 | assert not tracker.new_requests_event.flag 26 | assert len(new) == 1 27 | assert new[0]["request_id"] == "1" 28 | assert not finished 29 | assert not stream_1.finished 30 | 31 | stream_2 = tracker.add_request("2") 32 | stream_3 = tracker.add_request("3") 33 | assert tracker.new_requests_event.flag 34 | new, finished = tracker.get_new_and_finished_requests() 35 | assert not tracker.new_requests_event.flag 36 | assert len(new) == 2 37 | assert new[0]["request_id"] == "2" 38 | assert new[1]["request_id"] == "3" 39 | assert not finished 40 | assert not stream_2.finished 41 | assert not stream_3.finished 42 | 43 | # request_ids must be unique 44 | with pytest.raises(KeyError): 45 | tracker.add_request("1") 46 | assert not tracker.new_requests_event.flag 47 | 48 | tracker.abort_request("1") 49 | new, finished = tracker.get_new_and_finished_requests() 50 | assert len(finished) == 1 51 | assert "1" in finished 52 | assert not new 53 | assert stream_1.finished 54 | 55 | stream_4 = tracker.add_request("4") 56 | tracker.abort_request("4") 57 | assert tracker.new_requests_event.flag 58 | new, finished = tracker.get_new_and_finished_requests() 59 | assert len(finished) == 1 60 | assert "4" in finished 61 | assert not new 62 | assert stream_4.finished 63 | 64 | stream_5 = tracker.add_request("5") 65 | assert tracker.new_requests_event.flag 66 | tracker.process_request_output( 67 | RequestOutput("2", "output", [], [], finished=True)) 68 | new, finished = tracker.get_new_and_finished_requests() 69 | assert not 
tracker.new_requests_event.flag 70 | assert len(finished) == 1 71 | assert "2" in finished 72 | assert len(new) == 1 73 | assert new[0]["request_id"] == "5" 74 | assert stream_2.finished 75 | assert not stream_5.finished 76 | -------------------------------------------------------------------------------- /vllm/model_executor/quantization_utils/awq.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | 3 | import torch 4 | 5 | from vllm.model_executor.quantization_utils.base import QuantizationConfig 6 | 7 | 8 | class AWQConfig(QuantizationConfig): 9 | """Config class for AWQ. 10 | 11 | Reference: https://arxiv.org/abs/2306.00978 12 | """ 13 | 14 | def __init__( 15 | self, 16 | weight_bits: int, 17 | group_size: int, 18 | zero_point: bool, 19 | ) -> None: 20 | self.weight_bits = weight_bits 21 | self.group_size = group_size 22 | self.zero_point = zero_point 23 | 24 | if self.weight_bits != 4: 25 | raise ValueError( 26 | "Currently, only 4-bit weight quantization is supported for " 27 | f"AWQ, but got {self.weight_bits} bits.") 28 | self.pack_factor = 32 // self.weight_bits 29 | 30 | def __repr__(self) -> str: 31 | return (f"AWQConfig(weight_bits={self.weight_bits}, " 32 | f"group_size={self.group_size}, " 33 | f"zero_point={self.zero_point})") 34 | 35 | @classmethod 36 | def get_name(cls) -> str: 37 | return "awq" 38 | 39 | @classmethod 40 | def get_supported_act_dtypes(cls) -> List[torch.dtype]: 41 | return [torch.half] 42 | 43 | @classmethod 44 | def get_min_capability(cls) -> int: 45 | # The AWQ kernel only supports Ampere or newer GPUs. 46 | return 80 47 | 48 | @classmethod 49 | def get_config_filenames(cls) -> List[str]: 50 | return [ 51 | "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq 52 | "quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq # pylint: disable=line-too-long 53 | ] 54 | 55 | @classmethod 56 | def from_config(cls, config: Dict[str, Any]) -> "AWQConfig": 57 | weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) 58 | group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) 59 | zero_point = cls.get_from_keys(config, ["zero_point"]) 60 | return cls(weight_bits, group_size, zero_point) 61 | 62 | @classmethod 63 | def get_packed_tensor_names(cls) -> List[str]: 64 | return ["qweight", "qzeros"] 65 | 66 | @classmethod 67 | def get_transposed_tensor_names(cls) -> List[str]: 68 | return ["qweight", "qzeros", "scales"] 69 | 70 | @classmethod 71 | def get_tp_tensor_names(cls) -> List[str]: 72 | return ["qweight", "qzeros", "scales"] 73 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/aquila.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 
11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | """ Aquila model configuration""" 21 | 22 | from transformers import PretrainedConfig 23 | 24 | 25 | class AquilaConfig(PretrainedConfig): 26 | model_type = "aquila" 27 | keys_to_ignore_at_inference = ["past_key_values"] 28 | 29 | def __init__( 30 | self, 31 | vocab_size=100008, 32 | hidden_size=4096, 33 | intermediate_size=11008, 34 | num_hidden_layers=32, 35 | num_attention_heads=32, 36 | hidden_act="silu", 37 | max_position_embeddings=2048, 38 | initializer_range=0.006, 39 | rms_norm_eps=1e-5, 40 | use_cache=True, 41 | pad_token_id=0, 42 | bos_token_id=1, 43 | eos_token_id=2, 44 | tie_word_embeddings=False, 45 | **kwargs, 46 | ): 47 | self.vocab_size = vocab_size 48 | self.max_position_embeddings = max_position_embeddings 49 | self.hidden_size = hidden_size 50 | self.intermediate_size = intermediate_size 51 | self.num_hidden_layers = num_hidden_layers 52 | self.num_attention_heads = num_attention_heads 53 | self.hidden_act = hidden_act 54 | self.initializer_range = initializer_range 55 | self.rms_norm_eps = rms_norm_eps 56 | self.use_cache = use_cache 57 | super().__init__( 58 | pad_token_id=pad_token_id, 59 | bos_token_id=bos_token_id, 60 | eos_token_id=eos_token_id, 61 | tie_word_embeddings=tie_word_embeddings, 62 | **kwargs, 63 | ) 64 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'vLLM' 21 | copyright = '2023, vLLM Team' 22 | author = 'the vLLM Team' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.napoleon", 32 | "sphinx.ext.viewcode", 33 | "sphinx.ext.intersphinx", 34 | "sphinx_copybutton", 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 
43 | exclude_patterns = [] 44 | 45 | # Exclude the prompt "$" when copying code 46 | copybutton_prompt_text = r"\$ " 47 | copybutton_prompt_is_regexp = True 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_title = project 55 | html_theme = 'sphinx_book_theme' 56 | html_logo = 'assets/logos/vllm-logo-text-light.png' 57 | html_theme_options = { 58 | 'logo_only': True, 59 | 'path_to_docs': 'docs/source', 60 | 'repository_url': 'https://github.com/vllm-project/vllm', 61 | 'use_repository_button': True, 62 | } 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | html_static_path = ['_static'] 68 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/mistral.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | """Mistral-7B-v0.1 configuration""" 13 | from transformers.configuration_utils import PretrainedConfig 14 | 15 | 16 | class MistralConfig(PretrainedConfig): 17 | model_type = "mistral" 18 | keys_to_ignore_at_inference = ["past_key_values"] 19 | 20 | def __init__( 21 | self, 22 | vocab_size=32000, 23 | hidden_size=4096, 24 | intermediate_size=14336, 25 | num_hidden_layers=32, 26 | num_attention_heads=32, 27 | num_key_value_heads=8, 28 | hidden_act="silu", 29 | max_position_embeddings=4096 * 32, 30 | initializer_range=0.02, 31 | rms_norm_eps=1e-6, 32 | use_cache=True, 33 | pad_token_id=None, 34 | bos_token_id=1, 35 | eos_token_id=2, 36 | tie_word_embeddings=False, 37 | rope_theta=10000.0, 38 | sliding_window=4096, 39 | **kwargs, 40 | ): 41 | self.vocab_size = vocab_size 42 | self.max_position_embeddings = max_position_embeddings 43 | self.hidden_size = hidden_size 44 | self.intermediate_size = intermediate_size 45 | self.num_hidden_layers = num_hidden_layers 46 | self.num_attention_heads = num_attention_heads 47 | self.sliding_window = sliding_window 48 | 49 | # for backward compatibility 50 | if num_key_value_heads is None: 51 | num_key_value_heads = num_attention_heads 52 | 53 | self.num_key_value_heads = num_key_value_heads 54 | self.hidden_act = hidden_act 55 | self.initializer_range = initializer_range 56 | self.rms_norm_eps = rms_norm_eps 57 | self.use_cache = use_cache 58 | self.rope_theta = rope_theta 59 | 60 | super().__init__( 61 | pad_token_id=pad_token_id, 62 | bos_token_id=bos_token_id, 63 | eos_token_id=eos_token_id, 64 | tie_word_embeddings=tie_word_embeddings, 65 | **kwargs, 66 | ) 67 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/mpt.py: 
-------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py 3 | from typing import Any, Dict, Optional, Union 4 | 5 | from transformers import PretrainedConfig 6 | 7 | _ATTN_CONFIG_DEFAULTS = { 8 | "attn_type": "multihead_attention", 9 | "attn_pdrop": 0.0, 10 | "attn_impl": "triton", 11 | "qk_ln": False, 12 | "clip_qkv": None, 13 | "softmax_scale": None, 14 | "prefix_lm": False, 15 | "attn_uses_sequence_id": False, 16 | "alibi": False, 17 | "alibi_bias_max": 8, 18 | } 19 | 20 | 21 | class MPTConfig(PretrainedConfig): 22 | model_type = "mpt" 23 | attribute_map = { 24 | "hidden_size": "d_model", 25 | "num_attention_heads": "n_heads", 26 | "num_hidden_layers": "n_layers", 27 | } 28 | 29 | def __init__( 30 | self, 31 | d_model: int = 2048, 32 | n_heads: int = 16, 33 | n_layers: int = 24, 34 | expansion_ratio: int = 4, 35 | max_seq_len: int = 2048, 36 | vocab_size: int = 50368, 37 | resid_pdrop: float = 0.0, 38 | emb_pdrop: float = 0.0, 39 | learned_pos_emb: bool = True, 40 | attn_config: Optional[Dict[str, Any]] = None, 41 | init_device: str = "cpu", 42 | logit_scale: Optional[Union[float, str]] = None, 43 | no_bias: bool = False, 44 | verbose: int = 0, 45 | embedding_fraction: float = 1.0, 46 | norm_type: str = "low_precision_layernorm", 47 | use_cache: bool = False, 48 | **kwargs, 49 | ) -> None: 50 | self.d_model = d_model 51 | self.n_heads = n_heads 52 | self.n_layers = n_layers 53 | self.expansion_ratio = expansion_ratio 54 | self.max_seq_len = max_seq_len 55 | self.vocab_size = vocab_size 56 | self.resid_pdrop = resid_pdrop 57 | self.emb_pdrop = emb_pdrop 58 | self.learned_pos_emb = learned_pos_emb 59 | if attn_config is None: 60 | self.attn_config = _ATTN_CONFIG_DEFAULTS 61 | else: 62 | self.attn_config = attn_config 63 | self.init_device = init_device 64 | self.logit_scale = logit_scale 65 | self.no_bias = no_bias 66 | self.verbose = verbose 67 | self.embedding_fraction = embedding_fraction 68 | self.norm_type = norm_type 69 | self.use_cache = use_cache 70 | if "name" in kwargs: 71 | del kwargs["name"] 72 | if "loss_fn" in kwargs: 73 | del kwargs["loss_fn"] 74 | super().__init__(**kwargs) 75 | -------------------------------------------------------------------------------- /tests/kernels/test_activation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn.functional as F 4 | from transformers.activations import get_activation 5 | 6 | from vllm import activation_ops 7 | 8 | DTYPES = [torch.half, torch.bfloat16, torch.float] 9 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 10 | D = [512, 4096, 5120, 13824] # Arbitrary values for testing 11 | SEEDS = [0] 12 | 13 | 14 | def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor: 15 | x1, x2 = x.chunk(chunks=2, dim=1) 16 | return F.silu(x1) * x2 17 | 18 | 19 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 20 | @pytest.mark.parametrize("d", D) 21 | @pytest.mark.parametrize("dtype", DTYPES) 22 | @pytest.mark.parametrize("seed", SEEDS) 23 | @torch.inference_mode() 24 | def test_silu_and_mul( 25 | num_tokens: int, 26 | d: int, 27 | dtype: torch.dtype, 28 | seed: int, 29 | ) -> None: 30 | torch.random.manual_seed(seed) 31 | torch.cuda.manual_seed(seed) 32 | x = torch.randn(num_tokens, 2 * d, dtype=dtype, device="cuda") 33 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 34 | activation_ops.silu_and_mul(out, x) 35 | 
ref_out = ref_silu_and_mul(x) 36 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 37 | 38 | 39 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 40 | @pytest.mark.parametrize("d", D) 41 | @pytest.mark.parametrize("dtype", DTYPES) 42 | @pytest.mark.parametrize("seed", SEEDS) 43 | @torch.inference_mode() 44 | def test_gelu_new( 45 | num_tokens: int, 46 | d: int, 47 | dtype: torch.dtype, 48 | seed: int, 49 | ) -> None: 50 | torch.random.manual_seed(seed) 51 | torch.cuda.manual_seed(seed) 52 | x = torch.randn(num_tokens, d, dtype=dtype, device="cuda") 53 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 54 | activation_ops.gelu_new(out, x) 55 | ref_out = get_activation("gelu_new")(x) 56 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 57 | 58 | 59 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 60 | @pytest.mark.parametrize("d", D) 61 | @pytest.mark.parametrize("dtype", DTYPES) 62 | @pytest.mark.parametrize("seed", SEEDS) 63 | def test_gelu_fast( 64 | num_tokens: int, 65 | d: int, 66 | dtype: torch.dtype, 67 | seed: int, 68 | ) -> None: 69 | torch.random.manual_seed(seed) 70 | torch.cuda.manual_seed(seed) 71 | x = torch.randn(num_tokens, d, dtype=dtype, device="cuda") 72 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 73 | activation_ops.gelu_fast(out, x) 74 | ref_out = get_activation("gelu_fast")(x) 75 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 76 | -------------------------------------------------------------------------------- /examples/api_client.py: -------------------------------------------------------------------------------- 1 | """Example Python client for vllm.entrypoints.api_server""" 2 | 3 | import argparse 4 | import json 5 | from typing import Iterable, List 6 | 7 | import requests 8 | 9 | 10 | def clear_line(n: int = 1) -> None: 11 | LINE_UP = '\033[1A' 12 | LINE_CLEAR = '\x1b[2K' 13 | for _ in range(n): 14 | print(LINE_UP, end=LINE_CLEAR, flush=True) 15 | 16 | 17 | def post_http_request(prompt: str, 18 | api_url: str, 19 | n: int = 1, 20 | stream: bool = False) -> requests.Response: 21 | headers = {"User-Agent": "Test Client"} 22 | pload = { 23 | "prompt": prompt, 24 | "n": n, 25 | "use_beam_search": True, 26 | "temperature": 0.0, 27 | "max_tokens": 16, 28 | "stream": stream, 29 | } 30 | response = requests.post(api_url, headers=headers, json=pload, stream=True) 31 | return response 32 | 33 | 34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: 35 | for chunk in response.iter_lines(chunk_size=8192, 36 | decode_unicode=False, 37 | delimiter=b"\0"): 38 | if chunk: 39 | data = json.loads(chunk.decode("utf-8")) 40 | output = data["text"] 41 | yield output 42 | 43 | 44 | def get_response(response: requests.Response) -> List[str]: 45 | data = json.loads(response.content) 46 | output = data["text"] 47 | return output 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--host", type=str, default="localhost") 53 | parser.add_argument("--port", type=int, default=8000) 54 | parser.add_argument("--n", type=int, default=4) 55 | parser.add_argument("--prompt", type=str, default="San Francisco is a") 56 | parser.add_argument("--stream", action="store_true") 57 | args = parser.parse_args() 58 | prompt = args.prompt 59 | api_url = f"http://{args.host}:{args.port}/generate" 60 | n = args.n 61 | stream = args.stream 62 | 63 | print(f"Prompt: {prompt!r}\n", flush=True) 64 | response = post_http_request(prompt, api_url, n, stream) 65 | 
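# When streaming, the server sends NUL-delimited JSON chunks that are decoded
# incrementally; otherwise a single JSON body with all generated texts is read.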
66 | if stream: 67 | num_printed_lines = 0 68 | for h in get_streaming_response(response): 69 | clear_line(num_printed_lines) 70 | num_printed_lines = 0 71 | for i, line in enumerate(h): 72 | num_printed_lines += 1 73 | print(f"Beam candidate {i}: {line!r}", flush=True) 74 | else: 75 | output = get_response(response) 76 | for i, line in enumerate(output): 77 | print(f"Beam candidate {i}: {line!r}", flush=True) 78 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The vLLM team. 2 | # Adapted from 3 | # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py 4 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 5 | from typing import List, Sequence 6 | 7 | import torch 8 | 9 | 10 | def ensure_divisibility(numerator, denominator): 11 | """Ensure that numerator is divisible by the denominator.""" 12 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 13 | numerator, denominator) 14 | 15 | 16 | def divide(numerator, denominator): 17 | """Ensure that numerator is divisible by the denominator and return 18 | the division value.""" 19 | ensure_divisibility(numerator, denominator) 20 | return numerator // denominator 21 | 22 | 23 | def split_tensor_along_last_dim( 24 | tensor: torch.Tensor, 25 | num_partitions: int, 26 | contiguous_split_chunks: bool = False, 27 | ) -> List[torch.Tensor]: 28 | """ Split a tensor along its last dimension. 29 | 30 | Arguments: 31 | tensor: input tensor. 32 | num_partitions: number of partitions to split the tensor 33 | contiguous_split_chunks: If True, make each chunk contiguous 34 | in memory. 35 | 36 | Returns: 37 | A list of Tensors 38 | """ 39 | # Get the size and dimension. 40 | last_dim = tensor.dim() - 1 41 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 42 | # Split. 43 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 44 | # Note: torch.split does not create contiguous tensors by default. 
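# Only materialize contiguous copies when the caller asks for them, since
# .contiguous() allocates new memory for each chunk.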
45 | if contiguous_split_chunks: 46 | return tuple(chunk.contiguous() for chunk in tensor_list) 47 | 48 | return tensor_list 49 | 50 | 51 | class VocabUtility: 52 | """ Split the vocabulary into `world_size` chunks and return the first 53 | and last index of the vocabulary belonging to the `rank` 54 | partition: Note that indices in [fist, last) 55 | 56 | """ 57 | 58 | @staticmethod 59 | def vocab_range_from_per_partition_vocab_size( 60 | per_partition_vocab_size: int, rank: int) -> Sequence[int]: 61 | index_f = rank * per_partition_vocab_size 62 | index_l = index_f + per_partition_vocab_size 63 | return index_f, index_l 64 | 65 | @staticmethod 66 | def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, 67 | world_size: int) -> Sequence[int]: 68 | per_partition_vocab_size = divide(global_vocab_size, world_size) 69 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 70 | per_partition_vocab_size, rank) 71 | -------------------------------------------------------------------------------- /vllm/model_executor/quantization_utils/base.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | 3 | import torch 4 | 5 | 6 | class QuantizationConfig: 7 | 8 | @classmethod 9 | def get_name(cls) -> str: 10 | """Name of the quantization method.""" 11 | raise NotImplementedError 12 | 13 | @classmethod 14 | def get_supported_act_dtypes(cls) -> List[torch.dtype]: 15 | """List of supported activation dtypes.""" 16 | raise NotImplementedError 17 | 18 | @classmethod 19 | def get_min_capability(cls) -> int: 20 | """Minimum GPU capability to support the quantization method. 21 | 22 | E.g., 70 for Volta, 75 for Turing, 80 for Ampere. 23 | This requirement is due to the custom CUDA kernels used by the 24 | quantization method. 25 | """ 26 | raise NotImplementedError 27 | 28 | @classmethod 29 | def get_config_filenames(cls) -> List[str]: 30 | """List of filenames to search for in the model directory.""" 31 | raise NotImplementedError 32 | 33 | @classmethod 34 | def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig": 35 | """Create a config class from the model's quantization config.""" 36 | raise NotImplementedError 37 | 38 | @staticmethod 39 | def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: 40 | """Get a value from the model's quantization config.""" 41 | for key in keys: 42 | if key in config: 43 | return config[key] 44 | raise ValueError(f"Cannot find any of {keys} in the model's " 45 | "quantization config.") 46 | 47 | @classmethod 48 | def get_packed_tensor_names(cls) -> List[str]: 49 | raise NotImplementedError 50 | 51 | @classmethod 52 | def is_packed(cls, tensor_name: str) -> bool: 53 | """Returns True if a tensor is packed. 54 | 55 | A tensor is considered packed if each element in the tensor is a 56 | packed representation of multiple elements in the original tensor. 57 | For example, an INT32 element in the tensor may represent 8 INT4 58 | elements in the original tensor. 59 | """ 60 | return any(tag in tensor_name for tag in cls.get_packed_tensor_names()) 61 | 62 | @classmethod 63 | def get_transposed_tensor_names(cls) -> List[str]: 64 | raise NotImplementedError 65 | 66 | @classmethod 67 | def is_transposed(cls, tensor_name: str) -> bool: 68 | """Returns True if a tensor is transposed relative to nn.Linear.weight. 
69 | """ 70 | return any(tag in tensor_name 71 | for tag in cls.get_transposed_tensor_names()) 72 | 73 | @classmethod 74 | def get_tp_tensor_names(cls) -> List[str]: 75 | raise NotImplementedError 76 | -------------------------------------------------------------------------------- /vllm/utils.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import uuid 3 | from platform import uname 4 | 5 | import psutil 6 | import torch 7 | from typing import List 8 | 9 | from vllm import cuda_utils 10 | 11 | 12 | class Device(enum.Enum): 13 | GPU = enum.auto() 14 | CPU = enum.auto() 15 | 16 | 17 | class Counter: 18 | 19 | def __init__(self, start: int = 0) -> None: 20 | self.counter = start 21 | 22 | def __next__(self) -> int: 23 | i = self.counter 24 | self.counter += 1 25 | return i 26 | 27 | def reset(self) -> None: 28 | self.counter = 0 29 | 30 | class InvalidAccessError(Exception): 31 | pass 32 | 33 | def invalidate_access(field_names): 34 | def decorator(cls): 35 | original_getattr = cls.__getattribute__ 36 | 37 | def new_getattr(self, name): 38 | if name in field_names: 39 | raise InvalidAccessError(f"Access to {name} is invalid") 40 | return original_getattr(self, name) 41 | 42 | cls.__getattribute__ = new_getattr 43 | return cls 44 | 45 | return decorator 46 | 47 | def get_max_shared_memory_bytes(gpu: int = 0) -> int: 48 | """Returns the maximum shared memory per thread block in bytes.""" 49 | # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 50 | cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 # pylint: disable=invalid-name 51 | max_shared_mem = cuda_utils.get_device_attribute( 52 | cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu) 53 | return int(max_shared_mem) 54 | 55 | 56 | def get_gpu_memory(gpu: int = 0) -> int: 57 | """Returns the total memory of the GPU in bytes.""" 58 | return torch.cuda.get_device_properties(gpu).total_memory 59 | 60 | 61 | def get_cpu_memory() -> int: 62 | """Returns the total CPU memory of the node in bytes.""" 63 | return psutil.virtual_memory().total 64 | 65 | 66 | def random_uuid() -> str: 67 | return str(uuid.uuid4().hex) 68 | 69 | 70 | def in_wsl() -> bool: 71 | # Reference: https://github.com/microsoft/WSL/issues/4071 72 | return "microsoft" in " ".join(uname()).lower() 73 | 74 | 75 | # TODO: Change this back to API response key when doing the real-case 76 | # NOTE: Currently this stop string is for testing only! 
77 | # "not" is the token right after prompt in examples/test_pause.py 78 | def get_api_stop_string() -> str: 79 | # return 'Integrity' 80 | # return '\n' 81 | # return 'a' 82 | # return "" 83 | # return "Editor" # gpt-j 84 | # return "asa" # baichuan-13b 85 | # return "mandated" # opt 86 | return "USE" # dummy llama, vulcuna 87 | return "not" 88 | 89 | def get_api_stop_strings() -> List[str]: 90 | # return "" 91 | return ['\n', 'Editor', 'asa', 'USE'] 92 | 93 | def get_api_stop_token() -> int: 94 | # react "PAUSE" 95 | return 17171 -------------------------------------------------------------------------------- /tests/async_engine/test_api_server.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import time 4 | from multiprocessing import Pool 5 | from pathlib import Path 6 | 7 | import pytest 8 | import requests 9 | 10 | 11 | def _query_server(prompt: str) -> dict: 12 | response = requests.post("http://localhost:8000/generate", 13 | json={ 14 | "prompt": prompt, 15 | "max_tokens": 100, 16 | "temperature": 0, 17 | "ignore_eos": True 18 | }) 19 | response.raise_for_status() 20 | return response.json() 21 | 22 | 23 | @pytest.fixture 24 | def api_server(): 25 | script_path = Path(__file__).parent.joinpath( 26 | "api_server_async_engine.py").absolute() 27 | # pylint: disable=consider-using-with 28 | uvicorn_process = subprocess.Popen([ 29 | sys.executable, "-u", 30 | str(script_path), "--model", "facebook/opt-125m" 31 | ]) 32 | yield 33 | uvicorn_process.terminate() 34 | 35 | 36 | # pylint: disable=redefined-outer-name, unused-argument 37 | def test_api_server(api_server): 38 | """ 39 | Run the API server and test it. 40 | 41 | We run both the server and requests in separate processes. 42 | 43 | We test that the server can handle incoming requests, including 44 | multiple requests at the same time, and that it can handle requests 45 | being cancelled without crashing. 
46 | """ 47 | with Pool(32) as pool: 48 | # Wait until the server is ready 49 | prompts = ["Hello world"] * 1 50 | result = None 51 | while not result: 52 | # pylint: disable=bare-except 53 | try: 54 | for result in pool.map(_query_server, prompts): 55 | break 56 | except: 57 | time.sleep(1) 58 | 59 | # Actual tests start here 60 | # Try with 1 prompt 61 | for result in pool.map(_query_server, prompts): 62 | assert result 63 | 64 | num_aborted_requests = requests.get( 65 | "http://localhost:8000/stats").json()["num_aborted_requests"] 66 | assert num_aborted_requests == 0 67 | 68 | # Try with 100 prompts 69 | prompts = ["Hello world"] * 100 70 | for result in pool.map(_query_server, prompts): 71 | assert result 72 | 73 | # Cancel requests 74 | pool.map_async(_query_server, prompts) 75 | time.sleep(0.01) 76 | pool.terminate() 77 | pool.join() 78 | 79 | # check cancellation stats 80 | num_aborted_requests = requests.get( 81 | "http://localhost:8000/stats").json()["num_aborted_requests"] 82 | assert num_aborted_requests > 0 83 | 84 | # check that server still runs after cancellations 85 | with Pool(32) as pool: 86 | # Try with 100 prompts 87 | prompts = ["Hello world"] * 100 88 | for result in pool.map(_query_server, prompts): 89 | assert result 90 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/falcon.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py 3 | # Copyright 2023 The vLLM team. 4 | # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. 5 | # All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | """Falcon configuration""" 19 | from transformers.configuration_utils import PretrainedConfig 20 | 21 | 22 | class RWConfig(PretrainedConfig): 23 | model_type = "falcon" 24 | keys_to_ignore_at_inference = ["past_key_values"] 25 | attribute_map = { 26 | "num_hidden_layers": "n_layer", 27 | "num_attention_heads": "n_head", 28 | "num_kv_heads": "n_head_kv", 29 | } 30 | 31 | def __init__( 32 | self, 33 | vocab_size=250880, 34 | hidden_size=64, 35 | n_layer=2, 36 | n_head=8, 37 | layer_norm_epsilon=1e-5, 38 | initializer_range=0.02, 39 | use_cache=True, 40 | bos_token_id=1, 41 | eos_token_id=2, 42 | hidden_dropout=0.0, 43 | attention_dropout=0.0, 44 | multi_query=True, 45 | n_head_kv=None, 46 | alibi=False, 47 | bias=False, 48 | parallel_attn=False, 49 | new_decoder_architecture=False, 50 | **kwargs, 51 | ) -> None: 52 | self.vocab_size = vocab_size 53 | # Backward compatibility with n_embed kwarg 54 | n_embed = kwargs.pop("n_embed", None) 55 | self.hidden_size = hidden_size if n_embed is None else n_embed 56 | self.n_layer = n_layer 57 | self.n_head = n_head 58 | self.layer_norm_epsilon = layer_norm_epsilon 59 | self.initializer_range = initializer_range 60 | self.use_cache = use_cache 61 | self.hidden_dropout = hidden_dropout 62 | self.attention_dropout = attention_dropout 63 | 64 | self.bos_token_id = bos_token_id 65 | self.eos_token_id = eos_token_id 66 | self.multi_query = multi_query 67 | self.n_head_kv = 1 if n_head_kv is None else n_head_kv 68 | self.alibi = alibi 69 | self.bias = bias 70 | self.parallel_attn = parallel_attn 71 | self.new_decoder_architecture = new_decoder_architecture 72 | 73 | if self.hidden_size == 8192: 74 | # Hack for falcon-40b 75 | self.new_decoder_architecture = True 76 | 77 | super().__init__(bos_token_id=bos_token_id, 78 | eos_token_id=eos_token_id, 79 | **kwargs) 80 | 81 | @property 82 | def head_dim(self): 83 | return self.hidden_size // self.n_head 84 | 85 | @property 86 | def rotary(self): 87 | return not self.alibi 88 | -------------------------------------------------------------------------------- /tests/distributed/test_comm_ops.py: -------------------------------------------------------------------------------- 1 | """Test the communication operators. 2 | 3 | Run `pytest tests/distributed/test_comm_ops.py --forked`. 
4 | """ 5 | from multiprocessing import Process 6 | 7 | import pytest 8 | import torch 9 | 10 | from vllm.config import ParallelConfig 11 | from vllm.engine.ray_utils import get_open_port 12 | from vllm.model_executor.parallel_utils.communication_op import ( 13 | tensor_model_parallel_all_reduce, 14 | tensor_model_parallel_all_gather, 15 | ) 16 | from vllm.worker.worker import _init_distributed_environment 17 | 18 | 19 | def init_test_distributed_environment(pipeline_parallel_size: int, 20 | tensor_parallel_size: int, rank: int, 21 | distributed_init_port: str): 22 | parallel_config = ParallelConfig(pipeline_parallel_size, 23 | tensor_parallel_size, 24 | worker_use_ray=True) 25 | distributed_init_method = f"tcp://localhost:{distributed_init_port}" 26 | torch.cuda.set_device(rank) 27 | _init_distributed_environment(parallel_config, rank, 28 | distributed_init_method) 29 | 30 | 31 | def all_reduce_test_worker(tensor_parallel_size: int, rank: int, 32 | distributed_init_port: str): 33 | init_test_distributed_environment(1, tensor_parallel_size, rank, 34 | distributed_init_port) 35 | num_elements = 8 36 | all_tensors = [ 37 | torch.arange(num_elements, dtype=torch.float32, device="cuda") * 38 | (r + 1) for r in range(tensor_parallel_size) 39 | ] 40 | expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0) 41 | t = all_tensors[rank] 42 | t = tensor_model_parallel_all_reduce(t) 43 | assert torch.allclose(t, expected) 44 | 45 | 46 | def all_gather_test_worker(tensor_parallel_size: int, rank: int, 47 | distributed_init_port: str): 48 | init_test_distributed_environment(1, tensor_parallel_size, rank, 49 | distributed_init_port) 50 | num_dimensions = 3 51 | tensor_size = list(range(2, num_dimensions + 2)) 52 | total_size = 1 53 | for s in tensor_size: 54 | total_size *= s 55 | for all_gather_dimension in range(num_dimensions): 56 | all_tensors = [ 57 | torch.arange(total_size, dtype=torch.float32, 58 | device="cuda").reshape(tensor_size) * (r + 1) 59 | for r in range(tensor_parallel_size) 60 | ] 61 | expected = torch.cat(all_tensors, dim=all_gather_dimension) 62 | t = all_tensors[rank] 63 | t = tensor_model_parallel_all_gather(t, all_gather_dimension) 64 | assert torch.allclose(t, expected) 65 | 66 | 67 | @pytest.mark.skipif(torch.cuda.device_count() < 2, 68 | reason="Need at least 2 GPUs to run the test.") 69 | @pytest.mark.parametrize("tensor_parallel_size", [2]) 70 | @pytest.mark.parametrize("test_target", 71 | [all_reduce_test_worker, all_gather_test_worker]) 72 | def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): 73 | distributed_init_port = get_open_port() 74 | processes = [] 75 | for rank in range(tensor_parallel_size): 76 | p = Process(target=test_target, 77 | args=(tensor_parallel_size, rank, distributed_init_port)) 78 | p.start() 79 | processes.append(p) 80 | for p in processes: 81 | p.join() 82 | assert all(p.exitcode == 0 for p in processes) 83 | -------------------------------------------------------------------------------- /docs/source/models/supported_models.rst: -------------------------------------------------------------------------------- 1 | .. _supported_models: 2 | 3 | Supported Models 4 | ================ 5 | 6 | vLLM supports a variety of generative Transformer models in `HuggingFace Transformers `_. 7 | The following is the list of model architectures that are currently supported by vLLM. 8 | Alongside each architecture, we include some popular models that use it. 9 | 10 | .. 
list-table:: 11 | :widths: 25 25 50 12 | :header-rows: 1 13 | 14 | * - Architecture 15 | - Models 16 | - Example HuggingFace Models 17 | * - :code:`AquilaForCausalLM` 18 | - Aquila 19 | - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. 20 | * - :code:`BaiChuanForCausalLM` 21 | - Baichuan 22 | - :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc. 23 | * - :code:`BloomForCausalLM` 24 | - BLOOM, BLOOMZ, BLOOMChat 25 | - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. 26 | * - :code:`FalconForCausalLM` 27 | - Falcon 28 | - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. 29 | * - :code:`GPT2LMHeadModel` 30 | - GPT-2 31 | - :code:`gpt2`, :code:`gpt2-xl`, etc. 32 | * - :code:`GPTBigCodeForCausalLM` 33 | - StarCoder, SantaCoder, WizardCoder 34 | - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. 35 | * - :code:`GPTJForCausalLM` 36 | - GPT-J 37 | - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. 38 | * - :code:`GPTNeoXForCausalLM` 39 | - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM 40 | - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. 41 | * - :code:`InternLMForCausalLM` 42 | - InternLM 43 | - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. 44 | * - :code:`LlamaForCausalLM` 45 | - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco 46 | - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, etc. 47 | * - :code:`MistralForCausalLM` 48 | - Mistral, Mistral-Instruct 49 | - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. 50 | * - :code:`MPTForCausalLM` 51 | - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter 52 | - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. 53 | * - :code:`OPTForCausalLM` 54 | - OPT, OPT-IML 55 | - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. 56 | * - :code:`QWenLMHeadModel` 57 | - Qwen 58 | - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. 59 | 60 | If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. 61 | Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. 62 | Alternatively, you can raise an issue on our `GitHub `_ project. 63 | 64 | .. tip:: 65 | The easiest way to check if your model is supported is to run the program below: 66 | 67 | .. code-block:: python 68 | 69 | from vllm import LLM 70 | 71 | llm = LLM(model=...) # Name or path of your model 72 | output = llm.generate("Hello, my name is") 73 | print(output) 74 | 75 | If vLLM successfully generates text, it indicates that your model is supported. 
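.. note::
    Some of the models above (e.g., :code:`Qwen/Qwen-7B` or :code:`baichuan-inc/Baichuan-13B-Chat`) ship their modeling code on the HuggingFace Hub rather than inside :code:`transformers`. For these you will likely also need to pass :code:`trust_remote_code=True` (the same flag exposed by the benchmark scripts). A minimal sketch, assuming the checkpoint can be downloaded in your environment:

    .. code-block:: python

        from vllm import LLM

        # trust_remote_code lets Transformers load the custom modeling/config
        # code that these checkpoints ship with.
        llm = LLM(model="Qwen/Qwen-7B", trust_remote_code=True)
        print(llm.generate("Hello, my name is"))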
76 | -------------------------------------------------------------------------------- /benchmarks/benchmark_latency.py: -------------------------------------------------------------------------------- 1 | """Benchmark the latency of processing a single batch of requests.""" 2 | import argparse 3 | import time 4 | 5 | import numpy as np 6 | import torch 7 | from tqdm import tqdm 8 | 9 | from vllm import LLM, SamplingParams 10 | 11 | 12 | def main(args: argparse.Namespace): 13 | print(args) 14 | 15 | # Process all the requests in a single batch if possible. 16 | # NOTE(woosuk): If the request cannot be processed in a single batch, 17 | # the engine will automatically process the request in multiple batches. 18 | llm = LLM( 19 | model=args.model, 20 | tokenizer=args.tokenizer, 21 | quantization=args.quantization, 22 | tensor_parallel_size=args.tensor_parallel_size, 23 | max_num_seqs=args.batch_size, 24 | max_num_batched_tokens=args.batch_size * args.input_len, 25 | trust_remote_code=args.trust_remote_code, 26 | dtype=args.dtype, 27 | ) 28 | 29 | sampling_params = SamplingParams( 30 | n=args.n, 31 | temperature=0.0 if args.use_beam_search else 1.0, 32 | top_p=1.0, 33 | use_beam_search=args.use_beam_search, 34 | ignore_eos=True, 35 | max_tokens=args.output_len, 36 | ) 37 | print(sampling_params) 38 | dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size 39 | 40 | def run_to_completion(profile: bool = False): 41 | if profile: 42 | torch.cuda.cudart().cudaProfilerStart() 43 | start_time = time.perf_counter() 44 | 45 | llm.generate(prompt_token_ids=dummy_prompt_token_ids, 46 | sampling_params=sampling_params, 47 | use_tqdm=False) 48 | 49 | end_time = time.perf_counter() 50 | latency = end_time - start_time 51 | if profile: 52 | torch.cuda.cudart().cudaProfilerStop() 53 | return latency 54 | 55 | print("Warming up...") 56 | run_to_completion(profile=False) 57 | 58 | # Benchmark. 59 | latencies = [] 60 | for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): 61 | latencies.append(run_to_completion(profile=False)) 62 | print(f'Avg latency: {np.mean(latencies)} seconds') 63 | 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser( 67 | description='Benchmark the latency of processing a single batch of ' 68 | 'requests till completion.') 69 | parser.add_argument('--model', type=str, default='facebook/opt-125m') 70 | parser.add_argument('--tokenizer', type=str, default=None) 71 | parser.add_argument('--quantization', 72 | '-q', 73 | choices=['awq', None], 74 | default=None) 75 | parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) 76 | parser.add_argument('--input-len', type=int, default=32) 77 | parser.add_argument('--output-len', type=int, default=128) 78 | parser.add_argument('--batch-size', type=int, default=8) 79 | parser.add_argument('--n', 80 | type=int, 81 | default=1, 82 | help='Number of generated sequences per prompt.') 83 | parser.add_argument('--use-beam-search', action='store_true') 84 | parser.add_argument('--num-iters', 85 | type=int, 86 | default=3, 87 | help='Number of iterations to run.') 88 | parser.add_argument('--trust-remote-code', 89 | action='store_true', 90 | help='trust remote code from huggingface') 91 | parser.add_argument( 92 | '--dtype', 93 | type=str, 94 | default='auto', 95 | choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], 96 | help='data type for model weights and activations. 
' 97 | 'The "auto" option will use FP16 precision ' 98 | 'for FP32 and FP16 models, and BF16 precision ' 99 | 'for BF16 models.') 100 | args = parser.parse_args() 101 | main(args) 102 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantized_linear/awq.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | from torch.nn.parameter import Parameter 5 | 6 | from vllm import quantization_ops 7 | from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear, 8 | RowParallelLinear) 9 | 10 | 11 | class AWQColumnParallelLinear(ColumnParallelLinear): 12 | 13 | def create_weights(self, dtype: torch.dtype) -> None: 14 | assert self.input_size % self.quant_config.weight_bits == 0 15 | assert (self.output_size_per_partition % 16 | self.quant_config.pack_factor == 0) 17 | self.qweight = Parameter( 18 | torch.empty( 19 | self.input_size, 20 | self.output_size_per_partition // 21 | self.quant_config.pack_factor, 22 | device="cuda", 23 | dtype=torch.int32, 24 | ), 25 | requires_grad=False, 26 | ) 27 | self.qzeros = Parameter( 28 | torch.empty( 29 | self.input_size // self.quant_config.group_size, 30 | self.output_size_per_partition // 31 | self.quant_config.pack_factor, 32 | device="cuda", 33 | dtype=torch.int32, 34 | ), 35 | requires_grad=False, 36 | ) 37 | self.scales = Parameter( 38 | torch.empty( 39 | self.input_size // self.quant_config.group_size, 40 | self.output_size_per_partition, 41 | device="cuda", 42 | dtype=dtype, 43 | ), 44 | requires_grad=False, 45 | ) 46 | 47 | def apply_weights( 48 | self, 49 | x: torch.Tensor, 50 | bias: Optional[torch.Tensor], 51 | ) -> torch.Tensor: 52 | pack_factor = self.quant_config.pack_factor 53 | out_shape = (x.shape[-2], self.qweight.shape[-1] * pack_factor) 54 | reshaped_x = x.reshape(-1, x.shape[-1]) 55 | out = quantization_ops.awq_gemm(reshaped_x, self.qweight, self.scales, 56 | self.qzeros, pack_factor) 57 | if bias is not None: 58 | out = out + bias 59 | return out.reshape(out_shape) 60 | 61 | 62 | class AWQRowParallelLinear(RowParallelLinear): 63 | 64 | def create_weights(self, dtype: torch.dtype) -> None: 65 | assert (self.input_size_per_partition % 66 | self.quant_config.weight_bits == 0) 67 | assert self.output_size % self.quant_config.pack_factor == 0 68 | self.qweight = Parameter( 69 | torch.empty( 70 | self.input_size_per_partition, 71 | self.output_size // self.quant_config.pack_factor, 72 | device="cuda", 73 | dtype=torch.int32, 74 | ), 75 | requires_grad=False, 76 | ) 77 | self.qzeros = Parameter( 78 | torch.empty( 79 | self.input_size_per_partition // self.quant_config.group_size, 80 | self.output_size // self.quant_config.pack_factor, 81 | device="cuda", 82 | dtype=torch.int32, 83 | ), 84 | requires_grad=False, 85 | ) 86 | self.scales = Parameter( 87 | torch.empty( 88 | self.input_size_per_partition // self.quant_config.group_size, 89 | self.output_size, 90 | device="cuda", 91 | dtype=dtype, 92 | ), 93 | requires_grad=False, 94 | ) 95 | 96 | def apply_weights(self, x: torch.Tensor) -> torch.Tensor: 97 | pack_factor = self.quant_config.pack_factor 98 | out_shape = (x.shape[-2], self.qweight.shape[-1] * pack_factor) 99 | reshaped_x = x.reshape(-1, x.shape[-1]) 100 | out = quantization_ops.awq_gemm(reshaped_x, self.qweight, self.scales, 101 | self.qzeros, pack_factor) 102 | return out.reshape(out_shape) 103 | -------------------------------------------------------------------------------- 
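The tensor shapes allocated in create_weights above encode AWQ's packing scheme: pack_factor 4-bit weights share a single int32, and one zero-point/scale entry is kept per group_size input rows. A minimal sketch of that bookkeeping, using illustrative (assumed) layer sizes and quantization settings rather than values read from a real checkpoint:

    # Illustration only: shape arithmetic assumed by the AWQ linear layers above.
    # weight_bits=4, group_size=128 and the layer sizes are hypothetical examples.
    weight_bits = 4
    group_size = 128
    pack_factor = 32 // weight_bits          # eight 4-bit values per 32-bit word
    in_features, out_features = 4096, 11008

    qweight_shape = (in_features, out_features // pack_factor)                # int32
    qzeros_shape = (in_features // group_size, out_features // pack_factor)   # int32
    scales_shape = (in_features // group_size, out_features)                  # fp16/bf16

    assert qweight_shape == (4096, 1376) and scales_shape == (32, 11008)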
/csrc/quantization/awq/dequantize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Adapted from https://github.com/mit-han-lab/llm-awq 3 | Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h 4 | @article{lin2023awq, 5 | title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, 6 | author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, 7 | journal={arXiv}, 8 | year={2023} 9 | } 10 | */ 11 | 12 | #pragma once 13 | 14 | namespace vllm { 15 | namespace awq { 16 | 17 | __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) 18 | { 19 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 20 | assert(false); 21 | #else 22 | uint4 result; 23 | 24 | uint32_t* h = reinterpret_cast<uint32_t*>(&result); 25 | uint32_t const i4s = reinterpret_cast<uint32_t const&>(source); 26 | 27 | // First, we extract the i4s and construct an intermediate fp16 number. 28 | static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; 29 | static constexpr uint32_t BOTTOM_MASK = 0x000f000f; 30 | static constexpr uint32_t TOP_MASK = 0x00f000f0; 31 | static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; 32 | 33 | // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing 34 | // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. 35 | // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and 36 | // elt_67 to fp16 without having to shift them to the bottom bits before hand. 37 | 38 | // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue 39 | // immediately before required. 40 | const uint32_t top_i4s = i4s >> 8; 41 | // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 42 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 43 | : "=r"(h[0]) 44 | : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 45 | // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 46 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 47 | : "=r"(h[1]) 48 | : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 49 | // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 50 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 51 | : "=r"(h[2]) 52 | : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 53 | // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 54 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 55 | : "=r"(h[3]) 56 | : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 57 | 58 | // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the 59 | // half2 ctor. In this case, I chose performance reliability over code readability. 60 | 61 | // This is the half2 {1032, 1032} represented as an integer. 62 | // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; 63 | // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] 64 | static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; 65 | // This is the half2 {1 / 16, 1 / 16} represented as an integer. 66 | static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; 67 | // This is the half2 {-72, -72} represented as an integer. 68 | // static constexpr uint32_t NEG_72 = 0xd480d480; 69 | // Haotian: Let's use {-64, -64}.
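// Why this works: 0x6400 is the fp16 value 1024 (exponent 2^10, zero mantissa), so OR-ing
// a 4-bit value v into the low mantissa bits produces the fp16 number 1024 + v, and a plain
// subtraction of 1024 recovers v. The odd nibbles were masked in place (bits 4-7), so they
// come out as 1024 + 16*v; multiplying by 1/16 gives 64 + v, and the fma below folds the
// *1/16 and the -64 into a single fma.rn.f16x2 instruction.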
70 | static constexpr uint32_t NEG_64 = 0xd400d400; 71 | 72 | // Finally, we construct the output numbers. 73 | // Convert elt_01 74 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); 75 | // Convert elt_23 76 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 77 | // Convert elt_45 78 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); 79 | // Convert elt_67 80 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 81 | 82 | return result; 83 | #endif 84 | } 85 | 86 | } // namespace awq 87 | } // namespace vllm 88 | -------------------------------------------------------------------------------- /vllm/entrypoints/api_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from typing import AsyncGenerator 4 | 5 | from fastapi import FastAPI, Request 6 | from fastapi.responses import JSONResponse, Response, StreamingResponse 7 | import uvicorn 8 | 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | from vllm.sampling_params import SamplingParams 12 | from vllm.utils import random_uuid 13 | 14 | TIMEOUT_KEEP_ALIVE = 5 # seconds. 15 | TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. 16 | app = FastAPI() 17 | engine = None 18 | import base64, json 19 | 20 | 21 | @app.post("/generate") 22 | async def generate(request: Request) -> Response: 23 | """Generate completion for the request. 24 | 25 | The request should be a JSON object with the following fields: 26 | - prompt: the prompt to use for the generation. 27 | - stream: whether to stream the results or not. 28 | - other fields: the sampling parameters (See `SamplingParams` for details). 29 | """ 30 | request_dict = await request.json() 31 | prompt = request_dict.pop("prompt") 32 | prompt_token_ids = None 33 | stream = request_dict.pop("stream", False) 34 | sampling_params = SamplingParams(**request_dict) 35 | request_id = request_dict.pop("request_id", None) 36 | if not request_id: 37 | request_id = random_uuid() 38 | 39 | dummy_token_ids = request_dict.pop("dummy_token_ids", False) 40 | if dummy_token_ids: 41 | prompt_token_ids = [0] * int(prompt) 42 | prompt = None 43 | 44 | results_generator = engine.generate(prompt, sampling_params, request_id, prompt_token_ids) 45 | 46 | # Streaming case 47 | async def stream_results() -> AsyncGenerator[bytes, None]: 48 | async for request_output in results_generator: 49 | prompt = request_output.prompt 50 | text_outputs = [ 51 | prompt + output.text for output in request_output.outputs 52 | ] 53 | ret = {"text": text_outputs} 54 | yield (json.dumps(ret) + "\0").encode("utf-8") 55 | 56 | if stream: 57 | return StreamingResponse(stream_results()) 58 | 59 | # Non-streaming case 60 | final_output = None 61 | async for request_output in results_generator: 62 | if await request.is_disconnected(): 63 | # Abort the request if the client disconnects. 
64 | await engine.abort(request_id) 65 | return Response(status_code=499) 66 | final_output = request_output 67 | 68 | assert final_output is not None 69 | prompt = final_output.prompt 70 | text_outputs = [prompt + output.text for output in final_output.outputs] 71 | ret = {"text": text_outputs} 72 | return JSONResponse(ret) 73 | 74 | @app.post("/resume") 75 | async def resume(request: Request) -> Response: 76 | request_dict = await request.json() 77 | request_id = request_dict.pop("request_id") 78 | api_return_length = request_dict.pop("api_return_length") 79 | api_return_tokens = [0] * api_return_length 80 | 81 | results_generator = engine.resume_request_single(request_id, api_return_tokens) 82 | 83 | # Non-streaming case 84 | final_output = None 85 | async for request_output in results_generator: 86 | if await request.is_disconnected(): 87 | # Abort the request if the client disconnects. 88 | await engine.abort(request_id) 89 | return Response(status_code=499) 90 | final_output = request_output 91 | 92 | assert final_output is not None 93 | prompt = final_output.prompt 94 | text_outputs = [prompt + output.text for output in final_output.outputs] 95 | ret = {"text": text_outputs} 96 | return JSONResponse(ret) 97 | 98 | if __name__ == "__main__": 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument("--host", type=str, default="localhost") 101 | parser.add_argument("--port", type=int, default=8000) 102 | parser = AsyncEngineArgs.add_cli_args(parser) 103 | args = parser.parse_args() 104 | 105 | engine_args = AsyncEngineArgs.from_cli_args(args) 106 | engine = AsyncLLMEngine.from_engine_args(engine_args) 107 | 108 | uvicorn.run(app, 109 | host=args.host, 110 | port=args.port, 111 | log_level="debug", 112 | timeout_keep_alive=TIMEOUT_KEEP_ALIVE) 113 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | benchmarks/exp_*/ 2 | benchmarks/experiment_*/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
163 | .idea/ 164 | 165 | # VSCode 166 | .vscode/ 167 | 168 | # DS Store 169 | .DS_Store 170 | 171 | # Results 172 | *.csv 173 | 174 | # Python pickle files 175 | *.pkl 176 | 177 | # Sphinx documentation 178 | _build/ 179 | 180 | # vim swap files 181 | *.swo 182 | *.swp 183 | ShareGPT_V3_unfiltered_cleaned_split.json 184 | *.nsys-rep 185 | *.pt 186 | 187 | benchmarks/exp_logs/ 188 | benchmarks/exp_version2/ 189 | 190 | real/ 191 | cswap/ 192 | merge*.json 193 | merged_results/ 194 | merged_results_13B/ 195 | new_real_baseline/ -------------------------------------------------------------------------------- /csrc/activation_kernels.cu: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <ATen/cuda/CUDAContext.h> 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template<typename T> 9 | __device__ __forceinline__ T silu(const T& x) { 10 | // x * sigmoid(x) 11 | return (T) (((float) x) / (1.0f + expf((float) -x))); 12 | } 13 | 14 | template<typename scalar_t> 15 | __global__ void silu_and_mul_kernel( 16 | scalar_t* __restrict__ out, // [num_tokens, d] 17 | const scalar_t* __restrict__ input, // [num_tokens, 2, d] 18 | const int d) { 19 | const int token_idx = blockIdx.x; 20 | for (int idx = threadIdx.x; idx < d; idx += blockDim.x) { 21 | const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]); 22 | const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]); 23 | out[token_idx * d + idx] = silu(x) * y; 24 | } 25 | } 26 | 27 | } // namespace vllm 28 | 29 | void silu_and_mul( 30 | torch::Tensor& out, // [num_tokens, d] 31 | torch::Tensor& input) // [num_tokens, 2 * d] 32 | { 33 | int num_tokens = input.size(0); 34 | int d = input.size(1) / 2; 35 | 36 | dim3 grid(num_tokens); 37 | dim3 block(std::min(d, 1024)); 38 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 39 | VLLM_DISPATCH_FLOATING_TYPES( 40 | input.scalar_type(), 41 | "silu_and_mul_kernel", 42 | [&] { 43 | vllm::silu_and_mul_kernel<scalar_t><<<grid, block, 0, stream>>>( 44 | out.data_ptr<scalar_t>(), 45 | input.data_ptr<scalar_t>(), 46 | d); 47 | }); 48 | } 49 | 50 | namespace vllm { 51 | 52 | // Element-wise activation kernel template. 53 | template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)> 54 | __global__ void activation_kernel( 55 | scalar_t* __restrict__ out, // [num_tokens, d] 56 | const scalar_t* __restrict__ input, // [num_tokens, d] 57 | const int d) { 58 | const int token_idx = blockIdx.x; 59 | for (int idx = threadIdx.x; idx < d; idx += blockDim.x) { 60 | const scalar_t x = __ldg(&input[token_idx * d + idx]); 61 | out[token_idx * d + idx] = ACT_FN(x); 62 | } 63 | } 64 | 65 | } // namespace vllm 66 | 67 | // Launch element-wise activation kernel.
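// The macro below instantiates activation_kernel for a given device function:
// VLLM_DISPATCH_FLOATING_TYPES switches on the tensor's dtype to bind scalar_t,
// one thread block is launched per token, and up to 1024 threads stride over the
// hidden dimension d, applying KERNEL element-wise on the current CUDA stream.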
68 | #define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ 69 | int num_tokens = input.size(0); \ 70 | int d = input.size(1); \ 71 | dim3 grid(num_tokens); \ 72 | dim3 block(std::min(d, 1024)); \ 73 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ 74 | VLLM_DISPATCH_FLOATING_TYPES( \ 75 | input.scalar_type(), \ 76 | "activation_kernel", \ 77 | [&] { \ 78 | vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \ 79 | out.data_ptr<scalar_t>(), \ 80 | input.data_ptr<scalar_t>(), \ 81 | d); \ 82 | }); 83 | 84 | namespace vllm { 85 | 86 | template<typename T> 87 | __device__ __forceinline__ T gelu_new_kernel(const T& x) { 88 | const float x3 = (float) (x * x * x); 89 | const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3)))); 90 | return ((T) 0.5) * x * (((T) 1.0) + t); 91 | } 92 | 93 | template<typename T> 94 | __device__ __forceinline__ T gelu_fast_kernel(const T& x) { 95 | const float f = (float) x; 96 | const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x)); 97 | return ((T) 0.5) * x * (((T) 1.0) + t); 98 | } 99 | 100 | } // namespace vllm 101 | 102 | void gelu_new( 103 | torch::Tensor& out, // [num_tokens, d] 104 | torch::Tensor& input) // [num_tokens, d] 105 | { 106 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); 107 | } 108 | 109 | void gelu_fast( 110 | torch::Tensor& out, // [num_tokens, d] 111 | torch::Tensor& input) // [num_tokens, d] 112 | { 113 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); 114 | } 115 | -------------------------------------------------------------------------------- /vllm/model_executor/input_metadata.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | 3 | import torch 4 | from xformers.ops import AttentionBias 5 | 6 | from vllm.sampling_params import SamplingParams 7 | from vllm.sequence import SequenceData 8 | 9 | 10 | class InputMetadata: 11 | """Metadata for input sequences. Used for PagedAttention. 12 | 13 | Args: 14 | seq_groups: List of (seq_ids, sampling_params). 15 | seq_data: Seq_id -> SequenceData. 16 | prompt_lens: Lengths of prompts. 17 | slot_mapping: The address to write the new KV to of each token. 18 | context_lens: the length of attention context for each generation token. 19 | max_context_len: The maximum context length. 20 | block_tables: The block tables.
(Seq id -> list of physical block) 21 | """ 22 | 23 | def __init__( 24 | self, 25 | seq_groups: List[Tuple[List[int], SamplingParams]], 26 | seq_data: Dict[int, SequenceData], 27 | prompt_lens: List[int], 28 | slot_mapping: torch.Tensor, 29 | context_lens: torch.Tensor, 30 | max_context_len: int, 31 | block_tables: torch.Tensor, 32 | running_query_lens: List[int], 33 | atoms: torch.Tensor, 34 | is_generating_new_token: List[bool], 35 | sliding_window: Optional[int] = None, 36 | ) -> None: 37 | self.seq_groups = seq_groups 38 | self.seq_data = seq_data 39 | self.prompt_lens = prompt_lens 40 | self.slot_mapping = slot_mapping 41 | self.context_lens = context_lens 42 | self.max_context_len = max_context_len 43 | self.running_query_lens = running_query_lens 44 | self.block_tables = block_tables 45 | self.atoms = atoms 46 | self.is_generating_new_token = is_generating_new_token 47 | 48 | self.to_cache = None 49 | if sliding_window is not None: 50 | # We need to keep the positions of sliding windows within 51 | # the key / value tables, this is helpful to know which 52 | # elements we need to cache and where 53 | to_cache, start_idx = [], 0 54 | for prompt_len in self.prompt_lens: 55 | to_cache.extend( 56 | range( 57 | start_idx + max(0, prompt_len - sliding_window), 58 | start_idx + prompt_len, 59 | )) 60 | start_idx += prompt_len 61 | to_cache.extend(range(start_idx, slot_mapping.shape[0])) 62 | self.to_cache = torch.tensor(to_cache, 63 | dtype=torch.int32, 64 | device=self.slot_mapping.device) 65 | 66 | self.num_prompts = len(prompt_lens) 67 | self.num_prompt_tokens = sum(prompt_lens) 68 | self.num_generation_tokens = sum(running_query_lens) 69 | self.num_valid_tokens = self.num_prompt_tokens + self.num_generation_tokens 70 | assert len(slot_mapping) == self.num_valid_tokens 71 | if block_tables.numel() > 0: 72 | self.max_num_blocks_per_seq = block_tables.shape[1] 73 | else: 74 | self.max_num_blocks_per_seq = 0 75 | # NOTE: no longer true if mixed with multi-token kernel usage 76 | # assert block_tables.shape[0] == self.num_generation_tokens 77 | # assert context_lens.shape[0] == self.num_generation_tokens 78 | 79 | if running_query_lens: 80 | assert len(running_query_lens) == len(seq_data) - self.num_prompts 81 | 82 | # Set during the execution of the first attention op. 83 | self.attn_bias: List[AttentionBias] = [] 84 | self.multi_token_attn_bias: List[AttentionBias] = [] 85 | 86 | def __repr__(self) -> str: 87 | # Print only useful metadata. 
88 | return (f'InputMetadata(' 89 | # f'seq_groups={self.seq_groups}, ' 90 | f'num_valid_tokens={self.num_valid_tokens}, ' 91 | f'num_prompt_tokens={self.num_prompt_tokens}, ' 92 | f'num_prompts={self.num_prompts}, ' 93 | f'prompt_lens={self.prompt_lens}, ' 94 | f'num_generation_tokens={self.num_generation_tokens}, ' 95 | f'context_lens={self.context_lens}, ' 96 | f'max_context_len={self.max_context_len}), ' 97 | f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, ' 98 | f'block_tables={self.block_tables}), ' 99 | # f'slot_mapping={self.slot_mapping}, ' 100 | f'running_query_lens={self.running_query_lens}, ' 101 | f'is_generating_new_token={self.is_generating_new_token}') 102 | 103 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | import contextlib 3 | from typing import Type 4 | 5 | import torch 6 | import torch.nn as nn 7 | from transformers import PretrainedConfig 8 | 9 | from vllm.config import ModelConfig 10 | from vllm.model_executor.models import * # pylint: disable=wildcard-import 11 | from vllm.model_executor.weight_utils import (get_quant_config, 12 | initialize_dummy_weights) 13 | 14 | # TODO(woosuk): Lazy-load the model classes. 15 | _MODEL_REGISTRY = { 16 | "AquilaModel": AquilaForCausalLM, 17 | "BaiChuanForCausalLM": BaiChuanForCausalLM, # baichuan-7b 18 | "BaichuanForCausalLM": BaichuanForCausalLM, # baichuan-13b 19 | "BloomForCausalLM": BloomForCausalLM, 20 | "FalconForCausalLM": FalconForCausalLM, 21 | "GPT2LMHeadModel": GPT2LMHeadModel, 22 | "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM, 23 | "GPTJForCausalLM": GPTJForCausalLM, 24 | "GPTNeoXForCausalLM": GPTNeoXForCausalLM, 25 | "InternLMForCausalLM": InternLMForCausalLM, 26 | "LlamaForCausalLM": LlamaForCausalLM, 27 | "LLaMAForCausalLM": LlamaForCausalLM, # For decapoda-research/llama-* 28 | "MistralForCausalLM": MistralForCausalLM, 29 | "MPTForCausalLM": MPTForCausalLM, 30 | "OPTForCausalLM": OPTForCausalLM, 31 | "QWenLMHeadModel": QWenLMHeadModel, 32 | "RWForCausalLM": FalconForCausalLM, 33 | } 34 | 35 | # FIXME(woosuk): Remove this once all models support quantization. 36 | _MODEL_CLASSES_SUPPORT_QUANTIZATION = [ 37 | LlamaForCausalLM, 38 | ] 39 | 40 | 41 | @contextlib.contextmanager 42 | def _set_default_torch_dtype(dtype: torch.dtype): 43 | """Sets the default torch dtype to the given dtype.""" 44 | old_dtype = torch.get_default_dtype() 45 | torch.set_default_dtype(dtype) 46 | yield 47 | torch.set_default_dtype(old_dtype) 48 | 49 | 50 | def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: 51 | architectures = getattr(config, "architectures", []) 52 | for arch in architectures: 53 | if arch in _MODEL_REGISTRY: 54 | return _MODEL_REGISTRY[arch] 55 | raise ValueError( 56 | f"Model architectures {architectures} are not supported for now. " 57 | f"Supported architectures: {list(_MODEL_REGISTRY.keys())}") 58 | 59 | 60 | def get_model(model_config: ModelConfig) -> nn.Module: 61 | model_class = _get_model_architecture(model_config.hf_config) 62 | 63 | # Get the quantization config. 
64 | quant_config = None 65 | if model_config.quantization is not None: 66 | if model_class not in _MODEL_CLASSES_SUPPORT_QUANTIZATION: 67 | raise ValueError( 68 | f"Quantization is not supported for {model_class}.") 69 | quant_config = get_quant_config(model_config.quantization, 70 | model_config.model, 71 | model_config.download_dir) 72 | capability = torch.cuda.get_device_capability() 73 | capability = capability[0] * 10 + capability[1] 74 | if capability < quant_config.get_min_capability(): 75 | raise ValueError( 76 | f"The quantization method {model_config.quantization} is not " 77 | "supported for the current GPU. " 78 | f"Minimum capability: {quant_config.get_min_capability()}. " 79 | f"Current capability: {capability}.") 80 | supported_dtypes = quant_config.get_supported_act_dtypes() 81 | if model_config.dtype not in supported_dtypes: 82 | raise ValueError( 83 | f"{model_config.dtype} is not supported for quantization " 84 | f"method {model_config.quantization}. Supported dtypes: " 85 | f"{supported_dtypes}") 86 | 87 | with _set_default_torch_dtype(model_config.dtype): 88 | # Create a model instance. 89 | # The weights will be initialized as empty tensors. 90 | if model_class in _MODEL_CLASSES_SUPPORT_QUANTIZATION: 91 | model = model_class(model_config.hf_config, quant_config) 92 | else: 93 | model = model_class(model_config.hf_config) 94 | if model_config.load_format == "dummy": 95 | model = model.cuda() 96 | # NOTE(woosuk): For accurate performance evaluation, we assign 97 | # random values to the weights. 98 | initialize_dummy_weights(model) 99 | else: 100 | # Load the weights from the cached or downloaded files. 101 | model.load_weights(model_config.model, model_config.download_dir, 102 | model_config.load_format, model_config.revision) 103 | model = model.cuda() 104 | return model.eval() 105 | -------------------------------------------------------------------------------- /csrc/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <ATen/cuda/CUDAContext.h> 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template<typename scalar_t, bool IS_NEOX> 9 | inline __device__ void apply_rotary_embedding( 10 | scalar_t* __restrict__ arr, 11 | const scalar_t* __restrict__ cos_ptr, 12 | const scalar_t* __restrict__ sin_ptr, 13 | int rot_offset, 14 | int embed_dim) 15 | { 16 | int x_index, y_index; 17 | scalar_t cos, sin; 18 | if (IS_NEOX) { 19 | // GPT-NeoX style rotary embedding. 20 | x_index = rot_offset; 21 | y_index = embed_dim + rot_offset; 22 | cos = __ldg(cos_ptr + x_index); 23 | sin = __ldg(sin_ptr + x_index); 24 | } else { 25 | // GPT-J style rotary embedding. 26 | x_index = 2 * rot_offset; 27 | y_index = 2 * rot_offset + 1; 28 | cos = __ldg(cos_ptr + x_index / 2); 29 | sin = __ldg(sin_ptr + x_index / 2); 30 | } 31 | 32 | const scalar_t x = arr[x_index]; 33 | const scalar_t y = arr[y_index]; 34 | arr[x_index] = x * cos - y * sin; 35 | arr[y_index] = y * cos + x * sin; 36 | } 37 | 38 | template<typename scalar_t, bool IS_NEOX> 39 | __global__ void rotary_embedding_kernel( 40 | const int64_t* __restrict__ positions, // [num_tokens] 41 | scalar_t* __restrict__ query, // [num_tokens, num_heads, head_size] 42 | scalar_t* __restrict__ key, // [num_tokens, num_kv_heads, head_size] 43 | const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] 44 | const int rot_dim, 45 | const int query_stride, 46 | const int key_stride, 47 | const int num_heads, 48 | const int num_kv_heads, 49 | const int head_size) { 50 | // Each thread block is responsible for one token.
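// Threads then stride over (head, rotation-pair) indices: for each pair (x, y) the helper
// above applies the rotation x' = x*cos - y*sin, y' = y*cos + x*sin, with cos/sin read from
// the cached table at this token's position. In NeoX style the pair is (i, i + rot_dim/2)
// within a head; in GPT-J style it is the adjacent elements (2i, 2i + 1).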
51 | const int token_idx = blockIdx.x; 52 | int64_t pos = positions[token_idx]; 53 | const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; 54 | 55 | const int embed_dim = rot_dim / 2; 56 | const scalar_t* cos_ptr = cache_ptr; 57 | const scalar_t* sin_ptr = cache_ptr + embed_dim; 58 | 59 | const int nq = num_heads * embed_dim; 60 | for (int i = threadIdx.x; i < nq; i += blockDim.x) { 61 | const int head_idx = i / embed_dim; 62 | const int token_head = token_idx * query_stride + head_idx * head_size; 63 | const int rot_offset = i % embed_dim; 64 | apply_rotary_embedding<scalar_t, IS_NEOX>(query + token_head, cos_ptr, 65 | sin_ptr, rot_offset, embed_dim); 66 | } 67 | 68 | const int nk = num_kv_heads * embed_dim; 69 | for (int i = threadIdx.x; i < nk; i += blockDim.x) { 70 | const int head_idx = i / embed_dim; 71 | const int token_head = token_idx * key_stride + head_idx * head_size; 72 | const int rot_offset = i % embed_dim; 73 | apply_rotary_embedding<scalar_t, IS_NEOX>(key + token_head, cos_ptr, 74 | sin_ptr, rot_offset, embed_dim); 75 | } 76 | } 77 | 78 | } // namespace vllm 79 | 80 | void rotary_embedding( 81 | torch::Tensor& positions, // [num_tokens] 82 | torch::Tensor& query, // [num_tokens, num_heads * head_size] 83 | torch::Tensor& key, // [num_tokens, num_kv_heads * head_size] 84 | int head_size, 85 | torch::Tensor& cos_sin_cache, // [max_position, rot_dim] 86 | bool is_neox) { 87 | int num_tokens = query.size(0); 88 | int rot_dim = cos_sin_cache.size(1); 89 | int num_heads = query.size(1) / head_size; 90 | int num_kv_heads = key.size(1) / head_size; 91 | int query_stride = query.stride(0); 92 | int key_stride = key.stride(0); 93 | 94 | dim3 grid(num_tokens); 95 | dim3 block(std::min(num_heads * rot_dim / 2, 512)); 96 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 97 | VLLM_DISPATCH_FLOATING_TYPES( 98 | query.scalar_type(), 99 | "rotary_embedding", 100 | [&] { 101 | if (is_neox) { 102 | vllm::rotary_embedding_kernel<scalar_t, true><<<grid, block, 0, stream>>>( 103 | positions.data_ptr<int64_t>(), 104 | query.data_ptr<scalar_t>(), 105 | key.data_ptr<scalar_t>(), 106 | cos_sin_cache.data_ptr<scalar_t>(), 107 | rot_dim, 108 | query_stride, 109 | key_stride, 110 | num_heads, 111 | num_kv_heads, 112 | head_size); 113 | } else { 114 | vllm::rotary_embedding_kernel<scalar_t, false><<<grid, block, 0, stream>>>( 115 | positions.data_ptr<int64_t>(), 116 | query.data_ptr<scalar_t>(), 117 | key.data_ptr<scalar_t>(), 118 | cos_sin_cache.data_ptr<scalar_t>(), 119 | rot_dim, 120 | query_stride, 121 | key_stride, 122 | num_heads, 123 | num_kv_heads, 124 | head_size); 125 | } 126 | }); 127 | } 128 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/simulator.py: -------------------------------------------------------------------------------- 1 | """A layer that simulates the next token.""" 2 | from typing import Dict, List, Optional, Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | import random 7 | from vllm import utils 8 | from vllm.sampling_params import SamplingParams, SamplingType 9 | from vllm.model_executor.input_metadata import InputMetadata 10 | from vllm.sequence import SequenceOutputs, SequenceData, SamplerOutput 11 | 12 | DUMMY_TOKEN = 31548 # "Ġdummy" 13 | 14 | def _greedy_sample( 15 | selected_seq_groups: List[Tuple[List[int], SamplingParams]], 16 | seq_data: Dict[int, SequenceData], 17 | ) -> List[Tuple[List[int], List[int]]]: 18 | results = [] 19 | for seq_group in selected_seq_groups: 20 | seq_ids, sampling_params = seq_group 21 | num_parent_seqs = len(seq_ids) 22 | assert num_parent_seqs == 1, ( 23 | "Greedy sampling should have only one seq.") 24 | parent_ids = [0] 25 | next_token_ids
= [_sample(sampling_params, seq_data[seq_ids[0]])] 26 | results.append((next_token_ids, parent_ids)) 27 | return results 28 | 29 | def _random_sample( 30 | selected_seq_groups: List[Tuple[List[int], SamplingParams]], 31 | is_prompts: List[bool], 32 | seq_data: Dict[int, SequenceData], 33 | ) -> List[Tuple[List[int], List[int]]]: 34 | results = [] 35 | for seq_group, is_prompt in zip(selected_seq_groups, is_prompts): 36 | seq_ids, sampling_params = seq_group 37 | num_parent_seqs = len(seq_ids) 38 | if is_prompt: 39 | # Prompt phase. 40 | assert num_parent_seqs == 1, ( 41 | "Prompt input should have only one seq.") 42 | parent_ids = [0] * sampling_params.best_of 43 | next_token_ids = [_sample(sampling_params, seq_data[seq_ids[0]])] * \ 44 | sampling_params.best_of 45 | else: 46 | # Generation phase. 47 | parent_ids = list(range(num_parent_seqs)) 48 | next_token_ids = [_sample(sampling_params, seq_data[seq_id]) 49 | for seq_id in seq_ids] 50 | results.append((next_token_ids, parent_ids)) 51 | return results 52 | 53 | class Simulator(nn.Module): 54 | 55 | def forward(self, 56 | input_metadata: InputMetadata) -> SamplerOutput: 57 | categorized_seq_group_ids = {t: [] for t in SamplingType} 58 | category_num_tokens = {t: 0 for t in SamplingType} 59 | for i, seq_group in enumerate(input_metadata.seq_groups): 60 | seq_ids, sampling_params = seq_group 61 | sampling_type = sampling_params.sampling_type 62 | categorized_seq_group_ids[sampling_type].append(i) 63 | num_seqs = len(seq_ids) 64 | category_num_tokens[sampling_type] += num_seqs 65 | 66 | seq_outputs_dict: Dict[int, List[SequenceOutputs]] = {} 67 | for sampling_type in SamplingType: 68 | seq_group_ids = categorized_seq_group_ids[sampling_type] 69 | seq_groups = [input_metadata.seq_groups[i] for i in seq_group_ids] 70 | is_prompts = [i < input_metadata.num_prompts for i in seq_group_ids] 71 | num_tokens = category_num_tokens[sampling_type] 72 | if num_tokens == 0: 73 | continue 74 | if sampling_type == SamplingType.GREEDY: 75 | sample_results = _greedy_sample(seq_groups, input_metadata.seq_data) 76 | elif sampling_type == SamplingType.RANDOM: 77 | sample_results = _random_sample(seq_groups, is_prompts, input_metadata.seq_data) 78 | else: 79 | raise NotImplementedError("Beam search is not supported yet") 80 | 81 | # build output 82 | for seq_group_id, seq_group, sample_result in zip( 83 | seq_group_ids, seq_groups, sample_results): 84 | seq_ids, sampling_params = seq_group 85 | next_token_ids, parent_ids = sample_result 86 | num_results = len(next_token_ids) 87 | num_parent_seqs = len(seq_ids) 88 | seq_outputs: List[SequenceOutputs] = [] 89 | for parent_id, next_token_id in zip( 90 | parent_ids, next_token_ids): 91 | seq_outputs.append( 92 | SequenceOutputs(seq_ids[parent_id], next_token_id, {next_token_id: 0.0})) 93 | seq_outputs_dict[seq_group_id] = seq_outputs 94 | return [seq_outputs_dict[i] for i in range(len(input_metadata.seq_groups))] 95 | 96 | def _sample(sampling_params: SamplingParams, seq_data: SequenceData) -> int: 97 | # seq_data should be updated at the master worker 98 | if seq_data.generation_counter == sampling_params.api_invoke_interval: 99 | if sampling_params.api_max_calls != 0: 100 | # seq_data.generation_counter = 0 101 | return utils.get_api_stop_token() 102 | # seq_data.generation_counter += 1 103 | return DUMMY_TOKEN -------------------------------------------------------------------------------- /vllm/engine/ray_utils.py: -------------------------------------------------------------------------------- 1 | import 
socket 2 | from typing import Optional, Tuple, TYPE_CHECKING 3 | 4 | from vllm.config import ParallelConfig 5 | from vllm.logger import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | try: 10 | import ray 11 | from ray.air.util.torch_dist import TorchDistributedWorker 12 | 13 | class RayWorker(TorchDistributedWorker): 14 | """Ray wrapper for vllm.worker.Worker, allowing Worker to be 15 | lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" 16 | 17 | def __init__(self, init_cached_hf_modules=False) -> None: 18 | if init_cached_hf_modules: 19 | # pylint: disable=import-outside-toplevel 20 | from transformers.dynamic_module_utils import init_hf_modules 21 | init_hf_modules() 22 | self.worker = None 23 | 24 | def init_worker(self, worker_init_fn): 25 | self.worker = worker_init_fn() 26 | 27 | def __getattr__(self, name): 28 | return getattr(self.worker, name) 29 | 30 | def execute_method(self, method, *args, **kwargs): 31 | executor = getattr(self, method) 32 | return executor(*args, **kwargs) 33 | 34 | except ImportError as e: 35 | logger.warning(f"Failed to import Ray with {e!r}. " 36 | "For distributed inference, please install Ray with " 37 | "`pip install ray pandas pyarrow`.") 38 | ray = None 39 | TorchDistributedWorker = None 40 | RayWorker = None # pylint: disable=invalid-name 41 | 42 | if TYPE_CHECKING: 43 | from ray.util.placement_group import PlacementGroup 44 | 45 | 46 | def get_open_port(): 47 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 48 | s.bind(("", 0)) 49 | return s.getsockname()[1] 50 | 51 | 52 | def initialize_cluster( 53 | parallel_config: ParallelConfig, 54 | engine_use_ray: bool = False, 55 | ray_address: Optional[str] = None, 56 | ) -> Tuple[str, Optional["PlacementGroup"]]: 57 | """Initialize the distributed cluster probably with Ray. 58 | 59 | Args: 60 | parallel_config: The configurations for parallel execution. 61 | engine_use_ray: Whether to use Ray for async engine. 62 | ray_address: The address of the Ray cluster. If None, uses 63 | the default Ray cluster address. 64 | 65 | Returns: 66 | A tuple of (`distributed_init_method`, `placement_group`). The 67 | `distributed_init_method` is the address for initializing the 68 | distributed backend. `placement_group` includes the specification 69 | of the resources for each distributed worker. 70 | """ 71 | if parallel_config.worker_use_ray or engine_use_ray: 72 | if ray is None: 73 | raise ImportError( 74 | "Ray is not installed. Please install Ray to use distributed " 75 | "serving.") 76 | # Connect to a ray cluster. 77 | ray.init(address=ray_address, ignore_reinit_error=True) 78 | 79 | if not parallel_config.worker_use_ray: 80 | # Initialize cluster locally. 81 | port = get_open_port() 82 | # We need to setup the distributed init method to make sure 83 | # the distributed megatron code (e.g., get world size) works correctly. 84 | distributed_init_method = f"tcp://localhost:{port}" 85 | return distributed_init_method, None 86 | 87 | current_placement_group = ray.util.get_current_placement_group() 88 | if current_placement_group: 89 | # We are in a placement group 90 | bundles = current_placement_group.bundle_specs 91 | # Verify that we can use the placement group. 
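# Every bundle may contribute at most one GPU; count the GPU bundles and make
# sure there are at least world_size of them, so each worker can be scheduled
# onto its own single-GPU bundle.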
92 | gpu_bundles = 0 93 | for bundle in bundles: 94 | bundle_gpus = bundle.get("GPU", 0) 95 | if bundle_gpus > 1: 96 | raise ValueError( 97 | "Placement group bundle cannot have more than 1 GPU.") 98 | if bundle_gpus: 99 | gpu_bundles += 1 100 | if parallel_config.world_size > gpu_bundles: 101 | raise ValueError( 102 | "The number of required GPUs exceeds the total number of " 103 | "available GPUs in the placement group.") 104 | else: 105 | num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) 106 | if parallel_config.world_size > num_gpus_in_cluster: 107 | raise ValueError( 108 | "The number of required GPUs exceeds the total number of " 109 | "available GPUs in the cluster.") 110 | # Create a new placement group 111 | current_placement_group = ray.util.placement_group([{ 112 | "GPU": 1 113 | }] * parallel_config.world_size) 114 | # Wait until PG is ready - this will block until all 115 | # requested resources are available, and will timeout 116 | # if they cannot be provisioned. 117 | ray.get(current_placement_group.ready(), timeout=1800) 118 | 119 | return None, current_placement_group 120 | -------------------------------------------------------------------------------- /docs/source/models/adding_model.rst: -------------------------------------------------------------------------------- 1 | .. _adding_a_new_model: 2 | 3 | Adding a New Model 4 | ================== 5 | 6 | This document provides a high-level guide on integrating a `HuggingFace Transformers `_ model into vLLM. 7 | 8 | .. note:: 9 | The complexity of adding a new model depends heavily on the model's architecture. 10 | The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. 11 | However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. 12 | 13 | .. tip:: 14 | If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub `_ repository. 15 | We will be happy to help you out! 16 | 17 | 18 | 0. Fork the vLLM repository 19 | -------------------------------- 20 | 21 | Start by forking our `GitHub `_ repository and then :ref:`build it from source `. 22 | This gives you the ability to modify the codebase and test your model. 23 | 24 | 25 | 1. Bring your model code 26 | ------------------------ 27 | 28 | Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models `_ directory. 29 | For instance, vLLM's `OPT model `_ was adpated from the HuggingFace's `modeling_opt.py `_ file. 30 | 31 | .. warning:: 32 | When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. 33 | 34 | 35 | 2. Rewrite the :code:`forward` methods 36 | -------------------------------------- 37 | 38 | Next, you need to rewrite the :code:`forward` methods of your model by following these steps: 39 | 40 | 1. Remove any unnecessary code, such as the code only used for training. 41 | 2. Change the input parameters: 42 | 43 | .. 
code-block:: diff 44 | 45 | def forward( 46 | self, 47 | input_ids: torch.Tensor, 48 | - attention_mask: Optional[torch.Tensor] = None, 49 | - position_ids: Optional[torch.LongTensor] = None, 50 | - past_key_values: Optional[List[torch.FloatTensor]] = None, 51 | - inputs_embeds: Optional[torch.FloatTensor] = None, 52 | - labels: Optional[torch.LongTensor] = None, 53 | - use_cache: Optional[bool] = None, 54 | - output_attentions: Optional[bool] = None, 55 | - output_hidden_states: Optional[bool] = None, 56 | - return_dict: Optional[bool] = None, 57 | -) -> Union[Tuple, CausalLMOutputWithPast]: 58 | + positions: torch.Tensor, 59 | + kv_caches: List[KVCache], 60 | + input_metadata: InputMetadata, 61 | + cache_events: Optional[List[torch.cuda.Event]], 62 | +) -> SamplerOutput: 63 | 64 | 3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors. 65 | 4. Replace the attention operation with either :code:`GPTPagedAttention` or :code:`GPTNeoXPagedAttention`, depending on the model's architecture. 66 | 67 | .. note:: 68 | Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. 69 | If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. 70 | 71 | 72 | 3. (Optional) Implement tensor parallelism support 73 | -------------------------------------------------- 74 | 75 | If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. 76 | To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. 77 | For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. 78 | When it comes to the linear layers, you should use either :code:`RowParallelLinear` or :code:`ColumnParallelLinear`. 79 | Typically, :code:`ColumnParallelLinear` is used for QKV linear layers and the first linear layers of the MLP blocks. 80 | For the remaining linear layers, :code:`RowParallelLinear` is used. 81 | 82 | 83 | 4. Implement the weight loading logic 84 | ------------------------------------- 85 | 86 | You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class. 87 | This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. 88 | While the process is straightforward for most layers, the tensor-parallel layers necessitate some additional care as their weights should be partitioned to multiple GPUs. 89 | 90 | 91 | 5. Register your model 92 | ---------------------- 93 | 94 | Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py `_ and register it to the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py `_. 95 | -------------------------------------------------------------------------------- /vllm/outputs.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | 3 | from vllm.sequence import SequenceGroup, SequenceStatus 4 | 5 | 6 | class CompletionOutput: 7 | """The output data of one completion output of a request. 8 | 9 | Args: 10 | index: The index of the output in the request. 11 | text: The generated output text. 12 | token_ids: The token IDs of the generated output text. 13 | cumulative_logprob: The cumulative log probability of the generated 14 | output text. 
15 | logprobs: The log probabilities of the top probability words at each 16 | position if the logprobs are requested. 17 | finish_reason: The reason why the sequence is finished. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | index: int, 23 | text: str, 24 | token_ids: List[int], 25 | cumulative_logprob: float, 26 | logprobs: Optional[List[Dict[int, float]]], 27 | finish_reason: Optional[str] = None, 28 | ) -> None: 29 | self.index = index 30 | self.text = text 31 | self.token_ids = token_ids 32 | self.cumulative_logprob = cumulative_logprob 33 | self.logprobs = logprobs 34 | self.finish_reason = finish_reason 35 | 36 | def finished(self) -> bool: 37 | return self.finish_reason is not None 38 | 39 | def __repr__(self) -> str: 40 | return (f"CompletionOutput(index={self.index}, " 41 | f"text={self.text!r}, " 42 | f"token_ids={self.token_ids}, " 43 | f"cumulative_logprob={self.cumulative_logprob}, " 44 | f"logprobs={self.logprobs}, " 45 | f"finish_reason={self.finish_reason})") 46 | 47 | 48 | class RequestOutput: 49 | """The output data of a request to the LLM. 50 | 51 | Args: 52 | request_id: The unique ID of the request. 53 | prompt: The prompt string of the request. 54 | prompt_token_ids: The token IDs of the prompt. 55 | outputs: The output sequences of the request. 56 | finished: Whether the whole request is finished. 57 | paused: List of output index to seq id. 58 | """ 59 | 60 | def __init__( 61 | self, 62 | request_id: str, 63 | prompt: str, 64 | prompt_token_ids: List[int], 65 | outputs: List[CompletionOutput], 66 | finished: bool, 67 | paused: List[Tuple[int, int]], 68 | ) -> None: 69 | self.request_id = request_id 70 | self.prompt = prompt 71 | self.prompt_token_ids = prompt_token_ids 72 | self.outputs = outputs 73 | self.finished = finished 74 | self.paused = paused 75 | 76 | # NOTE: toolformer's pause policy is no longer used 77 | @classmethod 78 | def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": 79 | # Get the top-n sequences 80 | n = seq_group.sampling_params.n 81 | seqs = seq_group.get_seqs() 82 | assert n <= len(seqs) 83 | if seq_group.sampling_params.use_beam_search: 84 | sorting_key = lambda seq: seq.get_beam_search_score( 85 | seq_group.sampling_params.length_penalty) 86 | else: 87 | sorting_key = lambda seq: seq.get_cumulative_logprob() 88 | sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) 89 | top_n_seqs = sorted_seqs[:n] 90 | 91 | # Create the outputs. 92 | paused: List[Tuple[int, int]] = [] 93 | outputs: List[CompletionOutput] = [] 94 | for seq in top_n_seqs: 95 | logprobs = seq.output_logprobs 96 | if seq_group.sampling_params.logprobs is None: 97 | # NOTE: We need to take care of this case because the sequence 98 | # always has the logprobs of the sampled tokens even if the 99 | # logprobs are not requested. 100 | logprobs = {} 101 | finshed_reason = SequenceStatus.get_finished_reason(seq.status) 102 | output = CompletionOutput(seqs.index(seq), seq.output_text, 103 | seq.get_output_token_ids(), 104 | seq.get_cumulative_logprob(), logprobs, 105 | finshed_reason) 106 | outputs.append(output) 107 | if seq.is_paused(): 108 | paused.append((len(outputs) - 1, seq.seq_id)) 109 | 110 | # Every sequence in the sequence group should have the same prompt. 
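# (All sequences in a group are continuations of a single prompt, so the
# prompt and its token IDs below are read from the first of the top-n
# sequences.)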
111 | prompt = top_n_seqs[0].prompt 112 | prompt_token_ids = top_n_seqs[0].data.prompt_token_ids 113 | finished = seq_group.is_finished() 114 | return cls(seq_group.request_id, prompt, prompt_token_ids, outputs, 115 | finished, paused) 116 | 117 | def __repr__(self) -> str: 118 | return (f"RequestOutput(request_id={self.request_id}, " 119 | f"prompt={self.prompt!r}, " 120 | f"prompt_token_ids={self.prompt_token_ids}, " 121 | f"outputs={self.outputs}, " 122 | f"finished={self.finished}), " 123 | f"paused={self.paused})") 124 | -------------------------------------------------------------------------------- /examples/react_vllm_impl.py: -------------------------------------------------------------------------------- 1 | """Benchmark offline inference throughput.""" 2 | import argparse 3 | import json 4 | import random 5 | import time 6 | from typing import List, Optional, Tuple, Dict 7 | 8 | import torch 9 | import queue 10 | import threading 11 | from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase 12 | from tqdm import tqdm 13 | 14 | from vllm import LLM, SamplingParams, LLMEngine, EngineArgs, utils 15 | from vllm.transformers_utils.tokenizer import get_tokenizer 16 | from vllm.outputs import RequestOutput 17 | 18 | class APIExecutor: 19 | def __init__(self) -> None: 20 | self._queue = queue.Queue() 21 | 22 | def _add_task(self, request_id: str, seq_id: int, api_time: float, ret_len: int): 23 | time.sleep(api_time) 24 | self._queue.put((request_id, seq_id, ret_len)) 25 | 26 | def add_task(self, request_id: str, seq_id: int, api_time: float, ret_len: int): 27 | task = threading.Thread(target=self._add_task, args=(request_id, seq_id, api_time, ret_len)) 28 | task.start() 29 | return task 30 | 31 | def _get_results(self) -> Dict[str, Dict[int, int]]: 32 | results = {} 33 | current_num_ret = self._queue.qsize() 34 | for _ in range(current_num_ret): 35 | request_id, seq_id, ret_len = self._queue.get() 36 | if request_id not in results: 37 | results[request_id] = {} 38 | results[request_id][seq_id] = ret_len 39 | return results 40 | 41 | def resume(self, vllm_engine: LLMEngine) -> None: 42 | api_rets = self._get_results() 43 | for request_id, seq_id_to_ret_len in api_rets.items(): 44 | response = {} 45 | for seq_id, ret_len in seq_id_to_ret_len.items(): 46 | response[seq_id] = [0] * ret_len 47 | vllm_engine.resume_request(request_id, response) 48 | 49 | 50 | 51 | def run_vllm( 52 | args: argparse.Namespace, 53 | ) -> float: 54 | engine_args = EngineArgs.from_cli_args(args) 55 | engine = LLMEngine.from_engine_args(engine_args) 56 | stop = [utils.get_api_stop_string()] 57 | api_engine = APIExecutor() 58 | tasks = set() 59 | 60 | dummy_prompt_token_ids = [[0] * args.input_len] * args.num_prompts 61 | 62 | # Add the requests to the engine. 63 | for request_id, prompt_token_ids in enumerate(dummy_prompt_token_ids): 64 | sampling_params = SamplingParams( 65 | n=1, 66 | temperature=0.0, 67 | top_p=1.0, 68 | # use_beam_search=use_beam_search, 69 | ignore_eos=True, 70 | max_tokens=args.output_len, 71 | stop=stop, 72 | use_api_simulator=True, 73 | api_return_length=32, 74 | api_invoke_interval=16 + request_id, 75 | api_exec_time=1.0 76 | ) 77 | engine.add_request( 78 | request_id=str(request_id), 79 | prompt=None, 80 | sampling_params=sampling_params, 81 | prompt_token_ids=prompt_token_ids, 82 | ) 83 | 84 | start = time.perf_counter() 85 | # Run the engine. 
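# Drive the engine manually: each engine.step() performs one scheduling and
# model iteration and returns RequestOutput objects. Finished outputs are
# collected; outputs whose `paused` field is non-empty have hit the simulated
# API stop string, so an API task is launched for every paused
# (output index, seq id) pair, and any API calls that have already completed
# are fed back into the engine via api_engine.resume() on each iteration.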
86 | outputs: List[RequestOutput] = [] 87 | iter = 0 88 | while engine.has_unfinished_requests(): 89 | step_outputs = engine.step() 90 | for output in step_outputs: 91 | if output.finished: 92 | outputs.append(output) 93 | if output.paused: 94 | print(f'iter: {iter}, output: {output}') 95 | sampling_params: SamplingParams = engine.scheduler.paused[output.request_id][0].sampling_params 96 | for (rid, sid) in output.paused: 97 | task = api_engine.add_task(output.request_id, sid, sampling_params.api_exec_time, sampling_params.api_return_length) 98 | tasks.add(task) 99 | api_engine.resume(engine) 100 | iter += 1 101 | 102 | # Sort the outputs by request ID. 103 | # This is necessary because some requests may be finished earlier than 104 | # its previous requests. 105 | outputs = sorted(outputs, key=lambda x: int(x.request_id)) 106 | end = time.perf_counter() 107 | for request_output in outputs: 108 | for seq_output in request_output.outputs: 109 | print(seq_output.text) 110 | print(seq_output.token_ids) 111 | return end - start 112 | 113 | 114 | def main(args: argparse.Namespace): 115 | print(args) 116 | random.seed(args.seed) 117 | 118 | elapsed_time = run_vllm( 119 | args, 120 | ) 121 | print(elapsed_time) 122 | 123 | # total_num_tokens = sum( 124 | # prompt_len + output_len for _, prompt_len, output_len in requests 125 | # ) 126 | # print( 127 | # f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " 128 | # f"{total_num_tokens / elapsed_time:.2f} tokens/s" 129 | # ) 130 | 131 | 132 | if __name__ == "__main__": 133 | parser = argparse.ArgumentParser(description="Benchmark the throughput.") 134 | parser.add_argument( 135 | "--input-len", type=int, default=512 136 | ) 137 | parser.add_argument( 138 | "--output-len", type=int, default=512 139 | ) 140 | parser.add_argument( 141 | "--num-prompts", type=int, default=1, help="Number of prompts to process." 142 | ) 143 | parser = EngineArgs.add_cli_args(parser) 144 | 145 | args = parser.parse_args() 146 | main(args) 147 | -------------------------------------------------------------------------------- /examples/test_pause.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from vllm import EngineArgs, LLMEngine, SamplingParams, utils 4 | import torch 5 | import os 6 | from typing import List, Optional, Tuple, Dict 7 | from vllm.outputs import RequestOutput 8 | import json 9 | 10 | # os.environ['CUDA_VISIBLE_DEVICES'] = '7' 11 | # os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 12 | 13 | def api_call(input: str): 14 | return " a " 15 | 16 | def main(args: argparse.Namespace): 17 | # Parse the CLI argument and initialize the engine. 18 | engine_args = EngineArgs.from_cli_args(args) 19 | engine = LLMEngine.from_engine_args(engine_args) 20 | stop = [utils.get_api_stop_string()] if args.test else None 21 | # Test the following prompts. 22 | test_prompts = [ 23 | ("Imagine a futuristic city in the year 2150, where advanced technology and environmental sustainability are perfectly integrated. Describe this city in great detail, focusing on aspects such as architecture, transportation, energy sources, and the daily lives of its inhabitants. How do the buildings look, and what innovative materials are they made from? Describe the public transportation system and how it differs from systems in the early 21st century. What are the primary energy sources, and how are they harnessed and distributed? 
How do the residents of this city work, entertain themselves, and interact with technology in their everyday lives? In addition, consider the city's government and societal structure. How is the city governed, and what kind of political system is in place? What are the core values and principles that guide decision-making? Discuss how this city ensures the well-being of its citizens, including healthcare, education, and social services. How does this city handle issues like crime, conflict resolution, and the preservation of civil liberties? Furthermore, explore the relationship of this city with the natural environment. How does the city maintain a balance with nature, and what are its strategies for conservation and biodiversity? Are there any unique parks, green spaces, or integration of natural elements within the urban landscape? Lastly, imagine a scenario where this city faces a significant challenge, such as a natural disaster or a technological crisis. How does the city respond and recover from this event? What systems and protocols are in place to handle such emergencies, and what role do citizens play in these situations? Please provide a comprehensive and imaginative description of each of these aspects, creating a vivid and detailed portrayal of life in this futuristic city.", 24 | SamplingParams(n=1, temperature=0.0, presence_penalty=0.0,stop=stop,max_tokens=100)), 25 | # ("Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients...", 26 | # SamplingParams(n=1, temperature=0.0, presence_penalty=0.2,stop=stop,max_tokens=100)), 27 | ] * 100 28 | 29 | # Run the engine by calling `engine.step()` manually. 30 | request_id = 0 31 | # To test iteration-level scheduling, we add one request at each step. 
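# (Note that in this script all requests are enqueued up front; the engine
# then schedules them iteration by iteration inside the step loop below.)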
32 | for prompt, sampling_params in test_prompts: 33 | engine.add_request(str(request_id), prompt, sampling_params) 34 | request_id += 1 35 | 36 | outputs: List[RequestOutput] = [] 37 | if not args.test: 38 | while True: 39 | # for _ in range(29): 40 | request_outputs = engine.step() 41 | for request_output in request_outputs: 42 | if request_output.finished: 43 | outputs.append(request_output) 44 | # print(request_output.outputs[0].token_ids) 45 | if not engine.has_unfinished_requests(): 46 | break 47 | else: 48 | # TO the test, only pause once 49 | torch.cuda.cudart().cudaProfilerStart() 50 | while True: 51 | request_outputs = engine.step() 52 | for request_output in request_outputs: 53 | # print(request_output) 54 | if request_output.finished: 55 | outputs.append(request_output) 56 | # print(request_output.outputs[0].token_ids) 57 | if request_output.paused: 58 | response = {} 59 | for (rid, sid) in request_output.paused: 60 | # response[sid] = [582, 508, 468, 587] 61 | # response[sid] = [198, 464, 13429, 21983, 25, 198, 198, 818, 262, 614, 2310, 1120, 11, 257] 62 | # normal_ret 63 | response[sid] = [50118, 133, 511, 16, 10, 889, 9, 5, 144, 505, 5894, 9, 5, 343, 35, 50118, 50118, 134, 4, 20, 343, 18, 2112, 50118, 50118, 133] 64 | engine.resume_request(request_output.request_id, response) 65 | if not engine.has_unfinished_requests(): 66 | break 67 | torch.cuda.cudart().cudaProfilerStop() 68 | 69 | print(f'finished {len(outputs)} requests') 70 | outputs = sorted(outputs, key=lambda x: int(x.request_id)) 71 | serialize = [] 72 | for output in outputs: 73 | serialize.append({ 74 | "id": output.request_id, 75 | "output_tokens": output.outputs[0].token_ids, 76 | "output_text": output.outputs[0].text, 77 | }) 78 | with open(f'pause_{args.test}_output.json', 'w+') as f: 79 | json.dump(serialize, f, separators=(",", ": ")) 80 | 81 | if __name__ == '__main__': 82 | parser = argparse.ArgumentParser( 83 | description='Demo on using the LLMEngine class directly') 84 | parser = EngineArgs.add_cli_args(parser) 85 | parser.add_argument('--test', action='store_true') 86 | args = parser.parse_args() 87 | main(args) 88 | -------------------------------------------------------------------------------- /tests/kernels/test_cache.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | import torch 5 | 6 | from vllm import cache_ops 7 | 8 | DTYPES = [torch.half, torch.bfloat16, torch.float] 9 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 10 | NUM_LAYERS = [5] # Arbitrary values for testing 11 | NUM_HEADS = [8] # Arbitrary values for testing 12 | HEAD_SIZES = [64, 80, 96, 112, 128, 256] 13 | BLOCK_SIZES = [8, 16, 32] 14 | NUM_BLOCKS = [1024] # Arbitrary values for testing 15 | NUM_MAPPINGS = [32, 256] # Arbitrary values for testing 16 | SEEDS = [0] 17 | 18 | 19 | @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) 20 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 21 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 22 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 23 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 24 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 25 | @pytest.mark.parametrize("dtype", DTYPES) 26 | @pytest.mark.parametrize("seed", SEEDS) 27 | @torch.inference_mode() 28 | def test_copy_blocks( 29 | kv_cache_factory, 30 | num_mappings: int, 31 | num_layers: int, 32 | num_heads: int, 33 | head_size: int, 34 | block_size: int, 35 | num_blocks: int, 36 | dtype: torch.dtype, 37 | seed: 
int, 38 | ) -> None: 39 | random.seed(seed) 40 | torch.random.manual_seed(seed) 41 | torch.cuda.manual_seed(seed) 42 | 43 | # Generate random block mappings where each source block is mapped to two 44 | # destination blocks. 45 | assert 2 * num_mappings <= num_blocks 46 | src_blocks = random.sample(range(num_blocks), num_mappings) 47 | remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) 48 | dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) 49 | block_mapping = {} 50 | for i in range(num_mappings): 51 | src = src_blocks[i] 52 | dst1 = dst_blocks[2 * i] 53 | dst2 = dst_blocks[2 * i + 1] 54 | block_mapping[src] = [dst1, dst2] 55 | 56 | # Create the KV caches. 57 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 58 | num_layers, num_heads, 59 | head_size, dtype, seed) 60 | 61 | # Clone the KV caches. 62 | cloned_key_caches = [key_cache.clone() for key_cache in key_caches] 63 | cloned_value_caches = [value_cache.clone() for value_cache in value_caches] 64 | 65 | # Call the copy blocks kernel. 66 | cache_ops.copy_blocks(key_caches, value_caches, block_mapping) 67 | 68 | # Run the reference implementation. 69 | for src, dsts in block_mapping.items(): 70 | for dst in dsts: 71 | for cloned_key_cache in cloned_key_caches: 72 | cloned_key_cache[dst] = cloned_key_cache[src] 73 | for cloned_value_cache in cloned_value_caches: 74 | cloned_value_cache[dst] = cloned_value_cache[src] 75 | 76 | # Compare the results. 77 | for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): 78 | assert torch.allclose(key_cache, cloned_key_cache) 79 | for value_cache, cloned_value_cache in zip(value_caches, 80 | cloned_value_caches): 81 | assert torch.allclose(value_cache, cloned_value_cache) 82 | 83 | 84 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 85 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 86 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 87 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 88 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 89 | @pytest.mark.parametrize("dtype", DTYPES) 90 | @pytest.mark.parametrize("seed", SEEDS) 91 | @torch.inference_mode() 92 | def test_reshape_and_cache( 93 | kv_cache_factory, 94 | num_tokens: int, 95 | num_heads: int, 96 | head_size: int, 97 | block_size: int, 98 | num_blocks: int, 99 | dtype: torch.dtype, 100 | seed: int, 101 | ) -> None: 102 | random.seed(seed) 103 | torch.random.manual_seed(seed) 104 | torch.cuda.manual_seed(seed) 105 | 106 | # Create a random slot mapping. 107 | num_slots = block_size * num_blocks 108 | slot_mapping = random.sample(range(num_slots), num_tokens) 109 | slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device="cuda") 110 | 111 | qkv = torch.randn(num_tokens, 112 | 3, 113 | num_heads, 114 | head_size, 115 | dtype=dtype, 116 | device="cuda") 117 | _, key, value = qkv.unbind(dim=1) 118 | 119 | # Create the KV caches. 120 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, 121 | num_heads, head_size, dtype, 122 | seed) 123 | key_cache, value_cache = key_caches[0], value_caches[0] 124 | 125 | # Clone the KV caches. 126 | cloned_key_cache = key_cache.clone() 127 | cloned_value_cache = value_cache.clone() 128 | 129 | # Call the reshape_and_cache kernel. 130 | cache_ops.reshape_and_cache(key, value, key_cache, value_cache, 131 | slot_mapping) 132 | 133 | # Run the reference implementation. 
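# The reference implementation below scatters each token's key/value into the
# cloned caches by decomposing its flat slot index into a block index and an
# in-block offset: block_idx = slot // block_size and
# block_offset = slot % block_size. For example, with block_size=16, slot 37
# lands in block 2 at offset 5.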
134 | reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) 135 | block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") 136 | block_indicies = block_indicies.cpu().tolist() 137 | block_offsets = slot_mapping % block_size 138 | block_offsets = block_offsets.cpu().tolist() 139 | for i in range(num_tokens): 140 | block_idx = block_indicies[i] 141 | block_offset = block_offsets[i] 142 | cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] 143 | cloned_value_cache[block_idx, :, :, block_offset] = value[i] 144 | 145 | assert torch.allclose(key_cache, cloned_key_cache) 146 | assert torch.allclose(value_cache, cloned_value_cache) 147 | -------------------------------------------------------------------------------- /docs/source/getting_started/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Quickstart 4 | ========== 5 | 6 | This guide shows how to use vLLM to: 7 | 8 | * run offline batched inference on a dataset; 9 | * build an API server for a large language model; 10 | * start an OpenAI-compatible API server. 11 | 12 | Be sure to complete the :ref:`installation instructions ` before continuing with this guide. 13 | 14 | Offline Batched Inference 15 | ------------------------- 16 | 17 | We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. 18 | 19 | Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process. 20 | 21 | .. code-block:: python 22 | 23 | from vllm import LLM, SamplingParams 24 | 25 | Define the list of input prompts and the sampling parameters for generation. The sampling temperature is set to 0.8 and the nucleus sampling probability is set to 0.95. For more information about the sampling parameters, refer to the `class definition `_. 26 | 27 | .. code-block:: python 28 | 29 | prompts = [ 30 | "Hello, my name is", 31 | "The president of the United States is", 32 | "The capital of France is", 33 | "The future of AI is", 34 | ] 35 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 36 | 37 | Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. 38 | 39 | .. code-block:: python 40 | 41 | llm = LLM(model="facebook/opt-125m") 42 | 43 | Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens. 44 | 45 | .. code-block:: python 46 | 47 | outputs = llm.generate(prompts, sampling_params) 48 | 49 | # Print the outputs. 50 | for output in outputs: 51 | prompt = output.prompt 52 | generated_text = output.outputs[0].text 53 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 54 | 55 | 56 | The code example can also be found in `examples/offline_inference.py `_. 57 | 58 | 59 | API Server 60 | ---------- 61 | 62 | vLLM can be deployed as an LLM service. We provide an example `FastAPI `_ server. Check `vllm/entrypoints/api_server.py `_ for the server implementation. 
The server uses ``AsyncLLMEngine`` class to support asynchronous processing of incoming requests. 63 | 64 | Start the server: 65 | 66 | .. code-block:: console 67 | 68 | $ python -m vllm.entrypoints.api_server 69 | 70 | By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model. 71 | 72 | Query the model in shell: 73 | 74 | .. code-block:: console 75 | 76 | $ curl http://localhost:8000/generate \ 77 | $ -d '{ 78 | $ "prompt": "San Francisco is a", 79 | $ "use_beam_search": true, 80 | $ "n": 4, 81 | $ "temperature": 0 82 | $ }' 83 | 84 | See `examples/api_client.py `_ for a more detailed client example. 85 | 86 | OpenAI-Compatible Server 87 | ------------------------ 88 | 89 | vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. 90 | 91 | Start the server: 92 | 93 | .. code-block:: console 94 | 95 | $ python -m vllm.entrypoints.openai.api_server \ 96 | $ --model facebook/opt-125m 97 | 98 | By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models `_ and `create completion `_ endpoints. We are actively adding support for more endpoints. 99 | 100 | This server can be queried in the same format as OpenAI API. For example, list the models: 101 | 102 | .. code-block:: console 103 | 104 | $ curl http://localhost:8000/v1/models 105 | 106 | Query the model with input prompts: 107 | 108 | .. code-block:: console 109 | 110 | $ curl http://localhost:8000/v1/completions \ 111 | $ -H "Content-Type: application/json" \ 112 | $ -d '{ 113 | $ "model": "facebook/opt-125m", 114 | $ "prompt": "San Francisco is a", 115 | $ "max_tokens": 7, 116 | $ "temperature": 0 117 | $ }' 118 | 119 | Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package: 120 | 121 | .. code-block:: python 122 | 123 | import openai 124 | # Modify OpenAI's API key and API base to use vLLM's API server. 125 | openai.api_key = "EMPTY" 126 | openai.api_base = "http://localhost:8000/v1" 127 | completion = openai.Completion.create(model="facebook/opt-125m", 128 | prompt="San Francisco is a") 129 | print("Completion result:", completion) 130 | 131 | For a more detailed client example, refer to `examples/openai_completion_client.py `_. 132 | -------------------------------------------------------------------------------- /examples/test_ref_outputs.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Dict 2 | import argparse 3 | from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase 4 | from vllm import EngineArgs, LLMEngine, SamplingParams, utils 5 | import torch 6 | import json 7 | import random 8 | import os 9 | from vllm.outputs import RequestOutput 10 | 11 | # os.environ['CUDA_VISIBLE_DEVICES'] = '7' 12 | # os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 13 | 14 | def api_call(input: str): 15 | return " a " 16 | 17 | def sample_requests( 18 | dataset_path: str, 19 | num_requests: int, 20 | tokenizer: PreTrainedTokenizerBase, 21 | ) -> List[Tuple[str, List[int], int]]: 22 | # Load the dataset. 
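# The dataset file is assumed to follow the ShareGPT JSON layout used below:
# a list of records, each carrying a "conversations" list whose entries have
# a "value" string; the first turn supplies the prompt and the second turn's
# tokenized length sets the target output length.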
23 | with open(dataset_path) as f: 24 | dataset = json.load(f) 25 | # Filter out the conversations with less than 2 turns. 26 | dataset = [data for data in dataset if len(data["conversations"]) >= 2] 27 | # Only keep long prompts 28 | dataset = [(data["conversations"][0]["value"], data["conversations"][1]["value"]) 29 | for data in dataset if len(data["conversations"][0]["value"]) >= 300] 30 | 31 | # Tokenize the prompts and completions. 32 | prompts = [prompt for prompt, _ in dataset] 33 | prompt_token_ids = tokenizer(prompts).input_ids 34 | completions = [completion for _, completion in dataset] 35 | completion_token_ids = tokenizer(completions).input_ids 36 | tokenized_dataset = [] 37 | for i in range(len(dataset)): 38 | output_len = len(completion_token_ids[i]) 39 | tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) 40 | 41 | # Filter out sequences. 42 | filtered_dataset: List[Tuple[str, int, int]] = [] 43 | for prompt, prompt_token_ids, output_len in tokenized_dataset: 44 | prompt_len = len(prompt_token_ids) 45 | if prompt_len < 100 or output_len < 4: 46 | # Prune too short sequences. 47 | continue 48 | if prompt_len > 1024 or prompt_len + output_len > 2048: 49 | # Prune too long sequences. 50 | continue 51 | filtered_dataset.append((prompt, prompt_token_ids, output_len)) 52 | 53 | # Sample the requests. 54 | sampled_requests = random.sample(filtered_dataset, num_requests) 55 | return sampled_requests 56 | 57 | def parse_ref_outputs() -> Dict[int, Tuple[List[int], List[int]]]: 58 | with open('ref-outputs.json') as f: 59 | outputs = json.load(f) 60 | ref_outputs = {data["id"]: (data["prompt_tokens"], data["output_tokens"]) for data in outputs} 61 | return ref_outputs 62 | 63 | test_until = 100 64 | 65 | def main(args: argparse.Namespace): 66 | # Parse the CLI argument and initialize the engine. 67 | engine_args = EngineArgs.from_cli_args(args) 68 | engine = LLMEngine.from_engine_args(engine_args) 69 | if args.mode == "ref": 70 | # Test the following prompts. 
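# In "ref" mode, prompts are sampled from the ShareGPT dataset, run to
# completion with greedy sampling, and dumped to ref-outputs.json so that the
# "test" mode branch below can replay the same prompts with matching output
# lengths and write test-outputs.json for comparison.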
71 | datasets = sample_requests("ShareGPT_V3_unfiltered_cleaned_split.json", 100, engine.tokenizer) 72 | request_id = 0 73 | for prompt, prompt_token_ids, output_len in datasets: 74 | if request_id <= test_until: 75 | sampling_params = SamplingParams(n=1, temperature=0.0, presence_penalty=0.0,max_tokens=output_len,ignore_eos=True) 76 | engine.add_request(str(request_id), prompt, sampling_params, prompt_token_ids) 77 | request_id += 1 78 | 79 | outputs: List[RequestOutput] = [] 80 | while True: 81 | request_outputs = engine.step() 82 | for request_output in request_outputs: 83 | if request_output.finished: 84 | outputs.append(request_output) 85 | # print(request_output.outputs[0].token_ids) 86 | if not engine.has_unfinished_requests(): 87 | break 88 | 89 | serialize = [] 90 | outputs = sorted(outputs, key=lambda x: x.request_id) 91 | for output in outputs: 92 | serialize.append({ 93 | "id": output.request_id, 94 | "prompt_tokens": output.prompt_token_ids, 95 | "output_tokens": output.outputs[0].token_ids, 96 | "output_text": output.outputs[0].text, 97 | }) 98 | with open("ref-outputs.json", "w") as f: 99 | json.dump(serialize, f, separators=(",", ": ")) 100 | else: 101 | ref_outputs = parse_ref_outputs() 102 | for request_id, (prompt_token_ids, output_token_ids) in ref_outputs.items(): 103 | # if int(request_id) <= 22: 104 | sampling_params = SamplingParams(n=1, temperature=0.0, presence_penalty=0.0,max_tokens=len(output_token_ids),ignore_eos=True) 105 | engine.add_request(request_id, "", sampling_params, prompt_token_ids) 106 | 107 | outputs: List[RequestOutput] = [] 108 | while True: 109 | request_outputs = engine.step() 110 | for request_output in request_outputs: 111 | if request_output.finished: 112 | outputs.append(request_output) 113 | # print(request_output.outputs[0].token_ids) 114 | if not engine.has_unfinished_requests(): 115 | break 116 | print(f'finished {len(outputs)} requests') 117 | serialize = [] 118 | outputs = sorted(outputs, key=lambda x: x.request_id) 119 | for output in outputs: 120 | serialize.append({ 121 | "id": output.request_id, 122 | "prompt_tokens": output.prompt_token_ids, 123 | "output_tokens": output.outputs[0].token_ids, 124 | "output_text": output.outputs[0].text, 125 | }) 126 | with open("test-outputs.json", "w") as f: 127 | json.dump(serialize, f, separators=(",", ": ")) 128 | return 129 | 130 | if __name__ == '__main__': 131 | parser = argparse.ArgumentParser( 132 | description='Demo on using the LLMEngine class directly') 133 | parser = EngineArgs.add_cli_args(parser) 134 | parser.add_argument('--mode', type=str, default="test", choices=["ref", "test"]) 135 | args = parser.parse_args() 136 | main(args) 137 | -------------------------------------------------------------------------------- /tests/kernels/test_pos_encoding.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import pytest 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from vllm import pos_encoding_ops 9 | 10 | IS_NEOX_STYLE = [True, False] 11 | DTYPES = [torch.half, torch.bfloat16, torch.float] 12 | HEAD_SIZES = [64, 80, 96, 112, 128, 256] 13 | ROTARY_DIMS = [None, 32] # None means rotary dim == head size 14 | NUM_HEADS = [7, 12, 40, 52] # Arbitrary values for testing 15 | NUM_TOKENS = [11, 83, 2048] # Arbitrary values for testing 16 | SEEDS = [0] 17 | 18 | 19 | def rotate_neox(x: torch.Tensor) -> torch.Tensor: 20 | x1 = x[..., :x.shape[-1] // 2] 21 | x2 = x[..., 
x.shape[-1] // 2:] 22 | return torch.cat((-x2, x1), dim=-1) 23 | 24 | 25 | def rotate_gptj(x: torch.Tensor) -> torch.Tensor: 26 | x1 = x[..., ::2] 27 | x2 = x[..., 1::2] 28 | x = torch.stack((-x2, x1), dim=-1) 29 | return x.flatten(-2) 30 | 31 | 32 | def apply_rope( 33 | q: torch.Tensor, 34 | k: torch.Tensor, 35 | cos: torch.Tensor, 36 | sin: torch.Tensor, 37 | is_neox_style: bool, 38 | ) -> Tuple[torch.Tensor, torch.Tensor]: 39 | rotate_fn = rotate_neox if is_neox_style else rotate_gptj 40 | q_embed = (q * cos) + (rotate_fn(q) * sin) 41 | k_embed = (k * cos) + (rotate_fn(k) * sin) 42 | return q_embed, k_embed 43 | 44 | 45 | class RefRotaryEmbedding(nn.Module): 46 | """Reference implementation of rotary embedding.""" 47 | 48 | def __init__( 49 | self, 50 | dim: int, 51 | is_neox_style: bool, 52 | max_position_embeddings: int = 8192, 53 | base: int = 10000, 54 | ) -> None: 55 | super().__init__() 56 | self.rotary_dim = dim 57 | self.is_neox_style = is_neox_style 58 | self.max_position_embeddings = max_position_embeddings 59 | 60 | # Create cos and sin embeddings. 61 | inv_freq = 1.0 / (base**(torch.arange(0, dim, 2) / dim)) 62 | t = torch.arange(max_position_embeddings).float() 63 | freqs = torch.einsum("i,j->ij", t, inv_freq.float()) 64 | if is_neox_style: 65 | emb = torch.cat((freqs, freqs), dim=-1) 66 | else: 67 | emb = torch.repeat_interleave(freqs, 2, -1) 68 | cos = emb.cos().to(dtype=inv_freq.dtype) 69 | sin = emb.sin().to(dtype=inv_freq.dtype) 70 | self.register_buffer("cos_cached", cos, persistent=False) 71 | self.register_buffer("sin_cached", sin, persistent=False) 72 | 73 | def forward( 74 | self, 75 | positions: torch.Tensor, # [num_tokens] 76 | query: torch.Tensor, # [num_tokens, num_heads, head_size] 77 | key: torch.Tensor, # [num_tokens, num_heads, head_size] 78 | ) -> Tuple[torch.Tensor, torch.Tensor]: 79 | query_rot = query[..., :self.rotary_dim] 80 | query_pass = query[..., self.rotary_dim:] 81 | key_rot = key[..., :self.rotary_dim] 82 | key_pass = key[..., self.rotary_dim:] 83 | 84 | query_rot = query_rot.transpose(0, 1) 85 | key_rot = key_rot.transpose(0, 1) 86 | cos = F.embedding(positions, self.cos_cached) 87 | sin = F.embedding(positions, self.sin_cached) 88 | 89 | query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin, 90 | self.is_neox_style) 91 | query_rot = query_rot.transpose(0, 1).contiguous() 92 | key_rot = key_rot.transpose(0, 1).contiguous() 93 | 94 | query = torch.cat((query_rot, query_pass), dim=-1) 95 | key = torch.cat((key_rot, key_pass), dim=-1) 96 | 97 | # Output query/key shape: [num_tokens, num_tokens, head_size] 98 | return query, key 99 | 100 | 101 | @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) 102 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 103 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 104 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 105 | @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) 106 | @pytest.mark.parametrize("dtype", DTYPES) 107 | @pytest.mark.parametrize("seed", SEEDS) 108 | @torch.inference_mode() 109 | def test_rotary_embedding( 110 | is_neox_style: bool, 111 | num_tokens: int, 112 | num_heads: int, 113 | head_size: int, 114 | rotary_dim: Optional[int], 115 | dtype: torch.dtype, 116 | seed: int, 117 | max_position: int = 8192, 118 | base: int = 10000, 119 | ) -> None: 120 | if rotary_dim is None: 121 | rotary_dim = head_size 122 | torch.random.manual_seed(seed) 123 | torch.cuda.manual_seed(seed) 124 | 125 | positions = torch.randint(0, max_position, (num_tokens, ), device="cuda") 126 | query 
= torch.randn(num_tokens, 127 | num_heads * head_size, 128 | dtype=dtype, 129 | device="cuda") 130 | key = torch.randn(num_tokens, 131 | num_heads * head_size, 132 | dtype=dtype, 133 | device="cuda") 134 | 135 | # Create the rotary embedding. 136 | inv_freq = 1.0 / (base**( 137 | torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim)) 138 | t = torch.arange(max_position).float() 139 | freqs = torch.einsum("i,j -> ij", t, inv_freq) 140 | cos = freqs.cos() 141 | sin = freqs.sin() 142 | cos_sin_cache = torch.cat((cos, sin), dim=-1) 143 | cos_sin_cache = cos_sin_cache.to(dtype=dtype, device="cuda") 144 | 145 | # Run the kernel. The kernel is in-place, so we need to clone the inputs. 146 | out_query = query.clone() 147 | out_key = key.clone() 148 | pos_encoding_ops.rotary_embedding( 149 | positions, 150 | out_query, 151 | out_key, 152 | head_size, 153 | cos_sin_cache, 154 | is_neox_style, 155 | ) 156 | 157 | # Run the reference implementation. 158 | ref_rotary_embedding = RefRotaryEmbedding( 159 | dim=rotary_dim, 160 | is_neox_style=is_neox_style, 161 | max_position_embeddings=max_position, 162 | base=base, 163 | ).to(dtype=dtype, device="cuda") 164 | ref_query, ref_key = ref_rotary_embedding( 165 | positions, 166 | query.view(num_tokens, num_heads, head_size), 167 | key.view(num_tokens, num_heads, head_size), 168 | ) 169 | ref_query = ref_query.view(num_tokens, num_heads * head_size) 170 | ref_key = ref_key.view(num_tokens, num_heads * head_size) 171 | 172 | # Compare the results. 173 | assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5) 174 | assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5) 175 | --------------------------------------------------------------------------------