├── vllm ├── core │ ├── __init__.py │ └── policy.py ├── engine │ ├── __init__.py │ └── ray_utils.py ├── worker │ ├── __init__.py │ └── cache_engine.py ├── entrypoints │ ├── __init__.py │ ├── openai │ │ ├── __init__.py │ │ └── protocol.py │ └── api_server.py ├── model_executor │ ├── layers │ │ ├── __init__.py │ │ ├── layernorm.py │ │ └── activation.py │ ├── parallel_utils │ │ ├── README.md │ │ ├── __init__.py │ │ └── tensor_parallel │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ └── random.py │ ├── __init__.py │ ├── utils.py │ ├── models │ │ └── __init__.py │ ├── input_metadata.py │ └── model_loader.py ├── transformers_utils │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── baichuan.py │ │ ├── aquila.py │ │ ├── mpt.py │ │ ├── qwen.py │ │ └── falcon.py │ ├── config.py │ └── tokenizer.py ├── __init__.py ├── utils.py ├── logger.py ├── block.py └── outputs.py ├── MANIFEST.in ├── docs ├── requirements-docs.txt ├── source │ ├── assets │ │ ├── figures │ │ │ ├── perf_a100_n1_dark.png │ │ │ ├── perf_a100_n1_light.png │ │ │ ├── perf_a100_n3_dark.png │ │ │ ├── perf_a100_n3_light.png │ │ │ ├── perf_a10g_n1_dark.png │ │ │ ├── perf_a10g_n1_light.png │ │ │ ├── perf_a10g_n3_dark.png │ │ │ └── perf_a10g_n3_light.png │ │ └── logos │ │ │ ├── vllm-logo-only-light.png │ │ │ ├── vllm-logo-text-dark.png │ │ │ └── vllm-logo-text-light.png │ ├── getting_started │ │ ├── installation.rst │ │ └── quickstart.rst │ ├── serving │ │ ├── distributed_serving.rst │ │ └── run_on_sky.rst │ ├── index.rst │ ├── conf.py │ └── models │ │ ├── supported_models.rst │ │ └── adding_model.rst ├── README.md ├── Makefile └── make.bat ├── csrc ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_utils.cuh │ └── dtype_float32.cuh ├── layernorm.cpp ├── pos_encoding.cpp ├── dispatch_utils.h ├── activation.cpp ├── attention.cpp ├── cache.cpp ├── reduction_utils.cuh ├── layernorm_kernels.cu ├── activation_kernels.cu └── pos_encoding_kernels.cu ├── pyproject.toml ├── requirements-dev.txt ├── mypy.ini ├── benchmarks ├── README.md ├── launch_tgi_server.sh └── benchmark_latency.py ├── requirements.txt ├── .github └── workflows │ ├── scripts │ ├── build.sh │ ├── cuda-install.sh │ ├── create_release.js │ ├── pytorch-install.sh │ └── env.sh │ ├── pylint.yml │ ├── yapf.yml │ └── publish.yml ├── .readthedocs.yaml ├── examples ├── openai_completion_client.py ├── offline_inference.py ├── openai_chatcompletion_client.py ├── gradio_webserver.py ├── llm_engine_example.py └── api_client.py ├── tests ├── kernels │ ├── conftest.py │ ├── test_layernorm.py │ ├── test_activation.py │ ├── test_cache.py │ └── test_pos_encoding.py ├── models │ └── test_models.py ├── samplers │ └── test_beam_search.py ├── async_engine │ ├── api_server_async_engine.py │ ├── test_request_tracker.py │ └── test_api_server.py ├── engine │ └── test_detokenize.py └── conftest.py ├── CONTRIBUTING.md ├── format.sh ├── .gitignore └── README.md /vllm/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements.txt 3 | 4 | recursive-include csrc * 5 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n1_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a100_n1_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n1_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a100_n1_light.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n3_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a100_n3_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n3_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a100_n3_light.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n1_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a10g_n1_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n1_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a10g_n1_light.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n3_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a10g_n3_dark.png -------------------------------------------------------------------------------- 
/docs/source/assets/figures/perf_a10g_n3_light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a10g_n3_light.png
--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-only-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/logos/vllm-logo-only-light.png
--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-text-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png
--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-text-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png
--------------------------------------------------------------------------------
/csrc/attention/attention_dtypes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "attention_generic.cuh"
4 | #include "dtype_float16.cuh"
5 | #include "dtype_float32.cuh"
6 | #include "dtype_bfloat16.cuh"
7 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "ninja",
4 |     "packaging",
5 |     "setuptools",
6 |     "torch >= 2.0.0",
7 |     "wheel",
8 | ]
9 | build-backend = "setuptools.build_meta"
10 | 
--------------------------------------------------------------------------------
/vllm/model_executor/parallel_utils/README.md:
--------------------------------------------------------------------------------
1 | The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the code that is used in inference.
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # formatting
2 | yapf==0.32.0
3 | pylint==2.8.2
4 | 
5 | # type checking
6 | mypy==0.991
7 | types-PyYAML
8 | types-requests
9 | types-setuptools
10 | 
11 | # testing
12 | pytest
13 | pytest-forked
14 | 
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | python_version = 3.8
3 | 
4 | ignore_missing_imports = True
5 | 
6 | files = vllm
7 | # TODO(woosuk): Include the code from Megatron and HuggingFace.
8 | exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/ 9 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/__init__.py: -------------------------------------------------------------------------------- 1 | import vllm.model_executor.parallel_utils.parallel_state 2 | import vllm.model_executor.parallel_utils.tensor_parallel 3 | 4 | __all__ = [ 5 | "parallel_state", 6 | "tensor_parallel", 7 | ] 8 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ninja # For faster builds. 2 | psutil 3 | ray >= 2.5.1 4 | sentencepiece # Required for LLaMA tokenizer. 5 | numpy 6 | torch >= 2.0.0 7 | transformers >= 4.33.1 # Required for Code Llama. 8 | xformers >= 0.0.21 9 | fastapi 10 | uvicorn 11 | pydantic < 2 # Required for OpenAI server. 12 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.input_metadata import InputMetadata 2 | from vllm.model_executor.model_loader import get_model 3 | from vllm.model_executor.utils import set_random_seed 4 | 5 | __all__ = [ 6 | "InputMetadata", 7 | "get_model", 8 | "set_random_seed", 9 | ] 10 | -------------------------------------------------------------------------------- /csrc/layernorm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void rms_norm( 4 | torch::Tensor& out, 5 | torch::Tensor& input, 6 | torch::Tensor& weight, 7 | float epsilon); 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def( 11 | "rms_norm", 12 | &rms_norm, 13 | "Apply Root Mean Square (RMS) Normalization to the input tensor."); 14 | } 15 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 
20 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_home=/usr/local/cuda-$2 5 | 6 | # Update paths 7 | PATH=${cuda_home}/bin:$PATH 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 9 | 10 | # Install requirements 11 | $python_executable -m pip install wheel packaging 12 | $python_executable -m pip install -r requirements.txt 13 | 14 | # Build 15 | $python_executable setup.py bdist_wheel --dist-dir=dist 16 | -------------------------------------------------------------------------------- /csrc/pos_encoding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void rotary_embedding( 4 | torch::Tensor& positions, 5 | torch::Tensor& query, 6 | torch::Tensor& key, 7 | int head_size, 8 | torch::Tensor& cos_sin_cache, 9 | bool is_neox); 10 | 11 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 12 | m.def( 13 | "rotary_embedding", 14 | &rotary_embedding, 15 | "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); 16 | } 17 | -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:0.8 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: 16 | - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /csrc/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #include 6 | 7 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 8 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 9 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 10 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 11 | 12 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ 13 | AT_DISPATCH_SWITCH( \ 14 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 15 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | import random 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from vllm.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized 8 | from vllm.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed 9 | 10 | 11 | def set_random_seed(seed: int) -> None: 12 | random.seed(seed) 13 | np.random.seed(seed) 14 | torch.manual_seed(seed) 15 | if torch.cuda.is_available(): 16 | torch.cuda.manual_seed_all(seed) 17 | 18 | if model_parallel_is_initialized(): 19 | model_parallel_cuda_manual_seed(seed) 20 | -------------------------------------------------------------------------------- /csrc/activation.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void silu_and_mul( 4 | torch::Tensor& out, 5 | torch::Tensor& input); 6 | 7 | void gelu_new( 8 | torch::Tensor& out, 9 | torch::Tensor& input); 10 | 11 | void gelu_fast( 12 | torch::Tensor& out, 13 | torch::Tensor& input); 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def( 17 | "silu_and_mul", 18 | &silu_and_mul, 19 | "Activation function used in SwiGLU."); 20 | m.def( 21 | "gelu_new", 22 | &gelu_new, 23 | "GELU implementation used in GPT-2."); 24 | m.def( 25 | "gelu_fast", 26 | &gelu_fast, 27 | "Approximate GELU implementation."); 28 | } 29 | -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo $1 | tr "." "-") 5 | # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo $2 | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 
4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Completion API 14 | stream = False 15 | completion = openai.Completion.create( 16 | model=model, 17 | prompt="A robot may not injure a human being", 18 | echo=False, 19 | n=2, 20 | stream=stream, 21 | logprobs=3) 22 | 23 | print("Completion results:") 24 | if stream: 25 | for c in completion: 26 | print(c) 27 | else: 28 | print(completion) 29 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.configs.mpt import MPTConfig 2 | from vllm.transformers_utils.configs.baichuan import BaiChuanConfig 3 | from vllm.transformers_utils.configs.aquila import AquilaConfig 4 | from vllm.transformers_utils.configs.qwen import QWenConfig 5 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and 6 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the 7 | # `FalconConfig` class from the official HuggingFace transformers library. 8 | from vllm.transformers_utils.configs.falcon import RWConfig 9 | 10 | __all__ = [ 11 | "MPTConfig", 12 | "BaiChuanConfig", 13 | "AquilaConfig", 14 | "QWenConfig", 15 | "RWConfig", 16 | ] 17 | -------------------------------------------------------------------------------- /csrc/attention.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void single_query_cached_kv_attention( 5 | torch::Tensor& out, 6 | torch::Tensor& query, 7 | torch::Tensor& key_cache, 8 | torch::Tensor& value_cache, 9 | torch::Tensor& head_mapping, 10 | float scale, 11 | torch::Tensor& block_tables, 12 | torch::Tensor& context_lens, 13 | int block_size, 14 | int max_context_len, 15 | const c10::optional& alibi_slopes); 16 | 17 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 18 | m.def( 19 | "single_query_cached_kv_attention", 20 | &single_query_cached_kv_attention, 21 | "Compute the attention between an input query and the cached key/value tensors"); 22 | } 23 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 
2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: false, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_version=$2 5 | 6 | # Install torch 7 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 8 | $python_executable -m pip install torch -f https://download.pytorch.org/whl/cu${cuda_version//./}/torch_stable.html 9 | 10 | # Print version information 11 | $python_executable --version 12 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 13 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 14 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 15 | -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.engine.ray_utils import initialize_cluster 7 | from vllm.entrypoints.llm import LLM 8 | from vllm.outputs import CompletionOutput, RequestOutput 9 | from vllm.sampling_params import SamplingParams 10 | 11 | __version__ = "0.1.7" 12 | 13 | __all__ = [ 14 | "LLM", 15 | "SamplingParams", 16 | "RequestOutput", 17 | "CompletionOutput", 18 | "LLMEngine", 19 | "EngineArgs", 20 | "AsyncLLMEngine", 21 | "AsyncEngineArgs", 22 | "initialize_cluster", 23 | ] 24 | -------------------------------------------------------------------------------- /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: pylint 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | pylint: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.10"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install pylint==2.8.2 29 | - name: Analysing the code with pylint 30 | run: | 31 | pylint vllm 32 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from 
vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | jobs: 13 | yapf: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10"] 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install yapf==0.32.0 28 | pip install toml==0.10.2 29 | - name: Running yapf 30 | run: | 31 | yapf --diff --recursive vllm --exclude 'vllm/model_executor/parallel_utils/**' 32 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/layernorm.py: -------------------------------------------------------------------------------- 1 | """Custom normalization layers.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import layernorm_ops 6 | 7 | 8 | class RMSNorm(nn.Module): 9 | """Root mean square normalization. 10 | 11 | Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. 12 | Refer to https://arxiv.org/abs/1910.07467 13 | """ 14 | 15 | def __init__( 16 | self, 17 | hidden_size: int, 18 | eps: float = 1e-6, 19 | ) -> None: 20 | super().__init__() 21 | self.weight = nn.Parameter(torch.ones(hidden_size)) 22 | self.variance_epsilon = eps 23 | 24 | def forward(self, x: torch.Tensor) -> torch.Tensor: 25 | out = torch.empty_like(x) 26 | layernorm_ops.rms_norm( 27 | out, 28 | x, 29 | self.weight.data, 30 | self.variance_epsilon, 31 | ) 32 | return out 33 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/openai_chatcompletion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Chat completion API 14 | chat_completion = openai.ChatCompletion.create( 15 | model=model, 16 | messages=[{ 17 | "role": "system", 18 | "content": "You are a helpful assistant." 19 | }, { 20 | "role": "user", 21 | "content": "Who won the world series in 2020?" 22 | }, { 23 | "role": 24 | "assistant", 25 | "content": 26 | "The Los Angeles Dodgers won the World Series in 2020." 27 | }, { 28 | "role": "user", 29 | "content": "Where was it played?" 30 | }]) 31 | 32 | print("Chat completion results:") 33 | print(chat_completion) 34 | -------------------------------------------------------------------------------- /vllm/utils.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from platform import uname 3 | import uuid 4 | 5 | import psutil 6 | import torch 7 | 8 | 9 | class Device(enum.Enum): 10 | GPU = enum.auto() 11 | CPU = enum.auto() 12 | 13 | 14 | class Counter: 15 | 16 | def __init__(self, start: int = 0) -> None: 17 | self.counter = start 18 | 19 | def __next__(self) -> int: 20 | i = self.counter 21 | self.counter += 1 22 | return i 23 | 24 | def reset(self) -> None: 25 | self.counter = 0 26 | 27 | 28 | def get_gpu_memory(gpu: int = 0) -> int: 29 | """Returns the total memory of the GPU in bytes.""" 30 | return torch.cuda.get_device_properties(gpu).total_memory 31 | 32 | 33 | def get_cpu_memory() -> int: 34 | """Returns the total CPU memory of the node in bytes.""" 35 | return psutil.virtual_memory().total 36 | 37 | 38 | def random_uuid() -> str: 39 | return str(uuid.uuid4().hex) 40 | 41 | 42 | def in_wsl() -> bool: 43 | # Reference: https://github.com/microsoft/WSL/issues/4071 44 | return "microsoft" in " ".join(uname()).lower() 45 | -------------------------------------------------------------------------------- /vllm/core/policy.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from vllm.sequence import SequenceGroup 4 | 5 | 6 | class Policy: 7 | 8 | def get_priority( 9 | self, 10 | now: float, 11 | seq_group: SequenceGroup, 12 | ) -> float: 13 | raise NotImplementedError 14 | 15 | def sort_by_priority( 16 | self, 17 | now: float, 18 | seq_groups: List[SequenceGroup], 19 | ) -> List[SequenceGroup]: 20 | return sorted( 21 | seq_groups, 22 | key=lambda seq_group: self.get_priority(now, seq_group), 23 | reverse=True, 24 | ) 25 | 26 | 27 | class FCFS(Policy): 28 | 29 | def get_priority( 30 | self, 31 | now: float, 32 | seq_group: SequenceGroup, 33 | ) -> float: 34 | return now - seq_group.arrival_time 35 | 36 | 37 | class PolicyFactory: 38 | 39 | _POLICY_REGISTRY = { 40 | 'fcfs': FCFS, 41 | } 42 | 43 | @classmethod 44 
| def get_policy(cls, policy_name: str, **kwargs) -> Policy: 45 | return cls._POLICY_REGISTRY[policy_name](**kwargs) 46 | -------------------------------------------------------------------------------- /docs/source/getting_started/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============ 5 | 6 | vLLM is a Python library that also contains pre-compiled C++ and CUDA (11.8) binaries. 7 | 8 | Requirements 9 | ------------ 10 | 11 | * OS: Linux 12 | * Python: 3.8 -- 3.11 13 | * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, etc.) 14 | 15 | Install with pip 16 | ---------------- 17 | 18 | You can install vLLM using pip: 19 | 20 | .. code-block:: console 21 | 22 | $ # (Optional) Create a new conda environment. 23 | $ conda create -n myenv python=3.8 -y 24 | $ conda activate myenv 25 | 26 | $ # Install vLLM. 27 | $ pip install vllm 28 | 29 | 30 | .. _build_from_source: 31 | 32 | Build from source 33 | ----------------- 34 | 35 | You can also build and install vLLM from source: 36 | 37 | .. code-block:: console 38 | 39 | $ git clone https://github.com/vllm-project/vllm.git 40 | $ cd vllm 41 | $ pip install -e . # This may take 5-10 minutes. 42 | 43 | .. tip:: 44 | If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. 45 | 46 | .. code-block:: console 47 | 48 | $ # Pull the Docker image with CUDA 11.8. 49 | $ docker run --gpus all -it --rm --shm-size=8g nvcr.io/nvidia/pytorch:22.12-py3 50 | -------------------------------------------------------------------------------- /csrc/cache.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | void swap_blocks( 7 | torch::Tensor& src, 8 | torch::Tensor& dst, 9 | const std::map& block_mapping); 10 | 11 | void copy_blocks( 12 | std::vector& key_caches, 13 | std::vector& value_caches, 14 | const std::map>& block_mapping); 15 | 16 | void reshape_and_cache( 17 | torch::Tensor& key, 18 | torch::Tensor& value, 19 | torch::Tensor& key_cache, 20 | torch::Tensor& value_cache, 21 | torch::Tensor& slot_mapping); 22 | 23 | void gather_cached_kv( 24 | torch::Tensor& key, 25 | torch::Tensor& value, 26 | torch::Tensor& key_cache, 27 | torch::Tensor& value_cache, 28 | torch::Tensor& slot_mapping); 29 | 30 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 31 | m.def( 32 | "swap_blocks", 33 | &swap_blocks, 34 | "Swap in (out) the cache blocks from src to dst"); 35 | m.def( 36 | "copy_blocks", 37 | ©_blocks, 38 | "Copy the cache blocks from src to dst"); 39 | m.def( 40 | "reshape_and_cache", 41 | &reshape_and_cache, 42 | "Reshape the key and value tensors and cache them"); 43 | m.def( 44 | "gather_cached_kv", 45 | &gather_cached_kv, 46 | "Gather key and value from the cache into contiguous QKV tensors"); 47 | } 48 | -------------------------------------------------------------------------------- /vllm/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.models.aquila import AquilaForCausalLM 2 | from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM, 3 | BaichuanForCausalLM) 4 | from vllm.model_executor.models.bloom import BloomForCausalLM 5 | from vllm.model_executor.models.falcon import FalconForCausalLM 6 | from vllm.model_executor.models.gpt2 import GPT2LMHeadModel 7 | from vllm.model_executor.models.gpt_bigcode import 
GPTBigCodeForCausalLM 8 | from vllm.model_executor.models.gpt_j import GPTJForCausalLM 9 | from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM 10 | from vllm.model_executor.models.internlm import InternLMForCausalLM 11 | from vllm.model_executor.models.llama import LlamaForCausalLM 12 | from vllm.model_executor.models.mpt import MPTForCausalLM 13 | from vllm.model_executor.models.opt import OPTForCausalLM 14 | from vllm.model_executor.models.qwen import QWenLMHeadModel 15 | 16 | __all__ = [ 17 | "AquilaForCausalLM", 18 | "BaiChuanForCausalLM", 19 | "BaichuanForCausalLM", 20 | "BloomForCausalLM", 21 | "FalconForCausalLM", 22 | "GPT2LMHeadModel", 23 | "GPTBigCodeForCausalLM", 24 | "GPTJForCausalLM", 25 | "GPTNeoXForCausalLM", 26 | "InternLMForCausalLM", 27 | "LlamaForCausalLM", 28 | "MPTForCausalLM", 29 | "OPTForCausalLM", 30 | "QWenLMHeadModel", 31 | ] 32 | -------------------------------------------------------------------------------- /.github/workflows/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file installs common linux environment tools 4 | 5 | export LANG C.UTF-8 6 | 7 | # python_version=$1 8 | 9 | sudo apt-get update && \ 10 | sudo apt-get install -y --no-install-recommends \ 11 | software-properties-common \ 12 | 13 | sudo apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | apt-utils \ 16 | ca-certificates \ 17 | wget \ 18 | git \ 19 | vim \ 20 | libssl-dev \ 21 | curl \ 22 | unzip \ 23 | unrar \ 24 | cmake \ 25 | net-tools \ 26 | sudo \ 27 | autotools-dev \ 28 | rsync \ 29 | jq \ 30 | openssh-server \ 31 | tmux \ 32 | screen \ 33 | htop \ 34 | pdsh \ 35 | openssh-client \ 36 | lshw \ 37 | dmidecode \ 38 | util-linux \ 39 | automake \ 40 | autoconf \ 41 | libtool \ 42 | net-tools \ 43 | pciutils \ 44 | libpci-dev \ 45 | libaio-dev \ 46 | libcap2 \ 47 | libtinfo5 \ 48 | fakeroot \ 49 | devscripts \ 50 | debhelper \ 51 | nfs-common 52 | 53 | # Remove github bloat files to free up disk space 54 | sudo rm -rf "/usr/local/share/boost" 55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 56 | sudo rm -rf "/usr/share/dotnet" 57 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import pytest 4 | import torch 5 | 6 | 7 | def create_kv_caches( 8 | num_blocks: int, 9 | block_size: int, 10 | num_layers: int, 11 | num_heads: int, 12 | head_size: int, 13 | dtype: torch.dtype, 14 | seed: int, 15 | ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: 16 | torch.random.manual_seed(seed) 17 | torch.cuda.manual_seed(seed) 18 | 19 | scale = head_size**-0.5 20 | x = 16 // torch.tensor([], dtype=dtype).element_size() 21 | key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) 22 | key_caches = [] 23 | for _ in range(num_layers): 24 | key_cache = torch.empty(size=key_cache_shape, 25 | dtype=dtype, 26 | device='cuda') 27 | key_cache.uniform_(-scale, scale) 28 | key_caches.append(key_cache) 29 | 30 | value_cache_shape = (num_blocks, num_heads, head_size, block_size) 31 | value_caches = [] 32 | for _ in range(num_layers): 33 | value_cache = torch.empty(size=value_cache_shape, 34 | dtype=dtype, 35 | device='cuda') 36 | value_cache.uniform_(-scale, scale) 37 | value_caches.append(value_cache) 38 | return key_caches, value_caches 39 | 40 | 41 | @pytest.fixture() 42 | def kv_cache_factory(): 43 | return 
create_kv_caches 44 | -------------------------------------------------------------------------------- /tests/models/test_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/models/test_models.py --forked`. 4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "facebook/opt-125m", 9 | "gpt2", 10 | "bigcode/tiny_starcoder_py", 11 | "EleutherAI/gpt-j-6b", 12 | "EleutherAI/pythia-70m", 13 | "bigscience/bloom-560m", 14 | "mosaicml/mpt-7b", 15 | "tiiuae/falcon-7b", 16 | "meta-llama/Llama-2-7b-hf", 17 | ] 18 | 19 | 20 | @pytest.mark.parametrize("model", MODELS) 21 | @pytest.mark.parametrize("dtype", ["half"]) 22 | @pytest.mark.parametrize("max_tokens", [128]) 23 | def test_models( 24 | hf_runner, 25 | vllm_runner, 26 | example_prompts, 27 | model: str, 28 | dtype: str, 29 | max_tokens: int, 30 | ) -> None: 31 | hf_model = hf_runner(model, dtype=dtype) 32 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 33 | del hf_model 34 | 35 | vllm_model = vllm_runner(model, dtype=dtype) 36 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, hf_output_str = hf_outputs[i] 41 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 42 | assert hf_output_str == vllm_output_str, ( 43 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 44 | assert hf_output_ids == vllm_output_ids, ( 45 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 46 | -------------------------------------------------------------------------------- /vllm/transformers_utils/config.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from transformers import AutoConfig, PretrainedConfig 4 | 5 | from vllm.transformers_utils.configs import * # pylint: disable=wildcard-import 6 | 7 | _CONFIG_REGISTRY = { 8 | "mpt": MPTConfig, 9 | "baichuan": BaiChuanConfig, 10 | "aquila": AquilaConfig, 11 | "qwen": QWenConfig, 12 | "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) 13 | "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) 14 | } 15 | 16 | 17 | def get_config(model: str, 18 | trust_remote_code: bool, 19 | revision: Optional[str] = None) -> PretrainedConfig: 20 | try: 21 | config = AutoConfig.from_pretrained( 22 | model, trust_remote_code=trust_remote_code, revision=revision) 23 | except ValueError as e: 24 | if (not trust_remote_code and 25 | "requires you to execute the configuration file" in str(e)): 26 | err_msg = ( 27 | "Failed to load the model config. 
If the model is a custom " 28 | "model not yet available in the HuggingFace transformers " 29 | "library, consider setting `trust_remote_code=True` in LLM " 30 | "or using the `--trust-remote-code` flag in the CLI.") 31 | raise RuntimeError(err_msg) from e 32 | else: 33 | raise e 34 | if config.model_type in _CONFIG_REGISTRY: 35 | config_class = _CONFIG_REGISTRY[config.model_type] 36 | config = config_class.from_pretrained(model, revision=revision) 37 | return config 38 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .layers import ( 2 | ColumnParallelLinear, 3 | RowParallelLinear, 4 | VocabParallelEmbedding, 5 | set_tensor_model_parallel_attributes, 6 | set_defaults_if_not_set_tensor_model_parallel_attributes, 7 | copy_tensor_model_parallel_attributes, 8 | param_is_not_tensor_parallel_duplicate, 9 | ) 10 | 11 | from .mappings import ( 12 | copy_to_tensor_model_parallel_region, 13 | gather_from_tensor_model_parallel_region, 14 | gather_from_sequence_parallel_region, 15 | reduce_from_tensor_model_parallel_region, 16 | scatter_to_tensor_model_parallel_region, 17 | scatter_to_sequence_parallel_region, 18 | ) 19 | 20 | from .random import ( 21 | get_cuda_rng_tracker, 22 | model_parallel_cuda_manual_seed, 23 | ) 24 | 25 | from .utils import ( 26 | split_tensor_along_last_dim, 27 | ) 28 | 29 | __all__ = [ 30 | #layers.py 31 | "ColumnParallelLinear", 32 | "RowParallelLinear", 33 | "VocabParallelEmbedding", 34 | "set_tensor_model_parallel_attributes", 35 | "set_defaults_if_not_set_tensor_model_parallel_attributes", 36 | "copy_tensor_model_parallel_attributes", 37 | "param_is_not_tensor_parallel_duplicate", 38 | # mappings.py 39 | "copy_to_tensor_model_parallel_region", 40 | "gather_from_tensor_model_parallel_region", 41 | "gather_from_sequence_parallel_region", 42 | "reduce_from_tensor_model_parallel_region", 43 | "scatter_to_tensor_model_parallel_region", 44 | "scatter_to_sequence_parallel_region", 45 | # random.py 46 | "get_cuda_rng_tracker", 47 | "model_parallel_cuda_manual_seed", 48 | # utils.py 49 | "split_tensor_along_last_dim", 50 | ] 51 | -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using beam search. 2 | 3 | Run `pytest tests/samplers/test_beam_search.py --forked`. 4 | """ 5 | import pytest 6 | 7 | # FIXME(zhuohan): The test can not pass if we: 8 | # 1. Increase max_tokens to 256. 9 | # 2. Increase beam_width to 8. 10 | # 3. Use the model "huggyllama/llama-7b". 
11 | MAX_TOKENS = [128] 12 | BEAM_WIDTHS = [4] 13 | MODELS = ["facebook/opt-125m"] 14 | 15 | 16 | @pytest.mark.parametrize("model", MODELS) 17 | @pytest.mark.parametrize("dtype", ["half"]) 18 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS) 19 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) 20 | def test_beam_search_single_input( 21 | hf_runner, 22 | vllm_runner, 23 | example_prompts, 24 | model: str, 25 | dtype: str, 26 | max_tokens: int, 27 | beam_width: int, 28 | ) -> None: 29 | hf_model = hf_runner(model, dtype=dtype) 30 | hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, 31 | max_tokens) 32 | del hf_model 33 | 34 | vllm_model = vllm_runner(model, dtype=dtype) 35 | vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, 36 | max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, _ = hf_outputs[i] 41 | vllm_output_ids, _ = vllm_outputs[i] 42 | assert len(hf_output_ids) == len(vllm_output_ids) 43 | for j in range(len(hf_output_ids)): 44 | assert hf_output_ids[j] == vllm_output_ids[j], ( 45 | f"Test{i} output{j}:\nHF: {hf_output_ids}\n" 46 | f"vLLM: {vllm_output_ids}") 47 | -------------------------------------------------------------------------------- /tests/async_engine/api_server_async_engine.py: -------------------------------------------------------------------------------- 1 | """vllm.entrypoints.api_server with some extra logging for testing.""" 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | import uvicorn 6 | from fastapi.responses import JSONResponse, Response 7 | 8 | import vllm.entrypoints.api_server 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | 12 | app = vllm.entrypoints.api_server.app 13 | 14 | 15 | class AsyncLLMEngineWithStats(AsyncLLMEngine): 16 | 17 | def __init__(self, *args, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self._num_aborts = 0 20 | 21 | async def abort(self, request_id: str) -> None: 22 | await super().abort(request_id) 23 | self._num_aborts += 1 24 | 25 | def testing_stats(self) -> Dict[str, Any]: 26 | return {"num_aborted_requests": self._num_aborts} 27 | 28 | 29 | @app.get("/stats") 30 | def stats() -> Response: 31 | """Get the statistics of the engine.""" 32 | return JSONResponse(engine.testing_stats()) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--host", type=str, default="localhost") 38 | parser.add_argument("--port", type=int, default=8000) 39 | parser = AsyncEngineArgs.add_cli_args(parser) 40 | args = parser.parse_args() 41 | 42 | engine_args = AsyncEngineArgs.from_cli_args(args) 43 | engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) 44 | vllm.entrypoints.api_server.engine = engine 45 | uvicorn.run( 46 | app, 47 | host=args.host, 48 | port=args.port, 49 | log_level="debug", 50 | timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) 51 | -------------------------------------------------------------------------------- /docs/source/serving/distributed_serving.rst: -------------------------------------------------------------------------------- 1 | .. _distributed_serving: 2 | 3 | Distributed Inference and Serving 4 | ================================= 5 | 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with `Ray `_. 
To run distributed inference, install Ray with:
7 | 
8 | .. code-block:: console
9 | 
10 |     $ pip install ray
11 | 
12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
13 | 
14 | .. code-block:: python
15 | 
16 |     from vllm import LLM
17 |     llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
18 |     output = llm.generate("San Francisco is a")
19 | 
20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run the API server on 4 GPUs:
21 | 
22 | .. code-block:: console
23 | 
24 |     $ python -m vllm.entrypoints.api_server \
25 |     $     --model facebook/opt-13b \
26 |     $     --tensor-parallel-size 4
27 | 
28 | To scale vLLM beyond a single machine, start a `Ray runtime `_ via CLI before running vLLM:
29 | 
30 | .. code-block:: console
31 | 
32 |     $ # On head node
33 |     $ ray start --head
34 | 
35 |     $ # On worker nodes
36 |     $ ray start --address=
37 | 
38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node and setting :code:`tensor_parallel_size` to the total number of GPUs across all machines.
--------------------------------------------------------------------------------
/csrc/reduction_utils.cuh:
--------------------------------------------------------------------------------
1 | /*
2 |  * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
3 |  * Copyright (c) 2023, The vLLM team.
4 |  * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License");
7 |  * you may not use this file except in compliance with the License.
8 |  * You may obtain a copy of the License at
9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | #pragma once
19 | 
20 | namespace vllm {
21 | 
22 | template<typename T>
23 | __inline__ __device__ T warpReduceSum(T val) {
24 | #pragma unroll
25 |   for (int mask = 16; mask > 0; mask >>= 1)
26 |     val += __shfl_xor_sync(0xffffffff, val, mask, 32);
27 |   return val;
28 | }
29 | 
30 | /* Calculate the sum of all elements in a block */
31 | template<typename T>
32 | __inline__ __device__ T blockReduceSum(T val) {
33 |   static __shared__ T shared[32];
34 |   int lane = threadIdx.x & 0x1f;
35 |   int wid = threadIdx.x >> 5;
36 | 
37 |   val = warpReduceSum(val);
38 | 
39 |   if (lane == 0)
40 |     shared[wid] = val;
41 | 
42 |   __syncthreads();
43 | 
44 |   // Changed from blockDim.x >> 5 to blockDim.x / 32.f so the guard is correct
45 |   // when blockDim.x is not divisible by 32
46 |   val = (threadIdx.x < (blockDim.x / 32.f)) ?
shared[lane] : (T)(0.0f); 47 | val = warpReduceSum(val); 48 | return val; 49 | } 50 | 51 | } // namespace vllm 52 | -------------------------------------------------------------------------------- /vllm/logger.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py 3 | """Logging configuration for vLLM.""" 4 | import logging 5 | import sys 6 | 7 | _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" 8 | _DATE_FORMAT = "%m-%d %H:%M:%S" 9 | 10 | 11 | class NewLineFormatter(logging.Formatter): 12 | """Adds logging prefix to newlines to align multi-line messages.""" 13 | 14 | def __init__(self, fmt, datefmt=None): 15 | logging.Formatter.__init__(self, fmt, datefmt) 16 | 17 | def format(self, record): 18 | msg = logging.Formatter.format(self, record) 19 | if record.message != "": 20 | parts = msg.split(record.message) 21 | msg = msg.replace("\n", "\r\n" + parts[0]) 22 | return msg 23 | 24 | 25 | _root_logger = logging.getLogger("vllm") 26 | _default_handler = None 27 | 28 | 29 | def _setup_logger(): 30 | _root_logger.setLevel(logging.DEBUG) 31 | global _default_handler 32 | if _default_handler is None: 33 | _default_handler = logging.StreamHandler(sys.stdout) 34 | _default_handler.flush = sys.stdout.flush # type: ignore 35 | _default_handler.setLevel(logging.INFO) 36 | _root_logger.addHandler(_default_handler) 37 | fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) 38 | _default_handler.setFormatter(fmt) 39 | # Setting this will avoid the message 40 | # being propagated to the parent logger. 41 | _root_logger.propagate = False 42 | 43 | 44 | # The logger is initialized when the module is imported. 45 | # This is thread-safe as the module is only imported once, 46 | # guaranteed by the Python GIL. 47 | _setup_logger() 48 | 49 | 50 | def init_logger(name: str): 51 | return logging.getLogger(name) 52 | -------------------------------------------------------------------------------- /tests/engine/test_detokenize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import detokenize_incrementally 6 | 7 | TRUTH = [ 8 | "Hello here, this is a simple test", 9 | "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 
It is designed to be used in production environments, where inference and serving", 10 | "我很感谢你的热情" 11 | ] 12 | TOKENIZERS = [ 13 | "facebook/opt-125m", 14 | "gpt2", 15 | "bigcode/tiny_starcoder_py", 16 | "EleutherAI/gpt-j-6b", 17 | "EleutherAI/pythia-70m", 18 | "bigscience/bloom-560m", 19 | "mosaicml/mpt-7b", 20 | "tiiuae/falcon-7b", 21 | "meta-llama/Llama-2-7b-hf", 22 | "codellama/CodeLlama-7b-hf", 23 | ] 24 | 25 | 26 | def _run_incremental_decode(tokenizer, all_input_ids): 27 | decoded_text = "" 28 | offset = 0 29 | token_offset = 0 30 | prev_tokens = None 31 | for i in range(len(all_input_ids)): 32 | new_tokens, text, offset, token_offset = detokenize_incrementally( 33 | tokenizer, 34 | all_input_ids[:i + 1], 35 | prev_tokens, 36 | offset, 37 | token_offset, 38 | skip_special_tokens=False) 39 | decoded_text += text 40 | if prev_tokens is None: 41 | prev_tokens = new_tokens 42 | else: 43 | prev_tokens += new_tokens 44 | return decoded_text 45 | 46 | 47 | @pytest.mark.parametrize("truth", TRUTH) 48 | @pytest.mark.parametrize("tokenizer_id", TOKENIZERS) 49 | def test_decode_streaming(tokenizer_id, truth): 50 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) 51 | all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"] 52 | 53 | decoded_text = _run_incremental_decode(tokenizer, all_input_ids) 54 | 55 | assert decoded_text == truth 56 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/activation.py: -------------------------------------------------------------------------------- 1 | """Custom activation functions.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import activation_ops 6 | 7 | 8 | class SiluAndMul(nn.Module): 9 | """An activation function for SwiGLU. 10 | 11 | The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2. 
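For reference, the SwiGLU formula in the docstring above can be written directly in eager PyTorch. The sketch below is an illustrative aside (not part of the repository file), assuming a 2-D input of shape (num_tokens, 2 * d); the custom `activation_ops.silu_and_mul` kernel is expected to produce the same result.

```python
import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half, apply SiLU to the first half,
    # and multiply element-wise by the second half.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(4, 8)            # 4 tokens, 2 * d = 8
out = silu_and_mul_reference(x)  # shape: (4, 4)
```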
12 | 13 | Shapes: 14 | x: (num_tokens, 2 * d) 15 | return: (num_tokens, d) 16 | """ 17 | 18 | def forward(self, x: torch.Tensor) -> torch.Tensor: 19 | num_tokens = x.shape[0] 20 | d = x.shape[1] // 2 21 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 22 | activation_ops.silu_and_mul(out, x) 23 | return out 24 | 25 | 26 | class NewGELU(nn.Module): 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | num_tokens = x.shape[0] 30 | d = x.shape[1] 31 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 32 | activation_ops.gelu_new(out, x) 33 | return out 34 | 35 | 36 | class FastGELU(nn.Module): 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | num_tokens = x.shape[0] 40 | d = x.shape[1] 41 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 42 | activation_ops.gelu_fast(out, x) 43 | return out 44 | 45 | 46 | _ACTIVATION_REGISTRY = { 47 | "gelu": nn.GELU(), 48 | "gelu_fast": FastGELU(), 49 | "gelu_new": NewGELU(), 50 | "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), 51 | "relu": nn.ReLU(), 52 | } 53 | 54 | 55 | def get_act_fn(act_fn: str) -> nn.Module: 56 | """Get an activation function by name.""" 57 | act_fn = act_fn.lower() 58 | if act_fn in _ACTIVATION_REGISTRY: 59 | return _ACTIVATION_REGISTRY[act_fn] 60 | raise ValueError(f"Activation function {act_fn!r} is not supported.") 61 | -------------------------------------------------------------------------------- /tests/async_engine/test_request_tracker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.async_llm_engine import RequestTracker 4 | from vllm.outputs import RequestOutput 5 | 6 | 7 | def test_request_tracker(): 8 | tracker = RequestTracker() 9 | stream_1 = tracker.add_request("1") 10 | new, finished = tracker.get_new_and_finished_requests() 11 | assert len(new) == 1 12 | assert new[0]["request_id"] == "1" 13 | assert not finished 14 | assert not stream_1.finished 15 | 16 | stream_2 = tracker.add_request("2") 17 | stream_3 = tracker.add_request("3") 18 | new, finished = tracker.get_new_and_finished_requests() 19 | assert len(new) == 2 20 | assert new[0]["request_id"] == "2" 21 | assert new[1]["request_id"] == "3" 22 | assert not finished 23 | assert not stream_2.finished 24 | assert not stream_3.finished 25 | 26 | # request_ids must be unique 27 | with pytest.raises(KeyError): 28 | tracker.add_request("1") 29 | 30 | tracker.abort_request("1") 31 | new, finished = tracker.get_new_and_finished_requests() 32 | assert len(finished) == 1 33 | assert "1" in finished 34 | assert not new 35 | assert stream_1.finished 36 | 37 | stream_4 = tracker.add_request("4") 38 | tracker.abort_request("4") 39 | new, finished = tracker.get_new_and_finished_requests() 40 | assert len(finished) == 1 41 | assert "4" in finished 42 | assert not new 43 | assert stream_4.finished 44 | 45 | stream_5 = tracker.add_request("5") 46 | tracker.process_request_output( 47 | RequestOutput("2", "output", [], [], finished=True)) 48 | new, finished = tracker.get_new_and_finished_requests() 49 | assert len(finished) == 1 50 | assert "2" in finished 51 | assert len(new) == 1 52 | assert new[0]["request_id"] == "5" 53 | assert stream_2.finished 54 | assert not stream_5.finished 55 | -------------------------------------------------------------------------------- /examples/gradio_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | 
import gradio as gr 5 | import requests 6 | 7 | 8 | def http_bot(prompt): 9 | headers = {"User-Agent": "vLLM Client"} 10 | pload = { 11 | "prompt": prompt, 12 | "stream": True, 13 | "max_tokens": 128, 14 | } 15 | response = requests.post(args.model_url, 16 | headers=headers, 17 | json=pload, 18 | stream=True) 19 | 20 | for chunk in response.iter_lines(chunk_size=8192, 21 | decode_unicode=False, 22 | delimiter=b"\0"): 23 | if chunk: 24 | data = json.loads(chunk.decode("utf-8")) 25 | output = data["text"][0] 26 | yield output 27 | 28 | 29 | def build_demo(): 30 | with gr.Blocks() as demo: 31 | gr.Markdown("# vLLM text completion demo\n") 32 | inputbox = gr.Textbox(label="Input", 33 | placeholder="Enter text and press ENTER") 34 | outputbox = gr.Textbox(label="Output", 35 | placeholder="Generated result from the model") 36 | inputbox.submit(http_bot, [inputbox], [outputbox]) 37 | return demo 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", type=str, default="localhost") 43 | parser.add_argument("--port", type=int, default=8001) 44 | parser.add_argument("--model-url", 45 | type=str, 46 | default="http://localhost:8000/generate") 47 | args = parser.parse_args() 48 | 49 | demo = build_demo() 50 | demo.queue(concurrency_count=100).launch(server_name=args.host, 51 | server_port=args.port, 52 | share=True) 53 | -------------------------------------------------------------------------------- /examples/llm_engine_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from vllm import EngineArgs, LLMEngine, SamplingParams 4 | 5 | 6 | def main(args: argparse.Namespace): 7 | # Parse the CLI argument and initialize the engine. 8 | engine_args = EngineArgs.from_cli_args(args) 9 | engine = LLMEngine.from_engine_args(engine_args) 10 | 11 | # Test the following prompts. 12 | test_prompts = [ 13 | ("A robot may not injure a human being", 14 | SamplingParams(temperature=0.0)), 15 | ("To be or not to be,", 16 | SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), 17 | ("What is the meaning of life?", 18 | SamplingParams(n=2, 19 | best_of=5, 20 | temperature=0.8, 21 | top_p=0.95, 22 | frequency_penalty=0.1)), 23 | ("It is only with the heart that one can see rightly", 24 | SamplingParams(n=3, best_of=3, use_beam_search=True, 25 | temperature=0.0)), 26 | ] 27 | 28 | # Run the engine by calling `engine.step()` manually. 29 | request_id = 0 30 | while True: 31 | # To test continuous batching, we add one request at each step. 
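        # (Each call to engine.step() runs one scheduling and decoding iteration
        #  over all in-flight sequences; requests added here join the running
        #  batch on the next step, which is what makes the batching "continuous".)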
32 | if test_prompts: 33 | prompt, sampling_params = test_prompts.pop(0) 34 | engine.add_request(str(request_id), prompt, sampling_params) 35 | request_id += 1 36 | 37 | request_outputs = engine.step() 38 | for request_output in request_outputs: 39 | if request_output.finished: 40 | print(request_output) 41 | 42 | if not (engine.has_unfinished_requests() or test_prompts): 43 | break 44 | 45 | 46 | if __name__ == '__main__': 47 | parser = argparse.ArgumentParser( 48 | description='Demo on using the LLMEngine class directly') 49 | parser = EngineArgs.add_cli_args(parser) 50 | args = parser.parse_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /csrc/attention/attention_generic.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include 21 | 22 | namespace vllm { 23 | 24 | // A vector type to store Q, K, V elements. 25 | template 26 | struct Vec {}; 27 | 28 | // A vector type to store FP32 accumulators. 29 | template 30 | struct FloatVec {}; 31 | 32 | // Template vector operations. 
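// (These primary templates are intentionally left unimplemented here; the dtype-specific
//  headers provide the specializations, e.g. dtype_float32.cuh maps Vec<float, 4> to float4
//  so that Q/K/V elements can be loaded and multiplied as wide vectors.)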
33 | template 34 | inline __device__ Acc mul(A a, B b); 35 | 36 | template 37 | inline __device__ float sum(T v); 38 | 39 | template 40 | inline __device__ float dot(T a, T b) { 41 | return sum(mul(a, b)); 42 | } 43 | 44 | template 45 | inline __device__ float dot(T a, T b) { 46 | return sum(mul(a, b)); 47 | } 48 | 49 | template 50 | inline __device__ void zero(T& dst) { 51 | constexpr int WORDS = sizeof(T) / 4; 52 | union { 53 | T raw; 54 | uint32_t words[WORDS]; 55 | } tmp; 56 | 57 | #pragma unroll 58 | for (int ii = 0; ii < WORDS; ++ii) { 59 | tmp.words[ii] = 0u; 60 | } 61 | dst = tmp.raw; 62 | } 63 | 64 | } // namespace vllm 65 | -------------------------------------------------------------------------------- /tests/kernels/test_layernorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import layernorm_ops 6 | 7 | DTYPES = [torch.half, torch.bfloat16, torch.float] 8 | HIDDEN_SIZES = [67, 768, 2048, 5120, 8192] # Arbitrary values for testing 9 | NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing 10 | SEEDS = [0] 11 | 12 | 13 | class RefRMSNorm(nn.Module): 14 | 15 | def __init__(self, hidden_size, eps=1e-6): 16 | super().__init__() 17 | weight = torch.empty(hidden_size) 18 | weight.normal_(mean=1.0, std=0.1) 19 | self.weight = nn.Parameter(weight) 20 | self.variance_epsilon = eps 21 | 22 | def forward(self, hidden_states): 23 | input_dtype = hidden_states.dtype 24 | hidden_states = hidden_states.to(torch.float32) 25 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 26 | hidden_states = hidden_states * torch.rsqrt(variance + 27 | self.variance_epsilon) 28 | return self.weight * hidden_states.to(input_dtype) 29 | 30 | 31 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 32 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 33 | @pytest.mark.parametrize("dtype", DTYPES) 34 | @pytest.mark.parametrize("seed", SEEDS) 35 | @torch.inference_mode() 36 | def test_rms_norm( 37 | num_tokens: int, 38 | hidden_size: int, 39 | dtype: torch.dtype, 40 | seed: int, 41 | ) -> None: 42 | torch.random.manual_seed(seed) 43 | torch.cuda.manual_seed(seed) 44 | 45 | scale = float(hidden_size**-0.5) 46 | x = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda") 47 | x.uniform_(-scale, scale) 48 | ref = RefRMSNorm(hidden_size).to(dtype).cuda() 49 | 50 | out = torch.empty_like(x) 51 | layernorm_ops.rms_norm( 52 | out, 53 | x, 54 | ref.weight.data, 55 | ref.variance_epsilon, 56 | ) 57 | ref_out = ref(x) 58 | assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-5) 59 | -------------------------------------------------------------------------------- /csrc/attention/attention_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include "attention_dtypes.h" 21 | 22 | #include 23 | #include 24 | 25 | namespace vllm { 26 | 27 | // Q*K^T operation. 28 | template 29 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { 30 | using A_vec = typename FloatVec::Type; 31 | // Compute the parallel products for Q*K^T (treat vector lanes separately). 32 | A_vec qk_vec = mul(q[0], k[0]); 33 | #pragma unroll 34 | for (int ii = 1; ii < N; ++ii) { 35 | qk_vec = fma(q[ii], k[ii], qk_vec); 36 | } 37 | 38 | // Finalize the reduction across lanes. 39 | float qk = sum(qk_vec); 40 | #pragma unroll 41 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { 42 | qk += __shfl_xor_sync(uint32_t(-1), qk, mask); 43 | } 44 | return qk; 45 | } 46 | 47 | template 48 | struct Qk_dot { 49 | template 50 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { 51 | return qk_dot_(q, k); 52 | } 53 | }; 54 | 55 | } // namespace vllm 56 | -------------------------------------------------------------------------------- /vllm/block.py: -------------------------------------------------------------------------------- 1 | """Token blocks.""" 2 | from typing import List 3 | 4 | from vllm.utils import Device 5 | 6 | _BLANK_TOKEN_ID = -1 7 | 8 | 9 | class LogicalTokenBlock: 10 | """A block that stores a contiguous chunk of tokens from left to right. 11 | 12 | Logical blocks are used to represent the states of the corresponding 13 | physical blocks in the KV cache. 
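    Token IDs are appended left to right until the block's fixed number of
    slots is used up; unfilled slots hold the blank token ID (-1).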
14 | """ 15 | 16 | def __init__( 17 | self, 18 | block_number: int, 19 | block_size: int, 20 | ) -> None: 21 | self.block_number = block_number 22 | self.block_size = block_size 23 | 24 | self.token_ids = [_BLANK_TOKEN_ID] * block_size 25 | self.num_tokens = 0 26 | 27 | def is_empty(self) -> bool: 28 | return self.num_tokens == 0 29 | 30 | def get_num_empty_slots(self) -> int: 31 | return self.block_size - self.num_tokens 32 | 33 | def is_full(self) -> bool: 34 | return self.num_tokens == self.block_size 35 | 36 | def append_tokens(self, token_ids: List[int]) -> None: 37 | assert len(token_ids) <= self.get_num_empty_slots() 38 | curr_idx = self.num_tokens 39 | self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids 40 | self.num_tokens += len(token_ids) 41 | 42 | def get_token_ids(self) -> List[int]: 43 | return self.token_ids[:self.num_tokens] 44 | 45 | def get_last_token_id(self) -> int: 46 | assert self.num_tokens > 0 47 | return self.token_ids[self.num_tokens - 1] 48 | 49 | 50 | class PhysicalTokenBlock: 51 | """Represents the state of a block in the KV cache.""" 52 | 53 | def __init__( 54 | self, 55 | device: Device, 56 | block_number: int, 57 | block_size: int, 58 | ) -> None: 59 | self.device = device 60 | self.block_number = block_number 61 | self.block_size = block_size 62 | 63 | self.ref_count = 0 64 | 65 | def __repr__(self) -> str: 66 | return (f'PhysicalTokenBlock(device={self.device}, ' 67 | f'block_number={self.block_number}, ' 68 | f'ref_count={self.ref_count})') 69 | -------------------------------------------------------------------------------- /csrc/layernorm_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | #include "reduction_utils.cuh" 6 | 7 | namespace vllm { 8 | 9 | // TODO(woosuk): Further optimize this kernel. 
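// The kernel below uses one thread block per token (row): each block accumulates the
// sum of squares over the hidden dimension, reduces it with blockReduceSum, and then
// rescales the row by rsqrt(mean_square + epsilon) times the learned weight.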
10 | template<typename scalar_t> 11 | __global__ void rms_norm_kernel( 12 | scalar_t* __restrict__ out, // [num_tokens, hidden_size] 13 | const scalar_t* __restrict__ input, // [num_tokens, hidden_size] 14 | const scalar_t* __restrict__ weight, // [hidden_size] 15 | const float epsilon, 16 | const int num_tokens, 17 | const int hidden_size) { 18 | __shared__ float s_variance; 19 | float variance = 0.0f; 20 | 21 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 22 | const float x = (float) input[blockIdx.x * hidden_size + idx]; 23 | variance += x * x; 24 | } 25 | variance = blockReduceSum<float>(variance); 26 | if (threadIdx.x == 0) { 27 | s_variance = rsqrtf(variance / hidden_size + epsilon); 28 | } 29 | __syncthreads(); 30 | 31 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 32 | float x = (float) input[blockIdx.x * hidden_size + idx]; 33 | out[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; 34 | } 35 | } 36 | 37 | } // namespace vllm 38 | 39 | void rms_norm( 40 | torch::Tensor& out, // [num_tokens, hidden_size] 41 | torch::Tensor& input, // [num_tokens, hidden_size] 42 | torch::Tensor& weight, // [hidden_size] 43 | float epsilon) { 44 | int num_tokens = input.size(0); 45 | int hidden_size = input.size(1); 46 | 47 | dim3 grid(num_tokens); 48 | dim3 block(std::min(hidden_size, 1024)); 49 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 50 | VLLM_DISPATCH_FLOATING_TYPES( 51 | input.scalar_type(), 52 | "rms_norm_kernel", 53 | [&] { 54 | vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>( 55 | out.data_ptr<scalar_t>(), 56 | input.data_ptr<scalar_t>(), 57 | weight.data_ptr<scalar_t>(), 58 | epsilon, 59 | num_tokens, 60 | hidden_size); 61 | }); 62 | } 63 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to vLLM! 2 | ================ 3 | 4 | .. figure:: ./assets/logos/vllm-logo-text-light.png 5 | :width: 60% 6 | :align: center 7 | :alt: vLLM 8 | :class: no-scaled-link 9 | 10 | .. raw:: html 11 | 12 |
13 | Easy, fast, and cheap LLM serving for everyone 14 | 15 | 16 | 17 | 18 | 19 | Star 20 | Watch 21 | Fork 22 |
23 | 24 | 25 | 26 | vLLM is a fast and easy-to-use library for LLM inference and serving. 27 | 28 | vLLM is fast with: 29 | 30 | * State-of-the-art serving throughput 31 | * Efficient management of attention key and value memory with **PagedAttention** 32 | * Continuous batching of incoming requests 33 | * Optimized CUDA kernels 34 | 35 | vLLM is flexible and easy to use with: 36 | 37 | * Seamless integration with popular HuggingFace models 38 | * High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more 39 | * Tensor parallelism support for distributed inference 40 | * Streaming outputs 41 | * OpenAI-compatible API server 42 | 43 | For more information, check out the following: 44 | 45 | * `vLLM announcing blog post `_ (intro to PagedAttention) 46 | * `vLLM paper `_ (SOSP 2023) 47 | * `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency `_ by Cade Daniel et al. 48 | 49 | 50 | 51 | Documentation 52 | ------------- 53 | 54 | .. toctree:: 55 | :maxdepth: 1 56 | :caption: Getting Started 57 | 58 | getting_started/installation 59 | getting_started/quickstart 60 | 61 | .. toctree:: 62 | :maxdepth: 1 63 | :caption: Serving 64 | 65 | serving/distributed_serving 66 | serving/run_on_sky 67 | 68 | .. toctree:: 69 | :maxdepth: 1 70 | :caption: Models 71 | 72 | models/supported_models 73 | models/adding_model 74 | -------------------------------------------------------------------------------- /docs/source/serving/run_on_sky.rst: -------------------------------------------------------------------------------- 1 | .. _on_cloud: 2 | 3 | Running on clouds with SkyPilot 4 | =============================== 5 | 6 | .. raw:: html 7 | 8 |

9 | vLLM 10 |
11 | 12 | vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot `__, an open-source framework for running LLMs on any cloud. 13 | 14 | To install SkyPilot and set up your cloud credentials, run: 15 | 16 | .. code-block:: console 17 | 18 | $ pip install skypilot 19 | $ sky check 20 | 21 | See the vLLM SkyPilot YAML for serving, `serving.yaml `__. 22 | 23 | .. code-block:: yaml 24 | 25 | resources: 26 | accelerators: A100 27 | 28 | envs: 29 | MODEL_NAME: decapoda-research/llama-13b-hf 30 | TOKENIZER: hf-internal-testing/llama-tokenizer 31 | 32 | setup: | 33 | conda create -n vllm python=3.9 -y 34 | conda activate vllm 35 | git clone https://github.com/vllm-project/vllm.git 36 | cd vllm 37 | pip install . 38 | pip install gradio 39 | 40 | run: | 41 | conda activate vllm 42 | echo 'Starting vllm api server...' 43 | python -u -m vllm.entrypoints.api_server \ 44 | --model $MODEL_NAME \ 45 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 46 | --tokenizer $TOKENIZER 2>&1 | tee api_server.log & 47 | echo 'Waiting for vllm api server to start...' 48 | while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done 49 | echo 'Starting gradio server...' 50 | python vllm/examples/gradio_webserver.py 51 | 52 | Start serving the LLaMA-13B model on an A100 GPU: 53 | 54 | .. code-block:: console 55 | 56 | $ sky launch serving.yaml 57 | 58 | Check the output of the command. There will be a shareable Gradio link (like the last line of the following). Open it in your browser to use the LLaMA model for text completion. 59 | 60 | .. code-block:: console 61 | 62 | (task, pid=7431) Running on public URL: https://.gradio.live 63 | 64 | **Optional**: Serve the 65B model instead of the default 13B and use more GPUs: 65 | 66 | .. code-block:: console 67 | 68 | sky launch -c vllm-serve-new -s serving.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf 69 | 70 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/baichuan.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License.
20 | 21 | from transformers.configuration_utils import PretrainedConfig 22 | 23 | 24 | class BaiChuanConfig(PretrainedConfig): 25 | model_type = "baichuan" 26 | keys_to_ignore_at_inference = ["past_key_values"] 27 | 28 | def __init__( 29 | self, 30 | vocab_size=64000, 31 | hidden_size=4096, 32 | intermediate_size=11008, 33 | num_hidden_layers=32, 34 | num_attention_heads=32, 35 | hidden_act="silu", 36 | max_position_embeddings=4096, 37 | initializer_range=0.02, 38 | rms_norm_eps=1e-6, 39 | use_cache=True, 40 | pad_token_id=0, 41 | bos_token_id=1, 42 | eos_token_id=2, 43 | tie_word_embeddings=False, 44 | **kwargs, 45 | ): 46 | self.vocab_size = vocab_size 47 | self.max_position_embeddings = max_position_embeddings 48 | self.hidden_size = hidden_size 49 | self.intermediate_size = intermediate_size 50 | self.num_hidden_layers = num_hidden_layers 51 | self.num_attention_heads = num_attention_heads 52 | self.hidden_act = hidden_act 53 | self.initializer_range = initializer_range 54 | self.rms_norm_eps = rms_norm_eps 55 | self.use_cache = use_cache 56 | super().__init__( 57 | pad_token_id=pad_token_id, 58 | bos_token_id=bos_token_id, 59 | eos_token_id=eos_token_id, 60 | tie_word_embeddings=tie_word_embeddings, 61 | **kwargs, 62 | ) 63 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/aquila.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
20 | """ Aquila model configuration""" 21 | 22 | from transformers import PretrainedConfig 23 | 24 | 25 | class AquilaConfig(PretrainedConfig): 26 | model_type = "aquila" 27 | keys_to_ignore_at_inference = ["past_key_values"] 28 | 29 | def __init__( 30 | self, 31 | vocab_size=100008, 32 | hidden_size=4096, 33 | intermediate_size=11008, 34 | num_hidden_layers=32, 35 | num_attention_heads=32, 36 | hidden_act="silu", 37 | max_position_embeddings=2048, 38 | initializer_range=0.006, 39 | rms_norm_eps=1e-5, 40 | use_cache=True, 41 | pad_token_id=0, 42 | bos_token_id=1, 43 | eos_token_id=2, 44 | tie_word_embeddings=False, 45 | **kwargs, 46 | ): 47 | self.vocab_size = vocab_size 48 | self.max_position_embeddings = max_position_embeddings 49 | self.hidden_size = hidden_size 50 | self.intermediate_size = intermediate_size 51 | self.num_hidden_layers = num_hidden_layers 52 | self.num_attention_heads = num_attention_heads 53 | self.hidden_act = hidden_act 54 | self.initializer_range = initializer_range 55 | self.rms_norm_eps = rms_norm_eps 56 | self.use_cache = use_cache 57 | super().__init__( 58 | pad_token_id=pad_token_id, 59 | bos_token_id=bos_token_id, 60 | eos_token_id=eos_token_id, 61 | tie_word_embeddings=tie_word_embeddings, 62 | **kwargs, 63 | ) 64 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'vLLM' 21 | copyright = '2023, vLLM Team' 22 | author = 'the vLLM Team' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.napoleon", 32 | "sphinx.ext.viewcode", 33 | "sphinx.ext.intersphinx", 34 | "sphinx_copybutton", 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns = [] 44 | 45 | # Exclude the prompt "$" when copying code 46 | copybutton_prompt_text = r"\$ " 47 | copybutton_prompt_is_regexp = True 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 
53 | # 54 | html_title = project 55 | html_theme = 'sphinx_book_theme' 56 | html_logo = 'assets/logos/vllm-logo-text-light.png' 57 | html_theme_options = { 58 | 'logo_only': True, 59 | 'path_to_docs': 'docs/source', 60 | 'repository_url': 'https://github.com/vllm-project/vllm', 61 | 'use_repository_button': True, 62 | } 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | html_static_path = ['_static'] 68 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/mpt.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py 3 | from typing import Any, Dict, Optional, Union 4 | 5 | from transformers import PretrainedConfig 6 | 7 | _ATTN_CONFIG_DEFAULTS = { 8 | "attn_type": "multihead_attention", 9 | "attn_pdrop": 0.0, 10 | "attn_impl": "triton", 11 | "qk_ln": False, 12 | "clip_qkv": None, 13 | "softmax_scale": None, 14 | "prefix_lm": False, 15 | "attn_uses_sequence_id": False, 16 | "alibi": False, 17 | "alibi_bias_max": 8, 18 | } 19 | 20 | 21 | class MPTConfig(PretrainedConfig): 22 | model_type = "mpt" 23 | attribute_map = { 24 | "hidden_size": "d_model", 25 | "num_attention_heads": "n_heads", 26 | "num_hidden_layers": "n_layers", 27 | } 28 | 29 | def __init__( 30 | self, 31 | d_model: int = 2048, 32 | n_heads: int = 16, 33 | n_layers: int = 24, 34 | expansion_ratio: int = 4, 35 | max_seq_len: int = 2048, 36 | vocab_size: int = 50368, 37 | resid_pdrop: float = 0.0, 38 | emb_pdrop: float = 0.0, 39 | learned_pos_emb: bool = True, 40 | attn_config: Optional[Dict[str, Any]] = None, 41 | init_device: str = "cpu", 42 | logit_scale: Optional[Union[float, str]] = None, 43 | no_bias: bool = False, 44 | verbose: int = 0, 45 | embedding_fraction: float = 1.0, 46 | norm_type: str = "low_precision_layernorm", 47 | use_cache: bool = False, 48 | **kwargs, 49 | ) -> None: 50 | self.d_model = d_model 51 | self.n_heads = n_heads 52 | self.n_layers = n_layers 53 | self.expansion_ratio = expansion_ratio 54 | self.max_seq_len = max_seq_len 55 | self.vocab_size = vocab_size 56 | self.resid_pdrop = resid_pdrop 57 | self.emb_pdrop = emb_pdrop 58 | self.learned_pos_emb = learned_pos_emb 59 | if attn_config is None: 60 | self.attn_config = _ATTN_CONFIG_DEFAULTS 61 | else: 62 | self.attn_config = attn_config 63 | self.init_device = init_device 64 | self.logit_scale = logit_scale 65 | self.no_bias = no_bias 66 | self.verbose = verbose 67 | self.embedding_fraction = embedding_fraction 68 | self.norm_type = norm_type 69 | self.use_cache = use_cache 70 | if "name" in kwargs: 71 | del kwargs["name"] 72 | if "loss_fn" in kwargs: 73 | del kwargs["loss_fn"] 74 | super().__init__(**kwargs) 75 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 
2 | # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE 3 | 4 | from transformers import PretrainedConfig 5 | 6 | 7 | class QWenConfig(PretrainedConfig): 8 | model_type = "qwen" 9 | keys_to_ignore_at_inference = ["past_key_values"] 10 | attribute_map = { 11 | "hidden_size": "n_embd", 12 | "num_attention_heads": "n_head", 13 | "max_position_embeddings": "n_positions", 14 | "num_hidden_layers": "n_layer", 15 | } 16 | 17 | def __init__( 18 | self, 19 | vocab_size=151851, 20 | n_embd=4096, 21 | n_layer=32, 22 | n_head=32, 23 | n_inner=None, 24 | embd_pdrop=0.0, 25 | attn_pdrop=0.0, 26 | layer_norm_epsilon=1e-5, 27 | initializer_range=0.02, 28 | scale_attn_weights=True, 29 | use_cache=True, 30 | eos_token_id=151643, 31 | apply_residual_connection_post_layernorm=False, 32 | bf16=True, 33 | kv_channels=128, 34 | rotary_pct=1.0, 35 | rotary_emb_base=10000, 36 | use_dynamic_ntk=False, 37 | use_logn_attn=False, 38 | use_flash_attn=True, 39 | ffn_hidden_size=22016, 40 | no_bias=True, 41 | tie_word_embeddings=False, 42 | **kwargs, 43 | ): 44 | self.eos_token_id = eos_token_id 45 | super().__init__(eos_token_id=eos_token_id, 46 | tie_word_embeddings=tie_word_embeddings, 47 | **kwargs) 48 | 49 | self.vocab_size = vocab_size 50 | self.n_embd = n_embd 51 | self.n_layer = n_layer 52 | self.n_head = n_head 53 | self.n_inner = n_inner 54 | self.embd_pdrop = embd_pdrop 55 | self.attn_pdrop = attn_pdrop 56 | self.layer_norm_epsilon = layer_norm_epsilon 57 | self.initializer_range = initializer_range 58 | self.scale_attn_weights = scale_attn_weights 59 | self.use_cache = use_cache 60 | self.apply_residual_connection_post_layernorm = ( 61 | apply_residual_connection_post_layernorm) 62 | self.bf16 = bf16 63 | self.kv_channels = kv_channels 64 | self.rotary_pct = rotary_pct 65 | self.rotary_emb_base = rotary_emb_base 66 | self.use_dynamic_ntk = use_dynamic_ntk 67 | self.use_logn_attn = use_logn_attn 68 | self.use_flash_attn = use_flash_attn 69 | self.ffn_hidden_size = ffn_hidden_size 70 | self.no_bias = no_bias 71 | self.tie_word_embeddings = tie_word_embeddings 72 | -------------------------------------------------------------------------------- /tests/kernels/test_activation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn.functional as F 4 | from transformers.activations import get_activation 5 | 6 | from vllm import activation_ops 7 | 8 | DTYPES = [torch.half, torch.bfloat16, torch.float] 9 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 10 | D = [512, 4096, 5120, 13824] # Arbitrary values for testing 11 | SEEDS = [0] 12 | 13 | 14 | def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor: 15 | x1, x2 = x.chunk(chunks=2, dim=1) 16 | return F.silu(x1) * x2 17 | 18 | 19 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 20 | @pytest.mark.parametrize("d", D) 21 | @pytest.mark.parametrize("dtype", DTYPES) 22 | @pytest.mark.parametrize("seed", SEEDS) 23 | @torch.inference_mode() 24 | def test_silu_and_mul( 25 | num_tokens: int, 26 | d: int, 27 | dtype: torch.dtype, 28 | seed: int, 29 | ) -> None: 30 | torch.random.manual_seed(seed) 31 | torch.cuda.manual_seed(seed) 32 | x = torch.randn(num_tokens, 2 * d, dtype=dtype, device='cuda') 33 | out = torch.empty(num_tokens, d, dtype=dtype, device='cuda') 34 | activation_ops.silu_and_mul(out, x) 35 | ref_out = ref_silu_and_mul(x) 36 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 37 | 38 | 39 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 40 | 
@pytest.mark.parametrize("d", D) 41 | @pytest.mark.parametrize("dtype", DTYPES) 42 | @pytest.mark.parametrize("seed", SEEDS) 43 | @torch.inference_mode() 44 | def test_gelu_new( 45 | num_tokens: int, 46 | d: int, 47 | dtype: torch.dtype, 48 | seed: int, 49 | ) -> None: 50 | torch.random.manual_seed(seed) 51 | torch.cuda.manual_seed(seed) 52 | x = torch.randn(num_tokens, d, dtype=dtype, device='cuda') 53 | out = torch.empty(num_tokens, d, dtype=dtype, device='cuda') 54 | activation_ops.gelu_new(out, x) 55 | ref_out = get_activation("gelu_new")(x) 56 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 57 | 58 | 59 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 60 | @pytest.mark.parametrize("d", D) 61 | @pytest.mark.parametrize("dtype", DTYPES) 62 | @pytest.mark.parametrize("seed", SEEDS) 63 | def test_gelu_fast( 64 | num_tokens: int, 65 | d: int, 66 | dtype: torch.dtype, 67 | seed: int, 68 | ) -> None: 69 | torch.random.manual_seed(seed) 70 | torch.cuda.manual_seed(seed) 71 | x = torch.randn(num_tokens, d, dtype=dtype, device='cuda') 72 | out = torch.empty(num_tokens, d, dtype=dtype, device='cuda') 73 | activation_ops.gelu_fast(out, x) 74 | ref_out = get_activation("gelu_fast")(x) 75 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 76 | -------------------------------------------------------------------------------- /examples/api_client.py: -------------------------------------------------------------------------------- 1 | """Example Python client for vllm.entrypoints.api_server""" 2 | 3 | import argparse 4 | import json 5 | from typing import Iterable, List 6 | 7 | import requests 8 | 9 | 10 | def clear_line(n: int = 1) -> None: 11 | LINE_UP = '\033[1A' 12 | LINE_CLEAR = '\x1b[2K' 13 | for _ in range(n): 14 | print(LINE_UP, end=LINE_CLEAR, flush=True) 15 | 16 | 17 | def post_http_request(prompt: str, 18 | api_url: str, 19 | n: int = 1, 20 | stream: bool = False) -> requests.Response: 21 | headers = {"User-Agent": "Test Client"} 22 | pload = { 23 | "prompt": prompt, 24 | "n": n, 25 | "use_beam_search": True, 26 | "temperature": 0.0, 27 | "max_tokens": 16, 28 | "stream": stream, 29 | } 30 | response = requests.post(api_url, headers=headers, json=pload, stream=True) 31 | return response 32 | 33 | 34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: 35 | for chunk in response.iter_lines(chunk_size=8192, 36 | decode_unicode=False, 37 | delimiter=b"\0"): 38 | if chunk: 39 | data = json.loads(chunk.decode("utf-8")) 40 | output = data["text"] 41 | yield output 42 | 43 | 44 | def get_response(response: requests.Response) -> List[str]: 45 | data = json.loads(response.content) 46 | output = data["text"] 47 | return output 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--host", type=str, default="localhost") 53 | parser.add_argument("--port", type=int, default=8000) 54 | parser.add_argument("--n", type=int, default=4) 55 | parser.add_argument("--prompt", type=str, default="San Francisco is a") 56 | parser.add_argument("--stream", action="store_true") 57 | args = parser.parse_args() 58 | prompt = args.prompt 59 | api_url = f"http://{args.host}:{args.port}/generate" 60 | n = args.n 61 | stream = args.stream 62 | 63 | print(f"Prompt: {prompt!r}\n", flush=True) 64 | response = post_http_request(prompt, api_url, n, stream) 65 | 66 | if stream: 67 | num_printed_lines = 0 68 | for h in get_streaming_response(response): 69 | clear_line(num_printed_lines) 70 | num_printed_lines = 0 71 | for i, 
line in enumerate(h): 72 | num_printed_lines += 1 73 | print(f"Beam candidate {i}: {line!r}", flush=True) 74 | else: 75 | output = get_response(response) 76 | for i, line in enumerate(output): 77 | print(f"Beam candidate {i}: {line!r}", flush=True) 78 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/tensor_parallel/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The vLLM team. 2 | # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py 3 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 4 | 5 | import torch 6 | from typing import List, Sequence 7 | 8 | def ensure_divisibility(numerator, denominator): 9 | """Ensure that numerator is divisible by the denominator.""" 10 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 11 | numerator, denominator 12 | ) 13 | 14 | 15 | def divide(numerator, denominator): 16 | """Ensure that numerator is divisible by the denominator and return 17 | the division value.""" 18 | ensure_divisibility(numerator, denominator) 19 | return numerator // denominator 20 | 21 | 22 | def split_tensor_along_last_dim( 23 | tensor: torch.Tensor, 24 | num_partitions: int, 25 | contiguous_split_chunks: bool = False, 26 | ) -> List[torch.Tensor]: 27 | """ Split a tensor along its last dimension. 28 | 29 | Arguments: 30 | tensor: input tensor. 31 | num_partitions: number of partitions to split the tensor 32 | contiguous_split_chunks: If True, make each chunk contiguous 33 | in memory. 34 | 35 | Returns: 36 | A list of Tensors 37 | """ 38 | # Get the size and dimension. 39 | last_dim = tensor.dim() - 1 40 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 41 | # Split. 42 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 43 | # Note: torch.split does not create contiguous tensors by default. 44 | if contiguous_split_chunks: 45 | return tuple(chunk.contiguous() for chunk in tensor_list) 46 | 47 | return tensor_list 48 | 49 | 50 | class VocabUtility: 51 | """ Split the vocabulary into `world_size` chunks and return the first 52 | and last index of the vocabulary belonging to the `rank` 53 | partition: Note that indices in [fist, last) 54 | 55 | """ 56 | 57 | @staticmethod 58 | def vocab_range_from_per_partition_vocab_size( 59 | per_partition_vocab_size: int, rank, world_size: int 60 | ) -> Sequence[int]: 61 | index_f = rank * per_partition_vocab_size 62 | index_l = index_f + per_partition_vocab_size 63 | return index_f, index_l 64 | 65 | @staticmethod 66 | def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]: 67 | per_partition_vocab_size = divide(global_vocab_size, world_size) 68 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 69 | per_partition_vocab_size, rank, world_size 70 | ) 71 | -------------------------------------------------------------------------------- /vllm/model_executor/input_metadata.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | import torch 4 | from xformers.ops import AttentionBias 5 | 6 | from vllm.sampling_params import SamplingParams 7 | from vllm.sequence import SequenceData 8 | 9 | 10 | class InputMetadata: 11 | """Metadata for input sequences. Used for PagedAttention. 12 | 13 | Args: 14 | seq_groups: List of (seq_ids, sampling_params). 
15 | seq_data: Seq_id -> SequenceData. 16 | prompt_lens: Lengths of prompts. 17 | slot_mapping: The address to write the new KV to of each token. 18 | context_lens: the length of attention context for each generation token. 19 | max_context_len: The maximum context length. 20 | block_tables: The block tables. (Seq id -> list of physical block) 21 | """ 22 | 23 | def __init__( 24 | self, 25 | seq_groups: List[Tuple[List[int], SamplingParams]], 26 | seq_data: Dict[int, SequenceData], 27 | prompt_lens: List[int], 28 | slot_mapping: torch.Tensor, 29 | context_lens: torch.Tensor, 30 | max_context_len: int, 31 | block_tables: torch.Tensor, 32 | ) -> None: 33 | self.seq_groups = seq_groups 34 | self.seq_data = seq_data 35 | self.prompt_lens = prompt_lens 36 | self.slot_mapping = slot_mapping 37 | self.context_lens = context_lens 38 | self.max_context_len = max_context_len 39 | self.block_tables = block_tables 40 | 41 | self.num_prompts = len(prompt_lens) 42 | self.num_prompt_tokens = sum(prompt_lens) 43 | self.num_generation_tokens = context_lens.shape[0] 44 | self.num_valid_tokens = slot_mapping.shape[0] 45 | if block_tables.numel() > 0: 46 | self.max_num_blocks_per_seq = block_tables.shape[1] 47 | else: 48 | self.max_num_blocks_per_seq = 0 49 | assert block_tables.shape[0] == self.num_generation_tokens 50 | assert context_lens.shape[0] == self.num_generation_tokens 51 | 52 | # Set during the execution of the first attention op. 53 | self.attn_bias: List[AttentionBias] = [] 54 | 55 | def __repr__(self) -> str: 56 | # Print only useful metadata. 57 | return (f'InputMetadata(' 58 | f'num_valid_tokens={self.num_valid_tokens}, ' 59 | f'num_prompt_tokens={self.num_prompt_tokens}, ' 60 | f'num_prompts={self.num_prompts}, ' 61 | f'prompt_lens={self.prompt_lens}, ' 62 | f'num_generation_tokens={self.num_generation_tokens}, ' 63 | f'context_lens={self.context_lens}, ' 64 | f'max_context_len={self.max_context_len}), ' 65 | f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, ' 66 | f'block_tables={self.block_tables}), ' 67 | f'slot_mapping={self.slot_mapping}') 68 | -------------------------------------------------------------------------------- /tests/async_engine/test_api_server.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import time 4 | from multiprocessing import Pool 5 | from pathlib import Path 6 | 7 | import pytest 8 | import requests 9 | 10 | 11 | def _query_server(prompt: str) -> dict: 12 | response = requests.post("http://localhost:8000/generate", 13 | json={ 14 | "prompt": prompt, 15 | "max_tokens": 100, 16 | "temperature": 0, 17 | "ignore_eos": True 18 | }) 19 | response.raise_for_status() 20 | return response.json() 21 | 22 | 23 | @pytest.fixture 24 | def api_server(): 25 | script_path = Path(__file__).parent.joinpath( 26 | "api_server_async_engine.py").absolute() 27 | uvicorn_process = subprocess.Popen([ 28 | sys.executable, "-u", 29 | str(script_path), "--model", "facebook/opt-125m" 30 | ]) 31 | yield 32 | uvicorn_process.terminate() 33 | 34 | 35 | def test_api_server(api_server): 36 | """ 37 | Run the API server and test it. 38 | 39 | We run both the server and requests in separate processes. 40 | 41 | We test that the server can handle incoming requests, including 42 | multiple requests at the same time, and that it can handle requests 43 | being cancelled without crashing. 
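    The server is launched as a subprocess (see the api_server fixture) and
    queried over HTTP from a multiprocessing pool.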
44 | """ 45 | with Pool(32) as pool: 46 | # Wait until the server is ready 47 | prompts = ["Hello world"] * 1 48 | result = None 49 | while not result: 50 | try: 51 | for result in pool.map(_query_server, prompts): 52 | break 53 | except: 54 | time.sleep(1) 55 | 56 | # Actual tests start here 57 | # Try with 1 prompt 58 | for result in pool.map(_query_server, prompts): 59 | assert result 60 | 61 | num_aborted_requests = requests.get( 62 | "http://localhost:8000/stats").json()["num_aborted_requests"] 63 | assert num_aborted_requests == 0 64 | 65 | # Try with 100 prompts 66 | prompts = ["Hello world"] * 100 67 | for result in pool.map(_query_server, prompts): 68 | assert result 69 | 70 | # Cancel requests 71 | pool.map_async(_query_server, prompts) 72 | time.sleep(0.01) 73 | pool.terminate() 74 | pool.join() 75 | 76 | # check cancellation stats 77 | num_aborted_requests = requests.get( 78 | "http://localhost:8000/stats").json()["num_aborted_requests"] 79 | assert num_aborted_requests > 0 80 | 81 | # check that server still runs after cancellations 82 | with Pool(32) as pool: 83 | # Try with 100 prompts 84 | prompts = ["Hello world"] * 100 85 | for result in pool.map(_query_server, prompts): 86 | assert result 87 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | import contextlib 3 | from typing import Type 4 | 5 | import torch 6 | import torch.nn as nn 7 | from transformers import PretrainedConfig 8 | 9 | from vllm.config import ModelConfig 10 | from vllm.model_executor.models import * # pylint: disable=wildcard-import 11 | from vllm.model_executor.weight_utils import initialize_dummy_weights 12 | 13 | # TODO(woosuk): Lazy-load the model classes. 14 | _MODEL_REGISTRY = { 15 | "AquilaModel": AquilaForCausalLM, 16 | "BaiChuanForCausalLM": BaiChuanForCausalLM, # baichuan-7b 17 | "BaichuanForCausalLM": BaichuanForCausalLM, # baichuan-13b 18 | "BloomForCausalLM": BloomForCausalLM, 19 | "FalconForCausalLM": FalconForCausalLM, 20 | "GPT2LMHeadModel": GPT2LMHeadModel, 21 | "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM, 22 | "GPTJForCausalLM": GPTJForCausalLM, 23 | "GPTNeoXForCausalLM": GPTNeoXForCausalLM, 24 | "InternLMForCausalLM": InternLMForCausalLM, 25 | "LlamaForCausalLM": LlamaForCausalLM, 26 | "LLaMAForCausalLM": LlamaForCausalLM, # For decapoda-research/llama-* 27 | "MPTForCausalLM": MPTForCausalLM, 28 | "OPTForCausalLM": OPTForCausalLM, 29 | "QWenLMHeadModel": QWenLMHeadModel, 30 | "RWForCausalLM": FalconForCausalLM, 31 | } 32 | 33 | 34 | @contextlib.contextmanager 35 | def _set_default_torch_dtype(dtype: torch.dtype): 36 | """Sets the default torch dtype to the given dtype.""" 37 | old_dtype = torch.get_default_dtype() 38 | torch.set_default_dtype(dtype) 39 | yield 40 | torch.set_default_dtype(old_dtype) 41 | 42 | 43 | def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: 44 | architectures = getattr(config, "architectures", []) 45 | for arch in architectures: 46 | if arch in _MODEL_REGISTRY: 47 | return _MODEL_REGISTRY[arch] 48 | raise ValueError( 49 | f"Model architectures {architectures} are not supported for now. 
" 50 | f"Supported architectures: {list(_MODEL_REGISTRY.keys())}") 51 | 52 | 53 | def get_model(model_config: ModelConfig) -> nn.Module: 54 | model_class = _get_model_architecture(model_config.hf_config) 55 | with _set_default_torch_dtype(model_config.dtype): 56 | # Create a model instance. 57 | # The weights will be initialized as empty tensors. 58 | model = model_class(model_config.hf_config) 59 | if model_config.load_format == "dummy": 60 | model = model.cuda() 61 | # NOTE(woosuk): For accurate performance evaluation, we assign 62 | # random values to the weights. 63 | initialize_dummy_weights(model) 64 | else: 65 | # Load the weights from the cached or downloaded files. 66 | model.load_weights(model_config.model, model_config.download_dir, 67 | model_config.load_format, model_config.revision) 68 | model = model.cuda() 69 | return model.eval() 70 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | Thank you for your interest in contributing to vLLM! 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 5 | There are several ways you can contribute to the project: 6 | 7 | - Identify and report any issues or bugs. 8 | - Request or add a new model. 9 | - Suggest or implement new features. 10 | 11 | However, remember that contributions aren't just about code. 12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. 13 | 14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM. 15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects. 16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. 17 | 18 | 19 | ## Setup for development 20 | 21 | ### Build from source 22 | 23 | ```bash 24 | pip install -r requirements.txt 25 | pip install -e . # This may take several minutes. 26 | ``` 27 | 28 | ### Testing 29 | 30 | ```bash 31 | pip install -r requirements-dev.txt 32 | 33 | # Static type checking 34 | mypy 35 | # Unit tests 36 | pytest tests/ 37 | ``` 38 | **Note:** Currently, the repository does not pass the mypy tests. 39 | 40 | 41 | ## Contributing Guidelines 42 | 43 | ### Issue Reporting 44 | 45 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. 46 | If not, please file a new issue, providing as much relevant information as possible. 47 | 48 | ### Coding Style Guide 49 | 50 | In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). 51 | 52 | We include a formatting script [`format.sh`](./format.sh) to format the code. 53 | 54 | ### Pull Requests 55 | 56 | When submitting a pull request: 57 | 58 | 1. Make sure your code has been rebased on top of the latest commit on the main branch. 59 | 2. Ensure code is properly formatted by running [`format.sh`](./format.sh). 60 | 3. Include a detailed description of the changes in the pull request. 61 | Explain why you made the changes you did. 62 | If your pull request fixes an open issue, please include a reference to it in the description. 
63 | 64 | ### Code Reviews 65 | 66 | All submissions, including submissions by project members, require a code review. 67 | To make the review process as smooth as possible, please: 68 | 69 | 1. Keep your changes as concise as possible. 70 | If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests. 71 | 2. Respond to all comments within a reasonable time frame. 72 | If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. 73 | 74 | ### Thank You 75 | 76 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. 77 | Your contributions make vLLM a great tool for everyone! 78 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/falcon.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py 3 | # Copyright 2023 The vLLM team. 4 | # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. 5 | # All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | """Falcon configuration""" 19 | from transformers.configuration_utils import PretrainedConfig 20 | 21 | 22 | class RWConfig(PretrainedConfig): 23 | model_type = "falcon" 24 | keys_to_ignore_at_inference = ["past_key_values"] 25 | attribute_map = { 26 | "num_hidden_layers": "n_layer", 27 | "num_attention_heads": "n_head", 28 | "num_kv_heads": "n_head_kv", 29 | } 30 | 31 | def __init__( 32 | self, 33 | vocab_size=250880, 34 | hidden_size=64, 35 | n_layer=2, 36 | n_head=8, 37 | layer_norm_epsilon=1e-5, 38 | initializer_range=0.02, 39 | use_cache=True, 40 | bos_token_id=1, 41 | eos_token_id=2, 42 | hidden_dropout=0.0, 43 | attention_dropout=0.0, 44 | multi_query=True, 45 | n_head_kv=None, 46 | alibi=False, 47 | bias=False, 48 | parallel_attn=False, 49 | new_decoder_architecture=False, 50 | **kwargs, 51 | ) -> None: 52 | self.vocab_size = vocab_size 53 | # Backward compatibility with n_embed kwarg 54 | n_embed = kwargs.pop("n_embed", None) 55 | self.hidden_size = hidden_size if n_embed is None else n_embed 56 | self.n_layer = n_layer 57 | self.n_head = n_head 58 | self.layer_norm_epsilon = layer_norm_epsilon 59 | self.initializer_range = initializer_range 60 | self.use_cache = use_cache 61 | self.hidden_dropout = hidden_dropout 62 | self.attention_dropout = attention_dropout 63 | 64 | self.bos_token_id = bos_token_id 65 | self.eos_token_id = eos_token_id 66 | self.multi_query = multi_query 67 | self.n_head_kv = 1 if n_head_kv is None else n_head_kv 68 | self.alibi = alibi 69 | self.bias = bias 70 | self.parallel_attn = parallel_attn 71 | self.new_decoder_architecture = new_decoder_architecture 72 | 73 | if self.hidden_size == 8192: 74 | # Hack for falcon-40b 75 | self.new_decoder_architecture = True 76 | 77 | super().__init__(bos_token_id=bos_token_id, 78 | eos_token_id=eos_token_id, 79 | **kwargs) 80 | 81 | @property 82 | def head_dim(self): 83 | return self.hidden_size // self.n_head 84 | 85 | @property 86 | def rotary(self): 87 | return not self.alibi 88 | -------------------------------------------------------------------------------- /benchmarks/benchmark_latency.py: -------------------------------------------------------------------------------- 1 | """Benchmark the latency of processing a single batch of requests.""" 2 | import argparse 3 | import time 4 | 5 | import numpy as np 6 | import torch 7 | from tqdm import tqdm 8 | 9 | from vllm import LLM, SamplingParams 10 | 11 | 12 | def main(args: argparse.Namespace): 13 | print(args) 14 | 15 | # Process all the requests in a single batch if possible. 16 | # NOTE(woosuk): If the request cannot be processed in a single batch, 17 | # the engine will automatically process the request in multiple batches. 
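    # (Here max_num_batched_tokens is set to batch_size * input_len so that all
    #  of the dummy prompts can be prefilled together when GPU memory allows.)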
18 | llm = LLM( 19 | model=args.model, 20 | tokenizer=args.tokenizer, 21 | tensor_parallel_size=args.tensor_parallel_size, 22 | max_num_seqs=args.batch_size, 23 | max_num_batched_tokens=args.batch_size * args.input_len, 24 | trust_remote_code=args.trust_remote_code, 25 | ) 26 | 27 | sampling_params = SamplingParams( 28 | n=args.n, 29 | temperature=0.0 if args.use_beam_search else 1.0, 30 | top_p=1.0, 31 | use_beam_search=args.use_beam_search, 32 | ignore_eos=True, 33 | max_tokens=args.output_len, 34 | ) 35 | print(sampling_params) 36 | dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size 37 | 38 | def run_to_completion(profile: bool = False): 39 | if profile: 40 | torch.cuda.cudart().cudaProfilerStart() 41 | start_time = time.time() 42 | 43 | llm.generate(prompt_token_ids=dummy_prompt_token_ids, 44 | sampling_params=sampling_params, 45 | use_tqdm=False) 46 | 47 | end_time = time.time() 48 | latency = end_time - start_time 49 | if profile: 50 | torch.cuda.cudart().cudaProfilerStop() 51 | return latency 52 | 53 | print("Warming up...") 54 | run_to_completion(profile=False) 55 | 56 | # Benchmark. 57 | latencies = [] 58 | for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): 59 | latencies.append(run_to_completion(profile=False)) 60 | print(f'Avg latency: {np.mean(latencies)} seconds') 61 | 62 | 63 | if __name__ == '__main__': 64 | parser = argparse.ArgumentParser( 65 | description='Benchmark the latency of processing a single batch of ' 66 | 'requests till completion.') 67 | parser.add_argument('--model', type=str, default='facebook/opt-125m') 68 | parser.add_argument('--tokenizer', type=str, default=None) 69 | parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) 70 | parser.add_argument('--input-len', type=int, default=32) 71 | parser.add_argument('--output-len', type=int, default=128) 72 | parser.add_argument('--batch-size', type=int, default=8) 73 | parser.add_argument('--n', type=int, default=1, 74 | help='Number of generated sequences per prompt.') 75 | parser.add_argument('--use-beam-search', action='store_true') 76 | parser.add_argument('--num-iters', type=int, default=3, 77 | help='Number of iterations to run.') 78 | parser.add_argument('--trust-remote-code', action='store_true', 79 | help='trust remote code from huggingface') 80 | args = parser.parse_args() 81 | main(args) 82 | -------------------------------------------------------------------------------- /vllm/entrypoints/api_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from typing import AsyncGenerator 4 | 5 | from fastapi import BackgroundTasks, FastAPI, Request 6 | from fastapi.responses import JSONResponse, Response, StreamingResponse 7 | import uvicorn 8 | 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | from vllm.sampling_params import SamplingParams 12 | from vllm.utils import random_uuid 13 | 14 | TIMEOUT_KEEP_ALIVE = 5 # seconds. 15 | TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. 16 | app = FastAPI() 17 | engine = None 18 | 19 | 20 | @app.post("/generate") 21 | async def generate(request: Request) -> Response: 22 | """Generate completion for the request. 23 | 24 | The request should be a JSON object with the following fields: 25 | - prompt: the prompt to use for the generation. 26 | - stream: whether to stream the results or not. 27 | - other fields: the sampling parameters (See `SamplingParams` for details). 
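    For example, a minimal non-streaming request body might look like:
    {"prompt": "San Francisco is a", "max_tokens": 16, "temperature": 0.0}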
28 | """ 29 | request_dict = await request.json() 30 | prompt = request_dict.pop("prompt") 31 | stream = request_dict.pop("stream", False) 32 | sampling_params = SamplingParams(**request_dict) 33 | request_id = random_uuid() 34 | 35 | results_generator = engine.generate(prompt, sampling_params, request_id) 36 | 37 | # Streaming case 38 | async def stream_results() -> AsyncGenerator[bytes, None]: 39 | async for request_output in results_generator: 40 | prompt = request_output.prompt 41 | text_outputs = [ 42 | prompt + output.text for output in request_output.outputs 43 | ] 44 | ret = {"text": text_outputs} 45 | yield (json.dumps(ret) + "\0").encode("utf-8") 46 | 47 | async def abort_request() -> None: 48 | await engine.abort(request_id) 49 | 50 | if stream: 51 | background_tasks = BackgroundTasks() 52 | # Abort the request if the client disconnects. 53 | background_tasks.add_task(abort_request) 54 | return StreamingResponse(stream_results(), background=background_tasks) 55 | 56 | # Non-streaming case 57 | final_output = None 58 | async for request_output in results_generator: 59 | if await request.is_disconnected(): 60 | # Abort the request if the client disconnects. 61 | await engine.abort(request_id) 62 | return Response(status_code=499) 63 | final_output = request_output 64 | 65 | assert final_output is not None 66 | prompt = final_output.prompt 67 | text_outputs = [prompt + output.text for output in final_output.outputs] 68 | ret = {"text": text_outputs} 69 | return JSONResponse(ret) 70 | 71 | 72 | if __name__ == "__main__": 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument("--host", type=str, default="localhost") 75 | parser.add_argument("--port", type=int, default=8000) 76 | parser = AsyncEngineArgs.add_cli_args(parser) 77 | args = parser.parse_args() 78 | 79 | engine_args = AsyncEngineArgs.from_cli_args(args) 80 | engine = AsyncLLMEngine.from_engine_args(engine_args) 81 | 82 | uvicorn.run(app, 83 | host=args.host, 84 | port=args.port, 85 | log_level="debug", 86 | timeout_keep_alive=TIMEOUT_KEEP_ALIVE) 87 | -------------------------------------------------------------------------------- /docs/source/models/supported_models.rst: -------------------------------------------------------------------------------- 1 | .. _supported_models: 2 | 3 | Supported Models 4 | ================ 5 | 6 | vLLM supports a variety of generative Transformer models in `HuggingFace Transformers `_. 7 | The following is the list of model architectures that are currently supported by vLLM. 8 | Alongside each architecture, we include some popular models that use it. 9 | 10 | .. list-table:: 11 | :widths: 25 25 50 12 | :header-rows: 1 13 | 14 | * - Architecture 15 | - Models 16 | - Example HuggingFace Models 17 | * - :code:`AquilaForCausalLM` 18 | - Aquila 19 | - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. 20 | * - :code:`BaiChuanForCausalLM` 21 | - Baichuan 22 | - :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc. 23 | * - :code:`BloomForCausalLM` 24 | - BLOOM, BLOOMZ, BLOOMChat 25 | - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. 26 | * - :code:`FalconForCausalLM` 27 | - Falcon 28 | - :code:`tiiuae/falcon-7b``, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. 29 | * - :code:`GPT2LMHeadModel` 30 | - GPT-2 31 | - :code:`gpt2`, :code:`gpt2-xl`, etc. 
32 | * - :code:`GPTBigCodeForCausalLM` 33 | - StarCoder, SantaCoder, WizardCoder 34 | - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. 35 | * - :code:`GPTJForCausalLM` 36 | - GPT-J 37 | - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. 38 | * - :code:`GPTNeoXForCausalLM` 39 | - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM 40 | - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. 41 | * - :code:`InternLMForCausalLM` 42 | - InternLM 43 | - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. 44 | * - :code:`LlamaForCausalLM` 45 | - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco 46 | - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, etc. 47 | * - :code:`MPTForCausalLM` 48 | - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter 49 | - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. 50 | * - :code:`OPTForCausalLM` 51 | - OPT, OPT-IML 52 | - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. 53 | * - :code:`QWenLMHeadModel` 54 | - Qwen 55 | - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. 56 | 57 | If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. 58 | Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. 59 | Alternatively, you can raise an issue on our `GitHub `_ project. 60 | 61 | .. tip:: 62 | The easiest way to check if your model is supported is to run the program below: 63 | 64 | .. code-block:: python 65 | 66 | from vllm import LLM 67 | 68 | llm = LLM(model=...) # Name or path of your model 69 | output = llm.generate("Hello, my name is") 70 | print(output) 71 | 72 | If vLLM successfully generates text, it indicates that your model is supported. 73 | -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # YAPF formatter, adapted from ray and skypilot. 3 | # 4 | # Usage: 5 | # # Do work and commit your work. 6 | 7 | # # Format files that differ from origin/main. 8 | # bash format.sh 9 | 10 | # # Commit changed files with message 'Run yapf and pylint' 11 | # 12 | # 13 | # YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. 14 | # You are encouraged to run this locally before pushing changes for review. 15 | 16 | # Cause the script to exit if a single command fails 17 | set -eo pipefail 18 | 19 | # this stops git rev-parse from failing if we run this from the .git directory 20 | builtin cd "$(dirname "${BASH_SOURCE:-$0}")" 21 | ROOT="$(git rev-parse --show-toplevel)" 22 | builtin cd "$ROOT" || exit 1 23 | 24 | YAPF_VERSION=$(yapf --version | awk '{print $2}') 25 | PYLINT_VERSION=$(pylint --version | head -n 1 | awk '{print $2}') 26 | MYPY_VERSION=$(mypy --version | awk '{print $2}') 27 | 28 | # # params: tool name, tool version, required version 29 | tool_version_check() { 30 | if [[ $2 != $3 ]]; then 31 | echo "Wrong $1 version installed: $3 is required, not $2." 
32 | exit 1 33 | fi 34 | } 35 | 36 | tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" 37 | tool_version_check "pylint" $PYLINT_VERSION "$(grep "pylint==" requirements-dev.txt | cut -d'=' -f3)" 38 | tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" 39 | 40 | YAPF_FLAGS=( 41 | '--recursive' 42 | '--parallel' 43 | ) 44 | 45 | YAPF_EXCLUDES=( 46 | '--exclude' 'build/**' 47 | '--exclude' 'vllm/model_executor/parallel_utils/**' 48 | ) 49 | 50 | # Format specified files 51 | format() { 52 | yapf --in-place "${YAPF_FLAGS[@]}" "$@" 53 | } 54 | 55 | # Format files that differ from main branch. Ignores dirs that are not slated 56 | # for autoformat yet. 57 | format_changed() { 58 | # The `if` guard ensures that the list of filenames is not empty, which 59 | # could cause yapf to receive 0 positional arguments, making it hang 60 | # waiting for STDIN. 61 | # 62 | # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that 63 | # exist on both branches. 64 | MERGEBASE="$(git merge-base origin/main HEAD)" 65 | 66 | if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then 67 | git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ 68 | yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" 69 | fi 70 | 71 | } 72 | 73 | # Format all files 74 | format_all() { 75 | yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm 76 | } 77 | 78 | ## This flag formats individual files. --files *must* be the first command line 79 | ## arg to use this option. 80 | if [[ "$1" == '--files' ]]; then 81 | format "${@:2}" 82 | # If `--all` is passed, then any further arguments are ignored and the 83 | # entire python directory is formatted. 84 | elif [[ "$1" == '--all' ]]; then 85 | format_all 86 | else 87 | # Format only the files that changed in last commit. 88 | format_changed 89 | fi 90 | echo 'vLLM yapf: Done' 91 | 92 | # Run mypy 93 | # TODO(zhuohan): Enable mypy 94 | # echo 'vLLM mypy:' 95 | # mypy 96 | 97 | # Run Pylint 98 | echo 'vLLM Pylint:' 99 | pylint vllm 100 | 101 | if ! git diff --quiet &>/dev/null; then 102 | echo 'Reformatted files. Please review and stage the changes.' 
103 | echo 'Changes not staged for commit:' 104 | echo 105 | git --no-pager diff --name-only 106 | 107 | exit 1 108 | fi 109 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package to Release asset 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Create Release 5 | 6 | on: 7 | push: 8 | tags: 9 | - v* 10 | 11 | # Needed to create release and upload assets 12 | permissions: 13 | contents: write 14 | 15 | jobs: 16 | release: 17 | # Retrieve tag and create release 18 | name: Create Release 19 | runs-on: ubuntu-latest 20 | outputs: 21 | upload_url: ${{ steps.create_release.outputs.upload_url }} 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | 26 | - name: Extract branch info 27 | shell: bash 28 | run: | 29 | echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV 30 | 31 | - name: Create Release 32 | id: create_release 33 | uses: "actions/github-script@v6" 34 | env: 35 | RELEASE_TAG: ${{ env.release_tag }} 36 | with: 37 | github-token: "${{ secrets.GITHUB_TOKEN }}" 38 | script: | 39 | const script = require('.github/workflows/scripts/create_release.js') 40 | await script(github, context, core) 41 | 42 | wheel: 43 | name: Build Wheel 44 | runs-on: ${{ matrix.os }} 45 | needs: release 46 | 47 | strategy: 48 | fail-fast: false 49 | matrix: 50 | os: ['ubuntu-20.04'] 51 | python-version: ['3.8', '3.9', '3.10', '3.11'] 52 | cuda-version: ['11.8'] # Github runner can't build anything older than 11.8 53 | 54 | steps: 55 | - name: Checkout 56 | uses: actions/checkout@v3 57 | 58 | - name: Set up Linux Env 59 | if: ${{ runner.os == 'Linux' }} 60 | run: | 61 | bash -x .github/workflows/scripts/env.sh 62 | 63 | - name: Set up Python 64 | uses: actions/setup-python@v4 65 | with: 66 | python-version: ${{ matrix.python-version }} 67 | 68 | - name: Install CUDA ${{ matrix.cuda-version }} 69 | run: | 70 | bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }} 71 | 72 | - name: Install PyTorch-cu${{ matrix.cuda-version }} 73 | run: | 74 | bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} 75 | 76 | - name: Build wheel 77 | shell: bash 78 | run: | 79 | bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} 80 | wheel_name=$(ls dist/*whl | xargs -n 1 basename) 81 | asset_name=${wheel_name//"linux"/"manylinux1"} 82 | echo "wheel_name=${wheel_name}" >> $GITHUB_ENV 83 | echo "asset_name=${asset_name}" >> $GITHUB_ENV 84 | 85 | - name: Upload Release Asset 86 | uses: actions/upload-release-asset@v1 87 | env: 88 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 89 | with: 90 | upload_url: ${{ needs.release.outputs.upload_url }} 91 | asset_path: ./dist/${{ env.wheel_name }} 92 | asset_name: ${{ env.asset_name }} 93 | asset_content_type: application/* 94 | 95 | # (Danielkinz): This last step will publish the .whl to pypi. 
Warning: untested 96 | # - name: Publish package 97 | # uses: pypa/gh-action-pypi-publish@release/v1.8 98 | # with: 99 | # repository-url: https://test.pypi.org/legacy/ 100 | # password: ${{ secrets.PYPI_API_TOKEN }} 101 | # skip-existing: true 102 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | .idea/ 161 | 162 | # VSCode 163 | .vscode/ 164 | 165 | # DS Store 166 | .DS_Store 167 | 168 | # Results 169 | *.csv 170 | 171 | # Python pickle files 172 | *.pkl 173 | 174 | # Sphinx documentation 175 | _build/ 176 | -------------------------------------------------------------------------------- /csrc/activation_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template 9 | __device__ __forceinline__ T silu(const T& x) { 10 | // x * sigmoid(x) 11 | return (T) (((float) x) / (1.0f + expf((float) -x))); 12 | } 13 | 14 | template 15 | __global__ void silu_and_mul_kernel( 16 | scalar_t* __restrict__ out, // [num_tokens, d] 17 | const scalar_t* __restrict__ input, // [num_tokens, 2, d] 18 | const int d) { 19 | const int token_idx = blockIdx.x; 20 | for (int idx = threadIdx.x; idx < d; idx += blockDim.x) { 21 | const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]); 22 | const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]); 23 | out[token_idx * d + idx] = silu(x) * y; 24 | } 25 | } 26 | 27 | } // namespace vllm 28 | 29 | void silu_and_mul( 30 | torch::Tensor& out, // [num_tokens, d] 31 | torch::Tensor& input) // [num_tokens, 2 * d] 32 | { 33 | int num_tokens = input.size(0); 34 | int d = input.size(1) / 2; 35 | 36 | dim3 grid(num_tokens); 37 | dim3 block(std::min(d, 1024)); 38 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 39 | VLLM_DISPATCH_FLOATING_TYPES( 40 | input.scalar_type(), 41 | "silu_and_mul_kernel", 42 | [&] { 43 | vllm::silu_and_mul_kernel<<>>( 44 | out.data_ptr(), 45 | input.data_ptr(), 46 | d); 47 | }); 48 | } 49 | 50 | namespace vllm { 51 | 52 | // Element-wise activation kernel template. 53 | template 54 | __global__ void activation_kernel( 55 | scalar_t* __restrict__ out, // [num_tokens, d] 56 | const scalar_t* __restrict__ input, // [num_tokens, d] 57 | const int d) { 58 | const int token_idx = blockIdx.x; 59 | for (int idx = threadIdx.x; idx < d; idx += blockDim.x) { 60 | const scalar_t x = __ldg(&input[token_idx * d + idx]); 61 | out[token_idx * d + idx] = ACT_FN(x); 62 | } 63 | } 64 | 65 | } // namespace vllm 66 | 67 | // Launch element-wise activation kernel. 
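// A note on the macro below: it assumes `out` and `input` are pre-allocated
// [num_tokens, d] tensors, launches one thread block per token (capped at
// 1024 threads per block), and dispatches over the floating-point dtypes,
// instantiating activation_kernel with the device function passed as KERNEL.
// The gelu_new/gelu_fast wrappers further down show how it is used.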
68 | #define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ 69 | int num_tokens = input.size(0); \ 70 | int d = input.size(1); \ 71 | dim3 grid(num_tokens); \ 72 | dim3 block(std::min(d, 1024)); \ 73 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ 74 | VLLM_DISPATCH_FLOATING_TYPES( \ 75 | input.scalar_type(), \ 76 | "activation_kernel", \ 77 | [&] { \ 78 | vllm::activation_kernel><<>>( \ 79 | out.data_ptr(), \ 80 | input.data_ptr(), \ 81 | d); \ 82 | }); 83 | 84 | namespace vllm { 85 | 86 | template 87 | __device__ __forceinline__ T gelu_new_kernel(const T& x) { 88 | const float x3 = (float) (x * x * x); 89 | const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3)))); 90 | return ((T) 0.5) * x * (((T) 1.0) + t); 91 | } 92 | 93 | template 94 | __device__ __forceinline__ T gelu_fast_kernel(const T& x) { 95 | const float f = (float) x; 96 | const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x)); 97 | return ((T) 0.5) * x * (((T) 1.0) + t); 98 | } 99 | 100 | } // namespace vllm 101 | 102 | void gelu_new( 103 | torch::Tensor& out, // [num_tokens, d] 104 | torch::Tensor& input) // [num_tokens, d] 105 | { 106 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); 107 | } 108 | 109 | void gelu_fast( 110 | torch::Tensor& out, // [num_tokens, d] 111 | torch::Tensor& input) // [num_tokens, d] 112 | { 113 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); 114 | } 115 | -------------------------------------------------------------------------------- /vllm/outputs.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional 2 | 3 | from vllm.sequence import SequenceGroup, SequenceStatus 4 | 5 | 6 | class CompletionOutput: 7 | """The output data of one completion output of a request. 8 | 9 | Args: 10 | index: The index of the output in the request. 11 | text: The generated output text. 12 | token_ids: The token IDs of the generated output text. 13 | cumulative_logprob: The cumulative log probability of the generated 14 | output text. 15 | logprobs: The log probabilities of the top probability words at each 16 | position if the logprobs are requested. 17 | finish_reason: The reason why the sequence is finished. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | index: int, 23 | text: str, 24 | token_ids: List[int], 25 | cumulative_logprob: float, 26 | logprobs: Optional[List[Dict[int, float]]], 27 | finish_reason: Optional[str] = None, 28 | ) -> None: 29 | self.index = index 30 | self.text = text 31 | self.token_ids = token_ids 32 | self.cumulative_logprob = cumulative_logprob 33 | self.logprobs = logprobs 34 | self.finish_reason = finish_reason 35 | 36 | def finished(self) -> bool: 37 | return self.finish_reason is not None 38 | 39 | def __repr__(self) -> str: 40 | return (f"CompletionOutput(index={self.index}, " 41 | f"text={self.text!r}, " 42 | f"token_ids={self.token_ids}, " 43 | f"cumulative_logprob={self.cumulative_logprob}, " 44 | f"logprobs={self.logprobs}, " 45 | f"finish_reason={self.finish_reason})") 46 | 47 | 48 | class RequestOutput: 49 | """The output data of a request to the LLM. 50 | 51 | Args: 52 | request_id: The unique ID of the request. 53 | prompt: The prompt string of the request. 54 | prompt_token_ids: The token IDs of the prompt. 55 | outputs: The output sequences of the request. 56 | finished: Whether the whole request is finished. 
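    Example (illustrative, mirroring the offline-inference quickstart):
        for out in llm.generate(prompts, sampling_params):
            print(out.request_id, out.outputs[0].text)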
57 | """ 58 | 59 | def __init__( 60 | self, 61 | request_id: str, 62 | prompt: str, 63 | prompt_token_ids: List[int], 64 | outputs: List[CompletionOutput], 65 | finished: bool, 66 | ) -> None: 67 | self.request_id = request_id 68 | self.prompt = prompt 69 | self.prompt_token_ids = prompt_token_ids 70 | self.outputs = outputs 71 | self.finished = finished 72 | 73 | @classmethod 74 | def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": 75 | # Get the top-n sequences. 76 | n = seq_group.sampling_params.n 77 | seqs = seq_group.get_seqs() 78 | if seq_group.sampling_params.use_beam_search: 79 | sorting_key = lambda seq: seq.get_beam_search_score( 80 | seq_group.sampling_params.length_penalty) 81 | else: 82 | sorting_key = lambda seq: seq.get_cumulative_logprob() 83 | sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) 84 | top_n_seqs = sorted_seqs[:n] 85 | 86 | # Create the outputs. 87 | outputs: List[CompletionOutput] = [] 88 | for seq in top_n_seqs: 89 | logprobs = seq.output_logprobs 90 | if seq_group.sampling_params.logprobs is None: 91 | # NOTE: We need to take care of this case because the sequence 92 | # always has the logprobs of the sampled tokens even if the 93 | # logprobs are not requested. 94 | logprobs = {} 95 | finshed_reason = SequenceStatus.get_finished_reason(seq.status) 96 | output = CompletionOutput(seqs.index(seq), seq.output_text, 97 | seq.get_output_token_ids(), 98 | seq.get_cumulative_logprob(), logprobs, 99 | finshed_reason) 100 | outputs.append(output) 101 | 102 | # Every sequence in the sequence group should have the same prompt. 103 | prompt = top_n_seqs[0].prompt 104 | prompt_token_ids = top_n_seqs[0].data.prompt_token_ids 105 | finished = seq_group.is_finished() 106 | return cls(seq_group.request_id, prompt, prompt_token_ids, outputs, 107 | finished) 108 | 109 | def __repr__(self) -> str: 110 | return (f"RequestOutput(request_id={self.request_id}, " 111 | f"prompt={self.prompt!r}, " 112 | f"prompt_token_ids={self.prompt_token_ids}, " 113 | f"outputs={self.outputs}, " 114 | f"finished={self.finished})") 115 | -------------------------------------------------------------------------------- /csrc/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template 9 | inline __device__ void apply_rotary_embedding( 10 | scalar_t* __restrict__ arr, 11 | const scalar_t* __restrict__ cos_ptr, 12 | const scalar_t* __restrict__ sin_ptr, 13 | int rot_offset, 14 | int embed_dim) 15 | { 16 | int x_index, y_index; 17 | scalar_t cos, sin; 18 | if (IS_NEOX) { 19 | // GPT-NeoX style rotary embedding. 20 | x_index = rot_offset; 21 | y_index = embed_dim + rot_offset; 22 | cos = __ldg(cos_ptr + x_index); 23 | sin = __ldg(sin_ptr + x_index); 24 | } else { 25 | // GPT-J style rotary embedding. 
26 | x_index = 2 * rot_offset; 27 | y_index = 2 * rot_offset + 1; 28 | cos = __ldg(cos_ptr + x_index / 2); 29 | sin = __ldg(sin_ptr + x_index / 2); 30 | } 31 | 32 | const scalar_t x = arr[x_index]; 33 | const scalar_t y = arr[y_index]; 34 | arr[x_index] = x * cos - y * sin; 35 | arr[y_index] = y * cos + x * sin; 36 | } 37 | 38 | template 39 | __global__ void rotary_embedding_kernel( 40 | const int64_t* __restrict__ positions, // [num_tokens] 41 | scalar_t* __restrict__ query, // [num_tokens, num_heads, head_size] 42 | scalar_t* __restrict__ key, // [num_tokens, num_kv_heads, head_size] 43 | const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] 44 | const int rot_dim, 45 | const int query_stride, 46 | const int key_stride, 47 | const int num_heads, 48 | const int num_kv_heads, 49 | const int head_size) { 50 | // Each thread block is responsible for one token. 51 | const int token_idx = blockIdx.x; 52 | int64_t pos = positions[token_idx]; 53 | const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; 54 | 55 | const int embed_dim = rot_dim / 2; 56 | const scalar_t* cos_ptr = cache_ptr; 57 | const scalar_t* sin_ptr = cache_ptr + embed_dim; 58 | 59 | const int nq = num_heads * embed_dim; 60 | for (int i = threadIdx.x; i < nq; i += blockDim.x) { 61 | const int head_idx = i / embed_dim; 62 | const int token_head = token_idx * query_stride + head_idx * head_size; 63 | const int rot_offset = i % embed_dim; 64 | apply_rotary_embedding(query + token_head, cos_ptr, 65 | sin_ptr, rot_offset, embed_dim); 66 | } 67 | 68 | const int nk = num_kv_heads * embed_dim; 69 | for (int i = threadIdx.x; i < nk; i += blockDim.x) { 70 | const int head_idx = i / embed_dim; 71 | const int token_head = token_idx * key_stride + head_idx * head_size; 72 | const int rot_offset = i % embed_dim; 73 | apply_rotary_embedding(key + token_head, cos_ptr, 74 | sin_ptr, rot_offset, embed_dim); 75 | } 76 | } 77 | 78 | } // namespace vllm 79 | 80 | void rotary_embedding( 81 | torch::Tensor& positions, // [num_tokens] 82 | torch::Tensor& query, // [num_tokens, num_heads * head_size] 83 | torch::Tensor& key, // [num_tokens, num_kv_heads * head_size] 84 | int head_size, 85 | torch::Tensor& cos_sin_cache, // [max_position, rot_dim] 86 | bool is_neox) { 87 | int num_tokens = query.size(0); 88 | int rot_dim = cos_sin_cache.size(1); 89 | int num_heads = query.size(1) / head_size; 90 | int num_kv_heads = key.size(1) / head_size; 91 | int query_stride = query.stride(0); 92 | int key_stride = key.stride(0); 93 | 94 | dim3 grid(num_tokens); 95 | dim3 block(std::min(num_heads * rot_dim / 2, 512)); 96 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 97 | VLLM_DISPATCH_FLOATING_TYPES( 98 | query.scalar_type(), 99 | "rotary_embedding", 100 | [&] { 101 | if (is_neox) { 102 | vllm::rotary_embedding_kernel<<>>( 103 | positions.data_ptr(), 104 | query.data_ptr(), 105 | key.data_ptr(), 106 | cos_sin_cache.data_ptr(), 107 | rot_dim, 108 | query_stride, 109 | key_stride, 110 | num_heads, 111 | num_kv_heads, 112 | head_size); 113 | } else { 114 | vllm::rotary_embedding_kernel<<>>( 115 | positions.data_ptr(), 116 | query.data_ptr(), 117 | key.data_ptr(), 118 | cos_sin_cache.data_ptr(), 119 | rot_dim, 120 | query_stride, 121 | key_stride, 122 | num_heads, 123 | num_kv_heads, 124 | head_size); 125 | } 126 | }); 127 | } 128 | -------------------------------------------------------------------------------- /vllm/engine/ray_utils.py: -------------------------------------------------------------------------------- 1 | 
import socket 2 | from typing import Optional, Tuple, TYPE_CHECKING 3 | 4 | from vllm.config import ParallelConfig 5 | 6 | try: 7 | import ray 8 | from ray.air.util.torch_dist import TorchDistributedWorker 9 | 10 | class RayWorker(TorchDistributedWorker): 11 | """Ray wrapper for vllm.worker.Worker, allowing Worker to be 12 | lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" 13 | 14 | def __init__(self, init_cached_hf_modules=False) -> None: 15 | if init_cached_hf_modules: 16 | # pylint: disable=import-outside-toplevel 17 | from transformers.dynamic_module_utils import init_hf_modules 18 | init_hf_modules() 19 | self.worker = None 20 | 21 | def init_worker(self, worker_init_fn): 22 | self.worker = worker_init_fn() 23 | 24 | def __getattr__(self, name): 25 | return getattr(self.worker, name) 26 | 27 | def execute_method(self, method, *args, **kwargs): 28 | executor = getattr(self, method) 29 | return executor(*args, **kwargs) 30 | 31 | except ImportError: 32 | ray = None 33 | TorchDistributedWorker = None 34 | RayWorker = None # pylint: disable=invalid-name 35 | 36 | if TYPE_CHECKING: 37 | from ray.util.placement_group import PlacementGroup 38 | 39 | 40 | def get_open_port(): 41 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 42 | s.bind(("", 0)) 43 | return s.getsockname()[1] 44 | 45 | 46 | def initialize_cluster( 47 | parallel_config: ParallelConfig, 48 | engine_use_ray: bool = False, 49 | ray_address: Optional[str] = None, 50 | ) -> Tuple[str, Optional["PlacementGroup"]]: 51 | """Initialize the distributed cluster probably with Ray. 52 | 53 | Args: 54 | parallel_config: The configurations for parallel execution. 55 | engine_use_ray: Whether to use Ray for async engine. 56 | ray_address: The address of the Ray cluster. If None, uses 57 | the default Ray cluster address. 58 | 59 | Returns: 60 | A tuple of (`distributed_init_method`, `all_stage_devices`). The 61 | `distributed_init_method` is the address for initializing the 62 | distributed backend. `all_stage_devices` includes device IDs for 63 | each worker in each pipeline stage. Each device ID is a tuple of 64 | (rank, node resource, device id). 65 | """ 66 | if parallel_config.worker_use_ray or engine_use_ray: 67 | if ray is None: 68 | raise ImportError( 69 | "Ray is not installed. Please install Ray to use distributed " 70 | "serving.") 71 | # Connect to a ray cluster. 72 | ray.init(address=ray_address, ignore_reinit_error=True) 73 | 74 | if not parallel_config.worker_use_ray: 75 | # Initialize cluster locally. 76 | port = get_open_port() 77 | # We need to setup the distributed init method to make sure 78 | # the distributed megatron code (e.g., get world size) works correctly. 79 | distributed_init_method = f"tcp://localhost:{port}" 80 | return distributed_init_method, None 81 | 82 | current_placement_group = ray.util.get_current_placement_group() 83 | if current_placement_group: 84 | # We are in a placement group 85 | bundles = current_placement_group.bundle_specs 86 | # Verify that we can use the placement group. 
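        # Every bundle that provides a GPU must provide exactly one, and the
        # number of such single-GPU bundles must cover the configured world
        # size; otherwise the workers cannot be placed into this group.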
87 | gpu_bundles = 0 88 | for bundle in bundles: 89 | bundle_gpus = bundle.get("GPU", 0) 90 | if bundle_gpus > 1: 91 | raise ValueError( 92 | "Placement group bundle cannot have more than 1 GPU.") 93 | if bundle_gpus: 94 | gpu_bundles += 1 95 | if parallel_config.world_size > gpu_bundles: 96 | raise ValueError( 97 | "The number of required GPUs exceeds the total number of " 98 | "available GPUs in the placement group.") 99 | else: 100 | num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) 101 | if parallel_config.world_size > num_gpus_in_cluster: 102 | raise ValueError( 103 | "The number of required GPUs exceeds the total number of " 104 | "available GPUs in the cluster.") 105 | # Create a new placement group 106 | current_placement_group = ray.util.placement_group([{ 107 | "GPU": 1 108 | }] * parallel_config.world_size) 109 | # Wait until PG is ready - this will block until all 110 | # requested resources are available, and will timeout 111 | # if they cannot be provisioned. 112 | ray.get(current_placement_group.ready(), timeout=1800) 113 | 114 | return None, current_placement_group 115 | -------------------------------------------------------------------------------- /docs/source/models/adding_model.rst: -------------------------------------------------------------------------------- 1 | .. _adding_a_new_model: 2 | 3 | Adding a New Model 4 | ================== 5 | 6 | This document provides a high-level guide on integrating a `HuggingFace Transformers `_ model into vLLM. 7 | 8 | .. note:: 9 | The complexity of adding a new model depends heavily on the model's architecture. 10 | The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. 11 | However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. 12 | 13 | .. tip:: 14 | If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub `_ repository. 15 | We will be happy to help you out! 16 | 17 | 18 | 0. Fork the vLLM repository 19 | -------------------------------- 20 | 21 | Start by forking our `GitHub `_ repository and then :ref:`build it from source `. 22 | This gives you the ability to modify the codebase and test your model. 23 | 24 | 25 | 1. Bring your model code 26 | ------------------------ 27 | 28 | Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models `_ directory. 29 | For instance, vLLM's `OPT model `_ was adpated from the HuggingFace's `modeling_opt.py `_ file. 30 | 31 | .. warning:: 32 | When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. 33 | 34 | 35 | 2. Rewrite the :code:`forward` methods 36 | -------------------------------------- 37 | 38 | Next, you need to rewrite the :code:`forward` methods of your model by following these steps: 39 | 40 | 1. Remove any unnecessary code, such as the code only used for training. 41 | 2. Change the input parameters: 42 | 43 | .. 
code-block:: diff 44 | 45 | def forward( 46 | self, 47 | input_ids: torch.Tensor, 48 | - attention_mask: Optional[torch.Tensor] = None, 49 | - position_ids: Optional[torch.LongTensor] = None, 50 | - past_key_values: Optional[List[torch.FloatTensor]] = None, 51 | - inputs_embeds: Optional[torch.FloatTensor] = None, 52 | - labels: Optional[torch.LongTensor] = None, 53 | - use_cache: Optional[bool] = None, 54 | - output_attentions: Optional[bool] = None, 55 | - output_hidden_states: Optional[bool] = None, 56 | - return_dict: Optional[bool] = None, 57 | -) -> Union[Tuple, CausalLMOutputWithPast]: 58 | + positions: torch.Tensor, 59 | + kv_caches: List[KVCache], 60 | + input_metadata: InputMetadata, 61 | + cache_events: Optional[List[torch.cuda.Event]], 62 | +) -> SamplerOutput: 63 | 64 | 3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors. 65 | 4. Replace the attention operation with either :code:`GPTPagedAttention` or :code:`GPTNeoXPagedAttention`, depending on the model's architecture. 66 | 67 | .. note:: 68 | Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. 69 | If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. 70 | 71 | 72 | 3. (Optional) Implement tensor parallelism support 73 | -------------------------------------------------- 74 | 75 | If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. 76 | To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. 77 | For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. 78 | When it comes to the linear layers, you should use either :code:`RowParallelLinear` or :code:`ColumnParallelLinear`. 79 | Typically, :code:`ColumnParallelLinear` is used for QKV linear layers and the first linear layers of the MLP blocks. 80 | For the remaining linear layers, :code:`RowParallelLinear` is used. 81 | 82 | 83 | 4. Implement the weight loading logic 84 | ------------------------------------- 85 | 86 | You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class. 87 | This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. 88 | While the process is straightforward for most layers, the tensor-parallel layers necessitate some additional care as their weights should be partitioned to multiple GPUs. 89 | 90 | 91 | 5. Register your model 92 | ---------------------- 93 | 94 | Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py `_ and register it to the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py `_. 95 | -------------------------------------------------------------------------------- /docs/source/getting_started/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Quickstart 4 | ========== 5 | 6 | This guide shows how to use vLLM to: 7 | 8 | * run offline batched inference on a dataset; 9 | * build an API server for a large language model; 10 | * start an OpenAI-compatible API server. 11 | 12 | Be sure to complete the :ref:`installation instructions ` before continuing with this guide. 
13 | 14 | Offline Batched Inference 15 | ------------------------- 16 | 17 | We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. 18 | 19 | Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process. 20 | 21 | .. code-block:: python 22 | 23 | from vllm import LLM, SamplingParams 24 | 25 | Define the list of input prompts and the sampling parameters for generation. The sampling temperature is set to 0.8 and the nucleus sampling probability is set to 0.95. For more information about the sampling parameters, refer to the `class definition `_. 26 | 27 | .. code-block:: python 28 | 29 | prompts = [ 30 | "Hello, my name is", 31 | "The president of the United States is", 32 | "The capital of France is", 33 | "The future of AI is", 34 | ] 35 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 36 | 37 | Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. 38 | 39 | .. code-block:: python 40 | 41 | llm = LLM(model="facebook/opt-125m") 42 | 43 | Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens. 44 | 45 | .. code-block:: python 46 | 47 | outputs = llm.generate(prompts, sampling_params) 48 | 49 | # Print the outputs. 50 | for output in outputs: 51 | prompt = output.prompt 52 | generated_text = output.outputs[0].text 53 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 54 | 55 | 56 | The code example can also be found in `examples/offline_inference.py `_. 57 | 58 | 59 | API Server 60 | ---------- 61 | 62 | vLLM can be deployed as an LLM service. We provide an example `FastAPI `_ server. Check `vllm/entrypoints/api_server.py `_ for the server implementation. The server uses ``AsyncLLMEngine`` class to support asynchronous processing of incoming requests. 63 | 64 | Start the server: 65 | 66 | .. code-block:: console 67 | 68 | $ python -m vllm.entrypoints.api_server 69 | 70 | By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model. 71 | 72 | Query the model in shell: 73 | 74 | .. code-block:: console 75 | 76 | $ curl http://localhost:8000/generate \ 77 | $ -d '{ 78 | $ "prompt": "San Francisco is a", 79 | $ "use_beam_search": true, 80 | $ "n": 4, 81 | $ "temperature": 0 82 | $ }' 83 | 84 | See `examples/api_client.py `_ for a more detailed client example. 85 | 86 | OpenAI-Compatible Server 87 | ------------------------ 88 | 89 | vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. 90 | 91 | Start the server: 92 | 93 | .. code-block:: console 94 | 95 | $ python -m vllm.entrypoints.openai.api_server \ 96 | $ --model facebook/opt-125m 97 | 98 | By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models `_ and `create completion `_ endpoints. 
We are actively adding support for more endpoints. 99 | 100 | This server can be queried in the same format as OpenAI API. For example, list the models: 101 | 102 | .. code-block:: console 103 | 104 | $ curl http://localhost:8000/v1/models 105 | 106 | Query the model with input prompts: 107 | 108 | .. code-block:: console 109 | 110 | $ curl http://localhost:8000/v1/completions \ 111 | $ -H "Content-Type: application/json" \ 112 | $ -d '{ 113 | $ "model": "facebook/opt-125m", 114 | $ "prompt": "San Francisco is a", 115 | $ "max_tokens": 7, 116 | $ "temperature": 0 117 | $ }' 118 | 119 | Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package: 120 | 121 | .. code-block:: python 122 | 123 | import openai 124 | # Modify OpenAI's API key and API base to use vLLM's API server. 125 | openai.api_key = "EMPTY" 126 | openai.api_base = "http://localhost:8000/v1" 127 | completion = openai.Completion.create(model="facebook/opt-125m", 128 | prompt="San Francisco is a") 129 | print("Completion result:", completion) 130 | 131 | For a more detailed client example, refer to `examples/openai_client.py `_. 132 | -------------------------------------------------------------------------------- /tests/kernels/test_cache.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | import torch 5 | 6 | from vllm import cache_ops 7 | 8 | DTYPES = [torch.half, torch.bfloat16, torch.float] 9 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 10 | NUM_LAYERS = [5] # Arbitrary values for testing 11 | NUM_HEADS = [8] # Arbitrary values for testing 12 | HEAD_SIZES = [64, 80, 96, 112, 128, 256] 13 | BLOCK_SIZES = [8, 16, 32] 14 | NUM_BLOCKS = [1024] # Arbitrary values for testing 15 | NUM_MAPPINGS = [32, 256] # Arbitrary values for testing 16 | SEEDS = [0] 17 | 18 | 19 | @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) 20 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 21 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 22 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 23 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 24 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 25 | @pytest.mark.parametrize("dtype", DTYPES) 26 | @pytest.mark.parametrize("seed", SEEDS) 27 | @torch.inference_mode() 28 | def test_copy_blocks( 29 | kv_cache_factory, 30 | num_mappings: int, 31 | num_layers: int, 32 | num_heads: int, 33 | head_size: int, 34 | block_size: int, 35 | num_blocks: int, 36 | dtype: torch.dtype, 37 | seed: int, 38 | ) -> None: 39 | random.seed(seed) 40 | torch.random.manual_seed(seed) 41 | torch.cuda.manual_seed(seed) 42 | 43 | # Generate random block mappings where each source block is mapped to two 44 | # destination blocks. 45 | assert 2 * num_mappings <= num_blocks 46 | src_blocks = random.sample(range(num_blocks), num_mappings) 47 | remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) 48 | dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) 49 | block_mapping = {} 50 | for i in range(num_mappings): 51 | src = src_blocks[i] 52 | dst1 = dst_blocks[2 * i] 53 | dst2 = dst_blocks[2 * i + 1] 54 | block_mapping[src] = [dst1, dst2] 55 | 56 | # Create the KV caches. 57 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 58 | num_layers, num_heads, 59 | head_size, dtype, seed) 60 | 61 | # Clone the KV caches. 
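    # The clones serve as the reference: the kernel updates key_caches and
    # value_caches in place, while the same block mapping is replayed on the
    # clones in plain Python below and the two results are compared.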
62 | cloned_key_caches = [key_cache.clone() for key_cache in key_caches] 63 | cloned_value_caches = [value_cache.clone() for value_cache in value_caches] 64 | 65 | # Call the copy blocks kernel. 66 | cache_ops.copy_blocks(key_caches, value_caches, block_mapping) 67 | 68 | # Run the reference implementation. 69 | for src, dsts in block_mapping.items(): 70 | for dst in dsts: 71 | for cloned_key_cache in cloned_key_caches: 72 | cloned_key_cache[dst] = cloned_key_cache[src] 73 | for cloned_value_cache in cloned_value_caches: 74 | cloned_value_cache[dst] = cloned_value_cache[src] 75 | 76 | # Compare the results. 77 | for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): 78 | assert torch.allclose(key_cache, cloned_key_cache) 79 | for value_cache, cloned_value_cache in zip(value_caches, 80 | cloned_value_caches): 81 | assert torch.allclose(value_cache, cloned_value_cache) 82 | 83 | 84 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 85 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 86 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 87 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 88 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 89 | @pytest.mark.parametrize("dtype", DTYPES) 90 | @pytest.mark.parametrize("seed", SEEDS) 91 | @torch.inference_mode() 92 | def test_reshape_and_cache( 93 | kv_cache_factory, 94 | num_tokens: int, 95 | num_heads: int, 96 | head_size: int, 97 | block_size: int, 98 | num_blocks: int, 99 | dtype: torch.dtype, 100 | seed: int, 101 | ) -> None: 102 | random.seed(seed) 103 | torch.random.manual_seed(seed) 104 | torch.cuda.manual_seed(seed) 105 | 106 | # Create a random slot mapping. 107 | num_slots = block_size * num_blocks 108 | slot_mapping = random.sample(range(num_slots), num_tokens) 109 | slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device='cuda') 110 | 111 | qkv = torch.randn(num_tokens, 112 | 3, 113 | num_heads, 114 | head_size, 115 | dtype=dtype, 116 | device='cuda') 117 | _, key, value = qkv.unbind(dim=1) 118 | 119 | # Create the KV caches. 120 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, 121 | num_heads, head_size, dtype, 122 | seed) 123 | key_cache, value_cache = key_caches[0], value_caches[0] 124 | 125 | # Clone the KV caches. 126 | cloned_key_cache = key_cache.clone() 127 | cloned_value_cache = value_cache.clone() 128 | 129 | # Call the reshape_and_cache kernel. 130 | cache_ops.reshape_and_cache(key, value, key_cache, value_cache, 131 | slot_mapping) 132 | 133 | # Run the reference implementation. 
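    # Each flat slot index decomposes into a (block, offset) pair via
    # slot // block_size and slot % block_size, and the key is reshaped into
    # the blocked [num_heads, head_size // x, x] layout so it can be written
    # into the cloned key cache directly.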
134 | reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) 135 | block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor') 136 | block_indicies = block_indicies.cpu().tolist() 137 | block_offsets = slot_mapping % block_size 138 | block_offsets = block_offsets.cpu().tolist() 139 | for i in range(num_tokens): 140 | block_idx = block_indicies[i] 141 | block_offset = block_offsets[i] 142 | cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] 143 | cloned_value_cache[block_idx, :, :, block_offset] = value[i] 144 | 145 | assert torch.allclose(key_cache, cloned_key_cache) 146 | assert torch.allclose(value_cache, cloned_value_cache) 147 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/protocol.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py 3 | import time 4 | from typing import Dict, List, Literal, Optional, Union 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from vllm.utils import random_uuid 9 | 10 | 11 | class ErrorResponse(BaseModel): 12 | object: str = "error" 13 | message: str 14 | type: str 15 | param: Optional[str] = None 16 | code: Optional[str] = None 17 | 18 | 19 | class ModelPermission(BaseModel): 20 | id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}") 21 | object: str = "model_permission" 22 | created: int = Field(default_factory=lambda: int(time.time())) 23 | allow_create_engine: bool = False 24 | allow_sampling: bool = True 25 | allow_logprobs: bool = True 26 | allow_search_indices: bool = False 27 | allow_view: bool = True 28 | allow_fine_tuning: bool = False 29 | organization: str = "*" 30 | group: Optional[str] = None 31 | is_blocking: str = False 32 | 33 | 34 | class ModelCard(BaseModel): 35 | id: str 36 | object: str = "model" 37 | created: int = Field(default_factory=lambda: int(time.time())) 38 | owned_by: str = "vllm" 39 | root: Optional[str] = None 40 | parent: Optional[str] = None 41 | permission: List[ModelPermission] = Field(default_factory=list) 42 | 43 | 44 | class ModelList(BaseModel): 45 | object: str = "list" 46 | data: List[ModelCard] = Field(default_factory=list) 47 | 48 | 49 | class UsageInfo(BaseModel): 50 | prompt_tokens: int = 0 51 | total_tokens: int = 0 52 | completion_tokens: Optional[int] = 0 53 | 54 | 55 | class ChatCompletionRequest(BaseModel): 56 | model: str 57 | messages: Union[str, List[Dict[str, str]]] 58 | temperature: Optional[float] = 0.7 59 | top_p: Optional[float] = 1.0 60 | n: Optional[int] = 1 61 | max_tokens: Optional[int] = 16 62 | stop: Optional[Union[str, List[str]]] = Field(default_factory=list) 63 | stream: Optional[bool] = False 64 | presence_penalty: Optional[float] = 0.0 65 | frequency_penalty: Optional[float] = 0.0 66 | logit_bias: Optional[Dict[str, float]] = None 67 | user: Optional[str] = None 68 | # Additional parameters supported by vLLM 69 | best_of: Optional[int] = None 70 | top_k: Optional[int] = -1 71 | ignore_eos: Optional[bool] = False 72 | use_beam_search: Optional[bool] = False 73 | 74 | 75 | class CompletionRequest(BaseModel): 76 | model: str 77 | # a string, array of strings, array of tokens, or array of token arrays 78 | prompt: Union[List[int], List[List[int]], str, List[str]] 79 | suffix: Optional[str] = None 80 | max_tokens: Optional[int] = 16 81 | temperature: Optional[float] = 1.0 82 | top_p: 
Optional[float] = 1.0 83 | n: Optional[int] = 1 84 | stream: Optional[bool] = False 85 | logprobs: Optional[int] = None 86 | echo: Optional[bool] = False 87 | stop: Optional[Union[str, List[str]]] = Field(default_factory=list) 88 | presence_penalty: Optional[float] = 0.0 89 | frequency_penalty: Optional[float] = 0.0 90 | best_of: Optional[int] = None 91 | logit_bias: Optional[Dict[str, float]] = None 92 | user: Optional[str] = None 93 | # Additional parameters supported by vLLM 94 | top_k: Optional[int] = -1 95 | ignore_eos: Optional[bool] = False 96 | use_beam_search: Optional[bool] = False 97 | 98 | 99 | class LogProbs(BaseModel): 100 | text_offset: List[int] = Field(default_factory=list) 101 | token_logprobs: List[Optional[float]] = Field(default_factory=list) 102 | tokens: List[str] = Field(default_factory=list) 103 | top_logprobs: List[Optional[Dict[str, 104 | float]]] = Field(default_factory=list) 105 | 106 | 107 | class CompletionResponseChoice(BaseModel): 108 | index: int 109 | text: str 110 | logprobs: Optional[LogProbs] = None 111 | finish_reason: Optional[Literal["stop", "length"]] = None 112 | 113 | 114 | class CompletionResponse(BaseModel): 115 | id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") 116 | object: str = "text_completion" 117 | created: int = Field(default_factory=lambda: int(time.time())) 118 | model: str 119 | choices: List[CompletionResponseChoice] 120 | usage: UsageInfo 121 | 122 | 123 | class CompletionResponseStreamChoice(BaseModel): 124 | index: int 125 | text: str 126 | logprobs: Optional[LogProbs] = None 127 | finish_reason: Optional[Literal["stop", "length"]] = None 128 | 129 | 130 | class CompletionStreamResponse(BaseModel): 131 | id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") 132 | object: str = "text_completion" 133 | created: int = Field(default_factory=lambda: int(time.time())) 134 | model: str 135 | choices: List[CompletionResponseStreamChoice] 136 | 137 | 138 | class ChatMessage(BaseModel): 139 | role: str 140 | content: str 141 | 142 | 143 | class ChatCompletionResponseChoice(BaseModel): 144 | index: int 145 | message: ChatMessage 146 | finish_reason: Optional[Literal["stop", "length"]] = None 147 | 148 | 149 | class ChatCompletionResponse(BaseModel): 150 | id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") 151 | object: str = "chat.completion" 152 | created: int = Field(default_factory=lambda: int(time.time())) 153 | model: str 154 | choices: List[ChatCompletionResponseChoice] 155 | usage: UsageInfo 156 | 157 | 158 | class DeltaMessage(BaseModel): 159 | role: Optional[str] = None 160 | content: Optional[str] = None 161 | 162 | 163 | class ChatCompletionResponseStreamChoice(BaseModel): 164 | index: int 165 | delta: DeltaMessage 166 | finish_reason: Optional[Literal["stop", "length"]] = None 167 | 168 | 169 | class ChatCompletionStreamResponse(BaseModel): 170 | id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") 171 | object: str = "chat.completion.chunk" 172 | created: int = Field(default_factory=lambda: int(time.time())) 173 | model: str 174 | choices: List[ChatCompletionResponseStreamChoice] 175 | -------------------------------------------------------------------------------- /tests/kernels/test_pos_encoding.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import pytest 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from vllm import pos_encoding_ops 9 | 10 | 
IS_NEOX_STYLE = [True, False] 11 | DTYPES = [torch.half, torch.bfloat16, torch.float] 12 | HEAD_SIZES = [64, 80, 96, 112, 128, 256] 13 | ROTARY_DIMS = [None, 32] # None means rotary dim == head size 14 | NUM_HEADS = [7, 12, 40, 52] # Arbitrary values for testing 15 | NUM_TOKENS = [11, 83, 2048] # Arbitrary values for testing 16 | SEEDS = [0] 17 | 18 | 19 | def rotate_neox(x: torch.Tensor) -> torch.Tensor: 20 | x1 = x[..., :x.shape[-1] // 2] 21 | x2 = x[..., x.shape[-1] // 2:] 22 | return torch.cat((-x2, x1), dim=-1) 23 | 24 | 25 | def rotate_gptj(x: torch.Tensor) -> torch.Tensor: 26 | x1 = x[..., ::2] 27 | x2 = x[..., 1::2] 28 | x = torch.stack((-x2, x1), dim=-1) 29 | return x.flatten(-2) 30 | 31 | 32 | def apply_rope( 33 | q: torch.Tensor, 34 | k: torch.Tensor, 35 | cos: torch.Tensor, 36 | sin: torch.Tensor, 37 | is_neox_style: bool, 38 | ) -> Tuple[torch.Tensor, torch.Tensor]: 39 | rotate_fn = rotate_neox if is_neox_style else rotate_gptj 40 | q_embed = (q * cos) + (rotate_fn(q) * sin) 41 | k_embed = (k * cos) + (rotate_fn(k) * sin) 42 | return q_embed, k_embed 43 | 44 | 45 | class RefRotaryEmbedding(nn.Module): 46 | """Reference implementation of rotary embedding.""" 47 | 48 | def __init__( 49 | self, 50 | dim: int, 51 | is_neox_style: bool, 52 | max_position_embeddings: int = 8192, 53 | base: int = 10000, 54 | ) -> None: 55 | super().__init__() 56 | self.rotary_dim = dim 57 | self.is_neox_style = is_neox_style 58 | self.max_position_embeddings = max_position_embeddings 59 | 60 | # Create cos and sin embeddings. 61 | inv_freq = 1.0 / (base**(torch.arange(0, dim, 2) / dim)) 62 | t = torch.arange(max_position_embeddings).float() 63 | freqs = torch.einsum("i,j->ij", t, inv_freq.float()) 64 | if is_neox_style: 65 | emb = torch.cat((freqs, freqs), dim=-1) 66 | else: 67 | emb = torch.repeat_interleave(freqs, 2, -1) 68 | cos = emb.cos().to(dtype=inv_freq.dtype) 69 | sin = emb.sin().to(dtype=inv_freq.dtype) 70 | self.register_buffer("cos_cached", cos, persistent=False) 71 | self.register_buffer("sin_cached", sin, persistent=False) 72 | 73 | def forward( 74 | self, 75 | positions: torch.Tensor, # [num_tokens] 76 | query: torch.Tensor, # [num_tokens, num_heads, head_size] 77 | key: torch.Tensor, # [num_tokens, num_heads, head_size] 78 | ) -> Tuple[torch.Tensor, torch.Tensor]: 79 | query_rot = query[..., :self.rotary_dim] 80 | query_pass = query[..., self.rotary_dim:] 81 | key_rot = key[..., :self.rotary_dim] 82 | key_pass = key[..., self.rotary_dim:] 83 | 84 | query_rot = query_rot.transpose(0, 1) 85 | key_rot = key_rot.transpose(0, 1) 86 | cos = F.embedding(positions, self.cos_cached) 87 | sin = F.embedding(positions, self.sin_cached) 88 | 89 | query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin, 90 | self.is_neox_style) 91 | query_rot = query_rot.transpose(0, 1).contiguous() 92 | key_rot = key_rot.transpose(0, 1).contiguous() 93 | 94 | query = torch.cat((query_rot, query_pass), dim=-1) 95 | key = torch.cat((key_rot, key_pass), dim=-1) 96 | 97 | # Output query/key shape: [num_tokens, num_tokens, head_size] 98 | return query, key 99 | 100 | 101 | @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) 102 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 103 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 104 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 105 | @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) 106 | @pytest.mark.parametrize("dtype", DTYPES) 107 | @pytest.mark.parametrize("seed", SEEDS) 108 | @torch.inference_mode() 109 | def test_rotary_embedding( 110 | 
is_neox_style: bool, 111 | num_tokens: int, 112 | num_heads: int, 113 | head_size: int, 114 | rotary_dim: Optional[int], 115 | dtype: torch.dtype, 116 | seed: int, 117 | max_position: int = 8192, 118 | base: int = 10000, 119 | ) -> None: 120 | if rotary_dim is None: 121 | rotary_dim = head_size 122 | torch.random.manual_seed(seed) 123 | torch.cuda.manual_seed(seed) 124 | 125 | positions = torch.randint(0, max_position, (num_tokens, ), device="cuda") 126 | query = torch.randn(num_tokens, 127 | num_heads * head_size, 128 | dtype=dtype, 129 | device="cuda") 130 | key = torch.randn(num_tokens, 131 | num_heads * head_size, 132 | dtype=dtype, 133 | device="cuda") 134 | 135 | # Create the rotary embedding. 136 | inv_freq = 1.0 / (base**( 137 | torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim)) 138 | t = torch.arange(max_position).float() 139 | freqs = torch.einsum("i,j -> ij", t, inv_freq) 140 | cos = freqs.cos() 141 | sin = freqs.sin() 142 | cos_sin_cache = torch.cat((cos, sin), dim=-1) 143 | cos_sin_cache = cos_sin_cache.to(dtype=dtype, device='cuda') 144 | 145 | # Run the kernel. The kernel is in-place, so we need to clone the inputs. 146 | out_query = query.clone() 147 | out_key = key.clone() 148 | pos_encoding_ops.rotary_embedding( 149 | positions, 150 | out_query, 151 | out_key, 152 | head_size, 153 | cos_sin_cache, 154 | is_neox_style, 155 | ) 156 | 157 | # Run the reference implementation. 158 | ref_rotary_embedding = RefRotaryEmbedding( 159 | dim=rotary_dim, 160 | is_neox_style=is_neox_style, 161 | max_position_embeddings=max_position, 162 | base=base, 163 | ).to(dtype=dtype, device="cuda") 164 | ref_query, ref_key = ref_rotary_embedding( 165 | positions, 166 | query.view(num_tokens, num_heads, head_size), 167 | key.view(num_tokens, num_heads, head_size), 168 | ) 169 | ref_query = ref_query.view(num_tokens, num_heads * head_size) 170 | ref_key = ref_key.view(num_tokens, num_heads * head_size) 171 | 172 | # Compare the results. 173 | assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5) 174 | assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5) 175 | -------------------------------------------------------------------------------- /vllm/worker/cache_engine.py: -------------------------------------------------------------------------------- 1 | """CacheEngine class for managing the KV cache.""" 2 | from typing import Dict, List, Tuple 3 | 4 | import torch 5 | 6 | from vllm import cache_ops 7 | from vllm.config import CacheConfig, ModelConfig, ParallelConfig 8 | from vllm.logger import init_logger 9 | from vllm.utils import in_wsl 10 | 11 | logger = init_logger(__name__) 12 | 13 | KVCache = Tuple[torch.Tensor, torch.Tensor] 14 | 15 | 16 | class CacheEngine: 17 | """Manages the KV cache. 18 | 19 | This class is responsible for initializing and managing the GPU and CPU KV 20 | caches. It also provides methods for performing KV cache operations, such 21 | as swapping and copying. 
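    As implemented below, each layer's key cache is shaped
    [num_blocks, num_heads, head_size // x, block_size, x] with
    x = 16 // dtype_size, and each value cache is shaped
    [num_blocks, num_heads, head_size, block_size].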
22 | """ 23 | 24 | def __init__( 25 | self, 26 | cache_config: CacheConfig, 27 | model_config: ModelConfig, 28 | parallel_config: ParallelConfig, 29 | ) -> None: 30 | self.cache_config = cache_config 31 | self.model_config = model_config 32 | self.parallel_config = parallel_config 33 | 34 | self.head_size = model_config.get_head_size() 35 | self.num_layers = model_config.get_num_layers(parallel_config) 36 | self.num_heads = model_config.get_num_heads(parallel_config) 37 | self.dtype = model_config.dtype 38 | 39 | self.block_size = cache_config.block_size 40 | self.num_gpu_blocks = cache_config.num_gpu_blocks 41 | self.num_cpu_blocks = cache_config.num_cpu_blocks 42 | 43 | # Initialize the cache. 44 | self.gpu_cache = self.allocate_gpu_cache() 45 | self.cpu_cache = self.allocate_cpu_cache() 46 | 47 | # Initialize the stream for caching operations. 48 | self.cache_stream = torch.cuda.Stream() 49 | assert self.cache_stream != torch.cuda.current_stream() 50 | # Initialize the events for stream synchronization. 51 | self.events = [torch.cuda.Event() for _ in range(self.num_layers)] 52 | 53 | def get_key_block_shape(self) -> Tuple[int, int, int, int]: 54 | element_size = torch.tensor([], dtype=self.dtype).element_size() 55 | x = 16 // element_size 56 | return ( 57 | self.num_heads, 58 | self.head_size // x, 59 | self.block_size, 60 | x, 61 | ) 62 | 63 | def get_value_block_shape(self) -> Tuple[int, int, int]: 64 | return ( 65 | self.num_heads, 66 | self.head_size, 67 | self.block_size, 68 | ) 69 | 70 | def allocate_gpu_cache(self) -> List[KVCache]: 71 | gpu_cache: List[KVCache] = [] 72 | key_block_shape = self.get_key_block_shape() 73 | value_block_shape = self.get_value_block_shape() 74 | for _ in range(self.num_layers): 75 | key_blocks = torch.empty( 76 | size=(self.num_gpu_blocks, *key_block_shape), 77 | dtype=self.dtype, 78 | device="cuda", 79 | ) 80 | value_blocks = torch.empty( 81 | size=(self.num_gpu_blocks, *value_block_shape), 82 | dtype=self.dtype, 83 | device="cuda", 84 | ) 85 | gpu_cache.append((key_blocks, value_blocks)) 86 | return gpu_cache 87 | 88 | def allocate_cpu_cache(self) -> List[KVCache]: 89 | cpu_cache: List[KVCache] = [] 90 | key_block_shape = self.get_key_block_shape() 91 | value_block_shape = self.get_value_block_shape() 92 | pin_memory = not in_wsl() 93 | if not pin_memory: 94 | # Pinning memory in WSL is not supported. 95 | # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications 96 | logger.warning("Using 'pin_memory=False' as WSL is detected. " 97 | "This may slow down the performance.") 98 | for _ in range(self.num_layers): 99 | key_blocks = torch.empty( 100 | size=(self.num_cpu_blocks, *key_block_shape), 101 | dtype=self.dtype, 102 | pin_memory=pin_memory, 103 | ) 104 | value_blocks = torch.empty( 105 | size=(self.num_cpu_blocks, *value_block_shape), 106 | dtype=self.dtype, 107 | pin_memory=pin_memory, 108 | ) 109 | cpu_cache.append((key_blocks, value_blocks)) 110 | return cpu_cache 111 | 112 | def _swap( 113 | self, 114 | src: List[KVCache], 115 | dst: List[KVCache], 116 | src_to_dst: Dict[int, int], 117 | ) -> None: 118 | with torch.cuda.stream(self.cache_stream): 119 | for i in range(self.num_layers): 120 | src_key_cache, src_value_cache = src[i] 121 | dst_key_cache, dst_value_cache = dst[i] 122 | # Copy the key blocks. 123 | cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) 124 | # Copy the value blocks. 
125 | cache_ops.swap_blocks(src_value_cache, dst_value_cache, 126 | src_to_dst) 127 | event = self.events[i] 128 | event.record(stream=self.cache_stream) 129 | 130 | def swap_in(self, src_to_dst: Dict[int, int]) -> None: 131 | self._swap(self.cpu_cache, self.gpu_cache, src_to_dst) 132 | 133 | def swap_out(self, src_to_dst: Dict[int, int]) -> None: 134 | self._swap(self.gpu_cache, self.cpu_cache, src_to_dst) 135 | 136 | def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: 137 | key_caches = [key_cache for key_cache, _ in self.gpu_cache] 138 | value_caches = [value_cache for _, value_cache in self.gpu_cache] 139 | # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. 140 | cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) 141 | 142 | @staticmethod 143 | def get_cache_block_size( 144 | block_size: int, 145 | model_config: ModelConfig, 146 | parallel_config: ParallelConfig, 147 | ) -> int: 148 | head_size = model_config.get_head_size() 149 | num_heads = model_config.get_num_heads(parallel_config) 150 | num_layers = model_config.get_num_layers(parallel_config) 151 | 152 | key_cache_block = block_size * num_heads * head_size 153 | value_cache_block = key_cache_block 154 | total = num_layers * (key_cache_block + value_cache_block) 155 | dtype_size = _get_dtype_size(model_config.dtype) 156 | return dtype_size * total 157 | 158 | 159 | def _get_dtype_size(dtype: torch.dtype) -> int: 160 | return torch.tensor([], dtype=dtype).element_size() 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | vLLM 5 | 6 |

7 | 8 |

9 | Easy, fast, and cheap LLM serving for everyone 10 |

11 | 12 |

13 | | Documentation | Blog | Paper | Discussions | 14 | 15 |

16 | 17 | --- 18 | 19 | *Latest News* 🔥 20 | - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv! 21 | - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. 22 | - [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command! 23 | - [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds. 24 | - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). 25 | 26 | --- 27 | 28 | vLLM is a fast and easy-to-use library for LLM inference and serving. 29 | 30 | vLLM is fast with: 31 | 32 | - State-of-the-art serving throughput 33 | - Efficient management of attention key and value memory with **PagedAttention** 34 | - Continuous batching of incoming requests 35 | - Optimized CUDA kernels 36 | 37 | vLLM is flexible and easy to use with: 38 | 39 | - Seamless integration with popular Hugging Face models 40 | - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more 41 | - Tensor parallelism support for distributed inference 42 | - Streaming outputs 43 | - OpenAI-compatible API server 44 | 45 | vLLM seamlessly supports many Hugging Face models, including the following architectures: 46 | 47 | - Aquila (`BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.) 48 | - Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.) 49 | - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.) 50 | - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.) 51 | - GPT-2 (`gpt2`, `gpt2-xl`, etc.) 52 | - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.) 53 | - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.) 54 | - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.) 55 | - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.) 56 | - LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.) 57 | - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.) 58 | - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.) 59 | - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) 60 | 61 | Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): 62 | 63 | ```bash 64 | pip install vllm 65 | ``` 66 | 67 | ## Getting Started 68 | 69 | Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started. 
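For a quick taste of the API before diving into the docs, here is a minimal offline-inference sketch (the model name and prompt are illustrative placeholders):

```python
from vllm import LLM, SamplingParams

# "facebook/opt-125m" is only an illustrative choice; any supported model works.
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)

outputs = llm.generate(["The capital of France is"],
                       sampling_params=sampling_params)
for output in outputs:
    print(output.prompt, output.outputs[0].text)
```

The documentation pages linked below cover installation, a fuller quickstart, and the list of supported models.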
70 | - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) 71 | - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) 72 | - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) 73 | 74 | ## Performance 75 | 76 | vLLM outperforms Hugging Face Transformers (HF) by up to 24x and Text Generation Inference (TGI) by up to 3.5x, in terms of throughput. 77 | For details, check out our [blog post](https://vllm.ai). 78 | 79 |

80 | [figure: serving throughput of vLLM vs. HF Transformers and TGI] 88 |
89 | Serving throughput when each request asks for 1 output completion. 90 |

91 | 92 |

93 | [figure: serving throughput of vLLM vs. HF Transformers and TGI] 100 |
101 | Serving throughput when each request asks for 3 output completions. 102 |

103 | 104 | ## Contributing 105 | 106 | We welcome and value any contributions and collaborations. 107 | Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. 108 | 109 | ## Citation 110 | 111 | If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): 112 | ```bibtex 113 | @inproceedings{kwon2023efficient, 114 | title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, 115 | author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica}, 116 | booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, 117 | year={2023} 118 | } 119 | ``` 120 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/tensor_parallel/random.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The vLLM team. 2 | # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/random.py 3 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 4 | 5 | # Parts of the code here are adapted from PyTorch 6 | # repo: https://github.com/pytorch/pytorch 7 | 8 | import contextlib 9 | 10 | import torch 11 | from torch import _C 12 | from torch.cuda import _lazy_call, device as device_ctx_manager 13 | 14 | from vllm.model_executor.parallel_utils.parallel_state import ( 15 | get_tensor_model_parallel_rank, 16 | ) 17 | 18 | # Default name for the model parallel rng tracker. 19 | _MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' 20 | 21 | 22 | def _set_cuda_rng_state(new_state, device=-1): 23 | """Sets the random number generator state of the current GPU. 24 | 25 | Argumentss: 26 | new_state (torch.ByteTensor): The desired state 27 | This function is adapted from PyTorch repo (torch.cuda.set_rng_state) 28 | with a single change: the input state is not cloned. Cloning caused 29 | major performance issues for +4 GPU cases. 30 | """ 31 | if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): 32 | # older PyTorch 33 | def cb(): 34 | with device_ctx_manager(device): 35 | _C._cuda_setRNGState(new_state) 36 | else: 37 | # newer PyTorch 38 | if device == -1: 39 | device = torch.device('cuda') 40 | elif isinstance(device, str): 41 | device = torch.device(device) 42 | elif isinstance(device, int): 43 | device = torch.device('cuda', device) 44 | 45 | def cb(): 46 | idx = device.index 47 | if idx is None: 48 | idx = torch.cuda.current_device() 49 | default_generator = torch.cuda.default_generators[idx] 50 | default_generator.set_state(new_state) 51 | 52 | _lazy_call(cb) 53 | 54 | 55 | 56 | class CudaRNGStatesTracker: 57 | """Tracker for the cuda RNG states. 58 | 59 | Using the `add` method, a cuda rng state is initialized based on 60 | the input `seed` and is assigned to `name`. Later, by forking the 61 | rng state, we can perform operations and return to our starting 62 | cuda state. 63 | """ 64 | 65 | def __init__(self): 66 | # Map from a string name to the cuda rng state. 67 | self.states_ = {} 68 | # Seeds are just for book keeping and ensure no seed is set twice. 69 | self.seeds_ = set() 70 | 71 | def reset(self): 72 | """Set to the initial state (no tracker).""" 73 | self.states_ = {} 74 | self.seeds_ = set() 75 | 76 | def get_states(self): 77 | """Get rng states. 
Copy the dictionary so we have direct 78 | pointers to the states, not just a pointer to the dictionary.""" 79 | states = {} 80 | for name in self.states_: 81 | states[name] = self.states_[name] 82 | return states 83 | 84 | def set_states(self, states): 85 | """Set the rng states. For efficiency purposes, we do not check 86 | the size of seed for compatibility.""" 87 | self.states_ = states 88 | 89 | def add(self, name, seed): 90 | """Track the rng state.""" 91 | # Check seed is not already used. 92 | if seed in self.seeds_: 93 | raise Exception('seed {} already exists'.format(seed)) 94 | self.seeds_.add(seed) 95 | # Check that state is not already defined. 96 | if name in self.states_: 97 | raise Exception('cuda rng state {} already exists'.format(name)) 98 | # Get the current rng state. 99 | orig_rng_state = torch.cuda.get_rng_state() 100 | # Set the new state and store it. 101 | torch.cuda.manual_seed(seed) 102 | self.states_[name] = torch.cuda.get_rng_state() 103 | # Reset rng state to what it was. 104 | _set_cuda_rng_state(orig_rng_state) 105 | 106 | @contextlib.contextmanager 107 | def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): 108 | """Fork the cuda rng state, perform operations, and exit with 109 | the original state.""" 110 | # Check if we have added the state 111 | if name not in self.states_: 112 | raise Exception('cuda rng state {} is not added'.format(name)) 113 | # Store current rng state. 114 | orig_cuda_rng_state = torch.cuda.get_rng_state() 115 | # Set rng state to the desired one 116 | _set_cuda_rng_state(self.states_[name]) 117 | # Do the stuff we wanted to do. 118 | try: 119 | yield 120 | finally: 121 | # Update the current rng state for later use. 122 | self.states_[name] = torch.cuda.get_rng_state() 123 | # And set the state to the original state we started with. 124 | _set_cuda_rng_state(orig_cuda_rng_state) 125 | 126 | 127 | # RNG tracker object. 128 | _CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() 129 | 130 | 131 | def get_cuda_rng_tracker(): 132 | """Get cuda rng tracker.""" 133 | return _CUDA_RNG_STATE_TRACKER 134 | 135 | 136 | def model_parallel_cuda_manual_seed(seed): 137 | """Initialize model parallel cuda seed. 138 | 139 | This function should be called after the model parallel is 140 | initialized. Also, no torch.cuda.manual_seed should be called 141 | after this function. Basically, this is replacement for that 142 | function. 143 | Two set of RNG states are tracked: 144 | default state: This is for data parallelism and is the same among a 145 | set of model parallel GPUs but different across 146 | different model paralle groups. This is used for 147 | example for dropout in the non-tensor-model-parallel regions. 148 | tensor-model-parallel state: This state is different among a set of model 149 | parallel GPUs, but the same across data parallel 150 | groups. This is used for example for dropout in 151 | model parallel regions. 152 | """ 153 | # 2718 is just for fun and any POSITIVE value will work. 154 | offset = seed + 2718 155 | tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank() 156 | # Data parallel gets the original seed. 157 | data_parallel_seed = seed 158 | 159 | _CUDA_RNG_STATE_TRACKER.reset() 160 | # Set the default state. 161 | torch.cuda.manual_seed(data_parallel_seed) 162 | # and model parallel state. 
163 | _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, 164 | tensor_model_parallel_seed) 165 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import pytest 4 | import torch 5 | from transformers import AutoModelForCausalLM 6 | 7 | from vllm import LLM, SamplingParams 8 | from vllm.transformers_utils.tokenizer import get_tokenizer 9 | 10 | _TEST_PROMPTS = [ 11 | "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", 12 | "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", 13 | "Compare and contrast artificial intelligence with human intelligence in terms of processing information.", 14 | "Describe the basic components of a neural network and how it can be trained.", 15 | "Write a short story about a robot that dreams for the first time.", 16 | "Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.", 17 | "Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.", 18 | "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'", 19 | ] 20 | 21 | 22 | @pytest.fixture 23 | def example_prompts() -> List[str]: 24 | return _TEST_PROMPTS 25 | 26 | 27 | _STR_DTYPE_TO_TORCH_DTYPE = { 28 | "half": torch.half, 29 | "bfloat16": torch.bfloat16, 30 | "float": torch.float, 31 | } 32 | 33 | 34 | class HfRunner: 35 | 36 | def __init__( 37 | self, 38 | model_name: str, 39 | tokenizer_name: Optional[str] = None, 40 | dtype: str = "half", 41 | ) -> None: 42 | assert dtype in _STR_DTYPE_TO_TORCH_DTYPE 43 | torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] 44 | self.model = AutoModelForCausalLM.from_pretrained( 45 | model_name, 46 | torch_dtype=torch_dtype, 47 | trust_remote_code=True, 48 | ).cuda() 49 | if tokenizer_name is None: 50 | tokenizer_name = model_name 51 | self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True) 52 | 53 | def generate( 54 | self, 55 | prompts: List[str], 56 | **kwargs, 57 | ) -> List[Tuple[List[int], str]]: 58 | outputs: List[Tuple[List[int], str]] = [] 59 | for prompt in prompts: 60 | input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids 61 | output_ids = self.model.generate( 62 | input_ids.cuda(), 63 | use_cache=True, 64 | **kwargs, 65 | ) 66 | output_str = self.tokenizer.batch_decode( 67 | output_ids, 68 | skip_special_tokens=True, 69 | clean_up_tokenization_spaces=False, 70 | ) 71 | output_ids = output_ids.cpu().tolist() 72 | outputs.append((output_ids, output_str)) 73 | return outputs 74 | 75 | def generate_greedy( 76 | self, 77 | prompts: List[str], 78 | max_tokens: int, 79 | ) -> List[Tuple[List[int], str]]: 80 | outputs = self.generate(prompts, 81 | do_sample=False, 82 | max_new_tokens=max_tokens) 83 | for i in range(len(outputs)): 84 | output_ids, output_str = outputs[i] 85 | outputs[i] = (output_ids[0], output_str[0]) 86 | return outputs 87 | 88 | def generate_beam_search( 89 | self, 90 | prompts: List[str], 91 | beam_width: int, 92 | max_tokens: int, 93 | ) -> List[Tuple[List[int], str]]: 94 | outputs = self.generate(prompts, 95 | do_sample=False, 96 | max_new_tokens=max_tokens, 97 | num_beams=beam_width, 98 | num_return_sequences=beam_width) 99 | for i in range(len(outputs)): 100 | 
output_ids, output_str = outputs[i] 101 | for j in range(len(output_ids)): 102 | output_ids[j] = [ 103 | x for x in output_ids[j] 104 | if x != self.tokenizer.pad_token_id 105 | ] 106 | outputs[i] = (output_ids, output_str) 107 | return outputs 108 | 109 | 110 | @pytest.fixture 111 | def hf_runner(): 112 | return HfRunner 113 | 114 | 115 | class VllmRunner: 116 | 117 | def __init__( 118 | self, 119 | model_name: str, 120 | tokenizer_name: Optional[str] = None, 121 | dtype: str = "half", 122 | ) -> None: 123 | self.model = LLM( 124 | model=model_name, 125 | tokenizer=tokenizer_name, 126 | trust_remote_code=True, 127 | dtype=dtype, 128 | swap_space=0, 129 | ) 130 | 131 | def generate( 132 | self, 133 | prompts: List[str], 134 | sampling_params: SamplingParams, 135 | ) -> List[Tuple[List[int], str]]: 136 | req_outputs = self.model.generate(prompts, 137 | sampling_params=sampling_params) 138 | outputs = [] 139 | for req_output in req_outputs: 140 | prompt_str = req_output.prompt 141 | prompt_ids = req_output.prompt_token_ids 142 | req_sample_output_ids = [] 143 | req_sample_output_strs = [] 144 | for sample in req_output.outputs: 145 | output_str = sample.text 146 | output_ids = sample.token_ids 147 | req_sample_output_ids.append(prompt_ids + output_ids) 148 | req_sample_output_strs.append(prompt_str + output_str) 149 | outputs.append((req_sample_output_ids, req_sample_output_strs)) 150 | return outputs 151 | 152 | def generate_greedy( 153 | self, 154 | prompts: List[str], 155 | max_tokens: int, 156 | ) -> List[Tuple[List[int], str]]: 157 | greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) 158 | outputs = self.generate(prompts, greedy_params) 159 | return [(output_ids[0], output_str[0]) 160 | for output_ids, output_str in outputs] 161 | 162 | def generate_beam_search( 163 | self, 164 | prompts: List[str], 165 | beam_width: int, 166 | max_tokens: int, 167 | ) -> List[Tuple[List[int], str]]: 168 | beam_search_params = SamplingParams(n=beam_width, 169 | use_beam_search=True, 170 | temperature=0.0, 171 | max_tokens=max_tokens) 172 | outputs = self.generate(prompts, beam_search_params) 173 | return outputs 174 | 175 | 176 | @pytest.fixture 177 | def vllm_runner(): 178 | return VllmRunner 179 | -------------------------------------------------------------------------------- /csrc/attention/dtype_float32.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 4 | * Copyright (c) 2023, The vLLM team. 5 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | #pragma once 20 | 21 | #include "attention_generic.cuh" 22 | 23 | #include 24 | 25 | namespace vllm { 26 | 27 | // Define custom FP32 vector data types. 28 | struct Float4_ { 29 | float2 x; 30 | float2 y; 31 | }; 32 | 33 | struct Float8_ { 34 | float2 x; 35 | float2 y; 36 | float2 z; 37 | float2 w; 38 | }; 39 | 40 | // FP32 vector types for Q, K, V. 41 | template<> 42 | struct Vec { 43 | using Type = float; 44 | }; 45 | template<> 46 | struct Vec { 47 | using Type = float2; 48 | }; 49 | template<> 50 | struct Vec { 51 | using Type = float4; 52 | }; 53 | 54 | // FP32 accumulator vector types corresponding to Vec. 55 | template<> 56 | struct FloatVec { 57 | using Type = float; 58 | }; 59 | template<> 60 | struct FloatVec { 61 | using Type = float2; 62 | }; 63 | template<> 64 | struct FloatVec { 65 | using Type = float4; 66 | }; 67 | 68 | // Vector addition. 69 | inline __device__ float add(float a, float b) { 70 | return a + b; 71 | } 72 | 73 | inline __device__ float2 add(float2 a, float2 b) { 74 | float2 c; 75 | c.x = add(a.x, b.x); 76 | c.y = add(a.y, b.y); 77 | return c; 78 | } 79 | 80 | inline __device__ float4 add(float4 a, float4 b) { 81 | float4 c; 82 | c.x = add(a.x, b.x); 83 | c.y = add(a.y, b.y); 84 | c.z = add(a.z, b.z); 85 | c.w = add(a.w, b.w); 86 | return c; 87 | } 88 | 89 | // Vector multiplication. 90 | template<> 91 | inline __device__ float mul(float a, float b) { 92 | return a * b; 93 | } 94 | 95 | template<> 96 | inline __device__ float2 mul(float2 a, float2 b) { 97 | float2 c; 98 | c.x = a.x * b.x; 99 | c.y = a.y * b.y; 100 | return c; 101 | } 102 | 103 | template<> 104 | inline __device__ float2 mul(float a, float2 b) { 105 | float2 c; 106 | c.x = a * b.x; 107 | c.y = a * b.y; 108 | return c; 109 | } 110 | 111 | template<> 112 | inline __device__ float4 mul(float4 a, float4 b) { 113 | float4 c; 114 | c.x = a.x * b.x; 115 | c.y = a.y * b.y; 116 | c.z = a.z * b.z; 117 | c.w = a.w * b.w; 118 | return c; 119 | } 120 | 121 | template<> 122 | inline __device__ float4 mul(float a, float4 b) { 123 | float4 c; 124 | c.x = a * b.x; 125 | c.y = a * b.y; 126 | c.z = a * b.z; 127 | c.w = a * b.w; 128 | return c; 129 | } 130 | 131 | // Vector fused multiply-add. 
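// fma(a, b, c) computes a * b + c element-wise; the overloads that take a
// scalar `a` broadcast it across every lane of the vector operands.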
132 | inline __device__ float fma(float a, float b, float c) { 133 | return a * b + c; 134 | } 135 | 136 | inline __device__ float2 fma(float2 a, float2 b, float2 c) { 137 | float2 d; 138 | d.x = fma(a.x, b.x, c.x); 139 | d.y = fma(a.y, b.y, c.y); 140 | return d; 141 | } 142 | 143 | inline __device__ float2 fma(float a, float2 b, float2 c) { 144 | float2 d; 145 | d.x = fma(a, b.x, c.x); 146 | d.y = fma(a, b.y, c.y); 147 | return d; 148 | } 149 | 150 | inline __device__ float4 fma(float4 a, float4 b, float4 c) { 151 | float4 d; 152 | d.x = fma(a.x, b.x, c.x); 153 | d.y = fma(a.y, b.y, c.y); 154 | d.z = fma(a.z, b.z, c.z); 155 | d.w = fma(a.w, b.w, c.w); 156 | return d; 157 | } 158 | 159 | inline __device__ float4 fma(float a, float4 b, float4 c) { 160 | float4 d; 161 | d.x = fma(a, b.x, c.x); 162 | d.y = fma(a, b.y, c.y); 163 | d.z = fma(a, b.z, c.z); 164 | d.w = fma(a, b.w, c.w); 165 | return d; 166 | } 167 | 168 | inline __device__ Float4_ fma(float a, Float4_ b, Float4_ c) { 169 | Float4_ d; 170 | d.x = fma(a, b.x, c.x); 171 | d.y = fma(a, b.y, c.y); 172 | return d; 173 | } 174 | 175 | inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) { 176 | Float8_ d; 177 | d.x = fma(a, b.x, c.x); 178 | d.y = fma(a, b.y, c.y); 179 | d.z = fma(a, b.z, c.z); 180 | d.w = fma(a, b.w, c.w); 181 | return d; 182 | } 183 | 184 | // Vector sum. 185 | template<> 186 | inline __device__ float sum(float v) { 187 | return v; 188 | } 189 | 190 | template<> 191 | inline __device__ float sum(float2 v) { 192 | return v.x + v.y; 193 | } 194 | 195 | template<> 196 | inline __device__ float sum(float4 v) { 197 | return v.x + v.y + v.z + v.w; 198 | } 199 | 200 | template<> 201 | inline __device__ float sum(Float4_ v) { 202 | return v.x.x + v.x.y + v.y.x + v.y.y; 203 | } 204 | 205 | template<> 206 | inline __device__ float sum(Float8_ v) { 207 | return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y; 208 | } 209 | 210 | // Vector dot product. 211 | inline __device__ float dot(float a, float b) { 212 | return a * b; 213 | } 214 | 215 | inline __device__ float dot(float2 a, float2 b) { 216 | float2 c = mul(a, b); 217 | return c.x + c.y; 218 | } 219 | 220 | inline __device__ float dot(Float4_ a, Float4_ b) { 221 | float2 acc = mul(a.x, b.x); 222 | acc = fma(a.y, b.y, acc); 223 | return acc.x + acc.y; 224 | } 225 | 226 | inline __device__ float dot(Float8_ a, Float8_ b) { 227 | float2 acc = mul(a.x, b.x); 228 | acc = fma(a.y, b.y, acc); 229 | acc = fma(a.z, b.z, acc); 230 | acc = fma(a.w, b.w, acc); 231 | return acc.x + acc.y; 232 | } 233 | 234 | // From float to float. 235 | inline __device__ void from_float(float& dst, float src) { 236 | dst = src; 237 | } 238 | 239 | inline __device__ void from_float(float2& dst, float2 src) { 240 | dst = src; 241 | } 242 | 243 | inline __device__ void from_float(float4& dst, float4 src) { 244 | dst = src; 245 | } 246 | 247 | // From float to float. 248 | inline __device__ float to_float(float u) { 249 | return u; 250 | } 251 | 252 | inline __device__ float2 to_float(float2 u) { 253 | return u; 254 | } 255 | 256 | inline __device__ float4 to_float(float4 u) { 257 | return u; 258 | } 259 | 260 | inline __device__ Float4_ to_float(Float4_ u) { 261 | return u; 262 | } 263 | 264 | inline __device__ Float8_ to_float(Float8_ u) { 265 | return u; 266 | } 267 | 268 | // Zero-out a variable. 
269 | inline __device__ void zero(float& dst) { 270 | dst = 0.f; 271 | } 272 | 273 | } // namespace vllm 274 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | from transformers import (AutoTokenizer, PreTrainedTokenizer, 4 | PreTrainedTokenizerFast) 5 | 6 | from vllm.logger import init_logger 7 | 8 | logger = init_logger(__name__) 9 | 10 | # A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file. 11 | _FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer" 12 | 13 | 14 | def get_tokenizer( 15 | tokenizer_name: str, 16 | *args, 17 | tokenizer_mode: str = "auto", 18 | trust_remote_code: bool = False, 19 | **kwargs, 20 | ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: 21 | """Gets a tokenizer for the given model name via Huggingface.""" 22 | if tokenizer_mode == "slow": 23 | if kwargs.get("use_fast", False): 24 | raise ValueError( 25 | "Cannot use the fast tokenizer in slow tokenizer mode.") 26 | kwargs["use_fast"] = False 27 | 28 | if ("llama" in tokenizer_name.lower() and kwargs.get("use_fast", True) 29 | and tokenizer_name != _FAST_LLAMA_TOKENIZER): 30 | logger.info( 31 | "For some LLaMA-based models, initializing the fast tokenizer may " 32 | "take a long time. To eliminate the initialization time, consider " 33 | f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original " 34 | "tokenizer.") 35 | try: 36 | tokenizer = AutoTokenizer.from_pretrained( 37 | tokenizer_name, 38 | *args, 39 | trust_remote_code=trust_remote_code, 40 | **kwargs) 41 | except TypeError as e: 42 | # The LLaMA tokenizer causes a protobuf error in some environments. 43 | err_msg = ( 44 | "Failed to load the tokenizer. If you are using a LLaMA-based " 45 | f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original " 46 | "tokenizer.") 47 | raise RuntimeError(err_msg) from e 48 | except ValueError as e: 49 | # If the error pertains to the tokenizer class not existing or not 50 | # currently being imported, suggest using the --trust-remote-code flag. 51 | if (not trust_remote_code and 52 | ("does not exist or is not currently imported." in str(e) 53 | or "requires you to execute the tokenizer file" in str(e))): 54 | err_msg = ( 55 | "Failed to load the tokenizer. If the tokenizer is a custom " 56 | "tokenizer not yet available in the HuggingFace transformers " 57 | "library, consider setting `trust_remote_code=True` in LLM " 58 | "or using the `--trust-remote-code` flag in the CLI.") 59 | raise RuntimeError(err_msg) from e 60 | else: 61 | raise e 62 | 63 | if not isinstance(tokenizer, PreTrainedTokenizerFast): 64 | logger.warning( 65 | "Using a slow tokenizer. This might cause a significant " 66 | "slowdown. Consider using a fast tokenizer instead.") 67 | return tokenizer 68 | 69 | 70 | def _convert_tokens_to_string_with_added_encoders( 71 | tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], 72 | output_tokens: List[str], 73 | skip_special_tokens: bool, 74 | ) -> str: 75 | # Adapted from 76 | # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921 77 | # NOTE(woosuk): The following code is slow because it runs a for loop over 78 | # the output_tokens. In Python, running a for loop over a list can be slow 79 | # even when the loop body is very simple. 
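    # Tokens registered in `added_tokens_encoder` are handled separately because
    # the underlying tokenizer's convert_tokens_to_string() may not know about
    # them: the loop flushes the current group of ordinary tokens, emits the
    # added token verbatim, and the collected pieces are joined with single
    # spaces at the end.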
80 | sub_texts = [] 81 | current_sub_text = [] 82 | for token in output_tokens: 83 | if skip_special_tokens and token in tokenizer.all_special_tokens: 84 | continue 85 | if token in tokenizer.added_tokens_encoder: 86 | if current_sub_text: 87 | sub_text = tokenizer.convert_tokens_to_string(current_sub_text) 88 | sub_texts.append(sub_text) 89 | current_sub_text = [] 90 | sub_texts.append(token) 91 | else: 92 | current_sub_text.append(token) 93 | if current_sub_text: 94 | sub_text = tokenizer.convert_tokens_to_string(current_sub_text) 95 | sub_texts.append(sub_text) 96 | return " ".join(sub_texts) 97 | 98 | 99 | # Based on 100 | # https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15 101 | # under Apache 2.0 license 102 | def detokenize_incrementally( 103 | tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], 104 | all_input_ids: List[int], 105 | prev_tokens: Optional[List[str]], 106 | prefix_offset: int = 0, 107 | read_offset: int = 0, 108 | skip_special_tokens: bool = False, 109 | ) -> Tuple[List[str], str, int, int]: 110 | new_token_id = all_input_ids[-1] 111 | # This is the first iteration for this sequence 112 | if prev_tokens is None: 113 | new_tokens = tokenizer.convert_ids_to_tokens( 114 | all_input_ids, skip_special_tokens=skip_special_tokens) 115 | output_tokens = new_tokens 116 | # 5 is an arbitrary value that should work for all 117 | # tokenizers (bigger = more conservative). 118 | # Subtract 1 extra to account for the generated token. 119 | prefix_offset = max(len(output_tokens) - 6, 0) 120 | read_offset = max(len(output_tokens) - 1, 0) 121 | else: 122 | new_token = tokenizer.convert_ids_to_tokens( 123 | new_token_id, skip_special_tokens=skip_special_tokens) 124 | new_tokens = [new_token] 125 | output_tokens = prev_tokens + new_tokens 126 | 127 | # The prefix text is necessary only to defeat cleanup algorithms in 128 | # the decode which decide to add a space or not depending on the 129 | # surrounding ids. 130 | if not getattr(tokenizer, "added_tokens_encoder", {}): 131 | prefix_text = tokenizer.convert_tokens_to_string( 132 | output_tokens[prefix_offset:read_offset]) 133 | new_text = tokenizer.convert_tokens_to_string( 134 | output_tokens[prefix_offset:]) 135 | else: 136 | prefix_text = _convert_tokens_to_string_with_added_encoders( 137 | tokenizer, 138 | output_tokens[prefix_offset:read_offset], 139 | skip_special_tokens=skip_special_tokens) 140 | new_text = _convert_tokens_to_string_with_added_encoders( 141 | tokenizer, 142 | output_tokens[prefix_offset:], 143 | skip_special_tokens=skip_special_tokens) 144 | 145 | if len(new_text) > len(prefix_text) and not new_text.endswith("�"): 146 | # utf-8 char at the end means it's a potential unfinished byte sequence 147 | # from byte fallback tokenization. 148 | # If it's in the middle, it's probably a real invalid id generated 149 | # by the model 150 | new_text = new_text[len(prefix_text):] 151 | return new_tokens, new_text, read_offset, len(output_tokens) 152 | else: 153 | return new_tokens, "", prefix_offset, read_offset 154 | --------------------------------------------------------------------------------
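`detokenize_incrementally` is meant to be called once per generated token, with the caller carrying the token list and the two offsets between calls. The sketch below is not part of the repository; the tokenizer name and input text are placeholder examples, and it simply replays a fixed id sequence one token at a time to show the calling pattern:

```python
from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
                                               get_tokenizer)

# Placeholder setup: pretend these ids were produced one by one by a model.
tokenizer = get_tokenizer("gpt2")
token_ids = tokenizer("Hello, incremental world!").input_ids

prev_tokens = None
prefix_offset = 0
read_offset = 0
streamed_text = ""
for i in range(1, len(token_ids) + 1):
    new_tokens, new_text, prefix_offset, read_offset = detokenize_incrementally(
        tokenizer,
        all_input_ids=token_ids[:i],
        prev_tokens=prev_tokens,
        prefix_offset=prefix_offset,
        read_offset=read_offset,
    )
    # Keep the running token list in the form the next call expects.
    prev_tokens = new_tokens if prev_tokens is None else prev_tokens + new_tokens
    streamed_text += new_text

print(streamed_text)
```

`new_text` may be empty for a step whose last token leaves an unfinished UTF-8 byte sequence; that text is emitted on a later step once it decodes cleanly.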