├── vllm ├── core │ ├── __init__.py │ └── policy.py ├── engine │ ├── __init__.py │ └── ray_utils.py ├── worker │ ├── __init__.py │ └── cache_engine.py ├── entrypoints │ ├── __init__.py │ ├── openai │ │ ├── __init__.py │ │ └── protocol.py │ └── api_server.py ├── model_executor │ ├── layers │ │ ├── __init__.py │ │ ├── layernorm.py │ │ └── activation.py │ ├── parallel_utils │ │ ├── README.md │ │ ├── __init__.py │ │ └── tensor_parallel │ │ │ ├── __init__.py │ │ │ ├── utils.py │ │ │ └── random.py │ ├── __init__.py │ ├── utils.py │ ├── models │ │ └── __init__.py │ ├── input_metadata.py │ └── model_loader.py ├── transformers_utils │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── baichuan.py │ │ ├── aquila.py │ │ ├── mpt.py │ │ ├── qwen.py │ │ └── falcon.py │ ├── config.py │ └── tokenizer.py ├── __init__.py ├── utils.py ├── logger.py ├── block.py └── outputs.py ├── MANIFEST.in ├── docs ├── requirements-docs.txt ├── source │ ├── assets │ │ ├── figures │ │ │ ├── perf_a100_n1_dark.png │ │ │ ├── perf_a100_n1_light.png │ │ │ ├── perf_a100_n3_dark.png │ │ │ ├── perf_a100_n3_light.png │ │ │ ├── perf_a10g_n1_dark.png │ │ │ ├── perf_a10g_n1_light.png │ │ │ ├── perf_a10g_n3_dark.png │ │ │ └── perf_a10g_n3_light.png │ │ └── logos │ │ │ ├── vllm-logo-only-light.png │ │ │ ├── vllm-logo-text-dark.png │ │ │ └── vllm-logo-text-light.png │ ├── getting_started │ │ ├── installation.rst │ │ └── quickstart.rst │ ├── serving │ │ ├── distributed_serving.rst │ │ └── run_on_sky.rst │ ├── index.rst │ ├── conf.py │ └── models │ │ ├── supported_models.rst │ │ └── adding_model.rst ├── README.md ├── Makefile └── make.bat ├── csrc ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_utils.cuh │ └── dtype_float32.cuh ├── layernorm.cpp ├── pos_encoding.cpp ├── dispatch_utils.h ├── activation.cpp ├── attention.cpp ├── cache.cpp ├── reduction_utils.cuh ├── layernorm_kernels.cu ├── activation_kernels.cu └── pos_encoding_kernels.cu ├── pyproject.toml ├── requirements-dev.txt ├── mypy.ini ├── benchmarks ├── README.md ├── launch_tgi_server.sh └── benchmark_latency.py ├── requirements.txt ├── .github └── workflows │ ├── scripts │ ├── build.sh │ ├── cuda-install.sh │ ├── create_release.js │ ├── pytorch-install.sh │ └── env.sh │ ├── pylint.yml │ ├── yapf.yml │ └── publish.yml ├── .readthedocs.yaml ├── examples ├── openai_completion_client.py ├── offline_inference.py ├── openai_chatcompletion_client.py ├── gradio_webserver.py ├── llm_engine_example.py └── api_client.py ├── tests ├── kernels │ ├── conftest.py │ ├── test_layernorm.py │ ├── test_activation.py │ ├── test_cache.py │ └── test_pos_encoding.py ├── models │ └── test_models.py ├── samplers │ └── test_beam_search.py ├── async_engine │ ├── api_server_async_engine.py │ ├── test_request_tracker.py │ └── test_api_server.py ├── engine │ └── test_detokenize.py └── conftest.py ├── CONTRIBUTING.md ├── format.sh ├── .gitignore └── README.md /vllm/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements.txt 3 | 4 | recursive-include csrc * 5 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n1_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a100_n1_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n1_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a100_n1_light.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n3_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a100_n3_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n3_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a100_n3_light.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n1_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a10g_n1_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n1_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a10g_n1_light.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n3_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a10g_n3_dark.png -------------------------------------------------------------------------------- 
/docs/source/assets/figures/perf_a10g_n3_light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/figures/perf_a10g_n3_light.png
--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-only-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/logos/vllm-logo-only-light.png
--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-text-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/logos/vllm-logo-text-dark.png
--------------------------------------------------------------------------------
/docs/source/assets/logos/vllm-logo-text-light.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LLukas22/vllm/main/docs/source/assets/logos/vllm-logo-text-light.png
--------------------------------------------------------------------------------
/csrc/attention/attention_dtypes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 | 
3 | #include "attention_generic.cuh"
4 | #include "dtype_float16.cuh"
5 | #include "dtype_float32.cuh"
6 | #include "dtype_bfloat16.cuh"
7 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = [
3 |     "ninja",
4 |     "packaging",
5 |     "setuptools",
6 |     "torch >= 2.0.0",
7 |     "wheel",
8 | ]
9 | build-backend = "setuptools.build_meta"
10 | 
--------------------------------------------------------------------------------
/vllm/model_executor/parallel_utils/README.md:
--------------------------------------------------------------------------------
1 | The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the code that is used in inference.
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | # formatting
2 | yapf==0.32.0
3 | pylint==2.8.2
4 | 
5 | # type checking
6 | mypy==0.991
7 | types-PyYAML
8 | types-requests
9 | types-setuptools
10 | 
11 | # testing
12 | pytest
13 | pytest-forked
14 | 
--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
1 | [mypy]
2 | python_version = 3.8
3 | 
4 | ignore_missing_imports = True
5 | 
6 | files = vllm
7 | # TODO(woosuk): Include the code from Megatron and HuggingFace.
8 | exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/ 9 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/__init__.py: -------------------------------------------------------------------------------- 1 | import vllm.model_executor.parallel_utils.parallel_state 2 | import vllm.model_executor.parallel_utils.tensor_parallel 3 | 4 | __all__ = [ 5 | "parallel_state", 6 | "tensor_parallel", 7 | ] 8 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ninja # For faster builds. 2 | psutil 3 | ray >= 2.5.1 4 | sentencepiece # Required for LLaMA tokenizer. 5 | numpy 6 | torch >= 2.0.0 7 | transformers >= 4.33.1 # Required for Code Llama. 8 | xformers >= 0.0.21 9 | fastapi 10 | uvicorn 11 | pydantic < 2 # Required for OpenAI server. 12 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.input_metadata import InputMetadata 2 | from vllm.model_executor.model_loader import get_model 3 | from vllm.model_executor.utils import set_random_seed 4 | 5 | __all__ = [ 6 | "InputMetadata", 7 | "get_model", 8 | "set_random_seed", 9 | ] 10 | -------------------------------------------------------------------------------- /csrc/layernorm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void rms_norm( 4 | torch::Tensor& out, 5 | torch::Tensor& input, 6 | torch::Tensor& weight, 7 | float epsilon); 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def( 11 | "rms_norm", 12 | &rms_norm, 13 | "Apply Root Mean Square (RMS) Normalization to the input tensor."); 14 | } 15 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 
20 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_home=/usr/local/cuda-$2 5 | 6 | # Update paths 7 | PATH=${cuda_home}/bin:$PATH 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 9 | 10 | # Install requirements 11 | $python_executable -m pip install wheel packaging 12 | $python_executable -m pip install -r requirements.txt 13 | 14 | # Build 15 | $python_executable setup.py bdist_wheel --dist-dir=dist 16 | -------------------------------------------------------------------------------- /csrc/pos_encoding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void rotary_embedding( 4 | torch::Tensor& positions, 5 | torch::Tensor& query, 6 | torch::Tensor& key, 7 | int head_size, 8 | torch::Tensor& cos_sin_cache, 9 | bool is_neox); 10 | 11 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 12 | m.def( 13 | "rotary_embedding", 14 | &rotary_embedding, 15 | "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); 16 | } 17 | -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:0.8 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: 16 | - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /csrc/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #include 6 | 7 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 8 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 9 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 10 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 11 | 12 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ 13 | AT_DISPATCH_SWITCH( \ 14 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 15 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | import random 3 | 4 | import numpy as np 5 | import torch 6 | 7 | from vllm.model_executor.parallel_utils.parallel_state import model_parallel_is_initialized 8 | from vllm.model_executor.parallel_utils.tensor_parallel import model_parallel_cuda_manual_seed 9 | 10 | 11 | def set_random_seed(seed: int) -> None: 12 | random.seed(seed) 13 | np.random.seed(seed) 14 | torch.manual_seed(seed) 15 | if torch.cuda.is_available(): 16 | torch.cuda.manual_seed_all(seed) 17 | 18 | if model_parallel_is_initialized(): 19 | model_parallel_cuda_manual_seed(seed) 20 | -------------------------------------------------------------------------------- /csrc/activation.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void silu_and_mul( 4 | torch::Tensor& out, 5 | torch::Tensor& input); 6 | 7 | void gelu_new( 8 | torch::Tensor& out, 9 | torch::Tensor& input); 10 | 11 | void gelu_fast( 12 | torch::Tensor& out, 13 | torch::Tensor& input); 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def( 17 | "silu_and_mul", 18 | &silu_and_mul, 19 | "Activation function used in SwiGLU."); 20 | m.def( 21 | "gelu_new", 22 | &gelu_new, 23 | "GELU implementation used in GPT-2."); 24 | m.def( 25 | "gelu_fast", 26 | &gelu_fast, 27 | "Approximate GELU implementation."); 28 | } 29 | -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo $1 | tr "." "-") 5 | # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo $2 | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 
4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Completion API 14 | stream = False 15 | completion = openai.Completion.create( 16 | model=model, 17 | prompt="A robot may not injure a human being", 18 | echo=False, 19 | n=2, 20 | stream=stream, 21 | logprobs=3) 22 | 23 | print("Completion results:") 24 | if stream: 25 | for c in completion: 26 | print(c) 27 | else: 28 | print(completion) 29 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.configs.mpt import MPTConfig 2 | from vllm.transformers_utils.configs.baichuan import BaiChuanConfig 3 | from vllm.transformers_utils.configs.aquila import AquilaConfig 4 | from vllm.transformers_utils.configs.qwen import QWenConfig 5 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and 6 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the 7 | # `FalconConfig` class from the official HuggingFace transformers library. 8 | from vllm.transformers_utils.configs.falcon import RWConfig 9 | 10 | __all__ = [ 11 | "MPTConfig", 12 | "BaiChuanConfig", 13 | "AquilaConfig", 14 | "QWenConfig", 15 | "RWConfig", 16 | ] 17 | -------------------------------------------------------------------------------- /csrc/attention.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void single_query_cached_kv_attention( 5 | torch::Tensor& out, 6 | torch::Tensor& query, 7 | torch::Tensor& key_cache, 8 | torch::Tensor& value_cache, 9 | torch::Tensor& head_mapping, 10 | float scale, 11 | torch::Tensor& block_tables, 12 | torch::Tensor& context_lens, 13 | int block_size, 14 | int max_context_len, 15 | const c10::optional& alibi_slopes); 16 | 17 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 18 | m.def( 19 | "single_query_cached_kv_attention", 20 | &single_query_cached_kv_attention, 21 | "Compute the attention between an input query and the cached key/value tensors"); 22 | } 23 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 
2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: false, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_version=$2 5 | 6 | # Install torch 7 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 8 | $python_executable -m pip install torch -f https://download.pytorch.org/whl/cu${cuda_version//./}/torch_stable.html 9 | 10 | # Print version information 11 | $python_executable --version 12 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 13 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 14 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 15 | -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.engine.ray_utils import initialize_cluster 7 | from vllm.entrypoints.llm import LLM 8 | from vllm.outputs import CompletionOutput, RequestOutput 9 | from vllm.sampling_params import SamplingParams 10 | 11 | __version__ = "0.1.7" 12 | 13 | __all__ = [ 14 | "LLM", 15 | "SamplingParams", 16 | "RequestOutput", 17 | "CompletionOutput", 18 | "LLMEngine", 19 | "EngineArgs", 20 | "AsyncLLMEngine", 21 | "AsyncEngineArgs", 22 | "initialize_cluster", 23 | ] 24 | -------------------------------------------------------------------------------- /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: pylint 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | pylint: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.10"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install pylint==2.8.2 29 | - name: Analysing the code with pylint 30 | run: | 31 | pylint vllm 32 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from 
vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | jobs: 13 | yapf: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10"] 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install yapf==0.32.0 28 | pip install toml==0.10.2 29 | - name: Running yapf 30 | run: | 31 | yapf --diff --recursive vllm --exclude 'vllm/model_executor/parallel_utils/**' 32 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/layernorm.py: -------------------------------------------------------------------------------- 1 | """Custom normalization layers.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import layernorm_ops 6 | 7 | 8 | class RMSNorm(nn.Module): 9 | """Root mean square normalization. 10 | 11 | Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. 12 | Refer to https://arxiv.org/abs/1910.07467 13 | """ 14 | 15 | def __init__( 16 | self, 17 | hidden_size: int, 18 | eps: float = 1e-6, 19 | ) -> None: 20 | super().__init__() 21 | self.weight = nn.Parameter(torch.ones(hidden_size)) 22 | self.variance_epsilon = eps 23 | 24 | def forward(self, x: torch.Tensor) -> torch.Tensor: 25 | out = torch.empty_like(x) 26 | layernorm_ops.rms_norm( 27 | out, 28 | x, 29 | self.weight.data, 30 | self.variance_epsilon, 31 | ) 32 | return out 33 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/openai_chatcompletion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Chat completion API 14 | chat_completion = openai.ChatCompletion.create( 15 | model=model, 16 | messages=[{ 17 | "role": "system", 18 | "content": "You are a helpful assistant." 19 | }, { 20 | "role": "user", 21 | "content": "Who won the world series in 2020?" 22 | }, { 23 | "role": 24 | "assistant", 25 | "content": 26 | "The Los Angeles Dodgers won the World Series in 2020." 27 | }, { 28 | "role": "user", 29 | "content": "Where was it played?" 30 | }]) 31 | 32 | print("Chat completion results:") 33 | print(chat_completion) 34 | -------------------------------------------------------------------------------- /vllm/utils.py: -------------------------------------------------------------------------------- 1 | import enum 2 | from platform import uname 3 | import uuid 4 | 5 | import psutil 6 | import torch 7 | 8 | 9 | class Device(enum.Enum): 10 | GPU = enum.auto() 11 | CPU = enum.auto() 12 | 13 | 14 | class Counter: 15 | 16 | def __init__(self, start: int = 0) -> None: 17 | self.counter = start 18 | 19 | def __next__(self) -> int: 20 | i = self.counter 21 | self.counter += 1 22 | return i 23 | 24 | def reset(self) -> None: 25 | self.counter = 0 26 | 27 | 28 | def get_gpu_memory(gpu: int = 0) -> int: 29 | """Returns the total memory of the GPU in bytes.""" 30 | return torch.cuda.get_device_properties(gpu).total_memory 31 | 32 | 33 | def get_cpu_memory() -> int: 34 | """Returns the total CPU memory of the node in bytes.""" 35 | return psutil.virtual_memory().total 36 | 37 | 38 | def random_uuid() -> str: 39 | return str(uuid.uuid4().hex) 40 | 41 | 42 | def in_wsl() -> bool: 43 | # Reference: https://github.com/microsoft/WSL/issues/4071 44 | return "microsoft" in " ".join(uname()).lower() 45 | -------------------------------------------------------------------------------- /vllm/core/policy.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from vllm.sequence import SequenceGroup 4 | 5 | 6 | class Policy: 7 | 8 | def get_priority( 9 | self, 10 | now: float, 11 | seq_group: SequenceGroup, 12 | ) -> float: 13 | raise NotImplementedError 14 | 15 | def sort_by_priority( 16 | self, 17 | now: float, 18 | seq_groups: List[SequenceGroup], 19 | ) -> List[SequenceGroup]: 20 | return sorted( 21 | seq_groups, 22 | key=lambda seq_group: self.get_priority(now, seq_group), 23 | reverse=True, 24 | ) 25 | 26 | 27 | class FCFS(Policy): 28 | 29 | def get_priority( 30 | self, 31 | now: float, 32 | seq_group: SequenceGroup, 33 | ) -> float: 34 | return now - seq_group.arrival_time 35 | 36 | 37 | class PolicyFactory: 38 | 39 | _POLICY_REGISTRY = { 40 | 'fcfs': FCFS, 41 | } 42 | 43 | @classmethod 44 
| def get_policy(cls, policy_name: str, **kwargs) -> Policy: 45 | return cls._POLICY_REGISTRY[policy_name](**kwargs) 46 | -------------------------------------------------------------------------------- /docs/source/getting_started/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============ 5 | 6 | vLLM is a Python library that also contains pre-compiled C++ and CUDA (11.8) binaries. 7 | 8 | Requirements 9 | ------------ 10 | 11 | * OS: Linux 12 | * Python: 3.8 -- 3.11 13 | * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, etc.) 14 | 15 | Install with pip 16 | ---------------- 17 | 18 | You can install vLLM using pip: 19 | 20 | .. code-block:: console 21 | 22 | $ # (Optional) Create a new conda environment. 23 | $ conda create -n myenv python=3.8 -y 24 | $ conda activate myenv 25 | 26 | $ # Install vLLM. 27 | $ pip install vllm 28 | 29 | 30 | .. _build_from_source: 31 | 32 | Build from source 33 | ----------------- 34 | 35 | You can also build and install vLLM from source: 36 | 37 | .. code-block:: console 38 | 39 | $ git clone https://github.com/vllm-project/vllm.git 40 | $ cd vllm 41 | $ pip install -e . # This may take 5-10 minutes. 42 | 43 | .. tip:: 44 | If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. 45 | 46 | .. code-block:: console 47 | 48 | $ # Pull the Docker image with CUDA 11.8. 49 | $ docker run --gpus all -it --rm --shm-size=8g nvcr.io/nvidia/pytorch:22.12-py3 50 | -------------------------------------------------------------------------------- /csrc/cache.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | void swap_blocks( 7 | torch::Tensor& src, 8 | torch::Tensor& dst, 9 | const std::map& block_mapping); 10 | 11 | void copy_blocks( 12 | std::vector& key_caches, 13 | std::vector& value_caches, 14 | const std::map>& block_mapping); 15 | 16 | void reshape_and_cache( 17 | torch::Tensor& key, 18 | torch::Tensor& value, 19 | torch::Tensor& key_cache, 20 | torch::Tensor& value_cache, 21 | torch::Tensor& slot_mapping); 22 | 23 | void gather_cached_kv( 24 | torch::Tensor& key, 25 | torch::Tensor& value, 26 | torch::Tensor& key_cache, 27 | torch::Tensor& value_cache, 28 | torch::Tensor& slot_mapping); 29 | 30 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 31 | m.def( 32 | "swap_blocks", 33 | &swap_blocks, 34 | "Swap in (out) the cache blocks from src to dst"); 35 | m.def( 36 | "copy_blocks", 37 | ©_blocks, 38 | "Copy the cache blocks from src to dst"); 39 | m.def( 40 | "reshape_and_cache", 41 | &reshape_and_cache, 42 | "Reshape the key and value tensors and cache them"); 43 | m.def( 44 | "gather_cached_kv", 45 | &gather_cached_kv, 46 | "Gather key and value from the cache into contiguous QKV tensors"); 47 | } 48 | -------------------------------------------------------------------------------- /vllm/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.models.aquila import AquilaForCausalLM 2 | from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM, 3 | BaichuanForCausalLM) 4 | from vllm.model_executor.models.bloom import BloomForCausalLM 5 | from vllm.model_executor.models.falcon import FalconForCausalLM 6 | from vllm.model_executor.models.gpt2 import GPT2LMHeadModel 7 | from vllm.model_executor.models.gpt_bigcode import 
GPTBigCodeForCausalLM 8 | from vllm.model_executor.models.gpt_j import GPTJForCausalLM 9 | from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM 10 | from vllm.model_executor.models.internlm import InternLMForCausalLM 11 | from vllm.model_executor.models.llama import LlamaForCausalLM 12 | from vllm.model_executor.models.mpt import MPTForCausalLM 13 | from vllm.model_executor.models.opt import OPTForCausalLM 14 | from vllm.model_executor.models.qwen import QWenLMHeadModel 15 | 16 | __all__ = [ 17 | "AquilaForCausalLM", 18 | "BaiChuanForCausalLM", 19 | "BaichuanForCausalLM", 20 | "BloomForCausalLM", 21 | "FalconForCausalLM", 22 | "GPT2LMHeadModel", 23 | "GPTBigCodeForCausalLM", 24 | "GPTJForCausalLM", 25 | "GPTNeoXForCausalLM", 26 | "InternLMForCausalLM", 27 | "LlamaForCausalLM", 28 | "MPTForCausalLM", 29 | "OPTForCausalLM", 30 | "QWenLMHeadModel", 31 | ] 32 | -------------------------------------------------------------------------------- /.github/workflows/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file installs common linux environment tools 4 | 5 | export LANG C.UTF-8 6 | 7 | # python_version=$1 8 | 9 | sudo apt-get update && \ 10 | sudo apt-get install -y --no-install-recommends \ 11 | software-properties-common \ 12 | 13 | sudo apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | apt-utils \ 16 | ca-certificates \ 17 | wget \ 18 | git \ 19 | vim \ 20 | libssl-dev \ 21 | curl \ 22 | unzip \ 23 | unrar \ 24 | cmake \ 25 | net-tools \ 26 | sudo \ 27 | autotools-dev \ 28 | rsync \ 29 | jq \ 30 | openssh-server \ 31 | tmux \ 32 | screen \ 33 | htop \ 34 | pdsh \ 35 | openssh-client \ 36 | lshw \ 37 | dmidecode \ 38 | util-linux \ 39 | automake \ 40 | autoconf \ 41 | libtool \ 42 | net-tools \ 43 | pciutils \ 44 | libpci-dev \ 45 | libaio-dev \ 46 | libcap2 \ 47 | libtinfo5 \ 48 | fakeroot \ 49 | devscripts \ 50 | debhelper \ 51 | nfs-common 52 | 53 | # Remove github bloat files to free up disk space 54 | sudo rm -rf "/usr/local/share/boost" 55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 56 | sudo rm -rf "/usr/share/dotnet" 57 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import pytest 4 | import torch 5 | 6 | 7 | def create_kv_caches( 8 | num_blocks: int, 9 | block_size: int, 10 | num_layers: int, 11 | num_heads: int, 12 | head_size: int, 13 | dtype: torch.dtype, 14 | seed: int, 15 | ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: 16 | torch.random.manual_seed(seed) 17 | torch.cuda.manual_seed(seed) 18 | 19 | scale = head_size**-0.5 20 | x = 16 // torch.tensor([], dtype=dtype).element_size() 21 | key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) 22 | key_caches = [] 23 | for _ in range(num_layers): 24 | key_cache = torch.empty(size=key_cache_shape, 25 | dtype=dtype, 26 | device='cuda') 27 | key_cache.uniform_(-scale, scale) 28 | key_caches.append(key_cache) 29 | 30 | value_cache_shape = (num_blocks, num_heads, head_size, block_size) 31 | value_caches = [] 32 | for _ in range(num_layers): 33 | value_cache = torch.empty(size=value_cache_shape, 34 | dtype=dtype, 35 | device='cuda') 36 | value_cache.uniform_(-scale, scale) 37 | value_caches.append(value_cache) 38 | return key_caches, value_caches 39 | 40 | 41 | @pytest.fixture() 42 | def kv_cache_factory(): 43 | return 
create_kv_caches 44 | -------------------------------------------------------------------------------- /tests/models/test_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/models/test_models.py --forked`. 4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "facebook/opt-125m", 9 | "gpt2", 10 | "bigcode/tiny_starcoder_py", 11 | "EleutherAI/gpt-j-6b", 12 | "EleutherAI/pythia-70m", 13 | "bigscience/bloom-560m", 14 | "mosaicml/mpt-7b", 15 | "tiiuae/falcon-7b", 16 | "meta-llama/Llama-2-7b-hf", 17 | ] 18 | 19 | 20 | @pytest.mark.parametrize("model", MODELS) 21 | @pytest.mark.parametrize("dtype", ["half"]) 22 | @pytest.mark.parametrize("max_tokens", [128]) 23 | def test_models( 24 | hf_runner, 25 | vllm_runner, 26 | example_prompts, 27 | model: str, 28 | dtype: str, 29 | max_tokens: int, 30 | ) -> None: 31 | hf_model = hf_runner(model, dtype=dtype) 32 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 33 | del hf_model 34 | 35 | vllm_model = vllm_runner(model, dtype=dtype) 36 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, hf_output_str = hf_outputs[i] 41 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 42 | assert hf_output_str == vllm_output_str, ( 43 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 44 | assert hf_output_ids == vllm_output_ids, ( 45 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 46 | -------------------------------------------------------------------------------- /vllm/transformers_utils/config.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from transformers import AutoConfig, PretrainedConfig 4 | 5 | from vllm.transformers_utils.configs import * # pylint: disable=wildcard-import 6 | 7 | _CONFIG_REGISTRY = { 8 | "mpt": MPTConfig, 9 | "baichuan": BaiChuanConfig, 10 | "aquila": AquilaConfig, 11 | "qwen": QWenConfig, 12 | "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) 13 | "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) 14 | } 15 | 16 | 17 | def get_config(model: str, 18 | trust_remote_code: bool, 19 | revision: Optional[str] = None) -> PretrainedConfig: 20 | try: 21 | config = AutoConfig.from_pretrained( 22 | model, trust_remote_code=trust_remote_code, revision=revision) 23 | except ValueError as e: 24 | if (not trust_remote_code and 25 | "requires you to execute the configuration file" in str(e)): 26 | err_msg = ( 27 | "Failed to load the model config. 
If the model is a custom " 28 | "model not yet available in the HuggingFace transformers " 29 | "library, consider setting `trust_remote_code=True` in LLM " 30 | "or using the `--trust-remote-code` flag in the CLI.") 31 | raise RuntimeError(err_msg) from e 32 | else: 33 | raise e 34 | if config.model_type in _CONFIG_REGISTRY: 35 | config_class = _CONFIG_REGISTRY[config.model_type] 36 | config = config_class.from_pretrained(model, revision=revision) 37 | return config 38 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/tensor_parallel/__init__.py: -------------------------------------------------------------------------------- 1 | from .layers import ( 2 | ColumnParallelLinear, 3 | RowParallelLinear, 4 | VocabParallelEmbedding, 5 | set_tensor_model_parallel_attributes, 6 | set_defaults_if_not_set_tensor_model_parallel_attributes, 7 | copy_tensor_model_parallel_attributes, 8 | param_is_not_tensor_parallel_duplicate, 9 | ) 10 | 11 | from .mappings import ( 12 | copy_to_tensor_model_parallel_region, 13 | gather_from_tensor_model_parallel_region, 14 | gather_from_sequence_parallel_region, 15 | reduce_from_tensor_model_parallel_region, 16 | scatter_to_tensor_model_parallel_region, 17 | scatter_to_sequence_parallel_region, 18 | ) 19 | 20 | from .random import ( 21 | get_cuda_rng_tracker, 22 | model_parallel_cuda_manual_seed, 23 | ) 24 | 25 | from .utils import ( 26 | split_tensor_along_last_dim, 27 | ) 28 | 29 | __all__ = [ 30 | #layers.py 31 | "ColumnParallelLinear", 32 | "RowParallelLinear", 33 | "VocabParallelEmbedding", 34 | "set_tensor_model_parallel_attributes", 35 | "set_defaults_if_not_set_tensor_model_parallel_attributes", 36 | "copy_tensor_model_parallel_attributes", 37 | "param_is_not_tensor_parallel_duplicate", 38 | # mappings.py 39 | "copy_to_tensor_model_parallel_region", 40 | "gather_from_tensor_model_parallel_region", 41 | "gather_from_sequence_parallel_region", 42 | "reduce_from_tensor_model_parallel_region", 43 | "scatter_to_tensor_model_parallel_region", 44 | "scatter_to_sequence_parallel_region", 45 | # random.py 46 | "get_cuda_rng_tracker", 47 | "model_parallel_cuda_manual_seed", 48 | # utils.py 49 | "split_tensor_along_last_dim", 50 | ] 51 | -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using beam search. 2 | 3 | Run `pytest tests/samplers/test_beam_search.py --forked`. 4 | """ 5 | import pytest 6 | 7 | # FIXME(zhuohan): The test can not pass if we: 8 | # 1. Increase max_tokens to 256. 9 | # 2. Increase beam_width to 8. 10 | # 3. Use the model "huggyllama/llama-7b". 
11 | MAX_TOKENS = [128] 12 | BEAM_WIDTHS = [4] 13 | MODELS = ["facebook/opt-125m"] 14 | 15 | 16 | @pytest.mark.parametrize("model", MODELS) 17 | @pytest.mark.parametrize("dtype", ["half"]) 18 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS) 19 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) 20 | def test_beam_search_single_input( 21 | hf_runner, 22 | vllm_runner, 23 | example_prompts, 24 | model: str, 25 | dtype: str, 26 | max_tokens: int, 27 | beam_width: int, 28 | ) -> None: 29 | hf_model = hf_runner(model, dtype=dtype) 30 | hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, 31 | max_tokens) 32 | del hf_model 33 | 34 | vllm_model = vllm_runner(model, dtype=dtype) 35 | vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, 36 | max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, _ = hf_outputs[i] 41 | vllm_output_ids, _ = vllm_outputs[i] 42 | assert len(hf_output_ids) == len(vllm_output_ids) 43 | for j in range(len(hf_output_ids)): 44 | assert hf_output_ids[j] == vllm_output_ids[j], ( 45 | f"Test{i} output{j}:\nHF: {hf_output_ids}\n" 46 | f"vLLM: {vllm_output_ids}") 47 | -------------------------------------------------------------------------------- /tests/async_engine/api_server_async_engine.py: -------------------------------------------------------------------------------- 1 | """vllm.entrypoints.api_server with some extra logging for testing.""" 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | import uvicorn 6 | from fastapi.responses import JSONResponse, Response 7 | 8 | import vllm.entrypoints.api_server 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | 12 | app = vllm.entrypoints.api_server.app 13 | 14 | 15 | class AsyncLLMEngineWithStats(AsyncLLMEngine): 16 | 17 | def __init__(self, *args, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self._num_aborts = 0 20 | 21 | async def abort(self, request_id: str) -> None: 22 | await super().abort(request_id) 23 | self._num_aborts += 1 24 | 25 | def testing_stats(self) -> Dict[str, Any]: 26 | return {"num_aborted_requests": self._num_aborts} 27 | 28 | 29 | @app.get("/stats") 30 | def stats() -> Response: 31 | """Get the statistics of the engine.""" 32 | return JSONResponse(engine.testing_stats()) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--host", type=str, default="localhost") 38 | parser.add_argument("--port", type=int, default=8000) 39 | parser = AsyncEngineArgs.add_cli_args(parser) 40 | args = parser.parse_args() 41 | 42 | engine_args = AsyncEngineArgs.from_cli_args(args) 43 | engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) 44 | vllm.entrypoints.api_server.engine = engine 45 | uvicorn.run( 46 | app, 47 | host=args.host, 48 | port=args.port, 49 | log_level="debug", 50 | timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) 51 | -------------------------------------------------------------------------------- /docs/source/serving/distributed_serving.rst: -------------------------------------------------------------------------------- 1 | .. _distributed_serving: 2 | 3 | Distributed Inference and Serving 4 | ================================= 5 | 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with `Ray `_. 
To run distributed inference, install Ray with:
7 | 
8 | .. code-block:: console
9 | 
10 |     $ pip install ray
11 | 
12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs:
13 | 
14 | .. code-block:: python
15 | 
16 |     from vllm import LLM
17 |     llm = LLM("facebook/opt-13b", tensor_parallel_size=4)
18 |     output = llm.generate("San Francisco is a")
19 | 
20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run the API server on 4 GPUs:
21 | 
22 | .. code-block:: console
23 | 
24 |     $ python -m vllm.entrypoints.api_server \
25 |     $     --model facebook/opt-13b \
26 |     $     --tensor-parallel-size 4
27 | 
28 | To scale vLLM beyond a single machine, start a `Ray runtime `_ via CLI before running vLLM:
29 | 
30 | .. code-block:: console
31 | 
32 |     $ # On head node
33 |     $ ray start --head
34 | 
35 |     $ # On worker nodes
36 |     $ ray start --address=
37 | 
38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node and setting :code:`tensor_parallel_size` to the total number of GPUs across all machines.
--------------------------------------------------------------------------------
/csrc/reduction_utils.cuh:
--------------------------------------------------------------------------------
1 | /*
2 |  * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh
3 |  * Copyright (c) 2023, The vLLM team.
4 |  * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
5 |  *
6 |  * Licensed under the Apache License, Version 2.0 (the "License");
7 |  * you may not use this file except in compliance with the License.
8 |  * You may obtain a copy of the License at
9 |  *
10 |  *     http://www.apache.org/licenses/LICENSE-2.0
11 |  *
12 |  * Unless required by applicable law or agreed to in writing, software
13 |  * distributed under the License is distributed on an "AS IS" BASIS,
14 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 |  * See the License for the specific language governing permissions and
16 |  * limitations under the License.
17 |  */
18 | #pragma once
19 | 
20 | namespace vllm {
21 | 
22 | template<typename T>
23 | __inline__ __device__ T warpReduceSum(T val) {
24 | #pragma unroll
25 |   for (int mask = 16; mask > 0; mask >>= 1)
26 |     val += __shfl_xor_sync(0xffffffff, val, mask, 32);
27 |   return val;
28 | }
29 | 
30 | /* Calculate the sum of all elements in a block */
31 | template<typename T>
32 | __inline__ __device__ T blockReduceSum(T val) {
33 |   static __shared__ T shared[32];
34 |   int lane = threadIdx.x & 0x1f;
35 |   int wid = threadIdx.x >> 5;
36 | 
37 |   val = warpReduceSum(val);
38 | 
39 |   if (lane == 0)
40 |     shared[wid] = val;
41 | 
42 |   __syncthreads();
43 | 
44 |   // Changed from blockDim.x >> 5 to blockDim.x / 32.f so the guard is correct
45 |   // when blockDim.x is not divisible by 32
46 |   val = (threadIdx.x < (blockDim.x / 32.f)) ?
shared[lane] : (T)(0.0f); 47 | val = warpReduceSum(val); 48 | return val; 49 | } 50 | 51 | } // namespace vllm 52 | -------------------------------------------------------------------------------- /vllm/logger.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py 3 | """Logging configuration for vLLM.""" 4 | import logging 5 | import sys 6 | 7 | _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" 8 | _DATE_FORMAT = "%m-%d %H:%M:%S" 9 | 10 | 11 | class NewLineFormatter(logging.Formatter): 12 | """Adds logging prefix to newlines to align multi-line messages.""" 13 | 14 | def __init__(self, fmt, datefmt=None): 15 | logging.Formatter.__init__(self, fmt, datefmt) 16 | 17 | def format(self, record): 18 | msg = logging.Formatter.format(self, record) 19 | if record.message != "": 20 | parts = msg.split(record.message) 21 | msg = msg.replace("\n", "\r\n" + parts[0]) 22 | return msg 23 | 24 | 25 | _root_logger = logging.getLogger("vllm") 26 | _default_handler = None 27 | 28 | 29 | def _setup_logger(): 30 | _root_logger.setLevel(logging.DEBUG) 31 | global _default_handler 32 | if _default_handler is None: 33 | _default_handler = logging.StreamHandler(sys.stdout) 34 | _default_handler.flush = sys.stdout.flush # type: ignore 35 | _default_handler.setLevel(logging.INFO) 36 | _root_logger.addHandler(_default_handler) 37 | fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) 38 | _default_handler.setFormatter(fmt) 39 | # Setting this will avoid the message 40 | # being propagated to the parent logger. 41 | _root_logger.propagate = False 42 | 43 | 44 | # The logger is initialized when the module is imported. 45 | # This is thread-safe as the module is only imported once, 46 | # guaranteed by the Python GIL. 47 | _setup_logger() 48 | 49 | 50 | def init_logger(name: str): 51 | return logging.getLogger(name) 52 | -------------------------------------------------------------------------------- /tests/engine/test_detokenize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import detokenize_incrementally 6 | 7 | TRUTH = [ 8 | "Hello here, this is a simple test", 9 | "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 
It is designed to be used in production environments, where inference and serving", 10 | "我很感谢你的热情" 11 | ] 12 | TOKENIZERS = [ 13 | "facebook/opt-125m", 14 | "gpt2", 15 | "bigcode/tiny_starcoder_py", 16 | "EleutherAI/gpt-j-6b", 17 | "EleutherAI/pythia-70m", 18 | "bigscience/bloom-560m", 19 | "mosaicml/mpt-7b", 20 | "tiiuae/falcon-7b", 21 | "meta-llama/Llama-2-7b-hf", 22 | "codellama/CodeLlama-7b-hf", 23 | ] 24 | 25 | 26 | def _run_incremental_decode(tokenizer, all_input_ids): 27 | decoded_text = "" 28 | offset = 0 29 | token_offset = 0 30 | prev_tokens = None 31 | for i in range(len(all_input_ids)): 32 | new_tokens, text, offset, token_offset = detokenize_incrementally( 33 | tokenizer, 34 | all_input_ids[:i + 1], 35 | prev_tokens, 36 | offset, 37 | token_offset, 38 | skip_special_tokens=False) 39 | decoded_text += text 40 | if prev_tokens is None: 41 | prev_tokens = new_tokens 42 | else: 43 | prev_tokens += new_tokens 44 | return decoded_text 45 | 46 | 47 | @pytest.mark.parametrize("truth", TRUTH) 48 | @pytest.mark.parametrize("tokenizer_id", TOKENIZERS) 49 | def test_decode_streaming(tokenizer_id, truth): 50 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) 51 | all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"] 52 | 53 | decoded_text = _run_incremental_decode(tokenizer, all_input_ids) 54 | 55 | assert decoded_text == truth 56 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/activation.py: -------------------------------------------------------------------------------- 1 | """Custom activation functions.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import activation_ops 6 | 7 | 8 | class SiluAndMul(nn.Module): 9 | """An activation function for SwiGLU. 10 | 11 | The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2. 
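For reference, the SwiGLU formula in the docstring above can be written directly in eager PyTorch. The sketch below is an illustrative aside (not part of the repository file), assuming a 2-D input of shape (num_tokens, 2 * d); the custom `activation_ops.silu_and_mul` kernel is expected to produce the same result.

```python
import torch
import torch.nn.functional as F

def silu_and_mul_reference(x: torch.Tensor) -> torch.Tensor:
    # Split the last dimension in half, apply SiLU to the first half,
    # and multiply element-wise by the second half.
    d = x.shape[-1] // 2
    return F.silu(x[..., :d]) * x[..., d:]

x = torch.randn(4, 8)            # 4 tokens, 2 * d = 8
out = silu_and_mul_reference(x)  # shape: (4, 4)
```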
12 | 13 | Shapes: 14 | x: (num_tokens, 2 * d) 15 | return: (num_tokens, d) 16 | """ 17 | 18 | def forward(self, x: torch.Tensor) -> torch.Tensor: 19 | num_tokens = x.shape[0] 20 | d = x.shape[1] // 2 21 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 22 | activation_ops.silu_and_mul(out, x) 23 | return out 24 | 25 | 26 | class NewGELU(nn.Module): 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | num_tokens = x.shape[0] 30 | d = x.shape[1] 31 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 32 | activation_ops.gelu_new(out, x) 33 | return out 34 | 35 | 36 | class FastGELU(nn.Module): 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | num_tokens = x.shape[0] 40 | d = x.shape[1] 41 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 42 | activation_ops.gelu_fast(out, x) 43 | return out 44 | 45 | 46 | _ACTIVATION_REGISTRY = { 47 | "gelu": nn.GELU(), 48 | "gelu_fast": FastGELU(), 49 | "gelu_new": NewGELU(), 50 | "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), 51 | "relu": nn.ReLU(), 52 | } 53 | 54 | 55 | def get_act_fn(act_fn: str) -> nn.Module: 56 | """Get an activation function by name.""" 57 | act_fn = act_fn.lower() 58 | if act_fn in _ACTIVATION_REGISTRY: 59 | return _ACTIVATION_REGISTRY[act_fn] 60 | raise ValueError(f"Activation function {act_fn!r} is not supported.") 61 | -------------------------------------------------------------------------------- /tests/async_engine/test_request_tracker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.async_llm_engine import RequestTracker 4 | from vllm.outputs import RequestOutput 5 | 6 | 7 | def test_request_tracker(): 8 | tracker = RequestTracker() 9 | stream_1 = tracker.add_request("1") 10 | new, finished = tracker.get_new_and_finished_requests() 11 | assert len(new) == 1 12 | assert new[0]["request_id"] == "1" 13 | assert not finished 14 | assert not stream_1.finished 15 | 16 | stream_2 = tracker.add_request("2") 17 | stream_3 = tracker.add_request("3") 18 | new, finished = tracker.get_new_and_finished_requests() 19 | assert len(new) == 2 20 | assert new[0]["request_id"] == "2" 21 | assert new[1]["request_id"] == "3" 22 | assert not finished 23 | assert not stream_2.finished 24 | assert not stream_3.finished 25 | 26 | # request_ids must be unique 27 | with pytest.raises(KeyError): 28 | tracker.add_request("1") 29 | 30 | tracker.abort_request("1") 31 | new, finished = tracker.get_new_and_finished_requests() 32 | assert len(finished) == 1 33 | assert "1" in finished 34 | assert not new 35 | assert stream_1.finished 36 | 37 | stream_4 = tracker.add_request("4") 38 | tracker.abort_request("4") 39 | new, finished = tracker.get_new_and_finished_requests() 40 | assert len(finished) == 1 41 | assert "4" in finished 42 | assert not new 43 | assert stream_4.finished 44 | 45 | stream_5 = tracker.add_request("5") 46 | tracker.process_request_output( 47 | RequestOutput("2", "output", [], [], finished=True)) 48 | new, finished = tracker.get_new_and_finished_requests() 49 | assert len(finished) == 1 50 | assert "2" in finished 51 | assert len(new) == 1 52 | assert new[0]["request_id"] == "5" 53 | assert stream_2.finished 54 | assert not stream_5.finished 55 | -------------------------------------------------------------------------------- /examples/gradio_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | 
import gradio as gr 5 | import requests 6 | 7 | 8 | def http_bot(prompt): 9 | headers = {"User-Agent": "vLLM Client"} 10 | pload = { 11 | "prompt": prompt, 12 | "stream": True, 13 | "max_tokens": 128, 14 | } 15 | response = requests.post(args.model_url, 16 | headers=headers, 17 | json=pload, 18 | stream=True) 19 | 20 | for chunk in response.iter_lines(chunk_size=8192, 21 | decode_unicode=False, 22 | delimiter=b"\0"): 23 | if chunk: 24 | data = json.loads(chunk.decode("utf-8")) 25 | output = data["text"][0] 26 | yield output 27 | 28 | 29 | def build_demo(): 30 | with gr.Blocks() as demo: 31 | gr.Markdown("# vLLM text completion demo\n") 32 | inputbox = gr.Textbox(label="Input", 33 | placeholder="Enter text and press ENTER") 34 | outputbox = gr.Textbox(label="Output", 35 | placeholder="Generated result from the model") 36 | inputbox.submit(http_bot, [inputbox], [outputbox]) 37 | return demo 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", type=str, default="localhost") 43 | parser.add_argument("--port", type=int, default=8001) 44 | parser.add_argument("--model-url", 45 | type=str, 46 | default="http://localhost:8000/generate") 47 | args = parser.parse_args() 48 | 49 | demo = build_demo() 50 | demo.queue(concurrency_count=100).launch(server_name=args.host, 51 | server_port=args.port, 52 | share=True) 53 | -------------------------------------------------------------------------------- /examples/llm_engine_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from vllm import EngineArgs, LLMEngine, SamplingParams 4 | 5 | 6 | def main(args: argparse.Namespace): 7 | # Parse the CLI argument and initialize the engine. 8 | engine_args = EngineArgs.from_cli_args(args) 9 | engine = LLMEngine.from_engine_args(engine_args) 10 | 11 | # Test the following prompts. 12 | test_prompts = [ 13 | ("A robot may not injure a human being", 14 | SamplingParams(temperature=0.0)), 15 | ("To be or not to be,", 16 | SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), 17 | ("What is the meaning of life?", 18 | SamplingParams(n=2, 19 | best_of=5, 20 | temperature=0.8, 21 | top_p=0.95, 22 | frequency_penalty=0.1)), 23 | ("It is only with the heart that one can see rightly", 24 | SamplingParams(n=3, best_of=3, use_beam_search=True, 25 | temperature=0.0)), 26 | ] 27 | 28 | # Run the engine by calling `engine.step()` manually. 29 | request_id = 0 30 | while True: 31 | # To test continuous batching, we add one request at each step. 
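        # (Each call to engine.step() runs one scheduling and decoding iteration
        #  over all in-flight sequences; requests added here join the running
        #  batch on the next step, which is what makes the batching "continuous".)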
32 | if test_prompts: 33 | prompt, sampling_params = test_prompts.pop(0) 34 | engine.add_request(str(request_id), prompt, sampling_params) 35 | request_id += 1 36 | 37 | request_outputs = engine.step() 38 | for request_output in request_outputs: 39 | if request_output.finished: 40 | print(request_output) 41 | 42 | if not (engine.has_unfinished_requests() or test_prompts): 43 | break 44 | 45 | 46 | if __name__ == '__main__': 47 | parser = argparse.ArgumentParser( 48 | description='Demo on using the LLMEngine class directly') 49 | parser = EngineArgs.add_cli_args(parser) 50 | args = parser.parse_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /csrc/attention/attention_generic.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include 21 | 22 | namespace vllm { 23 | 24 | // A vector type to store Q, K, V elements. 25 | template 26 | struct Vec {}; 27 | 28 | // A vector type to store FP32 accumulators. 29 | template 30 | struct FloatVec {}; 31 | 32 | // Template vector operations. 
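// (These primary templates are intentionally left unimplemented here; the dtype-specific
//  headers provide the specializations, e.g. dtype_float32.cuh maps Vec<float, 4> to float4
//  so that Q/K/V elements can be loaded and multiplied as wide vectors.)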
33 | template 34 | inline __device__ Acc mul(A a, B b); 35 | 36 | template 37 | inline __device__ float sum(T v); 38 | 39 | template 40 | inline __device__ float dot(T a, T b) { 41 | return sum(mul(a, b)); 42 | } 43 | 44 | template 45 | inline __device__ float dot(T a, T b) { 46 | return sum(mul(a, b)); 47 | } 48 | 49 | template 50 | inline __device__ void zero(T& dst) { 51 | constexpr int WORDS = sizeof(T) / 4; 52 | union { 53 | T raw; 54 | uint32_t words[WORDS]; 55 | } tmp; 56 | 57 | #pragma unroll 58 | for (int ii = 0; ii < WORDS; ++ii) { 59 | tmp.words[ii] = 0u; 60 | } 61 | dst = tmp.raw; 62 | } 63 | 64 | } // namespace vllm 65 | -------------------------------------------------------------------------------- /tests/kernels/test_layernorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import layernorm_ops 6 | 7 | DTYPES = [torch.half, torch.bfloat16, torch.float] 8 | HIDDEN_SIZES = [67, 768, 2048, 5120, 8192] # Arbitrary values for testing 9 | NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing 10 | SEEDS = [0] 11 | 12 | 13 | class RefRMSNorm(nn.Module): 14 | 15 | def __init__(self, hidden_size, eps=1e-6): 16 | super().__init__() 17 | weight = torch.empty(hidden_size) 18 | weight.normal_(mean=1.0, std=0.1) 19 | self.weight = nn.Parameter(weight) 20 | self.variance_epsilon = eps 21 | 22 | def forward(self, hidden_states): 23 | input_dtype = hidden_states.dtype 24 | hidden_states = hidden_states.to(torch.float32) 25 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 26 | hidden_states = hidden_states * torch.rsqrt(variance + 27 | self.variance_epsilon) 28 | return self.weight * hidden_states.to(input_dtype) 29 | 30 | 31 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 32 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 33 | @pytest.mark.parametrize("dtype", DTYPES) 34 | @pytest.mark.parametrize("seed", SEEDS) 35 | @torch.inference_mode() 36 | def test_rms_norm( 37 | num_tokens: int, 38 | hidden_size: int, 39 | dtype: torch.dtype, 40 | seed: int, 41 | ) -> None: 42 | torch.random.manual_seed(seed) 43 | torch.cuda.manual_seed(seed) 44 | 45 | scale = float(hidden_size**-0.5) 46 | x = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda") 47 | x.uniform_(-scale, scale) 48 | ref = RefRMSNorm(hidden_size).to(dtype).cuda() 49 | 50 | out = torch.empty_like(x) 51 | layernorm_ops.rms_norm( 52 | out, 53 | x, 54 | ref.weight.data, 55 | ref.variance_epsilon, 56 | ) 57 | ref_out = ref(x) 58 | assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-5) 59 | -------------------------------------------------------------------------------- /csrc/attention/attention_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include "attention_dtypes.h" 21 | 22 | #include 23 | #include 24 | 25 | namespace vllm { 26 | 27 | // Q*K^T operation. 28 | template 29 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { 30 | using A_vec = typename FloatVec::Type; 31 | // Compute the parallel products for Q*K^T (treat vector lanes separately). 32 | A_vec qk_vec = mul(q[0], k[0]); 33 | #pragma unroll 34 | for (int ii = 1; ii < N; ++ii) { 35 | qk_vec = fma(q[ii], k[ii], qk_vec); 36 | } 37 | 38 | // Finalize the reduction across lanes. 39 | float qk = sum(qk_vec); 40 | #pragma unroll 41 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { 42 | qk += __shfl_xor_sync(uint32_t(-1), qk, mask); 43 | } 44 | return qk; 45 | } 46 | 47 | template 48 | struct Qk_dot { 49 | template 50 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { 51 | return qk_dot_(q, k); 52 | } 53 | }; 54 | 55 | } // namespace vllm 56 | -------------------------------------------------------------------------------- /vllm/block.py: -------------------------------------------------------------------------------- 1 | """Token blocks.""" 2 | from typing import List 3 | 4 | from vllm.utils import Device 5 | 6 | _BLANK_TOKEN_ID = -1 7 | 8 | 9 | class LogicalTokenBlock: 10 | """A block that stores a contiguous chunk of tokens from left to right. 11 | 12 | Logical blocks are used to represent the states of the corresponding 13 | physical blocks in the KV cache. 
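    Token IDs are appended left to right until the block's fixed number of
    slots is used up; unfilled slots hold the blank token ID (-1).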
14 | """ 15 | 16 | def __init__( 17 | self, 18 | block_number: int, 19 | block_size: int, 20 | ) -> None: 21 | self.block_number = block_number 22 | self.block_size = block_size 23 | 24 | self.token_ids = [_BLANK_TOKEN_ID] * block_size 25 | self.num_tokens = 0 26 | 27 | def is_empty(self) -> bool: 28 | return self.num_tokens == 0 29 | 30 | def get_num_empty_slots(self) -> int: 31 | return self.block_size - self.num_tokens 32 | 33 | def is_full(self) -> bool: 34 | return self.num_tokens == self.block_size 35 | 36 | def append_tokens(self, token_ids: List[int]) -> None: 37 | assert len(token_ids) <= self.get_num_empty_slots() 38 | curr_idx = self.num_tokens 39 | self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids 40 | self.num_tokens += len(token_ids) 41 | 42 | def get_token_ids(self) -> List[int]: 43 | return self.token_ids[:self.num_tokens] 44 | 45 | def get_last_token_id(self) -> int: 46 | assert self.num_tokens > 0 47 | return self.token_ids[self.num_tokens - 1] 48 | 49 | 50 | class PhysicalTokenBlock: 51 | """Represents the state of a block in the KV cache.""" 52 | 53 | def __init__( 54 | self, 55 | device: Device, 56 | block_number: int, 57 | block_size: int, 58 | ) -> None: 59 | self.device = device 60 | self.block_number = block_number 61 | self.block_size = block_size 62 | 63 | self.ref_count = 0 64 | 65 | def __repr__(self) -> str: 66 | return (f'PhysicalTokenBlock(device={self.device}, ' 67 | f'block_number={self.block_number}, ' 68 | f'ref_count={self.ref_count})') 69 | -------------------------------------------------------------------------------- /csrc/layernorm_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | #include "reduction_utils.cuh" 6 | 7 | namespace vllm { 8 | 9 | // TODO(woosuk): Further optimize this kernel. 
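// The kernel below uses one thread block per token (row): each block accumulates the
// sum of squares over the hidden dimension, reduces it with blockReduceSum, and then
// rescales the row by rsqrt(mean_square + epsilon) times the learned weight.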
10 | template<typename scalar_t> 11 | __global__ void rms_norm_kernel( 12 | scalar_t* __restrict__ out, // [num_tokens, hidden_size] 13 | const scalar_t* __restrict__ input, // [num_tokens, hidden_size] 14 | const scalar_t* __restrict__ weight, // [hidden_size] 15 | const float epsilon, 16 | const int num_tokens, 17 | const int hidden_size) { 18 | __shared__ float s_variance; 19 | float variance = 0.0f; 20 | 21 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 22 | const float x = (float) input[blockIdx.x * hidden_size + idx]; 23 | variance += x * x; 24 | } 25 | variance = blockReduceSum<float>(variance); 26 | if (threadIdx.x == 0) { 27 | s_variance = rsqrtf(variance / hidden_size + epsilon); 28 | } 29 | __syncthreads(); 30 | 31 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 32 | float x = (float) input[blockIdx.x * hidden_size + idx]; 33 | out[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; 34 | } 35 | } 36 | 37 | } // namespace vllm 38 | 39 | void rms_norm( 40 | torch::Tensor& out, // [num_tokens, hidden_size] 41 | torch::Tensor& input, // [num_tokens, hidden_size] 42 | torch::Tensor& weight, // [hidden_size] 43 | float epsilon) { 44 | int num_tokens = input.size(0); 45 | int hidden_size = input.size(1); 46 | 47 | dim3 grid(num_tokens); 48 | dim3 block(std::min(hidden_size, 1024)); 49 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 50 | VLLM_DISPATCH_FLOATING_TYPES( 51 | input.scalar_type(), 52 | "rms_norm_kernel", 53 | [&] { 54 | vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>( 55 | out.data_ptr<scalar_t>(), 56 | input.data_ptr<scalar_t>(), 57 | weight.data_ptr<scalar_t>(), 58 | epsilon, 59 | num_tokens, 60 | hidden_size); 61 | }); 62 | } 63 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to vLLM! 2 | ================ 3 | 4 | .. figure:: ./assets/logos/vllm-logo-text-light.png 5 | :width: 60% 6 | :align: center 7 | :alt: vLLM 8 | :class: no-scaled-link 9 | 10 | .. raw:: html 11 | 12 |
13 | Easy, fast, and cheap LLM serving for everyone 14 | 15 | 16 | 17 | 18 | 19 | Star 20 | Watch 21 | Fork 22 |
23 | 24 | 25 | 26 | vLLM is a fast and easy-to-use library for LLM inference and serving. 27 | 28 | vLLM is fast with: 29 | 30 | * State-of-the-art serving throughput 31 | * Efficient management of attention key and value memory with **PagedAttention** 32 | * Continuous batching of incoming requests 33 | * Optimized CUDA kernels 34 | 35 | vLLM is flexible and easy to use with: 36 | 37 | * Seamless integration with popular HuggingFace models 38 | * High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more 39 | * Tensor parallelism support for distributed inference 40 | * Streaming outputs 41 | * OpenAI-compatible API server 42 | 43 | For more information, check out the following: 44 | 45 | * `vLLM announcing blog post `_ (intro to PagedAttention) 46 | * `vLLM paper `_ (SOSP 2023) 47 | * `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency `_ by Cade Daniel et al. 48 | 49 | 50 | 51 | Documentation 52 | ------------- 53 | 54 | .. toctree:: 55 | :maxdepth: 1 56 | :caption: Getting Started 57 | 58 | getting_started/installation 59 | getting_started/quickstart 60 | 61 | .. toctree:: 62 | :maxdepth: 1 63 | :caption: Serving 64 | 65 | serving/distributed_serving 66 | serving/run_on_sky 67 | 68 | .. toctree:: 69 | :maxdepth: 1 70 | :caption: Models 71 | 72 | models/supported_models 73 | models/adding_model 74 | -------------------------------------------------------------------------------- /docs/source/serving/run_on_sky.rst: -------------------------------------------------------------------------------- 1 | .. _on_cloud: 2 | 3 | Running on clouds with SkyPilot 4 | =============================== 5 | 6 | .. raw:: html 7 | 8 |

9 | vLLM 10 |
11 | 12 | vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot `__, an open-source framework for running LLMs on any cloud. 13 | 14 | To install SkyPilot and set up your cloud credentials, run: 15 | 16 | .. code-block:: console 17 | 18 | $ pip install skypilot 19 | $ sky check 20 | 21 | See the vLLM SkyPilot YAML for serving, `serving.yaml `__. 22 | 23 | .. code-block:: yaml 24 | 25 | resources: 26 | accelerators: A100 27 | 28 | envs: 29 | MODEL_NAME: decapoda-research/llama-13b-hf 30 | TOKENIZER: hf-internal-testing/llama-tokenizer 31 | 32 | setup: | 33 | conda create -n vllm python=3.9 -y 34 | conda activate vllm 35 | git clone https://github.com/vllm-project/vllm.git 36 | cd vllm 37 | pip install . 38 | pip install gradio 39 | 40 | run: | 41 | conda activate vllm 42 | echo 'Starting vllm api server...' 43 | python -u -m vllm.entrypoints.api_server \ 44 | --model $MODEL_NAME \ 45 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 46 | --tokenizer $TOKENIZER 2>&1 | tee api_server.log & 47 | echo 'Waiting for vllm api server to start...' 48 | while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done 49 | echo 'Starting gradio server...' 50 | python vllm/examples/gradio_webserver.py 51 | 52 | Start serving the LLaMA-13B model on an A100 GPU: 53 | 54 | .. code-block:: console 55 | 56 | $ sky launch serving.yaml 57 | 58 | Check the output of the command. There will be a shareable Gradio link (like the last line of the following). Open it in your browser to use the LLaMA model for text completion. 59 | 60 | .. code-block:: console 61 | 62 | (task, pid=7431) Running on public URL: https://.gradio.live 63 | 64 | **Optional**: Serve the 65B model instead of the default 13B and use more GPUs: 65 | 66 | .. code-block:: console 67 | 68 | sky launch -c vllm-serve-new -s serving.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf 69 | 70 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/baichuan.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License.
20 | 21 | from transformers.configuration_utils import PretrainedConfig 22 | 23 | 24 | class BaiChuanConfig(PretrainedConfig): 25 | model_type = "baichuan" 26 | keys_to_ignore_at_inference = ["past_key_values"] 27 | 28 | def __init__( 29 | self, 30 | vocab_size=64000, 31 | hidden_size=4096, 32 | intermediate_size=11008, 33 | num_hidden_layers=32, 34 | num_attention_heads=32, 35 | hidden_act="silu", 36 | max_position_embeddings=4096, 37 | initializer_range=0.02, 38 | rms_norm_eps=1e-6, 39 | use_cache=True, 40 | pad_token_id=0, 41 | bos_token_id=1, 42 | eos_token_id=2, 43 | tie_word_embeddings=False, 44 | **kwargs, 45 | ): 46 | self.vocab_size = vocab_size 47 | self.max_position_embeddings = max_position_embeddings 48 | self.hidden_size = hidden_size 49 | self.intermediate_size = intermediate_size 50 | self.num_hidden_layers = num_hidden_layers 51 | self.num_attention_heads = num_attention_heads 52 | self.hidden_act = hidden_act 53 | self.initializer_range = initializer_range 54 | self.rms_norm_eps = rms_norm_eps 55 | self.use_cache = use_cache 56 | super().__init__( 57 | pad_token_id=pad_token_id, 58 | bos_token_id=bos_token_id, 59 | eos_token_id=eos_token_id, 60 | tie_word_embeddings=tie_word_embeddings, 61 | **kwargs, 62 | ) 63 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/aquila.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
20 | """ Aquila model configuration""" 21 | 22 | from transformers import PretrainedConfig 23 | 24 | 25 | class AquilaConfig(PretrainedConfig): 26 | model_type = "aquila" 27 | keys_to_ignore_at_inference = ["past_key_values"] 28 | 29 | def __init__( 30 | self, 31 | vocab_size=100008, 32 | hidden_size=4096, 33 | intermediate_size=11008, 34 | num_hidden_layers=32, 35 | num_attention_heads=32, 36 | hidden_act="silu", 37 | max_position_embeddings=2048, 38 | initializer_range=0.006, 39 | rms_norm_eps=1e-5, 40 | use_cache=True, 41 | pad_token_id=0, 42 | bos_token_id=1, 43 | eos_token_id=2, 44 | tie_word_embeddings=False, 45 | **kwargs, 46 | ): 47 | self.vocab_size = vocab_size 48 | self.max_position_embeddings = max_position_embeddings 49 | self.hidden_size = hidden_size 50 | self.intermediate_size = intermediate_size 51 | self.num_hidden_layers = num_hidden_layers 52 | self.num_attention_heads = num_attention_heads 53 | self.hidden_act = hidden_act 54 | self.initializer_range = initializer_range 55 | self.rms_norm_eps = rms_norm_eps 56 | self.use_cache = use_cache 57 | super().__init__( 58 | pad_token_id=pad_token_id, 59 | bos_token_id=bos_token_id, 60 | eos_token_id=eos_token_id, 61 | tie_word_embeddings=tie_word_embeddings, 62 | **kwargs, 63 | ) 64 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'vLLM' 21 | copyright = '2023, vLLM Team' 22 | author = 'the vLLM Team' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.napoleon", 32 | "sphinx.ext.viewcode", 33 | "sphinx.ext.intersphinx", 34 | "sphinx_copybutton", 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns = [] 44 | 45 | # Exclude the prompt "$" when copying code 46 | copybutton_prompt_text = r"\$ " 47 | copybutton_prompt_is_regexp = True 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 
53 | # 54 | html_title = project 55 | html_theme = 'sphinx_book_theme' 56 | html_logo = 'assets/logos/vllm-logo-text-light.png' 57 | html_theme_options = { 58 | 'logo_only': True, 59 | 'path_to_docs': 'docs/source', 60 | 'repository_url': 'https://github.com/vllm-project/vllm', 61 | 'use_repository_button': True, 62 | } 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | html_static_path = ['_static'] 68 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/mpt.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py 3 | from typing import Any, Dict, Optional, Union 4 | 5 | from transformers import PretrainedConfig 6 | 7 | _ATTN_CONFIG_DEFAULTS = { 8 | "attn_type": "multihead_attention", 9 | "attn_pdrop": 0.0, 10 | "attn_impl": "triton", 11 | "qk_ln": False, 12 | "clip_qkv": None, 13 | "softmax_scale": None, 14 | "prefix_lm": False, 15 | "attn_uses_sequence_id": False, 16 | "alibi": False, 17 | "alibi_bias_max": 8, 18 | } 19 | 20 | 21 | class MPTConfig(PretrainedConfig): 22 | model_type = "mpt" 23 | attribute_map = { 24 | "hidden_size": "d_model", 25 | "num_attention_heads": "n_heads", 26 | "num_hidden_layers": "n_layers", 27 | } 28 | 29 | def __init__( 30 | self, 31 | d_model: int = 2048, 32 | n_heads: int = 16, 33 | n_layers: int = 24, 34 | expansion_ratio: int = 4, 35 | max_seq_len: int = 2048, 36 | vocab_size: int = 50368, 37 | resid_pdrop: float = 0.0, 38 | emb_pdrop: float = 0.0, 39 | learned_pos_emb: bool = True, 40 | attn_config: Optional[Dict[str, Any]] = None, 41 | init_device: str = "cpu", 42 | logit_scale: Optional[Union[float, str]] = None, 43 | no_bias: bool = False, 44 | verbose: int = 0, 45 | embedding_fraction: float = 1.0, 46 | norm_type: str = "low_precision_layernorm", 47 | use_cache: bool = False, 48 | **kwargs, 49 | ) -> None: 50 | self.d_model = d_model 51 | self.n_heads = n_heads 52 | self.n_layers = n_layers 53 | self.expansion_ratio = expansion_ratio 54 | self.max_seq_len = max_seq_len 55 | self.vocab_size = vocab_size 56 | self.resid_pdrop = resid_pdrop 57 | self.emb_pdrop = emb_pdrop 58 | self.learned_pos_emb = learned_pos_emb 59 | if attn_config is None: 60 | self.attn_config = _ATTN_CONFIG_DEFAULTS 61 | else: 62 | self.attn_config = attn_config 63 | self.init_device = init_device 64 | self.logit_scale = logit_scale 65 | self.no_bias = no_bias 66 | self.verbose = verbose 67 | self.embedding_fraction = embedding_fraction 68 | self.norm_type = norm_type 69 | self.use_cache = use_cache 70 | if "name" in kwargs: 71 | del kwargs["name"] 72 | if "loss_fn" in kwargs: 73 | del kwargs["loss_fn"] 74 | super().__init__(**kwargs) 75 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Alibaba Cloud. 
2 | # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE 3 | 4 | from transformers import PretrainedConfig 5 | 6 | 7 | class QWenConfig(PretrainedConfig): 8 | model_type = "qwen" 9 | keys_to_ignore_at_inference = ["past_key_values"] 10 | attribute_map = { 11 | "hidden_size": "n_embd", 12 | "num_attention_heads": "n_head", 13 | "max_position_embeddings": "n_positions", 14 | "num_hidden_layers": "n_layer", 15 | } 16 | 17 | def __init__( 18 | self, 19 | vocab_size=151851, 20 | n_embd=4096, 21 | n_layer=32, 22 | n_head=32, 23 | n_inner=None, 24 | embd_pdrop=0.0, 25 | attn_pdrop=0.0, 26 | layer_norm_epsilon=1e-5, 27 | initializer_range=0.02, 28 | scale_attn_weights=True, 29 | use_cache=True, 30 | eos_token_id=151643, 31 | apply_residual_connection_post_layernorm=False, 32 | bf16=True, 33 | kv_channels=128, 34 | rotary_pct=1.0, 35 | rotary_emb_base=10000, 36 | use_dynamic_ntk=False, 37 | use_logn_attn=False, 38 | use_flash_attn=True, 39 | ffn_hidden_size=22016, 40 | no_bias=True, 41 | tie_word_embeddings=False, 42 | **kwargs, 43 | ): 44 | self.eos_token_id = eos_token_id 45 | super().__init__(eos_token_id=eos_token_id, 46 | tie_word_embeddings=tie_word_embeddings, 47 | **kwargs) 48 | 49 | self.vocab_size = vocab_size 50 | self.n_embd = n_embd 51 | self.n_layer = n_layer 52 | self.n_head = n_head 53 | self.n_inner = n_inner 54 | self.embd_pdrop = embd_pdrop 55 | self.attn_pdrop = attn_pdrop 56 | self.layer_norm_epsilon = layer_norm_epsilon 57 | self.initializer_range = initializer_range 58 | self.scale_attn_weights = scale_attn_weights 59 | self.use_cache = use_cache 60 | self.apply_residual_connection_post_layernorm = ( 61 | apply_residual_connection_post_layernorm) 62 | self.bf16 = bf16 63 | self.kv_channels = kv_channels 64 | self.rotary_pct = rotary_pct 65 | self.rotary_emb_base = rotary_emb_base 66 | self.use_dynamic_ntk = use_dynamic_ntk 67 | self.use_logn_attn = use_logn_attn 68 | self.use_flash_attn = use_flash_attn 69 | self.ffn_hidden_size = ffn_hidden_size 70 | self.no_bias = no_bias 71 | self.tie_word_embeddings = tie_word_embeddings 72 | -------------------------------------------------------------------------------- /tests/kernels/test_activation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn.functional as F 4 | from transformers.activations import get_activation 5 | 6 | from vllm import activation_ops 7 | 8 | DTYPES = [torch.half, torch.bfloat16, torch.float] 9 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 10 | D = [512, 4096, 5120, 13824] # Arbitrary values for testing 11 | SEEDS = [0] 12 | 13 | 14 | def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor: 15 | x1, x2 = x.chunk(chunks=2, dim=1) 16 | return F.silu(x1) * x2 17 | 18 | 19 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 20 | @pytest.mark.parametrize("d", D) 21 | @pytest.mark.parametrize("dtype", DTYPES) 22 | @pytest.mark.parametrize("seed", SEEDS) 23 | @torch.inference_mode() 24 | def test_silu_and_mul( 25 | num_tokens: int, 26 | d: int, 27 | dtype: torch.dtype, 28 | seed: int, 29 | ) -> None: 30 | torch.random.manual_seed(seed) 31 | torch.cuda.manual_seed(seed) 32 | x = torch.randn(num_tokens, 2 * d, dtype=dtype, device='cuda') 33 | out = torch.empty(num_tokens, d, dtype=dtype, device='cuda') 34 | activation_ops.silu_and_mul(out, x) 35 | ref_out = ref_silu_and_mul(x) 36 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 37 | 38 | 39 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 40 | 
@pytest.mark.parametrize("d", D) 41 | @pytest.mark.parametrize("dtype", DTYPES) 42 | @pytest.mark.parametrize("seed", SEEDS) 43 | @torch.inference_mode() 44 | def test_gelu_new( 45 | num_tokens: int, 46 | d: int, 47 | dtype: torch.dtype, 48 | seed: int, 49 | ) -> None: 50 | torch.random.manual_seed(seed) 51 | torch.cuda.manual_seed(seed) 52 | x = torch.randn(num_tokens, d, dtype=dtype, device='cuda') 53 | out = torch.empty(num_tokens, d, dtype=dtype, device='cuda') 54 | activation_ops.gelu_new(out, x) 55 | ref_out = get_activation("gelu_new")(x) 56 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 57 | 58 | 59 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 60 | @pytest.mark.parametrize("d", D) 61 | @pytest.mark.parametrize("dtype", DTYPES) 62 | @pytest.mark.parametrize("seed", SEEDS) 63 | def test_gelu_fast( 64 | num_tokens: int, 65 | d: int, 66 | dtype: torch.dtype, 67 | seed: int, 68 | ) -> None: 69 | torch.random.manual_seed(seed) 70 | torch.cuda.manual_seed(seed) 71 | x = torch.randn(num_tokens, d, dtype=dtype, device='cuda') 72 | out = torch.empty(num_tokens, d, dtype=dtype, device='cuda') 73 | activation_ops.gelu_fast(out, x) 74 | ref_out = get_activation("gelu_fast")(x) 75 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 76 | -------------------------------------------------------------------------------- /examples/api_client.py: -------------------------------------------------------------------------------- 1 | """Example Python client for vllm.entrypoints.api_server""" 2 | 3 | import argparse 4 | import json 5 | from typing import Iterable, List 6 | 7 | import requests 8 | 9 | 10 | def clear_line(n: int = 1) -> None: 11 | LINE_UP = '\033[1A' 12 | LINE_CLEAR = '\x1b[2K' 13 | for _ in range(n): 14 | print(LINE_UP, end=LINE_CLEAR, flush=True) 15 | 16 | 17 | def post_http_request(prompt: str, 18 | api_url: str, 19 | n: int = 1, 20 | stream: bool = False) -> requests.Response: 21 | headers = {"User-Agent": "Test Client"} 22 | pload = { 23 | "prompt": prompt, 24 | "n": n, 25 | "use_beam_search": True, 26 | "temperature": 0.0, 27 | "max_tokens": 16, 28 | "stream": stream, 29 | } 30 | response = requests.post(api_url, headers=headers, json=pload, stream=True) 31 | return response 32 | 33 | 34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: 35 | for chunk in response.iter_lines(chunk_size=8192, 36 | decode_unicode=False, 37 | delimiter=b"\0"): 38 | if chunk: 39 | data = json.loads(chunk.decode("utf-8")) 40 | output = data["text"] 41 | yield output 42 | 43 | 44 | def get_response(response: requests.Response) -> List[str]: 45 | data = json.loads(response.content) 46 | output = data["text"] 47 | return output 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--host", type=str, default="localhost") 53 | parser.add_argument("--port", type=int, default=8000) 54 | parser.add_argument("--n", type=int, default=4) 55 | parser.add_argument("--prompt", type=str, default="San Francisco is a") 56 | parser.add_argument("--stream", action="store_true") 57 | args = parser.parse_args() 58 | prompt = args.prompt 59 | api_url = f"http://{args.host}:{args.port}/generate" 60 | n = args.n 61 | stream = args.stream 62 | 63 | print(f"Prompt: {prompt!r}\n", flush=True) 64 | response = post_http_request(prompt, api_url, n, stream) 65 | 66 | if stream: 67 | num_printed_lines = 0 68 | for h in get_streaming_response(response): 69 | clear_line(num_printed_lines) 70 | num_printed_lines = 0 71 | for i, 
line in enumerate(h): 72 | num_printed_lines += 1 73 | print(f"Beam candidate {i}: {line!r}", flush=True) 74 | else: 75 | output = get_response(response) 76 | for i, line in enumerate(output): 77 | print(f"Beam candidate {i}: {line!r}", flush=True) 78 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/tensor_parallel/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The vLLM team. 2 | # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py 3 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 4 | 5 | import torch 6 | from typing import List, Sequence 7 | 8 | def ensure_divisibility(numerator, denominator): 9 | """Ensure that numerator is divisible by the denominator.""" 10 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 11 | numerator, denominator 12 | ) 13 | 14 | 15 | def divide(numerator, denominator): 16 | """Ensure that numerator is divisible by the denominator and return 17 | the division value.""" 18 | ensure_divisibility(numerator, denominator) 19 | return numerator // denominator 20 | 21 | 22 | def split_tensor_along_last_dim( 23 | tensor: torch.Tensor, 24 | num_partitions: int, 25 | contiguous_split_chunks: bool = False, 26 | ) -> List[torch.Tensor]: 27 | """ Split a tensor along its last dimension. 28 | 29 | Arguments: 30 | tensor: input tensor. 31 | num_partitions: number of partitions to split the tensor 32 | contiguous_split_chunks: If True, make each chunk contiguous 33 | in memory. 34 | 35 | Returns: 36 | A list of Tensors 37 | """ 38 | # Get the size and dimension. 39 | last_dim = tensor.dim() - 1 40 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 41 | # Split. 42 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 43 | # Note: torch.split does not create contiguous tensors by default. 44 | if contiguous_split_chunks: 45 | return tuple(chunk.contiguous() for chunk in tensor_list) 46 | 47 | return tensor_list 48 | 49 | 50 | class VocabUtility: 51 | """ Split the vocabulary into `world_size` chunks and return the first 52 | and last index of the vocabulary belonging to the `rank` 53 | partition: Note that indices in [fist, last) 54 | 55 | """ 56 | 57 | @staticmethod 58 | def vocab_range_from_per_partition_vocab_size( 59 | per_partition_vocab_size: int, rank, world_size: int 60 | ) -> Sequence[int]: 61 | index_f = rank * per_partition_vocab_size 62 | index_l = index_f + per_partition_vocab_size 63 | return index_f, index_l 64 | 65 | @staticmethod 66 | def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, world_size: int) -> Sequence[int]: 67 | per_partition_vocab_size = divide(global_vocab_size, world_size) 68 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 69 | per_partition_vocab_size, rank, world_size 70 | ) 71 | -------------------------------------------------------------------------------- /vllm/model_executor/input_metadata.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Tuple 2 | 3 | import torch 4 | from xformers.ops import AttentionBias 5 | 6 | from vllm.sampling_params import SamplingParams 7 | from vllm.sequence import SequenceData 8 | 9 | 10 | class InputMetadata: 11 | """Metadata for input sequences. Used for PagedAttention. 12 | 13 | Args: 14 | seq_groups: List of (seq_ids, sampling_params). 
15 | seq_data: Seq_id -> SequenceData. 16 | prompt_lens: Lengths of prompts. 17 | slot_mapping: The address to write the new KV to of each token. 18 | context_lens: the length of attention context for each generation token. 19 | max_context_len: The maximum context length. 20 | block_tables: The block tables. (Seq id -> list of physical block) 21 | """ 22 | 23 | def __init__( 24 | self, 25 | seq_groups: List[Tuple[List[int], SamplingParams]], 26 | seq_data: Dict[int, SequenceData], 27 | prompt_lens: List[int], 28 | slot_mapping: torch.Tensor, 29 | context_lens: torch.Tensor, 30 | max_context_len: int, 31 | block_tables: torch.Tensor, 32 | ) -> None: 33 | self.seq_groups = seq_groups 34 | self.seq_data = seq_data 35 | self.prompt_lens = prompt_lens 36 | self.slot_mapping = slot_mapping 37 | self.context_lens = context_lens 38 | self.max_context_len = max_context_len 39 | self.block_tables = block_tables 40 | 41 | self.num_prompts = len(prompt_lens) 42 | self.num_prompt_tokens = sum(prompt_lens) 43 | self.num_generation_tokens = context_lens.shape[0] 44 | self.num_valid_tokens = slot_mapping.shape[0] 45 | if block_tables.numel() > 0: 46 | self.max_num_blocks_per_seq = block_tables.shape[1] 47 | else: 48 | self.max_num_blocks_per_seq = 0 49 | assert block_tables.shape[0] == self.num_generation_tokens 50 | assert context_lens.shape[0] == self.num_generation_tokens 51 | 52 | # Set during the execution of the first attention op. 53 | self.attn_bias: List[AttentionBias] = [] 54 | 55 | def __repr__(self) -> str: 56 | # Print only useful metadata. 57 | return (f'InputMetadata(' 58 | f'num_valid_tokens={self.num_valid_tokens}, ' 59 | f'num_prompt_tokens={self.num_prompt_tokens}, ' 60 | f'num_prompts={self.num_prompts}, ' 61 | f'prompt_lens={self.prompt_lens}, ' 62 | f'num_generation_tokens={self.num_generation_tokens}, ' 63 | f'context_lens={self.context_lens}, ' 64 | f'max_context_len={self.max_context_len}), ' 65 | f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, ' 66 | f'block_tables={self.block_tables}), ' 67 | f'slot_mapping={self.slot_mapping}') 68 | -------------------------------------------------------------------------------- /tests/async_engine/test_api_server.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import time 4 | from multiprocessing import Pool 5 | from pathlib import Path 6 | 7 | import pytest 8 | import requests 9 | 10 | 11 | def _query_server(prompt: str) -> dict: 12 | response = requests.post("http://localhost:8000/generate", 13 | json={ 14 | "prompt": prompt, 15 | "max_tokens": 100, 16 | "temperature": 0, 17 | "ignore_eos": True 18 | }) 19 | response.raise_for_status() 20 | return response.json() 21 | 22 | 23 | @pytest.fixture 24 | def api_server(): 25 | script_path = Path(__file__).parent.joinpath( 26 | "api_server_async_engine.py").absolute() 27 | uvicorn_process = subprocess.Popen([ 28 | sys.executable, "-u", 29 | str(script_path), "--model", "facebook/opt-125m" 30 | ]) 31 | yield 32 | uvicorn_process.terminate() 33 | 34 | 35 | def test_api_server(api_server): 36 | """ 37 | Run the API server and test it. 38 | 39 | We run both the server and requests in separate processes. 40 | 41 | We test that the server can handle incoming requests, including 42 | multiple requests at the same time, and that it can handle requests 43 | being cancelled without crashing. 
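    The server is launched as a subprocess (see the api_server fixture) and
    queried over HTTP from a multiprocessing pool.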
44 | """ 45 | with Pool(32) as pool: 46 | # Wait until the server is ready 47 | prompts = ["Hello world"] * 1 48 | result = None 49 | while not result: 50 | try: 51 | for result in pool.map(_query_server, prompts): 52 | break 53 | except: 54 | time.sleep(1) 55 | 56 | # Actual tests start here 57 | # Try with 1 prompt 58 | for result in pool.map(_query_server, prompts): 59 | assert result 60 | 61 | num_aborted_requests = requests.get( 62 | "http://localhost:8000/stats").json()["num_aborted_requests"] 63 | assert num_aborted_requests == 0 64 | 65 | # Try with 100 prompts 66 | prompts = ["Hello world"] * 100 67 | for result in pool.map(_query_server, prompts): 68 | assert result 69 | 70 | # Cancel requests 71 | pool.map_async(_query_server, prompts) 72 | time.sleep(0.01) 73 | pool.terminate() 74 | pool.join() 75 | 76 | # check cancellation stats 77 | num_aborted_requests = requests.get( 78 | "http://localhost:8000/stats").json()["num_aborted_requests"] 79 | assert num_aborted_requests > 0 80 | 81 | # check that server still runs after cancellations 82 | with Pool(32) as pool: 83 | # Try with 100 prompts 84 | prompts = ["Hello world"] * 100 85 | for result in pool.map(_query_server, prompts): 86 | assert result 87 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | import contextlib 3 | from typing import Type 4 | 5 | import torch 6 | import torch.nn as nn 7 | from transformers import PretrainedConfig 8 | 9 | from vllm.config import ModelConfig 10 | from vllm.model_executor.models import * # pylint: disable=wildcard-import 11 | from vllm.model_executor.weight_utils import initialize_dummy_weights 12 | 13 | # TODO(woosuk): Lazy-load the model classes. 14 | _MODEL_REGISTRY = { 15 | "AquilaModel": AquilaForCausalLM, 16 | "BaiChuanForCausalLM": BaiChuanForCausalLM, # baichuan-7b 17 | "BaichuanForCausalLM": BaichuanForCausalLM, # baichuan-13b 18 | "BloomForCausalLM": BloomForCausalLM, 19 | "FalconForCausalLM": FalconForCausalLM, 20 | "GPT2LMHeadModel": GPT2LMHeadModel, 21 | "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM, 22 | "GPTJForCausalLM": GPTJForCausalLM, 23 | "GPTNeoXForCausalLM": GPTNeoXForCausalLM, 24 | "InternLMForCausalLM": InternLMForCausalLM, 25 | "LlamaForCausalLM": LlamaForCausalLM, 26 | "LLaMAForCausalLM": LlamaForCausalLM, # For decapoda-research/llama-* 27 | "MPTForCausalLM": MPTForCausalLM, 28 | "OPTForCausalLM": OPTForCausalLM, 29 | "QWenLMHeadModel": QWenLMHeadModel, 30 | "RWForCausalLM": FalconForCausalLM, 31 | } 32 | 33 | 34 | @contextlib.contextmanager 35 | def _set_default_torch_dtype(dtype: torch.dtype): 36 | """Sets the default torch dtype to the given dtype.""" 37 | old_dtype = torch.get_default_dtype() 38 | torch.set_default_dtype(dtype) 39 | yield 40 | torch.set_default_dtype(old_dtype) 41 | 42 | 43 | def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: 44 | architectures = getattr(config, "architectures", []) 45 | for arch in architectures: 46 | if arch in _MODEL_REGISTRY: 47 | return _MODEL_REGISTRY[arch] 48 | raise ValueError( 49 | f"Model architectures {architectures} are not supported for now. 
" 50 | f"Supported architectures: {list(_MODEL_REGISTRY.keys())}") 51 | 52 | 53 | def get_model(model_config: ModelConfig) -> nn.Module: 54 | model_class = _get_model_architecture(model_config.hf_config) 55 | with _set_default_torch_dtype(model_config.dtype): 56 | # Create a model instance. 57 | # The weights will be initialized as empty tensors. 58 | model = model_class(model_config.hf_config) 59 | if model_config.load_format == "dummy": 60 | model = model.cuda() 61 | # NOTE(woosuk): For accurate performance evaluation, we assign 62 | # random values to the weights. 63 | initialize_dummy_weights(model) 64 | else: 65 | # Load the weights from the cached or downloaded files. 66 | model.load_weights(model_config.model, model_config.download_dir, 67 | model_config.load_format, model_config.revision) 68 | model = model.cuda() 69 | return model.eval() 70 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | Thank you for your interest in contributing to vLLM! 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 5 | There are several ways you can contribute to the project: 6 | 7 | - Identify and report any issues or bugs. 8 | - Request or add a new model. 9 | - Suggest or implement new features. 10 | 11 | However, remember that contributions aren't just about code. 12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. 13 | 14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM. 15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects. 16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. 17 | 18 | 19 | ## Setup for development 20 | 21 | ### Build from source 22 | 23 | ```bash 24 | pip install -r requirements.txt 25 | pip install -e . # This may take several minutes. 26 | ``` 27 | 28 | ### Testing 29 | 30 | ```bash 31 | pip install -r requirements-dev.txt 32 | 33 | # Static type checking 34 | mypy 35 | # Unit tests 36 | pytest tests/ 37 | ``` 38 | **Note:** Currently, the repository does not pass the mypy tests. 39 | 40 | 41 | ## Contributing Guidelines 42 | 43 | ### Issue Reporting 44 | 45 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. 46 | If not, please file a new issue, providing as much relevant information as possible. 47 | 48 | ### Coding Style Guide 49 | 50 | In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). 51 | 52 | We include a formatting script [`format.sh`](./format.sh) to format the code. 53 | 54 | ### Pull Requests 55 | 56 | When submitting a pull request: 57 | 58 | 1. Make sure your code has been rebased on top of the latest commit on the main branch. 59 | 2. Ensure code is properly formatted by running [`format.sh`](./format.sh). 60 | 3. Include a detailed description of the changes in the pull request. 61 | Explain why you made the changes you did. 62 | If your pull request fixes an open issue, please include a reference to it in the description. 
63 | 64 | ### Code Reviews 65 | 66 | All submissions, including submissions by project members, require a code review. 67 | To make the review process as smooth as possible, please: 68 | 69 | 1. Keep your changes as concise as possible. 70 | If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests. 71 | 2. Respond to all comments within a reasonable time frame. 72 | If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. 73 | 74 | ### Thank You 75 | 76 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. 77 | Your contributions make vLLM a great tool for everyone! 78 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/falcon.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py 3 | # Copyright 2023 The vLLM team. 4 | # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. 5 | # All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | """Falcon configuration""" 19 | from transformers.configuration_utils import PretrainedConfig 20 | 21 | 22 | class RWConfig(PretrainedConfig): 23 | model_type = "falcon" 24 | keys_to_ignore_at_inference = ["past_key_values"] 25 | attribute_map = { 26 | "num_hidden_layers": "n_layer", 27 | "num_attention_heads": "n_head", 28 | "num_kv_heads": "n_head_kv", 29 | } 30 | 31 | def __init__( 32 | self, 33 | vocab_size=250880, 34 | hidden_size=64, 35 | n_layer=2, 36 | n_head=8, 37 | layer_norm_epsilon=1e-5, 38 | initializer_range=0.02, 39 | use_cache=True, 40 | bos_token_id=1, 41 | eos_token_id=2, 42 | hidden_dropout=0.0, 43 | attention_dropout=0.0, 44 | multi_query=True, 45 | n_head_kv=None, 46 | alibi=False, 47 | bias=False, 48 | parallel_attn=False, 49 | new_decoder_architecture=False, 50 | **kwargs, 51 | ) -> None: 52 | self.vocab_size = vocab_size 53 | # Backward compatibility with n_embed kwarg 54 | n_embed = kwargs.pop("n_embed", None) 55 | self.hidden_size = hidden_size if n_embed is None else n_embed 56 | self.n_layer = n_layer 57 | self.n_head = n_head 58 | self.layer_norm_epsilon = layer_norm_epsilon 59 | self.initializer_range = initializer_range 60 | self.use_cache = use_cache 61 | self.hidden_dropout = hidden_dropout 62 | self.attention_dropout = attention_dropout 63 | 64 | self.bos_token_id = bos_token_id 65 | self.eos_token_id = eos_token_id 66 | self.multi_query = multi_query 67 | self.n_head_kv = 1 if n_head_kv is None else n_head_kv 68 | self.alibi = alibi 69 | self.bias = bias 70 | self.parallel_attn = parallel_attn 71 | self.new_decoder_architecture = new_decoder_architecture 72 | 73 | if self.hidden_size == 8192: 74 | # Hack for falcon-40b 75 | self.new_decoder_architecture = True 76 | 77 | super().__init__(bos_token_id=bos_token_id, 78 | eos_token_id=eos_token_id, 79 | **kwargs) 80 | 81 | @property 82 | def head_dim(self): 83 | return self.hidden_size // self.n_head 84 | 85 | @property 86 | def rotary(self): 87 | return not self.alibi 88 | -------------------------------------------------------------------------------- /benchmarks/benchmark_latency.py: -------------------------------------------------------------------------------- 1 | """Benchmark the latency of processing a single batch of requests.""" 2 | import argparse 3 | import time 4 | 5 | import numpy as np 6 | import torch 7 | from tqdm import tqdm 8 | 9 | from vllm import LLM, SamplingParams 10 | 11 | 12 | def main(args: argparse.Namespace): 13 | print(args) 14 | 15 | # Process all the requests in a single batch if possible. 16 | # NOTE(woosuk): If the request cannot be processed in a single batch, 17 | # the engine will automatically process the request in multiple batches. 
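    # (Here max_num_batched_tokens is set to batch_size * input_len so that all
    #  of the dummy prompts can be prefilled together when GPU memory allows.)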
18 | llm = LLM( 19 | model=args.model, 20 | tokenizer=args.tokenizer, 21 | tensor_parallel_size=args.tensor_parallel_size, 22 | max_num_seqs=args.batch_size, 23 | max_num_batched_tokens=args.batch_size * args.input_len, 24 | trust_remote_code=args.trust_remote_code, 25 | ) 26 | 27 | sampling_params = SamplingParams( 28 | n=args.n, 29 | temperature=0.0 if args.use_beam_search else 1.0, 30 | top_p=1.0, 31 | use_beam_search=args.use_beam_search, 32 | ignore_eos=True, 33 | max_tokens=args.output_len, 34 | ) 35 | print(sampling_params) 36 | dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size 37 | 38 | def run_to_completion(profile: bool = False): 39 | if profile: 40 | torch.cuda.cudart().cudaProfilerStart() 41 | start_time = time.time() 42 | 43 | llm.generate(prompt_token_ids=dummy_prompt_token_ids, 44 | sampling_params=sampling_params, 45 | use_tqdm=False) 46 | 47 | end_time = time.time() 48 | latency = end_time - start_time 49 | if profile: 50 | torch.cuda.cudart().cudaProfilerStop() 51 | return latency 52 | 53 | print("Warming up...") 54 | run_to_completion(profile=False) 55 | 56 | # Benchmark. 57 | latencies = [] 58 | for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): 59 | latencies.append(run_to_completion(profile=False)) 60 | print(f'Avg latency: {np.mean(latencies)} seconds') 61 | 62 | 63 | if __name__ == '__main__': 64 | parser = argparse.ArgumentParser( 65 | description='Benchmark the latency of processing a single batch of ' 66 | 'requests till completion.') 67 | parser.add_argument('--model', type=str, default='facebook/opt-125m') 68 | parser.add_argument('--tokenizer', type=str, default=None) 69 | parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) 70 | parser.add_argument('--input-len', type=int, default=32) 71 | parser.add_argument('--output-len', type=int, default=128) 72 | parser.add_argument('--batch-size', type=int, default=8) 73 | parser.add_argument('--n', type=int, default=1, 74 | help='Number of generated sequences per prompt.') 75 | parser.add_argument('--use-beam-search', action='store_true') 76 | parser.add_argument('--num-iters', type=int, default=3, 77 | help='Number of iterations to run.') 78 | parser.add_argument('--trust-remote-code', action='store_true', 79 | help='trust remote code from huggingface') 80 | args = parser.parse_args() 81 | main(args) 82 | -------------------------------------------------------------------------------- /vllm/entrypoints/api_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from typing import AsyncGenerator 4 | 5 | from fastapi import BackgroundTasks, FastAPI, Request 6 | from fastapi.responses import JSONResponse, Response, StreamingResponse 7 | import uvicorn 8 | 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | from vllm.sampling_params import SamplingParams 12 | from vllm.utils import random_uuid 13 | 14 | TIMEOUT_KEEP_ALIVE = 5 # seconds. 15 | TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. 16 | app = FastAPI() 17 | engine = None 18 | 19 | 20 | @app.post("/generate") 21 | async def generate(request: Request) -> Response: 22 | """Generate completion for the request. 23 | 24 | The request should be a JSON object with the following fields: 25 | - prompt: the prompt to use for the generation. 26 | - stream: whether to stream the results or not. 27 | - other fields: the sampling parameters (See `SamplingParams` for details). 
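    For example, a minimal non-streaming request body might look like:
    {"prompt": "San Francisco is a", "max_tokens": 16, "temperature": 0.0}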
28 | """ 29 | request_dict = await request.json() 30 | prompt = request_dict.pop("prompt") 31 | stream = request_dict.pop("stream", False) 32 | sampling_params = SamplingParams(**request_dict) 33 | request_id = random_uuid() 34 | 35 | results_generator = engine.generate(prompt, sampling_params, request_id) 36 | 37 | # Streaming case 38 | async def stream_results() -> AsyncGenerator[bytes, None]: 39 | async for request_output in results_generator: 40 | prompt = request_output.prompt 41 | text_outputs = [ 42 | prompt + output.text for output in request_output.outputs 43 | ] 44 | ret = {"text": text_outputs} 45 | yield (json.dumps(ret) + "\0").encode("utf-8") 46 | 47 | async def abort_request() -> None: 48 | await engine.abort(request_id) 49 | 50 | if stream: 51 | background_tasks = BackgroundTasks() 52 | # Abort the request if the client disconnects. 53 | background_tasks.add_task(abort_request) 54 | return StreamingResponse(stream_results(), background=background_tasks) 55 | 56 | # Non-streaming case 57 | final_output = None 58 | async for request_output in results_generator: 59 | if await request.is_disconnected(): 60 | # Abort the request if the client disconnects. 61 | await engine.abort(request_id) 62 | return Response(status_code=499) 63 | final_output = request_output 64 | 65 | assert final_output is not None 66 | prompt = final_output.prompt 67 | text_outputs = [prompt + output.text for output in final_output.outputs] 68 | ret = {"text": text_outputs} 69 | return JSONResponse(ret) 70 | 71 | 72 | if __name__ == "__main__": 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument("--host", type=str, default="localhost") 75 | parser.add_argument("--port", type=int, default=8000) 76 | parser = AsyncEngineArgs.add_cli_args(parser) 77 | args = parser.parse_args() 78 | 79 | engine_args = AsyncEngineArgs.from_cli_args(args) 80 | engine = AsyncLLMEngine.from_engine_args(engine_args) 81 | 82 | uvicorn.run(app, 83 | host=args.host, 84 | port=args.port, 85 | log_level="debug", 86 | timeout_keep_alive=TIMEOUT_KEEP_ALIVE) 87 | -------------------------------------------------------------------------------- /docs/source/models/supported_models.rst: -------------------------------------------------------------------------------- 1 | .. _supported_models: 2 | 3 | Supported Models 4 | ================ 5 | 6 | vLLM supports a variety of generative Transformer models in `HuggingFace Transformers `_. 7 | The following is the list of model architectures that are currently supported by vLLM. 8 | Alongside each architecture, we include some popular models that use it. 9 | 10 | .. list-table:: 11 | :widths: 25 25 50 12 | :header-rows: 1 13 | 14 | * - Architecture 15 | - Models 16 | - Example HuggingFace Models 17 | * - :code:`AquilaForCausalLM` 18 | - Aquila 19 | - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. 20 | * - :code:`BaiChuanForCausalLM` 21 | - Baichuan 22 | - :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc. 23 | * - :code:`BloomForCausalLM` 24 | - BLOOM, BLOOMZ, BLOOMChat 25 | - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. 26 | * - :code:`FalconForCausalLM` 27 | - Falcon 28 | - :code:`tiiuae/falcon-7b``, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. 29 | * - :code:`GPT2LMHeadModel` 30 | - GPT-2 31 | - :code:`gpt2`, :code:`gpt2-xl`, etc. 
32 | * - :code:`GPTBigCodeForCausalLM` 33 | - StarCoder, SantaCoder, WizardCoder 34 | - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. 35 | * - :code:`GPTJForCausalLM` 36 | - GPT-J 37 | - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. 38 | * - :code:`GPTNeoXForCausalLM` 39 | - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM 40 | - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. 41 | * - :code:`InternLMForCausalLM` 42 | - InternLM 43 | - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. 44 | * - :code:`LlamaForCausalLM` 45 | - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco 46 | - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, etc. 47 | * - :code:`MPTForCausalLM` 48 | - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter 49 | - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. 50 | * - :code:`OPTForCausalLM` 51 | - OPT, OPT-IML 52 | - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. 53 | * - :code:`QWenLMHeadModel` 54 | - Qwen 55 | - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. 56 | 57 | If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. 58 | Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. 59 | Alternatively, you can raise an issue on our `GitHub `_ project. 60 | 61 | .. tip:: 62 | The easiest way to check if your model is supported is to run the program below: 63 | 64 | .. code-block:: python 65 | 66 | from vllm import LLM 67 | 68 | llm = LLM(model=...) # Name or path of your model 69 | output = llm.generate("Hello, my name is") 70 | print(output) 71 | 72 | If vLLM successfully generates text, it indicates that your model is supported. 73 | -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # YAPF formatter, adapted from ray and skypilot. 3 | # 4 | # Usage: 5 | # # Do work and commit your work. 6 | 7 | # # Format files that differ from origin/main. 8 | # bash format.sh 9 | 10 | # # Commit changed files with message 'Run yapf and pylint' 11 | # 12 | # 13 | # YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. 14 | # You are encouraged to run this locally before pushing changes for review. 15 | 16 | # Cause the script to exit if a single command fails 17 | set -eo pipefail 18 | 19 | # this stops git rev-parse from failing if we run this from the .git directory 20 | builtin cd "$(dirname "${BASH_SOURCE:-$0}")" 21 | ROOT="$(git rev-parse --show-toplevel)" 22 | builtin cd "$ROOT" || exit 1 23 | 24 | YAPF_VERSION=$(yapf --version | awk '{print $2}') 25 | PYLINT_VERSION=$(pylint --version | head -n 1 | awk '{print $2}') 26 | MYPY_VERSION=$(mypy --version | awk '{print $2}') 27 | 28 | # # params: tool name, tool version, required version 29 | tool_version_check() { 30 | if [[ $2 != $3 ]]; then 31 | echo "Wrong $1 version installed: $3 is required, not $2." 
32 | exit 1 33 | fi 34 | } 35 | 36 | tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" 37 | tool_version_check "pylint" $PYLINT_VERSION "$(grep "pylint==" requirements-dev.txt | cut -d'=' -f3)" 38 | tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" 39 | 40 | YAPF_FLAGS=( 41 | '--recursive' 42 | '--parallel' 43 | ) 44 | 45 | YAPF_EXCLUDES=( 46 | '--exclude' 'build/**' 47 | '--exclude' 'vllm/model_executor/parallel_utils/**' 48 | ) 49 | 50 | # Format specified files 51 | format() { 52 | yapf --in-place "${YAPF_FLAGS[@]}" "$@" 53 | } 54 | 55 | # Format files that differ from main branch. Ignores dirs that are not slated 56 | # for autoformat yet. 57 | format_changed() { 58 | # The `if` guard ensures that the list of filenames is not empty, which 59 | # could cause yapf to receive 0 positional arguments, making it hang 60 | # waiting for STDIN. 61 | # 62 | # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that 63 | # exist on both branches. 64 | MERGEBASE="$(git merge-base origin/main HEAD)" 65 | 66 | if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then 67 | git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ 68 | yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" 69 | fi 70 | 71 | } 72 | 73 | # Format all files 74 | format_all() { 75 | yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm 76 | } 77 | 78 | ## This flag formats individual files. --files *must* be the first command line 79 | ## arg to use this option. 80 | if [[ "$1" == '--files' ]]; then 81 | format "${@:2}" 82 | # If `--all` is passed, then any further arguments are ignored and the 83 | # entire python directory is formatted. 84 | elif [[ "$1" == '--all' ]]; then 85 | format_all 86 | else 87 | # Format only the files that changed in last commit. 88 | format_changed 89 | fi 90 | echo 'vLLM yapf: Done' 91 | 92 | # Run mypy 93 | # TODO(zhuohan): Enable mypy 94 | # echo 'vLLM mypy:' 95 | # mypy 96 | 97 | # Run Pylint 98 | echo 'vLLM Pylint:' 99 | pylint vllm 100 | 101 | if ! git diff --quiet &>/dev/null; then 102 | echo 'Reformatted files. Please review and stage the changes.' 
103 | echo 'Changes not staged for commit:' 104 | echo 105 | git --no-pager diff --name-only 106 | 107 | exit 1 108 | fi 109 | -------------------------------------------------------------------------------- /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package to Release asset 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Create Release 5 | 6 | on: 7 | push: 8 | tags: 9 | - v* 10 | 11 | # Needed to create release and upload assets 12 | permissions: 13 | contents: write 14 | 15 | jobs: 16 | release: 17 | # Retrieve tag and create release 18 | name: Create Release 19 | runs-on: ubuntu-latest 20 | outputs: 21 | upload_url: ${{ steps.create_release.outputs.upload_url }} 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | 26 | - name: Extract branch info 27 | shell: bash 28 | run: | 29 | echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV 30 | 31 | - name: Create Release 32 | id: create_release 33 | uses: "actions/github-script@v6" 34 | env: 35 | RELEASE_TAG: ${{ env.release_tag }} 36 | with: 37 | github-token: "${{ secrets.GITHUB_TOKEN }}" 38 | script: | 39 | const script = require('.github/workflows/scripts/create_release.js') 40 | await script(github, context, core) 41 | 42 | wheel: 43 | name: Build Wheel 44 | runs-on: ${{ matrix.os }} 45 | needs: release 46 | 47 | strategy: 48 | fail-fast: false 49 | matrix: 50 | os: ['ubuntu-20.04'] 51 | python-version: ['3.8', '3.9', '3.10', '3.11'] 52 | cuda-version: ['11.8'] # Github runner can't build anything older than 11.8 53 | 54 | steps: 55 | - name: Checkout 56 | uses: actions/checkout@v3 57 | 58 | - name: Set up Linux Env 59 | if: ${{ runner.os == 'Linux' }} 60 | run: | 61 | bash -x .github/workflows/scripts/env.sh 62 | 63 | - name: Set up Python 64 | uses: actions/setup-python@v4 65 | with: 66 | python-version: ${{ matrix.python-version }} 67 | 68 | - name: Install CUDA ${{ matrix.cuda-version }} 69 | run: | 70 | bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }} 71 | 72 | - name: Install PyTorch-cu${{ matrix.cuda-version }} 73 | run: | 74 | bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} 75 | 76 | - name: Build wheel 77 | shell: bash 78 | run: | 79 | bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} 80 | wheel_name=$(ls dist/*whl | xargs -n 1 basename) 81 | asset_name=${wheel_name//"linux"/"manylinux1"} 82 | echo "wheel_name=${wheel_name}" >> $GITHUB_ENV 83 | echo "asset_name=${asset_name}" >> $GITHUB_ENV 84 | 85 | - name: Upload Release Asset 86 | uses: actions/upload-release-asset@v1 87 | env: 88 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 89 | with: 90 | upload_url: ${{ needs.release.outputs.upload_url }} 91 | asset_path: ./dist/${{ env.wheel_name }} 92 | asset_name: ${{ env.asset_name }} 93 | asset_content_type: application/* 94 | 95 | # (Danielkinz): This last step will publish the .whl to pypi. 
Warning: untested 96 | # - name: Publish package 97 | # uses: pypa/gh-action-pypi-publish@release/v1.8 98 | # with: 99 | # repository-url: https://test.pypi.org/legacy/ 100 | # password: ${{ secrets.PYPI_API_TOKEN }} 101 | # skip-existing: true 102 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | .idea/ 161 | 162 | # VSCode 163 | .vscode/ 164 | 165 | # DS Store 166 | .DS_Store 167 | 168 | # Results 169 | *.csv 170 | 171 | # Python pickle files 172 | *.pkl 173 | 174 | # Sphinx documentation 175 | _build/ 176 | -------------------------------------------------------------------------------- /csrc/activation_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template 9 | __device__ __forceinline__ T silu(const T& x) { 10 | // x * sigmoid(x) 11 | return (T) (((float) x) / (1.0f + expf((float) -x))); 12 | } 13 | 14 | template 15 | __global__ void silu_and_mul_kernel( 16 | scalar_t* __restrict__ out, // [num_tokens, d] 17 | const scalar_t* __restrict__ input, // [num_tokens, 2, d] 18 | const int d) { 19 | const int token_idx = blockIdx.x; 20 | for (int idx = threadIdx.x; idx < d; idx += blockDim.x) { 21 | const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]); 22 | const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]); 23 | out[token_idx * d + idx] = silu(x) * y; 24 | } 25 | } 26 | 27 | } // namespace vllm 28 | 29 | void silu_and_mul( 30 | torch::Tensor& out, // [num_tokens, d] 31 | torch::Tensor& input) // [num_tokens, 2 * d] 32 | { 33 | int num_tokens = input.size(0); 34 | int d = input.size(1) / 2; 35 | 36 | dim3 grid(num_tokens); 37 | dim3 block(std::min(d, 1024)); 38 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 39 | VLLM_DISPATCH_FLOATING_TYPES( 40 | input.scalar_type(), 41 | "silu_and_mul_kernel", 42 | [&] { 43 | vllm::silu_and_mul_kernel<<>>( 44 | out.data_ptr(), 45 | input.data_ptr(), 46 | d); 47 | }); 48 | } 49 | 50 | namespace vllm { 51 | 52 | // Element-wise activation kernel template. 53 | template 54 | __global__ void activation_kernel( 55 | scalar_t* __restrict__ out, // [num_tokens, d] 56 | const scalar_t* __restrict__ input, // [num_tokens, d] 57 | const int d) { 58 | const int token_idx = blockIdx.x; 59 | for (int idx = threadIdx.x; idx < d; idx += blockDim.x) { 60 | const scalar_t x = __ldg(&input[token_idx * d + idx]); 61 | out[token_idx * d + idx] = ACT_FN(x); 62 | } 63 | } 64 | 65 | } // namespace vllm 66 | 67 | // Launch element-wise activation kernel. 
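// A note on the macro below: it assumes `out` and `input` are pre-allocated
// [num_tokens, d] tensors, launches one thread block per token (capped at
// 1024 threads per block), and dispatches over the floating-point dtypes,
// instantiating activation_kernel with the device function passed as KERNEL.
// The gelu_new/gelu_fast wrappers further down show how it is used.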
68 | #define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ 69 | int num_tokens = input.size(0); \ 70 | int d = input.size(1); \ 71 | dim3 grid(num_tokens); \ 72 | dim3 block(std::min(d, 1024)); \ 73 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ 74 | VLLM_DISPATCH_FLOATING_TYPES( \ 75 | input.scalar_type(), \ 76 | "activation_kernel", \ 77 | [&] { \ 78 | vllm::activation_kernel><<>>( \ 79 | out.data_ptr(), \ 80 | input.data_ptr(), \ 81 | d); \ 82 | }); 83 | 84 | namespace vllm { 85 | 86 | template 87 | __device__ __forceinline__ T gelu_new_kernel(const T& x) { 88 | const float x3 = (float) (x * x * x); 89 | const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3)))); 90 | return ((T) 0.5) * x * (((T) 1.0) + t); 91 | } 92 | 93 | template 94 | __device__ __forceinline__ T gelu_fast_kernel(const T& x) { 95 | const float f = (float) x; 96 | const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x)); 97 | return ((T) 0.5) * x * (((T) 1.0) + t); 98 | } 99 | 100 | } // namespace vllm 101 | 102 | void gelu_new( 103 | torch::Tensor& out, // [num_tokens, d] 104 | torch::Tensor& input) // [num_tokens, d] 105 | { 106 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); 107 | } 108 | 109 | void gelu_fast( 110 | torch::Tensor& out, // [num_tokens, d] 111 | torch::Tensor& input) // [num_tokens, d] 112 | { 113 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); 114 | } 115 | -------------------------------------------------------------------------------- /vllm/outputs.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional 2 | 3 | from vllm.sequence import SequenceGroup, SequenceStatus 4 | 5 | 6 | class CompletionOutput: 7 | """The output data of one completion output of a request. 8 | 9 | Args: 10 | index: The index of the output in the request. 11 | text: The generated output text. 12 | token_ids: The token IDs of the generated output text. 13 | cumulative_logprob: The cumulative log probability of the generated 14 | output text. 15 | logprobs: The log probabilities of the top probability words at each 16 | position if the logprobs are requested. 17 | finish_reason: The reason why the sequence is finished. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | index: int, 23 | text: str, 24 | token_ids: List[int], 25 | cumulative_logprob: float, 26 | logprobs: Optional[List[Dict[int, float]]], 27 | finish_reason: Optional[str] = None, 28 | ) -> None: 29 | self.index = index 30 | self.text = text 31 | self.token_ids = token_ids 32 | self.cumulative_logprob = cumulative_logprob 33 | self.logprobs = logprobs 34 | self.finish_reason = finish_reason 35 | 36 | def finished(self) -> bool: 37 | return self.finish_reason is not None 38 | 39 | def __repr__(self) -> str: 40 | return (f"CompletionOutput(index={self.index}, " 41 | f"text={self.text!r}, " 42 | f"token_ids={self.token_ids}, " 43 | f"cumulative_logprob={self.cumulative_logprob}, " 44 | f"logprobs={self.logprobs}, " 45 | f"finish_reason={self.finish_reason})") 46 | 47 | 48 | class RequestOutput: 49 | """The output data of a request to the LLM. 50 | 51 | Args: 52 | request_id: The unique ID of the request. 53 | prompt: The prompt string of the request. 54 | prompt_token_ids: The token IDs of the prompt. 55 | outputs: The output sequences of the request. 56 | finished: Whether the whole request is finished. 
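    Example (illustrative, mirroring the offline-inference quickstart):
        for out in llm.generate(prompts, sampling_params):
            print(out.request_id, out.outputs[0].text)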
57 | """ 58 | 59 | def __init__( 60 | self, 61 | request_id: str, 62 | prompt: str, 63 | prompt_token_ids: List[int], 64 | outputs: List[CompletionOutput], 65 | finished: bool, 66 | ) -> None: 67 | self.request_id = request_id 68 | self.prompt = prompt 69 | self.prompt_token_ids = prompt_token_ids 70 | self.outputs = outputs 71 | self.finished = finished 72 | 73 | @classmethod 74 | def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": 75 | # Get the top-n sequences. 76 | n = seq_group.sampling_params.n 77 | seqs = seq_group.get_seqs() 78 | if seq_group.sampling_params.use_beam_search: 79 | sorting_key = lambda seq: seq.get_beam_search_score( 80 | seq_group.sampling_params.length_penalty) 81 | else: 82 | sorting_key = lambda seq: seq.get_cumulative_logprob() 83 | sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) 84 | top_n_seqs = sorted_seqs[:n] 85 | 86 | # Create the outputs. 87 | outputs: List[CompletionOutput] = [] 88 | for seq in top_n_seqs: 89 | logprobs = seq.output_logprobs 90 | if seq_group.sampling_params.logprobs is None: 91 | # NOTE: We need to take care of this case because the sequence 92 | # always has the logprobs of the sampled tokens even if the 93 | # logprobs are not requested. 94 | logprobs = {} 95 | finshed_reason = SequenceStatus.get_finished_reason(seq.status) 96 | output = CompletionOutput(seqs.index(seq), seq.output_text, 97 | seq.get_output_token_ids(), 98 | seq.get_cumulative_logprob(), logprobs, 99 | finshed_reason) 100 | outputs.append(output) 101 | 102 | # Every sequence in the sequence group should have the same prompt. 103 | prompt = top_n_seqs[0].prompt 104 | prompt_token_ids = top_n_seqs[0].data.prompt_token_ids 105 | finished = seq_group.is_finished() 106 | return cls(seq_group.request_id, prompt, prompt_token_ids, outputs, 107 | finished) 108 | 109 | def __repr__(self) -> str: 110 | return (f"RequestOutput(request_id={self.request_id}, " 111 | f"prompt={self.prompt!r}, " 112 | f"prompt_token_ids={self.prompt_token_ids}, " 113 | f"outputs={self.outputs}, " 114 | f"finished={self.finished})") 115 | -------------------------------------------------------------------------------- /csrc/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template 9 | inline __device__ void apply_rotary_embedding( 10 | scalar_t* __restrict__ arr, 11 | const scalar_t* __restrict__ cos_ptr, 12 | const scalar_t* __restrict__ sin_ptr, 13 | int rot_offset, 14 | int embed_dim) 15 | { 16 | int x_index, y_index; 17 | scalar_t cos, sin; 18 | if (IS_NEOX) { 19 | // GPT-NeoX style rotary embedding. 20 | x_index = rot_offset; 21 | y_index = embed_dim + rot_offset; 22 | cos = __ldg(cos_ptr + x_index); 23 | sin = __ldg(sin_ptr + x_index); 24 | } else { 25 | // GPT-J style rotary embedding. 
26 | x_index = 2 * rot_offset; 27 | y_index = 2 * rot_offset + 1; 28 | cos = __ldg(cos_ptr + x_index / 2); 29 | sin = __ldg(sin_ptr + x_index / 2); 30 | } 31 | 32 | const scalar_t x = arr[x_index]; 33 | const scalar_t y = arr[y_index]; 34 | arr[x_index] = x * cos - y * sin; 35 | arr[y_index] = y * cos + x * sin; 36 | } 37 | 38 | template 39 | __global__ void rotary_embedding_kernel( 40 | const int64_t* __restrict__ positions, // [num_tokens] 41 | scalar_t* __restrict__ query, // [num_tokens, num_heads, head_size] 42 | scalar_t* __restrict__ key, // [num_tokens, num_kv_heads, head_size] 43 | const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] 44 | const int rot_dim, 45 | const int query_stride, 46 | const int key_stride, 47 | const int num_heads, 48 | const int num_kv_heads, 49 | const int head_size) { 50 | // Each thread block is responsible for one token. 51 | const int token_idx = blockIdx.x; 52 | int64_t pos = positions[token_idx]; 53 | const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; 54 | 55 | const int embed_dim = rot_dim / 2; 56 | const scalar_t* cos_ptr = cache_ptr; 57 | const scalar_t* sin_ptr = cache_ptr + embed_dim; 58 | 59 | const int nq = num_heads * embed_dim; 60 | for (int i = threadIdx.x; i < nq; i += blockDim.x) { 61 | const int head_idx = i / embed_dim; 62 | const int token_head = token_idx * query_stride + head_idx * head_size; 63 | const int rot_offset = i % embed_dim; 64 | apply_rotary_embedding(query + token_head, cos_ptr, 65 | sin_ptr, rot_offset, embed_dim); 66 | } 67 | 68 | const int nk = num_kv_heads * embed_dim; 69 | for (int i = threadIdx.x; i < nk; i += blockDim.x) { 70 | const int head_idx = i / embed_dim; 71 | const int token_head = token_idx * key_stride + head_idx * head_size; 72 | const int rot_offset = i % embed_dim; 73 | apply_rotary_embedding(key + token_head, cos_ptr, 74 | sin_ptr, rot_offset, embed_dim); 75 | } 76 | } 77 | 78 | } // namespace vllm 79 | 80 | void rotary_embedding( 81 | torch::Tensor& positions, // [num_tokens] 82 | torch::Tensor& query, // [num_tokens, num_heads * head_size] 83 | torch::Tensor& key, // [num_tokens, num_kv_heads * head_size] 84 | int head_size, 85 | torch::Tensor& cos_sin_cache, // [max_position, rot_dim] 86 | bool is_neox) { 87 | int num_tokens = query.size(0); 88 | int rot_dim = cos_sin_cache.size(1); 89 | int num_heads = query.size(1) / head_size; 90 | int num_kv_heads = key.size(1) / head_size; 91 | int query_stride = query.stride(0); 92 | int key_stride = key.stride(0); 93 | 94 | dim3 grid(num_tokens); 95 | dim3 block(std::min(num_heads * rot_dim / 2, 512)); 96 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 97 | VLLM_DISPATCH_FLOATING_TYPES( 98 | query.scalar_type(), 99 | "rotary_embedding", 100 | [&] { 101 | if (is_neox) { 102 | vllm::rotary_embedding_kernel<<>>( 103 | positions.data_ptr(), 104 | query.data_ptr(), 105 | key.data_ptr(), 106 | cos_sin_cache.data_ptr(), 107 | rot_dim, 108 | query_stride, 109 | key_stride, 110 | num_heads, 111 | num_kv_heads, 112 | head_size); 113 | } else { 114 | vllm::rotary_embedding_kernel<<>>( 115 | positions.data_ptr(), 116 | query.data_ptr(), 117 | key.data_ptr(), 118 | cos_sin_cache.data_ptr(), 119 | rot_dim, 120 | query_stride, 121 | key_stride, 122 | num_heads, 123 | num_kv_heads, 124 | head_size); 125 | } 126 | }); 127 | } 128 | -------------------------------------------------------------------------------- /vllm/engine/ray_utils.py: -------------------------------------------------------------------------------- 1 | 
import socket 2 | from typing import Optional, Tuple, TYPE_CHECKING 3 | 4 | from vllm.config import ParallelConfig 5 | 6 | try: 7 | import ray 8 | from ray.air.util.torch_dist import TorchDistributedWorker 9 | 10 | class RayWorker(TorchDistributedWorker): 11 | """Ray wrapper for vllm.worker.Worker, allowing Worker to be 12 | lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" 13 | 14 | def __init__(self, init_cached_hf_modules=False) -> None: 15 | if init_cached_hf_modules: 16 | # pylint: disable=import-outside-toplevel 17 | from transformers.dynamic_module_utils import init_hf_modules 18 | init_hf_modules() 19 | self.worker = None 20 | 21 | def init_worker(self, worker_init_fn): 22 | self.worker = worker_init_fn() 23 | 24 | def __getattr__(self, name): 25 | return getattr(self.worker, name) 26 | 27 | def execute_method(self, method, *args, **kwargs): 28 | executor = getattr(self, method) 29 | return executor(*args, **kwargs) 30 | 31 | except ImportError: 32 | ray = None 33 | TorchDistributedWorker = None 34 | RayWorker = None # pylint: disable=invalid-name 35 | 36 | if TYPE_CHECKING: 37 | from ray.util.placement_group import PlacementGroup 38 | 39 | 40 | def get_open_port(): 41 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 42 | s.bind(("", 0)) 43 | return s.getsockname()[1] 44 | 45 | 46 | def initialize_cluster( 47 | parallel_config: ParallelConfig, 48 | engine_use_ray: bool = False, 49 | ray_address: Optional[str] = None, 50 | ) -> Tuple[str, Optional["PlacementGroup"]]: 51 | """Initialize the distributed cluster probably with Ray. 52 | 53 | Args: 54 | parallel_config: The configurations for parallel execution. 55 | engine_use_ray: Whether to use Ray for async engine. 56 | ray_address: The address of the Ray cluster. If None, uses 57 | the default Ray cluster address. 58 | 59 | Returns: 60 | A tuple of (`distributed_init_method`, `all_stage_devices`). The 61 | `distributed_init_method` is the address for initializing the 62 | distributed backend. `all_stage_devices` includes device IDs for 63 | each worker in each pipeline stage. Each device ID is a tuple of 64 | (rank, node resource, device id). 65 | """ 66 | if parallel_config.worker_use_ray or engine_use_ray: 67 | if ray is None: 68 | raise ImportError( 69 | "Ray is not installed. Please install Ray to use distributed " 70 | "serving.") 71 | # Connect to a ray cluster. 72 | ray.init(address=ray_address, ignore_reinit_error=True) 73 | 74 | if not parallel_config.worker_use_ray: 75 | # Initialize cluster locally. 76 | port = get_open_port() 77 | # We need to setup the distributed init method to make sure 78 | # the distributed megatron code (e.g., get world size) works correctly. 79 | distributed_init_method = f"tcp://localhost:{port}" 80 | return distributed_init_method, None 81 | 82 | current_placement_group = ray.util.get_current_placement_group() 83 | if current_placement_group: 84 | # We are in a placement group 85 | bundles = current_placement_group.bundle_specs 86 | # Verify that we can use the placement group. 
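        # Every bundle that provides a GPU must provide exactly one, and the
        # number of such single-GPU bundles must cover the configured world
        # size; otherwise the workers cannot be placed into this group.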
87 | gpu_bundles = 0 88 | for bundle in bundles: 89 | bundle_gpus = bundle.get("GPU", 0) 90 | if bundle_gpus > 1: 91 | raise ValueError( 92 | "Placement group bundle cannot have more than 1 GPU.") 93 | if bundle_gpus: 94 | gpu_bundles += 1 95 | if parallel_config.world_size > gpu_bundles: 96 | raise ValueError( 97 | "The number of required GPUs exceeds the total number of " 98 | "available GPUs in the placement group.") 99 | else: 100 | num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) 101 | if parallel_config.world_size > num_gpus_in_cluster: 102 | raise ValueError( 103 | "The number of required GPUs exceeds the total number of " 104 | "available GPUs in the cluster.") 105 | # Create a new placement group 106 | current_placement_group = ray.util.placement_group([{ 107 | "GPU": 1 108 | }] * parallel_config.world_size) 109 | # Wait until PG is ready - this will block until all 110 | # requested resources are available, and will timeout 111 | # if they cannot be provisioned. 112 | ray.get(current_placement_group.ready(), timeout=1800) 113 | 114 | return None, current_placement_group 115 | -------------------------------------------------------------------------------- /docs/source/models/adding_model.rst: -------------------------------------------------------------------------------- 1 | .. _adding_a_new_model: 2 | 3 | Adding a New Model 4 | ================== 5 | 6 | This document provides a high-level guide on integrating a `HuggingFace Transformers `_ model into vLLM. 7 | 8 | .. note:: 9 | The complexity of adding a new model depends heavily on the model's architecture. 10 | The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. 11 | However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. 12 | 13 | .. tip:: 14 | If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub `_ repository. 15 | We will be happy to help you out! 16 | 17 | 18 | 0. Fork the vLLM repository 19 | -------------------------------- 20 | 21 | Start by forking our `GitHub `_ repository and then :ref:`build it from source `. 22 | This gives you the ability to modify the codebase and test your model. 23 | 24 | 25 | 1. Bring your model code 26 | ------------------------ 27 | 28 | Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models `_ directory. 29 | For instance, vLLM's `OPT model `_ was adpated from the HuggingFace's `modeling_opt.py `_ file. 30 | 31 | .. warning:: 32 | When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. 33 | 34 | 35 | 2. Rewrite the :code:`forward` methods 36 | -------------------------------------- 37 | 38 | Next, you need to rewrite the :code:`forward` methods of your model by following these steps: 39 | 40 | 1. Remove any unnecessary code, such as the code only used for training. 41 | 2. Change the input parameters: 42 | 43 | .. 
code-block:: diff 44 | 45 | def forward( 46 | self, 47 | input_ids: torch.Tensor, 48 | - attention_mask: Optional[torch.Tensor] = None, 49 | - position_ids: Optional[torch.LongTensor] = None, 50 | - past_key_values: Optional[List[torch.FloatTensor]] = None, 51 | - inputs_embeds: Optional[torch.FloatTensor] = None, 52 | - labels: Optional[torch.LongTensor] = None, 53 | - use_cache: Optional[bool] = None, 54 | - output_attentions: Optional[bool] = None, 55 | - output_hidden_states: Optional[bool] = None, 56 | - return_dict: Optional[bool] = None, 57 | -) -> Union[Tuple, CausalLMOutputWithPast]: 58 | + positions: torch.Tensor, 59 | + kv_caches: List[KVCache], 60 | + input_metadata: InputMetadata, 61 | + cache_events: Optional[List[torch.cuda.Event]], 62 | +) -> SamplerOutput: 63 | 64 | 3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors. 65 | 4. Replace the attention operation with either :code:`GPTPagedAttention` or :code:`GPTNeoXPagedAttention`, depending on the model's architecture. 66 | 67 | .. note:: 68 | Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. 69 | If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. 70 | 71 | 72 | 3. (Optional) Implement tensor parallelism support 73 | -------------------------------------------------- 74 | 75 | If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. 76 | To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. 77 | For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. 78 | When it comes to the linear layers, you should use either :code:`RowParallelLinear` or :code:`ColumnParallelLinear`. 79 | Typically, :code:`ColumnParallelLinear` is used for QKV linear layers and the first linear layers of the MLP blocks. 80 | For the remaining linear layers, :code:`RowParallelLinear` is used. 81 | 82 | 83 | 4. Implement the weight loading logic 84 | ------------------------------------- 85 | 86 | You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class. 87 | This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. 88 | While the process is straightforward for most layers, the tensor-parallel layers necessitate some additional care as their weights should be partitioned to multiple GPUs. 89 | 90 | 91 | 5. Register your model 92 | ---------------------- 93 | 94 | Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py `_ and register it to the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py `_. 95 | -------------------------------------------------------------------------------- /docs/source/getting_started/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Quickstart 4 | ========== 5 | 6 | This guide shows how to use vLLM to: 7 | 8 | * run offline batched inference on a dataset; 9 | * build an API server for a large language model; 10 | * start an OpenAI-compatible API server. 11 | 12 | Be sure to complete the :ref:`installation instructions ` before continuing with this guide. 
13 | 14 | Offline Batched Inference 15 | ------------------------- 16 | 17 | We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. 18 | 19 | Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process. 20 | 21 | .. code-block:: python 22 | 23 | from vllm import LLM, SamplingParams 24 | 25 | Define the list of input prompts and the sampling parameters for generation. The sampling temperature is set to 0.8 and the nucleus sampling probability is set to 0.95. For more information about the sampling parameters, refer to the `class definition `_. 26 | 27 | .. code-block:: python 28 | 29 | prompts = [ 30 | "Hello, my name is", 31 | "The president of the United States is", 32 | "The capital of France is", 33 | "The future of AI is", 34 | ] 35 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 36 | 37 | Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. 38 | 39 | .. code-block:: python 40 | 41 | llm = LLM(model="facebook/opt-125m") 42 | 43 | Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens. 44 | 45 | .. code-block:: python 46 | 47 | outputs = llm.generate(prompts, sampling_params) 48 | 49 | # Print the outputs. 50 | for output in outputs: 51 | prompt = output.prompt 52 | generated_text = output.outputs[0].text 53 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 54 | 55 | 56 | The code example can also be found in `examples/offline_inference.py `_. 57 | 58 | 59 | API Server 60 | ---------- 61 | 62 | vLLM can be deployed as an LLM service. We provide an example `FastAPI `_ server. Check `vllm/entrypoints/api_server.py `_ for the server implementation. The server uses ``AsyncLLMEngine`` class to support asynchronous processing of incoming requests. 63 | 64 | Start the server: 65 | 66 | .. code-block:: console 67 | 68 | $ python -m vllm.entrypoints.api_server 69 | 70 | By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model. 71 | 72 | Query the model in shell: 73 | 74 | .. code-block:: console 75 | 76 | $ curl http://localhost:8000/generate \ 77 | $ -d '{ 78 | $ "prompt": "San Francisco is a", 79 | $ "use_beam_search": true, 80 | $ "n": 4, 81 | $ "temperature": 0 82 | $ }' 83 | 84 | See `examples/api_client.py `_ for a more detailed client example. 85 | 86 | OpenAI-Compatible Server 87 | ------------------------ 88 | 89 | vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. 90 | 91 | Start the server: 92 | 93 | .. code-block:: console 94 | 95 | $ python -m vllm.entrypoints.openai.api_server \ 96 | $ --model facebook/opt-125m 97 | 98 | By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models `_ and `create completion `_ endpoints. 
We are actively adding support for more endpoints. 99 | 100 | This server can be queried in the same format as OpenAI API. For example, list the models: 101 | 102 | .. code-block:: console 103 | 104 | $ curl http://localhost:8000/v1/models 105 | 106 | Query the model with input prompts: 107 | 108 | .. code-block:: console 109 | 110 | $ curl http://localhost:8000/v1/completions \ 111 | $ -H "Content-Type: application/json" \ 112 | $ -d '{ 113 | $ "model": "facebook/opt-125m", 114 | $ "prompt": "San Francisco is a", 115 | $ "max_tokens": 7, 116 | $ "temperature": 0 117 | $ }' 118 | 119 | Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package: 120 | 121 | .. code-block:: python 122 | 123 | import openai 124 | # Modify OpenAI's API key and API base to use vLLM's API server. 125 | openai.api_key = "EMPTY" 126 | openai.api_base = "http://localhost:8000/v1" 127 | completion = openai.Completion.create(model="facebook/opt-125m", 128 | prompt="San Francisco is a") 129 | print("Completion result:", completion) 130 | 131 | For a more detailed client example, refer to `examples/openai_client.py `_. 132 | -------------------------------------------------------------------------------- /tests/kernels/test_cache.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | import torch 5 | 6 | from vllm import cache_ops 7 | 8 | DTYPES = [torch.half, torch.bfloat16, torch.float] 9 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 10 | NUM_LAYERS = [5] # Arbitrary values for testing 11 | NUM_HEADS = [8] # Arbitrary values for testing 12 | HEAD_SIZES = [64, 80, 96, 112, 128, 256] 13 | BLOCK_SIZES = [8, 16, 32] 14 | NUM_BLOCKS = [1024] # Arbitrary values for testing 15 | NUM_MAPPINGS = [32, 256] # Arbitrary values for testing 16 | SEEDS = [0] 17 | 18 | 19 | @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) 20 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 21 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 22 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 23 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 24 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 25 | @pytest.mark.parametrize("dtype", DTYPES) 26 | @pytest.mark.parametrize("seed", SEEDS) 27 | @torch.inference_mode() 28 | def test_copy_blocks( 29 | kv_cache_factory, 30 | num_mappings: int, 31 | num_layers: int, 32 | num_heads: int, 33 | head_size: int, 34 | block_size: int, 35 | num_blocks: int, 36 | dtype: torch.dtype, 37 | seed: int, 38 | ) -> None: 39 | random.seed(seed) 40 | torch.random.manual_seed(seed) 41 | torch.cuda.manual_seed(seed) 42 | 43 | # Generate random block mappings where each source block is mapped to two 44 | # destination blocks. 45 | assert 2 * num_mappings <= num_blocks 46 | src_blocks = random.sample(range(num_blocks), num_mappings) 47 | remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) 48 | dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) 49 | block_mapping = {} 50 | for i in range(num_mappings): 51 | src = src_blocks[i] 52 | dst1 = dst_blocks[2 * i] 53 | dst2 = dst_blocks[2 * i + 1] 54 | block_mapping[src] = [dst1, dst2] 55 | 56 | # Create the KV caches. 57 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 58 | num_layers, num_heads, 59 | head_size, dtype, seed) 60 | 61 | # Clone the KV caches. 
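    # The clones serve as the reference: the kernel updates key_caches and
    # value_caches in place, while the same block mapping is replayed on the
    # clones in plain Python below and the two results are compared.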
62 | cloned_key_caches = [key_cache.clone() for key_cache in key_caches] 63 | cloned_value_caches = [value_cache.clone() for value_cache in value_caches] 64 | 65 | # Call the copy blocks kernel. 66 | cache_ops.copy_blocks(key_caches, value_caches, block_mapping) 67 | 68 | # Run the reference implementation. 69 | for src, dsts in block_mapping.items(): 70 | for dst in dsts: 71 | for cloned_key_cache in cloned_key_caches: 72 | cloned_key_cache[dst] = cloned_key_cache[src] 73 | for cloned_value_cache in cloned_value_caches: 74 | cloned_value_cache[dst] = cloned_value_cache[src] 75 | 76 | # Compare the results. 77 | for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): 78 | assert torch.allclose(key_cache, cloned_key_cache) 79 | for value_cache, cloned_value_cache in zip(value_caches, 80 | cloned_value_caches): 81 | assert torch.allclose(value_cache, cloned_value_cache) 82 | 83 | 84 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 85 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 86 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 87 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 88 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 89 | @pytest.mark.parametrize("dtype", DTYPES) 90 | @pytest.mark.parametrize("seed", SEEDS) 91 | @torch.inference_mode() 92 | def test_reshape_and_cache( 93 | kv_cache_factory, 94 | num_tokens: int, 95 | num_heads: int, 96 | head_size: int, 97 | block_size: int, 98 | num_blocks: int, 99 | dtype: torch.dtype, 100 | seed: int, 101 | ) -> None: 102 | random.seed(seed) 103 | torch.random.manual_seed(seed) 104 | torch.cuda.manual_seed(seed) 105 | 106 | # Create a random slot mapping. 107 | num_slots = block_size * num_blocks 108 | slot_mapping = random.sample(range(num_slots), num_tokens) 109 | slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device='cuda') 110 | 111 | qkv = torch.randn(num_tokens, 112 | 3, 113 | num_heads, 114 | head_size, 115 | dtype=dtype, 116 | device='cuda') 117 | _, key, value = qkv.unbind(dim=1) 118 | 119 | # Create the KV caches. 120 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, 121 | num_heads, head_size, dtype, 122 | seed) 123 | key_cache, value_cache = key_caches[0], value_caches[0] 124 | 125 | # Clone the KV caches. 126 | cloned_key_cache = key_cache.clone() 127 | cloned_value_cache = value_cache.clone() 128 | 129 | # Call the reshape_and_cache kernel. 130 | cache_ops.reshape_and_cache(key, value, key_cache, value_cache, 131 | slot_mapping) 132 | 133 | # Run the reference implementation. 
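    # Each flat slot index decomposes into a (block, offset) pair via
    # slot // block_size and slot % block_size, and the key is reshaped into
    # the blocked [num_heads, head_size // x, x] layout so it can be written
    # into the cloned key cache directly.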
134 | reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) 135 | block_indicies = torch.div(slot_mapping, block_size, rounding_mode='floor') 136 | block_indicies = block_indicies.cpu().tolist() 137 | block_offsets = slot_mapping % block_size 138 | block_offsets = block_offsets.cpu().tolist() 139 | for i in range(num_tokens): 140 | block_idx = block_indicies[i] 141 | block_offset = block_offsets[i] 142 | cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] 143 | cloned_value_cache[block_idx, :, :, block_offset] = value[i] 144 | 145 | assert torch.allclose(key_cache, cloned_key_cache) 146 | assert torch.allclose(value_cache, cloned_value_cache) 147 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/protocol.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://github.com/lm-sys/FastChat/blob/168ccc29d3f7edc50823016105c024fe2282732a/fastchat/protocol/openai_api_protocol.py 3 | import time 4 | from typing import Dict, List, Literal, Optional, Union 5 | 6 | from pydantic import BaseModel, Field 7 | 8 | from vllm.utils import random_uuid 9 | 10 | 11 | class ErrorResponse(BaseModel): 12 | object: str = "error" 13 | message: str 14 | type: str 15 | param: Optional[str] = None 16 | code: Optional[str] = None 17 | 18 | 19 | class ModelPermission(BaseModel): 20 | id: str = Field(default_factory=lambda: f"modelperm-{random_uuid()}") 21 | object: str = "model_permission" 22 | created: int = Field(default_factory=lambda: int(time.time())) 23 | allow_create_engine: bool = False 24 | allow_sampling: bool = True 25 | allow_logprobs: bool = True 26 | allow_search_indices: bool = False 27 | allow_view: bool = True 28 | allow_fine_tuning: bool = False 29 | organization: str = "*" 30 | group: Optional[str] = None 31 | is_blocking: str = False 32 | 33 | 34 | class ModelCard(BaseModel): 35 | id: str 36 | object: str = "model" 37 | created: int = Field(default_factory=lambda: int(time.time())) 38 | owned_by: str = "vllm" 39 | root: Optional[str] = None 40 | parent: Optional[str] = None 41 | permission: List[ModelPermission] = Field(default_factory=list) 42 | 43 | 44 | class ModelList(BaseModel): 45 | object: str = "list" 46 | data: List[ModelCard] = Field(default_factory=list) 47 | 48 | 49 | class UsageInfo(BaseModel): 50 | prompt_tokens: int = 0 51 | total_tokens: int = 0 52 | completion_tokens: Optional[int] = 0 53 | 54 | 55 | class ChatCompletionRequest(BaseModel): 56 | model: str 57 | messages: Union[str, List[Dict[str, str]]] 58 | temperature: Optional[float] = 0.7 59 | top_p: Optional[float] = 1.0 60 | n: Optional[int] = 1 61 | max_tokens: Optional[int] = 16 62 | stop: Optional[Union[str, List[str]]] = Field(default_factory=list) 63 | stream: Optional[bool] = False 64 | presence_penalty: Optional[float] = 0.0 65 | frequency_penalty: Optional[float] = 0.0 66 | logit_bias: Optional[Dict[str, float]] = None 67 | user: Optional[str] = None 68 | # Additional parameters supported by vLLM 69 | best_of: Optional[int] = None 70 | top_k: Optional[int] = -1 71 | ignore_eos: Optional[bool] = False 72 | use_beam_search: Optional[bool] = False 73 | 74 | 75 | class CompletionRequest(BaseModel): 76 | model: str 77 | # a string, array of strings, array of tokens, or array of token arrays 78 | prompt: Union[List[int], List[List[int]], str, List[str]] 79 | suffix: Optional[str] = None 80 | max_tokens: Optional[int] = 16 81 | temperature: Optional[float] = 1.0 82 | top_p: 
Optional[float] = 1.0 83 | n: Optional[int] = 1 84 | stream: Optional[bool] = False 85 | logprobs: Optional[int] = None 86 | echo: Optional[bool] = False 87 | stop: Optional[Union[str, List[str]]] = Field(default_factory=list) 88 | presence_penalty: Optional[float] = 0.0 89 | frequency_penalty: Optional[float] = 0.0 90 | best_of: Optional[int] = None 91 | logit_bias: Optional[Dict[str, float]] = None 92 | user: Optional[str] = None 93 | # Additional parameters supported by vLLM 94 | top_k: Optional[int] = -1 95 | ignore_eos: Optional[bool] = False 96 | use_beam_search: Optional[bool] = False 97 | 98 | 99 | class LogProbs(BaseModel): 100 | text_offset: List[int] = Field(default_factory=list) 101 | token_logprobs: List[Optional[float]] = Field(default_factory=list) 102 | tokens: List[str] = Field(default_factory=list) 103 | top_logprobs: List[Optional[Dict[str, 104 | float]]] = Field(default_factory=list) 105 | 106 | 107 | class CompletionResponseChoice(BaseModel): 108 | index: int 109 | text: str 110 | logprobs: Optional[LogProbs] = None 111 | finish_reason: Optional[Literal["stop", "length"]] = None 112 | 113 | 114 | class CompletionResponse(BaseModel): 115 | id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") 116 | object: str = "text_completion" 117 | created: int = Field(default_factory=lambda: int(time.time())) 118 | model: str 119 | choices: List[CompletionResponseChoice] 120 | usage: UsageInfo 121 | 122 | 123 | class CompletionResponseStreamChoice(BaseModel): 124 | index: int 125 | text: str 126 | logprobs: Optional[LogProbs] = None 127 | finish_reason: Optional[Literal["stop", "length"]] = None 128 | 129 | 130 | class CompletionStreamResponse(BaseModel): 131 | id: str = Field(default_factory=lambda: f"cmpl-{random_uuid()}") 132 | object: str = "text_completion" 133 | created: int = Field(default_factory=lambda: int(time.time())) 134 | model: str 135 | choices: List[CompletionResponseStreamChoice] 136 | 137 | 138 | class ChatMessage(BaseModel): 139 | role: str 140 | content: str 141 | 142 | 143 | class ChatCompletionResponseChoice(BaseModel): 144 | index: int 145 | message: ChatMessage 146 | finish_reason: Optional[Literal["stop", "length"]] = None 147 | 148 | 149 | class ChatCompletionResponse(BaseModel): 150 | id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") 151 | object: str = "chat.completion" 152 | created: int = Field(default_factory=lambda: int(time.time())) 153 | model: str 154 | choices: List[ChatCompletionResponseChoice] 155 | usage: UsageInfo 156 | 157 | 158 | class DeltaMessage(BaseModel): 159 | role: Optional[str] = None 160 | content: Optional[str] = None 161 | 162 | 163 | class ChatCompletionResponseStreamChoice(BaseModel): 164 | index: int 165 | delta: DeltaMessage 166 | finish_reason: Optional[Literal["stop", "length"]] = None 167 | 168 | 169 | class ChatCompletionStreamResponse(BaseModel): 170 | id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}") 171 | object: str = "chat.completion.chunk" 172 | created: int = Field(default_factory=lambda: int(time.time())) 173 | model: str 174 | choices: List[ChatCompletionResponseStreamChoice] 175 | -------------------------------------------------------------------------------- /tests/kernels/test_pos_encoding.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import pytest 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from vllm import pos_encoding_ops 9 | 10 | 
IS_NEOX_STYLE = [True, False] 11 | DTYPES = [torch.half, torch.bfloat16, torch.float] 12 | HEAD_SIZES = [64, 80, 96, 112, 128, 256] 13 | ROTARY_DIMS = [None, 32] # None means rotary dim == head size 14 | NUM_HEADS = [7, 12, 40, 52] # Arbitrary values for testing 15 | NUM_TOKENS = [11, 83, 2048] # Arbitrary values for testing 16 | SEEDS = [0] 17 | 18 | 19 | def rotate_neox(x: torch.Tensor) -> torch.Tensor: 20 | x1 = x[..., :x.shape[-1] // 2] 21 | x2 = x[..., x.shape[-1] // 2:] 22 | return torch.cat((-x2, x1), dim=-1) 23 | 24 | 25 | def rotate_gptj(x: torch.Tensor) -> torch.Tensor: 26 | x1 = x[..., ::2] 27 | x2 = x[..., 1::2] 28 | x = torch.stack((-x2, x1), dim=-1) 29 | return x.flatten(-2) 30 | 31 | 32 | def apply_rope( 33 | q: torch.Tensor, 34 | k: torch.Tensor, 35 | cos: torch.Tensor, 36 | sin: torch.Tensor, 37 | is_neox_style: bool, 38 | ) -> Tuple[torch.Tensor, torch.Tensor]: 39 | rotate_fn = rotate_neox if is_neox_style else rotate_gptj 40 | q_embed = (q * cos) + (rotate_fn(q) * sin) 41 | k_embed = (k * cos) + (rotate_fn(k) * sin) 42 | return q_embed, k_embed 43 | 44 | 45 | class RefRotaryEmbedding(nn.Module): 46 | """Reference implementation of rotary embedding.""" 47 | 48 | def __init__( 49 | self, 50 | dim: int, 51 | is_neox_style: bool, 52 | max_position_embeddings: int = 8192, 53 | base: int = 10000, 54 | ) -> None: 55 | super().__init__() 56 | self.rotary_dim = dim 57 | self.is_neox_style = is_neox_style 58 | self.max_position_embeddings = max_position_embeddings 59 | 60 | # Create cos and sin embeddings. 61 | inv_freq = 1.0 / (base**(torch.arange(0, dim, 2) / dim)) 62 | t = torch.arange(max_position_embeddings).float() 63 | freqs = torch.einsum("i,j->ij", t, inv_freq.float()) 64 | if is_neox_style: 65 | emb = torch.cat((freqs, freqs), dim=-1) 66 | else: 67 | emb = torch.repeat_interleave(freqs, 2, -1) 68 | cos = emb.cos().to(dtype=inv_freq.dtype) 69 | sin = emb.sin().to(dtype=inv_freq.dtype) 70 | self.register_buffer("cos_cached", cos, persistent=False) 71 | self.register_buffer("sin_cached", sin, persistent=False) 72 | 73 | def forward( 74 | self, 75 | positions: torch.Tensor, # [num_tokens] 76 | query: torch.Tensor, # [num_tokens, num_heads, head_size] 77 | key: torch.Tensor, # [num_tokens, num_heads, head_size] 78 | ) -> Tuple[torch.Tensor, torch.Tensor]: 79 | query_rot = query[..., :self.rotary_dim] 80 | query_pass = query[..., self.rotary_dim:] 81 | key_rot = key[..., :self.rotary_dim] 82 | key_pass = key[..., self.rotary_dim:] 83 | 84 | query_rot = query_rot.transpose(0, 1) 85 | key_rot = key_rot.transpose(0, 1) 86 | cos = F.embedding(positions, self.cos_cached) 87 | sin = F.embedding(positions, self.sin_cached) 88 | 89 | query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin, 90 | self.is_neox_style) 91 | query_rot = query_rot.transpose(0, 1).contiguous() 92 | key_rot = key_rot.transpose(0, 1).contiguous() 93 | 94 | query = torch.cat((query_rot, query_pass), dim=-1) 95 | key = torch.cat((key_rot, key_pass), dim=-1) 96 | 97 | # Output query/key shape: [num_tokens, num_tokens, head_size] 98 | return query, key 99 | 100 | 101 | @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) 102 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 103 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 104 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 105 | @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) 106 | @pytest.mark.parametrize("dtype", DTYPES) 107 | @pytest.mark.parametrize("seed", SEEDS) 108 | @torch.inference_mode() 109 | def test_rotary_embedding( 110 | 
is_neox_style: bool, 111 | num_tokens: int, 112 | num_heads: int, 113 | head_size: int, 114 | rotary_dim: Optional[int], 115 | dtype: torch.dtype, 116 | seed: int, 117 | max_position: int = 8192, 118 | base: int = 10000, 119 | ) -> None: 120 | if rotary_dim is None: 121 | rotary_dim = head_size 122 | torch.random.manual_seed(seed) 123 | torch.cuda.manual_seed(seed) 124 | 125 | positions = torch.randint(0, max_position, (num_tokens, ), device="cuda") 126 | query = torch.randn(num_tokens, 127 | num_heads * head_size, 128 | dtype=dtype, 129 | device="cuda") 130 | key = torch.randn(num_tokens, 131 | num_heads * head_size, 132 | dtype=dtype, 133 | device="cuda") 134 | 135 | # Create the rotary embedding. 136 | inv_freq = 1.0 / (base**( 137 | torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim)) 138 | t = torch.arange(max_position).float() 139 | freqs = torch.einsum("i,j -> ij", t, inv_freq) 140 | cos = freqs.cos() 141 | sin = freqs.sin() 142 | cos_sin_cache = torch.cat((cos, sin), dim=-1) 143 | cos_sin_cache = cos_sin_cache.to(dtype=dtype, device='cuda') 144 | 145 | # Run the kernel. The kernel is in-place, so we need to clone the inputs. 146 | out_query = query.clone() 147 | out_key = key.clone() 148 | pos_encoding_ops.rotary_embedding( 149 | positions, 150 | out_query, 151 | out_key, 152 | head_size, 153 | cos_sin_cache, 154 | is_neox_style, 155 | ) 156 | 157 | # Run the reference implementation. 158 | ref_rotary_embedding = RefRotaryEmbedding( 159 | dim=rotary_dim, 160 | is_neox_style=is_neox_style, 161 | max_position_embeddings=max_position, 162 | base=base, 163 | ).to(dtype=dtype, device="cuda") 164 | ref_query, ref_key = ref_rotary_embedding( 165 | positions, 166 | query.view(num_tokens, num_heads, head_size), 167 | key.view(num_tokens, num_heads, head_size), 168 | ) 169 | ref_query = ref_query.view(num_tokens, num_heads * head_size) 170 | ref_key = ref_key.view(num_tokens, num_heads * head_size) 171 | 172 | # Compare the results. 173 | assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5) 174 | assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5) 175 | -------------------------------------------------------------------------------- /vllm/worker/cache_engine.py: -------------------------------------------------------------------------------- 1 | """CacheEngine class for managing the KV cache.""" 2 | from typing import Dict, List, Tuple 3 | 4 | import torch 5 | 6 | from vllm import cache_ops 7 | from vllm.config import CacheConfig, ModelConfig, ParallelConfig 8 | from vllm.logger import init_logger 9 | from vllm.utils import in_wsl 10 | 11 | logger = init_logger(__name__) 12 | 13 | KVCache = Tuple[torch.Tensor, torch.Tensor] 14 | 15 | 16 | class CacheEngine: 17 | """Manages the KV cache. 18 | 19 | This class is responsible for initializing and managing the GPU and CPU KV 20 | caches. It also provides methods for performing KV cache operations, such 21 | as swapping and copying. 
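    As implemented below, each layer's key cache is shaped
    [num_blocks, num_heads, head_size // x, block_size, x] with
    x = 16 // dtype_size, and each value cache is shaped
    [num_blocks, num_heads, head_size, block_size].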
22 | """ 23 | 24 | def __init__( 25 | self, 26 | cache_config: CacheConfig, 27 | model_config: ModelConfig, 28 | parallel_config: ParallelConfig, 29 | ) -> None: 30 | self.cache_config = cache_config 31 | self.model_config = model_config 32 | self.parallel_config = parallel_config 33 | 34 | self.head_size = model_config.get_head_size() 35 | self.num_layers = model_config.get_num_layers(parallel_config) 36 | self.num_heads = model_config.get_num_heads(parallel_config) 37 | self.dtype = model_config.dtype 38 | 39 | self.block_size = cache_config.block_size 40 | self.num_gpu_blocks = cache_config.num_gpu_blocks 41 | self.num_cpu_blocks = cache_config.num_cpu_blocks 42 | 43 | # Initialize the cache. 44 | self.gpu_cache = self.allocate_gpu_cache() 45 | self.cpu_cache = self.allocate_cpu_cache() 46 | 47 | # Initialize the stream for caching operations. 48 | self.cache_stream = torch.cuda.Stream() 49 | assert self.cache_stream != torch.cuda.current_stream() 50 | # Initialize the events for stream synchronization. 51 | self.events = [torch.cuda.Event() for _ in range(self.num_layers)] 52 | 53 | def get_key_block_shape(self) -> Tuple[int, int, int, int]: 54 | element_size = torch.tensor([], dtype=self.dtype).element_size() 55 | x = 16 // element_size 56 | return ( 57 | self.num_heads, 58 | self.head_size // x, 59 | self.block_size, 60 | x, 61 | ) 62 | 63 | def get_value_block_shape(self) -> Tuple[int, int, int]: 64 | return ( 65 | self.num_heads, 66 | self.head_size, 67 | self.block_size, 68 | ) 69 | 70 | def allocate_gpu_cache(self) -> List[KVCache]: 71 | gpu_cache: List[KVCache] = [] 72 | key_block_shape = self.get_key_block_shape() 73 | value_block_shape = self.get_value_block_shape() 74 | for _ in range(self.num_layers): 75 | key_blocks = torch.empty( 76 | size=(self.num_gpu_blocks, *key_block_shape), 77 | dtype=self.dtype, 78 | device="cuda", 79 | ) 80 | value_blocks = torch.empty( 81 | size=(self.num_gpu_blocks, *value_block_shape), 82 | dtype=self.dtype, 83 | device="cuda", 84 | ) 85 | gpu_cache.append((key_blocks, value_blocks)) 86 | return gpu_cache 87 | 88 | def allocate_cpu_cache(self) -> List[KVCache]: 89 | cpu_cache: List[KVCache] = [] 90 | key_block_shape = self.get_key_block_shape() 91 | value_block_shape = self.get_value_block_shape() 92 | pin_memory = not in_wsl() 93 | if not pin_memory: 94 | # Pinning memory in WSL is not supported. 95 | # https://docs.nvidia.com/cuda/wsl-user-guide/index.html#known-limitations-for-linux-cuda-applications 96 | logger.warning("Using 'pin_memory=False' as WSL is detected. " 97 | "This may slow down the performance.") 98 | for _ in range(self.num_layers): 99 | key_blocks = torch.empty( 100 | size=(self.num_cpu_blocks, *key_block_shape), 101 | dtype=self.dtype, 102 | pin_memory=pin_memory, 103 | ) 104 | value_blocks = torch.empty( 105 | size=(self.num_cpu_blocks, *value_block_shape), 106 | dtype=self.dtype, 107 | pin_memory=pin_memory, 108 | ) 109 | cpu_cache.append((key_blocks, value_blocks)) 110 | return cpu_cache 111 | 112 | def _swap( 113 | self, 114 | src: List[KVCache], 115 | dst: List[KVCache], 116 | src_to_dst: Dict[int, int], 117 | ) -> None: 118 | with torch.cuda.stream(self.cache_stream): 119 | for i in range(self.num_layers): 120 | src_key_cache, src_value_cache = src[i] 121 | dst_key_cache, dst_value_cache = dst[i] 122 | # Copy the key blocks. 123 | cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst) 124 | # Copy the value blocks. 
125 | cache_ops.swap_blocks(src_value_cache, dst_value_cache, 126 | src_to_dst) 127 | event = self.events[i] 128 | event.record(stream=self.cache_stream) 129 | 130 | def swap_in(self, src_to_dst: Dict[int, int]) -> None: 131 | self._swap(self.cpu_cache, self.gpu_cache, src_to_dst) 132 | 133 | def swap_out(self, src_to_dst: Dict[int, int]) -> None: 134 | self._swap(self.gpu_cache, self.cpu_cache, src_to_dst) 135 | 136 | def copy(self, src_to_dsts: Dict[int, List[int]]) -> None: 137 | key_caches = [key_cache for key_cache, _ in self.gpu_cache] 138 | value_caches = [value_cache for _, value_cache in self.gpu_cache] 139 | # NOTE(woosuk): This operation implicitly synchronizes the CPU and GPU. 140 | cache_ops.copy_blocks(key_caches, value_caches, src_to_dsts) 141 | 142 | @staticmethod 143 | def get_cache_block_size( 144 | block_size: int, 145 | model_config: ModelConfig, 146 | parallel_config: ParallelConfig, 147 | ) -> int: 148 | head_size = model_config.get_head_size() 149 | num_heads = model_config.get_num_heads(parallel_config) 150 | num_layers = model_config.get_num_layers(parallel_config) 151 | 152 | key_cache_block = block_size * num_heads * head_size 153 | value_cache_block = key_cache_block 154 | total = num_layers * (key_cache_block + value_cache_block) 155 | dtype_size = _get_dtype_size(model_config.dtype) 156 | return dtype_size * total 157 | 158 | 159 | def _get_dtype_size(dtype: torch.dtype) -> int: 160 | return torch.tensor([], dtype=dtype).element_size() 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | vLLM 5 | 6 |

7 | 8 |

9 | Easy, fast, and cheap LLM serving for everyone 10 |

11 | 12 |

13 | | Documentation | Blog | Paper | Discussions | 14 | 15 |

16 | 17 | --- 18 | 19 | *Latest News* 🔥 20 | - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv! 21 | - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. 22 | - [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command! 23 | - [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds. 24 | - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). 25 | 26 | --- 27 | 28 | vLLM is a fast and easy-to-use library for LLM inference and serving. 29 | 30 | vLLM is fast with: 31 | 32 | - State-of-the-art serving throughput 33 | - Efficient management of attention key and value memory with **PagedAttention** 34 | - Continuous batching of incoming requests 35 | - Optimized CUDA kernels 36 | 37 | vLLM is flexible and easy to use with: 38 | 39 | - Seamless integration with popular Hugging Face models 40 | - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more 41 | - Tensor parallelism support for distributed inference 42 | - Streaming outputs 43 | - OpenAI-compatible API server 44 | 45 | vLLM seamlessly supports many Hugging Face models, including the following architectures: 46 | 47 | - Aquila (`BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.) 48 | - Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.) 49 | - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.) 50 | - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.) 51 | - GPT-2 (`gpt2`, `gpt2-xl`, etc.) 52 | - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.) 53 | - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.) 54 | - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.) 55 | - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.) 56 | - LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.) 57 | - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.) 58 | - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.) 59 | - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) 60 | 61 | Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): 62 | 63 | ```bash 64 | pip install vllm 65 | ``` 66 | 67 | ## Getting Started 68 | 69 | Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started. 
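For a quick taste of the API before diving into the docs, here is a minimal offline-inference sketch (the model name and prompt are illustrative placeholders):

```python
from vllm import LLM, SamplingParams

# "facebook/opt-125m" is only an illustrative choice; any supported model works.
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.8, max_tokens=64)

outputs = llm.generate(["The capital of France is"],
                       sampling_params=sampling_params)
for output in outputs:
    print(output.prompt, output.outputs[0].text)
```

The documentation pages linked below cover installation, a fuller quickstart, and the list of supported models.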
70 | - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) 71 | - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) 72 | - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) 73 | 74 | ## Performance 75 | 76 | vLLM outperforms Hugging Face Transformers (HF) by up to 24x and Text Generation Inference (TGI) by up to 3.5x, in terms of throughput. 77 | For details, check out our [blog post](https://vllm.ai). 78 | 79 |

80 | [figure: serving throughput of vLLM vs. HF Transformers and TGI] 88 |
89 | Serving throughput when each request asks for 1 output completion. 90 |

91 | 92 |

93 | [figure: serving throughput of vLLM vs. HF Transformers and TGI] 100 |
101 | Serving throughput when each request asks for 3 output completions. 102 |

103 | 104 | ## Contributing 105 | 106 | We welcome and value any contributions and collaborations. 107 | Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. 108 | 109 | ## Citation 110 | 111 | If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): 112 | ```bibtex 113 | @inproceedings{kwon2023efficient, 114 | title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, 115 | author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica}, 116 | booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, 117 | year={2023} 118 | } 119 | ``` 120 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/tensor_parallel/random.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The vLLM team. 2 | # Adapted from https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/random.py 3 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 4 | 5 | # Parts of the code here are adapted from PyTorch 6 | # repo: https://github.com/pytorch/pytorch 7 | 8 | import contextlib 9 | 10 | import torch 11 | from torch import _C 12 | from torch.cuda import _lazy_call, device as device_ctx_manager 13 | 14 | from vllm.model_executor.parallel_utils.parallel_state import ( 15 | get_tensor_model_parallel_rank, 16 | ) 17 | 18 | # Default name for the model parallel rng tracker. 19 | _MODEL_PARALLEL_RNG_TRACKER_NAME = 'model-parallel-rng' 20 | 21 | 22 | def _set_cuda_rng_state(new_state, device=-1): 23 | """Sets the random number generator state of the current GPU. 24 | 25 | Argumentss: 26 | new_state (torch.ByteTensor): The desired state 27 | This function is adapted from PyTorch repo (torch.cuda.set_rng_state) 28 | with a single change: the input state is not cloned. Cloning caused 29 | major performance issues for +4 GPU cases. 30 | """ 31 | if hasattr(_C, '_cuda_setRNGState') and callable(_C._cuda_setRNGState): 32 | # older PyTorch 33 | def cb(): 34 | with device_ctx_manager(device): 35 | _C._cuda_setRNGState(new_state) 36 | else: 37 | # newer PyTorch 38 | if device == -1: 39 | device = torch.device('cuda') 40 | elif isinstance(device, str): 41 | device = torch.device(device) 42 | elif isinstance(device, int): 43 | device = torch.device('cuda', device) 44 | 45 | def cb(): 46 | idx = device.index 47 | if idx is None: 48 | idx = torch.cuda.current_device() 49 | default_generator = torch.cuda.default_generators[idx] 50 | default_generator.set_state(new_state) 51 | 52 | _lazy_call(cb) 53 | 54 | 55 | 56 | class CudaRNGStatesTracker: 57 | """Tracker for the cuda RNG states. 58 | 59 | Using the `add` method, a cuda rng state is initialized based on 60 | the input `seed` and is assigned to `name`. Later, by forking the 61 | rng state, we can perform operations and return to our starting 62 | cuda state. 63 | """ 64 | 65 | def __init__(self): 66 | # Map from a string name to the cuda rng state. 67 | self.states_ = {} 68 | # Seeds are just for book keeping and ensure no seed is set twice. 69 | self.seeds_ = set() 70 | 71 | def reset(self): 72 | """Set to the initial state (no tracker).""" 73 | self.states_ = {} 74 | self.seeds_ = set() 75 | 76 | def get_states(self): 77 | """Get rng states. 
Copy the dictionary so we have direct 78 | pointers to the states, not just a pointer to the dictionary.""" 79 | states = {} 80 | for name in self.states_: 81 | states[name] = self.states_[name] 82 | return states 83 | 84 | def set_states(self, states): 85 | """Set the rng states. For efficiency purposes, we do not check 86 | the size of seed for compatibility.""" 87 | self.states_ = states 88 | 89 | def add(self, name, seed): 90 | """Track the rng state.""" 91 | # Check seed is not already used. 92 | if seed in self.seeds_: 93 | raise Exception('seed {} already exists'.format(seed)) 94 | self.seeds_.add(seed) 95 | # Check that state is not already defined. 96 | if name in self.states_: 97 | raise Exception('cuda rng state {} already exists'.format(name)) 98 | # Get the current rng state. 99 | orig_rng_state = torch.cuda.get_rng_state() 100 | # Set the new state and store it. 101 | torch.cuda.manual_seed(seed) 102 | self.states_[name] = torch.cuda.get_rng_state() 103 | # Reset rng state to what it was. 104 | _set_cuda_rng_state(orig_rng_state) 105 | 106 | @contextlib.contextmanager 107 | def fork(self, name=_MODEL_PARALLEL_RNG_TRACKER_NAME): 108 | """Fork the cuda rng state, perform operations, and exit with 109 | the original state.""" 110 | # Check if we have added the state 111 | if name not in self.states_: 112 | raise Exception('cuda rng state {} is not added'.format(name)) 113 | # Store current rng state. 114 | orig_cuda_rng_state = torch.cuda.get_rng_state() 115 | # Set rng state to the desired one 116 | _set_cuda_rng_state(self.states_[name]) 117 | # Do the stuff we wanted to do. 118 | try: 119 | yield 120 | finally: 121 | # Update the current rng state for later use. 122 | self.states_[name] = torch.cuda.get_rng_state() 123 | # And set the state to the original state we started with. 124 | _set_cuda_rng_state(orig_cuda_rng_state) 125 | 126 | 127 | # RNG tracker object. 128 | _CUDA_RNG_STATE_TRACKER = CudaRNGStatesTracker() 129 | 130 | 131 | def get_cuda_rng_tracker(): 132 | """Get cuda rng tracker.""" 133 | return _CUDA_RNG_STATE_TRACKER 134 | 135 | 136 | def model_parallel_cuda_manual_seed(seed): 137 | """Initialize model parallel cuda seed. 138 | 139 | This function should be called after the model parallel is 140 | initialized. Also, no torch.cuda.manual_seed should be called 141 | after this function. Basically, this is replacement for that 142 | function. 143 | Two set of RNG states are tracked: 144 | default state: This is for data parallelism and is the same among a 145 | set of model parallel GPUs but different across 146 | different model paralle groups. This is used for 147 | example for dropout in the non-tensor-model-parallel regions. 148 | tensor-model-parallel state: This state is different among a set of model 149 | parallel GPUs, but the same across data parallel 150 | groups. This is used for example for dropout in 151 | model parallel regions. 152 | """ 153 | # 2718 is just for fun and any POSITIVE value will work. 154 | offset = seed + 2718 155 | tensor_model_parallel_seed = offset + get_tensor_model_parallel_rank() 156 | # Data parallel gets the original seed. 157 | data_parallel_seed = seed 158 | 159 | _CUDA_RNG_STATE_TRACKER.reset() 160 | # Set the default state. 161 | torch.cuda.manual_seed(data_parallel_seed) 162 | # and model parallel state. 
163 | _CUDA_RNG_STATE_TRACKER.add(_MODEL_PARALLEL_RNG_TRACKER_NAME, 164 | tensor_model_parallel_seed) 165 | -------------------------------------------------------------------------------- /tests/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple 2 | 3 | import pytest 4 | import torch 5 | from transformers import AutoModelForCausalLM 6 | 7 | from vllm import LLM, SamplingParams 8 | from vllm.transformers_utils.tokenizer import get_tokenizer 9 | 10 | _TEST_PROMPTS = [ 11 | "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs.", 12 | "Briefly describe the major milestones in the development of artificial intelligence from 1950 to 2020.", 13 | "Compare and contrast artificial intelligence with human intelligence in terms of processing information.", 14 | "Describe the basic components of a neural network and how it can be trained.", 15 | "Write a short story about a robot that dreams for the first time.", 16 | "Analyze the impact of the COVID-19 pandemic on global economic structures and future business models.", 17 | "Explain the cultural significance of the Mona Lisa painting, and how its perception might vary in Western versus Eastern societies.", 18 | "Translate the following English sentence into Japanese, French, and Swahili: 'The early bird catches the worm.'", 19 | ] 20 | 21 | 22 | @pytest.fixture 23 | def example_prompts() -> List[str]: 24 | return _TEST_PROMPTS 25 | 26 | 27 | _STR_DTYPE_TO_TORCH_DTYPE = { 28 | "half": torch.half, 29 | "bfloat16": torch.bfloat16, 30 | "float": torch.float, 31 | } 32 | 33 | 34 | class HfRunner: 35 | 36 | def __init__( 37 | self, 38 | model_name: str, 39 | tokenizer_name: Optional[str] = None, 40 | dtype: str = "half", 41 | ) -> None: 42 | assert dtype in _STR_DTYPE_TO_TORCH_DTYPE 43 | torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype] 44 | self.model = AutoModelForCausalLM.from_pretrained( 45 | model_name, 46 | torch_dtype=torch_dtype, 47 | trust_remote_code=True, 48 | ).cuda() 49 | if tokenizer_name is None: 50 | tokenizer_name = model_name 51 | self.tokenizer = get_tokenizer(tokenizer_name, trust_remote_code=True) 52 | 53 | def generate( 54 | self, 55 | prompts: List[str], 56 | **kwargs, 57 | ) -> List[Tuple[List[int], str]]: 58 | outputs: List[Tuple[List[int], str]] = [] 59 | for prompt in prompts: 60 | input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids 61 | output_ids = self.model.generate( 62 | input_ids.cuda(), 63 | use_cache=True, 64 | **kwargs, 65 | ) 66 | output_str = self.tokenizer.batch_decode( 67 | output_ids, 68 | skip_special_tokens=True, 69 | clean_up_tokenization_spaces=False, 70 | ) 71 | output_ids = output_ids.cpu().tolist() 72 | outputs.append((output_ids, output_str)) 73 | return outputs 74 | 75 | def generate_greedy( 76 | self, 77 | prompts: List[str], 78 | max_tokens: int, 79 | ) -> List[Tuple[List[int], str]]: 80 | outputs = self.generate(prompts, 81 | do_sample=False, 82 | max_new_tokens=max_tokens) 83 | for i in range(len(outputs)): 84 | output_ids, output_str = outputs[i] 85 | outputs[i] = (output_ids[0], output_str[0]) 86 | return outputs 87 | 88 | def generate_beam_search( 89 | self, 90 | prompts: List[str], 91 | beam_width: int, 92 | max_tokens: int, 93 | ) -> List[Tuple[List[int], str]]: 94 | outputs = self.generate(prompts, 95 | do_sample=False, 96 | max_new_tokens=max_tokens, 97 | num_beams=beam_width, 98 | num_return_sequences=beam_width) 99 | for i in range(len(outputs)): 100 | 
output_ids, output_str = outputs[i] 101 | for j in range(len(output_ids)): 102 | output_ids[j] = [ 103 | x for x in output_ids[j] 104 | if x != self.tokenizer.pad_token_id 105 | ] 106 | outputs[i] = (output_ids, output_str) 107 | return outputs 108 | 109 | 110 | @pytest.fixture 111 | def hf_runner(): 112 | return HfRunner 113 | 114 | 115 | class VllmRunner: 116 | 117 | def __init__( 118 | self, 119 | model_name: str, 120 | tokenizer_name: Optional[str] = None, 121 | dtype: str = "half", 122 | ) -> None: 123 | self.model = LLM( 124 | model=model_name, 125 | tokenizer=tokenizer_name, 126 | trust_remote_code=True, 127 | dtype=dtype, 128 | swap_space=0, 129 | ) 130 | 131 | def generate( 132 | self, 133 | prompts: List[str], 134 | sampling_params: SamplingParams, 135 | ) -> List[Tuple[List[int], str]]: 136 | req_outputs = self.model.generate(prompts, 137 | sampling_params=sampling_params) 138 | outputs = [] 139 | for req_output in req_outputs: 140 | prompt_str = req_output.prompt 141 | prompt_ids = req_output.prompt_token_ids 142 | req_sample_output_ids = [] 143 | req_sample_output_strs = [] 144 | for sample in req_output.outputs: 145 | output_str = sample.text 146 | output_ids = sample.token_ids 147 | req_sample_output_ids.append(prompt_ids + output_ids) 148 | req_sample_output_strs.append(prompt_str + output_str) 149 | outputs.append((req_sample_output_ids, req_sample_output_strs)) 150 | return outputs 151 | 152 | def generate_greedy( 153 | self, 154 | prompts: List[str], 155 | max_tokens: int, 156 | ) -> List[Tuple[List[int], str]]: 157 | greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens) 158 | outputs = self.generate(prompts, greedy_params) 159 | return [(output_ids[0], output_str[0]) 160 | for output_ids, output_str in outputs] 161 | 162 | def generate_beam_search( 163 | self, 164 | prompts: List[str], 165 | beam_width: int, 166 | max_tokens: int, 167 | ) -> List[Tuple[List[int], str]]: 168 | beam_search_params = SamplingParams(n=beam_width, 169 | use_beam_search=True, 170 | temperature=0.0, 171 | max_tokens=max_tokens) 172 | outputs = self.generate(prompts, beam_search_params) 173 | return outputs 174 | 175 | 176 | @pytest.fixture 177 | def vllm_runner(): 178 | return VllmRunner 179 | -------------------------------------------------------------------------------- /csrc/attention/dtype_float32.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 4 | * Copyright (c) 2023, The vLLM team. 5 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 6 | * 7 | * Licensed under the Apache License, Version 2.0 (the "License"); 8 | * you may not use this file except in compliance with the License. 9 | * You may obtain a copy of the License at 10 | * 11 | * http://www.apache.org/licenses/LICENSE-2.0 12 | * 13 | * Unless required by applicable law or agreed to in writing, software 14 | * distributed under the License is distributed on an "AS IS" BASIS, 15 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | * See the License for the specific language governing permissions and 17 | * limitations under the License. 
18 | */ 19 | #pragma once 20 | 21 | #include "attention_generic.cuh" 22 | 23 | #include 24 | 25 | namespace vllm { 26 | 27 | // Define custom FP32 vector data types. 28 | struct Float4_ { 29 | float2 x; 30 | float2 y; 31 | }; 32 | 33 | struct Float8_ { 34 | float2 x; 35 | float2 y; 36 | float2 z; 37 | float2 w; 38 | }; 39 | 40 | // FP32 vector types for Q, K, V. 41 | template<> 42 | struct Vec { 43 | using Type = float; 44 | }; 45 | template<> 46 | struct Vec { 47 | using Type = float2; 48 | }; 49 | template<> 50 | struct Vec { 51 | using Type = float4; 52 | }; 53 | 54 | // FP32 accumulator vector types corresponding to Vec. 55 | template<> 56 | struct FloatVec { 57 | using Type = float; 58 | }; 59 | template<> 60 | struct FloatVec { 61 | using Type = float2; 62 | }; 63 | template<> 64 | struct FloatVec { 65 | using Type = float4; 66 | }; 67 | 68 | // Vector addition. 69 | inline __device__ float add(float a, float b) { 70 | return a + b; 71 | } 72 | 73 | inline __device__ float2 add(float2 a, float2 b) { 74 | float2 c; 75 | c.x = add(a.x, b.x); 76 | c.y = add(a.y, b.y); 77 | return c; 78 | } 79 | 80 | inline __device__ float4 add(float4 a, float4 b) { 81 | float4 c; 82 | c.x = add(a.x, b.x); 83 | c.y = add(a.y, b.y); 84 | c.z = add(a.z, b.z); 85 | c.w = add(a.w, b.w); 86 | return c; 87 | } 88 | 89 | // Vector multiplication. 90 | template<> 91 | inline __device__ float mul(float a, float b) { 92 | return a * b; 93 | } 94 | 95 | template<> 96 | inline __device__ float2 mul(float2 a, float2 b) { 97 | float2 c; 98 | c.x = a.x * b.x; 99 | c.y = a.y * b.y; 100 | return c; 101 | } 102 | 103 | template<> 104 | inline __device__ float2 mul(float a, float2 b) { 105 | float2 c; 106 | c.x = a * b.x; 107 | c.y = a * b.y; 108 | return c; 109 | } 110 | 111 | template<> 112 | inline __device__ float4 mul(float4 a, float4 b) { 113 | float4 c; 114 | c.x = a.x * b.x; 115 | c.y = a.y * b.y; 116 | c.z = a.z * b.z; 117 | c.w = a.w * b.w; 118 | return c; 119 | } 120 | 121 | template<> 122 | inline __device__ float4 mul(float a, float4 b) { 123 | float4 c; 124 | c.x = a * b.x; 125 | c.y = a * b.y; 126 | c.z = a * b.z; 127 | c.w = a * b.w; 128 | return c; 129 | } 130 | 131 | // Vector fused multiply-add. 
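// fma(a, b, c) computes a * b + c element-wise; the overloads that take a
// scalar `a` broadcast it across every lane of the vector operands.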
132 | inline __device__ float fma(float a, float b, float c) { 133 | return a * b + c; 134 | } 135 | 136 | inline __device__ float2 fma(float2 a, float2 b, float2 c) { 137 | float2 d; 138 | d.x = fma(a.x, b.x, c.x); 139 | d.y = fma(a.y, b.y, c.y); 140 | return d; 141 | } 142 | 143 | inline __device__ float2 fma(float a, float2 b, float2 c) { 144 | float2 d; 145 | d.x = fma(a, b.x, c.x); 146 | d.y = fma(a, b.y, c.y); 147 | return d; 148 | } 149 | 150 | inline __device__ float4 fma(float4 a, float4 b, float4 c) { 151 | float4 d; 152 | d.x = fma(a.x, b.x, c.x); 153 | d.y = fma(a.y, b.y, c.y); 154 | d.z = fma(a.z, b.z, c.z); 155 | d.w = fma(a.w, b.w, c.w); 156 | return d; 157 | } 158 | 159 | inline __device__ float4 fma(float a, float4 b, float4 c) { 160 | float4 d; 161 | d.x = fma(a, b.x, c.x); 162 | d.y = fma(a, b.y, c.y); 163 | d.z = fma(a, b.z, c.z); 164 | d.w = fma(a, b.w, c.w); 165 | return d; 166 | } 167 | 168 | inline __device__ Float4_ fma(float a, Float4_ b, Float4_ c) { 169 | Float4_ d; 170 | d.x = fma(a, b.x, c.x); 171 | d.y = fma(a, b.y, c.y); 172 | return d; 173 | } 174 | 175 | inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) { 176 | Float8_ d; 177 | d.x = fma(a, b.x, c.x); 178 | d.y = fma(a, b.y, c.y); 179 | d.z = fma(a, b.z, c.z); 180 | d.w = fma(a, b.w, c.w); 181 | return d; 182 | } 183 | 184 | // Vector sum. 185 | template<> 186 | inline __device__ float sum(float v) { 187 | return v; 188 | } 189 | 190 | template<> 191 | inline __device__ float sum(float2 v) { 192 | return v.x + v.y; 193 | } 194 | 195 | template<> 196 | inline __device__ float sum(float4 v) { 197 | return v.x + v.y + v.z + v.w; 198 | } 199 | 200 | template<> 201 | inline __device__ float sum(Float4_ v) { 202 | return v.x.x + v.x.y + v.y.x + v.y.y; 203 | } 204 | 205 | template<> 206 | inline __device__ float sum(Float8_ v) { 207 | return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y; 208 | } 209 | 210 | // Vector dot product. 211 | inline __device__ float dot(float a, float b) { 212 | return a * b; 213 | } 214 | 215 | inline __device__ float dot(float2 a, float2 b) { 216 | float2 c = mul(a, b); 217 | return c.x + c.y; 218 | } 219 | 220 | inline __device__ float dot(Float4_ a, Float4_ b) { 221 | float2 acc = mul(a.x, b.x); 222 | acc = fma(a.y, b.y, acc); 223 | return acc.x + acc.y; 224 | } 225 | 226 | inline __device__ float dot(Float8_ a, Float8_ b) { 227 | float2 acc = mul(a.x, b.x); 228 | acc = fma(a.y, b.y, acc); 229 | acc = fma(a.z, b.z, acc); 230 | acc = fma(a.w, b.w, acc); 231 | return acc.x + acc.y; 232 | } 233 | 234 | // From float to float. 235 | inline __device__ void from_float(float& dst, float src) { 236 | dst = src; 237 | } 238 | 239 | inline __device__ void from_float(float2& dst, float2 src) { 240 | dst = src; 241 | } 242 | 243 | inline __device__ void from_float(float4& dst, float4 src) { 244 | dst = src; 245 | } 246 | 247 | // From float to float. 248 | inline __device__ float to_float(float u) { 249 | return u; 250 | } 251 | 252 | inline __device__ float2 to_float(float2 u) { 253 | return u; 254 | } 255 | 256 | inline __device__ float4 to_float(float4 u) { 257 | return u; 258 | } 259 | 260 | inline __device__ Float4_ to_float(Float4_ u) { 261 | return u; 262 | } 263 | 264 | inline __device__ Float8_ to_float(Float8_ u) { 265 | return u; 266 | } 267 | 268 | // Zero-out a variable. 
269 | inline __device__ void zero(float& dst) { 270 | dst = 0.f; 271 | } 272 | 273 | } // namespace vllm 274 | -------------------------------------------------------------------------------- /vllm/transformers_utils/tokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Union 2 | 3 | from transformers import (AutoTokenizer, PreTrainedTokenizer, 4 | PreTrainedTokenizerFast) 5 | 6 | from vllm.logger import init_logger 7 | 8 | logger = init_logger(__name__) 9 | 10 | # A fast LLaMA tokenizer with the pre-processed `tokenizer.json` file. 11 | _FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer" 12 | 13 | 14 | def get_tokenizer( 15 | tokenizer_name: str, 16 | *args, 17 | tokenizer_mode: str = "auto", 18 | trust_remote_code: bool = False, 19 | **kwargs, 20 | ) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]: 21 | """Gets a tokenizer for the given model name via Huggingface.""" 22 | if tokenizer_mode == "slow": 23 | if kwargs.get("use_fast", False): 24 | raise ValueError( 25 | "Cannot use the fast tokenizer in slow tokenizer mode.") 26 | kwargs["use_fast"] = False 27 | 28 | if ("llama" in tokenizer_name.lower() and kwargs.get("use_fast", True) 29 | and tokenizer_name != _FAST_LLAMA_TOKENIZER): 30 | logger.info( 31 | "For some LLaMA-based models, initializing the fast tokenizer may " 32 | "take a long time. To eliminate the initialization time, consider " 33 | f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original " 34 | "tokenizer.") 35 | try: 36 | tokenizer = AutoTokenizer.from_pretrained( 37 | tokenizer_name, 38 | *args, 39 | trust_remote_code=trust_remote_code, 40 | **kwargs) 41 | except TypeError as e: 42 | # The LLaMA tokenizer causes a protobuf error in some environments. 43 | err_msg = ( 44 | "Failed to load the tokenizer. If you are using a LLaMA-based " 45 | f"model, use '{_FAST_LLAMA_TOKENIZER}' instead of the original " 46 | "tokenizer.") 47 | raise RuntimeError(err_msg) from e 48 | except ValueError as e: 49 | # If the error pertains to the tokenizer class not existing or not 50 | # currently being imported, suggest using the --trust-remote-code flag. 51 | if (not trust_remote_code and 52 | ("does not exist or is not currently imported." in str(e) 53 | or "requires you to execute the tokenizer file" in str(e))): 54 | err_msg = ( 55 | "Failed to load the tokenizer. If the tokenizer is a custom " 56 | "tokenizer not yet available in the HuggingFace transformers " 57 | "library, consider setting `trust_remote_code=True` in LLM " 58 | "or using the `--trust-remote-code` flag in the CLI.") 59 | raise RuntimeError(err_msg) from e 60 | else: 61 | raise e 62 | 63 | if not isinstance(tokenizer, PreTrainedTokenizerFast): 64 | logger.warning( 65 | "Using a slow tokenizer. This might cause a significant " 66 | "slowdown. Consider using a fast tokenizer instead.") 67 | return tokenizer 68 | 69 | 70 | def _convert_tokens_to_string_with_added_encoders( 71 | tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], 72 | output_tokens: List[str], 73 | skip_special_tokens: bool, 74 | ) -> str: 75 | # Adapted from 76 | # https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/tokenization_utils.py#L921 77 | # NOTE(woosuk): The following code is slow because it runs a for loop over 78 | # the output_tokens. In Python, running a for loop over a list can be slow 79 | # even when the loop body is very simple. 
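    # Tokens registered in `added_tokens_encoder` are handled separately because
    # the underlying tokenizer's convert_tokens_to_string() may not know about
    # them: the loop flushes the current group of ordinary tokens, emits the
    # added token verbatim, and the collected pieces are joined with single
    # spaces at the end.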
80 | sub_texts = [] 81 | current_sub_text = [] 82 | for token in output_tokens: 83 | if skip_special_tokens and token in tokenizer.all_special_tokens: 84 | continue 85 | if token in tokenizer.added_tokens_encoder: 86 | if current_sub_text: 87 | sub_text = tokenizer.convert_tokens_to_string(current_sub_text) 88 | sub_texts.append(sub_text) 89 | current_sub_text = [] 90 | sub_texts.append(token) 91 | else: 92 | current_sub_text.append(token) 93 | if current_sub_text: 94 | sub_text = tokenizer.convert_tokens_to_string(current_sub_text) 95 | sub_texts.append(sub_text) 96 | return " ".join(sub_texts) 97 | 98 | 99 | # Based on 100 | # https://github.com/huggingface/text-generation-inference/blob/v0.9.4/server/text_generation_server/models/model.py#L62C9-L62C15 101 | # under Apache 2.0 license 102 | def detokenize_incrementally( 103 | tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast], 104 | all_input_ids: List[int], 105 | prev_tokens: Optional[List[str]], 106 | prefix_offset: int = 0, 107 | read_offset: int = 0, 108 | skip_special_tokens: bool = False, 109 | ) -> Tuple[List[str], str, int, int]: 110 | new_token_id = all_input_ids[-1] 111 | # This is the first iteration for this sequence 112 | if prev_tokens is None: 113 | new_tokens = tokenizer.convert_ids_to_tokens( 114 | all_input_ids, skip_special_tokens=skip_special_tokens) 115 | output_tokens = new_tokens 116 | # 5 is an arbitrary value that should work for all 117 | # tokenizers (bigger = more conservative). 118 | # Subtract 1 extra to account for the generated token. 119 | prefix_offset = max(len(output_tokens) - 6, 0) 120 | read_offset = max(len(output_tokens) - 1, 0) 121 | else: 122 | new_token = tokenizer.convert_ids_to_tokens( 123 | new_token_id, skip_special_tokens=skip_special_tokens) 124 | new_tokens = [new_token] 125 | output_tokens = prev_tokens + new_tokens 126 | 127 | # The prefix text is necessary only to defeat cleanup algorithms in 128 | # the decode which decide to add a space or not depending on the 129 | # surrounding ids. 130 | if not getattr(tokenizer, "added_tokens_encoder", {}): 131 | prefix_text = tokenizer.convert_tokens_to_string( 132 | output_tokens[prefix_offset:read_offset]) 133 | new_text = tokenizer.convert_tokens_to_string( 134 | output_tokens[prefix_offset:]) 135 | else: 136 | prefix_text = _convert_tokens_to_string_with_added_encoders( 137 | tokenizer, 138 | output_tokens[prefix_offset:read_offset], 139 | skip_special_tokens=skip_special_tokens) 140 | new_text = _convert_tokens_to_string_with_added_encoders( 141 | tokenizer, 142 | output_tokens[prefix_offset:], 143 | skip_special_tokens=skip_special_tokens) 144 | 145 | if len(new_text) > len(prefix_text) and not new_text.endswith("�"): 146 | # utf-8 char at the end means it's a potential unfinished byte sequence 147 | # from byte fallback tokenization. 148 | # If it's in the middle, it's probably a real invalid id generated 149 | # by the model 150 | new_text = new_text[len(prefix_text):] 151 | return new_tokens, new_text, read_offset, len(output_tokens) 152 | else: 153 | return new_tokens, "", prefix_offset, read_offset 154 | --------------------------------------------------------------------------------
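`detokenize_incrementally` is meant to be called once per generated token, with the caller carrying the token list and the two offsets between calls. The sketch below is not part of the repository; the tokenizer name and input text are placeholder examples, and it simply replays a fixed id sequence one token at a time to show the calling pattern:

```python
from vllm.transformers_utils.tokenizer import (detokenize_incrementally,
                                               get_tokenizer)

# Placeholder setup: pretend these ids were produced one by one by a model.
tokenizer = get_tokenizer("gpt2")
token_ids = tokenizer("Hello, incremental world!").input_ids

prev_tokens = None
prefix_offset = 0
read_offset = 0
streamed_text = ""
for i in range(1, len(token_ids) + 1):
    new_tokens, new_text, prefix_offset, read_offset = detokenize_incrementally(
        tokenizer,
        all_input_ids=token_ids[:i],
        prev_tokens=prev_tokens,
        prefix_offset=prefix_offset,
        read_offset=read_offset,
    )
    # Keep the running token list in the form the next call expects.
    prev_tokens = new_tokens if prev_tokens is None else prev_tokens + new_tokens
    streamed_text += new_text

print(streamed_text)
```

`new_text` may be empty for a step whose last token leaves an unfinished UTF-8 byte sequence; that text is emitted on a later step once it decodes cleanly.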