├── vllm ├── log ├── core │ ├── __init__.py │ ├── root.code-workspace │ └── policy.py ├── engine │ ├── __init__.py │ └── ray_utils.py ├── worker │ └── __init__.py ├── entrypoints │ ├── __init__.py │ ├── openai │ │ └── __init__.py │ └── api_server.py ├── model_executor │ ├── layers │ │ ├── __init__.py │ │ ├── layernorm.py │ │ ├── quantized_linear │ │ │ ├── __init__.py │ │ │ └── awq.py │ │ ├── activation.py │ │ └── simulator.py │ ├── parallel_utils │ │ ├── __init__.py │ │ ├── README.md │ │ ├── communication_op.py │ │ └── utils.py │ ├── __init__.py │ ├── utils.py │ ├── quantization_utils │ │ ├── __init__.py │ │ ├── awq.py │ │ └── base.py │ ├── models │ │ └── __init__.py │ ├── input_metadata.py │ └── model_loader.py ├── transformers_utils │ ├── __init__.py │ ├── configs │ │ ├── __init__.py │ │ ├── qwen.py │ │ ├── baichuan.py │ │ ├── aquila.py │ │ ├── mistral.py │ │ ├── mpt.py │ │ └── falcon.py │ └── config.py ├── __init__.py ├── logger.py ├── block.py ├── utils.py └── outputs.py ├── MANIFEST.in ├── docs ├── requirements-docs.txt ├── source │ ├── assets │ │ ├── figures │ │ │ ├── perf_a100_n1_dark.png │ │ │ ├── perf_a100_n3_dark.png │ │ │ ├── perf_a10g_n1_dark.png │ │ │ ├── perf_a10g_n3_dark.png │ │ │ ├── perf_a100_n1_light.png │ │ │ ├── perf_a100_n3_light.png │ │ │ ├── perf_a10g_n1_light.png │ │ │ └── perf_a10g_n3_light.png │ │ └── logos │ │ │ ├── vllm-logo-text-dark.png │ │ │ ├── vllm-logo-only-light.png │ │ │ └── vllm-logo-text-light.png │ ├── serving │ │ ├── deploying_with_triton.rst │ │ ├── distributed_serving.rst │ │ └── run_on_sky.rst │ ├── getting_started │ │ ├── installation.rst │ │ └── quickstart.rst │ ├── index.rst │ ├── conf.py │ └── models │ │ ├── supported_models.rst │ │ └── adding_model.rst ├── README.md ├── Makefile └── make.bat ├── benchmarks ├── visualizations │ ├── model_size_plots.pdf │ ├── waste_vs_heuristic.pdf │ ├── plot_over_heuristic.pdf │ ├── Plots Over Model Sizes.pdf │ ├── discard_vs_preserve_swap_vs_heuristic_vs_waste.pdf │ └── Makefile ├── README.md ├── launch_tgi_server.sh └── benchmark_latency.py ├── csrc ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ └── attention_utils.cuh ├── cuda_utils.cpp ├── layernorm.cpp ├── cuda_utils_kernels.cu ├── quantization.cpp ├── pos_encoding.cpp ├── dispatch_utils.h ├── activation.cpp ├── attention.cpp ├── reduction_utils.cuh ├── cache.cpp ├── layernorm_kernels.cu ├── quantization │ └── awq │ │ └── dequantize.cuh ├── activation_kernels.cu └── pos_encoding_kernels.cu ├── pyproject.toml ├── mypy.ini ├── requirements.txt ├── .readthedocs.yaml ├── examples ├── test.py ├── openai_completion_client.py ├── offline_inference.py ├── openai_chatcompletion_client.py ├── gradio_webserver.py ├── llm_engine_example.py ├── api_client.py ├── react_vllm_impl.py ├── test_pause.py └── test_ref_outputs.py ├── exps └── README.md ├── tests ├── kernels │ ├── conftest.py │ ├── test_layernorm.py │ ├── test_activation.py │ ├── test_cache.py │ └── test_pos_encoding.py ├── models │ └── test_models.py ├── samplers │ └── test_beam_search.py ├── async_engine │ ├── api_server_async_engine.py │ ├── test_async_llm_engine.py │ ├── test_request_tracker.py │ └── test_api_server.py ├── engine │ └── test_detokenize.py └── distributed │ └── test_comm_ops.py ├── README.md └── .gitignore /vllm/log: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/core/__init__.py: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/engine/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements.txt 3 | 4 | recursive-include csrc * 5 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | -------------------------------------------------------------------------------- /vllm/core/root.code-workspace: -------------------------------------------------------------------------------- 1 | { 2 | "folders": [ 3 | { 4 | "path": "../.." 
5 | } 6 | ] 7 | } -------------------------------------------------------------------------------- /benchmarks/visualizations/model_size_plots.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/benchmarks/visualizations/model_size_plots.pdf -------------------------------------------------------------------------------- /benchmarks/visualizations/waste_vs_heuristic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/benchmarks/visualizations/waste_vs_heuristic.pdf -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n1_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a100_n1_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n3_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a100_n3_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n1_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a10g_n1_dark.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n3_dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a10g_n3_dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /benchmarks/visualizations/plot_over_heuristic.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/benchmarks/visualizations/plot_over_heuristic.pdf -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n1_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a100_n1_light.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a100_n3_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a100_n3_light.png -------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n1_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a10g_n1_light.png 
-------------------------------------------------------------------------------- /docs/source/assets/figures/perf_a10g_n3_light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/figures/perf_a10g_n3_light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /benchmarks/visualizations/Plots Over Model Sizes.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/benchmarks/visualizations/Plots Over Model Sizes.pdf -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | -------------------------------------------------------------------------------- /benchmarks/visualizations/discard_vs_preserve_swap_vs_heuristic_vs_waste.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/WukLab/InferCept/HEAD/benchmarks/visualizations/discard_vs_preserve_swap_vs_heuristic_vs_waste.pdf -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "ninja", 4 | "packaging", 5 | "setuptools", 6 | "torch >= 2.0.0", 7 | "wheel", 8 | ] 9 | build-backend = "setuptools.build_meta" 10 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/README.md: -------------------------------------------------------------------------------- 1 | The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference. -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.8 3 | 4 | ignore_missing_imports = True 5 | 6 | files = vllm 7 | # TODO(woosuk): Include the code from Megatron and HuggingFace. 
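# Note: `exclude` takes a single regular expression, so the `|` in the pattern below lets either directory prefix match and both trees are skipped during type checking.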
8 | exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/ 9 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.input_metadata import InputMetadata 2 | from vllm.model_executor.model_loader import get_model 3 | from vllm.model_executor.utils import set_random_seed 4 | 5 | __all__ = [ 6 | "InputMetadata", 7 | "get_model", 8 | "set_random_seed", 9 | ] 10 | -------------------------------------------------------------------------------- /csrc/cuda_utils.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int get_device_attribute( 4 | int attribute, 5 | int device_id); 6 | 7 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 8 | m.def( 9 | "get_device_attribute", 10 | &get_device_attribute, 11 | "Gets the specified device attribute."); 12 | } 13 | 14 | -------------------------------------------------------------------------------- /benchmarks/visualizations/Makefile: -------------------------------------------------------------------------------- 1 | make_pdf: 2 | jupyter nbconvert "Plot Over Model Sizes".ipynb --to=pdf --TemplateExporter.exclude_input=True --output "Plots Over Model Sizes".pdf 3 | 4 | notebook: 5 | cp $(file).ipynb '$(title).ipynb' 6 | jupyter nbconvert '$(title).ipynb' --to=pdf --TemplateExporter.exclude_input=True --output=$(file) 7 | rm '$(title).ipynb' 8 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | import random 3 | 4 | import numpy as np 5 | import torch 6 | 7 | 8 | def set_random_seed(seed: int) -> None: 9 | random.seed(seed) 10 | np.random.seed(seed) 11 | torch.manual_seed(seed) 12 | if torch.cuda.is_available(): 13 | torch.cuda.manual_seed_all(seed) 14 | 15 | -------------------------------------------------------------------------------- /csrc/layernorm.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void rms_norm( 4 | torch::Tensor& out, 5 | torch::Tensor& input, 6 | torch::Tensor& weight, 7 | float epsilon); 8 | 9 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 10 | m.def( 11 | "rms_norm", 12 | &rms_norm, 13 | "Apply Root Mean Square (RMS) Normalization to the input tensor."); 14 | } 15 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 
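If port 8000 is already in use, `http.server` also accepts an alternative port as a positional argument (the port below is only an example):

```bash
python -m http.server 3000 -d build/html/
```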
20 | -------------------------------------------------------------------------------- /csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- 1 | int get_device_attribute( 2 | int attribute, 3 | int device_id) 4 | { 5 | int device, value; 6 | if (device_id < 0) { 7 | cudaGetDevice(&device); 8 | } 9 | else { 10 | device = device_id; 11 | } 12 | cudaDeviceGetAttribute(&value, static_cast(attribute), device); 13 | return value; 14 | } 15 | -------------------------------------------------------------------------------- /csrc/quantization.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | torch::Tensor awq_gemm( 4 | torch::Tensor _in_feats, 5 | torch::Tensor _kernel, 6 | torch::Tensor _scaling_factors, 7 | torch::Tensor _zeros, 8 | int split_k_iters); 9 | 10 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 11 | m.def( 12 | "awq_gemm", 13 | &awq_gemm, 14 | "Quantized GEMM for AWQ"); 15 | } 16 | -------------------------------------------------------------------------------- /csrc/pos_encoding.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void rotary_embedding( 4 | torch::Tensor& positions, 5 | torch::Tensor& query, 6 | torch::Tensor& key, 7 | int head_size, 8 | torch::Tensor& cos_sin_cache, 9 | bool is_neox); 10 | 11 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 12 | m.def( 13 | "rotary_embedding", 14 | &rotary_embedding, 15 | "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); 16 | } 17 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ninja # For faster builds. 2 | psutil 3 | ray >= 2.5.1 4 | pandas # Required for Ray data. 5 | pyarrow # Required for Ray data. 6 | sentencepiece # Required for LLaMA tokenizer. 7 | numpy 8 | torch >= 2.0.0 9 | transformers >= 4.33.1 # Required for Code Llama. 10 | xformers >= 0.0.22 11 | fastapi 12 | uvicorn[standard] 13 | pydantic < 2 # Required for OpenAI server. 14 | gurobipy 15 | rich 16 | deepspeed == 0.12.3 17 | deepspeed-kernels -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:0.8 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 
7 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: 16 | - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /csrc/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #include 6 | 7 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 8 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 9 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 10 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 11 | 12 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ 13 | AT_DISPATCH_SWITCH( \ 14 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 15 | -------------------------------------------------------------------------------- /vllm/model_executor/quantization_utils/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from vllm.model_executor.quantization_utils.awq import AWQConfig 4 | from vllm.model_executor.quantization_utils.base import QuantizationConfig 5 | 6 | _QUANTIZATION_REGISTRY = { 7 | "awq": AWQConfig, 8 | } 9 | 10 | 11 | def get_quant_class(quantization: str) -> Type[QuantizationConfig]: 12 | if quantization not in _QUANTIZATION_REGISTRY: 13 | raise ValueError(f"Invalid quantization method: {quantization}") 14 | return _QUANTIZATION_REGISTRY[quantization] 15 | 16 | 17 | __all__ = [ 18 | "QuantizationConfig", 19 | "get_quant_class", 20 | ] 21 | -------------------------------------------------------------------------------- /csrc/activation.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void silu_and_mul( 4 | torch::Tensor& out, 5 | torch::Tensor& input); 6 | 7 | void gelu_new( 8 | torch::Tensor& out, 9 | torch::Tensor& input); 10 | 11 | void gelu_fast( 12 | torch::Tensor& out, 13 | torch::Tensor& input); 14 | 15 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 16 | m.def( 17 | "silu_and_mul", 18 | &silu_and_mul, 19 | "Activation function used in SwiGLU."); 20 | m.def( 21 | "gelu_new", 22 | &gelu_new, 23 | "GELU implementation used in GPT-2."); 24 | m.def( 25 | "gelu_fast", 26 | &gelu_fast, 27 | "Approximate GELU implementation."); 28 | } 29 | -------------------------------------------------------------------------------- /examples/test.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | 4 | async def async_foo(): 5 | print("async_foo started") 6 | await asyncio.sleep(1) 7 | print("async_foo done") 8 | 9 | 10 | async def main(): 11 | for _ in range(2): 12 | asyncio.ensure_future(async_foo()) # fire and forget async_foo() 13 | 14 | # btw, you can also create tasks inside non-async funcs 15 | 16 | print('Do some actions 1') 17 
| await asyncio.sleep(1) 18 | print('Do some actions 2') 19 | await asyncio.sleep(1) 20 | print('Do some actions 3') 21 | 22 | 23 | if __name__ == '__main__': 24 | loop = asyncio.get_event_loop() 25 | loop.run_until_complete(main()) -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Completion API 14 | stream = False 15 | completion = openai.Completion.create( 16 | model=model, 17 | prompt="A robot may not injure a human being", 18 | echo=False, 19 | n=2, 20 | stream=stream, 21 | logprobs=3) 22 | 23 | print("Completion results:") 24 | if stream: 25 | for c in completion: 26 | print(c) 27 | else: 28 | print(completion) 29 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
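# For example, `make html SPHINXOPTS="-W"` forwards -W to sphinx-build so that warnings are treated as errors.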
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /csrc/attention.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | void single_query_cached_kv_attention( 5 | torch::Tensor& out, 6 | torch::Tensor& query, 7 | torch::Tensor& key_cache, 8 | torch::Tensor& value_cache, 9 | torch::Tensor& head_mapping, 10 | float scale, 11 | torch::Tensor& block_tables, 12 | torch::Tensor& context_lens, 13 | int block_size, 14 | int max_context_len, 15 | const c10::optional& alibi_slopes); 16 | 17 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 18 | m.def( 19 | "single_query_cached_kv_attention", 20 | &single_query_cached_kv_attention, 21 | "Compute the attention between an input query and the cached key/value tensors"); 22 | } 23 | -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.engine.ray_utils import initialize_cluster 7 | from vllm.entrypoints.llm import LLM 8 | from vllm.outputs import CompletionOutput, RequestOutput 9 | from vllm.sampling_params import SamplingParams 10 | 11 | __version__ = "0.2.0" 12 | 13 | __all__ = [ 14 | "LLM", 15 | "SamplingParams", 16 | "RequestOutput", 17 | "CompletionOutput", 18 | "LLMEngine", 19 | "EngineArgs", 20 | "AsyncLLMEngine", 21 | "AsyncEngineArgs", 22 | "initialize_cluster", 23 | ] 24 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.configs.mpt import MPTConfig 2 | from vllm.transformers_utils.configs.baichuan import BaiChuanConfig 3 | from vllm.transformers_utils.configs.aquila import AquilaConfig 4 | from vllm.transformers_utils.configs.qwen import QWenConfig 5 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and 6 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the 7 | # `FalconConfig` class from the official HuggingFace transformers library. 8 | from vllm.transformers_utils.configs.falcon import RWConfig 9 | from vllm.transformers_utils.configs.mistral import MistralConfig 10 | 11 | __all__ = [ 12 | "MPTConfig", 13 | "BaiChuanConfig", 14 | "AquilaConfig", 15 | "QWenConfig", 16 | "RWConfig", 17 | "MistralConfig", 18 | ] 19 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. 
The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /exps/README.md: -------------------------------------------------------------------------------- 1 | # This is the reproduce instructions for paper "INFERCEPT: Efficient Intercept Support for Large-Language Model Inferencing" 2 | 3 | ## Dataset 4 | Download our 6-augment mixture workload from google drive and place it under `exps` filder. 5 | 6 | ## Profiler 7 | The profiler is still under refactoring. The current benchmark script will set profiling variables to ones used in the paper. 8 | 9 | ## Run Benchmark 10 | ```bash 11 | # after installing InferCept 12 | bash bench.sh 13 | ``` 14 | 1. Results will be available at `exps/results`. 15 | 2. Each data point will run for 30min, please manage your GPU cluster wisely. 16 | 3. Please do not schedule two swap-involved run concurrently as we assume exclusive access to the PCIE bendwidth. -------------------------------------------------------------------------------- /vllm/model_executor/layers/layernorm.py: -------------------------------------------------------------------------------- 1 | """Custom normalization layers.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import layernorm_ops 6 | 7 | 8 | class RMSNorm(nn.Module): 9 | """Root mean square normalization. 10 | 11 | Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. 12 | Refer to https://arxiv.org/abs/1910.07467 13 | """ 14 | 15 | def __init__( 16 | self, 17 | hidden_size: int, 18 | eps: float = 1e-6, 19 | ) -> None: 20 | super().__init__() 21 | self.weight = nn.Parameter(torch.ones(hidden_size)) 22 | self.variance_epsilon = eps 23 | 24 | def forward(self, x: torch.Tensor) -> torch.Tensor: 25 | out = torch.empty_like(x) 26 | layernorm_ops.rms_norm( 27 | out, 28 | x, 29 | self.weight.data, 30 | self.variance_epsilon, 31 | ) 32 | return out 33 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 
21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /examples/openai_chatcompletion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Chat completion API 14 | chat_completion = openai.ChatCompletion.create( 15 | model=model, 16 | messages=[{ 17 | "role": "system", 18 | "content": "You are a helpful assistant." 19 | }, { 20 | "role": "user", 21 | "content": "Who won the world series in 2020?" 22 | }, { 23 | "role": 24 | "assistant", 25 | "content": 26 | "The Los Angeles Dodgers won the World Series in 2020." 27 | }, { 28 | "role": "user", 29 | "content": "Where was it played?" 30 | }]) 31 | 32 | print("Chat completion results:") 33 | print(chat_completion) 34 | -------------------------------------------------------------------------------- /docs/source/getting_started/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============ 5 | 6 | vLLM is a Python library that also contains pre-compiled C++ and CUDA (11.8) binaries. 7 | 8 | Requirements 9 | ------------ 10 | 11 | * OS: Linux 12 | * Python: 3.8 -- 3.11 13 | * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, etc.) 14 | 15 | Install with pip 16 | ---------------- 17 | 18 | You can install vLLM using pip: 19 | 20 | .. code-block:: console 21 | 22 | $ # (Optional) Create a new conda environment. 23 | $ conda create -n myenv python=3.8 -y 24 | $ conda activate myenv 25 | 26 | $ # Install vLLM. 27 | $ pip install vllm 28 | 29 | 30 | .. _build_from_source: 31 | 32 | Build from source 33 | ----------------- 34 | 35 | You can also build and install vLLM from source: 36 | 37 | .. code-block:: console 38 | 39 | $ git clone https://github.com/vllm-project/vllm.git 40 | $ cd vllm 41 | $ pip install -e . # This may take 5-10 minutes. 42 | 43 | .. tip:: 44 | If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. 45 | 46 | .. code-block:: console 47 | 48 | $ # Pull the Docker image with CUDA 11.8. 49 | $ # Use `--ipc=host` to make sure the shared memory is large enough. 
50 | $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:22.12-py3 51 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import pytest 4 | import torch 5 | 6 | 7 | def create_kv_caches( 8 | num_blocks: int, 9 | block_size: int, 10 | num_layers: int, 11 | num_heads: int, 12 | head_size: int, 13 | dtype: torch.dtype, 14 | seed: int, 15 | ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: 16 | torch.random.manual_seed(seed) 17 | torch.cuda.manual_seed(seed) 18 | 19 | scale = head_size**-0.5 20 | x = 16 // torch.tensor([], dtype=dtype).element_size() 21 | key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) 22 | key_caches = [] 23 | for _ in range(num_layers): 24 | key_cache = torch.empty(size=key_cache_shape, 25 | dtype=dtype, 26 | device='cuda') 27 | key_cache.uniform_(-scale, scale) 28 | key_caches.append(key_cache) 29 | 30 | value_cache_shape = (num_blocks, num_heads, head_size, block_size) 31 | value_caches = [] 32 | for _ in range(num_layers): 33 | value_cache = torch.empty(size=value_cache_shape, 34 | dtype=dtype, 35 | device='cuda') 36 | value_cache.uniform_(-scale, scale) 37 | value_caches.append(value_cache) 38 | return key_caches, value_caches 39 | 40 | 41 | @pytest.fixture() 42 | def kv_cache_factory(): 43 | return create_kv_caches 44 | -------------------------------------------------------------------------------- /vllm/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.models.aquila import AquilaForCausalLM 2 | from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM, 3 | BaichuanForCausalLM) 4 | from vllm.model_executor.models.bloom import BloomForCausalLM 5 | from vllm.model_executor.models.falcon import FalconForCausalLM 6 | from vllm.model_executor.models.gpt2 import GPT2LMHeadModel 7 | from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM 8 | from vllm.model_executor.models.gpt_j import GPTJForCausalLM 9 | from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM 10 | from vllm.model_executor.models.internlm import InternLMForCausalLM 11 | from vllm.model_executor.models.llama import LlamaForCausalLM 12 | from vllm.model_executor.models.mpt import MPTForCausalLM 13 | from vllm.model_executor.models.opt import OPTForCausalLM 14 | from vllm.model_executor.models.qwen import QWenLMHeadModel 15 | from vllm.model_executor.models.mistral import MistralForCausalLM 16 | 17 | __all__ = [ 18 | "AquilaForCausalLM", 19 | "BaiChuanForCausalLM", 20 | "BaichuanForCausalLM", 21 | "BloomForCausalLM", 22 | "FalconForCausalLM", 23 | "GPT2LMHeadModel", 24 | "GPTBigCodeForCausalLM", 25 | "GPTJForCausalLM", 26 | "GPTNeoXForCausalLM", 27 | "InternLMForCausalLM", 28 | "LlamaForCausalLM", 29 | "MPTForCausalLM", 30 | "OPTForCausalLM", 31 | "QWenLMHeadModel", 32 | "MistralForCausalLM", 33 | ] 34 | -------------------------------------------------------------------------------- /tests/models/test_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/models/test_models.py --forked`. 
4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "facebook/opt-125m", 9 | "gpt2", 10 | "bigcode/tiny_starcoder_py", 11 | "EleutherAI/gpt-j-6b", 12 | "EleutherAI/pythia-70m", 13 | "bigscience/bloom-560m", 14 | "mosaicml/mpt-7b", 15 | "tiiuae/falcon-7b", 16 | "meta-llama/Llama-2-7b-hf", 17 | ] 18 | 19 | 20 | @pytest.mark.parametrize("model", MODELS) 21 | @pytest.mark.parametrize("dtype", ["half"]) 22 | @pytest.mark.parametrize("max_tokens", [128]) 23 | def test_models( 24 | hf_runner, 25 | vllm_runner, 26 | example_prompts, 27 | model: str, 28 | dtype: str, 29 | max_tokens: int, 30 | ) -> None: 31 | hf_model = hf_runner(model, dtype=dtype) 32 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 33 | del hf_model 34 | 35 | vllm_model = vllm_runner(model, dtype=dtype) 36 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, hf_output_str = hf_outputs[i] 41 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 42 | assert hf_output_str == vllm_output_str, ( 43 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 44 | assert hf_output_ids == vllm_output_ids, ( 45 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 46 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 |

3 | INFERCEPT: Efficient Intercept Support for Augmented Large Language Model 4 | Inference 5 |

6 | 7 | 8 | 9 | 10 | 11 | This repo contains implementation of InferCept. Please refer to our paper for more details. 12 | --- 13 | ## Instructions 14 | To install InferCept to your environment: 15 | ```bash 16 | # After cloning the repo 17 | cd infercept/ 18 | pip install -e . 19 | ``` 20 | 21 | To enable the serving system to hook on augmentation calls, register your aug-stop token in `vllm/utils.py`. You can register multiple keys at once: 22 | 23 | ```python 24 | def get_api_stop_strings() -> List[str]: 25 | return ["", ""] 26 | ``` 27 | 28 | To reproduce paper results, check `exps` folder. 29 | ## Citation 30 | 31 | If you use InferCept for your research, please cite our paper: 32 | ```bibtex 33 | @inproceedings{ 34 | abhyankar2024infer, 35 | title={INFERCEPT: Efficient Intercept Support for Augmented Large Language Model 36 | Inference}, 37 | author={Reyna Abhyankar and Zijian He and Vikranth Srivatsa and Hao Zhang and Yiying Zhang}, 38 | booktitle={Forty-first International Conference on Machine Learning}, 39 | year={2024}, 40 | month=Jul, 41 | address={Vienna, Austria}, 42 | } 43 | ``` 44 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantized_linear/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.layers.quantized_linear.awq import ( 2 | AWQColumnParallelLinear, AWQRowParallelLinear) 3 | from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear, 4 | RowParallelLinear) 5 | 6 | _QUANTIZED_LINEAR_REGISTRY = { 7 | "awq": (AWQColumnParallelLinear, AWQRowParallelLinear), 8 | } 9 | 10 | 11 | class ParallelLinear: 12 | 13 | @classmethod 14 | def column(cls, *args, **kwargs) -> ColumnParallelLinear: 15 | quant_config = kwargs.get("quant_config", None) 16 | if quant_config is None: 17 | return ColumnParallelLinear(*args, **kwargs) 18 | 19 | name = quant_config.get_name() 20 | if name not in _QUANTIZED_LINEAR_REGISTRY: 21 | raise ValueError(f"No quantized linear is found for {name}") 22 | 23 | quant_linear_cls = _QUANTIZED_LINEAR_REGISTRY[name][0] 24 | return quant_linear_cls(*args, **kwargs) 25 | 26 | @classmethod 27 | def row(cls, *args, **kwargs) -> RowParallelLinear: 28 | quant_config = kwargs.get("quant_config", None) 29 | if quant_config is None: 30 | return RowParallelLinear(*args, **kwargs) 31 | 32 | name = quant_config.get_name() 33 | if name not in _QUANTIZED_LINEAR_REGISTRY: 34 | raise ValueError(f"No quantized linear is found for {name}") 35 | 36 | quant_linear_cls = _QUANTIZED_LINEAR_REGISTRY[name][1] 37 | return quant_linear_cls(*args, **kwargs) 38 | -------------------------------------------------------------------------------- /vllm/core/policy.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | from vllm.sequence import SequenceGroup 4 | 5 | 6 | class Policy: 7 | 8 | def get_priority( 9 | self, 10 | now: float, 11 | seq_group: SequenceGroup, 12 | ) -> float: 13 | raise NotImplementedError 14 | 15 | def sort_by_priority( 16 | self, 17 | now: float, 18 | seq_groups: List[SequenceGroup], 19 | ) -> List[SequenceGroup]: 20 | return sorted( 21 | seq_groups, 22 | key=lambda seq_group: self.get_priority(now, seq_group), 23 | reverse=True, 24 | ) 25 | 26 | 27 | class FCFS(Policy): 28 | 29 | def get_priority( 30 | self, 31 | now: float, 32 | seq_group: SequenceGroup, 33 | ) -> float: 34 | return now - seq_group.arrival_time 35 | 36 | class 
Chunked_FCFS(Policy): 37 | 38 | def get_priority( 39 | self, 40 | now: float, 41 | seq_group: SequenceGroup, 42 | ) -> Tuple[int, float]: 43 | return -seq_group.get_seqs()[0].data.logical_query_len, now - seq_group.arrival_time 44 | 45 | class LongestRemainingAPIFirst(Policy): 46 | 47 | def get_priority( 48 | self, 49 | now: float, 50 | seq_group: SequenceGroup, 51 | ) -> float: 52 | return seq_group.api_remaining_time(now) 53 | 54 | class PolicyFactory: 55 | 56 | _POLICY_REGISTRY = { 57 | 'fcfs': FCFS, 58 | 'c-fcfs': Chunked_FCFS, 59 | 'lra': LongestRemainingAPIFirst, 60 | } 61 | 62 | @classmethod 63 | def get_policy(cls, policy_name: str, **kwargs) -> Policy: 64 | return cls._POLICY_REGISTRY[policy_name](**kwargs) 65 | -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using beam search. 2 | 3 | Run `pytest tests/samplers/test_beam_search.py --forked`. 4 | """ 5 | import pytest 6 | 7 | # FIXME(zhuohan): The test can not pass if we: 8 | # 1. Increase max_tokens to 256. 9 | # 2. Increase beam_width to 8. 10 | # 3. Use the model "huggyllama/llama-7b". 11 | MAX_TOKENS = [128] 12 | BEAM_WIDTHS = [4] 13 | MODELS = ["facebook/opt-125m"] 14 | 15 | 16 | @pytest.mark.parametrize("model", MODELS) 17 | @pytest.mark.parametrize("dtype", ["half"]) 18 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS) 19 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) 20 | def test_beam_search_single_input( 21 | hf_runner, 22 | vllm_runner, 23 | example_prompts, 24 | model: str, 25 | dtype: str, 26 | max_tokens: int, 27 | beam_width: int, 28 | ) -> None: 29 | hf_model = hf_runner(model, dtype=dtype) 30 | hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, 31 | max_tokens) 32 | del hf_model 33 | 34 | vllm_model = vllm_runner(model, dtype=dtype) 35 | vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, 36 | max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, _ = hf_outputs[i] 41 | vllm_output_ids, _ = vllm_outputs[i] 42 | assert len(hf_output_ids) == len(vllm_output_ids) 43 | for j in range(len(hf_output_ids)): 44 | assert hf_output_ids[j] == vllm_output_ids[j], ( 45 | f"Test{i} output{j}:\nHF: {hf_output_ids}\n" 46 | f"vLLM: {vllm_output_ids}") 47 | -------------------------------------------------------------------------------- /docs/source/serving/distributed_serving.rst: -------------------------------------------------------------------------------- 1 | .. _distributed_serving: 2 | 3 | Distributed Inference and Serving 4 | ================================= 5 | 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with `Ray `_. To run distributed inference, install Ray with: 7 | 8 | .. code-block:: console 9 | 10 | $ pip install ray 11 | 12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: 13 | 14 | .. code-block:: python 15 | 16 | from vllm import LLM 17 | llm = LLM("facebook/opt-13b", tensor_parallel_size=4) 18 | output = llm.generate("San Franciso is a") 19 | 20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. 
For example, to run API server on 4 GPUs: 21 | 22 | .. code-block:: console 23 | 24 | $ python -m vllm.entrypoints.api_server \ 25 | $ --model facebook/opt-13b \ 26 | $ --tensor-parallel-size 4 27 | 28 | To scale vLLM beyond a single machine, start a `Ray runtime `_ via CLI before running vLLM: 29 | 30 | .. code-block:: console 31 | 32 | $ # On head node 33 | $ ray start --head 34 | 35 | $ # On worker nodes 36 | $ ray start --address= 37 | 38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. -------------------------------------------------------------------------------- /tests/async_engine/api_server_async_engine.py: -------------------------------------------------------------------------------- 1 | """vllm.entrypoints.api_server with some extra logging for testing.""" 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | import uvicorn 6 | from fastapi.responses import JSONResponse, Response 7 | 8 | import vllm.entrypoints.api_server 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | 12 | app = vllm.entrypoints.api_server.app 13 | 14 | 15 | class AsyncLLMEngineWithStats(AsyncLLMEngine): 16 | 17 | # pylint: disable=redefined-outer-name 18 | def __init__(self, *args, **kwargs): 19 | super().__init__(*args, **kwargs) 20 | self._num_aborts = 0 21 | 22 | async def abort(self, request_id: str) -> None: 23 | await super().abort(request_id) 24 | self._num_aborts += 1 25 | 26 | def testing_stats(self) -> Dict[str, Any]: 27 | return {"num_aborted_requests": self._num_aborts} 28 | 29 | 30 | @app.get("/stats") 31 | def stats() -> Response: 32 | """Get the statistics of the engine.""" 33 | return JSONResponse(engine.testing_stats()) 34 | 35 | 36 | if __name__ == "__main__": 37 | parser = argparse.ArgumentParser() 38 | parser.add_argument("--host", type=str, default="localhost") 39 | parser.add_argument("--port", type=int, default=8000) 40 | parser = AsyncEngineArgs.add_cli_args(parser) 41 | args = parser.parse_args() 42 | 43 | engine_args = AsyncEngineArgs.from_cli_args(args) 44 | engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) 45 | vllm.entrypoints.api_server.engine = engine 46 | uvicorn.run( 47 | app, 48 | host=args.host, 49 | port=args.port, 50 | log_level="debug", 51 | timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) 52 | -------------------------------------------------------------------------------- /csrc/reduction_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | namespace vllm { 21 | 22 | template 23 | __inline__ __device__ T warpReduceSum(T val) { 24 | #pragma unroll 25 | for (int mask = 16; mask > 0; mask >>= 1) 26 | val += __shfl_xor_sync(0xffffffff, val, mask, 32); 27 | return val; 28 | } 29 | 30 | /* Calculate the sum of all elements in a block */ 31 | template 32 | __inline__ __device__ T blockReduceSum(T val) { 33 | static __shared__ T shared[32]; 34 | int lane = threadIdx.x & 0x1f; 35 | int wid = threadIdx.x >> 5; 36 | 37 | val = warpReduceSum(val); 38 | 39 | if (lane == 0) 40 | shared[wid] = val; 41 | 42 | __syncthreads(); 43 | 44 | // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent 45 | // blockDim.x is not divided by 32 46 | val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); 47 | val = warpReduceSum(val); 48 | return val; 49 | } 50 | 51 | } // namespace vllm 52 | -------------------------------------------------------------------------------- /vllm/logger.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py 3 | """Logging configuration for vLLM.""" 4 | import logging 5 | import sys 6 | 7 | _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" 8 | _DATE_FORMAT = "%m-%d %H:%M:%S" 9 | 10 | 11 | class NewLineFormatter(logging.Formatter): 12 | """Adds logging prefix to newlines to align multi-line messages.""" 13 | 14 | def __init__(self, fmt, datefmt=None): 15 | logging.Formatter.__init__(self, fmt, datefmt) 16 | 17 | def format(self, record): 18 | msg = logging.Formatter.format(self, record) 19 | if record.message != "": 20 | parts = msg.split(record.message) 21 | msg = msg.replace("\n", "\r\n" + parts[0]) 22 | return msg 23 | 24 | 25 | _root_logger = logging.getLogger("vllm") 26 | _default_handler = None 27 | 28 | 29 | def _setup_logger(): 30 | _root_logger.setLevel(logging.DEBUG) 31 | global _default_handler 32 | if _default_handler is None: 33 | _default_handler = logging.StreamHandler(sys.stdout) 34 | _default_handler.flush = sys.stdout.flush # type: ignore 35 | _default_handler.setLevel(logging.INFO) 36 | _root_logger.addHandler(_default_handler) 37 | fmt = NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) 38 | _default_handler.setFormatter(fmt) 39 | # Setting this will avoid the message 40 | # being propagated to the parent logger. 41 | _root_logger.propagate = False 42 | 43 | 44 | # The logger is initialized when the module is imported. 45 | # This is thread-safe as the module is only imported once, 46 | # guaranteed by the Python GIL. 47 | _setup_logger() 48 | 49 | 50 | def init_logger(name: str): 51 | return logging.getLogger(name) 52 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/activation.py: -------------------------------------------------------------------------------- 1 | """Custom activation functions.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import activation_ops 6 | 7 | 8 | class SiluAndMul(nn.Module): 9 | """An activation function for SwiGLU. 10 | 11 | The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[1] // 2. 
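(That is, the input is split in half along its feature dimension; the first half is passed through SiLU and multiplied elementwise by the second half.)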
12 | 13 | Shapes: 14 | x: (num_tokens, 2 * d) 15 | return: (num_tokens, d) 16 | """ 17 | 18 | def forward(self, x: torch.Tensor) -> torch.Tensor: 19 | num_tokens = x.shape[0] 20 | d = x.shape[1] // 2 21 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 22 | activation_ops.silu_and_mul(out, x) 23 | return out 24 | 25 | 26 | class NewGELU(nn.Module): 27 | 28 | def forward(self, x: torch.Tensor) -> torch.Tensor: 29 | num_tokens = x.shape[0] 30 | d = x.shape[1] 31 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 32 | activation_ops.gelu_new(out, x) 33 | return out 34 | 35 | 36 | class FastGELU(nn.Module): 37 | 38 | def forward(self, x: torch.Tensor) -> torch.Tensor: 39 | num_tokens = x.shape[0] 40 | d = x.shape[1] 41 | out = torch.empty(num_tokens, d, dtype=x.dtype, device=x.device) 42 | activation_ops.gelu_fast(out, x) 43 | return out 44 | 45 | 46 | _ACTIVATION_REGISTRY = { 47 | "gelu": nn.GELU(), 48 | "gelu_fast": FastGELU(), 49 | "gelu_new": NewGELU(), 50 | "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), 51 | "relu": nn.ReLU(), 52 | } 53 | 54 | 55 | def get_act_fn(act_fn: str) -> nn.Module: 56 | """Get an activation function by name.""" 57 | act_fn = act_fn.lower() 58 | if act_fn in _ACTIVATION_REGISTRY: 59 | return _ACTIVATION_REGISTRY[act_fn] 60 | raise ValueError(f"Activation function {act_fn!r} is not supported.") 61 | -------------------------------------------------------------------------------- /examples/gradio_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import gradio as gr 5 | import requests 6 | 7 | 8 | def http_bot(prompt): 9 | headers = {"User-Agent": "vLLM Client"} 10 | pload = { 11 | "prompt": prompt, 12 | "stream": True, 13 | "max_tokens": 128, 14 | } 15 | response = requests.post(args.model_url, 16 | headers=headers, 17 | json=pload, 18 | stream=True) 19 | 20 | for chunk in response.iter_lines(chunk_size=8192, 21 | decode_unicode=False, 22 | delimiter=b"\0"): 23 | if chunk: 24 | data = json.loads(chunk.decode("utf-8")) 25 | output = data["text"][0] 26 | yield output 27 | 28 | 29 | def build_demo(): 30 | with gr.Blocks() as demo: 31 | gr.Markdown("# vLLM text completion demo\n") 32 | inputbox = gr.Textbox(label="Input", 33 | placeholder="Enter text and press ENTER") 34 | outputbox = gr.Textbox(label="Output", 35 | placeholder="Generated result from the model") 36 | inputbox.submit(http_bot, [inputbox], [outputbox]) 37 | return demo 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", type=str, default="localhost") 43 | parser.add_argument("--port", type=int, default=8001) 44 | parser.add_argument("--model-url", 45 | type=str, 46 | default="http://localhost:8000/generate") 47 | args = parser.parse_args() 48 | 49 | demo = build_demo() 50 | demo.queue(concurrency_count=100).launch(server_name=args.host, 51 | server_port=args.port, 52 | share=True) 53 | -------------------------------------------------------------------------------- /examples/llm_engine_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from vllm import EngineArgs, LLMEngine, SamplingParams 4 | 5 | 6 | def main(args: argparse.Namespace): 7 | # Parse the CLI argument and initialize the engine. 8 | engine_args = EngineArgs.from_cli_args(args) 9 | engine = LLMEngine.from_engine_args(engine_args) 10 | 11 | # Test the following prompts. 
12 | test_prompts = [ 13 | ("A robot may not injure a human being", 14 | SamplingParams(temperature=0.0)), 15 | ("To be or not to be,", 16 | SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), 17 | ("What is the meaning of life?", 18 | SamplingParams(n=2, 19 | best_of=5, 20 | temperature=0.8, 21 | top_p=0.95, 22 | frequency_penalty=0.1)), 23 | ("It is only with the heart that one can see rightly", 24 | SamplingParams(n=3, best_of=3, use_beam_search=True, 25 | temperature=0.0)), 26 | ] 27 | 28 | # Run the engine by calling `engine.step()` manually. 29 | request_id = 0 30 | while True: 31 | # To test continuous batching, we add one request at each step. 32 | if test_prompts: 33 | prompt, sampling_params = test_prompts.pop(0) 34 | engine.add_request(str(request_id), prompt, sampling_params) 35 | request_id += 1 36 | 37 | request_outputs = engine.step() 38 | for request_output in request_outputs: 39 | if request_output.finished: 40 | print(request_output) 41 | 42 | if not (engine.has_unfinished_requests() or test_prompts): 43 | break 44 | 45 | 46 | if __name__ == '__main__': 47 | parser = argparse.ArgumentParser( 48 | description='Demo on using the LLMEngine class directly') 49 | parser = EngineArgs.add_cli_args(parser) 50 | args = parser.parse_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/communication_op.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from vllm.model_executor.parallel_utils.parallel_state import ( 4 | get_tensor_model_parallel_world_size, 5 | get_tensor_model_parallel_group, 6 | ) 7 | 8 | 9 | def tensor_model_parallel_all_reduce(input_): 10 | """All-reduce the input tensor across model parallel group. 11 | 12 | Note: This operation is applied in-place on the input tensor. 13 | """ 14 | # Bypass the function if we are using only 1 GPU. 15 | if get_tensor_model_parallel_world_size() == 1: 16 | return input_ 17 | # All-reduce. 18 | torch.distributed.all_reduce(input_, 19 | group=get_tensor_model_parallel_group()) 20 | return input_ 21 | 22 | 23 | def tensor_model_parallel_all_gather(input_, dim=-1): 24 | """All-gather the input tensor across model parallel group.""" 25 | world_size = get_tensor_model_parallel_world_size() 26 | # Bypass the function if we are using only 1 GPU. 27 | if world_size == 1: 28 | return input_ 29 | assert -input_.dim() <= dim < input_.dim(), ( 30 | f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") 31 | if dim < 0: 32 | # Convert negative dim to positive. 33 | dim += input_.dim() 34 | input_size = input_.size() 35 | # Allocate output tensor. 36 | output_tensor = torch.empty((world_size, ) + input_size, 37 | dtype=input_.dtype, 38 | device=input_.device) 39 | # All-gather. 
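# (Each rank contributes a tensor of shape `input_size`; the gather below fills a
# (world_size, *input_size) tensor, and the reshape that follows folds that leading
# dimension into `dim`, so only the gathered dimension grows by world_size.)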
40 | torch.distributed.all_gather_into_tensor(
41 | output_tensor, input_, group=get_tensor_model_parallel_group())
42 | # Reshape
43 | output_tensor = output_tensor.movedim(0, dim)
44 | output_tensor = output_tensor.reshape(input_size[:dim] +
45 | (world_size * input_size[dim], ) +
46 | input_size[dim + 1:])
47 | return output_tensor
48 |
-------------------------------------------------------------------------------- /csrc/attention/attention_generic.cuh: --------------------------------------------------------------------------------
1 | /*
2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
3 | * Copyright (c) 2023, The vLLM team.
4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | #pragma once
19 |
20 | #include <stdint.h>
21 |
22 | namespace vllm {
23 |
24 | // A vector type to store Q, K, V elements.
25 | template<typename T, int VEC_SIZE>
26 | struct Vec {};
27 |
28 | // A vector type to store FP32 accumulators.
29 | template<typename T>
30 | struct FloatVec {};
31 |
32 | // Template vector operations.
33 | template<typename Acc, typename A, typename B>
34 | inline __device__ Acc mul(A a, B b);
35 |
36 | template<typename T>
37 | inline __device__ float sum(T v);
38 |
39 | template<typename T>
40 | inline __device__ float dot(T a, T b) {
41 | return sum(mul<T, T, T>(a, b));
42 | }
43 |
44 | template<typename A, typename T>
45 | inline __device__ float dot(T a, T b) {
46 | return sum(mul<A, T, T>(a, b));
47 | }
48 |
49 | template<typename T>
50 | inline __device__ void zero(T& dst) {
51 | constexpr int WORDS = sizeof(T) / 4;
52 | union {
53 | T raw;
54 | uint32_t words[WORDS];
55 | } tmp;
56 |
57 | #pragma unroll
58 | for (int ii = 0; ii < WORDS; ++ii) {
59 | tmp.words[ii] = 0u;
60 | }
61 | dst = tmp.raw;
62 | }
63 |
64 | } // namespace vllm
65 |
-------------------------------------------------------------------------------- /csrc/cache.cpp: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 |
3 | #include <map>
4 | #include <vector>
5 |
6 | void swap_blocks(
7 | torch::Tensor& src,
8 | torch::Tensor& dst,
9 | const std::map<int64_t, int64_t>& block_mapping);
10 |
11 | // void swap_blocks_new(
12 | // std::vector<torch::Tensor>& key_caches,
13 | // std::vector<torch::Tensor>& value_caches,
14 | // const std::map<int64_t, int64_t>& block_mapping);
15 |
16 | void copy_blocks(
17 | std::vector<torch::Tensor>& key_caches,
18 | std::vector<torch::Tensor>& value_caches,
19 | const std::map<int64_t, std::vector<int64_t>>& block_mapping);
20 |
21 | void reshape_and_cache(
22 | torch::Tensor& key,
23 | torch::Tensor& value,
24 | torch::Tensor& key_cache,
25 | torch::Tensor& value_cache,
26 | torch::Tensor& slot_mapping);
27 |
28 | void new_reshape_and_cache(
29 | torch::Tensor& key,
30 | torch::Tensor& value,
31 | torch::Tensor& key_cache,
32 | torch::Tensor& value_cache,
33 | torch::Tensor& slot_mapping);
34 |
35 | void gather_cached_kv(
36 | torch::Tensor& key,
37 | torch::Tensor& value,
38 | torch::Tensor& key_cache,
39 | torch::Tensor& value_cache,
40 | torch::Tensor& slot_mapping);
41 |
42 |
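// The bindings below expose the cache kernels declared above to Python as a
// C++/CUDA extension module; each m.def registers one op.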
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 43 | m.def( 44 | "swap_blocks", 45 | &swap_blocks, 46 | "Swap in (out) the cache blocks from src to dst"); 47 | // m.def( 48 | // "swap_blocks_new", 49 | // &swap_blocks_new, 50 | // "Swap in (out) the cache blocks from src to dst"); 51 | m.def( 52 | "copy_blocks", 53 | ©_blocks, 54 | "Copy the cache blocks from src to dst"); 55 | m.def( 56 | "reshape_and_cache", 57 | &reshape_and_cache, 58 | "Reshape the key and value tensors and cache them"); 59 | m.def( 60 | "new_reshape_and_cache", 61 | &new_reshape_and_cache, 62 | "Reshape the key and value tensors and cache them"); 63 | m.def( 64 | "gather_cached_kv", 65 | &gather_cached_kv, 66 | "Gather key and value from the cache into contiguous QKV tensors"); 67 | } 68 | -------------------------------------------------------------------------------- /tests/kernels/test_layernorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm import layernorm_ops 6 | 7 | DTYPES = [torch.half, torch.bfloat16, torch.float] 8 | HIDDEN_SIZES = [67, 768, 2048, 5120, 8192] # Arbitrary values for testing 9 | NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing 10 | SEEDS = [0] 11 | 12 | 13 | class RefRMSNorm(nn.Module): 14 | 15 | def __init__(self, hidden_size, eps=1e-6): 16 | super().__init__() 17 | weight = torch.empty(hidden_size) 18 | weight.normal_(mean=1.0, std=0.1) 19 | self.weight = nn.Parameter(weight) 20 | self.variance_epsilon = eps 21 | 22 | def forward(self, hidden_states): 23 | input_dtype = hidden_states.dtype 24 | hidden_states = hidden_states.to(torch.float32) 25 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 26 | hidden_states = hidden_states * torch.rsqrt(variance + 27 | self.variance_epsilon) 28 | return self.weight * hidden_states.to(input_dtype) 29 | 30 | 31 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 32 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 33 | @pytest.mark.parametrize("dtype", DTYPES) 34 | @pytest.mark.parametrize("seed", SEEDS) 35 | @torch.inference_mode() 36 | def test_rms_norm( 37 | num_tokens: int, 38 | hidden_size: int, 39 | dtype: torch.dtype, 40 | seed: int, 41 | ) -> None: 42 | torch.random.manual_seed(seed) 43 | torch.cuda.manual_seed(seed) 44 | 45 | scale = float(hidden_size**-0.5) 46 | x = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda") 47 | x.uniform_(-scale, scale) 48 | ref = RefRMSNorm(hidden_size).to(dtype).cuda() 49 | 50 | out = torch.empty_like(x) 51 | layernorm_ops.rms_norm( 52 | out, 53 | x, 54 | ref.weight.data, 55 | ref.variance_epsilon, 56 | ) 57 | ref_out = ref(x) 58 | assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-5) 59 | -------------------------------------------------------------------------------- /csrc/attention/attention_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | #pragma once
19 |
20 | #include "attention_dtypes.h"
21 |
22 | #include <float.h>
23 | #include <type_traits>
24 |
25 | namespace vllm {
26 |
27 | // Q*K^T operation.
28 | template<int THREAD_GROUP_SIZE, typename Vec, int N>
29 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
30 | using A_vec = typename FloatVec<Vec>::Type;
31 | // Compute the parallel products for Q*K^T (treat vector lanes separately).
32 | A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
33 | #pragma unroll
34 | for (int ii = 1; ii < N; ++ii) {
35 | qk_vec = fma(q[ii], k[ii], qk_vec);
36 | }
37 |
38 | // Finalize the reduction across lanes.
39 | float qk = sum(qk_vec);
40 | #pragma unroll
41 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
42 | qk += __shfl_xor_sync(uint32_t(-1), qk, mask);
43 | }
44 | return qk;
45 | }
46 |
47 | template<typename T, int THREAD_GROUP_SIZE>
48 | struct Qk_dot {
49 | template<typename Vec, int N>
50 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
51 | return qk_dot_<THREAD_GROUP_SIZE>(q, k);
52 | }
53 | };
54 |
55 | } // namespace vllm
56 |
-------------------------------------------------------------------------------- /vllm/transformers_utils/config.py: --------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from transformers import AutoConfig, PretrainedConfig
4 |
5 | from vllm.transformers_utils.configs import * # pylint: disable=wildcard-import
6 |
7 | _CONFIG_REGISTRY = {
8 | "mpt": MPTConfig,
9 | "baichuan": BaiChuanConfig,
10 | "aquila": AquilaConfig,
11 | "qwen": QWenConfig,
12 | "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct)
13 | "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct)
14 | }
15 |
16 |
17 | def get_config(model: str,
18 | trust_remote_code: bool,
19 | revision: Optional[str] = None) -> PretrainedConfig:
20 | # NOTE: Because the Mistral model in HF hub does not have
21 | # `configuration_mistral.py`, we cannot use `AutoConfig` to load the
22 | # config. Instead, we use `MistralConfig` directly.
23 | # NOTE: This is a hack. This does not work for local models.
24 | # FIXME: Remove this once the Mistral model is available in the stable
25 | # version of HF transformers.
26 | if "mistral" in model.lower():
27 | return MistralConfig.from_pretrained(model, revision=revision)
28 |
29 | try:
30 | config = AutoConfig.from_pretrained(
31 | model, trust_remote_code=trust_remote_code, revision=revision)
32 | except ValueError as e:
33 | if (not trust_remote_code and
34 | "requires you to execute the configuration file" in str(e)):
35 | err_msg = (
36 | "Failed to load the model config. 
If the model is a custom "
37 | "model not yet available in the HuggingFace transformers "
38 | "library, consider setting `trust_remote_code=True` in LLM "
39 | "or using the `--trust-remote-code` flag in the CLI.")
40 | raise RuntimeError(err_msg) from e
41 | else:
42 | raise e
43 | if config.model_type in _CONFIG_REGISTRY:
44 | config_class = _CONFIG_REGISTRY[config.model_type]
45 | config = config_class.from_pretrained(model, revision=revision)
46 | return config
47 |
-------------------------------------------------------------------------------- /vllm/block.py: --------------------------------------------------------------------------------
1 | """Token blocks."""
2 | from typing import List
3 |
4 | from vllm.utils import Device
5 |
6 | _BLANK_TOKEN_ID = -1
7 |
8 |
9 | class LogicalTokenBlock:
10 | """A block that stores a contiguous chunk of tokens from left to right.
11 |
12 | Logical blocks are used to represent the states of the corresponding
13 | physical blocks in the KV cache.
14 | """
15 |
16 | def __init__(
17 | self,
18 | block_number: int,
19 | block_size: int,
20 | ) -> None:
21 | self.block_number = block_number
22 | self.block_size = block_size
23 |
24 | self.token_ids = [_BLANK_TOKEN_ID] * block_size
25 | self.num_tokens = 0
26 |
27 | def is_empty(self) -> bool:
28 | return self.num_tokens == 0
29 |
30 | def get_num_empty_slots(self) -> int:
31 | return self.block_size - self.num_tokens
32 |
33 | def is_full(self) -> bool:
34 | return self.num_tokens == self.block_size
35 |
36 | def append_tokens(self, token_ids: List[int]) -> None:
37 | assert len(token_ids) <= self.get_num_empty_slots()
38 | curr_idx = self.num_tokens
39 | self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids
40 | self.num_tokens += len(token_ids)
41 |
42 | def get_token_ids(self) -> List[int]:
43 | return self.token_ids[:self.num_tokens]
44 |
45 | def get_last_token_id(self) -> int:
46 | assert self.num_tokens > 0
47 | return self.token_ids[self.num_tokens - 1]
48 |
49 |
50 | class PhysicalTokenBlock:
51 | """Represents the state of a block in the KV cache."""
52 |
53 | def __init__(
54 | self,
55 | device: Device,
56 | block_number: int,
57 | block_size: int,
58 | ) -> None:
59 | self.device = device
60 | self.block_number = block_number
61 | self.block_size = block_size
62 |
63 | self.ref_count = 0
64 |
65 | def __repr__(self) -> str:
66 | return (f'PhysicalTokenBlock(device={self.device}, '
67 | f'block_number={self.block_number}, '
68 | f'ref_count={self.ref_count})')
69 |
-------------------------------------------------------------------------------- /csrc/layernorm_kernels.cu: --------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 |
4 | #include "dispatch_utils.h"
5 | #include "reduction_utils.cuh"
6 |
7 | namespace vllm {
8 |
9 | // TODO(woosuk): Further optimize this kernel.
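// RMSNorm: out = x * rsqrt(mean(x^2) + epsilon) * weight, computed per token.
// One thread block handles one token; blockReduceSum reduces the per-thread
// partial sums of x^2 over hidden_size across the block.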
10 | template<typename scalar_t>
11 | __global__ void rms_norm_kernel(
12 | scalar_t* __restrict__ out, // [num_tokens, hidden_size]
13 | const scalar_t* __restrict__ input, // [num_tokens, hidden_size]
14 | const scalar_t* __restrict__ weight, // [hidden_size]
15 | const float epsilon,
16 | const int num_tokens,
17 | const int hidden_size) {
18 | __shared__ float s_variance;
19 | float variance = 0.0f;
20 |
21 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
22 | const float x = (float) input[blockIdx.x * hidden_size + idx];
23 | variance += x * x;
24 | }
25 | variance = blockReduceSum<float>(variance);
26 | if (threadIdx.x == 0) {
27 | s_variance = rsqrtf(variance / hidden_size + epsilon);
28 | }
29 | __syncthreads();
30 |
31 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) {
32 | float x = (float) input[blockIdx.x * hidden_size + idx];
33 | out[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx];
34 | }
35 | }
36 |
37 | } // namespace vllm
38 |
39 | void rms_norm(
40 | torch::Tensor& out, // [num_tokens, hidden_size]
41 | torch::Tensor& input, // [num_tokens, hidden_size]
42 | torch::Tensor& weight, // [hidden_size]
43 | float epsilon) {
44 | int num_tokens = input.size(0);
45 | int hidden_size = input.size(1);
46 |
47 | dim3 grid(num_tokens);
48 | dim3 block(std::min(hidden_size, 1024));
49 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
50 | VLLM_DISPATCH_FLOATING_TYPES(
51 | input.scalar_type(),
52 | "rms_norm_kernel",
53 | [&] {
54 | vllm::rms_norm_kernel<scalar_t><<<grid, block, 0, stream>>>(
55 | out.data_ptr<scalar_t>(),
56 | input.data_ptr<scalar_t>(),
57 | weight.data_ptr<scalar_t>(),
58 | epsilon,
59 | num_tokens,
60 | hidden_size);
61 | });
62 | }
63 |
-------------------------------------------------------------------------------- /vllm/transformers_utils/configs/qwen.py: --------------------------------------------------------------------------------
1 | # Copyright (c) Alibaba Cloud.
2 | # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE 3 | 4 | from transformers import PretrainedConfig 5 | 6 | 7 | class QWenConfig(PretrainedConfig): 8 | model_type = "qwen" 9 | keys_to_ignore_at_inference = ["past_key_values"] 10 | 11 | def __init__( 12 | self, 13 | vocab_size=151936, 14 | hidden_size=4096, 15 | num_hidden_layers=32, 16 | num_attention_heads=32, 17 | emb_dropout_prob=0.0, 18 | attn_dropout_prob=0.0, 19 | layer_norm_epsilon=1e-6, 20 | initializer_range=0.02, 21 | max_position_embeddings=8192, 22 | scale_attn_weights=True, 23 | use_cache=True, 24 | bf16=False, 25 | fp16=False, 26 | fp32=False, 27 | kv_channels=128, 28 | rotary_pct=1.0, 29 | rotary_emb_base=10000, 30 | use_dynamic_ntk=True, 31 | use_logn_attn=True, 32 | use_flash_attn="auto", 33 | intermediate_size=22016, 34 | no_bias=True, 35 | tie_word_embeddings=False, 36 | **kwargs, 37 | ): 38 | self.vocab_size = vocab_size 39 | self.hidden_size = hidden_size 40 | self.intermediate_size = intermediate_size 41 | self.num_hidden_layers = num_hidden_layers 42 | self.num_attention_heads = num_attention_heads 43 | self.emb_dropout_prob = emb_dropout_prob 44 | self.attn_dropout_prob = attn_dropout_prob 45 | self.layer_norm_epsilon = layer_norm_epsilon 46 | self.initializer_range = initializer_range 47 | self.scale_attn_weights = scale_attn_weights 48 | self.use_cache = use_cache 49 | self.max_position_embeddings = max_position_embeddings 50 | self.bf16 = bf16 51 | self.fp16 = fp16 52 | self.fp32 = fp32 53 | self.kv_channels = kv_channels 54 | self.rotary_pct = rotary_pct 55 | self.rotary_emb_base = rotary_emb_base 56 | self.use_dynamic_ntk = use_dynamic_ntk 57 | self.use_logn_attn = use_logn_attn 58 | self.use_flash_attn = use_flash_attn 59 | self.no_bias = no_bias 60 | super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) 61 | -------------------------------------------------------------------------------- /tests/engine/test_detokenize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import detokenize_incrementally 6 | 7 | TRUTH = [ 8 | # pylint: disable=line-too-long 9 | "Hello here, this is a simple test", 10 | "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 
It is designed to be used in production environments, where inference and serving", 11 | "我很感谢你的热情" 12 | ] 13 | TOKENIZERS = [ 14 | "facebook/opt-125m", 15 | "gpt2", 16 | "bigcode/tiny_starcoder_py", 17 | "EleutherAI/gpt-j-6b", 18 | "EleutherAI/pythia-70m", 19 | "bigscience/bloom-560m", 20 | "mosaicml/mpt-7b", 21 | "tiiuae/falcon-7b", 22 | "meta-llama/Llama-2-7b-hf", 23 | "codellama/CodeLlama-7b-hf", 24 | ] 25 | 26 | 27 | def _run_incremental_decode(tokenizer, all_input_ids, 28 | skip_special_tokens: bool): 29 | decoded_text = "" 30 | offset = 0 31 | token_offset = 0 32 | prev_tokens = None 33 | for i in range(len(all_input_ids)): 34 | new_tokens, text, offset, token_offset = detokenize_incrementally( 35 | tokenizer, 36 | all_input_ids[:i + 1], 37 | prev_tokens, 38 | offset, 39 | token_offset, 40 | skip_special_tokens=skip_special_tokens) 41 | decoded_text += text 42 | if prev_tokens is None: 43 | prev_tokens = new_tokens 44 | else: 45 | prev_tokens += new_tokens 46 | return decoded_text 47 | 48 | 49 | @pytest.mark.parametrize("truth", TRUTH) 50 | @pytest.mark.parametrize("tokenizer_id", TOKENIZERS) 51 | @pytest.mark.parametrize("skip_special_tokens", (True, False)) 52 | def test_decode_streaming(tokenizer_id, truth, skip_special_tokens): 53 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) 54 | all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"] 55 | if skip_special_tokens: 56 | all_input_ids = ([tokenizer.bos_token_id] 57 | if tokenizer.bos_token_id is not None else 58 | []) + all_input_ids + [tokenizer.eos_token_id] 59 | 60 | decoded_text = _run_incremental_decode( 61 | tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens) 62 | 63 | assert decoded_text == truth 64 | -------------------------------------------------------------------------------- /docs/source/serving/run_on_sky.rst: -------------------------------------------------------------------------------- 1 | .. _on_cloud: 2 | 3 | Running on clouds with SkyPilot 4 | =============================== 5 | 6 | .. raw:: html 7 | 8 |

9-10 | [raw HTML omitted: centered image with alt text "vLLM"]

11 |
12 | vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot <https://github.com/skypilot-org/skypilot>`__, an open-source framework for running LLMs on any cloud.
13 |
14 | To install SkyPilot and set up your cloud credentials, run:
15 |
16 | .. code-block:: console
17 |
18 | $ pip install skypilot
19 | $ sky check
20 |
21 | See the vLLM SkyPilot YAML for serving, `serving.yaml `__.
22 |
23 | .. code-block:: yaml
24 |
25 | resources:
26 | accelerators: A100
27 |
28 | envs:
29 | MODEL_NAME: decapoda-research/llama-13b-hf
30 | TOKENIZER: hf-internal-testing/llama-tokenizer
31 |
32 | setup: |
33 | conda create -n vllm python=3.9 -y
34 | conda activate vllm
35 | git clone https://github.com/vllm-project/vllm.git
36 | cd vllm
37 | pip install .
38 | pip install gradio
39 |
40 | run: |
41 | conda activate vllm
42 | echo 'Starting vllm api server...'
43 | python -u -m vllm.entrypoints.api_server \
44 | --model $MODEL_NAME \
45 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
46 | --tokenizer $TOKENIZER 2>&1 | tee api_server.log &
47 | echo 'Waiting for vllm api server to start...'
48 | while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done
49 | echo 'Starting gradio server...'
50 | python vllm/examples/gradio_webserver.py
51 |
52 | Start serving the LLaMA-13B model on an A100 GPU:
53 |
54 | .. code-block:: console
55 |
56 | $ sky launch serving.yaml
57 |
58 | Check the output of the command. There will be a shareable Gradio link (shown in the last line of the output below). Open it in your browser to use the LLaMA model for text completion.
59 |
60 | .. code-block:: console
61 |
62 | (task, pid=7431) Running on public URL: https://<gradio-hash>.gradio.live
63 |
64 | **Optional**: Serve the 65B model instead of the default 13B and use more GPUs:
65 |
66 | .. code-block:: console
67 |
68 | sky launch -c vllm-serve-new -s serving.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf
69 |
70 |
-------------------------------------------------------------------------------- /tests/async_engine/test_async_llm_engine.py: --------------------------------------------------------------------------------
1 | import asyncio
2 | from dataclasses import dataclass
3 |
4 | import pytest
5 |
6 | from vllm.engine.async_llm_engine import AsyncLLMEngine
7 |
8 |
9 | @dataclass
10 | class RequestOutput:
11 | request_id: int
12 | finished: bool = False
13 |
14 |
15 | class MockEngine:
16 |
17 | def __init__(self):
18 | self.step_calls = 0
19 | self.add_request_calls = 0
20 | self.abort_request_calls = 0
21 | self.request_id = None
22 |
23 | async def step_async(self):
24 | self.step_calls += 1
25 | return [RequestOutput(
26 | request_id=self.request_id)] if self.request_id else []
27 |
28 | def generate(self, request_id):
29 | self.request_id = request_id
30 |
31 | def stop_generating(self):
32 | self.request_id = None
33 |
34 | def add_request(self, **kwargs):
35 | del kwargs # Unused
36 | self.add_request_calls += 1
37 |
38 | def abort_request(self, request_id):
39 | del request_id # Unused
40 | self.abort_request_calls += 1
41 |
42 |
43 | class MockAsyncLLMEngine(AsyncLLMEngine):
44 |
45 | def _init_engine(self, *args, **kwargs):
46 | return MockEngine()
47 |
48 |
49 | @pytest.mark.asyncio
50 | async def test_new_requests_event():
51 | engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False)
52 | engine.start_background_loop()
53 | await asyncio.sleep(0.01)
54 | assert engine.engine.step_calls == 0
55 |
56 | await engine.add_request("1", "", None)
57 | await asyncio.sleep(0.01)
58 | assert 
engine.engine.add_request_calls == 1 59 | assert engine.engine.step_calls == 1 60 | 61 | await engine.add_request("2", "", None) 62 | engine.engine.generate("2") 63 | await asyncio.sleep(0) 64 | assert engine.engine.add_request_calls == 2 65 | assert engine.engine.step_calls == 2 66 | await asyncio.sleep(0) 67 | assert engine.engine.step_calls == 3 68 | engine.engine.stop_generating() 69 | await asyncio.sleep(0) 70 | assert engine.engine.step_calls == 4 71 | await asyncio.sleep(0) 72 | assert engine.engine.step_calls == 4 73 | 74 | await engine.add_request("3", "", None) 75 | await asyncio.sleep(0.01) 76 | assert engine.engine.add_request_calls == 3 77 | assert engine.engine.step_calls == 5 78 | await asyncio.sleep(0.01) 79 | assert engine.engine.add_request_calls == 3 80 | assert engine.engine.step_calls == 5 81 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to vLLM! 2 | ================ 3 | 4 | .. figure:: ./assets/logos/vllm-logo-text-light.png 5 | :width: 60% 6 | :align: center 7 | :alt: vLLM 8 | :class: no-scaled-link 9 | 10 | .. raw:: html 11 | 12 |

13-22 | [raw HTML omitted: centered tagline "Easy, fast, and cheap LLM serving for everyone" and GitHub Star / Watch / Fork buttons]

23 | 24 | 25 | 26 | vLLM is a fast and easy-to-use library for LLM inference and serving. 27 | 28 | vLLM is fast with: 29 | 30 | * State-of-the-art serving throughput 31 | * Efficient management of attention key and value memory with **PagedAttention** 32 | * Continuous batching of incoming requests 33 | * Optimized CUDA kernels 34 | 35 | vLLM is flexible and easy to use with: 36 | 37 | * Seamless integration with popular HuggingFace models 38 | * High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more 39 | * Tensor parallelism support for distributed inference 40 | * Streaming outputs 41 | * OpenAI-compatible API server 42 | 43 | For more information, check out the following: 44 | 45 | * `vLLM announcing blog post `_ (intro to PagedAttention) 46 | * `vLLM paper `_ (SOSP 2023) 47 | * `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency `_ by Cade Daniel et al. 48 | 49 | 50 | 51 | Documentation 52 | ------------- 53 | 54 | .. toctree:: 55 | :maxdepth: 1 56 | :caption: Getting Started 57 | 58 | getting_started/installation 59 | getting_started/quickstart 60 | 61 | .. toctree:: 62 | :maxdepth: 1 63 | :caption: Serving 64 | 65 | serving/distributed_serving 66 | serving/run_on_sky 67 | serving/deploying_with_triton 68 | 69 | .. toctree:: 70 | :maxdepth: 1 71 | :caption: Models 72 | 73 | models/supported_models 74 | models/adding_model 75 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/baichuan.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
20 | 21 | from transformers.configuration_utils import PretrainedConfig 22 | 23 | 24 | class BaiChuanConfig(PretrainedConfig): 25 | model_type = "baichuan" 26 | keys_to_ignore_at_inference = ["past_key_values"] 27 | 28 | def __init__( 29 | self, 30 | vocab_size=64000, 31 | hidden_size=4096, 32 | intermediate_size=11008, 33 | num_hidden_layers=32, 34 | num_attention_heads=32, 35 | hidden_act="silu", 36 | max_position_embeddings=4096, 37 | initializer_range=0.02, 38 | rms_norm_eps=1e-6, 39 | use_cache=True, 40 | pad_token_id=0, 41 | bos_token_id=1, 42 | eos_token_id=2, 43 | tie_word_embeddings=False, 44 | **kwargs, 45 | ): 46 | self.vocab_size = vocab_size 47 | self.max_position_embeddings = max_position_embeddings 48 | self.hidden_size = hidden_size 49 | self.intermediate_size = intermediate_size 50 | self.num_hidden_layers = num_hidden_layers 51 | self.num_attention_heads = num_attention_heads 52 | self.hidden_act = hidden_act 53 | self.initializer_range = initializer_range 54 | self.rms_norm_eps = rms_norm_eps 55 | self.use_cache = use_cache 56 | super().__init__( 57 | pad_token_id=pad_token_id, 58 | bos_token_id=bos_token_id, 59 | eos_token_id=eos_token_id, 60 | tie_word_embeddings=tie_word_embeddings, 61 | **kwargs, 62 | ) 63 | -------------------------------------------------------------------------------- /tests/async_engine/test_request_tracker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.async_llm_engine import RequestTracker 4 | from vllm.outputs import RequestOutput 5 | 6 | 7 | class DummyEvent: 8 | 9 | def __init__(self): 10 | self.flag = False 11 | 12 | def set(self): 13 | self.flag = True 14 | 15 | def clear(self): 16 | self.flag = False 17 | 18 | 19 | def test_request_tracker(): 20 | tracker = RequestTracker() 21 | tracker.new_requests_event = DummyEvent() 22 | stream_1 = tracker.add_request("1") 23 | assert tracker.new_requests_event.flag 24 | new, finished = tracker.get_new_and_finished_requests() 25 | assert not tracker.new_requests_event.flag 26 | assert len(new) == 1 27 | assert new[0]["request_id"] == "1" 28 | assert not finished 29 | assert not stream_1.finished 30 | 31 | stream_2 = tracker.add_request("2") 32 | stream_3 = tracker.add_request("3") 33 | assert tracker.new_requests_event.flag 34 | new, finished = tracker.get_new_and_finished_requests() 35 | assert not tracker.new_requests_event.flag 36 | assert len(new) == 2 37 | assert new[0]["request_id"] == "2" 38 | assert new[1]["request_id"] == "3" 39 | assert not finished 40 | assert not stream_2.finished 41 | assert not stream_3.finished 42 | 43 | # request_ids must be unique 44 | with pytest.raises(KeyError): 45 | tracker.add_request("1") 46 | assert not tracker.new_requests_event.flag 47 | 48 | tracker.abort_request("1") 49 | new, finished = tracker.get_new_and_finished_requests() 50 | assert len(finished) == 1 51 | assert "1" in finished 52 | assert not new 53 | assert stream_1.finished 54 | 55 | stream_4 = tracker.add_request("4") 56 | tracker.abort_request("4") 57 | assert tracker.new_requests_event.flag 58 | new, finished = tracker.get_new_and_finished_requests() 59 | assert len(finished) == 1 60 | assert "4" in finished 61 | assert not new 62 | assert stream_4.finished 63 | 64 | stream_5 = tracker.add_request("5") 65 | assert tracker.new_requests_event.flag 66 | tracker.process_request_output( 67 | RequestOutput("2", "output", [], [], finished=True)) 68 | new, finished = tracker.get_new_and_finished_requests() 69 | assert not 
tracker.new_requests_event.flag 70 | assert len(finished) == 1 71 | assert "2" in finished 72 | assert len(new) == 1 73 | assert new[0]["request_id"] == "5" 74 | assert stream_2.finished 75 | assert not stream_5.finished 76 | -------------------------------------------------------------------------------- /vllm/model_executor/quantization_utils/awq.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | 3 | import torch 4 | 5 | from vllm.model_executor.quantization_utils.base import QuantizationConfig 6 | 7 | 8 | class AWQConfig(QuantizationConfig): 9 | """Config class for AWQ. 10 | 11 | Reference: https://arxiv.org/abs/2306.00978 12 | """ 13 | 14 | def __init__( 15 | self, 16 | weight_bits: int, 17 | group_size: int, 18 | zero_point: bool, 19 | ) -> None: 20 | self.weight_bits = weight_bits 21 | self.group_size = group_size 22 | self.zero_point = zero_point 23 | 24 | if self.weight_bits != 4: 25 | raise ValueError( 26 | "Currently, only 4-bit weight quantization is supported for " 27 | f"AWQ, but got {self.weight_bits} bits.") 28 | self.pack_factor = 32 // self.weight_bits 29 | 30 | def __repr__(self) -> str: 31 | return (f"AWQConfig(weight_bits={self.weight_bits}, " 32 | f"group_size={self.group_size}, " 33 | f"zero_point={self.zero_point})") 34 | 35 | @classmethod 36 | def get_name(cls) -> str: 37 | return "awq" 38 | 39 | @classmethod 40 | def get_supported_act_dtypes(cls) -> List[torch.dtype]: 41 | return [torch.half] 42 | 43 | @classmethod 44 | def get_min_capability(cls) -> int: 45 | # The AWQ kernel only supports Ampere or newer GPUs. 46 | return 80 47 | 48 | @classmethod 49 | def get_config_filenames(cls) -> List[str]: 50 | return [ 51 | "quant_config.json", # E.g., casperhansen/vicuna-7b-v1.5-awq 52 | "quantize_config.json", # E.g., abhinavkulkarni/mosaicml-mpt-7b-instruct-w4-g128-awq # pylint: disable=line-too-long 53 | ] 54 | 55 | @classmethod 56 | def from_config(cls, config: Dict[str, Any]) -> "AWQConfig": 57 | weight_bits = cls.get_from_keys(config, ["w_bit", "bits"]) 58 | group_size = cls.get_from_keys(config, ["q_group_size", "group_size"]) 59 | zero_point = cls.get_from_keys(config, ["zero_point"]) 60 | return cls(weight_bits, group_size, zero_point) 61 | 62 | @classmethod 63 | def get_packed_tensor_names(cls) -> List[str]: 64 | return ["qweight", "qzeros"] 65 | 66 | @classmethod 67 | def get_transposed_tensor_names(cls) -> List[str]: 68 | return ["qweight", "qzeros", "scales"] 69 | 70 | @classmethod 71 | def get_tp_tensor_names(cls) -> List[str]: 72 | return ["qweight", "qzeros", "scales"] 73 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/aquila.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 
11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | """ Aquila model configuration""" 21 | 22 | from transformers import PretrainedConfig 23 | 24 | 25 | class AquilaConfig(PretrainedConfig): 26 | model_type = "aquila" 27 | keys_to_ignore_at_inference = ["past_key_values"] 28 | 29 | def __init__( 30 | self, 31 | vocab_size=100008, 32 | hidden_size=4096, 33 | intermediate_size=11008, 34 | num_hidden_layers=32, 35 | num_attention_heads=32, 36 | hidden_act="silu", 37 | max_position_embeddings=2048, 38 | initializer_range=0.006, 39 | rms_norm_eps=1e-5, 40 | use_cache=True, 41 | pad_token_id=0, 42 | bos_token_id=1, 43 | eos_token_id=2, 44 | tie_word_embeddings=False, 45 | **kwargs, 46 | ): 47 | self.vocab_size = vocab_size 48 | self.max_position_embeddings = max_position_embeddings 49 | self.hidden_size = hidden_size 50 | self.intermediate_size = intermediate_size 51 | self.num_hidden_layers = num_hidden_layers 52 | self.num_attention_heads = num_attention_heads 53 | self.hidden_act = hidden_act 54 | self.initializer_range = initializer_range 55 | self.rms_norm_eps = rms_norm_eps 56 | self.use_cache = use_cache 57 | super().__init__( 58 | pad_token_id=pad_token_id, 59 | bos_token_id=bos_token_id, 60 | eos_token_id=eos_token_id, 61 | tie_word_embeddings=tie_word_embeddings, 62 | **kwargs, 63 | ) 64 | -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'vLLM' 21 | copyright = '2023, vLLM Team' 22 | author = 'the vLLM Team' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.napoleon", 32 | "sphinx.ext.viewcode", 33 | "sphinx.ext.intersphinx", 34 | "sphinx_copybutton", 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 
43 | exclude_patterns = [] 44 | 45 | # Exclude the prompt "$" when copying code 46 | copybutton_prompt_text = r"\$ " 47 | copybutton_prompt_is_regexp = True 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_title = project 55 | html_theme = 'sphinx_book_theme' 56 | html_logo = 'assets/logos/vllm-logo-text-light.png' 57 | html_theme_options = { 58 | 'logo_only': True, 59 | 'path_to_docs': 'docs/source', 60 | 'repository_url': 'https://github.com/vllm-project/vllm', 61 | 'use_repository_button': True, 62 | } 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | html_static_path = ['_static'] 68 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/mistral.py: -------------------------------------------------------------------------------- 1 | # Licensed under the Apache License, Version 2.0 (the "License"); 2 | # you may not use this file except in compliance with the License. 3 | # You may obtain a copy of the License at 4 | # 5 | # http://www.apache.org/licenses/LICENSE-2.0 6 | # 7 | # Unless required by applicable law or agreed to in writing, software 8 | # distributed under the License is distributed on an "AS IS" BASIS, 9 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 10 | # See the License for the specific language governing permissions and 11 | # limitations under the License. 12 | """Mistral-7B-v0.1 configuration""" 13 | from transformers.configuration_utils import PretrainedConfig 14 | 15 | 16 | class MistralConfig(PretrainedConfig): 17 | model_type = "mistral" 18 | keys_to_ignore_at_inference = ["past_key_values"] 19 | 20 | def __init__( 21 | self, 22 | vocab_size=32000, 23 | hidden_size=4096, 24 | intermediate_size=14336, 25 | num_hidden_layers=32, 26 | num_attention_heads=32, 27 | num_key_value_heads=8, 28 | hidden_act="silu", 29 | max_position_embeddings=4096 * 32, 30 | initializer_range=0.02, 31 | rms_norm_eps=1e-6, 32 | use_cache=True, 33 | pad_token_id=None, 34 | bos_token_id=1, 35 | eos_token_id=2, 36 | tie_word_embeddings=False, 37 | rope_theta=10000.0, 38 | sliding_window=4096, 39 | **kwargs, 40 | ): 41 | self.vocab_size = vocab_size 42 | self.max_position_embeddings = max_position_embeddings 43 | self.hidden_size = hidden_size 44 | self.intermediate_size = intermediate_size 45 | self.num_hidden_layers = num_hidden_layers 46 | self.num_attention_heads = num_attention_heads 47 | self.sliding_window = sliding_window 48 | 49 | # for backward compatibility 50 | if num_key_value_heads is None: 51 | num_key_value_heads = num_attention_heads 52 | 53 | self.num_key_value_heads = num_key_value_heads 54 | self.hidden_act = hidden_act 55 | self.initializer_range = initializer_range 56 | self.rms_norm_eps = rms_norm_eps 57 | self.use_cache = use_cache 58 | self.rope_theta = rope_theta 59 | 60 | super().__init__( 61 | pad_token_id=pad_token_id, 62 | bos_token_id=bos_token_id, 63 | eos_token_id=eos_token_id, 64 | tie_word_embeddings=tie_word_embeddings, 65 | **kwargs, 66 | ) 67 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/mpt.py: 
-------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/mosaicml/mpt-7b/blob/main/configuration_mpt.py 3 | from typing import Any, Dict, Optional, Union 4 | 5 | from transformers import PretrainedConfig 6 | 7 | _ATTN_CONFIG_DEFAULTS = { 8 | "attn_type": "multihead_attention", 9 | "attn_pdrop": 0.0, 10 | "attn_impl": "triton", 11 | "qk_ln": False, 12 | "clip_qkv": None, 13 | "softmax_scale": None, 14 | "prefix_lm": False, 15 | "attn_uses_sequence_id": False, 16 | "alibi": False, 17 | "alibi_bias_max": 8, 18 | } 19 | 20 | 21 | class MPTConfig(PretrainedConfig): 22 | model_type = "mpt" 23 | attribute_map = { 24 | "hidden_size": "d_model", 25 | "num_attention_heads": "n_heads", 26 | "num_hidden_layers": "n_layers", 27 | } 28 | 29 | def __init__( 30 | self, 31 | d_model: int = 2048, 32 | n_heads: int = 16, 33 | n_layers: int = 24, 34 | expansion_ratio: int = 4, 35 | max_seq_len: int = 2048, 36 | vocab_size: int = 50368, 37 | resid_pdrop: float = 0.0, 38 | emb_pdrop: float = 0.0, 39 | learned_pos_emb: bool = True, 40 | attn_config: Optional[Dict[str, Any]] = None, 41 | init_device: str = "cpu", 42 | logit_scale: Optional[Union[float, str]] = None, 43 | no_bias: bool = False, 44 | verbose: int = 0, 45 | embedding_fraction: float = 1.0, 46 | norm_type: str = "low_precision_layernorm", 47 | use_cache: bool = False, 48 | **kwargs, 49 | ) -> None: 50 | self.d_model = d_model 51 | self.n_heads = n_heads 52 | self.n_layers = n_layers 53 | self.expansion_ratio = expansion_ratio 54 | self.max_seq_len = max_seq_len 55 | self.vocab_size = vocab_size 56 | self.resid_pdrop = resid_pdrop 57 | self.emb_pdrop = emb_pdrop 58 | self.learned_pos_emb = learned_pos_emb 59 | if attn_config is None: 60 | self.attn_config = _ATTN_CONFIG_DEFAULTS 61 | else: 62 | self.attn_config = attn_config 63 | self.init_device = init_device 64 | self.logit_scale = logit_scale 65 | self.no_bias = no_bias 66 | self.verbose = verbose 67 | self.embedding_fraction = embedding_fraction 68 | self.norm_type = norm_type 69 | self.use_cache = use_cache 70 | if "name" in kwargs: 71 | del kwargs["name"] 72 | if "loss_fn" in kwargs: 73 | del kwargs["loss_fn"] 74 | super().__init__(**kwargs) 75 | -------------------------------------------------------------------------------- /tests/kernels/test_activation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn.functional as F 4 | from transformers.activations import get_activation 5 | 6 | from vllm import activation_ops 7 | 8 | DTYPES = [torch.half, torch.bfloat16, torch.float] 9 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 10 | D = [512, 4096, 5120, 13824] # Arbitrary values for testing 11 | SEEDS = [0] 12 | 13 | 14 | def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor: 15 | x1, x2 = x.chunk(chunks=2, dim=1) 16 | return F.silu(x1) * x2 17 | 18 | 19 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 20 | @pytest.mark.parametrize("d", D) 21 | @pytest.mark.parametrize("dtype", DTYPES) 22 | @pytest.mark.parametrize("seed", SEEDS) 23 | @torch.inference_mode() 24 | def test_silu_and_mul( 25 | num_tokens: int, 26 | d: int, 27 | dtype: torch.dtype, 28 | seed: int, 29 | ) -> None: 30 | torch.random.manual_seed(seed) 31 | torch.cuda.manual_seed(seed) 32 | x = torch.randn(num_tokens, 2 * d, dtype=dtype, device="cuda") 33 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 34 | activation_ops.silu_and_mul(out, x) 35 | 
ref_out = ref_silu_and_mul(x) 36 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 37 | 38 | 39 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 40 | @pytest.mark.parametrize("d", D) 41 | @pytest.mark.parametrize("dtype", DTYPES) 42 | @pytest.mark.parametrize("seed", SEEDS) 43 | @torch.inference_mode() 44 | def test_gelu_new( 45 | num_tokens: int, 46 | d: int, 47 | dtype: torch.dtype, 48 | seed: int, 49 | ) -> None: 50 | torch.random.manual_seed(seed) 51 | torch.cuda.manual_seed(seed) 52 | x = torch.randn(num_tokens, d, dtype=dtype, device="cuda") 53 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 54 | activation_ops.gelu_new(out, x) 55 | ref_out = get_activation("gelu_new")(x) 56 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 57 | 58 | 59 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 60 | @pytest.mark.parametrize("d", D) 61 | @pytest.mark.parametrize("dtype", DTYPES) 62 | @pytest.mark.parametrize("seed", SEEDS) 63 | def test_gelu_fast( 64 | num_tokens: int, 65 | d: int, 66 | dtype: torch.dtype, 67 | seed: int, 68 | ) -> None: 69 | torch.random.manual_seed(seed) 70 | torch.cuda.manual_seed(seed) 71 | x = torch.randn(num_tokens, d, dtype=dtype, device="cuda") 72 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 73 | activation_ops.gelu_fast(out, x) 74 | ref_out = get_activation("gelu_fast")(x) 75 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 76 | -------------------------------------------------------------------------------- /examples/api_client.py: -------------------------------------------------------------------------------- 1 | """Example Python client for vllm.entrypoints.api_server""" 2 | 3 | import argparse 4 | import json 5 | from typing import Iterable, List 6 | 7 | import requests 8 | 9 | 10 | def clear_line(n: int = 1) -> None: 11 | LINE_UP = '\033[1A' 12 | LINE_CLEAR = '\x1b[2K' 13 | for _ in range(n): 14 | print(LINE_UP, end=LINE_CLEAR, flush=True) 15 | 16 | 17 | def post_http_request(prompt: str, 18 | api_url: str, 19 | n: int = 1, 20 | stream: bool = False) -> requests.Response: 21 | headers = {"User-Agent": "Test Client"} 22 | pload = { 23 | "prompt": prompt, 24 | "n": n, 25 | "use_beam_search": True, 26 | "temperature": 0.0, 27 | "max_tokens": 16, 28 | "stream": stream, 29 | } 30 | response = requests.post(api_url, headers=headers, json=pload, stream=True) 31 | return response 32 | 33 | 34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: 35 | for chunk in response.iter_lines(chunk_size=8192, 36 | decode_unicode=False, 37 | delimiter=b"\0"): 38 | if chunk: 39 | data = json.loads(chunk.decode("utf-8")) 40 | output = data["text"] 41 | yield output 42 | 43 | 44 | def get_response(response: requests.Response) -> List[str]: 45 | data = json.loads(response.content) 46 | output = data["text"] 47 | return output 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--host", type=str, default="localhost") 53 | parser.add_argument("--port", type=int, default=8000) 54 | parser.add_argument("--n", type=int, default=4) 55 | parser.add_argument("--prompt", type=str, default="San Francisco is a") 56 | parser.add_argument("--stream", action="store_true") 57 | args = parser.parse_args() 58 | prompt = args.prompt 59 | api_url = f"http://{args.host}:{args.port}/generate" 60 | n = args.n 61 | stream = args.stream 62 | 63 | print(f"Prompt: {prompt!r}\n", flush=True) 64 | response = post_http_request(prompt, api_url, n, stream) 65 | 
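# When streaming, the server sends NUL-delimited JSON chunks that are decoded
# incrementally; otherwise a single JSON body with all generated texts is read.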
66 | if stream: 67 | num_printed_lines = 0 68 | for h in get_streaming_response(response): 69 | clear_line(num_printed_lines) 70 | num_printed_lines = 0 71 | for i, line in enumerate(h): 72 | num_printed_lines += 1 73 | print(f"Beam candidate {i}: {line!r}", flush=True) 74 | else: 75 | output = get_response(response) 76 | for i, line in enumerate(output): 77 | print(f"Beam candidate {i}: {line!r}", flush=True) 78 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The vLLM team. 2 | # Adapted from 3 | # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py 4 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 5 | from typing import List, Sequence 6 | 7 | import torch 8 | 9 | 10 | def ensure_divisibility(numerator, denominator): 11 | """Ensure that numerator is divisible by the denominator.""" 12 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 13 | numerator, denominator) 14 | 15 | 16 | def divide(numerator, denominator): 17 | """Ensure that numerator is divisible by the denominator and return 18 | the division value.""" 19 | ensure_divisibility(numerator, denominator) 20 | return numerator // denominator 21 | 22 | 23 | def split_tensor_along_last_dim( 24 | tensor: torch.Tensor, 25 | num_partitions: int, 26 | contiguous_split_chunks: bool = False, 27 | ) -> List[torch.Tensor]: 28 | """ Split a tensor along its last dimension. 29 | 30 | Arguments: 31 | tensor: input tensor. 32 | num_partitions: number of partitions to split the tensor 33 | contiguous_split_chunks: If True, make each chunk contiguous 34 | in memory. 35 | 36 | Returns: 37 | A list of Tensors 38 | """ 39 | # Get the size and dimension. 40 | last_dim = tensor.dim() - 1 41 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 42 | # Split. 43 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 44 | # Note: torch.split does not create contiguous tensors by default. 
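# Only materialize contiguous copies when the caller asks for them, since
# .contiguous() allocates new memory for each chunk.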
45 | if contiguous_split_chunks: 46 | return tuple(chunk.contiguous() for chunk in tensor_list) 47 | 48 | return tensor_list 49 | 50 | 51 | class VocabUtility: 52 | """ Split the vocabulary into `world_size` chunks and return the first 53 | and last index of the vocabulary belonging to the `rank` 54 | partition: Note that indices in [fist, last) 55 | 56 | """ 57 | 58 | @staticmethod 59 | def vocab_range_from_per_partition_vocab_size( 60 | per_partition_vocab_size: int, rank: int) -> Sequence[int]: 61 | index_f = rank * per_partition_vocab_size 62 | index_l = index_f + per_partition_vocab_size 63 | return index_f, index_l 64 | 65 | @staticmethod 66 | def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int, 67 | world_size: int) -> Sequence[int]: 68 | per_partition_vocab_size = divide(global_vocab_size, world_size) 69 | return VocabUtility.vocab_range_from_per_partition_vocab_size( 70 | per_partition_vocab_size, rank) 71 | -------------------------------------------------------------------------------- /vllm/model_executor/quantization_utils/base.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List 2 | 3 | import torch 4 | 5 | 6 | class QuantizationConfig: 7 | 8 | @classmethod 9 | def get_name(cls) -> str: 10 | """Name of the quantization method.""" 11 | raise NotImplementedError 12 | 13 | @classmethod 14 | def get_supported_act_dtypes(cls) -> List[torch.dtype]: 15 | """List of supported activation dtypes.""" 16 | raise NotImplementedError 17 | 18 | @classmethod 19 | def get_min_capability(cls) -> int: 20 | """Minimum GPU capability to support the quantization method. 21 | 22 | E.g., 70 for Volta, 75 for Turing, 80 for Ampere. 23 | This requirement is due to the custom CUDA kernels used by the 24 | quantization method. 25 | """ 26 | raise NotImplementedError 27 | 28 | @classmethod 29 | def get_config_filenames(cls) -> List[str]: 30 | """List of filenames to search for in the model directory.""" 31 | raise NotImplementedError 32 | 33 | @classmethod 34 | def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig": 35 | """Create a config class from the model's quantization config.""" 36 | raise NotImplementedError 37 | 38 | @staticmethod 39 | def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: 40 | """Get a value from the model's quantization config.""" 41 | for key in keys: 42 | if key in config: 43 | return config[key] 44 | raise ValueError(f"Cannot find any of {keys} in the model's " 45 | "quantization config.") 46 | 47 | @classmethod 48 | def get_packed_tensor_names(cls) -> List[str]: 49 | raise NotImplementedError 50 | 51 | @classmethod 52 | def is_packed(cls, tensor_name: str) -> bool: 53 | """Returns True if a tensor is packed. 54 | 55 | A tensor is considered packed if each element in the tensor is a 56 | packed representation of multiple elements in the original tensor. 57 | For example, an INT32 element in the tensor may represent 8 INT4 58 | elements in the original tensor. 59 | """ 60 | return any(tag in tensor_name for tag in cls.get_packed_tensor_names()) 61 | 62 | @classmethod 63 | def get_transposed_tensor_names(cls) -> List[str]: 64 | raise NotImplementedError 65 | 66 | @classmethod 67 | def is_transposed(cls, tensor_name: str) -> bool: 68 | """Returns True if a tensor is transposed relative to nn.Linear.weight. 
69 | """ 70 | return any(tag in tensor_name 71 | for tag in cls.get_transposed_tensor_names()) 72 | 73 | @classmethod 74 | def get_tp_tensor_names(cls) -> List[str]: 75 | raise NotImplementedError 76 | -------------------------------------------------------------------------------- /vllm/utils.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import uuid 3 | from platform import uname 4 | 5 | import psutil 6 | import torch 7 | from typing import List 8 | 9 | from vllm import cuda_utils 10 | 11 | 12 | class Device(enum.Enum): 13 | GPU = enum.auto() 14 | CPU = enum.auto() 15 | 16 | 17 | class Counter: 18 | 19 | def __init__(self, start: int = 0) -> None: 20 | self.counter = start 21 | 22 | def __next__(self) -> int: 23 | i = self.counter 24 | self.counter += 1 25 | return i 26 | 27 | def reset(self) -> None: 28 | self.counter = 0 29 | 30 | class InvalidAccessError(Exception): 31 | pass 32 | 33 | def invalidate_access(field_names): 34 | def decorator(cls): 35 | original_getattr = cls.__getattribute__ 36 | 37 | def new_getattr(self, name): 38 | if name in field_names: 39 | raise InvalidAccessError(f"Access to {name} is invalid") 40 | return original_getattr(self, name) 41 | 42 | cls.__getattribute__ = new_getattr 43 | return cls 44 | 45 | return decorator 46 | 47 | def get_max_shared_memory_bytes(gpu: int = 0) -> int: 48 | """Returns the maximum shared memory per thread block in bytes.""" 49 | # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 50 | cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 # pylint: disable=invalid-name 51 | max_shared_mem = cuda_utils.get_device_attribute( 52 | cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu) 53 | return int(max_shared_mem) 54 | 55 | 56 | def get_gpu_memory(gpu: int = 0) -> int: 57 | """Returns the total memory of the GPU in bytes.""" 58 | return torch.cuda.get_device_properties(gpu).total_memory 59 | 60 | 61 | def get_cpu_memory() -> int: 62 | """Returns the total CPU memory of the node in bytes.""" 63 | return psutil.virtual_memory().total 64 | 65 | 66 | def random_uuid() -> str: 67 | return str(uuid.uuid4().hex) 68 | 69 | 70 | def in_wsl() -> bool: 71 | # Reference: https://github.com/microsoft/WSL/issues/4071 72 | return "microsoft" in " ".join(uname()).lower() 73 | 74 | 75 | # TODO: Change this back to API response key when doing the real-case 76 | # NOTE: Currently this stop string is for testing only! 
77 | # "not" is the token right after prompt in examples/test_pause.py 78 | def get_api_stop_string() -> str: 79 | # return 'Integrity' 80 | # return '\n' 81 | # return 'a' 82 | # return "" 83 | # return "Editor" # gpt-j 84 | # return "asa" # baichuan-13b 85 | # return "mandated" # opt 86 | return "USE" # dummy llama, vulcuna 87 | return "not" 88 | 89 | def get_api_stop_strings() -> List[str]: 90 | # return "" 91 | return ['\n', 'Editor', 'asa', 'USE'] 92 | 93 | def get_api_stop_token() -> int: 94 | # react "PAUSE" 95 | return 17171 -------------------------------------------------------------------------------- /tests/async_engine/test_api_server.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import time 4 | from multiprocessing import Pool 5 | from pathlib import Path 6 | 7 | import pytest 8 | import requests 9 | 10 | 11 | def _query_server(prompt: str) -> dict: 12 | response = requests.post("http://localhost:8000/generate", 13 | json={ 14 | "prompt": prompt, 15 | "max_tokens": 100, 16 | "temperature": 0, 17 | "ignore_eos": True 18 | }) 19 | response.raise_for_status() 20 | return response.json() 21 | 22 | 23 | @pytest.fixture 24 | def api_server(): 25 | script_path = Path(__file__).parent.joinpath( 26 | "api_server_async_engine.py").absolute() 27 | # pylint: disable=consider-using-with 28 | uvicorn_process = subprocess.Popen([ 29 | sys.executable, "-u", 30 | str(script_path), "--model", "facebook/opt-125m" 31 | ]) 32 | yield 33 | uvicorn_process.terminate() 34 | 35 | 36 | # pylint: disable=redefined-outer-name, unused-argument 37 | def test_api_server(api_server): 38 | """ 39 | Run the API server and test it. 40 | 41 | We run both the server and requests in separate processes. 42 | 43 | We test that the server can handle incoming requests, including 44 | multiple requests at the same time, and that it can handle requests 45 | being cancelled without crashing. 
46 | """ 47 | with Pool(32) as pool: 48 | # Wait until the server is ready 49 | prompts = ["Hello world"] * 1 50 | result = None 51 | while not result: 52 | # pylint: disable=bare-except 53 | try: 54 | for result in pool.map(_query_server, prompts): 55 | break 56 | except: 57 | time.sleep(1) 58 | 59 | # Actual tests start here 60 | # Try with 1 prompt 61 | for result in pool.map(_query_server, prompts): 62 | assert result 63 | 64 | num_aborted_requests = requests.get( 65 | "http://localhost:8000/stats").json()["num_aborted_requests"] 66 | assert num_aborted_requests == 0 67 | 68 | # Try with 100 prompts 69 | prompts = ["Hello world"] * 100 70 | for result in pool.map(_query_server, prompts): 71 | assert result 72 | 73 | # Cancel requests 74 | pool.map_async(_query_server, prompts) 75 | time.sleep(0.01) 76 | pool.terminate() 77 | pool.join() 78 | 79 | # check cancellation stats 80 | num_aborted_requests = requests.get( 81 | "http://localhost:8000/stats").json()["num_aborted_requests"] 82 | assert num_aborted_requests > 0 83 | 84 | # check that server still runs after cancellations 85 | with Pool(32) as pool: 86 | # Try with 100 prompts 87 | prompts = ["Hello world"] * 100 88 | for result in pool.map(_query_server, prompts): 89 | assert result 90 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/falcon.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py 3 | # Copyright 2023 The vLLM team. 4 | # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. 5 | # All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 
18 | """Falcon configuration""" 19 | from transformers.configuration_utils import PretrainedConfig 20 | 21 | 22 | class RWConfig(PretrainedConfig): 23 | model_type = "falcon" 24 | keys_to_ignore_at_inference = ["past_key_values"] 25 | attribute_map = { 26 | "num_hidden_layers": "n_layer", 27 | "num_attention_heads": "n_head", 28 | "num_kv_heads": "n_head_kv", 29 | } 30 | 31 | def __init__( 32 | self, 33 | vocab_size=250880, 34 | hidden_size=64, 35 | n_layer=2, 36 | n_head=8, 37 | layer_norm_epsilon=1e-5, 38 | initializer_range=0.02, 39 | use_cache=True, 40 | bos_token_id=1, 41 | eos_token_id=2, 42 | hidden_dropout=0.0, 43 | attention_dropout=0.0, 44 | multi_query=True, 45 | n_head_kv=None, 46 | alibi=False, 47 | bias=False, 48 | parallel_attn=False, 49 | new_decoder_architecture=False, 50 | **kwargs, 51 | ) -> None: 52 | self.vocab_size = vocab_size 53 | # Backward compatibility with n_embed kwarg 54 | n_embed = kwargs.pop("n_embed", None) 55 | self.hidden_size = hidden_size if n_embed is None else n_embed 56 | self.n_layer = n_layer 57 | self.n_head = n_head 58 | self.layer_norm_epsilon = layer_norm_epsilon 59 | self.initializer_range = initializer_range 60 | self.use_cache = use_cache 61 | self.hidden_dropout = hidden_dropout 62 | self.attention_dropout = attention_dropout 63 | 64 | self.bos_token_id = bos_token_id 65 | self.eos_token_id = eos_token_id 66 | self.multi_query = multi_query 67 | self.n_head_kv = 1 if n_head_kv is None else n_head_kv 68 | self.alibi = alibi 69 | self.bias = bias 70 | self.parallel_attn = parallel_attn 71 | self.new_decoder_architecture = new_decoder_architecture 72 | 73 | if self.hidden_size == 8192: 74 | # Hack for falcon-40b 75 | self.new_decoder_architecture = True 76 | 77 | super().__init__(bos_token_id=bos_token_id, 78 | eos_token_id=eos_token_id, 79 | **kwargs) 80 | 81 | @property 82 | def head_dim(self): 83 | return self.hidden_size // self.n_head 84 | 85 | @property 86 | def rotary(self): 87 | return not self.alibi 88 | -------------------------------------------------------------------------------- /tests/distributed/test_comm_ops.py: -------------------------------------------------------------------------------- 1 | """Test the communication operators. 2 | 3 | Run `pytest tests/distributed/test_comm_ops.py --forked`. 
4 | """ 5 | from multiprocessing import Process 6 | 7 | import pytest 8 | import torch 9 | 10 | from vllm.config import ParallelConfig 11 | from vllm.engine.ray_utils import get_open_port 12 | from vllm.model_executor.parallel_utils.communication_op import ( 13 | tensor_model_parallel_all_reduce, 14 | tensor_model_parallel_all_gather, 15 | ) 16 | from vllm.worker.worker import _init_distributed_environment 17 | 18 | 19 | def init_test_distributed_environment(pipeline_parallel_size: int, 20 | tensor_parallel_size: int, rank: int, 21 | distributed_init_port: str): 22 | parallel_config = ParallelConfig(pipeline_parallel_size, 23 | tensor_parallel_size, 24 | worker_use_ray=True) 25 | distributed_init_method = f"tcp://localhost:{distributed_init_port}" 26 | torch.cuda.set_device(rank) 27 | _init_distributed_environment(parallel_config, rank, 28 | distributed_init_method) 29 | 30 | 31 | def all_reduce_test_worker(tensor_parallel_size: int, rank: int, 32 | distributed_init_port: str): 33 | init_test_distributed_environment(1, tensor_parallel_size, rank, 34 | distributed_init_port) 35 | num_elements = 8 36 | all_tensors = [ 37 | torch.arange(num_elements, dtype=torch.float32, device="cuda") * 38 | (r + 1) for r in range(tensor_parallel_size) 39 | ] 40 | expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0) 41 | t = all_tensors[rank] 42 | t = tensor_model_parallel_all_reduce(t) 43 | assert torch.allclose(t, expected) 44 | 45 | 46 | def all_gather_test_worker(tensor_parallel_size: int, rank: int, 47 | distributed_init_port: str): 48 | init_test_distributed_environment(1, tensor_parallel_size, rank, 49 | distributed_init_port) 50 | num_dimensions = 3 51 | tensor_size = list(range(2, num_dimensions + 2)) 52 | total_size = 1 53 | for s in tensor_size: 54 | total_size *= s 55 | for all_gather_dimension in range(num_dimensions): 56 | all_tensors = [ 57 | torch.arange(total_size, dtype=torch.float32, 58 | device="cuda").reshape(tensor_size) * (r + 1) 59 | for r in range(tensor_parallel_size) 60 | ] 61 | expected = torch.cat(all_tensors, dim=all_gather_dimension) 62 | t = all_tensors[rank] 63 | t = tensor_model_parallel_all_gather(t, all_gather_dimension) 64 | assert torch.allclose(t, expected) 65 | 66 | 67 | @pytest.mark.skipif(torch.cuda.device_count() < 2, 68 | reason="Need at least 2 GPUs to run the test.") 69 | @pytest.mark.parametrize("tensor_parallel_size", [2]) 70 | @pytest.mark.parametrize("test_target", 71 | [all_reduce_test_worker, all_gather_test_worker]) 72 | def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): 73 | distributed_init_port = get_open_port() 74 | processes = [] 75 | for rank in range(tensor_parallel_size): 76 | p = Process(target=test_target, 77 | args=(tensor_parallel_size, rank, distributed_init_port)) 78 | p.start() 79 | processes.append(p) 80 | for p in processes: 81 | p.join() 82 | assert all(p.exitcode == 0 for p in processes) 83 | -------------------------------------------------------------------------------- /docs/source/models/supported_models.rst: -------------------------------------------------------------------------------- 1 | .. _supported_models: 2 | 3 | Supported Models 4 | ================ 5 | 6 | vLLM supports a variety of generative Transformer models in `HuggingFace Transformers `_. 7 | The following is the list of model architectures that are currently supported by vLLM. 8 | Alongside each architecture, we include some popular models that use it. 9 | 10 | .. 
list-table:: 11 | :widths: 25 25 50 12 | :header-rows: 1 13 | 14 | * - Architecture 15 | - Models 16 | - Example HuggingFace Models 17 | * - :code:`AquilaForCausalLM` 18 | - Aquila 19 | - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. 20 | * - :code:`BaiChuanForCausalLM` 21 | - Baichuan 22 | - :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc. 23 | * - :code:`BloomForCausalLM` 24 | - BLOOM, BLOOMZ, BLOOMChat 25 | - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. 26 | * - :code:`FalconForCausalLM` 27 | - Falcon 28 | - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. 29 | * - :code:`GPT2LMHeadModel` 30 | - GPT-2 31 | - :code:`gpt2`, :code:`gpt2-xl`, etc. 32 | * - :code:`GPTBigCodeForCausalLM` 33 | - StarCoder, SantaCoder, WizardCoder 34 | - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. 35 | * - :code:`GPTJForCausalLM` 36 | - GPT-J 37 | - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. 38 | * - :code:`GPTNeoXForCausalLM` 39 | - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM 40 | - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. 41 | * - :code:`InternLMForCausalLM` 42 | - InternLM 43 | - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. 44 | * - :code:`LlamaForCausalLM` 45 | - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco 46 | - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, etc. 47 | * - :code:`MistralForCausalLM` 48 | - Mistral, Mistral-Instruct 49 | - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. 50 | * - :code:`MPTForCausalLM` 51 | - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter 52 | - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. 53 | * - :code:`OPTForCausalLM` 54 | - OPT, OPT-IML 55 | - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. 56 | * - :code:`QWenLMHeadModel` 57 | - Qwen 58 | - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. 59 | 60 | If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. 61 | Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. 62 | Alternatively, you can raise an issue on our `GitHub `_ project. 63 | 64 | .. tip:: 65 | The easiest way to check if your model is supported is to run the program below: 66 | 67 | .. code-block:: python 68 | 69 | from vllm import LLM 70 | 71 | llm = LLM(model=...) # Name or path of your model 72 | output = llm.generate("Hello, my name is") 73 | print(output) 74 | 75 | If vLLM successfully generates text, it indicates that your model is supported. 
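.. note::
    Some of the models above (e.g., :code:`Qwen/Qwen-7B` or :code:`baichuan-inc/Baichuan-13B-Chat`) ship their modeling code on the HuggingFace Hub rather than inside :code:`transformers`. For these you will likely also need to pass :code:`trust_remote_code=True` (the same flag exposed by the benchmark scripts). A minimal sketch, assuming the checkpoint can be downloaded in your environment:

    .. code-block:: python

        from vllm import LLM

        # trust_remote_code lets Transformers load the custom modeling/config
        # code that these checkpoints ship with.
        llm = LLM(model="Qwen/Qwen-7B", trust_remote_code=True)
        print(llm.generate("Hello, my name is"))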
76 | -------------------------------------------------------------------------------- /benchmarks/benchmark_latency.py: -------------------------------------------------------------------------------- 1 | """Benchmark the latency of processing a single batch of requests.""" 2 | import argparse 3 | import time 4 | 5 | import numpy as np 6 | import torch 7 | from tqdm import tqdm 8 | 9 | from vllm import LLM, SamplingParams 10 | 11 | 12 | def main(args: argparse.Namespace): 13 | print(args) 14 | 15 | # Process all the requests in a single batch if possible. 16 | # NOTE(woosuk): If the request cannot be processed in a single batch, 17 | # the engine will automatically process the request in multiple batches. 18 | llm = LLM( 19 | model=args.model, 20 | tokenizer=args.tokenizer, 21 | quantization=args.quantization, 22 | tensor_parallel_size=args.tensor_parallel_size, 23 | max_num_seqs=args.batch_size, 24 | max_num_batched_tokens=args.batch_size * args.input_len, 25 | trust_remote_code=args.trust_remote_code, 26 | dtype=args.dtype, 27 | ) 28 | 29 | sampling_params = SamplingParams( 30 | n=args.n, 31 | temperature=0.0 if args.use_beam_search else 1.0, 32 | top_p=1.0, 33 | use_beam_search=args.use_beam_search, 34 | ignore_eos=True, 35 | max_tokens=args.output_len, 36 | ) 37 | print(sampling_params) 38 | dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size 39 | 40 | def run_to_completion(profile: bool = False): 41 | if profile: 42 | torch.cuda.cudart().cudaProfilerStart() 43 | start_time = time.perf_counter() 44 | 45 | llm.generate(prompt_token_ids=dummy_prompt_token_ids, 46 | sampling_params=sampling_params, 47 | use_tqdm=False) 48 | 49 | end_time = time.perf_counter() 50 | latency = end_time - start_time 51 | if profile: 52 | torch.cuda.cudart().cudaProfilerStop() 53 | return latency 54 | 55 | print("Warming up...") 56 | run_to_completion(profile=False) 57 | 58 | # Benchmark. 59 | latencies = [] 60 | for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): 61 | latencies.append(run_to_completion(profile=False)) 62 | print(f'Avg latency: {np.mean(latencies)} seconds') 63 | 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser( 67 | description='Benchmark the latency of processing a single batch of ' 68 | 'requests till completion.') 69 | parser.add_argument('--model', type=str, default='facebook/opt-125m') 70 | parser.add_argument('--tokenizer', type=str, default=None) 71 | parser.add_argument('--quantization', 72 | '-q', 73 | choices=['awq', None], 74 | default=None) 75 | parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) 76 | parser.add_argument('--input-len', type=int, default=32) 77 | parser.add_argument('--output-len', type=int, default=128) 78 | parser.add_argument('--batch-size', type=int, default=8) 79 | parser.add_argument('--n', 80 | type=int, 81 | default=1, 82 | help='Number of generated sequences per prompt.') 83 | parser.add_argument('--use-beam-search', action='store_true') 84 | parser.add_argument('--num-iters', 85 | type=int, 86 | default=3, 87 | help='Number of iterations to run.') 88 | parser.add_argument('--trust-remote-code', 89 | action='store_true', 90 | help='trust remote code from huggingface') 91 | parser.add_argument( 92 | '--dtype', 93 | type=str, 94 | default='auto', 95 | choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], 96 | help='data type for model weights and activations. 
' 97 | 'The "auto" option will use FP16 precision ' 98 | 'for FP32 and FP16 models, and BF16 precision ' 99 | 'for BF16 models.') 100 | args = parser.parse_args() 101 | main(args) 102 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantized_linear/awq.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import torch 4 | from torch.nn.parameter import Parameter 5 | 6 | from vllm import quantization_ops 7 | from vllm.model_executor.parallel_utils.layers import (ColumnParallelLinear, 8 | RowParallelLinear) 9 | 10 | 11 | class AWQColumnParallelLinear(ColumnParallelLinear): 12 | 13 | def create_weights(self, dtype: torch.dtype) -> None: 14 | assert self.input_size % self.quant_config.weight_bits == 0 15 | assert (self.output_size_per_partition % 16 | self.quant_config.pack_factor == 0) 17 | self.qweight = Parameter( 18 | torch.empty( 19 | self.input_size, 20 | self.output_size_per_partition // 21 | self.quant_config.pack_factor, 22 | device="cuda", 23 | dtype=torch.int32, 24 | ), 25 | requires_grad=False, 26 | ) 27 | self.qzeros = Parameter( 28 | torch.empty( 29 | self.input_size // self.quant_config.group_size, 30 | self.output_size_per_partition // 31 | self.quant_config.pack_factor, 32 | device="cuda", 33 | dtype=torch.int32, 34 | ), 35 | requires_grad=False, 36 | ) 37 | self.scales = Parameter( 38 | torch.empty( 39 | self.input_size // self.quant_config.group_size, 40 | self.output_size_per_partition, 41 | device="cuda", 42 | dtype=dtype, 43 | ), 44 | requires_grad=False, 45 | ) 46 | 47 | def apply_weights( 48 | self, 49 | x: torch.Tensor, 50 | bias: Optional[torch.Tensor], 51 | ) -> torch.Tensor: 52 | pack_factor = self.quant_config.pack_factor 53 | out_shape = (x.shape[-2], self.qweight.shape[-1] * pack_factor) 54 | reshaped_x = x.reshape(-1, x.shape[-1]) 55 | out = quantization_ops.awq_gemm(reshaped_x, self.qweight, self.scales, 56 | self.qzeros, pack_factor) 57 | if bias is not None: 58 | out = out + bias 59 | return out.reshape(out_shape) 60 | 61 | 62 | class AWQRowParallelLinear(RowParallelLinear): 63 | 64 | def create_weights(self, dtype: torch.dtype) -> None: 65 | assert (self.input_size_per_partition % 66 | self.quant_config.weight_bits == 0) 67 | assert self.output_size % self.quant_config.pack_factor == 0 68 | self.qweight = Parameter( 69 | torch.empty( 70 | self.input_size_per_partition, 71 | self.output_size // self.quant_config.pack_factor, 72 | device="cuda", 73 | dtype=torch.int32, 74 | ), 75 | requires_grad=False, 76 | ) 77 | self.qzeros = Parameter( 78 | torch.empty( 79 | self.input_size_per_partition // self.quant_config.group_size, 80 | self.output_size // self.quant_config.pack_factor, 81 | device="cuda", 82 | dtype=torch.int32, 83 | ), 84 | requires_grad=False, 85 | ) 86 | self.scales = Parameter( 87 | torch.empty( 88 | self.input_size_per_partition // self.quant_config.group_size, 89 | self.output_size, 90 | device="cuda", 91 | dtype=dtype, 92 | ), 93 | requires_grad=False, 94 | ) 95 | 96 | def apply_weights(self, x: torch.Tensor) -> torch.Tensor: 97 | pack_factor = self.quant_config.pack_factor 98 | out_shape = (x.shape[-2], self.qweight.shape[-1] * pack_factor) 99 | reshaped_x = x.reshape(-1, x.shape[-1]) 100 | out = quantization_ops.awq_gemm(reshaped_x, self.qweight, self.scales, 101 | self.qzeros, pack_factor) 102 | return out.reshape(out_shape) 103 | -------------------------------------------------------------------------------- 
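The tensor shapes allocated in create_weights above encode AWQ's packing scheme: pack_factor 4-bit weights share a single int32, and one zero-point/scale entry is kept per group_size input rows. A minimal sketch of that bookkeeping, using illustrative (assumed) layer sizes and quantization settings rather than values read from a real checkpoint:

    # Illustration only: shape arithmetic assumed by the AWQ linear layers above.
    # weight_bits=4, group_size=128 and the layer sizes are hypothetical examples.
    weight_bits = 4
    group_size = 128
    pack_factor = 32 // weight_bits          # eight 4-bit values per 32-bit word
    in_features, out_features = 4096, 11008

    qweight_shape = (in_features, out_features // pack_factor)                # int32
    qzeros_shape = (in_features // group_size, out_features // pack_factor)   # int32
    scales_shape = (in_features // group_size, out_features)                  # fp16/bf16

    assert qweight_shape == (4096, 1376) and scales_shape == (32, 11008)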
/csrc/quantization/awq/dequantize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Adapted from https://github.com/mit-han-lab/llm-awq 3 | Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h 4 | @article{lin2023awq, 5 | title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, 6 | author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, 7 | journal={arXiv}, 8 | year={2023} 9 | } 10 | */ 11 | 12 | #pragma once 13 | 14 | namespace vllm { 15 | namespace awq { 16 | 17 | __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) 18 | { 19 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 20 | assert(false); 21 | #else 22 | uint4 result; 23 | 24 | uint32_t* h = reinterpret_cast<uint32_t*>(&result); 25 | uint32_t const i4s = reinterpret_cast<uint32_t const&>(source); 26 | 27 | // First, we extract the i4s and construct an intermediate fp16 number. 28 | static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; 29 | static constexpr uint32_t BOTTOM_MASK = 0x000f000f; 30 | static constexpr uint32_t TOP_MASK = 0x00f000f0; 31 | static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; 32 | 33 | // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing 34 | // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. 35 | // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and 36 | // elt_67 to fp16 without having to shift them to the bottom bits before hand. 37 | 38 | // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue 39 | // immediately before required. 40 | const uint32_t top_i4s = i4s >> 8; 41 | // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 42 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 43 | : "=r"(h[0]) 44 | : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 45 | // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 46 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 47 | : "=r"(h[1]) 48 | : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 49 | // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 50 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 51 | : "=r"(h[2]) 52 | : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 53 | // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 54 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 55 | : "=r"(h[3]) 56 | : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 57 | 58 | // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the 59 | // half2 ctor. In this case, I chose performance reliability over code readability. 60 | 61 | // This is the half2 {1032, 1032} represented as an integer. 62 | // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; 63 | // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] 64 | static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; 65 | // This is the half2 {1 / 16, 1 / 16} represented as an integer. 66 | static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; 67 | // This is the half2 {-72, -72} represented as an integer. 68 | // static constexpr uint32_t NEG_72 = 0xd480d480; 69 | // Haotian: Let's use {-64, -64}.
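// Why this works: 0x6400 is the fp16 value 1024 (exponent 2^10, zero mantissa), so OR-ing
// a 4-bit value v into the low mantissa bits produces the fp16 number 1024 + v, and a plain
// subtraction of 1024 recovers v. The odd nibbles were masked in place (bits 4-7), so they
// come out as 1024 + 16*v; multiplying by 1/16 gives 64 + v, and the fma below folds the
// *1/16 and the -64 into a single fma.rn.f16x2 instruction.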
70 | static constexpr uint32_t NEG_64 = 0xd400d400; 71 | 72 | // Finally, we construct the output numbers. 73 | // Convert elt_01 74 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); 75 | // Convert elt_23 76 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 77 | // Convert elt_45 78 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); 79 | // Convert elt_67 80 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 81 | 82 | return result; 83 | #endif 84 | } 85 | 86 | } // namespace awq 87 | } // namespace vllm 88 | -------------------------------------------------------------------------------- /vllm/entrypoints/api_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from typing import AsyncGenerator 4 | 5 | from fastapi import FastAPI, Request 6 | from fastapi.responses import JSONResponse, Response, StreamingResponse 7 | import uvicorn 8 | 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | from vllm.sampling_params import SamplingParams 12 | from vllm.utils import random_uuid 13 | 14 | TIMEOUT_KEEP_ALIVE = 5 # seconds. 15 | TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. 16 | app = FastAPI() 17 | engine = None 18 | import base64, json 19 | 20 | 21 | @app.post("/generate") 22 | async def generate(request: Request) -> Response: 23 | """Generate completion for the request. 24 | 25 | The request should be a JSON object with the following fields: 26 | - prompt: the prompt to use for the generation. 27 | - stream: whether to stream the results or not. 28 | - other fields: the sampling parameters (See `SamplingParams` for details). 29 | """ 30 | request_dict = await request.json() 31 | prompt = request_dict.pop("prompt") 32 | prompt_token_ids = None 33 | stream = request_dict.pop("stream", False) 34 | sampling_params = SamplingParams(**request_dict) 35 | request_id = request_dict.pop("request_id", None) 36 | if not request_id: 37 | request_id = random_uuid() 38 | 39 | dummy_token_ids = request_dict.pop("dummy_token_ids", False) 40 | if dummy_token_ids: 41 | prompt_token_ids = [0] * int(prompt) 42 | prompt = None 43 | 44 | results_generator = engine.generate(prompt, sampling_params, request_id, prompt_token_ids) 45 | 46 | # Streaming case 47 | async def stream_results() -> AsyncGenerator[bytes, None]: 48 | async for request_output in results_generator: 49 | prompt = request_output.prompt 50 | text_outputs = [ 51 | prompt + output.text for output in request_output.outputs 52 | ] 53 | ret = {"text": text_outputs} 54 | yield (json.dumps(ret) + "\0").encode("utf-8") 55 | 56 | if stream: 57 | return StreamingResponse(stream_results()) 58 | 59 | # Non-streaming case 60 | final_output = None 61 | async for request_output in results_generator: 62 | if await request.is_disconnected(): 63 | # Abort the request if the client disconnects. 
64 | await engine.abort(request_id) 65 | return Response(status_code=499) 66 | final_output = request_output 67 | 68 | assert final_output is not None 69 | prompt = final_output.prompt 70 | text_outputs = [prompt + output.text for output in final_output.outputs] 71 | ret = {"text": text_outputs} 72 | return JSONResponse(ret) 73 | 74 | @app.post("/resume") 75 | async def resume(request: Request) -> Response: 76 | request_dict = await request.json() 77 | request_id = request_dict.pop("request_id") 78 | api_return_length = request_dict.pop("api_return_length") 79 | api_return_tokens = [0] * api_return_length 80 | 81 | results_generator = engine.resume_request_single(request_id, api_return_tokens) 82 | 83 | # Non-streaming case 84 | final_output = None 85 | async for request_output in results_generator: 86 | if await request.is_disconnected(): 87 | # Abort the request if the client disconnects. 88 | await engine.abort(request_id) 89 | return Response(status_code=499) 90 | final_output = request_output 91 | 92 | assert final_output is not None 93 | prompt = final_output.prompt 94 | text_outputs = [prompt + output.text for output in final_output.outputs] 95 | ret = {"text": text_outputs} 96 | return JSONResponse(ret) 97 | 98 | if __name__ == "__main__": 99 | parser = argparse.ArgumentParser() 100 | parser.add_argument("--host", type=str, default="localhost") 101 | parser.add_argument("--port", type=int, default=8000) 102 | parser = AsyncEngineArgs.add_cli_args(parser) 103 | args = parser.parse_args() 104 | 105 | engine_args = AsyncEngineArgs.from_cli_args(args) 106 | engine = AsyncLLMEngine.from_engine_args(engine_args) 107 | 108 | uvicorn.run(app, 109 | host=args.host, 110 | port=args.port, 111 | log_level="debug", 112 | timeout_keep_alive=TIMEOUT_KEEP_ALIVE) 113 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | benchmarks/exp_*/ 2 | benchmarks/experiment_*/ 3 | 4 | # Byte-compiled / optimized / DLL files 5 | __pycache__/ 6 | *.py[cod] 7 | *$py.class 8 | 9 | # C extensions 10 | *.so 11 | 12 | # Distribution / packaging 13 | .Python 14 | build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | 32 | # PyInstaller 33 | # Usually these files are written by a python script from a template 34 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
35 | *.manifest 36 | *.spec 37 | 38 | # Installer logs 39 | pip-log.txt 40 | pip-delete-this-directory.txt 41 | 42 | # Unit test / coverage reports 43 | htmlcov/ 44 | .tox/ 45 | .nox/ 46 | .coverage 47 | .coverage.* 48 | .cache 49 | nosetests.xml 50 | coverage.xml 51 | *.cover 52 | *.py,cover 53 | .hypothesis/ 54 | .pytest_cache/ 55 | cover/ 56 | 57 | # Translations 58 | *.mo 59 | *.pot 60 | 61 | # Django stuff: 62 | *.log 63 | local_settings.py 64 | db.sqlite3 65 | db.sqlite3-journal 66 | 67 | # Flask stuff: 68 | instance/ 69 | .webassets-cache 70 | 71 | # Scrapy stuff: 72 | .scrapy 73 | 74 | # Sphinx documentation 75 | docs/_build/ 76 | 77 | # PyBuilder 78 | .pybuilder/ 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | # For a library or package, you might want to ignore these files since the code is 90 | # intended to run in multiple environments; otherwise, check them in: 91 | # .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 98 | #Pipfile.lock 99 | 100 | # poetry 101 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 102 | # This is especially recommended for binary packages to ensure reproducibility, and is more 103 | # commonly ignored for libraries. 104 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 105 | #poetry.lock 106 | 107 | # pdm 108 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 109 | #pdm.lock 110 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 111 | # in version control. 112 | # https://pdm.fming.dev/#use-with-ide 113 | .pdm.toml 114 | 115 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 116 | __pypackages__/ 117 | 118 | # Celery stuff 119 | celerybeat-schedule 120 | celerybeat.pid 121 | 122 | # SageMath parsed files 123 | *.sage.py 124 | 125 | # Environments 126 | .env 127 | .venv 128 | env/ 129 | venv/ 130 | ENV/ 131 | env.bak/ 132 | venv.bak/ 133 | 134 | # Spyder project settings 135 | .spyderproject 136 | .spyproject 137 | 138 | # Rope project settings 139 | .ropeproject 140 | 141 | # mkdocs documentation 142 | /site 143 | 144 | # mypy 145 | .mypy_cache/ 146 | .dmypy.json 147 | dmypy.json 148 | 149 | # Pyre type checker 150 | .pyre/ 151 | 152 | # pytype static type analyzer 153 | .pytype/ 154 | 155 | # Cython debug symbols 156 | cython_debug/ 157 | 158 | # PyCharm 159 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 160 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 161 | # and can be added to the global gitignore or merged into this file. For a more nuclear 162 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
163 | .idea/ 164 | 165 | # VSCode 166 | .vscode/ 167 | 168 | # DS Store 169 | .DS_Store 170 | 171 | # Results 172 | *.csv 173 | 174 | # Python pickle files 175 | *.pkl 176 | 177 | # Sphinx documentation 178 | _build/ 179 | 180 | # vim swap files 181 | *.swo 182 | *.swp 183 | ShareGPT_V3_unfiltered_cleaned_split.json 184 | *.nsys-rep 185 | *.pt 186 | 187 | benchmarks/exp_logs/ 188 | benchmarks/exp_version2/ 189 | 190 | real/ 191 | cswap/ 192 | merge*.json 193 | merged_results/ 194 | merged_results_13B/ 195 | new_real_baseline/ -------------------------------------------------------------------------------- /csrc/activation_kernels.cu: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <ATen/cuda/CUDAContext.h> 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template<typename T> 9 | __device__ __forceinline__ T silu(const T& x) { 10 | // x * sigmoid(x) 11 | return (T) (((float) x) / (1.0f + expf((float) -x))); 12 | } 13 | 14 | template<typename scalar_t> 15 | __global__ void silu_and_mul_kernel( 16 | scalar_t* __restrict__ out, // [num_tokens, d] 17 | const scalar_t* __restrict__ input, // [num_tokens, 2, d] 18 | const int d) { 19 | const int token_idx = blockIdx.x; 20 | for (int idx = threadIdx.x; idx < d; idx += blockDim.x) { 21 | const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]); 22 | const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]); 23 | out[token_idx * d + idx] = silu(x) * y; 24 | } 25 | } 26 | 27 | } // namespace vllm 28 | 29 | void silu_and_mul( 30 | torch::Tensor& out, // [num_tokens, d] 31 | torch::Tensor& input) // [num_tokens, 2 * d] 32 | { 33 | int num_tokens = input.size(0); 34 | int d = input.size(1) / 2; 35 | 36 | dim3 grid(num_tokens); 37 | dim3 block(std::min(d, 1024)); 38 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 39 | VLLM_DISPATCH_FLOATING_TYPES( 40 | input.scalar_type(), 41 | "silu_and_mul_kernel", 42 | [&] { 43 | vllm::silu_and_mul_kernel<scalar_t><<<grid, block, 0, stream>>>( 44 | out.data_ptr<scalar_t>(), 45 | input.data_ptr<scalar_t>(), 46 | d); 47 | }); 48 | } 49 | 50 | namespace vllm { 51 | 52 | // Element-wise activation kernel template. 53 | template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)> 54 | __global__ void activation_kernel( 55 | scalar_t* __restrict__ out, // [num_tokens, d] 56 | const scalar_t* __restrict__ input, // [num_tokens, d] 57 | const int d) { 58 | const int token_idx = blockIdx.x; 59 | for (int idx = threadIdx.x; idx < d; idx += blockDim.x) { 60 | const scalar_t x = __ldg(&input[token_idx * d + idx]); 61 | out[token_idx * d + idx] = ACT_FN(x); 62 | } 63 | } 64 | 65 | } // namespace vllm 66 | 67 | // Launch element-wise activation kernel.
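// The macro below instantiates activation_kernel for a given device function:
// VLLM_DISPATCH_FLOATING_TYPES switches on the tensor's dtype to bind scalar_t,
// one thread block is launched per token, and up to 1024 threads stride over the
// hidden dimension d, applying KERNEL element-wise on the current CUDA stream.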
68 | #define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ 69 | int num_tokens = input.size(0); \ 70 | int d = input.size(1); \ 71 | dim3 grid(num_tokens); \ 72 | dim3 block(std::min(d, 1024)); \ 73 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ 74 | VLLM_DISPATCH_FLOATING_TYPES( \ 75 | input.scalar_type(), \ 76 | "activation_kernel", \ 77 | [&] { \ 78 | vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \ 79 | out.data_ptr<scalar_t>(), \ 80 | input.data_ptr<scalar_t>(), \ 81 | d); \ 82 | }); 83 | 84 | namespace vllm { 85 | 86 | template<typename T> 87 | __device__ __forceinline__ T gelu_new_kernel(const T& x) { 88 | const float x3 = (float) (x * x * x); 89 | const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3)))); 90 | return ((T) 0.5) * x * (((T) 1.0) + t); 91 | } 92 | 93 | template<typename T> 94 | __device__ __forceinline__ T gelu_fast_kernel(const T& x) { 95 | const float f = (float) x; 96 | const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x)); 97 | return ((T) 0.5) * x * (((T) 1.0) + t); 98 | } 99 | 100 | } // namespace vllm 101 | 102 | void gelu_new( 103 | torch::Tensor& out, // [num_tokens, d] 104 | torch::Tensor& input) // [num_tokens, d] 105 | { 106 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); 107 | } 108 | 109 | void gelu_fast( 110 | torch::Tensor& out, // [num_tokens, d] 111 | torch::Tensor& input) // [num_tokens, d] 112 | { 113 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); 114 | } 115 | -------------------------------------------------------------------------------- /vllm/model_executor/input_metadata.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | 3 | import torch 4 | from xformers.ops import AttentionBias 5 | 6 | from vllm.sampling_params import SamplingParams 7 | from vllm.sequence import SequenceData 8 | 9 | 10 | class InputMetadata: 11 | """Metadata for input sequences. Used for PagedAttention. 12 | 13 | Args: 14 | seq_groups: List of (seq_ids, sampling_params). 15 | seq_data: Seq_id -> SequenceData. 16 | prompt_lens: Lengths of prompts. 17 | slot_mapping: The address to write the new KV to of each token. 18 | context_lens: the length of attention context for each generation token. 19 | max_context_len: The maximum context length. 20 | block_tables: The block tables.
(Seq id -> list of physical block) 21 | """ 22 | 23 | def __init__( 24 | self, 25 | seq_groups: List[Tuple[List[int], SamplingParams]], 26 | seq_data: Dict[int, SequenceData], 27 | prompt_lens: List[int], 28 | slot_mapping: torch.Tensor, 29 | context_lens: torch.Tensor, 30 | max_context_len: int, 31 | block_tables: torch.Tensor, 32 | running_query_lens: List[int], 33 | atoms: torch.Tensor, 34 | is_generating_new_token: List[bool], 35 | sliding_window: Optional[int] = None, 36 | ) -> None: 37 | self.seq_groups = seq_groups 38 | self.seq_data = seq_data 39 | self.prompt_lens = prompt_lens 40 | self.slot_mapping = slot_mapping 41 | self.context_lens = context_lens 42 | self.max_context_len = max_context_len 43 | self.running_query_lens = running_query_lens 44 | self.block_tables = block_tables 45 | self.atoms = atoms 46 | self.is_generating_new_token = is_generating_new_token 47 | 48 | self.to_cache = None 49 | if sliding_window is not None: 50 | # We need to keep the positions of sliding windows within 51 | # the key / value tables, this is helpful to know which 52 | # elements we need to cache and where 53 | to_cache, start_idx = [], 0 54 | for prompt_len in self.prompt_lens: 55 | to_cache.extend( 56 | range( 57 | start_idx + max(0, prompt_len - sliding_window), 58 | start_idx + prompt_len, 59 | )) 60 | start_idx += prompt_len 61 | to_cache.extend(range(start_idx, slot_mapping.shape[0])) 62 | self.to_cache = torch.tensor(to_cache, 63 | dtype=torch.int32, 64 | device=self.slot_mapping.device) 65 | 66 | self.num_prompts = len(prompt_lens) 67 | self.num_prompt_tokens = sum(prompt_lens) 68 | self.num_generation_tokens = sum(running_query_lens) 69 | self.num_valid_tokens = self.num_prompt_tokens + self.num_generation_tokens 70 | assert len(slot_mapping) == self.num_valid_tokens 71 | if block_tables.numel() > 0: 72 | self.max_num_blocks_per_seq = block_tables.shape[1] 73 | else: 74 | self.max_num_blocks_per_seq = 0 75 | # NOTE: no longer true if mixed with multi-token kernel usage 76 | # assert block_tables.shape[0] == self.num_generation_tokens 77 | # assert context_lens.shape[0] == self.num_generation_tokens 78 | 79 | if running_query_lens: 80 | assert len(running_query_lens) == len(seq_data) - self.num_prompts 81 | 82 | # Set during the execution of the first attention op. 83 | self.attn_bias: List[AttentionBias] = [] 84 | self.multi_token_attn_bias: List[AttentionBias] = [] 85 | 86 | def __repr__(self) -> str: 87 | # Print only useful metadata. 
88 | return (f'InputMetadata(' 89 | # f'seq_groups={self.seq_groups}, ' 90 | f'num_valid_tokens={self.num_valid_tokens}, ' 91 | f'num_prompt_tokens={self.num_prompt_tokens}, ' 92 | f'num_prompts={self.num_prompts}, ' 93 | f'prompt_lens={self.prompt_lens}, ' 94 | f'num_generation_tokens={self.num_generation_tokens}, ' 95 | f'context_lens={self.context_lens}, ' 96 | f'max_context_len={self.max_context_len}), ' 97 | f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, ' 98 | f'block_tables={self.block_tables}), ' 99 | # f'slot_mapping={self.slot_mapping}, ' 100 | f'running_query_lens={self.running_query_lens}, ' 101 | f'is_generating_new_token={self.is_generating_new_token}') 102 | 103 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | import contextlib 3 | from typing import Type 4 | 5 | import torch 6 | import torch.nn as nn 7 | from transformers import PretrainedConfig 8 | 9 | from vllm.config import ModelConfig 10 | from vllm.model_executor.models import * # pylint: disable=wildcard-import 11 | from vllm.model_executor.weight_utils import (get_quant_config, 12 | initialize_dummy_weights) 13 | 14 | # TODO(woosuk): Lazy-load the model classes. 15 | _MODEL_REGISTRY = { 16 | "AquilaModel": AquilaForCausalLM, 17 | "BaiChuanForCausalLM": BaiChuanForCausalLM, # baichuan-7b 18 | "BaichuanForCausalLM": BaichuanForCausalLM, # baichuan-13b 19 | "BloomForCausalLM": BloomForCausalLM, 20 | "FalconForCausalLM": FalconForCausalLM, 21 | "GPT2LMHeadModel": GPT2LMHeadModel, 22 | "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM, 23 | "GPTJForCausalLM": GPTJForCausalLM, 24 | "GPTNeoXForCausalLM": GPTNeoXForCausalLM, 25 | "InternLMForCausalLM": InternLMForCausalLM, 26 | "LlamaForCausalLM": LlamaForCausalLM, 27 | "LLaMAForCausalLM": LlamaForCausalLM, # For decapoda-research/llama-* 28 | "MistralForCausalLM": MistralForCausalLM, 29 | "MPTForCausalLM": MPTForCausalLM, 30 | "OPTForCausalLM": OPTForCausalLM, 31 | "QWenLMHeadModel": QWenLMHeadModel, 32 | "RWForCausalLM": FalconForCausalLM, 33 | } 34 | 35 | # FIXME(woosuk): Remove this once all models support quantization. 36 | _MODEL_CLASSES_SUPPORT_QUANTIZATION = [ 37 | LlamaForCausalLM, 38 | ] 39 | 40 | 41 | @contextlib.contextmanager 42 | def _set_default_torch_dtype(dtype: torch.dtype): 43 | """Sets the default torch dtype to the given dtype.""" 44 | old_dtype = torch.get_default_dtype() 45 | torch.set_default_dtype(dtype) 46 | yield 47 | torch.set_default_dtype(old_dtype) 48 | 49 | 50 | def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: 51 | architectures = getattr(config, "architectures", []) 52 | for arch in architectures: 53 | if arch in _MODEL_REGISTRY: 54 | return _MODEL_REGISTRY[arch] 55 | raise ValueError( 56 | f"Model architectures {architectures} are not supported for now. " 57 | f"Supported architectures: {list(_MODEL_REGISTRY.keys())}") 58 | 59 | 60 | def get_model(model_config: ModelConfig) -> nn.Module: 61 | model_class = _get_model_architecture(model_config.hf_config) 62 | 63 | # Get the quantization config. 
64 | quant_config = None 65 | if model_config.quantization is not None: 66 | if model_class not in _MODEL_CLASSES_SUPPORT_QUANTIZATION: 67 | raise ValueError( 68 | f"Quantization is not supported for {model_class}.") 69 | quant_config = get_quant_config(model_config.quantization, 70 | model_config.model, 71 | model_config.download_dir) 72 | capability = torch.cuda.get_device_capability() 73 | capability = capability[0] * 10 + capability[1] 74 | if capability < quant_config.get_min_capability(): 75 | raise ValueError( 76 | f"The quantization method {model_config.quantization} is not " 77 | "supported for the current GPU. " 78 | f"Minimum capability: {quant_config.get_min_capability()}. " 79 | f"Current capability: {capability}.") 80 | supported_dtypes = quant_config.get_supported_act_dtypes() 81 | if model_config.dtype not in supported_dtypes: 82 | raise ValueError( 83 | f"{model_config.dtype} is not supported for quantization " 84 | f"method {model_config.quantization}. Supported dtypes: " 85 | f"{supported_dtypes}") 86 | 87 | with _set_default_torch_dtype(model_config.dtype): 88 | # Create a model instance. 89 | # The weights will be initialized as empty tensors. 90 | if model_class in _MODEL_CLASSES_SUPPORT_QUANTIZATION: 91 | model = model_class(model_config.hf_config, quant_config) 92 | else: 93 | model = model_class(model_config.hf_config) 94 | if model_config.load_format == "dummy": 95 | model = model.cuda() 96 | # NOTE(woosuk): For accurate performance evaluation, we assign 97 | # random values to the weights. 98 | initialize_dummy_weights(model) 99 | else: 100 | # Load the weights from the cached or downloaded files. 101 | model.load_weights(model_config.model, model_config.download_dir, 102 | model_config.load_format, model_config.revision) 103 | model = model.cuda() 104 | return model.eval() 105 | -------------------------------------------------------------------------------- /csrc/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | #include <ATen/cuda/CUDAContext.h> 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template<typename scalar_t, bool IS_NEOX> 9 | inline __device__ void apply_rotary_embedding( 10 | scalar_t* __restrict__ arr, 11 | const scalar_t* __restrict__ cos_ptr, 12 | const scalar_t* __restrict__ sin_ptr, 13 | int rot_offset, 14 | int embed_dim) 15 | { 16 | int x_index, y_index; 17 | scalar_t cos, sin; 18 | if (IS_NEOX) { 19 | // GPT-NeoX style rotary embedding. 20 | x_index = rot_offset; 21 | y_index = embed_dim + rot_offset; 22 | cos = __ldg(cos_ptr + x_index); 23 | sin = __ldg(sin_ptr + x_index); 24 | } else { 25 | // GPT-J style rotary embedding. 26 | x_index = 2 * rot_offset; 27 | y_index = 2 * rot_offset + 1; 28 | cos = __ldg(cos_ptr + x_index / 2); 29 | sin = __ldg(sin_ptr + x_index / 2); 30 | } 31 | 32 | const scalar_t x = arr[x_index]; 33 | const scalar_t y = arr[y_index]; 34 | arr[x_index] = x * cos - y * sin; 35 | arr[y_index] = y * cos + x * sin; 36 | } 37 | 38 | template<typename scalar_t, bool IS_NEOX> 39 | __global__ void rotary_embedding_kernel( 40 | const int64_t* __restrict__ positions, // [num_tokens] 41 | scalar_t* __restrict__ query, // [num_tokens, num_heads, head_size] 42 | scalar_t* __restrict__ key, // [num_tokens, num_kv_heads, head_size] 43 | const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] 44 | const int rot_dim, 45 | const int query_stride, 46 | const int key_stride, 47 | const int num_heads, 48 | const int num_kv_heads, 49 | const int head_size) { 50 | // Each thread block is responsible for one token.
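// Threads then stride over (head, rotation-pair) indices: for each pair (x, y) the helper
// above applies the rotation x' = x*cos - y*sin, y' = y*cos + x*sin, with cos/sin read from
// the cached table at this token's position. In NeoX style the pair is (i, i + rot_dim/2)
// within a head; in GPT-J style it is the adjacent elements (2i, 2i + 1).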
51 | const int token_idx = blockIdx.x; 52 | int64_t pos = positions[token_idx]; 53 | const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; 54 | 55 | const int embed_dim = rot_dim / 2; 56 | const scalar_t* cos_ptr = cache_ptr; 57 | const scalar_t* sin_ptr = cache_ptr + embed_dim; 58 | 59 | const int nq = num_heads * embed_dim; 60 | for (int i = threadIdx.x; i < nq; i += blockDim.x) { 61 | const int head_idx = i / embed_dim; 62 | const int token_head = token_idx * query_stride + head_idx * head_size; 63 | const int rot_offset = i % embed_dim; 64 | apply_rotary_embedding<scalar_t, IS_NEOX>(query + token_head, cos_ptr, 65 | sin_ptr, rot_offset, embed_dim); 66 | } 67 | 68 | const int nk = num_kv_heads * embed_dim; 69 | for (int i = threadIdx.x; i < nk; i += blockDim.x) { 70 | const int head_idx = i / embed_dim; 71 | const int token_head = token_idx * key_stride + head_idx * head_size; 72 | const int rot_offset = i % embed_dim; 73 | apply_rotary_embedding<scalar_t, IS_NEOX>(key + token_head, cos_ptr, 74 | sin_ptr, rot_offset, embed_dim); 75 | } 76 | } 77 | 78 | } // namespace vllm 79 | 80 | void rotary_embedding( 81 | torch::Tensor& positions, // [num_tokens] 82 | torch::Tensor& query, // [num_tokens, num_heads * head_size] 83 | torch::Tensor& key, // [num_tokens, num_kv_heads * head_size] 84 | int head_size, 85 | torch::Tensor& cos_sin_cache, // [max_position, rot_dim] 86 | bool is_neox) { 87 | int num_tokens = query.size(0); 88 | int rot_dim = cos_sin_cache.size(1); 89 | int num_heads = query.size(1) / head_size; 90 | int num_kv_heads = key.size(1) / head_size; 91 | int query_stride = query.stride(0); 92 | int key_stride = key.stride(0); 93 | 94 | dim3 grid(num_tokens); 95 | dim3 block(std::min(num_heads * rot_dim / 2, 512)); 96 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 97 | VLLM_DISPATCH_FLOATING_TYPES( 98 | query.scalar_type(), 99 | "rotary_embedding", 100 | [&] { 101 | if (is_neox) { 102 | vllm::rotary_embedding_kernel<scalar_t, true><<<grid, block, 0, stream>>>( 103 | positions.data_ptr<int64_t>(), 104 | query.data_ptr<scalar_t>(), 105 | key.data_ptr<scalar_t>(), 106 | cos_sin_cache.data_ptr<scalar_t>(), 107 | rot_dim, 108 | query_stride, 109 | key_stride, 110 | num_heads, 111 | num_kv_heads, 112 | head_size); 113 | } else { 114 | vllm::rotary_embedding_kernel<scalar_t, false><<<grid, block, 0, stream>>>( 115 | positions.data_ptr<int64_t>(), 116 | query.data_ptr<scalar_t>(), 117 | key.data_ptr<scalar_t>(), 118 | cos_sin_cache.data_ptr<scalar_t>(), 119 | rot_dim, 120 | query_stride, 121 | key_stride, 122 | num_heads, 123 | num_kv_heads, 124 | head_size); 125 | } 126 | }); 127 | } 128 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/simulator.py: -------------------------------------------------------------------------------- 1 | """A layer that simulates the next token.""" 2 | from typing import Dict, List, Optional, Tuple 3 | 4 | import torch 5 | import torch.nn as nn 6 | import random 7 | from vllm import utils 8 | from vllm.sampling_params import SamplingParams, SamplingType 9 | from vllm.model_executor.input_metadata import InputMetadata 10 | from vllm.sequence import SequenceOutputs, SequenceData, SamplerOutput 11 | 12 | DUMMY_TOKEN = 31548 # "Ġdummy" 13 | 14 | def _greedy_sample( 15 | selected_seq_groups: List[Tuple[List[int], SamplingParams]], 16 | seq_data: Dict[int, SequenceData], 17 | ) -> List[Tuple[List[int], List[int]]]: 18 | results = [] 19 | for seq_group in selected_seq_groups: 20 | seq_ids, sampling_params = seq_group 21 | num_parent_seqs = len(seq_ids) 22 | assert num_parent_seqs == 1, ( 23 | "Greedy sampling should have only one seq.") 24 | parent_ids = [0] 25 | next_token_ids
= [_sample(sampling_params, seq_data[seq_ids[0]])] 26 | results.append((next_token_ids, parent_ids)) 27 | return results 28 | 29 | def _random_sample( 30 | selected_seq_groups: List[Tuple[List[int], SamplingParams]], 31 | is_prompts: List[bool], 32 | seq_data: Dict[int, SequenceData], 33 | ) -> List[Tuple[List[int], List[int]]]: 34 | results = [] 35 | for seq_group, is_prompt in zip(selected_seq_groups, is_prompts): 36 | seq_ids, sampling_params = seq_group 37 | num_parent_seqs = len(seq_ids) 38 | if is_prompt: 39 | # Prompt phase. 40 | assert num_parent_seqs == 1, ( 41 | "Prompt input should have only one seq.") 42 | parent_ids = [0] * sampling_params.best_of 43 | next_token_ids = [_sample(sampling_params, seq_data[seq_ids[0]])] * \ 44 | sampling_params.best_of 45 | else: 46 | # Generation phase. 47 | parent_ids = list(range(num_parent_seqs)) 48 | next_token_ids = [_sample(sampling_params, seq_data[seq_id]) 49 | for seq_id in seq_ids] 50 | results.append((next_token_ids, parent_ids)) 51 | return results 52 | 53 | class Simulator(nn.Module): 54 | 55 | def forward(self, 56 | input_metadata: InputMetadata) -> SamplerOutput: 57 | categorized_seq_group_ids = {t: [] for t in SamplingType} 58 | category_num_tokens = {t: 0 for t in SamplingType} 59 | for i, seq_group in enumerate(input_metadata.seq_groups): 60 | seq_ids, sampling_params = seq_group 61 | sampling_type = sampling_params.sampling_type 62 | categorized_seq_group_ids[sampling_type].append(i) 63 | num_seqs = len(seq_ids) 64 | category_num_tokens[sampling_type] += num_seqs 65 | 66 | seq_outputs_dict: Dict[int, List[SequenceOutputs]] = {} 67 | for sampling_type in SamplingType: 68 | seq_group_ids = categorized_seq_group_ids[sampling_type] 69 | seq_groups = [input_metadata.seq_groups[i] for i in seq_group_ids] 70 | is_prompts = [i < input_metadata.num_prompts for i in seq_group_ids] 71 | num_tokens = category_num_tokens[sampling_type] 72 | if num_tokens == 0: 73 | continue 74 | if sampling_type == SamplingType.GREEDY: 75 | sample_results = _greedy_sample(seq_groups, input_metadata.seq_data) 76 | elif sampling_type == SamplingType.RANDOM: 77 | sample_results = _random_sample(seq_groups, is_prompts, input_metadata.seq_data) 78 | else: 79 | raise NotImplementedError("Beam search is not supported yet") 80 | 81 | # build output 82 | for seq_group_id, seq_group, sample_result in zip( 83 | seq_group_ids, seq_groups, sample_results): 84 | seq_ids, sampling_params = seq_group 85 | next_token_ids, parent_ids = sample_result 86 | num_results = len(next_token_ids) 87 | num_parent_seqs = len(seq_ids) 88 | seq_outputs: List[SequenceOutputs] = [] 89 | for parent_id, next_token_id in zip( 90 | parent_ids, next_token_ids): 91 | seq_outputs.append( 92 | SequenceOutputs(seq_ids[parent_id], next_token_id, {next_token_id: 0.0})) 93 | seq_outputs_dict[seq_group_id] = seq_outputs 94 | return [seq_outputs_dict[i] for i in range(len(input_metadata.seq_groups))] 95 | 96 | def _sample(sampling_params: SamplingParams, seq_data: SequenceData) -> int: 97 | # seq_data should be updated at the master worker 98 | if seq_data.generation_counter == sampling_params.api_invoke_interval: 99 | if sampling_params.api_max_calls != 0: 100 | # seq_data.generation_counter = 0 101 | return utils.get_api_stop_token() 102 | # seq_data.generation_counter += 1 103 | return DUMMY_TOKEN -------------------------------------------------------------------------------- /vllm/engine/ray_utils.py: -------------------------------------------------------------------------------- 1 | import 
socket 2 | from typing import Optional, Tuple, TYPE_CHECKING 3 | 4 | from vllm.config import ParallelConfig 5 | from vllm.logger import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | try: 10 | import ray 11 | from ray.air.util.torch_dist import TorchDistributedWorker 12 | 13 | class RayWorker(TorchDistributedWorker): 14 | """Ray wrapper for vllm.worker.Worker, allowing Worker to be 15 | lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" 16 | 17 | def __init__(self, init_cached_hf_modules=False) -> None: 18 | if init_cached_hf_modules: 19 | # pylint: disable=import-outside-toplevel 20 | from transformers.dynamic_module_utils import init_hf_modules 21 | init_hf_modules() 22 | self.worker = None 23 | 24 | def init_worker(self, worker_init_fn): 25 | self.worker = worker_init_fn() 26 | 27 | def __getattr__(self, name): 28 | return getattr(self.worker, name) 29 | 30 | def execute_method(self, method, *args, **kwargs): 31 | executor = getattr(self, method) 32 | return executor(*args, **kwargs) 33 | 34 | except ImportError as e: 35 | logger.warning(f"Failed to import Ray with {e!r}. " 36 | "For distributed inference, please install Ray with " 37 | "`pip install ray pandas pyarrow`.") 38 | ray = None 39 | TorchDistributedWorker = None 40 | RayWorker = None # pylint: disable=invalid-name 41 | 42 | if TYPE_CHECKING: 43 | from ray.util.placement_group import PlacementGroup 44 | 45 | 46 | def get_open_port(): 47 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 48 | s.bind(("", 0)) 49 | return s.getsockname()[1] 50 | 51 | 52 | def initialize_cluster( 53 | parallel_config: ParallelConfig, 54 | engine_use_ray: bool = False, 55 | ray_address: Optional[str] = None, 56 | ) -> Tuple[str, Optional["PlacementGroup"]]: 57 | """Initialize the distributed cluster probably with Ray. 58 | 59 | Args: 60 | parallel_config: The configurations for parallel execution. 61 | engine_use_ray: Whether to use Ray for async engine. 62 | ray_address: The address of the Ray cluster. If None, uses 63 | the default Ray cluster address. 64 | 65 | Returns: 66 | A tuple of (`distributed_init_method`, `placement_group`). The 67 | `distributed_init_method` is the address for initializing the 68 | distributed backend. `placement_group` includes the specification 69 | of the resources for each distributed worker. 70 | """ 71 | if parallel_config.worker_use_ray or engine_use_ray: 72 | if ray is None: 73 | raise ImportError( 74 | "Ray is not installed. Please install Ray to use distributed " 75 | "serving.") 76 | # Connect to a ray cluster. 77 | ray.init(address=ray_address, ignore_reinit_error=True) 78 | 79 | if not parallel_config.worker_use_ray: 80 | # Initialize cluster locally. 81 | port = get_open_port() 82 | # We need to setup the distributed init method to make sure 83 | # the distributed megatron code (e.g., get world size) works correctly. 84 | distributed_init_method = f"tcp://localhost:{port}" 85 | return distributed_init_method, None 86 | 87 | current_placement_group = ray.util.get_current_placement_group() 88 | if current_placement_group: 89 | # We are in a placement group 90 | bundles = current_placement_group.bundle_specs 91 | # Verify that we can use the placement group. 
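# Every bundle may contribute at most one GPU; count the GPU bundles and make
# sure there are at least world_size of them, so each worker can be scheduled
# onto its own single-GPU bundle.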
92 | gpu_bundles = 0 93 | for bundle in bundles: 94 | bundle_gpus = bundle.get("GPU", 0) 95 | if bundle_gpus > 1: 96 | raise ValueError( 97 | "Placement group bundle cannot have more than 1 GPU.") 98 | if bundle_gpus: 99 | gpu_bundles += 1 100 | if parallel_config.world_size > gpu_bundles: 101 | raise ValueError( 102 | "The number of required GPUs exceeds the total number of " 103 | "available GPUs in the placement group.") 104 | else: 105 | num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) 106 | if parallel_config.world_size > num_gpus_in_cluster: 107 | raise ValueError( 108 | "The number of required GPUs exceeds the total number of " 109 | "available GPUs in the cluster.") 110 | # Create a new placement group 111 | current_placement_group = ray.util.placement_group([{ 112 | "GPU": 1 113 | }] * parallel_config.world_size) 114 | # Wait until PG is ready - this will block until all 115 | # requested resources are available, and will timeout 116 | # if they cannot be provisioned. 117 | ray.get(current_placement_group.ready(), timeout=1800) 118 | 119 | return None, current_placement_group 120 | -------------------------------------------------------------------------------- /docs/source/models/adding_model.rst: -------------------------------------------------------------------------------- 1 | .. _adding_a_new_model: 2 | 3 | Adding a New Model 4 | ================== 5 | 6 | This document provides a high-level guide on integrating a `HuggingFace Transformers `_ model into vLLM. 7 | 8 | .. note:: 9 | The complexity of adding a new model depends heavily on the model's architecture. 10 | The process is considerably straightforward if the model shares a similar architecture with an existing model in vLLM. 11 | However, for models that include new operators (e.g., a new attention mechanism), the process can be a bit more complex. 12 | 13 | .. tip:: 14 | If you are encountering issues while integrating your model into vLLM, feel free to open an issue on our `GitHub `_ repository. 15 | We will be happy to help you out! 16 | 17 | 18 | 0. Fork the vLLM repository 19 | -------------------------------- 20 | 21 | Start by forking our `GitHub `_ repository and then :ref:`build it from source `. 22 | This gives you the ability to modify the codebase and test your model. 23 | 24 | 25 | 1. Bring your model code 26 | ------------------------ 27 | 28 | Clone the PyTorch model code from the HuggingFace Transformers repository and put it into the `vllm/model_executor/models `_ directory. 29 | For instance, vLLM's `OPT model `_ was adpated from the HuggingFace's `modeling_opt.py `_ file. 30 | 31 | .. warning:: 32 | When copying the model code, make sure to review and adhere to the code's copyright and licensing terms. 33 | 34 | 35 | 2. Rewrite the :code:`forward` methods 36 | -------------------------------------- 37 | 38 | Next, you need to rewrite the :code:`forward` methods of your model by following these steps: 39 | 40 | 1. Remove any unnecessary code, such as the code only used for training. 41 | 2. Change the input parameters: 42 | 43 | .. 
code-block:: diff 44 | 45 | def forward( 46 | self, 47 | input_ids: torch.Tensor, 48 | - attention_mask: Optional[torch.Tensor] = None, 49 | - position_ids: Optional[torch.LongTensor] = None, 50 | - past_key_values: Optional[List[torch.FloatTensor]] = None, 51 | - inputs_embeds: Optional[torch.FloatTensor] = None, 52 | - labels: Optional[torch.LongTensor] = None, 53 | - use_cache: Optional[bool] = None, 54 | - output_attentions: Optional[bool] = None, 55 | - output_hidden_states: Optional[bool] = None, 56 | - return_dict: Optional[bool] = None, 57 | -) -> Union[Tuple, CausalLMOutputWithPast]: 58 | + positions: torch.Tensor, 59 | + kv_caches: List[KVCache], 60 | + input_metadata: InputMetadata, 61 | + cache_events: Optional[List[torch.cuda.Event]], 62 | +) -> SamplerOutput: 63 | 64 | 3. Update the code by considering that :code:`input_ids` and :code:`positions` are now flattened tensors. 65 | 4. Replace the attention operation with either :code:`GPTPagedAttention` or :code:`GPTNeoXPagedAttention`, depending on the model's architecture. 66 | 67 | .. note:: 68 | Currently, vLLM supports the basic multi-head attention mechanism and its variant with rotary positional embeddings. 69 | If your model employs a different attention mechanism, you will need to implement a new attention layer in vLLM. 70 | 71 | 72 | 3. (Optional) Implement tensor parallelism support 73 | -------------------------------------------------- 74 | 75 | If your model is too large to fit into a single GPU, you can use tensor parallelism to manage it. 76 | To do this, substitute your model's linear and embedding layers with their tensor-parallel versions. 77 | For the embedding layer, you can simply replace :code:`nn.Embedding` with :code:`VocabParallelEmbedding`. 78 | When it comes to the linear layers, you should use either :code:`RowParallelLinear` or :code:`ColumnParallelLinear`. 79 | Typically, :code:`ColumnParallelLinear` is used for QKV linear layers and the first linear layers of the MLP blocks. 80 | For the remaining linear layers, :code:`RowParallelLinear` is used. 81 | 82 | 83 | 4. Implement the weight loading logic 84 | ------------------------------------- 85 | 86 | You now need to implement the :code:`load_weights` method in your :code:`*ForCausalLM` class. 87 | This method should load the weights from the HuggingFace's checkpoint file and assign them to the corresponding layers in your model. 88 | While the process is straightforward for most layers, the tensor-parallel layers necessitate some additional care as their weights should be partitioned to multiple GPUs. 89 | 90 | 91 | 5. Register your model 92 | ---------------------- 93 | 94 | Finally, include your :code:`*ForCausalLM` class in `vllm/model_executor/models/__init__.py `_ and register it to the :code:`_MODEL_REGISTRY` in `vllm/model_executor/model_loader.py `_. 95 | -------------------------------------------------------------------------------- /vllm/outputs.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | 3 | from vllm.sequence import SequenceGroup, SequenceStatus 4 | 5 | 6 | class CompletionOutput: 7 | """The output data of one completion output of a request. 8 | 9 | Args: 10 | index: The index of the output in the request. 11 | text: The generated output text. 12 | token_ids: The token IDs of the generated output text. 13 | cumulative_logprob: The cumulative log probability of the generated 14 | output text. 
15 | logprobs: The log probabilities of the top probability words at each 16 | position if the logprobs are requested. 17 | finish_reason: The reason why the sequence is finished. 18 | """ 19 | 20 | def __init__( 21 | self, 22 | index: int, 23 | text: str, 24 | token_ids: List[int], 25 | cumulative_logprob: float, 26 | logprobs: Optional[List[Dict[int, float]]], 27 | finish_reason: Optional[str] = None, 28 | ) -> None: 29 | self.index = index 30 | self.text = text 31 | self.token_ids = token_ids 32 | self.cumulative_logprob = cumulative_logprob 33 | self.logprobs = logprobs 34 | self.finish_reason = finish_reason 35 | 36 | def finished(self) -> bool: 37 | return self.finish_reason is not None 38 | 39 | def __repr__(self) -> str: 40 | return (f"CompletionOutput(index={self.index}, " 41 | f"text={self.text!r}, " 42 | f"token_ids={self.token_ids}, " 43 | f"cumulative_logprob={self.cumulative_logprob}, " 44 | f"logprobs={self.logprobs}, " 45 | f"finish_reason={self.finish_reason})") 46 | 47 | 48 | class RequestOutput: 49 | """The output data of a request to the LLM. 50 | 51 | Args: 52 | request_id: The unique ID of the request. 53 | prompt: The prompt string of the request. 54 | prompt_token_ids: The token IDs of the prompt. 55 | outputs: The output sequences of the request. 56 | finished: Whether the whole request is finished. 57 | paused: List of output index to seq id. 58 | """ 59 | 60 | def __init__( 61 | self, 62 | request_id: str, 63 | prompt: str, 64 | prompt_token_ids: List[int], 65 | outputs: List[CompletionOutput], 66 | finished: bool, 67 | paused: List[Tuple[int, int]], 68 | ) -> None: 69 | self.request_id = request_id 70 | self.prompt = prompt 71 | self.prompt_token_ids = prompt_token_ids 72 | self.outputs = outputs 73 | self.finished = finished 74 | self.paused = paused 75 | 76 | # NOTE: toolformer's pause policy is no longer used 77 | @classmethod 78 | def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": 79 | # Get the top-n sequences 80 | n = seq_group.sampling_params.n 81 | seqs = seq_group.get_seqs() 82 | assert n <= len(seqs) 83 | if seq_group.sampling_params.use_beam_search: 84 | sorting_key = lambda seq: seq.get_beam_search_score( 85 | seq_group.sampling_params.length_penalty) 86 | else: 87 | sorting_key = lambda seq: seq.get_cumulative_logprob() 88 | sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) 89 | top_n_seqs = sorted_seqs[:n] 90 | 91 | # Create the outputs. 92 | paused: List[Tuple[int, int]] = [] 93 | outputs: List[CompletionOutput] = [] 94 | for seq in top_n_seqs: 95 | logprobs = seq.output_logprobs 96 | if seq_group.sampling_params.logprobs is None: 97 | # NOTE: We need to take care of this case because the sequence 98 | # always has the logprobs of the sampled tokens even if the 99 | # logprobs are not requested. 100 | logprobs = {} 101 | finshed_reason = SequenceStatus.get_finished_reason(seq.status) 102 | output = CompletionOutput(seqs.index(seq), seq.output_text, 103 | seq.get_output_token_ids(), 104 | seq.get_cumulative_logprob(), logprobs, 105 | finshed_reason) 106 | outputs.append(output) 107 | if seq.is_paused(): 108 | paused.append((len(outputs) - 1, seq.seq_id)) 109 | 110 | # Every sequence in the sequence group should have the same prompt. 
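# (All sequences in a group are continuations of a single prompt, so the
# prompt and its token IDs below are read from the first of the top-n
# sequences.)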
111 | prompt = top_n_seqs[0].prompt 112 | prompt_token_ids = top_n_seqs[0].data.prompt_token_ids 113 | finished = seq_group.is_finished() 114 | return cls(seq_group.request_id, prompt, prompt_token_ids, outputs, 115 | finished, paused) 116 | 117 | def __repr__(self) -> str: 118 | return (f"RequestOutput(request_id={self.request_id}, " 119 | f"prompt={self.prompt!r}, " 120 | f"prompt_token_ids={self.prompt_token_ids}, " 121 | f"outputs={self.outputs}, " 122 | f"finished={self.finished}), " 123 | f"paused={self.paused})") 124 | -------------------------------------------------------------------------------- /examples/react_vllm_impl.py: -------------------------------------------------------------------------------- 1 | """Benchmark offline inference throughput.""" 2 | import argparse 3 | import json 4 | import random 5 | import time 6 | from typing import List, Optional, Tuple, Dict 7 | 8 | import torch 9 | import queue 10 | import threading 11 | from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase 12 | from tqdm import tqdm 13 | 14 | from vllm import LLM, SamplingParams, LLMEngine, EngineArgs, utils 15 | from vllm.transformers_utils.tokenizer import get_tokenizer 16 | from vllm.outputs import RequestOutput 17 | 18 | class APIExecutor: 19 | def __init__(self) -> None: 20 | self._queue = queue.Queue() 21 | 22 | def _add_task(self, request_id: str, seq_id: int, api_time: float, ret_len: int): 23 | time.sleep(api_time) 24 | self._queue.put((request_id, seq_id, ret_len)) 25 | 26 | def add_task(self, request_id: str, seq_id: int, api_time: float, ret_len: int): 27 | task = threading.Thread(target=self._add_task, args=(request_id, seq_id, api_time, ret_len)) 28 | task.start() 29 | return task 30 | 31 | def _get_results(self) -> Dict[str, Dict[int, int]]: 32 | results = {} 33 | current_num_ret = self._queue.qsize() 34 | for _ in range(current_num_ret): 35 | request_id, seq_id, ret_len = self._queue.get() 36 | if request_id not in results: 37 | results[request_id] = {} 38 | results[request_id][seq_id] = ret_len 39 | return results 40 | 41 | def resume(self, vllm_engine: LLMEngine) -> None: 42 | api_rets = self._get_results() 43 | for request_id, seq_id_to_ret_len in api_rets.items(): 44 | response = {} 45 | for seq_id, ret_len in seq_id_to_ret_len.items(): 46 | response[seq_id] = [0] * ret_len 47 | vllm_engine.resume_request(request_id, response) 48 | 49 | 50 | 51 | def run_vllm( 52 | args: argparse.Namespace, 53 | ) -> float: 54 | engine_args = EngineArgs.from_cli_args(args) 55 | engine = LLMEngine.from_engine_args(engine_args) 56 | stop = [utils.get_api_stop_string()] 57 | api_engine = APIExecutor() 58 | tasks = set() 59 | 60 | dummy_prompt_token_ids = [[0] * args.input_len] * args.num_prompts 61 | 62 | # Add the requests to the engine. 63 | for request_id, prompt_token_ids in enumerate(dummy_prompt_token_ids): 64 | sampling_params = SamplingParams( 65 | n=1, 66 | temperature=0.0, 67 | top_p=1.0, 68 | # use_beam_search=use_beam_search, 69 | ignore_eos=True, 70 | max_tokens=args.output_len, 71 | stop=stop, 72 | use_api_simulator=True, 73 | api_return_length=32, 74 | api_invoke_interval=16 + request_id, 75 | api_exec_time=1.0 76 | ) 77 | engine.add_request( 78 | request_id=str(request_id), 79 | prompt=None, 80 | sampling_params=sampling_params, 81 | prompt_token_ids=prompt_token_ids, 82 | ) 83 | 84 | start = time.perf_counter() 85 | # Run the engine. 
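# Drive the engine manually: each engine.step() performs one scheduling and
# model iteration and returns RequestOutput objects. Finished outputs are
# collected; outputs whose `paused` field is non-empty have hit the simulated
# API stop string, so an API task is launched for every paused
# (output index, seq id) pair, and any API calls that have already completed
# are fed back into the engine via api_engine.resume() on each iteration.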
86 | outputs: List[RequestOutput] = [] 87 | iter = 0 88 | while engine.has_unfinished_requests(): 89 | step_outputs = engine.step() 90 | for output in step_outputs: 91 | if output.finished: 92 | outputs.append(output) 93 | if output.paused: 94 | print(f'iter: {iter}, output: {output}') 95 | sampling_params: SamplingParams = engine.scheduler.paused[output.request_id][0].sampling_params 96 | for (rid, sid) in output.paused: 97 | task = api_engine.add_task(output.request_id, sid, sampling_params.api_exec_time, sampling_params.api_return_length) 98 | tasks.add(task) 99 | api_engine.resume(engine) 100 | iter += 1 101 | 102 | # Sort the outputs by request ID. 103 | # This is necessary because some requests may be finished earlier than 104 | # its previous requests. 105 | outputs = sorted(outputs, key=lambda x: int(x.request_id)) 106 | end = time.perf_counter() 107 | for request_output in outputs: 108 | for seq_output in request_output.outputs: 109 | print(seq_output.text) 110 | print(seq_output.token_ids) 111 | return end - start 112 | 113 | 114 | def main(args: argparse.Namespace): 115 | print(args) 116 | random.seed(args.seed) 117 | 118 | elapsed_time = run_vllm( 119 | args, 120 | ) 121 | print(elapsed_time) 122 | 123 | # total_num_tokens = sum( 124 | # prompt_len + output_len for _, prompt_len, output_len in requests 125 | # ) 126 | # print( 127 | # f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, " 128 | # f"{total_num_tokens / elapsed_time:.2f} tokens/s" 129 | # ) 130 | 131 | 132 | if __name__ == "__main__": 133 | parser = argparse.ArgumentParser(description="Benchmark the throughput.") 134 | parser.add_argument( 135 | "--input-len", type=int, default=512 136 | ) 137 | parser.add_argument( 138 | "--output-len", type=int, default=512 139 | ) 140 | parser.add_argument( 141 | "--num-prompts", type=int, default=1, help="Number of prompts to process." 142 | ) 143 | parser = EngineArgs.add_cli_args(parser) 144 | 145 | args = parser.parse_args() 146 | main(args) 147 | -------------------------------------------------------------------------------- /examples/test_pause.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | 3 | from vllm import EngineArgs, LLMEngine, SamplingParams, utils 4 | import torch 5 | import os 6 | from typing import List, Optional, Tuple, Dict 7 | from vllm.outputs import RequestOutput 8 | import json 9 | 10 | # os.environ['CUDA_VISIBLE_DEVICES'] = '7' 11 | # os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 12 | 13 | def api_call(input: str): 14 | return " a " 15 | 16 | def main(args: argparse.Namespace): 17 | # Parse the CLI argument and initialize the engine. 18 | engine_args = EngineArgs.from_cli_args(args) 19 | engine = LLMEngine.from_engine_args(engine_args) 20 | stop = [utils.get_api_stop_string()] if args.test else None 21 | # Test the following prompts. 22 | test_prompts = [ 23 | ("Imagine a futuristic city in the year 2150, where advanced technology and environmental sustainability are perfectly integrated. Describe this city in great detail, focusing on aspects such as architecture, transportation, energy sources, and the daily lives of its inhabitants. How do the buildings look, and what innovative materials are they made from? Describe the public transportation system and how it differs from systems in the early 21st century. What are the primary energy sources, and how are they harnessed and distributed? 
How do the residents of this city work, entertain themselves, and interact with technology in their everyday lives? In addition, consider the city's government and societal structure. How is the city governed, and what kind of political system is in place? What are the core values and principles that guide decision-making? Discuss how this city ensures the well-being of its citizens, including healthcare, education, and social services. How does this city handle issues like crime, conflict resolution, and the preservation of civil liberties? Furthermore, explore the relationship of this city with the natural environment. How does the city maintain a balance with nature, and what are its strategies for conservation and biodiversity? Are there any unique parks, green spaces, or integration of natural elements within the urban landscape? Lastly, imagine a scenario where this city faces a significant challenge, such as a natural disaster or a technological crisis. How does the city respond and recover from this event? What systems and protocols are in place to handle such emergencies, and what role do citizens play in these situations? Please provide a comprehensive and imaginative description of each of these aspects, creating a vivid and detailed portrayal of life in this futuristic city.", 24 | SamplingParams(n=1, temperature=0.0, presence_penalty=0.0,stop=stop,max_tokens=100)), 25 | # ("Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients...", 26 | # SamplingParams(n=1, temperature=0.0, presence_penalty=0.2,stop=stop,max_tokens=100)), 27 | ] * 100 28 | 29 | # Run the engine by calling `engine.step()` manually. 30 | request_id = 0 31 | # To test iteration-level scheduling, we add one request at each step. 
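# (Note that in this script all requests are enqueued up front; the engine
# then schedules them iteration by iteration inside the step loop below.)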
32 | for prompt, sampling_params in test_prompts: 33 | engine.add_request(str(request_id), prompt, sampling_params) 34 | request_id += 1 35 | 36 | outputs: List[RequestOutput] = [] 37 | if not args.test: 38 | while True: 39 | # for _ in range(29): 40 | request_outputs = engine.step() 41 | for request_output in request_outputs: 42 | if request_output.finished: 43 | outputs.append(request_output) 44 | # print(request_output.outputs[0].token_ids) 45 | if not engine.has_unfinished_requests(): 46 | break 47 | else: 48 | # TO the test, only pause once 49 | torch.cuda.cudart().cudaProfilerStart() 50 | while True: 51 | request_outputs = engine.step() 52 | for request_output in request_outputs: 53 | # print(request_output) 54 | if request_output.finished: 55 | outputs.append(request_output) 56 | # print(request_output.outputs[0].token_ids) 57 | if request_output.paused: 58 | response = {} 59 | for (rid, sid) in request_output.paused: 60 | # response[sid] = [582, 508, 468, 587] 61 | # response[sid] = [198, 464, 13429, 21983, 25, 198, 198, 818, 262, 614, 2310, 1120, 11, 257] 62 | # normal_ret 63 | response[sid] = [50118, 133, 511, 16, 10, 889, 9, 5, 144, 505, 5894, 9, 5, 343, 35, 50118, 50118, 134, 4, 20, 343, 18, 2112, 50118, 50118, 133] 64 | engine.resume_request(request_output.request_id, response) 65 | if not engine.has_unfinished_requests(): 66 | break 67 | torch.cuda.cudart().cudaProfilerStop() 68 | 69 | print(f'finished {len(outputs)} requests') 70 | outputs = sorted(outputs, key=lambda x: int(x.request_id)) 71 | serialize = [] 72 | for output in outputs: 73 | serialize.append({ 74 | "id": output.request_id, 75 | "output_tokens": output.outputs[0].token_ids, 76 | "output_text": output.outputs[0].text, 77 | }) 78 | with open(f'pause_{args.test}_output.json', 'w+') as f: 79 | json.dump(serialize, f, separators=(",", ": ")) 80 | 81 | if __name__ == '__main__': 82 | parser = argparse.ArgumentParser( 83 | description='Demo on using the LLMEngine class directly') 84 | parser = EngineArgs.add_cli_args(parser) 85 | parser.add_argument('--test', action='store_true') 86 | args = parser.parse_args() 87 | main(args) 88 | -------------------------------------------------------------------------------- /tests/kernels/test_cache.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | import torch 5 | 6 | from vllm import cache_ops 7 | 8 | DTYPES = [torch.half, torch.bfloat16, torch.float] 9 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 10 | NUM_LAYERS = [5] # Arbitrary values for testing 11 | NUM_HEADS = [8] # Arbitrary values for testing 12 | HEAD_SIZES = [64, 80, 96, 112, 128, 256] 13 | BLOCK_SIZES = [8, 16, 32] 14 | NUM_BLOCKS = [1024] # Arbitrary values for testing 15 | NUM_MAPPINGS = [32, 256] # Arbitrary values for testing 16 | SEEDS = [0] 17 | 18 | 19 | @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) 20 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 21 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 22 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 23 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 24 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 25 | @pytest.mark.parametrize("dtype", DTYPES) 26 | @pytest.mark.parametrize("seed", SEEDS) 27 | @torch.inference_mode() 28 | def test_copy_blocks( 29 | kv_cache_factory, 30 | num_mappings: int, 31 | num_layers: int, 32 | num_heads: int, 33 | head_size: int, 34 | block_size: int, 35 | num_blocks: int, 36 | dtype: torch.dtype, 37 | seed: 
int, 38 | ) -> None: 39 | random.seed(seed) 40 | torch.random.manual_seed(seed) 41 | torch.cuda.manual_seed(seed) 42 | 43 | # Generate random block mappings where each source block is mapped to two 44 | # destination blocks. 45 | assert 2 * num_mappings <= num_blocks 46 | src_blocks = random.sample(range(num_blocks), num_mappings) 47 | remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) 48 | dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) 49 | block_mapping = {} 50 | for i in range(num_mappings): 51 | src = src_blocks[i] 52 | dst1 = dst_blocks[2 * i] 53 | dst2 = dst_blocks[2 * i + 1] 54 | block_mapping[src] = [dst1, dst2] 55 | 56 | # Create the KV caches. 57 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 58 | num_layers, num_heads, 59 | head_size, dtype, seed) 60 | 61 | # Clone the KV caches. 62 | cloned_key_caches = [key_cache.clone() for key_cache in key_caches] 63 | cloned_value_caches = [value_cache.clone() for value_cache in value_caches] 64 | 65 | # Call the copy blocks kernel. 66 | cache_ops.copy_blocks(key_caches, value_caches, block_mapping) 67 | 68 | # Run the reference implementation. 69 | for src, dsts in block_mapping.items(): 70 | for dst in dsts: 71 | for cloned_key_cache in cloned_key_caches: 72 | cloned_key_cache[dst] = cloned_key_cache[src] 73 | for cloned_value_cache in cloned_value_caches: 74 | cloned_value_cache[dst] = cloned_value_cache[src] 75 | 76 | # Compare the results. 77 | for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): 78 | assert torch.allclose(key_cache, cloned_key_cache) 79 | for value_cache, cloned_value_cache in zip(value_caches, 80 | cloned_value_caches): 81 | assert torch.allclose(value_cache, cloned_value_cache) 82 | 83 | 84 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 85 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 86 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 87 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 88 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 89 | @pytest.mark.parametrize("dtype", DTYPES) 90 | @pytest.mark.parametrize("seed", SEEDS) 91 | @torch.inference_mode() 92 | def test_reshape_and_cache( 93 | kv_cache_factory, 94 | num_tokens: int, 95 | num_heads: int, 96 | head_size: int, 97 | block_size: int, 98 | num_blocks: int, 99 | dtype: torch.dtype, 100 | seed: int, 101 | ) -> None: 102 | random.seed(seed) 103 | torch.random.manual_seed(seed) 104 | torch.cuda.manual_seed(seed) 105 | 106 | # Create a random slot mapping. 107 | num_slots = block_size * num_blocks 108 | slot_mapping = random.sample(range(num_slots), num_tokens) 109 | slot_mapping = torch.tensor(slot_mapping, dtype=torch.int, device="cuda") 110 | 111 | qkv = torch.randn(num_tokens, 112 | 3, 113 | num_heads, 114 | head_size, 115 | dtype=dtype, 116 | device="cuda") 117 | _, key, value = qkv.unbind(dim=1) 118 | 119 | # Create the KV caches. 120 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, 121 | num_heads, head_size, dtype, 122 | seed) 123 | key_cache, value_cache = key_caches[0], value_caches[0] 124 | 125 | # Clone the KV caches. 126 | cloned_key_cache = key_cache.clone() 127 | cloned_value_cache = value_cache.clone() 128 | 129 | # Call the reshape_and_cache kernel. 130 | cache_ops.reshape_and_cache(key, value, key_cache, value_cache, 131 | slot_mapping) 132 | 133 | # Run the reference implementation. 
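# The reference implementation below scatters each token's key/value into the
# cloned caches by decomposing its flat slot index into a block index and an
# in-block offset: block_idx = slot // block_size and
# block_offset = slot % block_size. For example, with block_size=16, slot 37
# lands in block 2 at offset 5.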
134 | reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) 135 | block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") 136 | block_indicies = block_indicies.cpu().tolist() 137 | block_offsets = slot_mapping % block_size 138 | block_offsets = block_offsets.cpu().tolist() 139 | for i in range(num_tokens): 140 | block_idx = block_indicies[i] 141 | block_offset = block_offsets[i] 142 | cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] 143 | cloned_value_cache[block_idx, :, :, block_offset] = value[i] 144 | 145 | assert torch.allclose(key_cache, cloned_key_cache) 146 | assert torch.allclose(value_cache, cloned_value_cache) 147 | -------------------------------------------------------------------------------- /docs/source/getting_started/quickstart.rst: -------------------------------------------------------------------------------- 1 | .. _quickstart: 2 | 3 | Quickstart 4 | ========== 5 | 6 | This guide shows how to use vLLM to: 7 | 8 | * run offline batched inference on a dataset; 9 | * build an API server for a large language model; 10 | * start an OpenAI-compatible API server. 11 | 12 | Be sure to complete the :ref:`installation instructions ` before continuing with this guide. 13 | 14 | Offline Batched Inference 15 | ------------------------- 16 | 17 | We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. 18 | 19 | Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process. 20 | 21 | .. code-block:: python 22 | 23 | from vllm import LLM, SamplingParams 24 | 25 | Define the list of input prompts and the sampling parameters for generation. The sampling temperature is set to 0.8 and the nucleus sampling probability is set to 0.95. For more information about the sampling parameters, refer to the `class definition `_. 26 | 27 | .. code-block:: python 28 | 29 | prompts = [ 30 | "Hello, my name is", 31 | "The president of the United States is", 32 | "The capital of France is", 33 | "The future of AI is", 34 | ] 35 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 36 | 37 | Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. 38 | 39 | .. code-block:: python 40 | 41 | llm = LLM(model="facebook/opt-125m") 42 | 43 | Call ``llm.generate`` to generate the outputs. It adds the input prompts to vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of ``RequestOutput`` objects, which include all the output tokens. 44 | 45 | .. code-block:: python 46 | 47 | outputs = llm.generate(prompts, sampling_params) 48 | 49 | # Print the outputs. 50 | for output in outputs: 51 | prompt = output.prompt 52 | generated_text = output.outputs[0].text 53 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 54 | 55 | 56 | The code example can also be found in `examples/offline_inference.py `_. 57 | 58 | 59 | API Server 60 | ---------- 61 | 62 | vLLM can be deployed as an LLM service. We provide an example `FastAPI `_ server. Check `vllm/entrypoints/api_server.py `_ for the server implementation. 
The server uses ``AsyncLLMEngine`` class to support asynchronous processing of incoming requests. 63 | 64 | Start the server: 65 | 66 | .. code-block:: console 67 | 68 | $ python -m vllm.entrypoints.api_server 69 | 70 | By default, this command starts the server at ``http://localhost:8000`` with the OPT-125M model. 71 | 72 | Query the model in shell: 73 | 74 | .. code-block:: console 75 | 76 | $ curl http://localhost:8000/generate \ 77 | $ -d '{ 78 | $ "prompt": "San Francisco is a", 79 | $ "use_beam_search": true, 80 | $ "n": 4, 81 | $ "temperature": 0 82 | $ }' 83 | 84 | See `examples/api_client.py `_ for a more detailed client example. 85 | 86 | OpenAI-Compatible Server 87 | ------------------------ 88 | 89 | vLLM can be deployed as a server that mimics the OpenAI API protocol. This allows vLLM to be used as a drop-in replacement for applications using OpenAI API. 90 | 91 | Start the server: 92 | 93 | .. code-block:: console 94 | 95 | $ python -m vllm.entrypoints.openai.api_server \ 96 | $ --model facebook/opt-125m 97 | 98 | By default, it starts the server at ``http://localhost:8000``. You can specify the address with ``--host`` and ``--port`` arguments. The server currently hosts one model at a time (OPT-125M in the above command) and implements `list models `_ and `create completion `_ endpoints. We are actively adding support for more endpoints. 99 | 100 | This server can be queried in the same format as OpenAI API. For example, list the models: 101 | 102 | .. code-block:: console 103 | 104 | $ curl http://localhost:8000/v1/models 105 | 106 | Query the model with input prompts: 107 | 108 | .. code-block:: console 109 | 110 | $ curl http://localhost:8000/v1/completions \ 111 | $ -H "Content-Type: application/json" \ 112 | $ -d '{ 113 | $ "model": "facebook/opt-125m", 114 | $ "prompt": "San Francisco is a", 115 | $ "max_tokens": 7, 116 | $ "temperature": 0 117 | $ }' 118 | 119 | Since this server is compatible with OpenAI API, you can use it as a drop-in replacement for any applications using OpenAI API. For example, another way to query the server is via the ``openai`` python package: 120 | 121 | .. code-block:: python 122 | 123 | import openai 124 | # Modify OpenAI's API key and API base to use vLLM's API server. 125 | openai.api_key = "EMPTY" 126 | openai.api_base = "http://localhost:8000/v1" 127 | completion = openai.Completion.create(model="facebook/opt-125m", 128 | prompt="San Francisco is a") 129 | print("Completion result:", completion) 130 | 131 | For a more detailed client example, refer to `examples/openai_completion_client.py `_. 132 | -------------------------------------------------------------------------------- /examples/test_ref_outputs.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Tuple, Dict 2 | import argparse 3 | from transformers import AutoModelForCausalLM, PreTrainedTokenizerBase 4 | from vllm import EngineArgs, LLMEngine, SamplingParams, utils 5 | import torch 6 | import json 7 | import random 8 | import os 9 | from vllm.outputs import RequestOutput 10 | 11 | # os.environ['CUDA_VISIBLE_DEVICES'] = '7' 12 | # os.environ['CUDA_LAUNCH_BLOCKING'] = '1' 13 | 14 | def api_call(input: str): 15 | return " a " 16 | 17 | def sample_requests( 18 | dataset_path: str, 19 | num_requests: int, 20 | tokenizer: PreTrainedTokenizerBase, 21 | ) -> List[Tuple[str, List[int], int]]: 22 | # Load the dataset. 
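# The dataset file is assumed to follow the ShareGPT JSON layout used below:
# a list of records, each carrying a "conversations" list whose entries have
# a "value" string; the first turn supplies the prompt and the second turn's
# tokenized length sets the target output length.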
23 | with open(dataset_path) as f: 24 | dataset = json.load(f) 25 | # Filter out the conversations with less than 2 turns. 26 | dataset = [data for data in dataset if len(data["conversations"]) >= 2] 27 | # Only keep long prompts 28 | dataset = [(data["conversations"][0]["value"], data["conversations"][1]["value"]) 29 | for data in dataset if len(data["conversations"][0]["value"]) >= 300] 30 | 31 | # Tokenize the prompts and completions. 32 | prompts = [prompt for prompt, _ in dataset] 33 | prompt_token_ids = tokenizer(prompts).input_ids 34 | completions = [completion for _, completion in dataset] 35 | completion_token_ids = tokenizer(completions).input_ids 36 | tokenized_dataset = [] 37 | for i in range(len(dataset)): 38 | output_len = len(completion_token_ids[i]) 39 | tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len)) 40 | 41 | # Filter out sequences. 42 | filtered_dataset: List[Tuple[str, int, int]] = [] 43 | for prompt, prompt_token_ids, output_len in tokenized_dataset: 44 | prompt_len = len(prompt_token_ids) 45 | if prompt_len < 100 or output_len < 4: 46 | # Prune too short sequences. 47 | continue 48 | if prompt_len > 1024 or prompt_len + output_len > 2048: 49 | # Prune too long sequences. 50 | continue 51 | filtered_dataset.append((prompt, prompt_token_ids, output_len)) 52 | 53 | # Sample the requests. 54 | sampled_requests = random.sample(filtered_dataset, num_requests) 55 | return sampled_requests 56 | 57 | def parse_ref_outputs() -> Dict[int, Tuple[List[int], List[int]]]: 58 | with open('ref-outputs.json') as f: 59 | outputs = json.load(f) 60 | ref_outputs = {data["id"]: (data["prompt_tokens"], data["output_tokens"]) for data in outputs} 61 | return ref_outputs 62 | 63 | test_until = 100 64 | 65 | def main(args: argparse.Namespace): 66 | # Parse the CLI argument and initialize the engine. 67 | engine_args = EngineArgs.from_cli_args(args) 68 | engine = LLMEngine.from_engine_args(engine_args) 69 | if args.mode == "ref": 70 | # Test the following prompts. 
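# In "ref" mode, prompts are sampled from the ShareGPT dataset, run to
# completion with greedy sampling, and dumped to ref-outputs.json so that the
# "test" mode branch below can replay the same prompts with matching output
# lengths and write test-outputs.json for comparison.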
71 | datasets = sample_requests("ShareGPT_V3_unfiltered_cleaned_split.json", 100, engine.tokenizer) 72 | request_id = 0 73 | for prompt, prompt_token_ids, output_len in datasets: 74 | if request_id <= test_until: 75 | sampling_params = SamplingParams(n=1, temperature=0.0, presence_penalty=0.0,max_tokens=output_len,ignore_eos=True) 76 | engine.add_request(str(request_id), prompt, sampling_params, prompt_token_ids) 77 | request_id += 1 78 | 79 | outputs: List[RequestOutput] = [] 80 | while True: 81 | request_outputs = engine.step() 82 | for request_output in request_outputs: 83 | if request_output.finished: 84 | outputs.append(request_output) 85 | # print(request_output.outputs[0].token_ids) 86 | if not engine.has_unfinished_requests(): 87 | break 88 | 89 | serialize = [] 90 | outputs = sorted(outputs, key=lambda x: x.request_id) 91 | for output in outputs: 92 | serialize.append({ 93 | "id": output.request_id, 94 | "prompt_tokens": output.prompt_token_ids, 95 | "output_tokens": output.outputs[0].token_ids, 96 | "output_text": output.outputs[0].text, 97 | }) 98 | with open("ref-outputs.json", "w") as f: 99 | json.dump(serialize, f, separators=(",", ": ")) 100 | else: 101 | ref_outputs = parse_ref_outputs() 102 | for request_id, (prompt_token_ids, output_token_ids) in ref_outputs.items(): 103 | # if int(request_id) <= 22: 104 | sampling_params = SamplingParams(n=1, temperature=0.0, presence_penalty=0.0,max_tokens=len(output_token_ids),ignore_eos=True) 105 | engine.add_request(request_id, "", sampling_params, prompt_token_ids) 106 | 107 | outputs: List[RequestOutput] = [] 108 | while True: 109 | request_outputs = engine.step() 110 | for request_output in request_outputs: 111 | if request_output.finished: 112 | outputs.append(request_output) 113 | # print(request_output.outputs[0].token_ids) 114 | if not engine.has_unfinished_requests(): 115 | break 116 | print(f'finished {len(outputs)} requests') 117 | serialize = [] 118 | outputs = sorted(outputs, key=lambda x: x.request_id) 119 | for output in outputs: 120 | serialize.append({ 121 | "id": output.request_id, 122 | "prompt_tokens": output.prompt_token_ids, 123 | "output_tokens": output.outputs[0].token_ids, 124 | "output_text": output.outputs[0].text, 125 | }) 126 | with open("test-outputs.json", "w") as f: 127 | json.dump(serialize, f, separators=(",", ": ")) 128 | return 129 | 130 | if __name__ == '__main__': 131 | parser = argparse.ArgumentParser( 132 | description='Demo on using the LLMEngine class directly') 133 | parser = EngineArgs.add_cli_args(parser) 134 | parser.add_argument('--mode', type=str, default="test", choices=["ref", "test"]) 135 | args = parser.parse_args() 136 | main(args) 137 | -------------------------------------------------------------------------------- /tests/kernels/test_pos_encoding.py: -------------------------------------------------------------------------------- 1 | from typing import Optional, Tuple 2 | 3 | import pytest 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | 8 | from vllm import pos_encoding_ops 9 | 10 | IS_NEOX_STYLE = [True, False] 11 | DTYPES = [torch.half, torch.bfloat16, torch.float] 12 | HEAD_SIZES = [64, 80, 96, 112, 128, 256] 13 | ROTARY_DIMS = [None, 32] # None means rotary dim == head size 14 | NUM_HEADS = [7, 12, 40, 52] # Arbitrary values for testing 15 | NUM_TOKENS = [11, 83, 2048] # Arbitrary values for testing 16 | SEEDS = [0] 17 | 18 | 19 | def rotate_neox(x: torch.Tensor) -> torch.Tensor: 20 | x1 = x[..., :x.shape[-1] // 2] 21 | x2 = x[..., 
x.shape[-1] // 2:] 22 | return torch.cat((-x2, x1), dim=-1) 23 | 24 | 25 | def rotate_gptj(x: torch.Tensor) -> torch.Tensor: 26 | x1 = x[..., ::2] 27 | x2 = x[..., 1::2] 28 | x = torch.stack((-x2, x1), dim=-1) 29 | return x.flatten(-2) 30 | 31 | 32 | def apply_rope( 33 | q: torch.Tensor, 34 | k: torch.Tensor, 35 | cos: torch.Tensor, 36 | sin: torch.Tensor, 37 | is_neox_style: bool, 38 | ) -> Tuple[torch.Tensor, torch.Tensor]: 39 | rotate_fn = rotate_neox if is_neox_style else rotate_gptj 40 | q_embed = (q * cos) + (rotate_fn(q) * sin) 41 | k_embed = (k * cos) + (rotate_fn(k) * sin) 42 | return q_embed, k_embed 43 | 44 | 45 | class RefRotaryEmbedding(nn.Module): 46 | """Reference implementation of rotary embedding.""" 47 | 48 | def __init__( 49 | self, 50 | dim: int, 51 | is_neox_style: bool, 52 | max_position_embeddings: int = 8192, 53 | base: int = 10000, 54 | ) -> None: 55 | super().__init__() 56 | self.rotary_dim = dim 57 | self.is_neox_style = is_neox_style 58 | self.max_position_embeddings = max_position_embeddings 59 | 60 | # Create cos and sin embeddings. 61 | inv_freq = 1.0 / (base**(torch.arange(0, dim, 2) / dim)) 62 | t = torch.arange(max_position_embeddings).float() 63 | freqs = torch.einsum("i,j->ij", t, inv_freq.float()) 64 | if is_neox_style: 65 | emb = torch.cat((freqs, freqs), dim=-1) 66 | else: 67 | emb = torch.repeat_interleave(freqs, 2, -1) 68 | cos = emb.cos().to(dtype=inv_freq.dtype) 69 | sin = emb.sin().to(dtype=inv_freq.dtype) 70 | self.register_buffer("cos_cached", cos, persistent=False) 71 | self.register_buffer("sin_cached", sin, persistent=False) 72 | 73 | def forward( 74 | self, 75 | positions: torch.Tensor, # [num_tokens] 76 | query: torch.Tensor, # [num_tokens, num_heads, head_size] 77 | key: torch.Tensor, # [num_tokens, num_heads, head_size] 78 | ) -> Tuple[torch.Tensor, torch.Tensor]: 79 | query_rot = query[..., :self.rotary_dim] 80 | query_pass = query[..., self.rotary_dim:] 81 | key_rot = key[..., :self.rotary_dim] 82 | key_pass = key[..., self.rotary_dim:] 83 | 84 | query_rot = query_rot.transpose(0, 1) 85 | key_rot = key_rot.transpose(0, 1) 86 | cos = F.embedding(positions, self.cos_cached) 87 | sin = F.embedding(positions, self.sin_cached) 88 | 89 | query_rot, key_rot = apply_rope(query_rot, key_rot, cos, sin, 90 | self.is_neox_style) 91 | query_rot = query_rot.transpose(0, 1).contiguous() 92 | key_rot = key_rot.transpose(0, 1).contiguous() 93 | 94 | query = torch.cat((query_rot, query_pass), dim=-1) 95 | key = torch.cat((key_rot, key_pass), dim=-1) 96 | 97 | # Output query/key shape: [num_tokens, num_tokens, head_size] 98 | return query, key 99 | 100 | 101 | @pytest.mark.parametrize("is_neox_style", IS_NEOX_STYLE) 102 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 103 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 104 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 105 | @pytest.mark.parametrize("rotary_dim", ROTARY_DIMS) 106 | @pytest.mark.parametrize("dtype", DTYPES) 107 | @pytest.mark.parametrize("seed", SEEDS) 108 | @torch.inference_mode() 109 | def test_rotary_embedding( 110 | is_neox_style: bool, 111 | num_tokens: int, 112 | num_heads: int, 113 | head_size: int, 114 | rotary_dim: Optional[int], 115 | dtype: torch.dtype, 116 | seed: int, 117 | max_position: int = 8192, 118 | base: int = 10000, 119 | ) -> None: 120 | if rotary_dim is None: 121 | rotary_dim = head_size 122 | torch.random.manual_seed(seed) 123 | torch.cuda.manual_seed(seed) 124 | 125 | positions = torch.randint(0, max_position, (num_tokens, ), device="cuda") 126 | query 
= torch.randn(num_tokens, 127 | num_heads * head_size, 128 | dtype=dtype, 129 | device="cuda") 130 | key = torch.randn(num_tokens, 131 | num_heads * head_size, 132 | dtype=dtype, 133 | device="cuda") 134 | 135 | # Create the rotary embedding. 136 | inv_freq = 1.0 / (base**( 137 | torch.arange(0, rotary_dim, 2, dtype=torch.float) / rotary_dim)) 138 | t = torch.arange(max_position).float() 139 | freqs = torch.einsum("i,j -> ij", t, inv_freq) 140 | cos = freqs.cos() 141 | sin = freqs.sin() 142 | cos_sin_cache = torch.cat((cos, sin), dim=-1) 143 | cos_sin_cache = cos_sin_cache.to(dtype=dtype, device="cuda") 144 | 145 | # Run the kernel. The kernel is in-place, so we need to clone the inputs. 146 | out_query = query.clone() 147 | out_key = key.clone() 148 | pos_encoding_ops.rotary_embedding( 149 | positions, 150 | out_query, 151 | out_key, 152 | head_size, 153 | cos_sin_cache, 154 | is_neox_style, 155 | ) 156 | 157 | # Run the reference implementation. 158 | ref_rotary_embedding = RefRotaryEmbedding( 159 | dim=rotary_dim, 160 | is_neox_style=is_neox_style, 161 | max_position_embeddings=max_position, 162 | base=base, 163 | ).to(dtype=dtype, device="cuda") 164 | ref_query, ref_key = ref_rotary_embedding( 165 | positions, 166 | query.view(num_tokens, num_heads, head_size), 167 | key.view(num_tokens, num_heads, head_size), 168 | ) 169 | ref_query = ref_query.view(num_tokens, num_heads * head_size) 170 | ref_key = ref_key.view(num_tokens, num_heads * head_size) 171 | 172 | # Compare the results. 173 | assert torch.allclose(out_query, ref_query, atol=1e-5, rtol=1e-5) 174 | assert torch.allclose(out_key, ref_key, atol=1e-5, rtol=1e-5) 175 | --------------------------------------------------------------------------------