├── .github
│   └── workflows
│       ├── publish.yml
│       ├── ruff.yml
│       ├── scripts
│       │   ├── build.sh
│       │   ├── create_release.js
│       │   ├── cuda-install.sh
│       │   ├── env.sh
│       │   └── pytorch-install.sh
│       └── yapf.yml
├── .gitignore
├── .readthedocs.yaml
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── MANIFEST.in
├── README.md
├── benchmarks
│   ├── README.md
│   ├── benchmark_latency.py
│   ├── benchmark_serving.py
│   ├── benchmark_throughput.py
│   ├── kernels
│   │   └── benchmark_paged_attention.py
│   └── launch_tgi_server.sh
├── csrc
│   ├── activation_kernels.cu
│   ├── attention
│   │   ├── attention_dtypes.h
│   │   ├── attention_generic.cuh
│   │   ├── attention_kernels.cu
│   │   ├── attention_utils.cuh
│   │   ├── dtype_bfloat16.cuh
│   │   ├── dtype_float16.cuh
│   │   └── dtype_float32.cuh
│   ├── cache.h
│   ├── cache_kernels.cu
│   ├── cuda_utils.h
│   ├── cuda_utils_kernels.cu
│   ├── dispatch_utils.h
│   ├── layernorm_kernels.cu
│   ├── ops.h
│   ├── pos_encoding_kernels.cu
│   ├── pybind.cpp
│   ├── quantization
│   │   ├── awq
│   │   │   ├── dequantize.cuh
│   │   │   └── gemm_kernels.cu
│   │   └── squeezellm
│   │       └── quant_cuda_kernel.cu
│   └── reduction_utils.cuh
├── docs
│   ├── Makefile
│   ├── README.md
│   ├── make.bat
│   ├── requirements-docs.txt
│   └── source
│       ├── assets
│       │   └── logos
│       │       ├── vllm-logo-only-light.png
│       │       ├── vllm-logo-text-dark.png
│       │       └── vllm-logo-text-light.png
│       ├── conf.py
│       ├── getting_started
│       │   ├── installation.rst
│       │   └── quickstart.rst
│       ├── index.rst
│       ├── models
│       │   ├── adding_model.rst
│       │   ├── engine_args.rst
│       │   └── supported_models.rst
│       ├── quantization
│       │   └── auto_awq.rst
│       └── serving
│           ├── deploying_with_docker.rst
│           ├── deploying_with_triton.rst
│           ├── distributed_serving.rst
│           └── run_on_sky.rst
├── examples
│   ├── api_client.py
│   ├── gradio_webserver.py
│   ├── llm_engine_example.py
│   ├── offline_inference.py
│   ├── openai_chatcompletion_client.py
│   └── openai_completion_client.py
├── format.sh
├── mypy.ini
├── pyproject.toml
├── requirements-dev.txt
├── requirements.txt
├── setup.py
├── tests
│   ├── __init__.py
│   ├── async_engine
│   │   ├── api_server_async_engine.py
│   │   ├── test_api_server.py
│   │   ├── test_async_llm_engine.py
│   │   └── test_request_tracker.py
│   ├── conftest.py
│   ├── distributed
│   │   └── test_comm_ops.py
│   ├── engine
│   │   └── test_detokenize.py
│   ├── kernels
│   │   ├── conftest.py
│   │   ├── test_activation.py
│   │   ├── test_attention.py
│   │   ├── test_cache.py
│   │   ├── test_layernorm.py
│   │   └── test_pos_encoding.py
│   ├── models
│   │   └── test_models.py
│   ├── samplers
│   │   ├── test_beam_search.py
│   │   ├── test_logprobs.py
│   │   └── test_sampler.py
│   ├── test_regression.py
│   └── worker
│       └── test_worker.py
└── vllm
    ├── __init__.py
    ├── block.py
    ├── config.py
    ├── core
    │   ├── __init__.py
    │   ├── block_manager.py
    │   ├── policy.py
    │   └── scheduler.py
    ├── engine
    │   ├── __init__.py
    │   ├── arg_utils.py
    │   ├── async_llm_engine.py
    │   ├── llm_engine.py
    │   └── ray_utils.py
    ├── entrypoints
    │   ├── __init__.py
    │   ├── api_server.py
    │   ├── llm.py
    │   └── openai
    │       ├── __init__.py
    │       ├── api_server.py
    │       └── protocol.py
    ├── logger.py
    ├── model_executor
    │   ├── __init__.py
    │   ├── input_metadata.py
    │   ├── layers
    │   │   ├── __init__.py
    │   │   ├── activation.py
    │   │   ├── attention.py
    │   │   ├── layernorm.py
    │   │   ├── linear.py
    │   │   ├── quantization
    │   │   │   ├── __init__.py
    │   │   │   ├── awq.py
    │   │   │   ├── base_config.py
    │   │   │   └── squeezellm.py
    │   │   ├── rotary_embedding.py
    │   │   ├── sampler.py
    │   │   └── vocab_parallel_embedding.py
    │   ├── model_loader.py
    │   ├── models
    │   │   ├── __init__.py
    │   │   ├── aquila.py
    │   │   ├── baichuan.py
    │   │   ├── bloom.py
    │   │   ├── chatglm.py
    │   │   ├── falcon.py
    │   │   ├── gpt2.py
    │   │   ├── gpt_bigcode.py
    │   │   ├── gpt_j.py
    │   │   ├── gpt_neox.py
    │   │   ├── internlm.py
    │   │   ├── llama.py
    │   │   ├── mistral.py
    │   │   ├── mpt.py
    │   │   ├── opt.py
    │   │   ├── phi_1_5.py
    │   │   ├── qwen.py
    │   │   └── yi.py
    │   ├── parallel_utils
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── communication_op.py
    │   │   ├── parallel_state.py
    │   │   └── utils.py
    │   ├── utils.py
    │   └── weight_utils.py
    ├── outputs.py
    ├── py.typed
    ├── sampling_params.py
    ├── sequence.py
    ├── transformers_utils
    │   ├── __init__.py
    │   ├── config.py
    │   ├── configs
    │   │   ├── __init__.py
    │   │   ├── aquila.py
    │   │   ├── baichuan.py
    │   │   ├── chatglm.py
    │   │   ├── falcon.py
    │   │   ├── mpt.py
    │   │   ├── qwen.py
    │   │   └── yi.py
    │   └── tokenizer.py
    ├── utils.py
    └── worker
        ├── __init__.py
        ├── cache_engine.py
        └── worker.py
/.github/workflows/publish.yml:
--------------------------------------------------------------------------------
1 | # This workflow will build the Python package and upload it as a release asset
2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
3 |
4 | name: Create Release
5 |
6 | on:
7 | push:
8 | tags:
9 | - v*
10 |
11 | # Needed to create release and upload assets
12 | permissions:
13 | contents: write
14 |
15 | jobs:
16 | release:
17 | # Retrieve tag and create release
18 | name: Create Release
19 | runs-on: ubuntu-latest
20 | outputs:
21 | upload_url: ${{ steps.create_release.outputs.upload_url }}
22 | steps:
23 | - name: Checkout
24 | uses: actions/checkout@v3
25 |
26 | - name: Extract branch info
27 | shell: bash
28 | run: |
29 | echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
30 |
31 | - name: Create Release
32 | id: create_release
33 | uses: "actions/github-script@v6"
34 | env:
35 | RELEASE_TAG: ${{ env.release_tag }}
36 | with:
37 | github-token: "${{ secrets.GITHUB_TOKEN }}"
38 | script: |
39 | const script = require('.github/workflows/scripts/create_release.js')
40 | await script(github, context, core)
41 |
42 | wheel:
43 | name: Build Wheel
44 | runs-on: ${{ matrix.os }}
45 | needs: release
46 |
47 | strategy:
48 | fail-fast: false
49 | matrix:
50 | os: ['ubuntu-20.04']
51 | python-version: ['3.8', '3.9', '3.10', '3.11']
52 | pytorch-version: ['2.1.0']
53 | cuda-version: ['11.8', '12.1']
54 |
55 | steps:
56 | - name: Checkout
57 | uses: actions/checkout@v3
58 |
59 | - name: Set up Linux Env
60 | if: ${{ runner.os == 'Linux' }}
61 | run: |
62 | bash -x .github/workflows/scripts/env.sh
63 |
64 | - name: Set up Python
65 | uses: actions/setup-python@v4
66 | with:
67 | python-version: ${{ matrix.python-version }}
68 |
69 | - name: Install CUDA ${{ matrix.cuda-version }}
70 | run: |
71 | bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
72 |
73 | - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
74 | run: |
75 | bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
76 |
77 | - name: Build wheel
78 | shell: bash
79 | run: |
80 | bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
81 | wheel_name=$(ls dist/*whl | xargs -n 1 basename)
82 | asset_name=${wheel_name//"linux"/"manylinux1"}
83 | echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
84 | echo "asset_name=${asset_name}" >> $GITHUB_ENV
85 |
86 | - name: Upload Release Asset
87 | uses: actions/upload-release-asset@v1
88 | env:
89 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
90 | with:
91 | upload_url: ${{ needs.release.outputs.upload_url }}
92 | asset_path: ./dist/${{ env.wheel_name }}
93 | asset_name: ${{ env.asset_name }}
94 | asset_content_type: application/*
95 |
96 | # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
97 | # - name: Publish package
98 | # uses: pypa/gh-action-pypi-publish@release/v1.8
99 | # with:
100 | # repository-url: https://test.pypi.org/legacy/
101 | # password: ${{ secrets.PYPI_API_TOKEN }}
102 | # skip-existing: true
103 |
--------------------------------------------------------------------------------
/.github/workflows/ruff.yml:
--------------------------------------------------------------------------------
1 | name: ruff
2 |
3 | on:
4 | # Trigger the workflow on push or pull request,
5 | # but only for the main branch
6 | push:
7 | branches:
8 | - main
9 | pull_request:
10 | branches:
11 | - main
12 |
13 | jobs:
14 | ruff:
15 | runs-on: ubuntu-latest
16 | strategy:
17 | matrix:
18 | python-version: ["3.10"]
19 | steps:
20 | - uses: actions/checkout@v2
21 | - name: Set up Python ${{ matrix.python-version }}
22 | uses: actions/setup-python@v2
23 | with:
24 | python-version: ${{ matrix.python-version }}
25 | - name: Install dependencies
26 | run: |
27 | python -m pip install --upgrade pip
28 | pip install ruff==0.1.5
29 | - name: Analysing the code with ruff
30 | run: |
31 | ruff vllm tests
32 |
--------------------------------------------------------------------------------
/.github/workflows/scripts/build.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python_executable=python$1
4 | cuda_home=/usr/local/cuda-$2
5 |
6 | # Update paths
7 | PATH=${cuda_home}/bin:$PATH
8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
9 |
10 | # Install requirements
11 | $python_executable -m pip install wheel packaging
12 | $python_executable -m pip install -r requirements.txt
13 |
14 | # Limit the number of parallel jobs to avoid OOM
15 | export MAX_JOBS=1
16 |
17 | # Build
18 | $python_executable setup.py bdist_wheel --dist-dir=dist
19 |
--------------------------------------------------------------------------------
/.github/workflows/scripts/create_release.js:
--------------------------------------------------------------------------------
1 | // Uses Github's API to create the release and wait for result.
2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
3 |
4 | module.exports = async (github, context, core) => {
5 | try {
6 | const response = await github.rest.repos.createRelease({
7 | draft: false,
8 | generate_release_notes: true,
9 | name: process.env.RELEASE_TAG,
10 | owner: context.repo.owner,
11 | prerelease: false,
12 | repo: context.repo.repo,
13 | tag_name: process.env.RELEASE_TAG,
14 | });
15 |
16 | core.setOutput('upload_url', response.data.upload_url);
17 | } catch (error) {
18 | core.setFailed(error.message);
19 | }
20 | }
--------------------------------------------------------------------------------
/.github/workflows/scripts/cuda-install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Replace '.' with '-' ex: 11.8 -> 11-8
4 | cuda_version=$(echo $1 | tr "." "-")
5 | # Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
6 | OS=$(echo $2 | tr -d ".\-")
7 |
8 | # Installs CUDA
9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb
11 | rm cuda-keyring_1.1-1_all.deb
12 | sudo apt -qq update
13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
14 | sudo apt clean
15 |
16 | # Test nvcc
17 | PATH=/usr/local/cuda-$1/bin:${PATH}
18 | nvcc --version
19 |
20 | # Log gcc, g++, c++ versions
21 | gcc --version
22 | g++ --version
23 | c++ --version
24 |
--------------------------------------------------------------------------------
/.github/workflows/scripts/env.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # This file installs common linux environment tools
4 |
5 | export LANG=C.UTF-8
6 |
7 | # python_version=$1
8 |
9 | sudo apt-get update && \
10 | sudo apt-get install -y --no-install-recommends \
11 | software-properties-common \
12 |
13 | sudo apt-get install -y --no-install-recommends \
14 | build-essential \
15 | apt-utils \
16 | ca-certificates \
17 | wget \
18 | git \
19 | vim \
20 | libssl-dev \
21 | curl \
22 | unzip \
23 | unrar \
24 | cmake \
25 | net-tools \
26 | sudo \
27 | autotools-dev \
28 | rsync \
29 | jq \
30 | openssh-server \
31 | tmux \
32 | screen \
33 | htop \
34 | pdsh \
35 | openssh-client \
36 | lshw \
37 | dmidecode \
38 | util-linux \
39 | automake \
40 | autoconf \
41 | libtool \
42 | net-tools \
43 | pciutils \
44 | libpci-dev \
45 | libaio-dev \
46 | libcap2 \
47 | libtinfo5 \
48 | fakeroot \
49 | devscripts \
50 | debhelper \
51 | nfs-common
52 |
53 | # Remove github bloat files to free up disk space
54 | sudo rm -rf "/usr/local/share/boost"
55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY"
56 | sudo rm -rf "/usr/share/dotnet"
57 |
--------------------------------------------------------------------------------
/.github/workflows/scripts/pytorch-install.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python_executable=python$1
4 | pytorch_version=$2
5 | cuda_version=$3
6 |
7 | # Install torch
8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
10 |
11 | # Print version information
12 | $python_executable --version
13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)"
14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
16 |
--------------------------------------------------------------------------------
/.github/workflows/yapf.yml:
--------------------------------------------------------------------------------
1 | name: yapf
2 |
3 | on:
4 | # Trigger the workflow on push or pull request,
5 | # but only for the main branch
6 | push:
7 | branches:
8 | - main
9 | pull_request:
10 | branches:
11 | - main
12 | jobs:
13 | yapf:
14 | runs-on: ubuntu-latest
15 | strategy:
16 | matrix:
17 | python-version: ["3.10"]
18 | steps:
19 | - uses: actions/checkout@v2
20 | - name: Set up Python ${{ matrix.python-version }}
21 | uses: actions/setup-python@v2
22 | with:
23 | python-version: ${{ matrix.python-version }}
24 | - name: Install dependencies
25 | run: |
26 | python -m pip install --upgrade pip
27 | pip install yapf==0.32.0
28 | pip install toml==0.10.2
29 | - name: Running yapf
30 | run: |
31 | yapf --diff --recursive vllm tests
32 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | .idea/
161 |
162 | # VSCode
163 | .vscode/
164 |
165 | # DS Store
166 | .DS_Store
167 |
168 | # Results
169 | *.csv
170 |
171 | # Python pickle files
172 | *.pkl
173 |
174 | # Sphinx documentation
175 | _build/
176 |
177 | # vim swap files
178 | *.swo
179 | *.swp
180 |
--------------------------------------------------------------------------------
/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | # Read the Docs configuration file
2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
3 |
4 | version: 2
5 |
6 | build:
7 | os: ubuntu-22.04
8 | tools:
9 | python: "3.8"
10 |
11 | sphinx:
12 | configuration: docs/source/conf.py
13 |
14 | # If using Sphinx, optionally build your docs in additional formats such as PDF
15 | formats:
16 | - pdf
17 |
18 | # Optionally declare the Python requirements required to build your docs
19 | python:
20 | install:
21 | - requirements: docs/requirements-docs.txt
22 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to vLLM
2 |
3 | Thank you for your interest in contributing to vLLM!
4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large.
5 | There are several ways you can contribute to the project:
6 |
7 | - Identify and report any issues or bugs.
8 | - Request or add a new model.
9 | - Suggest or implement new features.
10 |
11 | However, remember that contributions aren't just about code.
12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions.
13 |
14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM.
15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects.
16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository.
17 |
18 |
19 | ## Setup for development
20 |
21 | ### Build from source
22 |
23 | ```bash
24 | pip install -r requirements.txt
25 | pip install -e . # This may take several minutes.
26 | ```
27 |
28 | ### Testing
29 |
30 | ```bash
31 | pip install -r requirements-dev.txt
32 |
33 | # Static type checking
34 | mypy
35 | # Unit tests
36 | pytest tests/
37 | ```
38 | **Note:** Currently, the repository does not pass the mypy tests.
39 |
40 |
41 | ## Contributing Guidelines
42 |
43 | ### Issue Reporting
44 |
45 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it.
46 | If not, please file a new issue, providing as much relevant information as possible.
47 |
48 | ### Coding Style Guide
49 |
50 | In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html).
51 |
52 | We include a formatting script [`format.sh`](./format.sh) to format the code.
53 |
54 | ### Pull Requests
55 |
56 | When submitting a pull request:
57 |
58 | 1. Make sure your code has been rebased on top of the latest commit on the main branch.
59 | 2. Ensure code is properly formatted by running [`format.sh`](./format.sh).
60 | 3. Include a detailed description of the changes in the pull request.
61 | Explain why you made the changes you did.
62 | If your pull request fixes an open issue, please include a reference to it in the description.
63 |
64 | ### Code Reviews
65 |
66 | All submissions, including submissions by project members, require a code review.
67 | To make the review process as smooth as possible, please:
68 |
69 | 1. Keep your changes as concise as possible.
70 | If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests.
71 | 2. Respond to all comments within a reasonable time frame.
72 | If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion.
73 |
74 | ### Thank You
75 |
76 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
77 | Your contributions make vLLM a great tool for everyone!
78 |
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
2 |
3 | RUN apt-get update -y \
4 | && apt-get install -y python3-pip
5 |
6 | WORKDIR /workspace
7 |
8 | # install build and runtime dependencies
9 | COPY requirements.txt requirements.txt
10 | RUN --mount=type=cache,target=/root/.cache/pip \
11 | pip install -r requirements.txt
12 |
13 | # install development dependencies
14 | COPY requirements-dev.txt requirements-dev.txt
15 | RUN --mount=type=cache,target=/root/.cache/pip \
16 | pip install -r requirements-dev.txt
17 |
18 | # image to build pytorch extensions
19 | FROM dev AS build
20 |
21 | # copy input files
22 | COPY csrc csrc
23 | COPY setup.py setup.py
24 | COPY requirements.txt requirements.txt
25 | COPY pyproject.toml pyproject.toml
26 | COPY vllm/__init__.py vllm/__init__.py
27 |
28 | # max jobs used by Ninja to build extensions
29 | ENV MAX_JOBS=$max_jobs
30 | RUN python3 setup.py build_ext --inplace
31 |
32 | # image to run unit testing suite
33 | FROM dev AS test
34 |
35 | # copy pytorch extensions separately to avoid having to rebuild
36 | # when python code changes
37 | COPY --from=build /workspace/vllm/*.so /workspace/vllm/
38 | COPY tests tests
39 | COPY vllm vllm
40 |
41 | ENTRYPOINT ["python3", "-m", "pytest", "tests"]
42 |
43 | # use CUDA base as CUDA runtime dependencies are already installed via pip
44 | FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base
45 |
46 | # libnccl required for ray
47 | RUN apt-get update -y \
48 | && apt-get install -y python3-pip
49 |
50 | WORKDIR /workspace
51 | COPY requirements.txt requirements.txt
52 | RUN --mount=type=cache,target=/root/.cache/pip \
53 | pip install -r requirements.txt
54 |
55 | FROM vllm-base AS vllm
56 | COPY --from=build /workspace/vllm/*.so /workspace/vllm/
57 | COPY vllm vllm
58 |
59 | EXPOSE 8000
60 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"]
61 |
62 | # openai api server alternative
63 | FROM vllm-base AS vllm-openai
64 | # install additional dependencies for openai api server
65 | RUN --mount=type=cache,target=/root/.cache/pip \
66 | pip install accelerate fschat
67 |
68 | COPY --from=build /workspace/vllm/*.so /workspace/vllm/
69 | COPY vllm vllm
70 |
71 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
72 |
73 |
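The final `vllm-openai` stage launches `vllm.entrypoints.openai.api_server` on the exposed port 8000. A minimal client sketch in Python, assuming a container built from that stage is running locally with a model such as `facebook/opt-125m` and that the server exposes the standard OpenAI-style `/v1/completions` route:

```python
import requests

# Query the OpenAI-compatible completions endpoint served by the container.
response = requests.post(
    "http://localhost:8000/v1/completions",
    json={
        "model": "facebook/opt-125m",  # must match the model the server was started with
        "prompt": "San Francisco is a",
        "max_tokens": 32,
        "temperature": 0.7,
    },
)
print(response.json()["choices"][0]["text"])
```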
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include LICENSE
2 | include requirements.txt
3 |
4 | recursive-include csrc *
5 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | <p align="center">
2 |   <picture>
3 |     <source media="(prefers-color-scheme: dark)" srcset="docs/source/assets/logos/vllm-logo-text-dark.png">
4 |     <img alt="vLLM" src="docs/source/assets/logos/vllm-logo-text-light.png" width="55%">
5 |   </picture>
6 | </p>
7 |
8 | <h3 align="center">
9 | Easy, fast, and cheap LLM serving for everyone
10 | </h3>
11 |
12 | <p align="center">
13 | | <a href="https://vllm.readthedocs.io/en/latest/"><b>Documentation</b></a> | <a href="https://vllm.ai"><b>Blog</b></a> | <a href="https://arxiv.org/abs/2309.06180"><b>Paper</b></a> | <a href="https://discord.gg/jz7wjKhh6g"><b>Discord</b></a> |
14 |
15 | </p>
16 |
17 | ---
18 |
19 | *Latest News* 🔥
20 | - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing).
21 | - [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there.
22 | - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv!
23 | - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM.
24 | - [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command!
25 | - [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds.
26 | - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai).
27 |
28 | ---
29 |
30 | vLLM is a fast and easy-to-use library for LLM inference and serving.
31 |
32 | vLLM is fast with:
33 |
34 | - State-of-the-art serving throughput
35 | - Efficient management of attention key and value memory with **PagedAttention**
36 | - Continuous batching of incoming requests
37 | - Optimized CUDA kernels
38 |
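PagedAttention stores each sequence's KV cache in fixed-size blocks that need not be contiguous in GPU memory; a per-sequence block table maps logical block indices to physical ones, so memory is allocated on demand as tokens are generated. A toy Python sketch of that bookkeeping (illustrative only, not the actual logic in `vllm/core/block_manager.py`):

```python
# Toy illustration of the PagedAttention block-table idea (not vLLM's code).
BLOCK_SIZE = 16  # tokens stored per KV-cache block

class ToyBlockTable:
    def __init__(self):
        self.physical_blocks = []  # logical block i -> physical block id

    def append_token(self, free_blocks, num_tokens_so_far):
        # A new physical block is needed only when the last one is full.
        if num_tokens_so_far % BLOCK_SIZE == 0:
            self.physical_blocks.append(free_blocks.pop())

    def physical_slot(self, token_pos):
        # Map a logical token position to (physical block id, offset).
        return (self.physical_blocks[token_pos // BLOCK_SIZE],
                token_pos % BLOCK_SIZE)

free = list(range(1024))        # pool of free physical block ids
table = ToyBlockTable()
for pos in range(40):           # simulate appending 40 tokens
    table.append_token(free, pos)
print(table.physical_slot(37))  # -> (1021, 5)
```
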
39 | vLLM is flexible and easy to use with:
40 |
41 | - Seamless integration with popular Hugging Face models
42 | - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more
43 | - Tensor parallelism support for distributed inference
44 | - Streaming outputs
45 | - OpenAI-compatible API server
46 |
47 | vLLM seamlessly supports many Hugging Face models, including the following architectures:
48 |
49 | - Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
50 | - Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.)
51 | - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
52 | - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
53 | - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
54 | - GPT-2 (`gpt2`, `gpt2-xl`, etc.)
55 | - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
56 | - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
57 | - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
58 | - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
59 | - LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
60 | - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
61 | - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
62 | - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
63 | - Phi-1.5 (`microsoft/phi-1_5`, etc.)
64 | - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
65 | - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
66 |
67 | Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
68 |
69 | ```bash
70 | pip install vllm
71 | ```
72 |
73 | ## Getting Started
74 |
75 | Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
76 | - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
77 | - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
78 | - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
79 |
80 | ## Contributing
81 |
82 | We welcome and value any contributions and collaborations.
83 | Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
84 |
85 | ## Citation
86 |
87 | If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
88 | ```bibtex
89 | @inproceedings{kwon2023efficient,
90 | title={Efficient Memory Management for Large Language Model Serving with PagedAttention},
91 | author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica},
92 | booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles},
93 | year={2023}
94 | }
95 | ```
96 |
--------------------------------------------------------------------------------
/benchmarks/README.md:
--------------------------------------------------------------------------------
1 | # Benchmarking vLLM
2 |
3 | ## Downloading the ShareGPT dataset
4 |
5 | You can download the dataset by running:
6 | ```bash
7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
8 | ```
9 |
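Once downloaded, the file is a JSON list of conversations. A minimal sketch of pulling out the human-side prompts for benchmarking, assuming the usual ShareGPT schema in which each record carries a `conversations` list of `{"from", "value"}` turns:

```python
import json

# Collect the human turns from the ShareGPT dump to use as benchmark prompts.
with open("ShareGPT_V3_unfiltered_cleaned_split.json") as f:
    dataset = json.load(f)

prompts = [
    turn["value"]
    for record in dataset
    for turn in record.get("conversations", [])
    if turn.get("from") == "human"
]
print(f"Loaded {len(prompts)} prompts; first: {prompts[0][:80]!r}")
```
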
--------------------------------------------------------------------------------
/benchmarks/benchmark_latency.py:
--------------------------------------------------------------------------------
1 | """Benchmark the latency of processing a single batch of requests."""
2 | import argparse
3 | import time
4 |
5 | import numpy as np
6 | import torch
7 | from tqdm import tqdm
8 |
9 | from vllm import LLM, SamplingParams
10 |
11 |
12 | def main(args: argparse.Namespace):
13 | print(args)
14 |
15 | # Process all the requests in a single batch if possible.
16 | # NOTE(woosuk): If the request cannot be processed in a single batch,
17 | # the engine will automatically process the request in multiple batches.
18 | llm = LLM(
19 | model=args.model,
20 | tokenizer=args.tokenizer,
21 | quantization=args.quantization,
22 | tensor_parallel_size=args.tensor_parallel_size,
23 | max_num_seqs=args.batch_size,
24 | max_num_batched_tokens=args.batch_size * args.input_len,
25 | trust_remote_code=args.trust_remote_code,
26 | dtype=args.dtype,
27 | )
28 |
29 | sampling_params = SamplingParams(
30 | n=args.n,
31 | temperature=0.0 if args.use_beam_search else 1.0,
32 | top_p=1.0,
33 | use_beam_search=args.use_beam_search,
34 | ignore_eos=True,
35 | max_tokens=args.output_len,
36 | )
37 | print(sampling_params)
38 | dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size
39 |
40 | def run_to_completion(profile: bool = False):
41 | if profile:
42 | torch.cuda.cudart().cudaProfilerStart()
43 | start_time = time.perf_counter()
44 |
45 | llm.generate(prompt_token_ids=dummy_prompt_token_ids,
46 | sampling_params=sampling_params,
47 | use_tqdm=False)
48 |
49 | end_time = time.perf_counter()
50 | latency = end_time - start_time
51 | if profile:
52 | torch.cuda.cudart().cudaProfilerStop()
53 | return latency
54 |
55 | print("Warming up...")
56 | run_to_completion(profile=False)
57 |
58 | # Benchmark.
59 | latencies = []
60 | for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
61 | latencies.append(run_to_completion(profile=False))
62 | print(f'Avg latency: {np.mean(latencies)} seconds')
63 |
64 |
65 | if __name__ == '__main__':
66 | parser = argparse.ArgumentParser(
67 | description='Benchmark the latency of processing a single batch of '
68 | 'requests till completion.')
69 | parser.add_argument('--model', type=str, default='facebook/opt-125m')
70 | parser.add_argument('--tokenizer', type=str, default=None)
71 | parser.add_argument('--quantization',
72 | '-q',
73 | choices=['awq', 'squeezellm', None],
74 | default=None)
75 | parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
76 | parser.add_argument('--input-len', type=int, default=32)
77 | parser.add_argument('--output-len', type=int, default=128)
78 | parser.add_argument('--batch-size', type=int, default=8)
79 | parser.add_argument('--n',
80 | type=int,
81 | default=1,
82 | help='Number of generated sequences per prompt.')
83 | parser.add_argument('--use-beam-search', action='store_true')
84 | parser.add_argument('--num-iters',
85 | type=int,
86 | default=3,
87 | help='Number of iterations to run.')
88 | parser.add_argument('--trust-remote-code',
89 | action='store_true',
90 | help='trust remote code from huggingface')
91 | parser.add_argument(
92 | '--dtype',
93 | type=str,
94 | default='auto',
95 | choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
96 | help='data type for model weights and activations. '
97 | 'The "auto" option will use FP16 precision '
98 | 'for FP32 and FP16 models, and BF16 precision '
99 | 'for BF16 models.')
100 | args = parser.parse_args()
101 | main(args)
102 |
--------------------------------------------------------------------------------
/benchmarks/launch_tgi_server.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | PORT=8000
4 | MODEL=$1
5 | TOKENS=$2
6 |
7 | docker run --gpus all --shm-size 1g -p $PORT:80 \
8 | -v $PWD/data:/data \
9 | ghcr.io/huggingface/text-generation-inference:0.8 \
10 | --model-id $MODEL \
11 | --sharded false \
12 | --max-input-length 1024 \
13 | --max-total-tokens 2048 \
14 | --max-best-of 5 \
15 | --max-concurrent-requests 5000 \
16 | --max-batch-total-tokens $TOKENS
17 |
--------------------------------------------------------------------------------
/csrc/activation_kernels.cu:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | #include <ATen/cuda/CUDAContext.h>
3 |
4 | #include "dispatch_utils.h"
5 |
6 | namespace vllm {
7 |
8 | template<typename T>
9 | __device__ __forceinline__ T silu(const T& x) {
10 | // x * sigmoid(x)
11 | return (T) (((float) x) / (1.0f + expf((float) -x)));
12 | }
13 |
14 | template<typename scalar_t>
15 | __global__ void silu_and_mul_kernel(
16 | scalar_t* __restrict__ out, // [..., d]
17 | const scalar_t* __restrict__ input, // [..., 2, d]
18 | const int d) {
19 | const int64_t token_idx = blockIdx.x;
20 | for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
21 | const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]);
22 | const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]);
23 | out[token_idx * d + idx] = silu(x) * y;
24 | }
25 | }
26 |
27 | } // namespace vllm
28 |
29 | void silu_and_mul(
30 | torch::Tensor& out, // [..., d]
31 | torch::Tensor& input) // [..., 2 * d]
32 | {
33 | int64_t num_tokens = input.numel() / input.size(-1);
34 | int d = input.size(-1) / 2;
35 |
36 | dim3 grid(num_tokens);
37 | dim3 block(std::min(d, 1024));
38 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
39 | VLLM_DISPATCH_FLOATING_TYPES(
40 | input.scalar_type(),
41 | "silu_and_mul_kernel",
42 | [&] {
43 | vllm::silu_and_mul_kernel<scalar_t><<<grid, block, 0, stream>>>(
44 | out.data_ptr<scalar_t>(),
45 | input.data_ptr<scalar_t>(),
46 | d);
47 | });
48 | }
49 |
50 | namespace vllm {
51 |
52 | // Element-wise activation kernel template.
53 | template<typename scalar_t, scalar_t (*ACT_FN)(const scalar_t&)>
54 | __global__ void activation_kernel(
55 | scalar_t* __restrict__ out, // [..., d]
56 | const scalar_t* __restrict__ input, // [..., d]
57 | const int d) {
58 | const int64_t token_idx = blockIdx.x;
59 | for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
60 | const scalar_t x = __ldg(&input[token_idx * d + idx]);
61 | out[token_idx * d + idx] = ACT_FN(x);
62 | }
63 | }
64 |
65 | } // namespace vllm
66 |
67 | // Launch element-wise activation kernel.
68 | #define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
69 | int d = input.size(-1); \
70 | int64_t num_tokens = input.numel() / d; \
71 | dim3 grid(num_tokens); \
72 | dim3 block(std::min(d, 1024)); \
73 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
74 | VLLM_DISPATCH_FLOATING_TYPES( \
75 | input.scalar_type(), \
76 | "activation_kernel", \
77 | [&] { \
78 | vllm::activation_kernel<scalar_t, KERNEL<scalar_t>><<<grid, block, 0, stream>>>( \
79 | out.data_ptr<scalar_t>(), \
80 | input.data_ptr<scalar_t>(), \
81 | d); \
82 | });
83 |
84 | namespace vllm {
85 |
86 | template<typename T>
87 | __device__ __forceinline__ T gelu_new_kernel(const T& x) {
88 | const float x3 = (float) (x * x * x);
89 | const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
90 | return ((T) 0.5) * x * (((T) 1.0) + t);
91 | }
92 |
93 | template<typename T>
94 | __device__ __forceinline__ T gelu_fast_kernel(const T& x) {
95 | const float f = (float) x;
96 | const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
97 | return ((T) 0.5) * x * (((T) 1.0) + t);
98 | }
99 |
100 | } // namespace vllm
101 |
102 | void gelu_new(
103 | torch::Tensor& out, // [..., d]
104 | torch::Tensor& input) // [..., d]
105 | {
106 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
107 | }
108 |
109 | void gelu_fast(
110 | torch::Tensor& out, // [..., d]
111 | torch::Tensor& input) // [..., d]
112 | {
113 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
114 | }
115 |
--------------------------------------------------------------------------------
/csrc/attention/attention_dtypes.h:
--------------------------------------------------------------------------------
1 | #pragma once
2 |
3 | #include "attention_generic.cuh"
4 | #include "dtype_float16.cuh"
5 | #include "dtype_float32.cuh"
6 | #include "dtype_bfloat16.cuh"
7 |
--------------------------------------------------------------------------------
/csrc/attention/attention_generic.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
3 | * Copyright (c) 2023, The vLLM team.
4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | #pragma once
19 |
20 | #include <stdint.h>
21 |
22 | namespace vllm {
23 |
24 | // A vector type to store Q, K, V elements.
25 | template<typename T, int VEC_SIZE>
26 | struct Vec {};
27 |
28 | // A vector type to store FP32 accumulators.
29 | template<typename T>
30 | struct FloatVec {};
31 |
32 | // Template vector operations.
33 | template<typename Acc, typename A, typename B>
34 | inline __device__ Acc mul(A a, B b);
35 |
36 | template<typename T>
37 | inline __device__ float sum(T v);
38 |
39 | template<typename T>
40 | inline __device__ float dot(T a, T b) {
41 | return sum(mul<T, T, T>(a, b));
42 | }
43 |
44 | template<typename A, typename T>
45 | inline __device__ float dot(T a, T b) {
46 | return sum(mul<A, T, T>(a, b));
47 | }
48 |
49 | template<typename T>
50 | inline __device__ void zero(T& dst) {
51 | constexpr int WORDS = sizeof(T) / 4;
52 | union {
53 | T raw;
54 | uint32_t words[WORDS];
55 | } tmp;
56 |
57 | #pragma unroll
58 | for (int ii = 0; ii < WORDS; ++ii) {
59 | tmp.words[ii] = 0u;
60 | }
61 | dst = tmp.raw;
62 | }
63 |
64 | } // namespace vllm
65 |
--------------------------------------------------------------------------------
/csrc/attention/attention_utils.cuh:
--------------------------------------------------------------------------------
1 | /*
2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
3 | * Copyright (c) 2023, The vLLM team.
4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
5 | *
6 | * Licensed under the Apache License, Version 2.0 (the "License");
7 | * you may not use this file except in compliance with the License.
8 | * You may obtain a copy of the License at
9 | *
10 | * http://www.apache.org/licenses/LICENSE-2.0
11 | *
12 | * Unless required by applicable law or agreed to in writing, software
13 | * distributed under the License is distributed on an "AS IS" BASIS,
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 | * See the License for the specific language governing permissions and
16 | * limitations under the License.
17 | */
18 | #pragma once
19 |
20 | #include "attention_dtypes.h"
21 |
22 | #include <float.h>
23 | #include <type_traits>
24 |
25 | namespace vllm {
26 |
27 | // Q*K^T operation.
28 | template<int THREAD_GROUP_SIZE, typename Vec, int N>
29 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
30 | using A_vec = typename FloatVec<Vec>::Type;
31 | // Compute the parallel products for Q*K^T (treat vector lanes separately).
32 | A_vec qk_vec = mul<A_vec, Vec, Vec>(q[0], k[0]);
33 | #pragma unroll
34 | for (int ii = 1; ii < N; ++ii) {
35 | qk_vec = fma(q[ii], k[ii], qk_vec);
36 | }
37 |
38 | // Finalize the reduction across lanes.
39 | float qk = sum(qk_vec);
40 | #pragma unroll
41 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) {
42 | qk += __shfl_xor_sync(uint32_t(-1), qk, mask);
43 | }
44 | return qk;
45 | }
46 |
47 | template<typename T, int THREAD_GROUP_SIZE>
48 | struct Qk_dot {
49 | template<typename Vec, int N>
50 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
51 | return qk_dot_<THREAD_GROUP_SIZE>(q, k);
52 | }
53 | };
54 |
55 | } // namespace vllm
56 |
--------------------------------------------------------------------------------
/csrc/cache.h:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 |
3 | #include <map>