├── .github └── workflows │ ├── publish.yml │ ├── ruff.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ └── yapf.yml ├── .gitignore ├── .readthedocs.yaml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── README.md ├── benchmark_latency.py ├── benchmark_serving.py ├── benchmark_throughput.py ├── kernels │ └── benchmark_paged_attention.py └── launch_tgi_server.sh ├── csrc ├── activation_kernels.cu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_kernels.cu │ ├── attention_utils.cuh │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ └── dtype_float32.cuh ├── cache.h ├── cache_kernels.cu ├── cuda_utils.h ├── cuda_utils_kernels.cu ├── dispatch_utils.h ├── layernorm_kernels.cu ├── ops.h ├── pos_encoding_kernels.cu ├── pybind.cpp ├── quantization │ ├── awq │ │ ├── dequantize.cuh │ │ └── gemm_kernels.cu │ └── squeezellm │ │ └── quant_cuda_kernel.cu └── reduction_utils.cuh ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements-docs.txt └── source │ ├── assets │ └── logos │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── vllm-logo-text-light.png │ ├── conf.py │ ├── getting_started │ ├── installation.rst │ └── quickstart.rst │ ├── index.rst │ ├── models │ ├── adding_model.rst │ ├── engine_args.rst │ └── supported_models.rst │ ├── quantization │ └── auto_awq.rst │ └── serving │ ├── deploying_with_docker.rst │ ├── deploying_with_triton.rst │ ├── distributed_serving.rst │ └── run_on_sky.rst ├── examples ├── api_client.py ├── gradio_webserver.py ├── llm_engine_example.py ├── offline_inference.py ├── openai_chatcompletion_client.py └── openai_completion_client.py ├── format.sh ├── mypy.ini ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── api_server_async_engine.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ └── test_request_tracker.py ├── conftest.py ├── distributed │ └── test_comm_ops.py ├── engine │ └── test_detokenize.py ├── kernels │ ├── conftest.py │ ├── test_activation.py │ ├── test_attention.py │ ├── test_cache.py │ ├── test_layernorm.py │ └── test_pos_encoding.py ├── models │ └── test_models.py ├── samplers │ ├── test_beam_search.py │ ├── test_logprobs.py │ └── test_sampler.py ├── test_regression.py └── worker │ └── test_worker.py └── vllm ├── __init__.py ├── block.py ├── config.py ├── core ├── __init__.py ├── block_manager.py ├── policy.py └── scheduler.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── llm_engine.py └── ray_utils.py ├── entrypoints ├── __init__.py ├── api_server.py ├── llm.py └── openai │ ├── __init__.py │ ├── api_server.py │ └── protocol.py ├── logger.py ├── model_executor ├── __init__.py ├── input_metadata.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── attention.py │ ├── layernorm.py │ ├── linear.py │ ├── quantization │ │ ├── __init__.py │ │ ├── awq.py │ │ ├── base_config.py │ │ └── squeezellm.py │ ├── rotary_embedding.py │ ├── sampler.py │ └── vocab_parallel_embedding.py ├── model_loader.py ├── models │ ├── __init__.py │ ├── aquila.py │ ├── baichuan.py │ ├── bloom.py │ ├── chatglm.py │ ├── falcon.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── internlm.py │ ├── llama.py │ ├── mistral.py │ ├── mpt.py │ ├── opt.py │ ├── phi_1_5.py │ ├── qwen.py │ └── yi.py ├── parallel_utils │ ├── README.md │ ├── __init__.py │ ├── communication_op.py │ ├── 
parallel_state.py │ └── utils.py ├── utils.py └── weight_utils.py ├── outputs.py ├── py.typed ├── sampling_params.py ├── sequence.py ├── transformers_utils ├── __init__.py ├── config.py ├── configs │ ├── __init__.py │ ├── aquila.py │ ├── baichuan.py │ ├── chatglm.py │ ├── falcon.py │ ├── mpt.py │ ├── qwen.py │ └── yi.py └── tokenizer.py ├── utils.py └── worker ├── __init__.py ├── cache_engine.py └── worker.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package to Release asset 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Create Release 5 | 6 | on: 7 | push: 8 | tags: 9 | - v* 10 | 11 | # Needed to create release and upload assets 12 | permissions: 13 | contents: write 14 | 15 | jobs: 16 | release: 17 | # Retrieve tag and create release 18 | name: Create Release 19 | runs-on: ubuntu-latest 20 | outputs: 21 | upload_url: ${{ steps.create_release.outputs.upload_url }} 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | 26 | - name: Extract branch info 27 | shell: bash 28 | run: | 29 | echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV 30 | 31 | - name: Create Release 32 | id: create_release 33 | uses: "actions/github-script@v6" 34 | env: 35 | RELEASE_TAG: ${{ env.release_tag }} 36 | with: 37 | github-token: "${{ secrets.GITHUB_TOKEN }}" 38 | script: | 39 | const script = require('.github/workflows/scripts/create_release.js') 40 | await script(github, context, core) 41 | 42 | wheel: 43 | name: Build Wheel 44 | runs-on: ${{ matrix.os }} 45 | needs: release 46 | 47 | strategy: 48 | fail-fast: false 49 | matrix: 50 | os: ['ubuntu-20.04'] 51 | python-version: ['3.8', '3.9', '3.10', '3.11'] 52 | pytorch-version: ['2.1.0'] 53 | cuda-version: ['11.8', '12.1'] 54 | 55 | steps: 56 | - name: Checkout 57 | uses: actions/checkout@v3 58 | 59 | - name: Set up Linux Env 60 | if: ${{ runner.os == 'Linux' }} 61 | run: | 62 | bash -x .github/workflows/scripts/env.sh 63 | 64 | - name: Set up Python 65 | uses: actions/setup-python@v4 66 | with: 67 | python-version: ${{ matrix.python-version }} 68 | 69 | - name: Install CUDA ${{ matrix.cuda-version }} 70 | run: | 71 | bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }} 72 | 73 | - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }} 74 | run: | 75 | bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }} 76 | 77 | - name: Build wheel 78 | shell: bash 79 | run: | 80 | bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} 81 | wheel_name=$(ls dist/*whl | xargs -n 1 basename) 82 | asset_name=${wheel_name//"linux"/"manylinux1"} 83 | echo "wheel_name=${wheel_name}" >> $GITHUB_ENV 84 | echo "asset_name=${asset_name}" >> $GITHUB_ENV 85 | 86 | - name: Upload Release Asset 87 | uses: actions/upload-release-asset@v1 88 | env: 89 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 90 | with: 91 | upload_url: ${{ needs.release.outputs.upload_url }} 92 | asset_path: ./dist/${{ env.wheel_name }} 93 | asset_name: ${{ env.asset_name }} 94 | asset_content_type: application/* 95 | 96 | # (Danielkinz): This last step will publish the .whl to pypi. 
Warning: untested 97 | # - name: Publish package 98 | # uses: pypa/gh-action-pypi-publish@release/v1.8 99 | # with: 100 | # repository-url: https://test.pypi.org/legacy/ 101 | # password: ${{ secrets.PYPI_API_TOKEN }} 102 | # skip-existing: true 103 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: ruff 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.10"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install ruff==0.1.5 29 | - name: Analysing the code with ruff 30 | run: | 31 | ruff vllm tests 32 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_home=/usr/local/cuda-$2 5 | 6 | # Update paths 7 | PATH=${cuda_home}/bin:$PATH 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 9 | 10 | # Install requirements 11 | $python_executable -m pip install wheel packaging 12 | $python_executable -m pip install -r requirements.txt 13 | 14 | # Limit the number of parallel jobs to avoid OOM 15 | export MAX_JOBS=1 16 | 17 | # Build 18 | $python_executable setup.py bdist_wheel --dist-dir=dist 19 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: false, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo $1 | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo $2 | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file installs common linux environment tools 4 | 5 | export LANG C.UTF-8 6 | 7 | # python_version=$1 8 | 9 | sudo apt-get update && \ 10 | sudo apt-get install -y --no-install-recommends \ 11 | software-properties-common \ 12 | 13 | sudo apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | apt-utils \ 16 | ca-certificates \ 17 | wget \ 18 | git \ 19 | vim \ 20 | libssl-dev \ 21 | curl \ 22 | unzip \ 23 | unrar \ 24 | cmake \ 25 | net-tools \ 26 | sudo \ 27 | autotools-dev \ 28 | rsync \ 29 | jq \ 30 | openssh-server \ 31 | tmux \ 32 | screen \ 33 | htop \ 34 | pdsh \ 35 | openssh-client \ 36 | lshw \ 37 | dmidecode \ 38 | util-linux \ 39 | automake \ 40 | autoconf \ 41 | libtool \ 42 | net-tools \ 43 | pciutils \ 44 | libpci-dev \ 45 | libaio-dev \ 46 | libcap2 \ 47 | libtinfo5 \ 48 | fakeroot \ 49 | devscripts \ 50 | debhelper \ 51 | nfs-common 52 | 53 | # Remove github bloat files to free up disk space 54 | sudo rm -rf "/usr/local/share/boost" 55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 56 | sudo rm -rf "/usr/share/dotnet" 57 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | jobs: 13 | yapf: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10"] 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 
${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install yapf==0.32.0 28 | pip install toml==0.10.2 29 | - name: Running yapf 30 | run: | 31 | yapf --diff --recursive vllm tests 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | .idea/ 161 | 162 | # VSCode 163 | .vscode/ 164 | 165 | # DS Store 166 | .DS_Store 167 | 168 | # Results 169 | *.csv 170 | 171 | # Python pickle files 172 | *.pkl 173 | 174 | # Sphinx documentation 175 | _build/ 176 | 177 | # vim swap files 178 | *.swo 179 | *.swp 180 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: 16 | - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | Thank you for your interest in contributing to vLLM! 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 5 | There are several ways you can contribute to the project: 6 | 7 | - Identify and report any issues or bugs. 8 | - Request or add a new model. 9 | - Suggest or implement new features. 10 | 11 | However, remember that contributions aren't just about code. 12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. 13 | 14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM. 15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects. 16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. 17 | 18 | 19 | ## Setup for development 20 | 21 | ### Build from source 22 | 23 | ```bash 24 | pip install -r requirements.txt 25 | pip install -e . # This may take several minutes. 
26 | ``` 27 | 28 | ### Testing 29 | 30 | ```bash 31 | pip install -r requirements-dev.txt 32 | 33 | # Static type checking 34 | mypy 35 | # Unit tests 36 | pytest tests/ 37 | ``` 38 | **Note:** Currently, the repository does not pass the mypy tests. 39 | 40 | 41 | ## Contributing Guidelines 42 | 43 | ### Issue Reporting 44 | 45 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. 46 | If not, please file a new issue, providing as much relevant information as possible. 47 | 48 | ### Coding Style Guide 49 | 50 | In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). 51 | 52 | We include a formatting script [`format.sh`](./format.sh) to format the code. 53 | 54 | ### Pull Requests 55 | 56 | When submitting a pull request: 57 | 58 | 1. Make sure your code has been rebased on top of the latest commit on the main branch. 59 | 2. Ensure code is properly formatted by running [`format.sh`](./format.sh). 60 | 3. Include a detailed description of the changes in the pull request. 61 | Explain why you made the changes you did. 62 | If your pull request fixes an open issue, please include a reference to it in the description. 63 | 64 | ### Code Reviews 65 | 66 | All submissions, including submissions by project members, require a code review. 67 | To make the review process as smooth as possible, please: 68 | 69 | 1. Keep your changes as concise as possible. 70 | If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests. 71 | 2. Respond to all comments within a reasonable time frame. 72 | If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. 73 | 74 | ### Thank You 75 | 76 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. 77 | Your contributions make vLLM a great tool for everyone! 
78 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev 2 | 3 | RUN apt-get update -y \ 4 | && apt-get install -y python3-pip 5 | 6 | WORKDIR /workspace 7 | 8 | # install build and runtime dependencies 9 | COPY requirements.txt requirements.txt 10 | RUN --mount=type=cache,target=/root/.cache/pip \ 11 | pip install -r requirements.txt 12 | 13 | # install development dependencies 14 | COPY requirements-dev.txt requirements-dev.txt 15 | RUN --mount=type=cache,target=/root/.cache/pip \ 16 | pip install -r requirements-dev.txt 17 | 18 | # image to build pytorch extensions 19 | FROM dev AS build 20 | 21 | # copy input files 22 | COPY csrc csrc 23 | COPY setup.py setup.py 24 | COPY requirements.txt requirements.txt 25 | COPY pyproject.toml pyproject.toml 26 | COPY vllm/__init__.py vllm/__init__.py 27 | 28 | # max jobs used by Ninja to build extensions 29 | ENV MAX_JOBS=$max_jobs 30 | RUN python3 setup.py build_ext --inplace 31 | 32 | # image to run unit testing suite 33 | FROM dev AS test 34 | 35 | # copy pytorch extensions separately to avoid having to rebuild 36 | # when python code changes 37 | COPY --from=build /workspace/vllm/*.so /workspace/vllm/ 38 | COPY tests tests 39 | COPY vllm vllm 40 | 41 | ENTRYPOINT ["python3", "-m", "pytest", "tests"] 42 | 43 | # use CUDA base as CUDA runtime dependencies are already installed via pip 44 | FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base 45 | 46 | # libnccl required for ray 47 | RUN apt-get update -y \ 48 | && apt-get install -y python3-pip 49 | 50 | WORKDIR /workspace 51 | COPY requirements.txt requirements.txt 52 | RUN --mount=type=cache,target=/root/.cache/pip \ 53 | pip install -r requirements.txt 54 | 55 | FROM vllm-base AS vllm 56 | COPY --from=build /workspace/vllm/*.so /workspace/vllm/ 57 | COPY vllm vllm 58 | 59 | EXPOSE 8000 60 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"] 61 | 62 | # openai api server alternative 63 | FROM vllm-base AS vllm-openai 64 | # install additional dependencies for openai api server 65 | RUN --mount=type=cache,target=/root/.cache/pip \ 66 | pip install accelerate fschat 67 | 68 | COPY --from=build /workspace/vllm/*.so /workspace/vllm/ 69 | COPY vllm vllm 70 | 71 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] 72 | 73 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements.txt 3 | 4 | recursive-include csrc * 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | vLLM 5 | 6 | 7 | 8 |
9 | Easy, fast, and cheap LLM serving for everyone 10 | 11 | 12 |
13 | | Documentation | Blog | Paper | Discord | 14 | 15 |

16 | 17 | --- 18 | 19 | *Latest News* 🔥 20 | - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). 21 | - [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there. 22 | - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv! 23 | - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. 24 | - [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command! 25 | - [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds. 26 | - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). 27 | 28 | --- 29 | 30 | vLLM is a fast and easy-to-use library for LLM inference and serving. 31 | 32 | vLLM is fast with: 33 | 34 | - State-of-the-art serving throughput 35 | - Efficient management of attention key and value memory with **PagedAttention** 36 | - Continuous batching of incoming requests 37 | - Optimized CUDA kernels 38 | 39 | vLLM is flexible and easy to use with: 40 | 41 | - Seamless integration with popular Hugging Face models 42 | - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more 43 | - Tensor parallelism support for distributed inference 44 | - Streaming outputs 45 | - OpenAI-compatible API server 46 | 47 | vLLM seamlessly supports many Hugging Face models, including the following architectures: 48 | 49 | - Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.) 50 | - Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.) 51 | - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.) 52 | - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.) 53 | - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.) 54 | - GPT-2 (`gpt2`, `gpt2-xl`, etc.) 55 | - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.) 56 | - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.) 57 | - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.) 58 | - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.) 59 | - LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.) 60 | - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.) 61 | - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.) 62 | - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.) 63 | - Phi-1.5 (`microsoft/phi-1_5`, etc.) 64 | - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) 65 | - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.) 
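
Once installed (installation instructions follow below), the models above are all driven through the same Python API. Here is a minimal offline-inference sketch in the spirit of `examples/offline_inference.py`; the model name, prompt, and sampling values are placeholders:

```python
from vllm import LLM, SamplingParams

# Any architecture listed above can be passed by its Hugging Face model name.
llm = LLM(model="facebook/opt-125m")  # placeholder model
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=128)

# generate() batches the prompts and returns one RequestOutput per prompt.
outputs = llm.generate(["The capital of France is"], sampling_params)
for output in outputs:
    print(output.prompt, output.outputs[0].text)
```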
66 | 67 | Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): 68 | 69 | ```bash 70 | pip install vllm 71 | ``` 72 | 73 | ## Getting Started 74 | 75 | Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started. 76 | - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) 77 | - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) 78 | - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) 79 | 80 | ## Contributing 81 | 82 | We welcome and value any contributions and collaborations. 83 | Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. 84 | 85 | ## Citation 86 | 87 | If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): 88 | ```bibtex 89 | @inproceedings{kwon2023efficient, 90 | title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, 91 | author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica}, 92 | booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, 93 | year={2023} 94 | } 95 | ``` 96 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /benchmarks/benchmark_latency.py: -------------------------------------------------------------------------------- 1 | """Benchmark the latency of processing a single batch of requests.""" 2 | import argparse 3 | import time 4 | 5 | import numpy as np 6 | import torch 7 | from tqdm import tqdm 8 | 9 | from vllm import LLM, SamplingParams 10 | 11 | 12 | def main(args: argparse.Namespace): 13 | print(args) 14 | 15 | # Process all the requests in a single batch if possible. 16 | # NOTE(woosuk): If the request cannot be processed in a single batch, 17 | # the engine will automatically process the request in multiple batches. 
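    # The engine arguments below cap concurrency at the benchmark batch size and
    # budget exactly batch_size * input_len prompt tokens, so the whole batch can
    # be admitted in a single scheduling step when memory allows.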
18 | llm = LLM( 19 | model=args.model, 20 | tokenizer=args.tokenizer, 21 | quantization=args.quantization, 22 | tensor_parallel_size=args.tensor_parallel_size, 23 | max_num_seqs=args.batch_size, 24 | max_num_batched_tokens=args.batch_size * args.input_len, 25 | trust_remote_code=args.trust_remote_code, 26 | dtype=args.dtype, 27 | ) 28 | 29 | sampling_params = SamplingParams( 30 | n=args.n, 31 | temperature=0.0 if args.use_beam_search else 1.0, 32 | top_p=1.0, 33 | use_beam_search=args.use_beam_search, 34 | ignore_eos=True, 35 | max_tokens=args.output_len, 36 | ) 37 | print(sampling_params) 38 | dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size 39 | 40 | def run_to_completion(profile: bool = False): 41 | if profile: 42 | torch.cuda.cudart().cudaProfilerStart() 43 | start_time = time.perf_counter() 44 | 45 | llm.generate(prompt_token_ids=dummy_prompt_token_ids, 46 | sampling_params=sampling_params, 47 | use_tqdm=False) 48 | 49 | end_time = time.perf_counter() 50 | latency = end_time - start_time 51 | if profile: 52 | torch.cuda.cudart().cudaProfilerStop() 53 | return latency 54 | 55 | print("Warming up...") 56 | run_to_completion(profile=False) 57 | 58 | # Benchmark. 59 | latencies = [] 60 | for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): 61 | latencies.append(run_to_completion(profile=False)) 62 | print(f'Avg latency: {np.mean(latencies)} seconds') 63 | 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser( 67 | description='Benchmark the latency of processing a single batch of ' 68 | 'requests till completion.') 69 | parser.add_argument('--model', type=str, default='facebook/opt-125m') 70 | parser.add_argument('--tokenizer', type=str, default=None) 71 | parser.add_argument('--quantization', 72 | '-q', 73 | choices=['awq', 'squeezellm', None], 74 | default=None) 75 | parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) 76 | parser.add_argument('--input-len', type=int, default=32) 77 | parser.add_argument('--output-len', type=int, default=128) 78 | parser.add_argument('--batch-size', type=int, default=8) 79 | parser.add_argument('--n', 80 | type=int, 81 | default=1, 82 | help='Number of generated sequences per prompt.') 83 | parser.add_argument('--use-beam-search', action='store_true') 84 | parser.add_argument('--num-iters', 85 | type=int, 86 | default=3, 87 | help='Number of iterations to run.') 88 | parser.add_argument('--trust-remote-code', 89 | action='store_true', 90 | help='trust remote code from huggingface') 91 | parser.add_argument( 92 | '--dtype', 93 | type=str, 94 | default='auto', 95 | choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], 96 | help='data type for model weights and activations. 
' 97 | 'The "auto" option will use FP16 precision ' 98 | 'for FP32 and FP16 models, and BF16 precision ' 99 | 'for BF16 models.') 100 | args = parser.parse_args() 101 | main(args) 102 | -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:0.8 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /csrc/activation_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template 9 | __device__ __forceinline__ T silu(const T& x) { 10 | // x * sigmoid(x) 11 | return (T) (((float) x) / (1.0f + expf((float) -x))); 12 | } 13 | 14 | template 15 | __global__ void silu_and_mul_kernel( 16 | scalar_t* __restrict__ out, // [..., d] 17 | const scalar_t* __restrict__ input, // [..., 2, d] 18 | const int d) { 19 | const int64_t token_idx = blockIdx.x; 20 | for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { 21 | const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]); 22 | const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]); 23 | out[token_idx * d + idx] = silu(x) * y; 24 | } 25 | } 26 | 27 | } // namespace vllm 28 | 29 | void silu_and_mul( 30 | torch::Tensor& out, // [..., d] 31 | torch::Tensor& input) // [..., 2 * d] 32 | { 33 | int64_t num_tokens = input.numel() / input.size(-1); 34 | int d = input.size(-1) / 2; 35 | 36 | dim3 grid(num_tokens); 37 | dim3 block(std::min(d, 1024)); 38 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 39 | VLLM_DISPATCH_FLOATING_TYPES( 40 | input.scalar_type(), 41 | "silu_and_mul_kernel", 42 | [&] { 43 | vllm::silu_and_mul_kernel<<>>( 44 | out.data_ptr(), 45 | input.data_ptr(), 46 | d); 47 | }); 48 | } 49 | 50 | namespace vllm { 51 | 52 | // Element-wise activation kernel template. 53 | template 54 | __global__ void activation_kernel( 55 | scalar_t* __restrict__ out, // [..., d] 56 | const scalar_t* __restrict__ input, // [..., d] 57 | const int d) { 58 | const int64_t token_idx = blockIdx.x; 59 | for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { 60 | const scalar_t x = __ldg(&input[token_idx * d + idx]); 61 | out[token_idx * d + idx] = ACT_FN(x); 62 | } 63 | } 64 | 65 | } // namespace vllm 66 | 67 | // Launch element-wise activation kernel. 
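// The macro below dispatches on the input's floating-point type (float, half,
// bfloat16 via VLLM_DISPATCH_FLOATING_TYPES) and launches KERNEL<scalar_t> with
// one thread block per token; up to 1024 threads stride over the hidden size d.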
68 | #define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ 69 | int d = input.size(-1); \ 70 | int64_t num_tokens = input.numel() / d; \ 71 | dim3 grid(num_tokens); \ 72 | dim3 block(std::min(d, 1024)); \ 73 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ 74 | VLLM_DISPATCH_FLOATING_TYPES( \ 75 | input.scalar_type(), \ 76 | "activation_kernel", \ 77 | [&] { \ 78 | vllm::activation_kernel><<>>( \ 79 | out.data_ptr(), \ 80 | input.data_ptr(), \ 81 | d); \ 82 | }); 83 | 84 | namespace vllm { 85 | 86 | template 87 | __device__ __forceinline__ T gelu_new_kernel(const T& x) { 88 | const float x3 = (float) (x * x * x); 89 | const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3)))); 90 | return ((T) 0.5) * x * (((T) 1.0) + t); 91 | } 92 | 93 | template 94 | __device__ __forceinline__ T gelu_fast_kernel(const T& x) { 95 | const float f = (float) x; 96 | const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x)); 97 | return ((T) 0.5) * x * (((T) 1.0) + t); 98 | } 99 | 100 | } // namespace vllm 101 | 102 | void gelu_new( 103 | torch::Tensor& out, // [..., d] 104 | torch::Tensor& input) // [..., d] 105 | { 106 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); 107 | } 108 | 109 | void gelu_fast( 110 | torch::Tensor& out, // [..., d] 111 | torch::Tensor& input) // [..., d] 112 | { 113 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); 114 | } 115 | -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | -------------------------------------------------------------------------------- /csrc/attention/attention_generic.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include 21 | 22 | namespace vllm { 23 | 24 | // A vector type to store Q, K, V elements. 25 | template 26 | struct Vec {}; 27 | 28 | // A vector type to store FP32 accumulators. 29 | template 30 | struct FloatVec {}; 31 | 32 | // Template vector operations. 
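// mul and sum are only declared generically here; their specializations for each
// data type (along with the vector types and fma) live in dtype_float32.cuh,
// dtype_float16.cuh, and dtype_bfloat16.cuh, included through attention_dtypes.h.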
33 | template 34 | inline __device__ Acc mul(A a, B b); 35 | 36 | template 37 | inline __device__ float sum(T v); 38 | 39 | template 40 | inline __device__ float dot(T a, T b) { 41 | return sum(mul(a, b)); 42 | } 43 | 44 | template 45 | inline __device__ float dot(T a, T b) { 46 | return sum(mul(a, b)); 47 | } 48 | 49 | template 50 | inline __device__ void zero(T& dst) { 51 | constexpr int WORDS = sizeof(T) / 4; 52 | union { 53 | T raw; 54 | uint32_t words[WORDS]; 55 | } tmp; 56 | 57 | #pragma unroll 58 | for (int ii = 0; ii < WORDS; ++ii) { 59 | tmp.words[ii] = 0u; 60 | } 61 | dst = tmp.raw; 62 | } 63 | 64 | } // namespace vllm 65 | -------------------------------------------------------------------------------- /csrc/attention/attention_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include "attention_dtypes.h" 21 | 22 | #include 23 | #include 24 | 25 | namespace vllm { 26 | 27 | // Q*K^T operation. 28 | template 29 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { 30 | using A_vec = typename FloatVec::Type; 31 | // Compute the parallel products for Q*K^T (treat vector lanes separately). 32 | A_vec qk_vec = mul(q[0], k[0]); 33 | #pragma unroll 34 | for (int ii = 1; ii < N; ++ii) { 35 | qk_vec = fma(q[ii], k[ii], qk_vec); 36 | } 37 | 38 | // Finalize the reduction across lanes. 
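// sum() first collapses the per-thread vector accumulator into a scalar; the
// __shfl_xor_sync butterfly below then adds up the partial results of the
// THREAD_GROUP_SIZE lanes cooperating on this query/key pair, so every lane in
// the group ends up holding the complete dot product.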
39 | float qk = sum(qk_vec); 40 | #pragma unroll 41 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { 42 | qk += __shfl_xor_sync(uint32_t(-1), qk, mask); 43 | } 44 | return qk; 45 | } 46 | 47 | template 48 | struct Qk_dot { 49 | template 50 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { 51 | return qk_dot_(q, k); 52 | } 53 | }; 54 | 55 | } // namespace vllm 56 | -------------------------------------------------------------------------------- /csrc/cache.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | void swap_blocks( 7 | torch::Tensor& src, 8 | torch::Tensor& dst, 9 | const std::map& block_mapping); 10 | 11 | void copy_blocks( 12 | std::vector& key_caches, 13 | std::vector& value_caches, 14 | const std::map>& block_mapping); 15 | 16 | void reshape_and_cache( 17 | torch::Tensor& key, 18 | torch::Tensor& value, 19 | torch::Tensor& key_cache, 20 | torch::Tensor& value_cache, 21 | torch::Tensor& slot_mapping); 22 | 23 | void gather_cached_kv( 24 | torch::Tensor& key, 25 | torch::Tensor& value, 26 | torch::Tensor& key_cache, 27 | torch::Tensor& value_cache, 28 | torch::Tensor& slot_mapping); 29 | -------------------------------------------------------------------------------- /csrc/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int get_device_attribute( 4 | int attribute, 5 | int device_id); 6 | -------------------------------------------------------------------------------- /csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- 1 | int get_device_attribute( 2 | int attribute, 3 | int device_id) 4 | { 5 | int device, value; 6 | if (device_id < 0) { 7 | cudaGetDevice(&device); 8 | } 9 | else { 10 | device = device_id; 11 | } 12 | cudaDeviceGetAttribute(&value, static_cast(attribute), device); 13 | return value; 14 | } 15 | -------------------------------------------------------------------------------- /csrc/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #include 6 | 7 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 8 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 9 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 10 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 11 | 12 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ 13 | AT_DISPATCH_SWITCH( \ 14 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 15 | -------------------------------------------------------------------------------- /csrc/layernorm_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | #include "reduction_utils.cuh" 6 | 7 | namespace vllm { 8 | 9 | // TODO(woosuk): Further optimize this kernel. 
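// One thread block handles one token: the block accumulates the sum of squares
// over hidden_size, thread 0 stores rsqrt(mean + epsilon) in shared memory, and
// all threads then rescale the row and multiply by the learned weight.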
10 | template 11 | __global__ void rms_norm_kernel( 12 | scalar_t* __restrict__ out, // [..., hidden_size] 13 | const scalar_t* __restrict__ input, // [..., hidden_size] 14 | const scalar_t* __restrict__ weight, // [hidden_size] 15 | const float epsilon, 16 | const int num_tokens, 17 | const int hidden_size) { 18 | __shared__ float s_variance; 19 | float variance = 0.0f; 20 | 21 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 22 | const float x = (float) input[blockIdx.x * hidden_size + idx]; 23 | variance += x * x; 24 | } 25 | variance = blockReduceSum(variance); 26 | if (threadIdx.x == 0) { 27 | s_variance = rsqrtf(variance / hidden_size + epsilon); 28 | } 29 | __syncthreads(); 30 | 31 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 32 | float x = (float) input[blockIdx.x * hidden_size + idx]; 33 | out[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; 34 | } 35 | } 36 | 37 | // TODO: Further optimize this kernel. 38 | template 39 | __global__ void fused_add_rms_norm_kernel( 40 | scalar_t* __restrict__ input, // [..., hidden_size] 41 | scalar_t* __restrict__ residual, // [..., hidden_size] 42 | const scalar_t* __restrict__ weight, // [hidden_size] 43 | const float epsilon, 44 | const int num_tokens, 45 | const int hidden_size) { 46 | __shared__ float s_variance; 47 | float variance = 0.0f; 48 | 49 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 50 | float x = (float) input[blockIdx.x * hidden_size + idx]; 51 | x += (float) residual[blockIdx.x * hidden_size + idx]; 52 | variance += x * x; 53 | residual[blockIdx.x * hidden_size + idx] = (scalar_t) x; 54 | } 55 | variance = blockReduceSum(variance); 56 | if (threadIdx.x == 0) { 57 | s_variance = rsqrtf(variance / hidden_size + epsilon); 58 | } 59 | __syncthreads(); 60 | 61 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 62 | float x = (float) residual[blockIdx.x * hidden_size + idx]; 63 | input[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; 64 | } 65 | } 66 | 67 | } // namespace vllm 68 | 69 | void rms_norm( 70 | torch::Tensor& out, // [..., hidden_size] 71 | torch::Tensor& input, // [..., hidden_size] 72 | torch::Tensor& weight, // [hidden_size] 73 | float epsilon) { 74 | int hidden_size = input.size(-1); 75 | int num_tokens = input.numel() / hidden_size; 76 | 77 | dim3 grid(num_tokens); 78 | dim3 block(std::min(hidden_size, 1024)); 79 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 80 | VLLM_DISPATCH_FLOATING_TYPES( 81 | input.scalar_type(), 82 | "rms_norm_kernel", 83 | [&] { 84 | vllm::rms_norm_kernel<<>>( 85 | out.data_ptr(), 86 | input.data_ptr(), 87 | weight.data_ptr(), 88 | epsilon, 89 | num_tokens, 90 | hidden_size); 91 | }); 92 | } 93 | 94 | void fused_add_rms_norm( 95 | torch::Tensor& input, // [..., hidden_size] 96 | torch::Tensor& residual, // [..., hidden_size] 97 | torch::Tensor& weight, // [hidden_size] 98 | float epsilon) { 99 | int hidden_size = input.size(-1); 100 | int num_tokens = input.numel() / hidden_size; 101 | 102 | dim3 grid(num_tokens); 103 | dim3 block(std::min(hidden_size, 1024)); 104 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 105 | VLLM_DISPATCH_FLOATING_TYPES( 106 | input.scalar_type(), 107 | "fused_add_rms_norm_kernel", 108 | [&] { 109 | vllm::fused_add_rms_norm_kernel<<>>( 110 | input.data_ptr(), 111 | residual.data_ptr(), 112 | weight.data_ptr(), 113 | epsilon, 114 | num_tokens, 115 | hidden_size); 116 | }); 117 | } 118 | 
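
For reference, a plain-PyTorch sketch of what the two kernels above compute (an illustrative reference written for this note, not code from the repo; the function names are made up):

```python
import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # out = x * rsqrt(mean(x^2 over the hidden dim) + eps) * weight
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight

def fused_add_rms_norm_ref(x: torch.Tensor, residual: torch.Tensor,
                           weight: torch.Tensor, eps: float):
    # The fused kernel adds the residual first, writes the sum back as the new
    # residual, and returns the RMS-normalized sum in place of the input.
    residual = residual + x
    return rms_norm_ref(residual, weight, eps), residual
```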
-------------------------------------------------------------------------------- /csrc/ops.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void paged_attention_v1( 4 | torch::Tensor& out, 5 | torch::Tensor& query, 6 | torch::Tensor& key_cache, 7 | torch::Tensor& value_cache, 8 | torch::Tensor& head_mapping, 9 | float scale, 10 | torch::Tensor& block_tables, 11 | torch::Tensor& context_lens, 12 | int block_size, 13 | int max_context_len, 14 | const c10::optional& alibi_slopes); 15 | 16 | void paged_attention_v2( 17 | torch::Tensor& out, 18 | torch::Tensor& exp_sums, 19 | torch::Tensor& max_logits, 20 | torch::Tensor& tmp_out, 21 | torch::Tensor& query, 22 | torch::Tensor& key_cache, 23 | torch::Tensor& value_cache, 24 | torch::Tensor& head_mapping, 25 | float scale, 26 | torch::Tensor& block_tables, 27 | torch::Tensor& context_lens, 28 | int block_size, 29 | int max_context_len, 30 | const c10::optional& alibi_slopes); 31 | 32 | void rms_norm( 33 | torch::Tensor& out, 34 | torch::Tensor& input, 35 | torch::Tensor& weight, 36 | float epsilon); 37 | 38 | void fused_add_rms_norm( 39 | torch::Tensor& input, 40 | torch::Tensor& residual, 41 | torch::Tensor& weight, 42 | float epsilon); 43 | 44 | void rotary_embedding( 45 | torch::Tensor& positions, 46 | torch::Tensor& query, 47 | torch::Tensor& key, 48 | int head_size, 49 | torch::Tensor& cos_sin_cache, 50 | bool is_neox); 51 | 52 | void silu_and_mul( 53 | torch::Tensor& out, 54 | torch::Tensor& input); 55 | 56 | void gelu_new( 57 | torch::Tensor& out, 58 | torch::Tensor& input); 59 | 60 | void gelu_fast( 61 | torch::Tensor& out, 62 | torch::Tensor& input); 63 | 64 | torch::Tensor awq_gemm( 65 | torch::Tensor _in_feats, 66 | torch::Tensor _kernel, 67 | torch::Tensor _scaling_factors, 68 | torch::Tensor _zeros, 69 | int split_k_iters); 70 | 71 | void squeezellm_gemm( 72 | torch::Tensor vec, 73 | torch::Tensor mat, 74 | torch::Tensor mul, 75 | torch::Tensor lookup_table); 76 | -------------------------------------------------------------------------------- /csrc/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template 9 | inline __device__ void apply_rotary_embedding( 10 | scalar_t* __restrict__ arr, 11 | const scalar_t* __restrict__ cos_ptr, 12 | const scalar_t* __restrict__ sin_ptr, 13 | int rot_offset, 14 | int embed_dim) 15 | { 16 | int x_index, y_index; 17 | scalar_t cos, sin; 18 | if (IS_NEOX) { 19 | // GPT-NeoX style rotary embedding. 20 | x_index = rot_offset; 21 | y_index = embed_dim + rot_offset; 22 | cos = __ldg(cos_ptr + x_index); 23 | sin = __ldg(sin_ptr + x_index); 24 | } else { 25 | // GPT-J style rotary embedding. 
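    // Interleaved layout: elements (2*i, 2*i+1) form the rotated pair and share
    // the i-th cos/sin entry, whereas the NeoX branch above pairs element i with
    // element i + embed_dim.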
26 | x_index = 2 * rot_offset; 27 | y_index = 2 * rot_offset + 1; 28 | cos = __ldg(cos_ptr + x_index / 2); 29 | sin = __ldg(sin_ptr + x_index / 2); 30 | } 31 | 32 | const scalar_t x = arr[x_index]; 33 | const scalar_t y = arr[y_index]; 34 | arr[x_index] = x * cos - y * sin; 35 | arr[y_index] = y * cos + x * sin; 36 | } 37 | 38 | template 39 | __global__ void rotary_embedding_kernel( 40 | const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] 41 | scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] 42 | scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] 43 | const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] 44 | const int rot_dim, 45 | const int query_stride, 46 | const int key_stride, 47 | const int num_heads, 48 | const int num_kv_heads, 49 | const int head_size) { 50 | // Each thread block is responsible for one token. 51 | const int token_idx = blockIdx.x; 52 | int64_t pos = positions[token_idx]; 53 | const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; 54 | 55 | const int embed_dim = rot_dim / 2; 56 | const scalar_t* cos_ptr = cache_ptr; 57 | const scalar_t* sin_ptr = cache_ptr + embed_dim; 58 | 59 | const int nq = num_heads * embed_dim; 60 | for (int i = threadIdx.x; i < nq; i += blockDim.x) { 61 | const int head_idx = i / embed_dim; 62 | const int token_head = token_idx * query_stride + head_idx * head_size; 63 | const int rot_offset = i % embed_dim; 64 | apply_rotary_embedding(query + token_head, cos_ptr, 65 | sin_ptr, rot_offset, embed_dim); 66 | } 67 | 68 | const int nk = num_kv_heads * embed_dim; 69 | for (int i = threadIdx.x; i < nk; i += blockDim.x) { 70 | const int head_idx = i / embed_dim; 71 | const int token_head = token_idx * key_stride + head_idx * head_size; 72 | const int rot_offset = i % embed_dim; 73 | apply_rotary_embedding(key + token_head, cos_ptr, 74 | sin_ptr, rot_offset, embed_dim); 75 | } 76 | } 77 | 78 | } // namespace vllm 79 | 80 | void rotary_embedding( 81 | torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] 82 | torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] 83 | torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] 84 | int head_size, 85 | torch::Tensor& cos_sin_cache, // [max_position, rot_dim] 86 | bool is_neox) { 87 | int64_t num_tokens = query.numel() / query.size(-1); 88 | int rot_dim = cos_sin_cache.size(1); 89 | int num_heads = query.size(-1) / head_size; 90 | int num_kv_heads = key.size(-1) / head_size; 91 | int query_stride = query.stride(-2); 92 | int key_stride = key.stride(-2); 93 | 94 | dim3 grid(num_tokens); 95 | dim3 block(std::min(num_heads * rot_dim / 2, 512)); 96 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 97 | VLLM_DISPATCH_FLOATING_TYPES( 98 | query.scalar_type(), 99 | "rotary_embedding", 100 | [&] { 101 | if (is_neox) { 102 | vllm::rotary_embedding_kernel<<>>( 103 | positions.data_ptr(), 104 | query.data_ptr(), 105 | key.data_ptr(), 106 | cos_sin_cache.data_ptr(), 107 | rot_dim, 108 | query_stride, 109 | key_stride, 110 | num_heads, 111 | num_kv_heads, 112 | head_size); 113 | } else { 114 | vllm::rotary_embedding_kernel<<>>( 115 | positions.data_ptr(), 116 | query.data_ptr(), 117 | key.data_ptr(), 118 | cos_sin_cache.data_ptr(), 119 | rot_dim, 120 | query_stride, 121 | key_stride, 122 | 
num_heads, 123 | num_kv_heads, 124 | head_size); 125 | } 126 | }); 127 | } 128 | -------------------------------------------------------------------------------- /csrc/pybind.cpp: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | #include "cuda_utils.h" 3 | #include "ops.h" 4 | #include 5 | 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 7 | // vLLM custom ops 8 | pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); 9 | 10 | // Attention ops 11 | ops.def( 12 | "paged_attention_v1", 13 | &paged_attention_v1, 14 | "Compute the attention between an input query and the cached keys/values using PagedAttention."); 15 | ops.def( 16 | "paged_attention_v2", 17 | &paged_attention_v2, 18 | "PagedAttention V2."); 19 | 20 | // Activation ops 21 | ops.def( 22 | "silu_and_mul", 23 | &silu_and_mul, 24 | "Activation function used in SwiGLU."); 25 | ops.def( 26 | "gelu_new", 27 | &gelu_new, 28 | "GELU implementation used in GPT-2."); 29 | ops.def( 30 | "gelu_fast", 31 | &gelu_fast, 32 | "Approximate GELU implementation."); 33 | 34 | // Layernorm 35 | ops.def( 36 | "rms_norm", 37 | &rms_norm, 38 | "Apply Root Mean Square (RMS) Normalization to the input tensor."); 39 | 40 | ops.def( 41 | "fused_add_rms_norm", 42 | &fused_add_rms_norm, 43 | "In-place fused Add and RMS Normalization"); 44 | 45 | // Rotary embedding 46 | ops.def( 47 | "rotary_embedding", 48 | &rotary_embedding, 49 | "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); 50 | 51 | // Quantization ops 52 | ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); 53 | ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); 54 | 55 | // Cache ops 56 | pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); 57 | cache_ops.def( 58 | "swap_blocks", 59 | &swap_blocks, 60 | "Swap in (out) the cache blocks from src to dst"); 61 | cache_ops.def( 62 | "copy_blocks", 63 | ©_blocks, 64 | "Copy the cache blocks from src to dst"); 65 | cache_ops.def( 66 | "reshape_and_cache", 67 | &reshape_and_cache, 68 | "Reshape the key and value tensors and cache them"); 69 | cache_ops.def( 70 | "gather_cached_kv", 71 | &gather_cached_kv, 72 | "Gather key and value from the cache into contiguous QKV tensors"); 73 | 74 | // Cuda utils 75 | pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils"); 76 | cuda_utils.def( 77 | "get_device_attribute", 78 | &get_device_attribute, 79 | "Gets the specified device attribute."); 80 | } 81 | -------------------------------------------------------------------------------- /csrc/quantization/awq/dequantize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Adapted from https://github.com/mit-han-lab/llm-awq 3 | Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h 4 | @article{lin2023awq, 5 | title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, 6 | author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, 7 | journal={arXiv}, 8 | year={2023} 9 | } 10 | */ 11 | 12 | #pragma once 13 | 14 | namespace vllm { 15 | namespace awq { 16 | 17 | __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) 18 | { 19 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 20 | assert(false); 21 | #else 22 | uint4 result; 23 | 24 | 
uint32_t* h = reinterpret_cast(&result); 25 | uint32_t const i4s = reinterpret_cast(source); 26 | 27 | // First, we extract the i4s and construct an intermediate fp16 number. 28 | static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; 29 | static constexpr uint32_t BOTTOM_MASK = 0x000f000f; 30 | static constexpr uint32_t TOP_MASK = 0x00f000f0; 31 | static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; 32 | 33 | // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing 34 | // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. 35 | // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and 36 | // elt_67 to fp16 without having to shift them to the bottom bits before hand. 37 | 38 | // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue 39 | // immediately before required. 40 | const uint32_t top_i4s = i4s >> 8; 41 | // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 42 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 43 | : "=r"(h[0]) 44 | : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 45 | // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 46 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 47 | : "=r"(h[1]) 48 | : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 49 | // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 50 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 51 | : "=r"(h[2]) 52 | : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 53 | // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 54 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 55 | : "=r"(h[3]) 56 | : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 57 | 58 | // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the 59 | // half2 ctor. In this case, I chose performance reliability over code readability. 60 | 61 | // This is the half2 {1032, 1032} represented as an integer. 62 | // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; 63 | // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] 64 | static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; 65 | // This is the half2 {1 / 16, 1 / 16} represented as an integer. 66 | static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; 67 | // This is the half2 {-72, -72} represented as an integer. 68 | // static constexpr uint32_t NEG_72 = 0xd480d480; 69 | // Haotian: Let's use {-64, -64}. 70 | static constexpr uint32_t NEG_64 = 0xd400d400; 71 | 72 | // Finally, we construct the output numbers. 
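// At this point each fp16 lane holds (1024 + x) for the low-nibble elements and
// (1024 + 16 * x) for the high-nibble elements, where x is the original unsigned 4-bit
// value. A plain subtract of FP16_TOP_MAGIC_NUM (1024) recovers x for the former; for
// the latter, a single fma by 1/16 with an addend of -64 both divides out the extra
// factor of 16 and removes the bias (1024 / 16 = 64).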
73 | // Convert elt_01 74 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); 75 | // Convert elt_23 76 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 77 | // Convert elt_45 78 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); 79 | // Convert elt_67 80 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 81 | 82 | return result; 83 | #endif 84 | } 85 | 86 | } // namespace awq 87 | } // namespace vllm 88 | -------------------------------------------------------------------------------- /csrc/quantization/squeezellm/quant_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // half-tensor 8 | #include 9 | #include 10 | 11 | #define BLOCKWIDTH 128 12 | #define BLOCKHEIGHT4 16 13 | 14 | namespace vllm { 15 | namespace squeezellm { 16 | 17 | __device__ inline unsigned int as_unsigned(int i) { 18 | return *reinterpret_cast(&i); 19 | } 20 | 21 | // 4-bit matvec kernel (LUT-based) 22 | __global__ void NUQ4MatMulKernel( 23 | const half2* __restrict__ vec, 24 | const int* __restrict__ mat, 25 | half2* __restrict__ mul, 26 | const __half* __restrict__ lookup_table, 27 | int height, 28 | int width, 29 | int batch, 30 | int vec_height 31 | ) { 32 | 33 | const int blockwidth2 = BLOCKWIDTH / 2; 34 | 35 | int row = BLOCKHEIGHT4 * blockIdx.x; 36 | int col = BLOCKWIDTH * blockIdx.y + threadIdx.x; 37 | 38 | __shared__ half2 blockvec[blockwidth2]; 39 | 40 | __shared__ __half deq2[16][BLOCKWIDTH]; 41 | int off = threadIdx.x; 42 | int column_offset = col * 16; 43 | for (int val = 0; val < 16; val += 1) { 44 | int lut_index = column_offset + val; 45 | deq2[val][off] = lookup_table[lut_index]; 46 | } 47 | 48 | __half res; 49 | half2 res2; 50 | half2 tmp2; 51 | 52 | int i; 53 | int k; 54 | 55 | unsigned int tmp1; 56 | unsigned int lut_index1, lut_index2; 57 | 58 | for (int b = 0; b < batch; ++b){ 59 | i = width * row + col; 60 | res = __int2half_rd(0); 61 | k = 0; 62 | 63 | __syncthreads(); 64 | if (threadIdx.x < blockwidth2) 65 | blockvec[threadIdx.x] = vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 + threadIdx.x]; 66 | __syncthreads(); 67 | 68 | while (k < blockwidth2) { 69 | tmp1 = as_unsigned(mat[i]); 70 | 71 | res2 = {}; 72 | tmp2 = {}; 73 | 74 | lut_index1 = tmp1 & 0xF; 75 | lut_index2 = (tmp1 >> 4) & 0xF; 76 | tmp2.x = deq2[lut_index1][off]; 77 | tmp2.y = deq2[lut_index2][off]; 78 | res2 = __hfma2(tmp2, blockvec[k + 0], res2); 79 | 80 | lut_index1 = (tmp1 >> 8) & 0xF; 81 | lut_index2 = (tmp1 >> 12) & 0xF; 82 | tmp2.x = deq2[lut_index1][off]; 83 | tmp2.y = deq2[lut_index2][off]; 84 | res2 = __hfma2(tmp2, blockvec[k + 1], res2); 85 | 86 | lut_index1 = (tmp1 >> 16) & 0xF; 87 | lut_index2 = (tmp1 >> 20) & 0xF; 88 | tmp2.x = deq2[lut_index1][off]; 89 | tmp2.y = deq2[lut_index2][off]; 90 | res2 = __hfma2(tmp2, blockvec[k + 2], res2); 91 | 92 | lut_index1 = (tmp1 >> 24) & 0xF; 93 | lut_index2 = (tmp1 >> 28) & 0xF; 94 | tmp2.x = deq2[lut_index1][off]; 95 | tmp2.y = deq2[lut_index2][off]; 96 | res2 = __hfma2(tmp2, blockvec[k + 3], res2); 97 | 98 | res = __hadd(__hadd(res2.x, res2.y), res); 99 | 100 | i += width; 101 | k += 4; 102 | } 103 | 104 | // col%2 -> only set one of the two values 105 | half2 res3 = {}; 106 | if (col % 2 == 0) { 107 | res3.x = res; 108 | } else { 109 | res3.y = res; 
110 | } 111 | 112 | atomicAdd(&mul[b * width / 2 + col / 2], res3); 113 | } 114 | } 115 | 116 | } // namespace squeezellm 117 | } // namespace vllm 118 | 119 | // 4-bit matvec kernel (LUT-based) 120 | void squeezellm_gemm( 121 | torch::Tensor vec, 122 | torch::Tensor mat, 123 | torch::Tensor mul, 124 | torch::Tensor lookup_table 125 | ) { 126 | int height = mat.size(0); 127 | int width = mat.size(1); 128 | 129 | int batch = vec.size(0); 130 | int vec_height = vec.size(1); 131 | 132 | dim3 blocks( 133 | (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4, 134 | (width + BLOCKWIDTH - 1) / BLOCKWIDTH 135 | ); 136 | dim3 threads(BLOCKWIDTH); 137 | 138 | vllm::squeezellm::NUQ4MatMulKernel<<>>( 139 | (half2*) vec.data(), 140 | mat.data_ptr(), 141 | (half2*) mul.data(), 142 | (__half*) lookup_table.data(), 143 | height, width, batch, vec_height 144 | ); 145 | } 146 | 147 | #undef BLOCKWIDTH 148 | #undef BLOCKHEIGHT4 149 | -------------------------------------------------------------------------------- /csrc/reduction_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | namespace vllm { 21 | 22 | template 23 | __inline__ __device__ T warpReduceSum(T val) { 24 | #pragma unroll 25 | for (int mask = 16; mask > 0; mask >>= 1) 26 | val += __shfl_xor_sync(0xffffffff, val, mask, 32); 27 | return val; 28 | } 29 | 30 | /* Calculate the sum of all elements in a block */ 31 | template 32 | __inline__ __device__ T blockReduceSum(T val) { 33 | static __shared__ T shared[32]; 34 | int lane = threadIdx.x & 0x1f; 35 | int wid = threadIdx.x >> 5; 36 | 37 | val = warpReduceSum(val); 38 | 39 | if (lane == 0) 40 | shared[wid] = val; 41 | 42 | __syncthreads(); 43 | 44 | // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent 45 | // blockDim.x is not divided by 32 46 | val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); 47 | val = warpReduceSum(val); 48 | return val; 49 | } 50 | 51 | } // namespace vllm 52 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # 
Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'vLLM' 21 | copyright = '2023, vLLM Team' 22 | author = 'the vLLM Team' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.napoleon", 32 | "sphinx.ext.viewcode", 33 | "sphinx.ext.intersphinx", 34 | "sphinx_copybutton", 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns = [] 44 | 45 | # Exclude the prompt "$" when copying code 46 | copybutton_prompt_text = r"\$ " 47 | copybutton_prompt_is_regexp = True 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_title = project 55 | html_theme = 'sphinx_book_theme' 56 | html_logo = 'assets/logos/vllm-logo-text-light.png' 57 | html_theme_options = { 58 | 'logo_only': True, 59 | 'path_to_docs': 'docs/source', 60 | 'repository_url': 'https://github.com/vllm-project/vllm', 61 | 'use_repository_button': True, 62 | } 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | html_static_path = ['_static'] 68 | -------------------------------------------------------------------------------- /docs/source/getting_started/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============ 5 | 6 | vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. 7 | 8 | Requirements 9 | ------------ 10 | 11 | * OS: Linux 12 | * Python: 3.8 -- 3.11 13 | * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) 14 | 15 | Install with pip 16 | ---------------- 17 | 18 | You can install vLLM using pip: 19 | 20 | .. code-block:: console 21 | 22 | $ # (Optional) Create a new conda environment. 23 | $ conda create -n myenv python=3.8 -y 24 | $ conda activate myenv 25 | 26 | $ # Install vLLM with CUDA 12.1. 27 | $ pip install vllm 28 | 29 | .. note:: 30 | 31 | As of now, vLLM's binaries are compiled on CUDA 12.1 by default. 
32 | However, you can install vLLM with CUDA 11.8 by running: 33 | 34 | .. code-block:: console 35 | 36 | $ # Install vLLM with CUDA 11.8. 37 | $ # Replace `cp310` with your Python version (e.g., `cp38`, `cp39`, `cp311`). 38 | $ pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp310-cp310-manylinux1_x86_64.whl 39 | 40 | $ # Re-install PyTorch with CUDA 11.8. 41 | $ pip uninstall torch -y 42 | $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118 43 | 44 | 45 | .. _build_from_source: 46 | 47 | Build from source 48 | ----------------- 49 | 50 | You can also build and install vLLM from source: 51 | 52 | .. code-block:: console 53 | 54 | $ git clone https://github.com/vllm-project/vllm.git 55 | $ cd vllm 56 | $ pip install -e . # This may take 5-10 minutes. 57 | 58 | .. tip:: 59 | If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. 60 | 61 | .. code-block:: console 62 | 63 | $ # Use `--ipc=host` to make sure the shared memory is large enough. 64 | $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 65 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to vLLM! 2 | ================ 3 | 4 | .. figure:: ./assets/logos/vllm-logo-text-light.png 5 | :width: 60% 6 | :align: center 7 | :alt: vLLM 8 | :class: no-scaled-link 9 | 10 | .. raw:: html 11 | 12 |

13 | Easy, fast, and cheap LLM serving for everyone 14 | 15 | 16 | 17 | 18 | 19 | Star 20 | Watch 21 | Fork 22 |
23 | 24 | 25 | 26 | vLLM is a fast and easy-to-use library for LLM inference and serving. 27 | 28 | vLLM is fast with: 29 | 30 | * State-of-the-art serving throughput 31 | * Efficient management of attention key and value memory with **PagedAttention** 32 | * Continuous batching of incoming requests 33 | * Optimized CUDA kernels 34 | 35 | vLLM is flexible and easy to use with: 36 | 37 | * Seamless integration with popular HuggingFace models 38 | * High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more 39 | * Tensor parallelism support for distributed inference 40 | * Streaming outputs 41 | * OpenAI-compatible API server 42 | 43 | For more information, check out the following: 44 | 45 | * `vLLM announcing blog post `_ (intro to PagedAttention) 46 | * `vLLM paper `_ (SOSP 2023) 47 | * `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency `_ by Cade Daniel et al. 48 | 49 | 50 | 51 | Documentation 52 | ------------- 53 | 54 | .. toctree:: 55 | :maxdepth: 1 56 | :caption: Getting Started 57 | 58 | getting_started/installation 59 | getting_started/quickstart 60 | 61 | .. toctree:: 62 | :maxdepth: 1 63 | :caption: Serving 64 | 65 | serving/distributed_serving 66 | serving/run_on_sky 67 | serving/deploying_with_triton 68 | serving/deploying_with_docker 69 | 70 | .. toctree:: 71 | :maxdepth: 1 72 | :caption: Models 73 | 74 | models/supported_models 75 | models/adding_model 76 | models/engine_args 77 | 78 | .. toctree:: 79 | :maxdepth: 1 80 | :caption: Quantization 81 | 82 | quantization/auto_awq -------------------------------------------------------------------------------- /docs/source/models/engine_args.rst: -------------------------------------------------------------------------------- 1 | .. _engine_args: 2 | 3 | Engine Arguments 4 | ================ 5 | 6 | Below, you can find an explanation of every engine argument for vLLM: 7 | 8 | .. option:: --model 9 | 10 | Name or path of the huggingface model to use. 11 | 12 | .. option:: --tokenizer 13 | 14 | Name or path of the huggingface tokenizer to use. 15 | 16 | .. option:: --revision 17 | 18 | The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. 19 | 20 | .. option:: --tokenizer-revision 21 | 22 | The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. 23 | 24 | .. option:: --tokenizer-mode {auto,slow} 25 | 26 | The tokenizer mode. 27 | 28 | * "auto" will use the fast tokenizer if available. 29 | * "slow" will always use the slow tokenizer. 30 | 31 | .. option:: --trust-remote-code 32 | 33 | Trust remote code from huggingface. 34 | 35 | .. option:: --download-dir 36 | 37 | Directory to download and load the weights, default to the default cache dir of huggingface. 38 | 39 | .. option:: --load-format {auto,pt,safetensors,npcache,dummy} 40 | 41 | The format of the model weights to load. 42 | 43 | * "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available. 44 | * "pt" will load the weights in the pytorch bin format. 45 | * "safetensors" will load the weights in the safetensors format. 46 | * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading. 47 | * "dummy" will initialize the weights with random values, mainly for profiling. 48 | 49 | .. 
option:: --dtype {auto,half,float16,bfloat16,float,float32} 50 | 51 | Data type for model weights and activations. 52 | 53 | * "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. 54 | * "half" for FP16. Recommended for AWQ quantization. 55 | * "float16" is the same as "half". 56 | * "bfloat16" for a balance between precision and range. 57 | * "float" is shorthand for FP32 precision. 58 | * "float32" for FP32 precision. 59 | 60 | .. option:: --max-model-len 61 | 62 | Model context length. If unspecified, will be automatically derived from the model config. 63 | 64 | .. option:: --worker-use-ray 65 | 66 | Use Ray for distributed serving, will be automatically set when using more than 1 GPU. 67 | 68 | .. option:: --pipeline-parallel-size (-pp) 69 | 70 | Number of pipeline stages. 71 | 72 | .. option:: --tensor-parallel-size (-tp) 73 | 74 | Number of tensor parallel replicas. 75 | 76 | .. option:: --max-parallel-loading-workers 77 | 78 | Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models. 79 | 80 | .. option:: --block-size {8,16,32} 81 | 82 | Token block size for contiguous chunks of tokens. 83 | 84 | .. option:: --seed 85 | 86 | Random seed for operations. 87 | 88 | .. option:: --swap-space 89 | 90 | CPU swap space size (GiB) per GPU. 91 | 92 | .. option:: --gpu-memory-utilization 93 | 94 | The percentage of GPU memory to be used for the model executor. 95 | 96 | .. option:: --max-num-batched-tokens 97 | 98 | Maximum number of batched tokens per iteration. 99 | 100 | .. option:: --max-num-seqs 101 | 102 | Maximum number of sequences per iteration. 103 | 104 | .. option:: --max-paddings 105 | 106 | Maximum number of paddings in a batch. 107 | 108 | .. option:: --disable-log-stats 109 | 110 | Disable logging statistics. 111 | 112 | .. option:: --quantization (-q) {awq,squeezellm,None} 113 | 114 | Method used to quantize the weights. 115 | -------------------------------------------------------------------------------- /docs/source/models/supported_models.rst: -------------------------------------------------------------------------------- 1 | .. _supported_models: 2 | 3 | Supported Models 4 | ================ 5 | 6 | vLLM supports a variety of generative Transformer models in `HuggingFace Transformers `_. 7 | The following is the list of model architectures that are currently supported by vLLM. 8 | Alongside each architecture, we include some popular models that use it. 9 | 10 | .. list-table:: 11 | :widths: 25 25 50 12 | :header-rows: 1 13 | 14 | * - Architecture 15 | - Models 16 | - Example HuggingFace Models 17 | * - :code:`AquilaForCausalLM` 18 | - Aquila 19 | - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. 20 | * - :code:`BaiChuanForCausalLM` 21 | - Baichuan 22 | - :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc. 23 | * - :code:`ChatGLMModel` 24 | - ChatGLM 25 | - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. 26 | * - :code:`BloomForCausalLM` 27 | - BLOOM, BLOOMZ, BLOOMChat 28 | - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. 29 | * - :code:`FalconForCausalLM` 30 | - Falcon 31 | - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. 32 | * - :code:`GPT2LMHeadModel` 33 | - GPT-2 34 | - :code:`gpt2`, :code:`gpt2-xl`, etc. 
35 | * - :code:`GPTBigCodeForCausalLM` 36 | - StarCoder, SantaCoder, WizardCoder 37 | - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. 38 | * - :code:`GPTJForCausalLM` 39 | - GPT-J 40 | - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. 41 | * - :code:`GPTNeoXForCausalLM` 42 | - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM 43 | - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. 44 | * - :code:`InternLMForCausalLM` 45 | - InternLM 46 | - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. 47 | * - :code:`LlamaForCausalLM` 48 | - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco 49 | - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, etc. 50 | * - :code:`MistralForCausalLM` 51 | - Mistral, Mistral-Instruct 52 | - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. 53 | * - :code:`MPTForCausalLM` 54 | - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter 55 | - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. 56 | * - :code:`OPTForCausalLM` 57 | - OPT, OPT-IML 58 | - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. 59 | * - :code:`PhiForCausalLM` 60 | - Phi-1.5 61 | - :code:`microsoft/phi-1_5`, etc. 62 | * - :code:`QWenLMHeadModel` 63 | - Qwen 64 | - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. 65 | * - :code:`YiForCausalLM` 66 | - Yi 67 | - :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. 68 | 69 | If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. 70 | Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. 71 | Alternatively, you can raise an issue on our `GitHub `_ project. 72 | 73 | .. tip:: 74 | The easiest way to check if your model is supported is to run the program below: 75 | 76 | .. code-block:: python 77 | 78 | from vllm import LLM 79 | 80 | llm = LLM(model=...) # Name or path of your model 81 | output = llm.generate("Hello, my name is") 82 | print(output) 83 | 84 | To use model from www.modelscope.cn 85 | 86 | .. code-block:: shell 87 | 88 | $ export VLLM_USE_MODELSCOPE=True 89 | 90 | .. code-block:: python 91 | 92 | from vllm import LLM 93 | 94 | llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model 95 | output = llm.generate("Hello, my name is") 96 | print(output) 97 | 98 | If vLLM successfully generates text, it indicates that your model is supported. 99 | -------------------------------------------------------------------------------- /docs/source/quantization/auto_awq.rst: -------------------------------------------------------------------------------- 1 | .. _auto_awq: 2 | 3 | AutoAWQ 4 | ================== 5 | 6 | To create a new 4-bit quantized model, you can leverage `AutoAWQ `_. 7 | Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. 8 | The main benefits are lower latency and memory usage. 9 | 10 | You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface `_. 11 | 12 | .. 
code-block:: console 13 | 14 | $ pip install autoawq 15 | 16 | After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize Vicuna 7B v1.5: 17 | 18 | .. code-block:: python 19 | 20 | from awq import AutoAWQForCausalLM 21 | from transformers import AutoTokenizer 22 | 23 | model_path = 'lmsys/vicuna-7b-v1.5' 24 | quant_path = 'vicuna-7b-v1.5-awq' 25 | quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } 26 | 27 | # Load model 28 | model = AutoAWQForCausalLM.from_pretrained(model_path, **{"low_cpu_mem_usage": True}) 29 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 30 | 31 | # Quantize 32 | model.quantize(tokenizer, quant_config=quant_config) 33 | 34 | # Save quantized model 35 | model.save_quantized(quant_path) 36 | tokenizer.save_pretrained(quant_path) 37 | 38 | To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ `_ with the following command: 39 | 40 | .. code-block:: console 41 | 42 | $ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq 43 | 44 | AWQ models are also supported directly through the LLM entrypoint: 45 | 46 | .. code-block:: python 47 | 48 | from vllm import LLM, SamplingParams 49 | 50 | # Sample prompts. 51 | prompts = [ 52 | "Hello, my name is", 53 | "The president of the United States is", 54 | "The capital of France is", 55 | "The future of AI is", 56 | ] 57 | # Create a sampling params object. 58 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 59 | 60 | # Create an LLM. 61 | llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") 62 | # Generate texts from the prompts. The output is a list of RequestOutput objects 63 | # that contain the prompt, generated text, and other information. 64 | outputs = llm.generate(prompts, sampling_params) 65 | # Print the outputs. 66 | for output in outputs: 67 | prompt = output.prompt 68 | generated_text = output.outputs[0].text 69 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 70 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_docker.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_docker: 2 | 3 | Deploying with Docker 4 | ============================ 5 | 6 | vLLM offers official docker image for deployment. 7 | The image can be used to run OpenAI compatible server. 8 | The image is available on Docker Hub as `vllm/vllm-openai `_. 9 | 10 | .. code-block:: console 11 | 12 | $ docker run --runtime nvidia --gpus all \ 13 | -v ~/.cache/huggingface:/root/.cache/huggingface \ 14 | -p 8000:8000 \ 15 | --env "HUGGING_FACE_HUB_TOKEN=" \ 16 | vllm/vllm-openai:latest \ 17 | --model mistralai/Mistral-7B-v0.1 18 | 19 | 20 | You can build and run vLLM from source via the provided dockerfile. To build vLLM: 21 | 22 | .. code-block:: console 23 | 24 | $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --build-arg max_jobs=8 25 | 26 | To run vLLM: 27 | 28 | .. 
code-block:: console 29 | 30 | $ docker run --runtime nvidia --gpus all \ 31 | -v ~/.cache/huggingface:/root/.cache/huggingface \ 32 | -p 8000:8000 \ 33 | --env "HUGGING_FACE_HUB_TOKEN=" \ 34 | vllm/vllm-openai 35 | 36 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 7 | -------------------------------------------------------------------------------- /docs/source/serving/distributed_serving.rst: -------------------------------------------------------------------------------- 1 | .. _distributed_serving: 2 | 3 | Distributed Inference and Serving 4 | ================================= 5 | 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with `Ray `_. To run distributed inference, install Ray with: 7 | 8 | .. code-block:: console 9 | 10 | $ pip install ray 11 | 12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: 13 | 14 | .. code-block:: python 15 | 16 | from vllm import LLM 17 | llm = LLM("facebook/opt-13b", tensor_parallel_size=4) 18 | output = llm.generate("San Franciso is a") 19 | 20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: 21 | 22 | .. code-block:: console 23 | 24 | $ python -m vllm.entrypoints.api_server \ 25 | $ --model facebook/opt-13b \ 26 | $ --tensor-parallel-size 4 27 | 28 | To scale vLLM beyond a single machine, start a `Ray runtime `_ via CLI before running vLLM: 29 | 30 | .. code-block:: console 31 | 32 | $ # On head node 33 | $ ray start --head 34 | 35 | $ # On worker nodes 36 | $ ray start --address= 37 | 38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. -------------------------------------------------------------------------------- /docs/source/serving/run_on_sky.rst: -------------------------------------------------------------------------------- 1 | .. _on_cloud: 2 | 3 | Running on clouds with SkyPilot 4 | =============================== 5 | 6 | .. raw:: html 7 | 8 |

9 | vLLM 10 |
11 | 12 | vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot `__, an open-source framework for running LLMs on any cloud. 13 | 14 | To install SkyPilot and setup your cloud credentials, run: 15 | 16 | .. code-block:: console 17 | 18 | $ pip install skypilot 19 | $ sky check 20 | 21 | See the vLLM SkyPilot YAML for serving, `serving.yaml `__. 22 | 23 | .. code-block:: yaml 24 | 25 | resources: 26 | accelerators: A100 27 | 28 | envs: 29 | MODEL_NAME: decapoda-research/llama-13b-hf 30 | TOKENIZER: hf-internal-testing/llama-tokenizer 31 | 32 | setup: | 33 | conda create -n vllm python=3.9 -y 34 | conda activate vllm 35 | git clone https://github.com/vllm-project/vllm.git 36 | cd vllm 37 | pip install . 38 | pip install gradio 39 | 40 | run: | 41 | conda activate vllm 42 | echo 'Starting vllm api server...' 43 | python -u -m vllm.entrypoints.api_server \ 44 | --model $MODEL_NAME \ 45 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 46 | --tokenizer $TOKENIZER 2>&1 | tee api_server.log & 47 | echo 'Waiting for vllm api server to start...' 48 | while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done 49 | echo 'Starting gradio server...' 50 | python vllm/examples/gradio_webserver.py 51 | 52 | Start the serving the LLaMA-13B model on an A100 GPU: 53 | 54 | .. code-block:: console 55 | 56 | $ sky launch serving.yaml 57 | 58 | Check the output of the command. There will be a sharable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. 59 | 60 | .. code-block:: console 61 | 62 | (task, pid=7431) Running on public URL: https://.gradio.live 63 | 64 | **Optional**: Serve the 65B model instead of the default 13B and use more GPU: 65 | 66 | .. code-block:: console 67 | 68 | sky launch -c vllm-serve-new -s serve.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf 69 | 70 | -------------------------------------------------------------------------------- /examples/api_client.py: -------------------------------------------------------------------------------- 1 | """Example Python client for vllm.entrypoints.api_server""" 2 | 3 | import argparse 4 | import json 5 | from typing import Iterable, List 6 | 7 | import requests 8 | 9 | 10 | def clear_line(n: int = 1) -> None: 11 | LINE_UP = '\033[1A' 12 | LINE_CLEAR = '\x1b[2K' 13 | for _ in range(n): 14 | print(LINE_UP, end=LINE_CLEAR, flush=True) 15 | 16 | 17 | def post_http_request(prompt: str, 18 | api_url: str, 19 | n: int = 1, 20 | stream: bool = False) -> requests.Response: 21 | headers = {"User-Agent": "Test Client"} 22 | pload = { 23 | "prompt": prompt, 24 | "n": n, 25 | "use_beam_search": True, 26 | "temperature": 0.0, 27 | "max_tokens": 16, 28 | "stream": stream, 29 | } 30 | response = requests.post(api_url, headers=headers, json=pload, stream=True) 31 | return response 32 | 33 | 34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: 35 | for chunk in response.iter_lines(chunk_size=8192, 36 | decode_unicode=False, 37 | delimiter=b"\0"): 38 | if chunk: 39 | data = json.loads(chunk.decode("utf-8")) 40 | output = data["text"] 41 | yield output 42 | 43 | 44 | def get_response(response: requests.Response) -> List[str]: 45 | data = json.loads(response.content) 46 | output = data["text"] 47 | return output 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--host", type=str, default="localhost") 53 | parser.add_argument("--port", type=int, 
default=8000) 54 | parser.add_argument("--n", type=int, default=4) 55 | parser.add_argument("--prompt", type=str, default="San Francisco is a") 56 | parser.add_argument("--stream", action="store_true") 57 | args = parser.parse_args() 58 | prompt = args.prompt 59 | api_url = f"http://{args.host}:{args.port}/generate" 60 | n = args.n 61 | stream = args.stream 62 | 63 | print(f"Prompt: {prompt!r}\n", flush=True) 64 | response = post_http_request(prompt, api_url, n, stream) 65 | 66 | if stream: 67 | num_printed_lines = 0 68 | for h in get_streaming_response(response): 69 | clear_line(num_printed_lines) 70 | num_printed_lines = 0 71 | for i, line in enumerate(h): 72 | num_printed_lines += 1 73 | print(f"Beam candidate {i}: {line!r}", flush=True) 74 | else: 75 | output = get_response(response) 76 | for i, line in enumerate(output): 77 | print(f"Beam candidate {i}: {line!r}", flush=True) 78 | -------------------------------------------------------------------------------- /examples/gradio_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import gradio as gr 5 | import requests 6 | 7 | 8 | def http_bot(prompt): 9 | headers = {"User-Agent": "vLLM Client"} 10 | pload = { 11 | "prompt": prompt, 12 | "stream": True, 13 | "max_tokens": 128, 14 | } 15 | response = requests.post(args.model_url, 16 | headers=headers, 17 | json=pload, 18 | stream=True) 19 | 20 | for chunk in response.iter_lines(chunk_size=8192, 21 | decode_unicode=False, 22 | delimiter=b"\0"): 23 | if chunk: 24 | data = json.loads(chunk.decode("utf-8")) 25 | output = data["text"][0] 26 | yield output 27 | 28 | 29 | def build_demo(): 30 | with gr.Blocks() as demo: 31 | gr.Markdown("# vLLM text completion demo\n") 32 | inputbox = gr.Textbox(label="Input", 33 | placeholder="Enter text and press ENTER") 34 | outputbox = gr.Textbox(label="Output", 35 | placeholder="Generated result from the model") 36 | inputbox.submit(http_bot, [inputbox], [outputbox]) 37 | return demo 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", type=str, default=None) 43 | parser.add_argument("--port", type=int, default=8001) 44 | parser.add_argument("--model-url", 45 | type=str, 46 | default="http://localhost:8000/generate") 47 | args = parser.parse_args() 48 | 49 | demo = build_demo() 50 | demo.queue(concurrency_count=100).launch(server_name=args.host, 51 | server_port=args.port, 52 | share=True) 53 | -------------------------------------------------------------------------------- /examples/llm_engine_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List, Tuple 3 | 4 | from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput 5 | 6 | 7 | def create_test_prompts() -> List[Tuple[str, SamplingParams]]: 8 | """Create a list of test prompts with their sampling parameters.""" 9 | return [ 10 | ("A robot may not injure a human being", 11 | SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), 12 | ("To be or not to be,", 13 | SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), 14 | ("What is the meaning of life?", 15 | SamplingParams(n=2, 16 | best_of=5, 17 | temperature=0.8, 18 | top_p=0.95, 19 | frequency_penalty=0.1)), 20 | ("It is only with the heart that one can see rightly", 21 | SamplingParams(n=3, best_of=3, use_beam_search=True, 22 | temperature=0.0)), 23 | ] 24 | 25 | 26 | def 
process_requests(engine: LLMEngine, 27 | test_prompts: List[Tuple[str, SamplingParams]]): 28 | """Continuously process a list of prompts and handle the outputs.""" 29 | request_id = 0 30 | 31 | while test_prompts or engine.has_unfinished_requests(): 32 | if test_prompts: 33 | prompt, sampling_params = test_prompts.pop(0) 34 | engine.add_request(str(request_id), prompt, sampling_params) 35 | request_id += 1 36 | 37 | request_outputs: List[RequestOutput] = engine.step() 38 | 39 | for request_output in request_outputs: 40 | if request_output.finished: 41 | print(request_output) 42 | 43 | 44 | def initialize_engine(args: argparse.Namespace) -> LLMEngine: 45 | """Initialize the LLMEngine from the command line arguments.""" 46 | engine_args = EngineArgs.from_cli_args(args) 47 | return LLMEngine.from_engine_args(engine_args) 48 | 49 | 50 | def main(args: argparse.Namespace): 51 | """Main function that sets up and runs the prompt processing.""" 52 | engine = initialize_engine(args) 53 | test_prompts = create_test_prompts() 54 | process_requests(engine, test_prompts) 55 | 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser( 59 | description='Demo on using the LLMEngine class directly') 60 | parser = EngineArgs.add_cli_args(parser) 61 | args = parser.parse_args() 62 | main(args) 63 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /examples/openai_chatcompletion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Chat completion API 14 | chat_completion = openai.ChatCompletion.create( 15 | model=model, 16 | messages=[{ 17 | "role": "system", 18 | "content": "You are a helpful assistant." 19 | }, { 20 | "role": "user", 21 | "content": "Who won the world series in 2020?" 22 | }, { 23 | "role": 24 | "assistant", 25 | "content": 26 | "The Los Angeles Dodgers won the World Series in 2020." 27 | }, { 28 | "role": "user", 29 | "content": "Where was it played?" 
30 | }]) 31 | 32 | print("Chat completion results:") 33 | print(chat_completion) 34 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Completion API 14 | stream = False 15 | completion = openai.Completion.create( 16 | model=model, 17 | prompt="A robot may not injure a human being", 18 | echo=False, 19 | n=2, 20 | stream=stream, 21 | logprobs=3) 22 | 23 | print("Completion results:") 24 | if stream: 25 | for c in completion: 26 | print(c) 27 | else: 28 | print(completion) 29 | -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # YAPF formatter, adapted from ray and skypilot. 3 | # 4 | # Usage: 5 | # # Do work and commit your work. 6 | 7 | # # Format files that differ from origin/main. 8 | # bash format.sh 9 | 10 | # # Commit changed files with message 'Run yapf and ruff' 11 | # 12 | # 13 | # YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. 14 | # You are encouraged to run this locally before pushing changes for review. 15 | 16 | # Cause the script to exit if a single command fails 17 | set -eo pipefail 18 | 19 | # this stops git rev-parse from failing if we run this from the .git directory 20 | builtin cd "$(dirname "${BASH_SOURCE:-$0}")" 21 | ROOT="$(git rev-parse --show-toplevel)" 22 | builtin cd "$ROOT" || exit 1 23 | 24 | YAPF_VERSION=$(yapf --version | awk '{print $2}') 25 | RUFF_VERSION=$(ruff --version | awk '{print $2}') 26 | MYPY_VERSION=$(mypy --version | awk '{print $2}') 27 | 28 | # # params: tool name, tool version, required version 29 | tool_version_check() { 30 | if [[ $2 != $3 ]]; then 31 | echo "Wrong $1 version installed: $3 is required, not $2." 32 | exit 1 33 | fi 34 | } 35 | 36 | tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" 37 | tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)" 38 | tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" 39 | 40 | YAPF_FLAGS=( 41 | '--recursive' 42 | '--parallel' 43 | ) 44 | 45 | YAPF_EXCLUDES=( 46 | '--exclude' 'build/**' 47 | ) 48 | 49 | # Format specified files 50 | format() { 51 | yapf --in-place "${YAPF_FLAGS[@]}" "$@" 52 | } 53 | 54 | # Format files that differ from main branch. Ignores dirs that are not slated 55 | # for autoformat yet. 56 | format_changed() { 57 | # The `if` guard ensures that the list of filenames is not empty, which 58 | # could cause yapf to receive 0 positional arguments, making it hang 59 | # waiting for STDIN. 60 | # 61 | # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that 62 | # exist on both branches. 63 | MERGEBASE="$(git merge-base origin/main HEAD)" 64 | 65 | if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then 66 | git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ 67 | yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" 68 | fi 69 | 70 | } 71 | 72 | # Format all files 73 | format_all() { 74 | yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm tests 75 | } 76 | 77 | ## This flag formats individual files. --files *must* be the first command line 78 | ## arg to use this option. 79 | if [[ "$1" == '--files' ]]; then 80 | format "${@:2}" 81 | # If `--all` is passed, then any further arguments are ignored and the 82 | # entire python directory is formatted. 83 | elif [[ "$1" == '--all' ]]; then 84 | format_all 85 | else 86 | # Format only the files that changed in last commit. 87 | format_changed 88 | fi 89 | echo 'vLLM yapf: Done' 90 | 91 | # Run mypy 92 | # TODO(zhuohan): Enable mypy 93 | # echo 'vLLM mypy:' 94 | # mypy 95 | 96 | # Lint specified files 97 | lint() { 98 | ruff "$@" 99 | } 100 | 101 | # Lint files that differ from main branch. Ignores dirs that are not slated 102 | # for autolint yet. 103 | lint_changed() { 104 | # The `if` guard ensures that the list of filenames is not empty, which 105 | # could cause ruff to receive 0 positional arguments, making it hang 106 | # waiting for STDIN. 107 | # 108 | # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that 109 | # exist on both branches. 110 | MERGEBASE="$(git merge-base origin/main HEAD)" 111 | 112 | if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then 113 | git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ 114 | ruff 115 | fi 116 | 117 | } 118 | 119 | # Run Ruff 120 | echo 'vLLM Ruff:' 121 | ## This flag lints individual files. --files *must* be the first command line 122 | ## arg to use this option. 123 | if [[ "$1" == '--files' ]]; then 124 | lint "${@:2}" 125 | # If `--all` is passed, then any further arguments are ignored and the 126 | # entire python directory is linted. 127 | elif [[ "$1" == '--all' ]]; then 128 | lint vllm tests 129 | else 130 | # Format only the files that changed in last commit. 131 | lint_changed 132 | fi 133 | 134 | if ! git diff --quiet &>/dev/null; then 135 | echo 'Reformatted files. Please review and stage the changes.' 136 | echo 'Changes not staged for commit:' 137 | echo 138 | git --no-pager diff --name-only 139 | 140 | exit 1 141 | fi 142 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.8 3 | 4 | ignore_missing_imports = True 5 | 6 | files = vllm 7 | # TODO(woosuk): Include the code from Megatron and HuggingFace. 
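# `exclude` takes a single regular expression; the `|` alternation below skips both directories.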
8 | exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/ 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "ninja", 4 | "packaging", 5 | "setuptools", 6 | "torch >= 2.1.0", 7 | "wheel", 8 | ] 9 | build-backend = "setuptools.build_meta" 10 | 11 | [tool.ruff.lint] 12 | select = [ 13 | # pycodestyle 14 | "E", 15 | # Pyflakes 16 | "F", 17 | # pyupgrade 18 | # "UP", 19 | # flake8-bugbear 20 | "B", 21 | # flake8-simplify 22 | "SIM", 23 | # isort 24 | # "I", 25 | ] 26 | ignore = [ 27 | # star imports 28 | "F405", "F403", 29 | # lambda expression assignment 30 | "E731", 31 | # line too long, handled by black formatting 32 | "E501", 33 | ] 34 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | yapf==0.32.0 3 | ruff==0.1.5 4 | 5 | # type checking 6 | mypy==0.991 7 | types-PyYAML 8 | types-requests 9 | types-setuptools 10 | 11 | # testing 12 | pytest 13 | pytest-forked 14 | pytest-asyncio 15 | 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ninja # For faster builds. 2 | psutil 3 | ray >= 2.5.1 4 | pandas # Required for Ray data. 5 | pyarrow # Required for Ray data. 6 | sentencepiece # Required for LLaMA tokenizer. 7 | numpy 8 | einops # Required for phi-1_5 9 | torch >= 2.1.0 10 | transformers >= 4.34.0 # Required for Mistral. 11 | xformers >= 0.0.22.post7 # Required for CUDA 12.1. 12 | fastapi 13 | uvicorn[standard] 14 | pydantic == 1.10.13 # Required for OpenAI server. 
15 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/api_server_async_engine.py: -------------------------------------------------------------------------------- 1 | """vllm.entrypoints.api_server with some extra logging for testing.""" 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | import uvicorn 6 | from fastapi.responses import JSONResponse, Response 7 | 8 | import vllm.entrypoints.api_server 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | 12 | app = vllm.entrypoints.api_server.app 13 | 14 | 15 | class AsyncLLMEngineWithStats(AsyncLLMEngine): 16 | 17 | def __init__(self, *args, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self._num_aborts = 0 20 | 21 | async def abort(self, request_id: str) -> None: 22 | await super().abort(request_id) 23 | self._num_aborts += 1 24 | 25 | def testing_stats(self) -> Dict[str, Any]: 26 | return {"num_aborted_requests": self._num_aborts} 27 | 28 | 29 | @app.get("/stats") 30 | def stats() -> Response: 31 | """Get the statistics of the engine.""" 32 | return JSONResponse(engine.testing_stats()) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--host", type=str, default="localhost") 38 | parser.add_argument("--port", type=int, default=8000) 39 | parser = AsyncEngineArgs.add_cli_args(parser) 40 | args = parser.parse_args() 41 | 42 | engine_args = AsyncEngineArgs.from_cli_args(args) 43 | engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) 44 | vllm.entrypoints.api_server.engine = engine 45 | uvicorn.run( 46 | app, 47 | host=args.host, 48 | port=args.port, 49 | log_level="debug", 50 | timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) 51 | -------------------------------------------------------------------------------- /tests/async_engine/test_api_server.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import time 4 | from multiprocessing import Pool 5 | from pathlib import Path 6 | 7 | import pytest 8 | import requests 9 | 10 | 11 | def _query_server(prompt: str) -> dict: 12 | response = requests.post("http://localhost:8000/generate", 13 | json={ 14 | "prompt": prompt, 15 | "max_tokens": 100, 16 | "temperature": 0, 17 | "ignore_eos": True 18 | }) 19 | response.raise_for_status() 20 | return response.json() 21 | 22 | 23 | @pytest.fixture 24 | def api_server(): 25 | script_path = Path(__file__).parent.joinpath( 26 | "api_server_async_engine.py").absolute() 27 | uvicorn_process = subprocess.Popen([ 28 | sys.executable, "-u", 29 | str(script_path), "--model", "facebook/opt-125m" 30 | ]) 31 | yield 32 | uvicorn_process.terminate() 33 | 34 | 35 | def test_api_server(api_server): 36 | """ 37 | Run the API server and test it. 38 | 39 | We run both the server and requests in separate processes. 40 | 41 | We test that the server can handle incoming requests, including 42 | multiple requests at the same time, and that it can handle requests 43 | being cancelled without crashing. 
44 | """ 45 | with Pool(32) as pool: 46 | # Wait until the server is ready 47 | prompts = ["Hello world"] * 1 48 | result = None 49 | while not result: 50 | try: 51 | for _ in pool.map(_query_server, prompts): 52 | break 53 | except Exception: 54 | time.sleep(1) 55 | 56 | # Actual tests start here 57 | # Try with 1 prompt 58 | for result in pool.map(_query_server, prompts): 59 | assert result 60 | 61 | num_aborted_requests = requests.get( 62 | "http://localhost:8000/stats").json()["num_aborted_requests"] 63 | assert num_aborted_requests == 0 64 | 65 | # Try with 100 prompts 66 | prompts = ["Hello world"] * 100 67 | for result in pool.map(_query_server, prompts): 68 | assert result 69 | 70 | # Cancel requests 71 | pool.map_async(_query_server, prompts) 72 | time.sleep(0.01) 73 | pool.terminate() 74 | pool.join() 75 | 76 | # check cancellation stats 77 | num_aborted_requests = requests.get( 78 | "http://localhost:8000/stats").json()["num_aborted_requests"] 79 | assert num_aborted_requests > 0 80 | 81 | # check that server still runs after cancellations 82 | with Pool(32) as pool: 83 | # Try with 100 prompts 84 | prompts = ["Hello world"] * 100 85 | for result in pool.map(_query_server, prompts): 86 | assert result 87 | -------------------------------------------------------------------------------- /tests/async_engine/test_async_llm_engine.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from dataclasses import dataclass 3 | 4 | import pytest 5 | 6 | from vllm.engine.async_llm_engine import AsyncLLMEngine 7 | 8 | 9 | @dataclass 10 | class RequestOutput: 11 | request_id: int 12 | finished: bool = False 13 | 14 | 15 | class MockEngine: 16 | 17 | def __init__(self): 18 | self.step_calls = 0 19 | self.add_request_calls = 0 20 | self.abort_request_calls = 0 21 | self.request_id = None 22 | 23 | async def step_async(self): 24 | self.step_calls += 1 25 | return [RequestOutput( 26 | request_id=self.request_id)] if self.request_id else [] 27 | 28 | def generate(self, request_id): 29 | self.request_id = request_id 30 | 31 | def stop_generating(self): 32 | self.request_id = None 33 | 34 | def add_request(self, **kwargs): 35 | del kwargs # Unused 36 | self.add_request_calls += 1 37 | 38 | def abort_request(self, request_id): 39 | del request_id # Unused 40 | self.abort_request_calls += 1 41 | 42 | 43 | class MockAsyncLLMEngine(AsyncLLMEngine): 44 | 45 | def _init_engine(self, *args, **kwargs): 46 | return MockEngine() 47 | 48 | 49 | @pytest.mark.asyncio 50 | async def test_new_requests_event(): 51 | engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False) 52 | engine.start_background_loop() 53 | await asyncio.sleep(0.01) 54 | assert engine.engine.step_calls == 0 55 | 56 | await engine.add_request("1", "", None) 57 | await asyncio.sleep(0.01) 58 | assert engine.engine.add_request_calls == 1 59 | assert engine.engine.step_calls == 1 60 | 61 | await engine.add_request("2", "", None) 62 | engine.engine.generate("2") 63 | await asyncio.sleep(0) 64 | assert engine.engine.add_request_calls == 2 65 | assert engine.engine.step_calls == 2 66 | await asyncio.sleep(0) 67 | assert engine.engine.step_calls == 3 68 | engine.engine.stop_generating() 69 | await asyncio.sleep(0) 70 | assert engine.engine.step_calls == 4 71 | await asyncio.sleep(0) 72 | assert engine.engine.step_calls == 4 73 | 74 | await engine.add_request("3", "", None) 75 | await asyncio.sleep(0.01) 76 | assert engine.engine.add_request_calls == 3 77 | assert 
engine.engine.step_calls == 5 78 | await asyncio.sleep(0.01) 79 | assert engine.engine.add_request_calls == 3 80 | assert engine.engine.step_calls == 5 81 | -------------------------------------------------------------------------------- /tests/async_engine/test_request_tracker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.async_llm_engine import RequestTracker 4 | from vllm.outputs import RequestOutput 5 | 6 | 7 | class DummyEvent: 8 | 9 | def __init__(self): 10 | self.flag = False 11 | 12 | def set(self): 13 | self.flag = True 14 | 15 | def clear(self): 16 | self.flag = False 17 | 18 | 19 | def test_request_tracker(): 20 | tracker = RequestTracker() 21 | tracker.new_requests_event = DummyEvent() 22 | stream_1 = tracker.add_request("1") 23 | assert tracker.new_requests_event.flag 24 | new, finished = tracker.get_new_and_finished_requests() 25 | assert not tracker.new_requests_event.flag 26 | assert len(new) == 1 27 | assert new[0]["request_id"] == "1" 28 | assert not finished 29 | assert not stream_1.finished 30 | 31 | stream_2 = tracker.add_request("2") 32 | stream_3 = tracker.add_request("3") 33 | assert tracker.new_requests_event.flag 34 | new, finished = tracker.get_new_and_finished_requests() 35 | assert not tracker.new_requests_event.flag 36 | assert len(new) == 2 37 | assert new[0]["request_id"] == "2" 38 | assert new[1]["request_id"] == "3" 39 | assert not finished 40 | assert not stream_2.finished 41 | assert not stream_3.finished 42 | 43 | # request_ids must be unique 44 | with pytest.raises(KeyError): 45 | tracker.add_request("1") 46 | assert not tracker.new_requests_event.flag 47 | 48 | tracker.abort_request("1") 49 | new, finished = tracker.get_new_and_finished_requests() 50 | assert len(finished) == 1 51 | assert "1" in finished 52 | assert not new 53 | assert stream_1.finished 54 | 55 | stream_4 = tracker.add_request("4") 56 | tracker.abort_request("4") 57 | assert tracker.new_requests_event.flag 58 | new, finished = tracker.get_new_and_finished_requests() 59 | assert len(finished) == 1 60 | assert "4" in finished 61 | assert not new 62 | assert stream_4.finished 63 | 64 | stream_5 = tracker.add_request("5") 65 | assert tracker.new_requests_event.flag 66 | tracker.process_request_output( 67 | RequestOutput("2", "output", [], [], [], finished=True)) 68 | new, finished = tracker.get_new_and_finished_requests() 69 | assert not tracker.new_requests_event.flag 70 | assert len(finished) == 1 71 | assert "2" in finished 72 | assert len(new) == 1 73 | assert new[0]["request_id"] == "5" 74 | assert stream_2.finished 75 | assert not stream_5.finished 76 | -------------------------------------------------------------------------------- /tests/distributed/test_comm_ops.py: -------------------------------------------------------------------------------- 1 | """Test the communication operators. 2 | 3 | Run `pytest tests/distributed/test_comm_ops.py --forked`. 
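Each test spawns one worker process per tensor-parallel rank and checks the
collective result against a plain PyTorch reference.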
4 | """ 5 | from multiprocessing import Process, set_start_method 6 | 7 | import pytest 8 | import torch 9 | 10 | from vllm.config import ParallelConfig 11 | from vllm.engine.ray_utils import get_open_port 12 | from vllm.model_executor.parallel_utils.communication_op import ( 13 | tensor_model_parallel_all_reduce, 14 | tensor_model_parallel_all_gather, 15 | ) 16 | from vllm.worker.worker import _init_distributed_environment 17 | 18 | 19 | def init_test_distributed_environment(pipeline_parallel_size: int, 20 | tensor_parallel_size: int, rank: int, 21 | distributed_init_port: str): 22 | parallel_config = ParallelConfig(pipeline_parallel_size, 23 | tensor_parallel_size, 24 | worker_use_ray=True) 25 | distributed_init_method = f"tcp://localhost:{distributed_init_port}" 26 | torch.cuda.set_device(rank) 27 | _init_distributed_environment(parallel_config, rank, 28 | distributed_init_method) 29 | 30 | 31 | def all_reduce_test_worker(tensor_parallel_size: int, rank: int, 32 | distributed_init_port: str): 33 | init_test_distributed_environment(1, tensor_parallel_size, rank, 34 | distributed_init_port) 35 | num_elements = 8 36 | all_tensors = [ 37 | torch.arange(num_elements, dtype=torch.float32, device="cuda") * 38 | (r + 1) for r in range(tensor_parallel_size) 39 | ] 40 | expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0) 41 | t = all_tensors[rank] 42 | t = tensor_model_parallel_all_reduce(t) 43 | assert torch.allclose(t, expected) 44 | 45 | 46 | def all_gather_test_worker(tensor_parallel_size: int, rank: int, 47 | distributed_init_port: str): 48 | init_test_distributed_environment(1, tensor_parallel_size, rank, 49 | distributed_init_port) 50 | num_dimensions = 3 51 | tensor_size = list(range(2, num_dimensions + 2)) 52 | total_size = 1 53 | for s in tensor_size: 54 | total_size *= s 55 | for all_gather_dimension in range(num_dimensions): 56 | all_tensors = [ 57 | torch.arange(total_size, dtype=torch.float32, 58 | device="cuda").reshape(tensor_size) * (r + 1) 59 | for r in range(tensor_parallel_size) 60 | ] 61 | expected = torch.cat(all_tensors, dim=all_gather_dimension) 62 | t = all_tensors[rank] 63 | t = tensor_model_parallel_all_gather(t, all_gather_dimension) 64 | assert torch.allclose(t, expected) 65 | 66 | 67 | @pytest.mark.skipif(torch.cuda.device_count() < 2, 68 | reason="Need at least 2 GPUs to run the test.") 69 | @pytest.mark.parametrize("tensor_parallel_size", [2]) 70 | @pytest.mark.parametrize("test_target", 71 | [all_reduce_test_worker, all_gather_test_worker]) 72 | def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): 73 | set_start_method("spawn", force=True) 74 | distributed_init_port = get_open_port() 75 | processes = [] 76 | for rank in range(tensor_parallel_size): 77 | p = Process(target=test_target, 78 | args=(tensor_parallel_size, rank, distributed_init_port)) 79 | p.start() 80 | processes.append(p) 81 | for p in processes: 82 | p.join() 83 | assert all(p.exitcode == 0 for p in processes) 84 | -------------------------------------------------------------------------------- /tests/engine/test_detokenize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import detokenize_incrementally 6 | 7 | TRUTH = [ 8 | "Hello here, this is a simple test", # noqa: E501 9 | "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 
It is designed to be used in production environments, where inference and serving", # noqa: E501 10 | "我很感谢你的热情" # noqa: E501 11 | ] 12 | TOKENIZERS = [ 13 | "facebook/opt-125m", 14 | "gpt2", 15 | "bigcode/tiny_starcoder_py", 16 | "EleutherAI/gpt-j-6b", 17 | "EleutherAI/pythia-70m", 18 | "bigscience/bloom-560m", 19 | "mosaicml/mpt-7b", 20 | "tiiuae/falcon-7b", 21 | "meta-llama/Llama-2-7b-hf", 22 | "codellama/CodeLlama-7b-hf", 23 | ] 24 | 25 | 26 | def _run_incremental_decode(tokenizer, all_input_ids, 27 | skip_special_tokens: bool): 28 | decoded_text = "" 29 | offset = 0 30 | token_offset = 0 31 | prev_tokens = None 32 | for i in range(len(all_input_ids)): 33 | new_tokens, text, offset, token_offset = detokenize_incrementally( 34 | tokenizer, 35 | all_input_ids[:i + 1], 36 | prev_tokens, 37 | offset, 38 | token_offset, 39 | skip_special_tokens=skip_special_tokens) 40 | decoded_text += text 41 | if prev_tokens is None: 42 | prev_tokens = new_tokens 43 | else: 44 | prev_tokens += new_tokens 45 | return decoded_text 46 | 47 | 48 | @pytest.mark.parametrize("truth", TRUTH) 49 | @pytest.mark.parametrize("tokenizer_id", TOKENIZERS) 50 | @pytest.mark.parametrize("skip_special_tokens", (True, False)) 51 | def test_decode_streaming(tokenizer_id, truth, skip_special_tokens): 52 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) 53 | all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"] 54 | if skip_special_tokens: 55 | all_input_ids = ([tokenizer.bos_token_id] 56 | if tokenizer.bos_token_id is not None else 57 | []) + all_input_ids + [tokenizer.eos_token_id] 58 | 59 | decoded_text = _run_incremental_decode( 60 | tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens) 61 | 62 | assert decoded_text == truth 63 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import pytest 4 | import torch 5 | 6 | 7 | def create_kv_caches( 8 | num_blocks: int, 9 | block_size: int, 10 | num_layers: int, 11 | num_heads: int, 12 | head_size: int, 13 | dtype: torch.dtype, 14 | seed: int, 15 | ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: 16 | torch.random.manual_seed(seed) 17 | torch.cuda.manual_seed(seed) 18 | 19 | scale = head_size**-0.5 20 | x = 16 // torch.tensor([], dtype=dtype).element_size() 21 | key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) 22 | key_caches = [] 23 | for _ in range(num_layers): 24 | key_cache = torch.empty(size=key_cache_shape, 25 | dtype=dtype, 26 | device='cuda') 27 | key_cache.uniform_(-scale, scale) 28 | key_caches.append(key_cache) 29 | 30 | value_cache_shape = (num_blocks, num_heads, head_size, block_size) 31 | value_caches = [] 32 | for _ in range(num_layers): 33 | value_cache = torch.empty(size=value_cache_shape, 34 | dtype=dtype, 35 | device='cuda') 36 | value_cache.uniform_(-scale, scale) 37 | value_caches.append(value_cache) 38 | return key_caches, value_caches 39 | 40 | 41 | @pytest.fixture() 42 | def kv_cache_factory(): 43 | return create_kv_caches 44 | -------------------------------------------------------------------------------- /tests/kernels/test_activation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn.functional as F 4 | from transformers.activations import get_activation 5 | 6 | from vllm._C import ops 7 | 8 | DTYPES = [torch.half, 
torch.bfloat16, torch.float] 9 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 10 | D = [512, 4096, 5120, 13824] # Arbitrary values for testing 11 | SEEDS = [0] 12 | 13 | 14 | def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor: 15 | x1, x2 = x.chunk(chunks=2, dim=1) 16 | return F.silu(x1) * x2 17 | 18 | 19 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 20 | @pytest.mark.parametrize("d", D) 21 | @pytest.mark.parametrize("dtype", DTYPES) 22 | @pytest.mark.parametrize("seed", SEEDS) 23 | @torch.inference_mode() 24 | def test_silu_and_mul( 25 | num_tokens: int, 26 | d: int, 27 | dtype: torch.dtype, 28 | seed: int, 29 | ) -> None: 30 | torch.random.manual_seed(seed) 31 | torch.cuda.manual_seed(seed) 32 | x = torch.randn(num_tokens, 2 * d, dtype=dtype, device="cuda") 33 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 34 | ops.silu_and_mul(out, x) 35 | ref_out = ref_silu_and_mul(x) 36 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 37 | 38 | 39 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 40 | @pytest.mark.parametrize("d", D) 41 | @pytest.mark.parametrize("dtype", DTYPES) 42 | @pytest.mark.parametrize("seed", SEEDS) 43 | @torch.inference_mode() 44 | def test_gelu_new( 45 | num_tokens: int, 46 | d: int, 47 | dtype: torch.dtype, 48 | seed: int, 49 | ) -> None: 50 | torch.random.manual_seed(seed) 51 | torch.cuda.manual_seed(seed) 52 | x = torch.randn(num_tokens, d, dtype=dtype, device="cuda") 53 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 54 | ops.gelu_new(out, x) 55 | ref_out = get_activation("gelu_new")(x) 56 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 57 | 58 | 59 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 60 | @pytest.mark.parametrize("d", D) 61 | @pytest.mark.parametrize("dtype", DTYPES) 62 | @pytest.mark.parametrize("seed", SEEDS) 63 | def test_gelu_fast( 64 | num_tokens: int, 65 | d: int, 66 | dtype: torch.dtype, 67 | seed: int, 68 | ) -> None: 69 | torch.random.manual_seed(seed) 70 | torch.cuda.manual_seed(seed) 71 | x = torch.randn(num_tokens, d, dtype=dtype, device="cuda") 72 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 73 | ops.gelu_fast(out, x) 74 | ref_out = get_activation("gelu_fast")(x) 75 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 76 | -------------------------------------------------------------------------------- /tests/kernels/test_cache.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | import torch 5 | 6 | from vllm._C import cache_ops 7 | 8 | DTYPES = [torch.half, torch.bfloat16, torch.float] 9 | NUM_TOKENS = [83] # Arbitrary values for testing 10 | NUM_LAYERS = [1] # Arbitrary values for testing 11 | NUM_HEADS = [8] # Arbitrary values for testing 12 | HEAD_SIZES = [64, 80, 96, 112, 128, 256] 13 | BLOCK_SIZES = [8, 16, 32] 14 | NUM_BLOCKS = [1024, 36000] # Arbitrary values for testing 15 | NUM_MAPPINGS = [256] # Arbitrary values for testing 16 | SEEDS = [0] 17 | 18 | 19 | @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) 20 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 21 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 22 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 23 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 24 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 25 | @pytest.mark.parametrize("dtype", DTYPES) 26 | @pytest.mark.parametrize("seed", SEEDS) 27 | @torch.inference_mode() 28 | def test_copy_blocks( 29 | kv_cache_factory, 
30 | num_mappings: int, 31 | num_layers: int, 32 | num_heads: int, 33 | head_size: int, 34 | block_size: int, 35 | num_blocks: int, 36 | dtype: torch.dtype, 37 | seed: int, 38 | ) -> None: 39 | random.seed(seed) 40 | torch.random.manual_seed(seed) 41 | torch.cuda.manual_seed(seed) 42 | 43 | # Generate random block mappings where each source block is mapped to two 44 | # destination blocks. 45 | assert 2 * num_mappings <= num_blocks 46 | src_blocks = random.sample(range(num_blocks), num_mappings) 47 | remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) 48 | dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) 49 | block_mapping = {} 50 | for i in range(num_mappings): 51 | src = src_blocks[i] 52 | dst1 = dst_blocks[2 * i] 53 | dst2 = dst_blocks[2 * i + 1] 54 | block_mapping[src] = [dst1, dst2] 55 | 56 | # Create the KV caches. 57 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 58 | num_layers, num_heads, 59 | head_size, dtype, seed) 60 | 61 | # Clone the KV caches. 62 | cloned_key_caches = [key_cache.clone() for key_cache in key_caches] 63 | cloned_value_caches = [value_cache.clone() for value_cache in value_caches] 64 | 65 | # Call the copy blocks kernel. 66 | cache_ops.copy_blocks(key_caches, value_caches, block_mapping) 67 | 68 | # Run the reference implementation. 69 | for src, dsts in block_mapping.items(): 70 | for dst in dsts: 71 | for cloned_key_cache in cloned_key_caches: 72 | cloned_key_cache[dst].copy_(cloned_key_cache[src]) 73 | for cloned_value_cache in cloned_value_caches: 74 | cloned_value_cache[dst].copy_(cloned_value_cache[src]) 75 | 76 | # Compare the results. 77 | for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): 78 | assert torch.allclose(key_cache, cloned_key_cache) 79 | for value_cache, cloned_value_cache in zip(value_caches, 80 | cloned_value_caches): 81 | assert torch.allclose(value_cache, cloned_value_cache) 82 | 83 | 84 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 85 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 86 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 87 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 88 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 89 | @pytest.mark.parametrize("dtype", DTYPES) 90 | @pytest.mark.parametrize("seed", SEEDS) 91 | @torch.inference_mode() 92 | def test_reshape_and_cache( 93 | kv_cache_factory, 94 | num_tokens: int, 95 | num_heads: int, 96 | head_size: int, 97 | block_size: int, 98 | num_blocks: int, 99 | dtype: torch.dtype, 100 | seed: int, 101 | ) -> None: 102 | random.seed(seed) 103 | torch.random.manual_seed(seed) 104 | torch.cuda.manual_seed(seed) 105 | 106 | # Create a random slot mapping. 107 | num_slots = block_size * num_blocks 108 | slot_mapping = random.sample(range(num_slots), num_tokens) 109 | slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device="cuda") 110 | 111 | qkv = torch.randn(num_tokens, 112 | 3, 113 | num_heads, 114 | head_size, 115 | dtype=dtype, 116 | device="cuda") 117 | _, key, value = qkv.unbind(dim=1) 118 | 119 | # Create the KV caches. 120 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, 121 | num_heads, head_size, dtype, 122 | seed) 123 | key_cache, value_cache = key_caches[0], value_caches[0] 124 | 125 | # Clone the KV caches. 126 | cloned_key_cache = key_cache.clone() 127 | cloned_value_cache = value_cache.clone() 128 | 129 | # Call the reshape_and_cache kernel. 
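    # The kernel scatters each token's key/value into the paged caches at the
    # positions given by `slot_mapping`. Per `create_kv_caches` in
    # tests/kernels/conftest.py, the key cache has shape
    # (num_blocks, num_heads, head_size // x, block_size, x) and the value
    # cache has shape (num_blocks, num_heads, head_size, block_size).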
130 | cache_ops.reshape_and_cache(key, value, key_cache, value_cache, 131 | slot_mapping) 132 | 133 | # Run the reference implementation. 134 | reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) 135 | block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") 136 | block_indicies = block_indicies.cpu().tolist() 137 | block_offsets = slot_mapping % block_size 138 | block_offsets = block_offsets.cpu().tolist() 139 | for i in range(num_tokens): 140 | block_idx = block_indicies[i] 141 | block_offset = block_offsets[i] 142 | cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] 143 | cloned_value_cache[block_idx, :, :, block_offset] = value[i] 144 | 145 | assert torch.allclose(key_cache, cloned_key_cache) 146 | assert torch.allclose(value_cache, cloned_value_cache) 147 | -------------------------------------------------------------------------------- /tests/kernels/test_layernorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm._C import ops 6 | 7 | DTYPES = [torch.half, torch.bfloat16, torch.float] 8 | HIDDEN_SIZES = [67, 768, 2048, 5120, 8192] # Arbitrary values for testing 9 | NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing 10 | SEEDS = [0] 11 | 12 | 13 | class RefRMSNorm(nn.Module): 14 | 15 | def __init__(self, hidden_size, eps=1e-6): 16 | super().__init__() 17 | weight = torch.empty(hidden_size) 18 | weight.normal_(mean=1.0, std=0.1) 19 | self.weight = nn.Parameter(weight) 20 | self.variance_epsilon = eps 21 | 22 | def forward(self, hidden_states): 23 | input_dtype = hidden_states.dtype 24 | hidden_states = hidden_states.to(torch.float32) 25 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 26 | hidden_states = hidden_states * torch.rsqrt(variance + 27 | self.variance_epsilon) 28 | return self.weight * hidden_states.to(input_dtype) 29 | 30 | 31 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 32 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 33 | @pytest.mark.parametrize("dtype", DTYPES) 34 | @pytest.mark.parametrize("seed", SEEDS) 35 | @torch.inference_mode() 36 | def test_rms_norm( 37 | num_tokens: int, 38 | hidden_size: int, 39 | dtype: torch.dtype, 40 | seed: int, 41 | ) -> None: 42 | torch.random.manual_seed(seed) 43 | torch.cuda.manual_seed(seed) 44 | 45 | scale = float(hidden_size**-0.5) 46 | x = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda") 47 | x.uniform_(-scale, scale) 48 | ref = RefRMSNorm(hidden_size).to(dtype).cuda() 49 | 50 | out = torch.empty_like(x) 51 | ops.rms_norm( 52 | out, 53 | x, 54 | ref.weight.data, 55 | ref.variance_epsilon, 56 | ) 57 | ref_out = ref(x) 58 | assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-5) 59 | -------------------------------------------------------------------------------- /tests/models/test_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/models/test_models.py --forked`. 
4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "facebook/opt-125m", 9 | "meta-llama/Llama-2-7b-hf", 10 | "mistralai/Mistral-7B-v0.1", 11 | "tiiuae/falcon-7b", 12 | "gpt2", 13 | "bigcode/tiny_starcoder_py", 14 | "EleutherAI/gpt-j-6b", 15 | "EleutherAI/pythia-70m", 16 | "bigscience/bloom-560m", 17 | "mosaicml/mpt-7b", 18 | "microsoft/phi-1_5", 19 | ] 20 | 21 | 22 | @pytest.mark.parametrize("model", MODELS) 23 | @pytest.mark.parametrize("dtype", ["half"]) 24 | @pytest.mark.parametrize("max_tokens", [128]) 25 | def test_models( 26 | hf_runner, 27 | vllm_runner, 28 | example_prompts, 29 | model: str, 30 | dtype: str, 31 | max_tokens: int, 32 | ) -> None: 33 | hf_model = hf_runner(model, dtype=dtype) 34 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 35 | del hf_model 36 | 37 | vllm_model = vllm_runner(model, dtype=dtype) 38 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 39 | del vllm_model 40 | 41 | for i in range(len(example_prompts)): 42 | hf_output_ids, hf_output_str = hf_outputs[i] 43 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 44 | assert hf_output_str == vllm_output_str, ( 45 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 46 | assert hf_output_ids == vllm_output_ids, ( 47 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 48 | -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using beam search. 2 | 3 | Run `pytest tests/samplers/test_beam_search.py --forked`. 4 | """ 5 | import pytest 6 | 7 | # FIXME(zhuohan): The test can not pass if we: 8 | # 1. Increase max_tokens to 256. 9 | # 2. Increase beam_width to 8. 10 | # 3. Use the model "huggyllama/llama-7b". 
11 | MAX_TOKENS = [128] 12 | BEAM_WIDTHS = [4] 13 | MODELS = ["facebook/opt-125m"] 14 | 15 | 16 | @pytest.mark.parametrize("model", MODELS) 17 | @pytest.mark.parametrize("dtype", ["half"]) 18 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS) 19 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) 20 | def test_beam_search_single_input( 21 | hf_runner, 22 | vllm_runner, 23 | example_prompts, 24 | model: str, 25 | dtype: str, 26 | max_tokens: int, 27 | beam_width: int, 28 | ) -> None: 29 | hf_model = hf_runner(model, dtype=dtype) 30 | hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, 31 | max_tokens) 32 | del hf_model 33 | 34 | vllm_model = vllm_runner(model, dtype=dtype) 35 | vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, 36 | max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, _ = hf_outputs[i] 41 | vllm_output_ids, _ = vllm_outputs[i] 42 | assert len(hf_output_ids) == len(vllm_output_ids) 43 | for j in range(len(hf_output_ids)): 44 | assert hf_output_ids[j] == vllm_output_ids[j], ( 45 | f"Test{i} output{j}:\nHF: {hf_output_ids}\n" 46 | f"vLLM: {vllm_output_ids}") 47 | -------------------------------------------------------------------------------- /tests/samplers/test_logprobs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm import SamplingParams 5 | 6 | MODELS = ["facebook/opt-125m"] 7 | 8 | 9 | @pytest.mark.parametrize("model", MODELS) 10 | @pytest.mark.parametrize("dtype", ["half"]) 11 | def test_get_prompt_logprobs( 12 | hf_runner, 13 | vllm_runner, 14 | model, 15 | dtype, 16 | example_prompts, 17 | ): 18 | max_tokens = 5 19 | hf_model = hf_runner(model, dtype=dtype) 20 | hf_logprobs = hf_model.generate_greedy_logprobs( 21 | example_prompts, 22 | max_tokens=max_tokens, 23 | ) 24 | del hf_model 25 | 26 | vllm_model = vllm_runner(model, dtype=dtype) 27 | vllm_sampling_params = SamplingParams(max_tokens=max_tokens, 28 | logprobs=5, 29 | prompt_logprobs=5, 30 | temperature=0.0) 31 | vllm_results = vllm_model.model.generate( 32 | example_prompts, sampling_params=vllm_sampling_params) 33 | 34 | # Test whether logprobs are included in the results. 35 | for result in vllm_results: 36 | assert result.prompt_logprobs is not None 37 | assert result.outputs[0].logprobs is not None 38 | 39 | # Test whether prompt logprobs are consistent with HF 40 | for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): 41 | # Check prompt logprobs 42 | vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] 43 | for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): 44 | for token_id, logprob in vllm_prompt_logprob_dict.items(): 45 | torch.testing.assert_close(logprob, 46 | hf_logprob[0][i][token_id].item(), 47 | atol=1e-2, 48 | rtol=1e-2) 49 | vllm_sample_logprobs = vllm_result.outputs[0].logprobs 50 | for i, vllm_sample_logprob_dict in enumerate(vllm_sample_logprobs): 51 | for token_id, logprob in vllm_sample_logprob_dict.items(): 52 | torch.testing.assert_close(logprob, 53 | hf_logprob[i][-1][token_id].item(), 54 | atol=1e-2, 55 | rtol=1e-2) 56 | -------------------------------------------------------------------------------- /tests/test_regression.py: -------------------------------------------------------------------------------- 1 | """Containing tests that check for regressions in vLLM's behavior. 
2 | 3 | It should include tests that are reported by users and making sure they 4 | will never happen again. 5 | 6 | """ 7 | from vllm import LLM, SamplingParams 8 | 9 | 10 | def test_duplicated_ignored_sequence_group(): 11 | """https://github.com/vllm-project/vllm/issues/1655""" 12 | 13 | sampling_params = SamplingParams(temperature=0.01, 14 | top_p=0.1, 15 | max_tokens=256) 16 | llm = LLM(model="facebook/opt-125m", 17 | max_num_batched_tokens=4096, 18 | tensor_parallel_size=1) 19 | prompts = ["This is a short prompt", "This is a very long prompt " * 1000] 20 | outputs = llm.generate(prompts, sampling_params=sampling_params) 21 | 22 | assert len(prompts) == len(outputs) 23 | 24 | 25 | if __name__ == "__main__": 26 | import pytest 27 | pytest.main([__file__]) 28 | -------------------------------------------------------------------------------- /tests/worker/test_worker.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | 4 | from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata 5 | from vllm.worker.worker import Worker 6 | 7 | 8 | def test_worker_prepare_inputs_for_prompt(): 9 | worker = Worker(None, None, None) 10 | worker.block_size = 16 11 | batch_size = random.randint(1, 256) 12 | prompt_lens = [] 13 | seq_group_metadata_list = [] 14 | for i in range(batch_size): 15 | # make sure all tokens fit into one block 16 | prompt_len = i % (worker.block_size - 1) + 1 17 | prompt_lens.append(prompt_len) 18 | seq_data = list(range(prompt_len)) 19 | seq_group_metadata_list.append( 20 | SequenceGroupMetadata( 21 | request_id=f"test_{i}", 22 | is_prompt=True, 23 | seq_data={0: SequenceData(seq_data)}, 24 | sampling_params=SamplingParams(temperature=0), 25 | block_tables={0: [1]}, 26 | )) 27 | expected_selected_token_indices = [] 28 | selected_token_start_idx = 0 29 | max_seq_len = max(prompt_lens) 30 | for prompt_len in prompt_lens: 31 | expected_selected_token_indices.append(selected_token_start_idx + 32 | prompt_len - 1) 33 | selected_token_start_idx += max_seq_len 34 | input_tokens, input_positions, input_metadata = worker._prepare_inputs( 35 | seq_group_metadata_list) 36 | assert input_tokens.shape == input_positions.shape == (batch_size, 37 | max_seq_len) 38 | torch.testing.assert_close(input_tokens, input_positions) 39 | actual = input_metadata.selected_token_indices 40 | expected = torch.tensor(expected_selected_token_indices, 41 | device=actual.device, 42 | dtype=actual.dtype) 43 | torch.testing.assert_close(actual, expected) 44 | -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.engine.ray_utils import initialize_cluster 7 | from vllm.entrypoints.llm import LLM 8 | from vllm.outputs import CompletionOutput, RequestOutput 9 | from vllm.sampling_params import SamplingParams 10 | 11 | __version__ = "0.2.2" 12 | 13 | __all__ = [ 14 | "LLM", 15 | "SamplingParams", 16 | "RequestOutput", 17 | "CompletionOutput", 18 | "LLMEngine", 19 | "EngineArgs", 20 | "AsyncLLMEngine", 21 | "AsyncEngineArgs", 22 | "initialize_cluster", 23 | ] 24 | 
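For orientation, here is a minimal offline-generation sketch using the public API exported above; the model name and sampling values are illustrative and follow the same pattern as tests/test_regression.py:

from vllm import LLM, SamplingParams

# Illustrative model and sampling settings; any supported model name works.
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.0, max_tokens=16)
outputs = llm.generate(["Hello, my name is"], sampling_params=sampling_params)
for output in outputs:
    # Each RequestOutput holds one or more CompletionOutput objects.
    print(output.outputs[0].text)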
-------------------------------------------------------------------------------- /vllm/block.py: -------------------------------------------------------------------------------- 1 | """Token blocks.""" 2 | from typing import List 3 | 4 | from vllm.utils import Device 5 | 6 | _BLANK_TOKEN_ID = -1 7 | 8 | 9 | class LogicalTokenBlock: 10 | """A block that stores a contiguous chunk of tokens from left to right. 11 | 12 | Logical blocks are used to represent the states of the corresponding 13 | physical blocks in the KV cache. 14 | """ 15 | 16 | def __init__( 17 | self, 18 | block_number: int, 19 | block_size: int, 20 | ) -> None: 21 | self.block_number = block_number 22 | self.block_size = block_size 23 | 24 | self.token_ids = [_BLANK_TOKEN_ID] * block_size 25 | self.num_tokens = 0 26 | 27 | def is_empty(self) -> bool: 28 | return self.num_tokens == 0 29 | 30 | def get_num_empty_slots(self) -> int: 31 | return self.block_size - self.num_tokens 32 | 33 | def is_full(self) -> bool: 34 | return self.num_tokens == self.block_size 35 | 36 | def append_tokens(self, token_ids: List[int]) -> None: 37 | assert len(token_ids) <= self.get_num_empty_slots() 38 | curr_idx = self.num_tokens 39 | self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids 40 | self.num_tokens += len(token_ids) 41 | 42 | def get_token_ids(self) -> List[int]: 43 | return self.token_ids[:self.num_tokens] 44 | 45 | def get_last_token_id(self) -> int: 46 | assert self.num_tokens > 0 47 | return self.token_ids[self.num_tokens - 1] 48 | 49 | 50 | class PhysicalTokenBlock: 51 | """Represents the state of a block in the KV cache.""" 52 | 53 | def __init__( 54 | self, 55 | device: Device, 56 | block_number: int, 57 | block_size: int, 58 | ) -> None: 59 | self.device = device 60 | self.block_number = block_number 61 | self.block_size = block_size 62 | 63 | self.ref_count = 0 64 | 65 | def __repr__(self) -> str: 66 | return (f'PhysicalTokenBlock(device={self.device}, ' 67 | f'block_number={self.block_number}, ' 68 | f'ref_count={self.ref_count})') 69 | -------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/policy.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from vllm.sequence import SequenceGroup 4 | 5 | 6 | class Policy: 7 | 8 | def get_priority( 9 | self, 10 | now: float, 11 | seq_group: SequenceGroup, 12 | ) -> float: 13 | raise NotImplementedError 14 | 15 | def sort_by_priority( 16 | self, 17 | now: float, 18 | seq_groups: List[SequenceGroup], 19 | ) -> List[SequenceGroup]: 20 | return sorted( 21 | seq_groups, 22 | key=lambda seq_group: self.get_priority(now, seq_group), 23 | reverse=True, 24 | ) 25 | 26 | 27 | class FCFS(Policy): 28 | 29 | def get_priority( 30 | self, 31 | now: float, 32 | seq_group: SequenceGroup, 33 | ) -> float: 34 | return now - seq_group.arrival_time 35 | 36 | 37 | class PolicyFactory: 38 | 39 | _POLICY_REGISTRY = { 40 | 'fcfs': FCFS, 41 | } 42 | 43 | @classmethod 44 | def get_policy(cls, policy_name: str, **kwargs) -> Policy: 45 | return cls._POLICY_REGISTRY[policy_name](**kwargs) 46 | -------------------------------------------------------------------------------- /vllm/engine/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/engine/ray_utils.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from typing import Optional, Tuple, TYPE_CHECKING 3 | 4 | from vllm.config import ParallelConfig 5 | from vllm.logger import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | try: 10 | import ray 11 | from ray.air.util.torch_dist import TorchDistributedWorker 12 | 13 | class RayWorker(TorchDistributedWorker): 14 | """Ray wrapper for vllm.worker.Worker, allowing Worker to be 15 | lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" 16 | 17 | def __init__(self, init_cached_hf_modules=False) -> None: 18 | if init_cached_hf_modules: 19 | from transformers.dynamic_module_utils import init_hf_modules 20 | init_hf_modules() 21 | self.worker = None 22 | 23 | def init_worker(self, worker_init_fn): 24 | self.worker = worker_init_fn() 25 | 26 | def __getattr__(self, name): 27 | return getattr(self.worker, name) 28 | 29 | def execute_method(self, method, *args, **kwargs): 30 | executor = getattr(self, method) 31 | return executor(*args, **kwargs) 32 | 33 | except ImportError as e: 34 | logger.warning(f"Failed to import Ray with {e!r}. " 35 | "For distributed inference, please install Ray with " 36 | "`pip install ray pandas pyarrow`.") 37 | ray = None 38 | TorchDistributedWorker = None 39 | RayWorker = None 40 | 41 | if TYPE_CHECKING: 42 | from ray.util.placement_group import PlacementGroup 43 | 44 | 45 | def get_open_port(): 46 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 47 | s.bind(("", 0)) 48 | return s.getsockname()[1] 49 | 50 | 51 | def initialize_cluster( 52 | parallel_config: ParallelConfig, 53 | engine_use_ray: bool = False, 54 | ray_address: Optional[str] = None, 55 | ) -> Tuple[str, Optional["PlacementGroup"]]: 56 | """Initialize the distributed cluster probably with Ray. 57 | 58 | Args: 59 | parallel_config: The configurations for parallel execution. 60 | engine_use_ray: Whether to use Ray for async engine. 61 | ray_address: The address of the Ray cluster. If None, uses 62 | the default Ray cluster address. 63 | 64 | Returns: 65 | A tuple of (`distributed_init_method`, `placement_group`). The 66 | `distributed_init_method` is the address for initializing the 67 | distributed backend. `placement_group` includes the specification 68 | of the resources for each distributed worker. 69 | """ 70 | if parallel_config.worker_use_ray or engine_use_ray: 71 | if ray is None: 72 | raise ImportError( 73 | "Ray is not installed. Please install Ray to use distributed " 74 | "serving.") 75 | # Connect to a ray cluster. 76 | ray.init(address=ray_address, ignore_reinit_error=True) 77 | 78 | if not parallel_config.worker_use_ray: 79 | # Initialize cluster locally. 80 | port = get_open_port() 81 | # We need to setup the distributed init method to make sure 82 | # the distributed megatron code (e.g., get world size) works correctly. 83 | distributed_init_method = f"tcp://localhost:{port}" 84 | return distributed_init_method, None 85 | 86 | current_placement_group = ray.util.get_current_placement_group() 87 | if current_placement_group: 88 | # We are in a placement group 89 | bundles = current_placement_group.bundle_specs 90 | # Verify that we can use the placement group. 
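        # Each bundle may reserve at most one GPU, and one GPU bundle is needed
        # per worker, so world_size must not exceed the GPU bundle count.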
91 | gpu_bundles = 0 92 | for bundle in bundles: 93 | bundle_gpus = bundle.get("GPU", 0) 94 | if bundle_gpus > 1: 95 | raise ValueError( 96 | "Placement group bundle cannot have more than 1 GPU.") 97 | if bundle_gpus: 98 | gpu_bundles += 1 99 | if parallel_config.world_size > gpu_bundles: 100 | raise ValueError( 101 | "The number of required GPUs exceeds the total number of " 102 | "available GPUs in the placement group.") 103 | else: 104 | num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) 105 | if parallel_config.world_size > num_gpus_in_cluster: 106 | raise ValueError( 107 | "The number of required GPUs exceeds the total number of " 108 | "available GPUs in the cluster.") 109 | # Create a new placement group 110 | current_placement_group = ray.util.placement_group([{ 111 | "GPU": 1 112 | }] * parallel_config.world_size) 113 | # Wait until PG is ready - this will block until all 114 | # requested resources are available, and will timeout 115 | # if they cannot be provisioned. 116 | ray.get(current_placement_group.ready(), timeout=1800) 117 | 118 | return None, current_placement_group 119 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/api_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from typing import AsyncGenerator 4 | 5 | from fastapi import FastAPI, Request 6 | from fastapi.responses import JSONResponse, Response, StreamingResponse 7 | import uvicorn 8 | 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | from vllm.sampling_params import SamplingParams 12 | from vllm.utils import random_uuid 13 | 14 | TIMEOUT_KEEP_ALIVE = 5 # seconds. 15 | TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. 16 | app = FastAPI() 17 | engine = None 18 | 19 | 20 | @app.get("/health") 21 | async def health() -> Response: 22 | """Health check.""" 23 | return Response(status_code=200) 24 | 25 | 26 | @app.post("/generate") 27 | async def generate(request: Request) -> Response: 28 | """Generate completion for the request. 29 | 30 | The request should be a JSON object with the following fields: 31 | - prompt: the prompt to use for the generation. 32 | - stream: whether to stream the results or not. 33 | - other fields: the sampling parameters (See `SamplingParams` for details). 
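    Example request body (illustrative; any other `SamplingParams` field,
    e.g. `top_p` or `ignore_eos`, may be included as well):

        {"prompt": "Hello world", "stream": false, "max_tokens": 100, "temperature": 0}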
34 | """ 35 | request_dict = await request.json() 36 | prompt = request_dict.pop("prompt") 37 | stream = request_dict.pop("stream", False) 38 | sampling_params = SamplingParams(**request_dict) 39 | request_id = random_uuid() 40 | 41 | results_generator = engine.generate(prompt, sampling_params, request_id) 42 | 43 | # Streaming case 44 | async def stream_results() -> AsyncGenerator[bytes, None]: 45 | async for request_output in results_generator: 46 | prompt = request_output.prompt 47 | text_outputs = [ 48 | prompt + output.text for output in request_output.outputs 49 | ] 50 | ret = {"text": text_outputs} 51 | yield (json.dumps(ret) + "\0").encode("utf-8") 52 | 53 | if stream: 54 | return StreamingResponse(stream_results()) 55 | 56 | # Non-streaming case 57 | final_output = None 58 | async for request_output in results_generator: 59 | if await request.is_disconnected(): 60 | # Abort the request if the client disconnects. 61 | await engine.abort(request_id) 62 | return Response(status_code=499) 63 | final_output = request_output 64 | 65 | assert final_output is not None 66 | prompt = final_output.prompt 67 | text_outputs = [prompt + output.text for output in final_output.outputs] 68 | ret = {"text": text_outputs} 69 | return JSONResponse(ret) 70 | 71 | 72 | if __name__ == "__main__": 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument("--host", type=str, default=None) 75 | parser.add_argument("--port", type=int, default=8000) 76 | parser = AsyncEngineArgs.add_cli_args(parser) 77 | args = parser.parse_args() 78 | 79 | engine_args = AsyncEngineArgs.from_cli_args(args) 80 | engine = AsyncLLMEngine.from_engine_args(engine_args) 81 | 82 | uvicorn.run(app, 83 | host=args.host, 84 | port=args.port, 85 | log_level="debug", 86 | timeout_keep_alive=TIMEOUT_KEEP_ALIVE) 87 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/logger.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py 3 | """Logging configuration for vLLM.""" 4 | import logging 5 | import sys 6 | 7 | _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" 8 | _DATE_FORMAT = "%m-%d %H:%M:%S" 9 | 10 | 11 | class NewLineFormatter(logging.Formatter): 12 | """Adds logging prefix to newlines to align multi-line messages.""" 13 | 14 | def __init__(self, fmt, datefmt=None): 15 | logging.Formatter.__init__(self, fmt, datefmt) 16 | 17 | def format(self, record): 18 | msg = logging.Formatter.format(self, record) 19 | if record.message != "": 20 | parts = msg.split(record.message) 21 | msg = msg.replace("\n", "\r\n" + parts[0]) 22 | return msg 23 | 24 | 25 | _root_logger = logging.getLogger("vllm") 26 | _default_handler = None 27 | 28 | 29 | def _setup_logger(): 30 | _root_logger.setLevel(logging.DEBUG) 31 | global _default_handler 32 | if _default_handler is None: 33 | _default_handler = logging.StreamHandler(sys.stdout) 34 | _default_handler.flush = sys.stdout.flush # type: ignore 35 | _default_handler.setLevel(logging.INFO) 36 | _root_logger.addHandler(_default_handler) 37 | fmt = 
NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) 38 | _default_handler.setFormatter(fmt) 39 | # Setting this will avoid the message 40 | # being propagated to the parent logger. 41 | _root_logger.propagate = False 42 | 43 | 44 | # The logger is initialized when the module is imported. 45 | # This is thread-safe as the module is only imported once, 46 | # guaranteed by the Python GIL. 47 | _setup_logger() 48 | 49 | 50 | def init_logger(name: str): 51 | # Use the same settings as above for root logger 52 | logger = logging.getLogger(name) 53 | logger.setLevel(logging.DEBUG) 54 | logger.addHandler(_default_handler) 55 | logger.propagate = False 56 | return logger 57 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.input_metadata import InputMetadata 2 | from vllm.model_executor.model_loader import get_model 3 | from vllm.model_executor.utils import set_random_seed 4 | 5 | __all__ = [ 6 | "InputMetadata", 7 | "get_model", 8 | "set_random_seed", 9 | ] 10 | -------------------------------------------------------------------------------- /vllm/model_executor/input_metadata.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | 3 | import torch 4 | from xformers.ops import AttentionBias 5 | 6 | from vllm.sampling_params import SamplingParams, SamplingType 7 | from vllm.sequence import SequenceData 8 | 9 | 10 | class InputMetadata: 11 | """Metadata for input sequences. Used for PagedAttention. 12 | 13 | Args: 14 | seq_groups: List of (seq_ids, sampling_params). 15 | seq_data: Seq_id -> SequenceData. 16 | prompt_lens: Lengths of prompts. 17 | slot_mapping: The address to write the new KV to of each token. 18 | context_lens: the length of attention context for each generation token. 19 | max_context_len: The maximum context length. 20 | block_tables: The block tables. (Seq id -> list of physical block) 21 | """ 22 | 23 | def __init__( 24 | self, 25 | seq_groups: List[Tuple[List[int], SamplingParams]], 26 | seq_data: Dict[int, SequenceData], 27 | prompt_lens: List[int], 28 | slot_mapping: torch.Tensor, 29 | context_lens: torch.Tensor, 30 | max_context_len: int, 31 | block_tables: torch.Tensor, 32 | selected_token_indices: torch.Tensor, 33 | categorized_sample_indices: Dict[SamplingType, torch.Tensor], 34 | sliding_window: Optional[int] = None, 35 | ) -> None: 36 | self.seq_groups = seq_groups 37 | self.seq_data = seq_data 38 | self.prompt_lens = prompt_lens 39 | self.slot_mapping = slot_mapping 40 | self.context_lens = context_lens 41 | self.max_context_len = max_context_len 42 | self.block_tables = block_tables 43 | self.selected_token_indices = selected_token_indices 44 | self.categorized_sample_indices = categorized_sample_indices 45 | 46 | self.max_prompt_len = max(prompt_lens) if prompt_lens else 0 47 | self.to_cache = None 48 | if sliding_window is not None: 49 | # We need to keep the positions of sliding windows within 50 | # the key / value tables, this is helpful to know which 51 | # elements we need to cache. 
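            # Only the last `sliding_window` tokens of each (padded) prompt are
            # written to the cache; every slot after the prompt region, i.e.
            # the generation tokens, is always cached.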
52 | to_cache, start_idx = [], 0 53 | for prompt_len in self.prompt_lens: 54 | to_cache.extend( 55 | range( 56 | start_idx + max(0, prompt_len - sliding_window), 57 | start_idx + prompt_len, 58 | )) 59 | start_idx += self.max_prompt_len 60 | to_cache.extend(range(start_idx, slot_mapping.shape[0])) 61 | self.to_cache = torch.tensor(to_cache, 62 | dtype=torch.int32, 63 | device=self.slot_mapping.device) 64 | 65 | self.num_prompts = len(prompt_lens) 66 | self.num_prompt_tokens = self.num_prompts * self.max_prompt_len 67 | self.num_generation_tokens = context_lens.shape[0] 68 | if block_tables.numel() > 0: 69 | self.max_num_blocks_per_seq = block_tables.shape[1] 70 | else: 71 | self.max_num_blocks_per_seq = 0 72 | assert block_tables.shape[0] == self.num_generation_tokens 73 | 74 | # Set during the execution of the first attention op. 75 | self.attn_bias: Optional[AttentionBias] = None 76 | 77 | def __repr__(self) -> str: 78 | # Print only useful metadata. 79 | return ( 80 | f'InputMetadata(' 81 | f'num_prompt_tokens={self.num_prompt_tokens}, ' 82 | f'num_prompts={self.num_prompts}, ' 83 | f'prompt_lens={self.prompt_lens}, ' 84 | f'num_generation_tokens={self.num_generation_tokens}, ' 85 | f'context_lens={self.context_lens}, ' 86 | f'max_context_len={self.max_context_len}), ' 87 | f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, ' 88 | f'block_tables={self.block_tables}, ' 89 | f'selected_token_indices={self.selected_token_indices}, ' 90 | f'categorized_sample_indices={self.categorized_sample_indices}, ' 91 | f'slot_mapping={self.slot_mapping})') 92 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/activation.py: -------------------------------------------------------------------------------- 1 | """Custom activation functions.""" 2 | from typing import Optional 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from vllm._C import ops 8 | from vllm.model_executor.layers.quantization import QuantizationConfig 9 | from vllm.model_executor.parallel_utils.parallel_state import ( 10 | get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) 11 | from vllm.model_executor.parallel_utils.utils import divide 12 | from vllm.model_executor.utils import set_weight_attrs 13 | 14 | 15 | class SiluAndMul(nn.Module): 16 | """An activation function for SwiGLU. 17 | 18 | The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
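    An eager-mode reference (cf. `ref_silu_and_mul` in
    tests/kernels/test_activation.py) splits x in half along the last
    dimension and returns F.silu(x1) * x2.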
19 | 20 | Shapes: 21 | x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) 22 | return: (batch_size, seq_len, d) or (num_tokens, d) 23 | """ 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | d = x.shape[-1] // 2 27 | output_shape = (x.shape[:-1] + (d, )) 28 | out = torch.empty(output_shape, dtype=x.dtype, device=x.device) 29 | ops.silu_and_mul(out, x) 30 | return out 31 | 32 | 33 | class NewGELU(nn.Module): 34 | 35 | def forward(self, x: torch.Tensor) -> torch.Tensor: 36 | out = torch.empty_like(x) 37 | ops.gelu_new(out, x) 38 | return out 39 | 40 | 41 | class FastGELU(nn.Module): 42 | 43 | def forward(self, x: torch.Tensor) -> torch.Tensor: 44 | out = torch.empty_like(x) 45 | ops.gelu_fast(out, x) 46 | return out 47 | 48 | 49 | class ScaledActivation(nn.Module): 50 | """An activation function with post-scale parameters. 51 | 52 | This is used for some quantization methods like AWQ. 53 | """ 54 | 55 | def __init__( 56 | self, 57 | act_module: nn.Module, 58 | intermediate_size: int, 59 | input_is_parallel: bool = True, 60 | params_dtype: Optional[torch.dtype] = None, 61 | ): 62 | super().__init__() 63 | self.act = act_module 64 | self.input_is_parallel = input_is_parallel 65 | if input_is_parallel: 66 | tp_size = get_tensor_model_parallel_world_size() 67 | intermediate_size_per_partition = divide(intermediate_size, 68 | tp_size) 69 | else: 70 | intermediate_size_per_partition = intermediate_size 71 | if params_dtype is None: 72 | params_dtype = torch.get_default_dtype() 73 | self.scales = nn.Parameter( 74 | torch.empty(intermediate_size_per_partition, 75 | dtype=params_dtype, 76 | device="cuda")) 77 | set_weight_attrs(self.scales, {"weight_loader": self.weight_loader}) 78 | 79 | def forward(self, x: torch.Tensor) -> torch.Tensor: 80 | return self.act(x) / self.scales 81 | 82 | def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): 83 | param_data = param.data 84 | if self.input_is_parallel: 85 | tp_rank = get_tensor_model_parallel_rank() 86 | shard_size = param_data.shape[0] 87 | start_idx = tp_rank * shard_size 88 | loaded_weight = loaded_weight.narrow(0, start_idx, shard_size) 89 | assert param_data.shape == loaded_weight.shape 90 | param_data.copy_(loaded_weight) 91 | 92 | 93 | _ACTIVATION_REGISTRY = { 94 | "gelu": nn.GELU(), 95 | "gelu_fast": FastGELU(), 96 | "gelu_new": NewGELU(), 97 | "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), 98 | "relu": nn.ReLU(), 99 | } 100 | 101 | 102 | def get_act_fn( 103 | act_fn_name: str, 104 | quant_config: Optional[QuantizationConfig] = None, 105 | intermediate_size: Optional[int] = None, 106 | input_is_parallel: bool = True, 107 | params_dtype: Optional[torch.dtype] = None, 108 | ) -> nn.Module: 109 | """Get an activation function by name.""" 110 | act_fn_name = act_fn_name.lower() 111 | if act_fn_name not in _ACTIVATION_REGISTRY: 112 | raise ValueError( 113 | f"Activation function {act_fn_name!r} is not supported.") 114 | 115 | act_fn = _ACTIVATION_REGISTRY[act_fn_name] 116 | if (quant_config is not None 117 | and act_fn_name in quant_config.get_scaled_act_names()): 118 | if intermediate_size is None: 119 | raise ValueError("intermediate_size must be specified for scaled " 120 | "activation functions.") 121 | return ScaledActivation(act_fn, intermediate_size, input_is_parallel, 122 | params_dtype) 123 | return act_fn 124 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/layernorm.py: 
-------------------------------------------------------------------------------- 1 | """Custom normalization layers.""" 2 | from typing import Optional, Tuple, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from vllm._C import ops 8 | 9 | 10 | class RMSNorm(nn.Module): 11 | """Root mean square normalization. 12 | 13 | Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. 14 | Refer to https://arxiv.org/abs/1910.07467 15 | """ 16 | 17 | def __init__( 18 | self, 19 | hidden_size: int, 20 | eps: float = 1e-6, 21 | ) -> None: 22 | super().__init__() 23 | self.weight = nn.Parameter(torch.ones(hidden_size)) 24 | self.variance_epsilon = eps 25 | 26 | def forward( 27 | self, 28 | x: torch.Tensor, 29 | residual: Optional[torch.Tensor] = None, 30 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 31 | if residual is not None: 32 | ops.fused_add_rms_norm( 33 | x, 34 | residual, 35 | self.weight.data, 36 | self.variance_epsilon, 37 | ) 38 | return x, residual 39 | out = torch.empty_like(x) 40 | ops.rms_norm( 41 | out, 42 | x, 43 | self.weight.data, 44 | self.variance_epsilon, 45 | ) 46 | return out 47 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from vllm.model_executor.layers.quantization.awq import AWQConfig 4 | from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig 5 | from vllm.model_executor.layers.quantization.base_config import QuantizationConfig 6 | 7 | _QUANTIZATION_CONFIG_REGISTRY = { 8 | "awq": AWQConfig, 9 | "squeezellm": SqueezeLLMConfig, 10 | } 11 | 12 | 13 | def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: 14 | if quantization not in _QUANTIZATION_CONFIG_REGISTRY: 15 | raise ValueError(f"Invalid quantization method: {quantization}") 16 | return _QUANTIZATION_CONFIG_REGISTRY[quantization] 17 | 18 | 19 | __all__ = [ 20 | "QuantizationConfig", 21 | "get_quantization_config", 22 | ] 23 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/base_config.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, List 3 | 4 | import torch 5 | 6 | from vllm.model_executor.layers.linear import LinearMethodBase 7 | 8 | 9 | class QuantizationConfig(ABC): 10 | """Base class for quantization configs.""" 11 | 12 | @abstractmethod 13 | def get_name(self) -> str: 14 | """Name of the quantization method.""" 15 | raise NotImplementedError 16 | 17 | @abstractmethod 18 | def get_supported_act_dtypes(self) -> List[torch.dtype]: 19 | """List of supported activation dtypes.""" 20 | raise NotImplementedError 21 | 22 | @abstractmethod 23 | def get_min_capability(self) -> int: 24 | """Minimum GPU capability to support the quantization method. 25 | 26 | E.g., 70 for Volta, 75 for Turing, 80 for Ampere. 27 | This requirement is due to the custom CUDA kernels used by the 28 | quantization method. 
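        At load time, `get_model` in vllm/model_executor/model_loader.py
        compares this value against torch.cuda.get_device_capability(),
        encoded as major * 10 + minor.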
29 | """ 30 | raise NotImplementedError 31 | 32 | @staticmethod 33 | @abstractmethod 34 | def get_config_filenames() -> List[str]: 35 | """List of filenames to search for in the model directory.""" 36 | raise NotImplementedError 37 | 38 | @classmethod 39 | @abstractmethod 40 | def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig": 41 | """Create a config class from the model's quantization config.""" 42 | raise NotImplementedError 43 | 44 | @staticmethod 45 | def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: 46 | """Get a value from the model's quantization config.""" 47 | for key in keys: 48 | if key in config: 49 | return config[key] 50 | raise ValueError(f"Cannot find any of {keys} in the model's " 51 | "quantization config.") 52 | 53 | @abstractmethod 54 | def get_linear_method(self) -> LinearMethodBase: 55 | """Get the linear method to use for the quantized linear layer.""" 56 | raise NotImplementedError 57 | 58 | @abstractmethod 59 | def get_scaled_act_names(self) -> List[str]: 60 | """Returns the activation function names that should be post-scaled. 61 | 62 | For now, this is only used by AWQ. 63 | """ 64 | raise NotImplementedError 65 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/squeezellm.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | import torch 4 | from torch.nn.parameter import Parameter 5 | 6 | from vllm._C import ops 7 | from vllm.model_executor.layers.linear import (LinearMethodBase, 8 | set_weight_attrs) 9 | from vllm.model_executor.layers.quantization.base_config import QuantizationConfig 10 | 11 | 12 | class SqueezeLLMConfig(QuantizationConfig): 13 | """Config class for SqueezeLLM. 14 | 15 | Reference: https://arxiv.org/pdf/2306.07629 16 | """ 17 | 18 | def __init__( 19 | self, 20 | weight_bits: int, 21 | ) -> None: 22 | self.weight_bits = weight_bits 23 | 24 | if self.weight_bits != 4: 25 | raise ValueError( 26 | "Currently, only 4-bit weight quantization is supported for " 27 | f"SqueezeLLM, but got {self.weight_bits} bits.") 28 | 29 | self.pack_factor = 32 // self.weight_bits 30 | 31 | def __repr__(self) -> str: 32 | return f"SqueezeLLMConfig(weight_bits={self.weight_bits})" 33 | 34 | def get_name(self) -> str: 35 | return "squeezellm" 36 | 37 | def get_supported_act_dtypes(self) -> List[torch.dtype]: 38 | return [torch.half] 39 | 40 | def get_min_capability(self) -> int: 41 | return 70 42 | 43 | @staticmethod 44 | def get_config_filenames() -> List[str]: 45 | return ["quant_config.json"] 46 | 47 | @classmethod 48 | def from_config(cls, config: Dict[str, Any]) -> "SqueezeLLMConfig": 49 | weight_bits = cls.get_from_keys(config, ["wbits"]) 50 | return cls(weight_bits) 51 | 52 | def get_linear_method(self) -> "SqueezeLLMLinearMethod": 53 | return SqueezeLLMLinearMethod(self) 54 | 55 | def get_scaled_act_names(self) -> List[str]: 56 | return [] 57 | 58 | 59 | class SqueezeLLMLinearMethod(LinearMethodBase): 60 | """Linear method for SqueezeLLM. 61 | 62 | Args: 63 | quant_config: The SqueezeLLM quantization config. 
64 | """ 65 | 66 | def __init__(self, quant_config: SqueezeLLMConfig): 67 | self.quant_config = quant_config 68 | 69 | def create_weights(self, input_size: int, output_size: int, 70 | params_dtype: torch.dtype) -> Dict[str, torch.Tensor]: 71 | if input_size % self.quant_config.pack_factor != 0: 72 | raise ValueError( 73 | "The input size is not aligned with the quantized " 74 | "weight shape. This can be caused by too large " 75 | "tensor parallel size.") 76 | qweight = Parameter( 77 | torch.empty( 78 | input_size // self.quant_config.pack_factor, 79 | output_size, 80 | device="cuda", 81 | dtype=torch.int32, 82 | ), 83 | requires_grad=False, 84 | ) 85 | set_weight_attrs( 86 | qweight, { 87 | "input_dim": 0, 88 | "output_dim": 1, 89 | "packed_dim": 0, 90 | "pack_factor": self.quant_config.pack_factor, 91 | }) 92 | lookup_table = Parameter( 93 | torch.empty( 94 | output_size, 95 | self.quant_config.weight_bits**2, 96 | device="cuda", 97 | dtype=params_dtype, 98 | ), 99 | requires_grad=False, 100 | ) 101 | set_weight_attrs(lookup_table, { 102 | "output_dim": 0, 103 | }) 104 | return { 105 | "qweight": qweight, 106 | "lookup_table": lookup_table, 107 | } 108 | 109 | def apply_weights(self, 110 | weights: Dict[str, torch.Tensor], 111 | x: torch.Tensor, 112 | bias: Optional[torch.Tensor] = None) -> torch.Tensor: 113 | qweight = weights["qweight"] 114 | lookup_table = weights["lookup_table"] 115 | out_shape = x.shape[:-1] + (qweight.shape[-1], ) 116 | reshaped_x = x.reshape(-1, x.shape[-1]) 117 | # NOTE: The output tensor should be zero-initialized. 118 | out = torch.zeros(out_shape, device="cuda", dtype=torch.float16) 119 | ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table) 120 | 121 | if bias is not None: 122 | out = out + bias 123 | return out.reshape(out_shape) 124 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | import contextlib 3 | from typing import Type 4 | 5 | import torch 6 | import torch.nn as nn 7 | from transformers import PretrainedConfig 8 | 9 | from vllm.config import ModelConfig 10 | from vllm.model_executor.models import * 11 | from vllm.model_executor.weight_utils import (get_quant_config, 12 | initialize_dummy_weights) 13 | 14 | # TODO(woosuk): Lazy-load the model classes. 
15 | _MODEL_REGISTRY = { 16 | "AquilaModel": AquilaForCausalLM, 17 | "AquilaForCausalLM": AquilaForCausalLM, # AquilaChat2 18 | "BaiChuanForCausalLM": BaiChuanForCausalLM, # baichuan-7b 19 | "BaichuanForCausalLM": BaichuanForCausalLM, # baichuan-13b 20 | "BloomForCausalLM": BloomForCausalLM, 21 | "ChatGLMModel": ChatGLMForCausalLM, 22 | "FalconForCausalLM": FalconForCausalLM, 23 | "GPT2LMHeadModel": GPT2LMHeadModel, 24 | "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM, 25 | "GPTJForCausalLM": GPTJForCausalLM, 26 | "GPTNeoXForCausalLM": GPTNeoXForCausalLM, 27 | "InternLMForCausalLM": InternLMForCausalLM, 28 | "LlamaForCausalLM": LlamaForCausalLM, 29 | "LLaMAForCausalLM": LlamaForCausalLM, # For decapoda-research/llama-* 30 | "MistralForCausalLM": MistralForCausalLM, 31 | # transformers's mpt class has lower case 32 | "MptForCausalLM": MPTForCausalLM, 33 | "MPTForCausalLM": MPTForCausalLM, 34 | "OPTForCausalLM": OPTForCausalLM, 35 | "PhiForCausalLM": PhiForCausalLM, 36 | "QWenLMHeadModel": QWenLMHeadModel, 37 | "RWForCausalLM": FalconForCausalLM, 38 | "YiForCausalLM": YiForCausalLM, 39 | } 40 | 41 | 42 | @contextlib.contextmanager 43 | def _set_default_torch_dtype(dtype: torch.dtype): 44 | """Sets the default torch dtype to the given dtype.""" 45 | old_dtype = torch.get_default_dtype() 46 | torch.set_default_dtype(dtype) 47 | yield 48 | torch.set_default_dtype(old_dtype) 49 | 50 | 51 | def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: 52 | architectures = getattr(config, "architectures", []) 53 | for arch in architectures: 54 | if arch in _MODEL_REGISTRY: 55 | return _MODEL_REGISTRY[arch] 56 | raise ValueError( 57 | f"Model architectures {architectures} are not supported for now. " 58 | f"Supported architectures: {list(_MODEL_REGISTRY.keys())}") 59 | 60 | 61 | def get_model(model_config: ModelConfig) -> nn.Module: 62 | model_class = _get_model_architecture(model_config.hf_config) 63 | 64 | # Get the (maybe quantized) linear method. 65 | linear_method = None 66 | if model_config.quantization is not None: 67 | quant_config = get_quant_config(model_config.quantization, 68 | model_config.model, 69 | model_config.hf_config, 70 | model_config.download_dir) 71 | capability = torch.cuda.get_device_capability() 72 | capability = capability[0] * 10 + capability[1] 73 | if capability < quant_config.get_min_capability(): 74 | raise ValueError( 75 | f"The quantization method {model_config.quantization} is not " 76 | "supported for the current GPU. " 77 | f"Minimum capability: {quant_config.get_min_capability()}. " 78 | f"Current capability: {capability}.") 79 | supported_dtypes = quant_config.get_supported_act_dtypes() 80 | if model_config.dtype not in supported_dtypes: 81 | raise ValueError( 82 | f"{model_config.dtype} is not supported for quantization " 83 | f"method {model_config.quantization}. Supported dtypes: " 84 | f"{supported_dtypes}") 85 | linear_method = quant_config.get_linear_method() 86 | 87 | with _set_default_torch_dtype(model_config.dtype): 88 | # Create a model instance. 89 | # The weights will be initialized as empty tensors. 90 | model = model_class(model_config.hf_config, linear_method) 91 | if model_config.load_format == "dummy": 92 | model = model.cuda() 93 | # NOTE(woosuk): For accurate performance evaluation, we assign 94 | # random values to the weights. 95 | initialize_dummy_weights(model) 96 | else: 97 | # Load the weights from the cached or downloaded files. 
98 | model.load_weights(model_config.model, model_config.download_dir, 99 | model_config.load_format, model_config.revision) 100 | model = model.cuda() 101 | return model.eval() 102 | -------------------------------------------------------------------------------- /vllm/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.models.aquila import AquilaForCausalLM 2 | from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM, 3 | BaichuanForCausalLM) 4 | from vllm.model_executor.models.bloom import BloomForCausalLM 5 | from vllm.model_executor.models.falcon import FalconForCausalLM 6 | from vllm.model_executor.models.gpt2 import GPT2LMHeadModel 7 | from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM 8 | from vllm.model_executor.models.gpt_j import GPTJForCausalLM 9 | from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM 10 | from vllm.model_executor.models.internlm import InternLMForCausalLM 11 | from vllm.model_executor.models.llama import LlamaForCausalLM 12 | from vllm.model_executor.models.mistral import MistralForCausalLM 13 | from vllm.model_executor.models.mpt import MPTForCausalLM 14 | from vllm.model_executor.models.opt import OPTForCausalLM 15 | from vllm.model_executor.models.phi_1_5 import PhiForCausalLM 16 | from vllm.model_executor.models.qwen import QWenLMHeadModel 17 | from vllm.model_executor.models.chatglm import ChatGLMForCausalLM 18 | from vllm.model_executor.models.yi import YiForCausalLM 19 | 20 | __all__ = [ 21 | "AquilaForCausalLM", 22 | "BaiChuanForCausalLM", 23 | "BaichuanForCausalLM", 24 | "BloomForCausalLM", 25 | "ChatGLMForCausalLM", 26 | "FalconForCausalLM", 27 | "GPT2LMHeadModel", 28 | "GPTBigCodeForCausalLM", 29 | "GPTJForCausalLM", 30 | "GPTNeoXForCausalLM", 31 | "InternLMForCausalLM", 32 | "LlamaForCausalLM", 33 | "MPTForCausalLM", 34 | "OPTForCausalLM", 35 | "PhiForCausalLM", 36 | "QWenLMHeadModel", 37 | "MistralForCausalLM", 38 | "YiForCausalLM", 39 | ] 40 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/README.md: -------------------------------------------------------------------------------- 1 | The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference. -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/model_executor/parallel_utils/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/communication_op.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from vllm.model_executor.parallel_utils.parallel_state import ( 4 | get_tensor_model_parallel_world_size, 5 | get_tensor_model_parallel_group, 6 | ) 7 | 8 | 9 | def tensor_model_parallel_all_reduce(input_): 10 | """All-reduce the input tensor across model parallel group. 11 | 12 | NOTE: This operation is applied in-place on the input tensor. 13 | """ 14 | # Bypass the function if we are using only 1 GPU. 15 | if get_tensor_model_parallel_world_size() == 1: 16 | return input_ 17 | # All-reduce. 
18 | torch.distributed.all_reduce(input_, 19 | group=get_tensor_model_parallel_group()) 20 | return input_ 21 | 22 | 23 | def tensor_model_parallel_all_gather(input_, dim=-1): 24 | """All-gather the input tensor across model parallel group.""" 25 | world_size = get_tensor_model_parallel_world_size() 26 | # Bypass the function if we are using only 1 GPU. 27 | if world_size == 1: 28 | return input_ 29 | assert -input_.dim() <= dim < input_.dim(), ( 30 | f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") 31 | if dim < 0: 32 | # Convert negative dim to positive. 33 | dim += input_.dim() 34 | input_size = input_.size() 35 | # Allocate output tensor. 36 | output_tensor = torch.empty((world_size, ) + input_size, 37 | dtype=input_.dtype, 38 | device=input_.device) 39 | # All-gather. 40 | torch.distributed.all_gather_into_tensor( 41 | output_tensor, input_, group=get_tensor_model_parallel_group()) 42 | # Reshape 43 | output_tensor = output_tensor.movedim(0, dim) 44 | output_tensor = output_tensor.reshape(input_size[:dim] + 45 | (world_size * input_size[dim], ) + 46 | input_size[dim + 1:]) 47 | return output_tensor 48 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The vLLM team. 2 | # Adapted from 3 | # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py 4 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 5 | from typing import Sequence 6 | 7 | import torch 8 | 9 | 10 | def ensure_divisibility(numerator, denominator): 11 | """Ensure that numerator is divisible by the denominator.""" 12 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 13 | numerator, denominator) 14 | 15 | 16 | def divide(numerator, denominator): 17 | """Ensure that numerator is divisible by the denominator and return 18 | the division value.""" 19 | ensure_divisibility(numerator, denominator) 20 | return numerator // denominator 21 | 22 | 23 | def split_tensor_along_last_dim( 24 | tensor: torch.Tensor, 25 | num_partitions: int, 26 | contiguous_split_chunks: bool = False, 27 | ) -> Sequence[torch.Tensor]: 28 | """ Split a tensor along its last dimension. 29 | 30 | Arguments: 31 | tensor: input tensor. 32 | num_partitions: number of partitions to split the tensor 33 | contiguous_split_chunks: If True, make each chunk contiguous 34 | in memory. 35 | 36 | Returns: 37 | A list of Tensors 38 | """ 39 | # Get the size and dimension. 40 | last_dim = tensor.dim() - 1 41 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 42 | # Split. 43 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 44 | # NOTE: torch.split does not create contiguous tensors by default. 
45 | if contiguous_split_chunks: 46 | return tuple(chunk.contiguous() for chunk in tensor_list) 47 | 48 | return tensor_list 49 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | import random 3 | from typing import Any, Dict, Optional 4 | 5 | import numpy as np 6 | import torch 7 | 8 | 9 | def set_random_seed(seed: int) -> None: 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | if torch.cuda.is_available(): 14 | torch.cuda.manual_seed_all(seed) 15 | 16 | 17 | def set_weight_attrs( 18 | weight: torch.Tensor, 19 | weight_attrs: Optional[Dict[str, Any]], 20 | ): 21 | """Set attributes on a weight tensor. 22 | 23 | This method is used to set attributes on a weight tensor. This method 24 | will not overwrite existing attributes. 25 | 26 | Args: 27 | weight: The weight tensor. 28 | weight_attrs: A dictionary of attributes to set on the weight tensor. 29 | """ 30 | if weight_attrs is None: 31 | return 32 | for key, value in weight_attrs.items(): 33 | assert not hasattr( 34 | weight, key), (f"Overwriting existing tensor attribute: {key}") 35 | setattr(weight, key, value) 36 | -------------------------------------------------------------------------------- /vllm/outputs.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup, 4 | SequenceStatus) 5 | 6 | 7 | class CompletionOutput: 8 | """The output data of one completion output of a request. 9 | 10 | Args: 11 | index: The index of the output in the request. 12 | text: The generated output text. 13 | token_ids: The token IDs of the generated output text. 14 | cumulative_logprob: The cumulative log probability of the generated 15 | output text. 16 | logprobs: The log probabilities of the top probability words at each 17 | position if the logprobs are requested. 18 | finish_reason: The reason why the sequence is finished. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | index: int, 24 | text: str, 25 | token_ids: List[int], 26 | cumulative_logprob: float, 27 | logprobs: Optional[SampleLogprobs], 28 | finish_reason: Optional[str] = None, 29 | ) -> None: 30 | self.index = index 31 | self.text = text 32 | self.token_ids = token_ids 33 | self.cumulative_logprob = cumulative_logprob 34 | self.logprobs = logprobs 35 | self.finish_reason = finish_reason 36 | 37 | def finished(self) -> bool: 38 | return self.finish_reason is not None 39 | 40 | def __repr__(self) -> str: 41 | return (f"CompletionOutput(index={self.index}, " 42 | f"text={self.text!r}, " 43 | f"token_ids={self.token_ids}, " 44 | f"cumulative_logprob={self.cumulative_logprob}, " 45 | f"logprobs={self.logprobs}, " 46 | f"finish_reason={self.finish_reason})") 47 | 48 | 49 | class RequestOutput: 50 | """The output data of a request to the LLM. 51 | 52 | Args: 53 | request_id: The unique ID of the request. 54 | prompt: The prompt string of the request. 55 | prompt_token_ids: The token IDs of the prompt. 56 | prompt_logprobs: The log probabilities to return per prompt token. 57 | outputs: The output sequences of the request. 58 | finished: Whether the whole request is finished. 
59 | """ 60 | 61 | def __init__( 62 | self, 63 | request_id: str, 64 | prompt: str, 65 | prompt_token_ids: List[int], 66 | prompt_logprobs: Optional[PromptLogprobs], 67 | outputs: List[CompletionOutput], 68 | finished: bool, 69 | ) -> None: 70 | self.request_id = request_id 71 | self.prompt = prompt 72 | self.prompt_token_ids = prompt_token_ids 73 | self.prompt_logprobs = prompt_logprobs 74 | self.outputs = outputs 75 | self.finished = finished 76 | 77 | @classmethod 78 | def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": 79 | # Get the top-n sequences. 80 | n = seq_group.sampling_params.n 81 | seqs = seq_group.get_seqs() 82 | if seq_group.sampling_params.use_beam_search: 83 | sorting_key = lambda seq: seq.get_beam_search_score( 84 | seq_group.sampling_params.length_penalty) 85 | else: 86 | sorting_key = lambda seq: seq.get_cumulative_logprob() 87 | sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) 88 | top_n_seqs = sorted_seqs[:n] 89 | 90 | # Create the outputs. 91 | outputs: List[CompletionOutput] = [] 92 | for seq in top_n_seqs: 93 | logprobs = seq.output_logprobs 94 | if seq_group.sampling_params.logprobs is None: 95 | # NOTE: We need to take care of this case because the sequence 96 | # always has the logprobs of the sampled tokens even if the 97 | # logprobs are not requested. 98 | logprobs = None 99 | finshed_reason = SequenceStatus.get_finished_reason(seq.status) 100 | output = CompletionOutput(seqs.index(seq), seq.output_text, 101 | seq.get_output_token_ids(), 102 | seq.get_cumulative_logprob(), logprobs, 103 | finshed_reason) 104 | outputs.append(output) 105 | 106 | # Every sequence in the sequence group should have the same prompt. 107 | prompt = seq_group.prompt 108 | prompt_token_ids = seq_group.prompt_token_ids 109 | prompt_logprobs = seq_group.prompt_logprobs 110 | finished = seq_group.is_finished() 111 | return cls(seq_group.request_id, prompt, prompt_token_ids, 112 | prompt_logprobs, outputs, finished) 113 | 114 | def __repr__(self) -> str: 115 | return (f"RequestOutput(request_id={self.request_id}, " 116 | f"prompt={self.prompt!r}, " 117 | f"prompt_token_ids={self.prompt_token_ids}, " 118 | f"prompt_logprobs={self.prompt_logprobs}, " 119 | f"outputs={self.outputs}, " 120 | f"finished={self.finished})") 121 | -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types. 
3 | -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/transformers_utils/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/config.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from transformers import AutoConfig, PretrainedConfig 4 | 5 | from vllm.transformers_utils.configs import * 6 | 7 | _CONFIG_REGISTRY = { 8 | "aquila": AquilaConfig, 9 | "baichuan": BaiChuanConfig, 10 | "chatglm": ChatGLMConfig, 11 | "mpt": MPTConfig, 12 | "qwen": QWenConfig, 13 | "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) 14 | "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) 15 | "yi": YiConfig, 16 | } 17 | 18 | 19 | def get_config(model: str, 20 | trust_remote_code: bool, 21 | revision: Optional[str] = None) -> PretrainedConfig: 22 | try: 23 | config = AutoConfig.from_pretrained( 24 | model, trust_remote_code=trust_remote_code, revision=revision) 25 | except ValueError as e: 26 | if (not trust_remote_code and 27 | "requires you to execute the configuration file" in str(e)): 28 | err_msg = ( 29 | "Failed to load the model config. If the model is a custom " 30 | "model not yet available in the HuggingFace transformers " 31 | "library, consider setting `trust_remote_code=True` in LLM " 32 | "or using the `--trust-remote-code` flag in the CLI.") 33 | raise RuntimeError(err_msg) from e 34 | else: 35 | raise e 36 | if config.model_type in _CONFIG_REGISTRY: 37 | config_class = _CONFIG_REGISTRY[config.model_type] 38 | config = config_class.from_pretrained(model, revision=revision) 39 | return config 40 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.configs.aquila import AquilaConfig 2 | from vllm.transformers_utils.configs.baichuan import BaiChuanConfig 3 | from vllm.transformers_utils.configs.chatglm import ChatGLMConfig 4 | from vllm.transformers_utils.configs.mpt import MPTConfig 5 | from vllm.transformers_utils.configs.qwen import QWenConfig 6 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and 7 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the 8 | # `FalconConfig` class from the official HuggingFace transformers library. 9 | from vllm.transformers_utils.configs.falcon import RWConfig 10 | from vllm.transformers_utils.configs.yi import YiConfig 11 | 12 | __all__ = [ 13 | "AquilaConfig", 14 | "BaiChuanConfig", 15 | "ChatGLMConfig", 16 | "MPTConfig", 17 | "QWenConfig", 18 | "RWConfig", 19 | "YiConfig", 20 | ] 21 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/aquila.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. 
It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | """ Aquila model configuration""" 21 | 22 | from transformers import PretrainedConfig 23 | 24 | 25 | class AquilaConfig(PretrainedConfig): 26 | model_type = "aquila" 27 | keys_to_ignore_at_inference = ["past_key_values"] 28 | 29 | def __init__( 30 | self, 31 | vocab_size=100008, 32 | hidden_size=4096, 33 | intermediate_size=11008, 34 | num_hidden_layers=32, 35 | num_attention_heads=32, 36 | num_key_value_heads=None, 37 | hidden_act="silu", 38 | max_position_embeddings=2048, 39 | initializer_range=0.006, 40 | rms_norm_eps=1e-5, 41 | use_cache=True, 42 | pad_token_id=0, 43 | bos_token_id=1, 44 | eos_token_id=2, 45 | tie_word_embeddings=False, 46 | **kwargs, 47 | ): 48 | self.vocab_size = vocab_size 49 | self.max_position_embeddings = max_position_embeddings 50 | self.hidden_size = hidden_size 51 | self.intermediate_size = intermediate_size 52 | self.num_hidden_layers = num_hidden_layers 53 | # for backward compatibility 54 | if num_key_value_heads is None: 55 | num_key_value_heads = num_attention_heads 56 | 57 | self.num_key_value_heads = num_key_value_heads 58 | self.num_attention_heads = num_attention_heads 59 | self.hidden_act = hidden_act 60 | self.initializer_range = initializer_range 61 | self.rms_norm_eps = rms_norm_eps 62 | self.use_cache = use_cache 63 | super().__init__( 64 | pad_token_id=pad_token_id, 65 | bos_token_id=bos_token_id, 66 | eos_token_id=eos_token_id, 67 | tie_word_embeddings=tie_word_embeddings, 68 | **kwargs, 69 | ) 70 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/baichuan.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
20 | 21 | from transformers.configuration_utils import PretrainedConfig 22 | 23 | 24 | class BaiChuanConfig(PretrainedConfig): 25 | model_type = "baichuan" 26 | keys_to_ignore_at_inference = ["past_key_values"] 27 | 28 | def __init__( 29 | self, 30 | vocab_size=64000, 31 | hidden_size=4096, 32 | intermediate_size=11008, 33 | num_hidden_layers=32, 34 | num_attention_heads=32, 35 | hidden_act="silu", 36 | max_position_embeddings=4096, 37 | initializer_range=0.02, 38 | rms_norm_eps=1e-6, 39 | use_cache=True, 40 | pad_token_id=0, 41 | bos_token_id=1, 42 | eos_token_id=2, 43 | tie_word_embeddings=False, 44 | **kwargs, 45 | ): 46 | self.vocab_size = vocab_size 47 | self.max_position_embeddings = max_position_embeddings 48 | self.hidden_size = hidden_size 49 | self.intermediate_size = intermediate_size 50 | self.num_hidden_layers = num_hidden_layers 51 | self.num_attention_heads = num_attention_heads 52 | self.hidden_act = hidden_act 53 | self.initializer_range = initializer_range 54 | self.rms_norm_eps = rms_norm_eps 55 | self.use_cache = use_cache 56 | super().__init__( 57 | pad_token_id=pad_token_id, 58 | bos_token_id=bos_token_id, 59 | eos_token_id=eos_token_id, 60 | tie_word_embeddings=tie_word_embeddings, 61 | **kwargs, 62 | ) 63 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/chatglm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Adapted from 3 | # https://github.com/THUDM/ChatGLM2-6B 4 | from transformers import PretrainedConfig 5 | 6 | 7 | class ChatGLMConfig(PretrainedConfig): 8 | model_type = "chatglm" 9 | attribute_map = { 10 | "num_hidden_layers": "num_layers", 11 | "n_head_kv": "multi_query_group_num", 12 | } 13 | 14 | def __init__(self, 15 | num_layers=28, 16 | padded_vocab_size=65024, 17 | hidden_size=4096, 18 | ffn_hidden_size=13696, 19 | kv_channels=128, 20 | num_attention_heads=32, 21 | seq_length=2048, 22 | hidden_dropout=0.0, 23 | attention_dropout=0.0, 24 | layernorm_epsilon=1e-5, 25 | rmsnorm=True, 26 | apply_residual_connection_post_layernorm=False, 27 | post_layer_norm=True, 28 | add_bias_linear=False, 29 | add_qkv_bias=False, 30 | interleaved_qkv=False, 31 | bias_dropout_fusion=True, 32 | multi_query_attention=False, 33 | multi_query_group_num=1, 34 | apply_query_key_layer_scaling=True, 35 | attention_softmax_in_fp32=True, 36 | fp32_residual_connection=False, 37 | quantization_bit=0, 38 | pre_seq_len=None, 39 | prefix_projection=False, 40 | **kwargs): 41 | self.num_layers = num_layers 42 | self.vocab_size = padded_vocab_size 43 | self.padded_vocab_size = padded_vocab_size 44 | self.hidden_size = hidden_size 45 | self.ffn_hidden_size = ffn_hidden_size 46 | self.kv_channels = kv_channels 47 | self.num_attention_heads = num_attention_heads 48 | self.seq_length = seq_length 49 | self.hidden_dropout = hidden_dropout 50 | self.attention_dropout = attention_dropout 51 | self.layernorm_epsilon = layernorm_epsilon 52 | self.rmsnorm = rmsnorm 53 | self.apply_residual_connection_post_layernorm = ( 54 | apply_residual_connection_post_layernorm) 55 | self.post_layer_norm = post_layer_norm 56 | self.add_bias_linear = add_bias_linear 57 | self.add_qkv_bias = add_qkv_bias 58 | self.bias_dropout_fusion = bias_dropout_fusion 59 | self.multi_query_attention = multi_query_attention 60 | self.multi_query_group_num = multi_query_group_num 61 | self.apply_query_key_layer_scaling = apply_query_key_layer_scaling 62 | self.attention_softmax_in_fp32 = 
attention_softmax_in_fp32 63 | self.fp32_residual_connection = fp32_residual_connection 64 | self.quantization_bit = quantization_bit 65 | self.pre_seq_len = pre_seq_len 66 | self.prefix_projection = prefix_projection 67 | self.interleaved_qkv = interleaved_qkv 68 | super().__init__(**kwargs) 69 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/falcon.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py 3 | # Copyright 2023 The vLLM team. 4 | # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. 5 | # All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | """Falcon configuration""" 19 | from transformers.configuration_utils import PretrainedConfig 20 | 21 | 22 | class RWConfig(PretrainedConfig): 23 | model_type = "falcon" 24 | keys_to_ignore_at_inference = ["past_key_values"] 25 | attribute_map = { 26 | "num_hidden_layers": "n_layer", 27 | "num_attention_heads": "n_head", 28 | "num_kv_heads": "n_head_kv", 29 | } 30 | 31 | def __init__( 32 | self, 33 | vocab_size=250880, 34 | hidden_size=64, 35 | n_layer=2, 36 | n_head=8, 37 | layer_norm_epsilon=1e-5, 38 | initializer_range=0.02, 39 | use_cache=True, 40 | bos_token_id=1, 41 | eos_token_id=2, 42 | hidden_dropout=0.0, 43 | attention_dropout=0.0, 44 | multi_query=True, 45 | n_head_kv=None, 46 | alibi=False, 47 | bias=False, 48 | parallel_attn=False, 49 | new_decoder_architecture=False, 50 | **kwargs, 51 | ) -> None: 52 | self.vocab_size = vocab_size 53 | # Backward compatibility with n_embed kwarg 54 | n_embed = kwargs.pop("n_embed", None) 55 | self.hidden_size = hidden_size if n_embed is None else n_embed 56 | self.n_layer = n_layer 57 | self.n_head = n_head 58 | self.layer_norm_epsilon = layer_norm_epsilon 59 | self.initializer_range = initializer_range 60 | self.use_cache = use_cache 61 | self.hidden_dropout = hidden_dropout 62 | self.attention_dropout = attention_dropout 63 | 64 | self.bos_token_id = bos_token_id 65 | self.eos_token_id = eos_token_id 66 | self.multi_query = multi_query 67 | self.n_head_kv = 1 if n_head_kv is None else n_head_kv 68 | self.alibi = alibi 69 | self.bias = bias 70 | self.parallel_attn = parallel_attn 71 | self.new_decoder_architecture = new_decoder_architecture 72 | 73 | if self.hidden_size == 8192: 74 | # Hack for falcon-40b 75 | self.new_decoder_architecture = True 76 | 77 | super().__init__(bos_token_id=bos_token_id, 78 | eos_token_id=eos_token_id, 79 | **kwargs) 80 | 81 | @property 82 | def head_dim(self): 83 | return self.hidden_size // self.n_head 84 | 85 | @property 86 | def rotary(self): 87 | return not self.alibi 88 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 
Alibaba Cloud. 2 | # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE 3 | 4 | from transformers import PretrainedConfig 5 | 6 | 7 | class QWenConfig(PretrainedConfig): 8 | model_type = "qwen" 9 | keys_to_ignore_at_inference = ["past_key_values"] 10 | 11 | def __init__( 12 | self, 13 | vocab_size=151936, 14 | hidden_size=4096, 15 | num_hidden_layers=32, 16 | num_attention_heads=32, 17 | emb_dropout_prob=0.0, 18 | attn_dropout_prob=0.0, 19 | layer_norm_epsilon=1e-6, 20 | initializer_range=0.02, 21 | max_position_embeddings=8192, 22 | scale_attn_weights=True, 23 | use_cache=True, 24 | bf16=False, 25 | fp16=False, 26 | fp32=False, 27 | kv_channels=128, 28 | rotary_pct=1.0, 29 | rotary_emb_base=10000, 30 | use_dynamic_ntk=True, 31 | use_logn_attn=True, 32 | use_flash_attn="auto", 33 | intermediate_size=22016, 34 | no_bias=True, 35 | tie_word_embeddings=False, 36 | **kwargs, 37 | ): 38 | self.vocab_size = vocab_size 39 | self.hidden_size = hidden_size 40 | self.intermediate_size = intermediate_size 41 | self.num_hidden_layers = num_hidden_layers 42 | self.num_attention_heads = num_attention_heads 43 | self.emb_dropout_prob = emb_dropout_prob 44 | self.attn_dropout_prob = attn_dropout_prob 45 | self.layer_norm_epsilon = layer_norm_epsilon 46 | self.initializer_range = initializer_range 47 | self.scale_attn_weights = scale_attn_weights 48 | self.use_cache = use_cache 49 | self.max_position_embeddings = max_position_embeddings 50 | self.bf16 = bf16 51 | self.fp16 = fp16 52 | self.fp32 = fp32 53 | self.kv_channels = kv_channels 54 | self.rotary_pct = rotary_pct 55 | self.rotary_emb_base = rotary_emb_base 56 | self.use_dynamic_ntk = use_dynamic_ntk 57 | self.use_logn_attn = use_logn_attn 58 | self.use_flash_attn = use_flash_attn 59 | self.no_bias = no_bias 60 | super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) 61 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/yi.py: -------------------------------------------------------------------------------- 1 | """ Yi model configuration""" 2 | from transformers.configuration_utils import PretrainedConfig 3 | from transformers.utils import logging 4 | 5 | logger = logging.get_logger(__name__) 6 | 7 | Yi_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 8 | 9 | 10 | class YiConfig(PretrainedConfig): 11 | r""" 12 | Reference: 13 | https://huggingface.co/01-ai/Yi-6B/blob/main/configuration_yi.py 14 | """ 15 | model_type = "Yi" 16 | keys_to_ignore_at_inference = ["past_key_values"] 17 | 18 | def __init__( 19 | self, 20 | vocab_size=64000, 21 | hidden_size=4096, 22 | intermediate_size=11008, 23 | num_hidden_layers=32, 24 | num_attention_heads=32, 25 | num_key_value_heads=4, 26 | hidden_act="silu", 27 | max_position_embeddings=4096, 28 | initializer_range=0.02, 29 | rms_norm_eps=1e-5, 30 | use_cache=True, 31 | pad_token_id=0, 32 | bos_token_id=1, 33 | eos_token_id=2, 34 | tie_word_embeddings=False, 35 | output_attentions=False, 36 | rope_theta=5000000.0, 37 | **kwargs, 38 | ): 39 | self.vocab_size = vocab_size 40 | self.max_position_embeddings = max_position_embeddings 41 | self.hidden_size = hidden_size 42 | self.intermediate_size = intermediate_size 43 | self.num_hidden_layers = num_hidden_layers 44 | self.num_attention_heads = num_attention_heads 45 | 46 | # for backward compatibility 47 | if num_key_value_heads is None: 48 | num_key_value_heads = num_attention_heads 49 | 50 | self.num_key_value_heads = num_key_value_heads 51 | self.hidden_act = hidden_act 52 | 
self.initializer_range = initializer_range 53 | self.rms_norm_eps = rms_norm_eps 54 | self.use_cache = use_cache 55 | self.output_attentions = output_attentions 56 | self.rope_theta = rope_theta 57 | 58 | super().__init__( 59 | pad_token_id=pad_token_id, 60 | bos_token_id=bos_token_id, 61 | eos_token_id=eos_token_id, 62 | tie_word_embeddings=tie_word_embeddings, 63 | **kwargs, 64 | ) 65 | -------------------------------------------------------------------------------- /vllm/utils.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import uuid 3 | from platform import uname 4 | 5 | import psutil 6 | import torch 7 | 8 | from vllm._C import cuda_utils 9 | 10 | 11 | class Device(enum.Enum): 12 | GPU = enum.auto() 13 | CPU = enum.auto() 14 | 15 | 16 | class Counter: 17 | 18 | def __init__(self, start: int = 0) -> None: 19 | self.counter = start 20 | 21 | def __next__(self) -> int: 22 | i = self.counter 23 | self.counter += 1 24 | return i 25 | 26 | def reset(self) -> None: 27 | self.counter = 0 28 | 29 | 30 | def get_max_shared_memory_bytes(gpu: int = 0) -> int: 31 | """Returns the maximum shared memory per thread block in bytes.""" 32 | # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 33 | cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 34 | max_shared_mem = cuda_utils.get_device_attribute( 35 | cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu) 36 | return int(max_shared_mem) 37 | 38 | 39 | def get_gpu_memory(gpu: int = 0) -> int: 40 | """Returns the total memory of the GPU in bytes.""" 41 | return torch.cuda.get_device_properties(gpu).total_memory 42 | 43 | 44 | def get_cpu_memory() -> int: 45 | """Returns the total CPU memory of the node in bytes.""" 46 | return psutil.virtual_memory().total 47 | 48 | 49 | def random_uuid() -> str: 50 | return str(uuid.uuid4().hex) 51 | 52 | 53 | def in_wsl() -> bool: 54 | # Reference: https://github.com/microsoft/WSL/issues/4071 55 | return "microsoft" in " ".join(uname()).lower() 56 | -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/worker/__init__.py --------------------------------------------------------------------------------
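For reference, the RMSNorm docstring in vllm/model_executor/layers/layernorm.py above gives the formula x -> w * x / sqrt(E[x^2] + eps). The sketch below restates that math in plain PyTorch as an illustrative, unoptimized equivalent of what the custom ops.rms_norm kernel is expected to compute; the function name rms_norm_reference and the example tensors are assumptions for illustration, not code from the repository.

import torch

def rms_norm_reference(x: torch.Tensor,
                       weight: torch.Tensor,
                       eps: float = 1e-6) -> torch.Tensor:
    # E[x^2] taken over the hidden (last) dimension.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    # w * x / sqrt(E[x^2] + eps), matching the docstring in layernorm.py.
    return weight * (x * torch.rsqrt(variance + eps))

# Example: normalize a small batch of hidden states with a unit weight vector.
hidden = torch.randn(2, 8)
weight = torch.ones(8)
out = rms_norm_reference(hidden, weight)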