├── .github └── workflows │ ├── publish.yml │ ├── ruff.yml │ ├── scripts │ ├── build.sh │ ├── create_release.js │ ├── cuda-install.sh │ ├── env.sh │ └── pytorch-install.sh │ └── yapf.yml ├── .gitignore ├── .readthedocs.yaml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── benchmarks ├── README.md ├── benchmark_latency.py ├── benchmark_serving.py ├── benchmark_throughput.py ├── kernels │ └── benchmark_paged_attention.py └── launch_tgi_server.sh ├── csrc ├── activation_kernels.cu ├── attention │ ├── attention_dtypes.h │ ├── attention_generic.cuh │ ├── attention_kernels.cu │ ├── attention_utils.cuh │ ├── dtype_bfloat16.cuh │ ├── dtype_float16.cuh │ └── dtype_float32.cuh ├── cache.h ├── cache_kernels.cu ├── cuda_utils.h ├── cuda_utils_kernels.cu ├── dispatch_utils.h ├── layernorm_kernels.cu ├── ops.h ├── pos_encoding_kernels.cu ├── pybind.cpp ├── quantization │ ├── awq │ │ ├── dequantize.cuh │ │ └── gemm_kernels.cu │ └── squeezellm │ │ └── quant_cuda_kernel.cu └── reduction_utils.cuh ├── docs ├── Makefile ├── README.md ├── make.bat ├── requirements-docs.txt └── source │ ├── assets │ └── logos │ │ ├── vllm-logo-only-light.png │ │ ├── vllm-logo-text-dark.png │ │ └── vllm-logo-text-light.png │ ├── conf.py │ ├── getting_started │ ├── installation.rst │ └── quickstart.rst │ ├── index.rst │ ├── models │ ├── adding_model.rst │ ├── engine_args.rst │ └── supported_models.rst │ ├── quantization │ └── auto_awq.rst │ └── serving │ ├── deploying_with_docker.rst │ ├── deploying_with_triton.rst │ ├── distributed_serving.rst │ └── run_on_sky.rst ├── examples ├── api_client.py ├── gradio_webserver.py ├── llm_engine_example.py ├── offline_inference.py ├── openai_chatcompletion_client.py └── openai_completion_client.py ├── format.sh ├── mypy.ini ├── pyproject.toml ├── requirements-dev.txt ├── requirements.txt ├── setup.py ├── tests ├── __init__.py ├── async_engine │ ├── api_server_async_engine.py │ ├── test_api_server.py │ ├── test_async_llm_engine.py │ └── test_request_tracker.py ├── conftest.py ├── distributed │ └── test_comm_ops.py ├── engine │ └── test_detokenize.py ├── kernels │ ├── conftest.py │ ├── test_activation.py │ ├── test_attention.py │ ├── test_cache.py │ ├── test_layernorm.py │ └── test_pos_encoding.py ├── models │ └── test_models.py ├── samplers │ ├── test_beam_search.py │ ├── test_logprobs.py │ └── test_sampler.py ├── test_regression.py └── worker │ └── test_worker.py └── vllm ├── __init__.py ├── block.py ├── config.py ├── core ├── __init__.py ├── block_manager.py ├── policy.py └── scheduler.py ├── engine ├── __init__.py ├── arg_utils.py ├── async_llm_engine.py ├── llm_engine.py └── ray_utils.py ├── entrypoints ├── __init__.py ├── api_server.py ├── llm.py └── openai │ ├── __init__.py │ ├── api_server.py │ └── protocol.py ├── logger.py ├── model_executor ├── __init__.py ├── input_metadata.py ├── layers │ ├── __init__.py │ ├── activation.py │ ├── attention.py │ ├── layernorm.py │ ├── linear.py │ ├── quantization │ │ ├── __init__.py │ │ ├── awq.py │ │ ├── base_config.py │ │ └── squeezellm.py │ ├── rotary_embedding.py │ ├── sampler.py │ └── vocab_parallel_embedding.py ├── model_loader.py ├── models │ ├── __init__.py │ ├── aquila.py │ ├── baichuan.py │ ├── bloom.py │ ├── chatglm.py │ ├── falcon.py │ ├── gpt2.py │ ├── gpt_bigcode.py │ ├── gpt_j.py │ ├── gpt_neox.py │ ├── internlm.py │ ├── llama.py │ ├── mistral.py │ ├── mpt.py │ ├── opt.py │ ├── phi_1_5.py │ ├── qwen.py │ └── yi.py ├── parallel_utils │ ├── README.md │ ├── __init__.py │ ├── communication_op.py │ ├── 
parallel_state.py │ └── utils.py ├── utils.py └── weight_utils.py ├── outputs.py ├── py.typed ├── sampling_params.py ├── sequence.py ├── transformers_utils ├── __init__.py ├── config.py ├── configs │ ├── __init__.py │ ├── aquila.py │ ├── baichuan.py │ ├── chatglm.py │ ├── falcon.py │ ├── mpt.py │ ├── qwen.py │ └── yi.py └── tokenizer.py ├── utils.py └── worker ├── __init__.py ├── cache_engine.py └── worker.py /.github/workflows/publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package to Release asset 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions 3 | 4 | name: Create Release 5 | 6 | on: 7 | push: 8 | tags: 9 | - v* 10 | 11 | # Needed to create release and upload assets 12 | permissions: 13 | contents: write 14 | 15 | jobs: 16 | release: 17 | # Retrieve tag and create release 18 | name: Create Release 19 | runs-on: ubuntu-latest 20 | outputs: 21 | upload_url: ${{ steps.create_release.outputs.upload_url }} 22 | steps: 23 | - name: Checkout 24 | uses: actions/checkout@v3 25 | 26 | - name: Extract branch info 27 | shell: bash 28 | run: | 29 | echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV 30 | 31 | - name: Create Release 32 | id: create_release 33 | uses: "actions/github-script@v6" 34 | env: 35 | RELEASE_TAG: ${{ env.release_tag }} 36 | with: 37 | github-token: "${{ secrets.GITHUB_TOKEN }}" 38 | script: | 39 | const script = require('.github/workflows/scripts/create_release.js') 40 | await script(github, context, core) 41 | 42 | wheel: 43 | name: Build Wheel 44 | runs-on: ${{ matrix.os }} 45 | needs: release 46 | 47 | strategy: 48 | fail-fast: false 49 | matrix: 50 | os: ['ubuntu-20.04'] 51 | python-version: ['3.8', '3.9', '3.10', '3.11'] 52 | pytorch-version: ['2.1.0'] 53 | cuda-version: ['11.8', '12.1'] 54 | 55 | steps: 56 | - name: Checkout 57 | uses: actions/checkout@v3 58 | 59 | - name: Set up Linux Env 60 | if: ${{ runner.os == 'Linux' }} 61 | run: | 62 | bash -x .github/workflows/scripts/env.sh 63 | 64 | - name: Set up Python 65 | uses: actions/setup-python@v4 66 | with: 67 | python-version: ${{ matrix.python-version }} 68 | 69 | - name: Install CUDA ${{ matrix.cuda-version }} 70 | run: | 71 | bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }} 72 | 73 | - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }} 74 | run: | 75 | bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }} 76 | 77 | - name: Build wheel 78 | shell: bash 79 | run: | 80 | bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }} 81 | wheel_name=$(ls dist/*whl | xargs -n 1 basename) 82 | asset_name=${wheel_name//"linux"/"manylinux1"} 83 | echo "wheel_name=${wheel_name}" >> $GITHUB_ENV 84 | echo "asset_name=${asset_name}" >> $GITHUB_ENV 85 | 86 | - name: Upload Release Asset 87 | uses: actions/upload-release-asset@v1 88 | env: 89 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 90 | with: 91 | upload_url: ${{ needs.release.outputs.upload_url }} 92 | asset_path: ./dist/${{ env.wheel_name }} 93 | asset_name: ${{ env.asset_name }} 94 | asset_content_type: application/* 95 | 96 | # (Danielkinz): This last step will publish the .whl to pypi. 
Warning: untested 97 | # - name: Publish package 98 | # uses: pypa/gh-action-pypi-publish@release/v1.8 99 | # with: 100 | # repository-url: https://test.pypi.org/legacy/ 101 | # password: ${{ secrets.PYPI_API_TOKEN }} 102 | # skip-existing: true 103 | -------------------------------------------------------------------------------- /.github/workflows/ruff.yml: -------------------------------------------------------------------------------- 1 | name: ruff 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | 13 | jobs: 14 | ruff: 15 | runs-on: ubuntu-latest 16 | strategy: 17 | matrix: 18 | python-version: ["3.10"] 19 | steps: 20 | - uses: actions/checkout@v2 21 | - name: Set up Python ${{ matrix.python-version }} 22 | uses: actions/setup-python@v2 23 | with: 24 | python-version: ${{ matrix.python-version }} 25 | - name: Install dependencies 26 | run: | 27 | python -m pip install --upgrade pip 28 | pip install ruff==0.1.5 29 | - name: Analysing the code with ruff 30 | run: | 31 | ruff vllm tests 32 | -------------------------------------------------------------------------------- /.github/workflows/scripts/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | cuda_home=/usr/local/cuda-$2 5 | 6 | # Update paths 7 | PATH=${cuda_home}/bin:$PATH 8 | LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH 9 | 10 | # Install requirements 11 | $python_executable -m pip install wheel packaging 12 | $python_executable -m pip install -r requirements.txt 13 | 14 | # Limit the number of parallel jobs to avoid OOM 15 | export MAX_JOBS=1 16 | 17 | # Build 18 | $python_executable setup.py bdist_wheel --dist-dir=dist 19 | -------------------------------------------------------------------------------- /.github/workflows/scripts/create_release.js: -------------------------------------------------------------------------------- 1 | // Uses Github's API to create the release and wait for result. 2 | // We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately. 3 | 4 | module.exports = async (github, context, core) => { 5 | try { 6 | const response = await github.rest.repos.createRelease({ 7 | draft: false, 8 | generate_release_notes: true, 9 | name: process.env.RELEASE_TAG, 10 | owner: context.repo.owner, 11 | prerelease: false, 12 | repo: context.repo.repo, 13 | tag_name: process.env.RELEASE_TAG, 14 | }); 15 | 16 | core.setOutput('upload_url', response.data.upload_url); 17 | } catch (error) { 18 | core.setFailed(error.message); 19 | } 20 | } -------------------------------------------------------------------------------- /.github/workflows/scripts/cuda-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Replace '.' with '-' ex: 11.8 -> 11-8 4 | cuda_version=$(echo $1 | tr "." "-") 5 | # Removes '-' and '.' 
ex: ubuntu-20.04 -> ubuntu2004 6 | OS=$(echo $2 | tr -d ".\-") 7 | 8 | # Installs CUDA 9 | wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb 10 | sudo dpkg -i cuda-keyring_1.1-1_all.deb 11 | rm cuda-keyring_1.1-1_all.deb 12 | sudo apt -qq update 13 | sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version} 14 | sudo apt clean 15 | 16 | # Test nvcc 17 | PATH=/usr/local/cuda-$1/bin:${PATH} 18 | nvcc --version 19 | 20 | # Log gcc, g++, c++ versions 21 | gcc --version 22 | g++ --version 23 | c++ --version 24 | -------------------------------------------------------------------------------- /.github/workflows/scripts/env.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # This file installs common linux environment tools 4 | 5 | export LANG C.UTF-8 6 | 7 | # python_version=$1 8 | 9 | sudo apt-get update && \ 10 | sudo apt-get install -y --no-install-recommends \ 11 | software-properties-common \ 12 | 13 | sudo apt-get install -y --no-install-recommends \ 14 | build-essential \ 15 | apt-utils \ 16 | ca-certificates \ 17 | wget \ 18 | git \ 19 | vim \ 20 | libssl-dev \ 21 | curl \ 22 | unzip \ 23 | unrar \ 24 | cmake \ 25 | net-tools \ 26 | sudo \ 27 | autotools-dev \ 28 | rsync \ 29 | jq \ 30 | openssh-server \ 31 | tmux \ 32 | screen \ 33 | htop \ 34 | pdsh \ 35 | openssh-client \ 36 | lshw \ 37 | dmidecode \ 38 | util-linux \ 39 | automake \ 40 | autoconf \ 41 | libtool \ 42 | net-tools \ 43 | pciutils \ 44 | libpci-dev \ 45 | libaio-dev \ 46 | libcap2 \ 47 | libtinfo5 \ 48 | fakeroot \ 49 | devscripts \ 50 | debhelper \ 51 | nfs-common 52 | 53 | # Remove github bloat files to free up disk space 54 | sudo rm -rf "/usr/local/share/boost" 55 | sudo rm -rf "$AGENT_TOOLSDIRECTORY" 56 | sudo rm -rf "/usr/share/dotnet" 57 | -------------------------------------------------------------------------------- /.github/workflows/scripts/pytorch-install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python_executable=python$1 4 | pytorch_version=$2 5 | cuda_version=$3 6 | 7 | # Install torch 8 | $python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya 9 | $python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./} 10 | 11 | # Print version information 12 | $python_executable --version 13 | $python_executable -c "import torch; print('PyTorch:', torch.__version__)" 14 | $python_executable -c "import torch; print('CUDA:', torch.version.cuda)" 15 | $python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)" 16 | -------------------------------------------------------------------------------- /.github/workflows/yapf.yml: -------------------------------------------------------------------------------- 1 | name: yapf 2 | 3 | on: 4 | # Trigger the workflow on push or pull request, 5 | # but only for the main branch 6 | push: 7 | branches: 8 | - main 9 | pull_request: 10 | branches: 11 | - main 12 | jobs: 13 | yapf: 14 | runs-on: ubuntu-latest 15 | strategy: 16 | matrix: 17 | python-version: ["3.10"] 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v2 22 | with: 23 | python-version: 
${{ matrix.python-version }} 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install yapf==0.32.0 28 | pip install toml==0.10.2 29 | - name: Running yapf 30 | run: | 31 | yapf --diff --recursive vllm tests 32 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | .idea/ 161 | 162 | # VSCode 163 | .vscode/ 164 | 165 | # DS Store 166 | .DS_Store 167 | 168 | # Results 169 | *.csv 170 | 171 | # Python pickle files 172 | *.pkl 173 | 174 | # Sphinx documentation 175 | _build/ 176 | 177 | # vim swap files 178 | *.swo 179 | *.swp 180 | -------------------------------------------------------------------------------- /.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | # Read the Docs configuration file 2 | # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details 3 | 4 | version: 2 5 | 6 | build: 7 | os: ubuntu-22.04 8 | tools: 9 | python: "3.8" 10 | 11 | sphinx: 12 | configuration: docs/source/conf.py 13 | 14 | # If using Sphinx, optionally build your docs in additional formats such as PDF 15 | formats: 16 | - pdf 17 | 18 | # Optionally declare the Python requirements required to build your docs 19 | python: 20 | install: 21 | - requirements: docs/requirements-docs.txt 22 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to vLLM 2 | 3 | Thank you for your interest in contributing to vLLM! 4 | Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. 5 | There are several ways you can contribute to the project: 6 | 7 | - Identify and report any issues or bugs. 8 | - Request or add a new model. 9 | - Suggest or implement new features. 10 | 11 | However, remember that contributions aren't just about code. 12 | We believe in the power of community support; thus, answering queries, assisting others, and enhancing the documentation are highly regarded and beneficial contributions. 13 | 14 | Finally, one of the most impactful ways to support us is by raising awareness about vLLM. 15 | Talk about it in your blog posts, highlighting how it's driving your incredible projects. 16 | Express your support on Twitter if vLLM aids you, or simply offer your appreciation by starring our repository. 17 | 18 | 19 | ## Setup for development 20 | 21 | ### Build from source 22 | 23 | ```bash 24 | pip install -r requirements.txt 25 | pip install -e . # This may take several minutes. 
26 | ``` 27 | 28 | ### Testing 29 | 30 | ```bash 31 | pip install -r requirements-dev.txt 32 | 33 | # Static type checking 34 | mypy 35 | # Unit tests 36 | pytest tests/ 37 | ``` 38 | **Note:** Currently, the repository does not pass the mypy tests. 39 | 40 | 41 | ## Contributing Guidelines 42 | 43 | ### Issue Reporting 44 | 45 | If you encounter a bug or have a feature request, please check our issues page first to see if someone else has already reported it. 46 | If not, please file a new issue, providing as much relevant information as possible. 47 | 48 | ### Coding Style Guide 49 | 50 | In general, we adhere to [Google Python style guide](https://google.github.io/styleguide/pyguide.html) and [Google C++ style guide](https://google.github.io/styleguide/cppguide.html). 51 | 52 | We include a formatting script [`format.sh`](./format.sh) to format the code. 53 | 54 | ### Pull Requests 55 | 56 | When submitting a pull request: 57 | 58 | 1. Make sure your code has been rebased on top of the latest commit on the main branch. 59 | 2. Ensure code is properly formatted by running [`format.sh`](./format.sh). 60 | 3. Include a detailed description of the changes in the pull request. 61 | Explain why you made the changes you did. 62 | If your pull request fixes an open issue, please include a reference to it in the description. 63 | 64 | ### Code Reviews 65 | 66 | All submissions, including submissions by project members, require a code review. 67 | To make the review process as smooth as possible, please: 68 | 69 | 1. Keep your changes as concise as possible. 70 | If your pull request involves multiple unrelated changes, consider splitting it into separate pull requests. 71 | 2. Respond to all comments within a reasonable time frame. 72 | If a comment isn't clear or you disagree with a suggestion, feel free to ask for clarification or discuss the suggestion. 73 | 74 | ### Thank You 75 | 76 | Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM. 77 | Your contributions make vLLM a great tool for everyone! 
78 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev 2 | 3 | RUN apt-get update -y \ 4 | && apt-get install -y python3-pip 5 | 6 | WORKDIR /workspace 7 | 8 | # install build and runtime dependencies 9 | COPY requirements.txt requirements.txt 10 | RUN --mount=type=cache,target=/root/.cache/pip \ 11 | pip install -r requirements.txt 12 | 13 | # install development dependencies 14 | COPY requirements-dev.txt requirements-dev.txt 15 | RUN --mount=type=cache,target=/root/.cache/pip \ 16 | pip install -r requirements-dev.txt 17 | 18 | # image to build pytorch extensions 19 | FROM dev AS build 20 | 21 | # copy input files 22 | COPY csrc csrc 23 | COPY setup.py setup.py 24 | COPY requirements.txt requirements.txt 25 | COPY pyproject.toml pyproject.toml 26 | COPY vllm/__init__.py vllm/__init__.py 27 | 28 | # max jobs used by Ninja to build extensions 29 | ENV MAX_JOBS=$max_jobs 30 | RUN python3 setup.py build_ext --inplace 31 | 32 | # image to run unit testing suite 33 | FROM dev AS test 34 | 35 | # copy pytorch extensions separately to avoid having to rebuild 36 | # when python code changes 37 | COPY --from=build /workspace/vllm/*.so /workspace/vllm/ 38 | COPY tests tests 39 | COPY vllm vllm 40 | 41 | ENTRYPOINT ["python3", "-m", "pytest", "tests"] 42 | 43 | # use CUDA base as CUDA runtime dependencies are already installed via pip 44 | FROM nvidia/cuda:12.1.0-base-ubuntu22.04 AS vllm-base 45 | 46 | # libnccl required for ray 47 | RUN apt-get update -y \ 48 | && apt-get install -y python3-pip 49 | 50 | WORKDIR /workspace 51 | COPY requirements.txt requirements.txt 52 | RUN --mount=type=cache,target=/root/.cache/pip \ 53 | pip install -r requirements.txt 54 | 55 | FROM vllm-base AS vllm 56 | COPY --from=build /workspace/vllm/*.so /workspace/vllm/ 57 | COPY vllm vllm 58 | 59 | EXPOSE 8000 60 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.api_server"] 61 | 62 | # openai api server alternative 63 | FROM vllm-base AS vllm-openai 64 | # install additional dependencies for openai api server 65 | RUN --mount=type=cache,target=/root/.cache/pip \ 66 | pip install accelerate fschat 67 | 68 | COPY --from=build /workspace/vllm/*.so /workspace/vllm/ 69 | COPY vllm vllm 70 | 71 | ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] 72 | 73 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include requirements.txt 3 | 4 | recursive-include csrc * 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | 4 | vLLM 5 | 6 | 7 | 8 |
9 | Easy, fast, and cheap LLM serving for everyone 10 | 11 | 12 |
13 | | Documentation | Blog | Paper | Discord | 14 | 15 |

16 | 17 | --- 18 | 19 | *Latest News* 🔥 20 | - [2023/10] We hosted [the first vLLM meetup](https://lu.ma/first-vllm-meetup) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/1QL-XPFXiFpDBh86DbEegFXBXFXjix4v032GhShbKf3s/edit?usp=sharing). 21 | - [2023/09] We created our [Discord server](https://discord.gg/jz7wjKhh6g)! Join us to discuss vLLM and LLM serving! We will also post the latest announcements and updates there. 22 | - [2023/09] We released our [PagedAttention paper](https://arxiv.org/abs/2309.06180) on arXiv! 23 | - [2023/08] We would like to express our sincere gratitude to [Andreessen Horowitz](https://a16z.com/2023/08/30/supporting-the-open-source-ai-community/) (a16z) for providing a generous grant to support the open-source development and research of vLLM. 24 | - [2023/07] Added support for LLaMA-2! You can run and serve 7B/13B/70B LLaMA-2s on vLLM with a single command! 25 | - [2023/06] Serving vLLM On any Cloud with SkyPilot. Check out a 1-click [example](https://github.com/skypilot-org/skypilot/blob/master/llm/vllm) to start the vLLM demo, and the [blog post](https://blog.skypilot.co/serving-llm-24x-faster-on-the-cloud-with-vllm-and-skypilot/) for the story behind vLLM development on the clouds. 26 | - [2023/06] We officially released vLLM! FastChat-vLLM integration has powered [LMSYS Vicuna and Chatbot Arena](https://chat.lmsys.org) since mid-April. Check out our [blog post](https://vllm.ai). 27 | 28 | --- 29 | 30 | vLLM is a fast and easy-to-use library for LLM inference and serving. 31 | 32 | vLLM is fast with: 33 | 34 | - State-of-the-art serving throughput 35 | - Efficient management of attention key and value memory with **PagedAttention** 36 | - Continuous batching of incoming requests 37 | - Optimized CUDA kernels 38 | 39 | vLLM is flexible and easy to use with: 40 | 41 | - Seamless integration with popular Hugging Face models 42 | - High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more 43 | - Tensor parallelism support for distributed inference 44 | - Streaming outputs 45 | - OpenAI-compatible API server 46 | 47 | vLLM seamlessly supports many Hugging Face models, including the following architectures: 48 | 49 | - Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.) 50 | - Baichuan (`baichuan-inc/Baichuan-7B`, `baichuan-inc/Baichuan-13B-Chat`, etc.) 51 | - BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.) 52 | - ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.) 53 | - Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.) 54 | - GPT-2 (`gpt2`, `gpt2-xl`, etc.) 55 | - GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.) 56 | - GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.) 57 | - GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.) 58 | - InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.) 59 | - LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.) 60 | - Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.) 61 | - MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.) 62 | - OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.) 63 | - Phi-1.5 (`microsoft/phi-1_5`, etc.) 64 | - Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.) 65 | - Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.) 
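
Once installed (installation instructions follow below), the models above are all driven through the same Python API. Here is a minimal offline-inference sketch in the spirit of `examples/offline_inference.py`; the model name, prompt, and sampling values are placeholders:

```python
from vllm import LLM, SamplingParams

# Any architecture listed above can be passed by its Hugging Face model name.
llm = LLM(model="facebook/opt-125m")  # placeholder model
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=128)

# generate() batches the prompts and returns one RequestOutput per prompt.
outputs = llm.generate(["The capital of France is"], sampling_params)
for output in outputs:
    print(output.prompt, output.outputs[0].text)
```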
66 | 67 | Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source): 68 | 69 | ```bash 70 | pip install vllm 71 | ``` 72 | 73 | ## Getting Started 74 | 75 | Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started. 76 | - [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html) 77 | - [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html) 78 | - [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html) 79 | 80 | ## Contributing 81 | 82 | We welcome and value any contributions and collaborations. 83 | Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved. 84 | 85 | ## Citation 86 | 87 | If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180): 88 | ```bibtex 89 | @inproceedings{kwon2023efficient, 90 | title={Efficient Memory Management for Large Language Model Serving with PagedAttention}, 91 | author={Woosuk Kwon and Zhuohan Li and Siyuan Zhuang and Ying Sheng and Lianmin Zheng and Cody Hao Yu and Joseph E. Gonzalez and Hao Zhang and Ion Stoica}, 92 | booktitle={Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles}, 93 | year={2023} 94 | } 95 | ``` 96 | -------------------------------------------------------------------------------- /benchmarks/README.md: -------------------------------------------------------------------------------- 1 | # Benchmarking vLLM 2 | 3 | ## Downloading the ShareGPT dataset 4 | 5 | You can download the dataset by running: 6 | ```bash 7 | wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json 8 | ``` 9 | -------------------------------------------------------------------------------- /benchmarks/benchmark_latency.py: -------------------------------------------------------------------------------- 1 | """Benchmark the latency of processing a single batch of requests.""" 2 | import argparse 3 | import time 4 | 5 | import numpy as np 6 | import torch 7 | from tqdm import tqdm 8 | 9 | from vllm import LLM, SamplingParams 10 | 11 | 12 | def main(args: argparse.Namespace): 13 | print(args) 14 | 15 | # Process all the requests in a single batch if possible. 16 | # NOTE(woosuk): If the request cannot be processed in a single batch, 17 | # the engine will automatically process the request in multiple batches. 
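    # The engine arguments below cap concurrency at the benchmark batch size and
    # budget exactly batch_size * input_len prompt tokens, so the whole batch can
    # be admitted in a single scheduling step when memory allows.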
18 | llm = LLM( 19 | model=args.model, 20 | tokenizer=args.tokenizer, 21 | quantization=args.quantization, 22 | tensor_parallel_size=args.tensor_parallel_size, 23 | max_num_seqs=args.batch_size, 24 | max_num_batched_tokens=args.batch_size * args.input_len, 25 | trust_remote_code=args.trust_remote_code, 26 | dtype=args.dtype, 27 | ) 28 | 29 | sampling_params = SamplingParams( 30 | n=args.n, 31 | temperature=0.0 if args.use_beam_search else 1.0, 32 | top_p=1.0, 33 | use_beam_search=args.use_beam_search, 34 | ignore_eos=True, 35 | max_tokens=args.output_len, 36 | ) 37 | print(sampling_params) 38 | dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size 39 | 40 | def run_to_completion(profile: bool = False): 41 | if profile: 42 | torch.cuda.cudart().cudaProfilerStart() 43 | start_time = time.perf_counter() 44 | 45 | llm.generate(prompt_token_ids=dummy_prompt_token_ids, 46 | sampling_params=sampling_params, 47 | use_tqdm=False) 48 | 49 | end_time = time.perf_counter() 50 | latency = end_time - start_time 51 | if profile: 52 | torch.cuda.cudart().cudaProfilerStop() 53 | return latency 54 | 55 | print("Warming up...") 56 | run_to_completion(profile=False) 57 | 58 | # Benchmark. 59 | latencies = [] 60 | for _ in tqdm(range(args.num_iters), desc="Profiling iterations"): 61 | latencies.append(run_to_completion(profile=False)) 62 | print(f'Avg latency: {np.mean(latencies)} seconds') 63 | 64 | 65 | if __name__ == '__main__': 66 | parser = argparse.ArgumentParser( 67 | description='Benchmark the latency of processing a single batch of ' 68 | 'requests till completion.') 69 | parser.add_argument('--model', type=str, default='facebook/opt-125m') 70 | parser.add_argument('--tokenizer', type=str, default=None) 71 | parser.add_argument('--quantization', 72 | '-q', 73 | choices=['awq', 'squeezellm', None], 74 | default=None) 75 | parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1) 76 | parser.add_argument('--input-len', type=int, default=32) 77 | parser.add_argument('--output-len', type=int, default=128) 78 | parser.add_argument('--batch-size', type=int, default=8) 79 | parser.add_argument('--n', 80 | type=int, 81 | default=1, 82 | help='Number of generated sequences per prompt.') 83 | parser.add_argument('--use-beam-search', action='store_true') 84 | parser.add_argument('--num-iters', 85 | type=int, 86 | default=3, 87 | help='Number of iterations to run.') 88 | parser.add_argument('--trust-remote-code', 89 | action='store_true', 90 | help='trust remote code from huggingface') 91 | parser.add_argument( 92 | '--dtype', 93 | type=str, 94 | default='auto', 95 | choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'], 96 | help='data type for model weights and activations. 
' 97 | 'The "auto" option will use FP16 precision ' 98 | 'for FP32 and FP16 models, and BF16 precision ' 99 | 'for BF16 models.') 100 | args = parser.parse_args() 101 | main(args) 102 | -------------------------------------------------------------------------------- /benchmarks/launch_tgi_server.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | PORT=8000 4 | MODEL=$1 5 | TOKENS=$2 6 | 7 | docker run --gpus all --shm-size 1g -p $PORT:80 \ 8 | -v $PWD/data:/data \ 9 | ghcr.io/huggingface/text-generation-inference:0.8 \ 10 | --model-id $MODEL \ 11 | --sharded false \ 12 | --max-input-length 1024 \ 13 | --max-total-tokens 2048 \ 14 | --max-best-of 5 \ 15 | --max-concurrent-requests 5000 \ 16 | --max-batch-total-tokens $TOKENS 17 | -------------------------------------------------------------------------------- /csrc/activation_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template 9 | __device__ __forceinline__ T silu(const T& x) { 10 | // x * sigmoid(x) 11 | return (T) (((float) x) / (1.0f + expf((float) -x))); 12 | } 13 | 14 | template 15 | __global__ void silu_and_mul_kernel( 16 | scalar_t* __restrict__ out, // [..., d] 17 | const scalar_t* __restrict__ input, // [..., 2, d] 18 | const int d) { 19 | const int64_t token_idx = blockIdx.x; 20 | for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { 21 | const scalar_t x = __ldg(&input[token_idx * 2 * d + idx]); 22 | const scalar_t y = __ldg(&input[token_idx * 2 * d + d + idx]); 23 | out[token_idx * d + idx] = silu(x) * y; 24 | } 25 | } 26 | 27 | } // namespace vllm 28 | 29 | void silu_and_mul( 30 | torch::Tensor& out, // [..., d] 31 | torch::Tensor& input) // [..., 2 * d] 32 | { 33 | int64_t num_tokens = input.numel() / input.size(-1); 34 | int d = input.size(-1) / 2; 35 | 36 | dim3 grid(num_tokens); 37 | dim3 block(std::min(d, 1024)); 38 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 39 | VLLM_DISPATCH_FLOATING_TYPES( 40 | input.scalar_type(), 41 | "silu_and_mul_kernel", 42 | [&] { 43 | vllm::silu_and_mul_kernel<<>>( 44 | out.data_ptr(), 45 | input.data_ptr(), 46 | d); 47 | }); 48 | } 49 | 50 | namespace vllm { 51 | 52 | // Element-wise activation kernel template. 53 | template 54 | __global__ void activation_kernel( 55 | scalar_t* __restrict__ out, // [..., d] 56 | const scalar_t* __restrict__ input, // [..., d] 57 | const int d) { 58 | const int64_t token_idx = blockIdx.x; 59 | for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) { 60 | const scalar_t x = __ldg(&input[token_idx * d + idx]); 61 | out[token_idx * d + idx] = ACT_FN(x); 62 | } 63 | } 64 | 65 | } // namespace vllm 66 | 67 | // Launch element-wise activation kernel. 
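// The macro below dispatches on the input's floating-point type (float, half,
// bfloat16 via VLLM_DISPATCH_FLOATING_TYPES) and launches KERNEL<scalar_t> with
// one thread block per token; up to 1024 threads stride over the hidden size d.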
68 | #define LAUNCH_ACTIVATION_KERNEL(KERNEL) \ 69 | int d = input.size(-1); \ 70 | int64_t num_tokens = input.numel() / d; \ 71 | dim3 grid(num_tokens); \ 72 | dim3 block(std::min(d, 1024)); \ 73 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \ 74 | VLLM_DISPATCH_FLOATING_TYPES( \ 75 | input.scalar_type(), \ 76 | "activation_kernel", \ 77 | [&] { \ 78 | vllm::activation_kernel><<>>( \ 79 | out.data_ptr(), \ 80 | input.data_ptr(), \ 81 | d); \ 82 | }); 83 | 84 | namespace vllm { 85 | 86 | template 87 | __device__ __forceinline__ T gelu_new_kernel(const T& x) { 88 | const float x3 = (float) (x * x * x); 89 | const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3)))); 90 | return ((T) 0.5) * x * (((T) 1.0) + t); 91 | } 92 | 93 | template 94 | __device__ __forceinline__ T gelu_fast_kernel(const T& x) { 95 | const float f = (float) x; 96 | const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x)); 97 | return ((T) 0.5) * x * (((T) 1.0) + t); 98 | } 99 | 100 | } // namespace vllm 101 | 102 | void gelu_new( 103 | torch::Tensor& out, // [..., d] 104 | torch::Tensor& input) // [..., d] 105 | { 106 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel); 107 | } 108 | 109 | void gelu_fast( 110 | torch::Tensor& out, // [..., d] 111 | torch::Tensor& input) // [..., d] 112 | { 113 | LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel); 114 | } 115 | -------------------------------------------------------------------------------- /csrc/attention/attention_dtypes.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include "attention_generic.cuh" 4 | #include "dtype_float16.cuh" 5 | #include "dtype_float32.cuh" 6 | #include "dtype_bfloat16.cuh" 7 | -------------------------------------------------------------------------------- /csrc/attention/attention_generic.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include 21 | 22 | namespace vllm { 23 | 24 | // A vector type to store Q, K, V elements. 25 | template 26 | struct Vec {}; 27 | 28 | // A vector type to store FP32 accumulators. 29 | template 30 | struct FloatVec {}; 31 | 32 | // Template vector operations. 
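// mul and sum are only declared generically here; their specializations for each
// data type (along with the vector types and fma) live in dtype_float32.cuh,
// dtype_float16.cuh, and dtype_bfloat16.cuh, included through attention_dtypes.h.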
33 | template 34 | inline __device__ Acc mul(A a, B b); 35 | 36 | template 37 | inline __device__ float sum(T v); 38 | 39 | template 40 | inline __device__ float dot(T a, T b) { 41 | return sum(mul(a, b)); 42 | } 43 | 44 | template 45 | inline __device__ float dot(T a, T b) { 46 | return sum(mul(a, b)); 47 | } 48 | 49 | template 50 | inline __device__ void zero(T& dst) { 51 | constexpr int WORDS = sizeof(T) / 4; 52 | union { 53 | T raw; 54 | uint32_t words[WORDS]; 55 | } tmp; 56 | 57 | #pragma unroll 58 | for (int ii = 0; ii < WORDS; ++ii) { 59 | tmp.words[ii] = 0u; 60 | } 61 | dst = tmp.raw; 62 | } 63 | 64 | } // namespace vllm 65 | -------------------------------------------------------------------------------- /csrc/attention/attention_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | #include "attention_dtypes.h" 21 | 22 | #include 23 | #include 24 | 25 | namespace vllm { 26 | 27 | // Q*K^T operation. 28 | template 29 | inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) { 30 | using A_vec = typename FloatVec::Type; 31 | // Compute the parallel products for Q*K^T (treat vector lanes separately). 32 | A_vec qk_vec = mul(q[0], k[0]); 33 | #pragma unroll 34 | for (int ii = 1; ii < N; ++ii) { 35 | qk_vec = fma(q[ii], k[ii], qk_vec); 36 | } 37 | 38 | // Finalize the reduction across lanes. 
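// sum() first collapses the per-thread vector accumulator into a scalar; the
// __shfl_xor_sync butterfly below then adds up the partial results of the
// THREAD_GROUP_SIZE lanes cooperating on this query/key pair, so every lane in
// the group ends up holding the complete dot product.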
39 | float qk = sum(qk_vec); 40 | #pragma unroll 41 | for (int mask = THREAD_GROUP_SIZE / 2; mask >= 1; mask /= 2) { 42 | qk += __shfl_xor_sync(uint32_t(-1), qk, mask); 43 | } 44 | return qk; 45 | } 46 | 47 | template 48 | struct Qk_dot { 49 | template 50 | static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) { 51 | return qk_dot_(q, k); 52 | } 53 | }; 54 | 55 | } // namespace vllm 56 | -------------------------------------------------------------------------------- /csrc/cache.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #include 4 | #include 5 | 6 | void swap_blocks( 7 | torch::Tensor& src, 8 | torch::Tensor& dst, 9 | const std::map& block_mapping); 10 | 11 | void copy_blocks( 12 | std::vector& key_caches, 13 | std::vector& value_caches, 14 | const std::map>& block_mapping); 15 | 16 | void reshape_and_cache( 17 | torch::Tensor& key, 18 | torch::Tensor& value, 19 | torch::Tensor& key_cache, 20 | torch::Tensor& value_cache, 21 | torch::Tensor& slot_mapping); 22 | 23 | void gather_cached_kv( 24 | torch::Tensor& key, 25 | torch::Tensor& value, 26 | torch::Tensor& key_cache, 27 | torch::Tensor& value_cache, 28 | torch::Tensor& slot_mapping); 29 | -------------------------------------------------------------------------------- /csrc/cuda_utils.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | int get_device_attribute( 4 | int attribute, 5 | int device_id); 6 | -------------------------------------------------------------------------------- /csrc/cuda_utils_kernels.cu: -------------------------------------------------------------------------------- 1 | int get_device_attribute( 2 | int attribute, 3 | int device_id) 4 | { 5 | int device, value; 6 | if (device_id < 0) { 7 | cudaGetDevice(&device); 8 | } 9 | else { 10 | device = device_id; 11 | } 12 | cudaDeviceGetAttribute(&value, static_cast(attribute), device); 13 | return value; 14 | } 15 | -------------------------------------------------------------------------------- /csrc/dispatch_utils.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from 3 | * https://github.com/pytorch/pytorch/blob/v2.0.1/aten/src/ATen/Dispatch.h 4 | */ 5 | #include 6 | 7 | #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...) \ 8 | AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ 9 | AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \ 10 | AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) 11 | 12 | #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ 13 | AT_DISPATCH_SWITCH( \ 14 | TYPE, NAME, VLLM_DISPATCH_CASE_FLOATING_TYPES(__VA_ARGS__)) 15 | -------------------------------------------------------------------------------- /csrc/layernorm_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | #include "reduction_utils.cuh" 6 | 7 | namespace vllm { 8 | 9 | // TODO(woosuk): Further optimize this kernel. 
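// One thread block handles one token: the block accumulates the sum of squares
// over hidden_size, thread 0 stores rsqrt(mean + epsilon) in shared memory, and
// all threads then rescale the row and multiply by the learned weight.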
10 | template 11 | __global__ void rms_norm_kernel( 12 | scalar_t* __restrict__ out, // [..., hidden_size] 13 | const scalar_t* __restrict__ input, // [..., hidden_size] 14 | const scalar_t* __restrict__ weight, // [hidden_size] 15 | const float epsilon, 16 | const int num_tokens, 17 | const int hidden_size) { 18 | __shared__ float s_variance; 19 | float variance = 0.0f; 20 | 21 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 22 | const float x = (float) input[blockIdx.x * hidden_size + idx]; 23 | variance += x * x; 24 | } 25 | variance = blockReduceSum(variance); 26 | if (threadIdx.x == 0) { 27 | s_variance = rsqrtf(variance / hidden_size + epsilon); 28 | } 29 | __syncthreads(); 30 | 31 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 32 | float x = (float) input[blockIdx.x * hidden_size + idx]; 33 | out[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; 34 | } 35 | } 36 | 37 | // TODO: Further optimize this kernel. 38 | template 39 | __global__ void fused_add_rms_norm_kernel( 40 | scalar_t* __restrict__ input, // [..., hidden_size] 41 | scalar_t* __restrict__ residual, // [..., hidden_size] 42 | const scalar_t* __restrict__ weight, // [hidden_size] 43 | const float epsilon, 44 | const int num_tokens, 45 | const int hidden_size) { 46 | __shared__ float s_variance; 47 | float variance = 0.0f; 48 | 49 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 50 | float x = (float) input[blockIdx.x * hidden_size + idx]; 51 | x += (float) residual[blockIdx.x * hidden_size + idx]; 52 | variance += x * x; 53 | residual[blockIdx.x * hidden_size + idx] = (scalar_t) x; 54 | } 55 | variance = blockReduceSum(variance); 56 | if (threadIdx.x == 0) { 57 | s_variance = rsqrtf(variance / hidden_size + epsilon); 58 | } 59 | __syncthreads(); 60 | 61 | for (int idx = threadIdx.x; idx < hidden_size; idx += blockDim.x) { 62 | float x = (float) residual[blockIdx.x * hidden_size + idx]; 63 | input[blockIdx.x * hidden_size + idx] = ((scalar_t) (x * s_variance)) * weight[idx]; 64 | } 65 | } 66 | 67 | } // namespace vllm 68 | 69 | void rms_norm( 70 | torch::Tensor& out, // [..., hidden_size] 71 | torch::Tensor& input, // [..., hidden_size] 72 | torch::Tensor& weight, // [hidden_size] 73 | float epsilon) { 74 | int hidden_size = input.size(-1); 75 | int num_tokens = input.numel() / hidden_size; 76 | 77 | dim3 grid(num_tokens); 78 | dim3 block(std::min(hidden_size, 1024)); 79 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 80 | VLLM_DISPATCH_FLOATING_TYPES( 81 | input.scalar_type(), 82 | "rms_norm_kernel", 83 | [&] { 84 | vllm::rms_norm_kernel<<>>( 85 | out.data_ptr(), 86 | input.data_ptr(), 87 | weight.data_ptr(), 88 | epsilon, 89 | num_tokens, 90 | hidden_size); 91 | }); 92 | } 93 | 94 | void fused_add_rms_norm( 95 | torch::Tensor& input, // [..., hidden_size] 96 | torch::Tensor& residual, // [..., hidden_size] 97 | torch::Tensor& weight, // [hidden_size] 98 | float epsilon) { 99 | int hidden_size = input.size(-1); 100 | int num_tokens = input.numel() / hidden_size; 101 | 102 | dim3 grid(num_tokens); 103 | dim3 block(std::min(hidden_size, 1024)); 104 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 105 | VLLM_DISPATCH_FLOATING_TYPES( 106 | input.scalar_type(), 107 | "fused_add_rms_norm_kernel", 108 | [&] { 109 | vllm::fused_add_rms_norm_kernel<<>>( 110 | input.data_ptr(), 111 | residual.data_ptr(), 112 | weight.data_ptr(), 113 | epsilon, 114 | num_tokens, 115 | hidden_size); 116 | }); 117 | } 118 | 
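
For reference, a plain-PyTorch sketch of what the two kernels above compute (an illustrative reference written for this note, not code from the repo; the function names are made up):

```python
import torch

def rms_norm_ref(x: torch.Tensor, weight: torch.Tensor, eps: float) -> torch.Tensor:
    # out = x * rsqrt(mean(x^2 over the hidden dim) + eps) * weight
    variance = x.float().pow(2).mean(dim=-1, keepdim=True)
    return (x.float() * torch.rsqrt(variance + eps)).to(x.dtype) * weight

def fused_add_rms_norm_ref(x: torch.Tensor, residual: torch.Tensor,
                           weight: torch.Tensor, eps: float):
    # The fused kernel adds the residual first, writes the sum back as the new
    # residual, and returns the RMS-normalized sum in place of the input.
    residual = residual + x
    return rms_norm_ref(residual, weight, eps), residual
```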
-------------------------------------------------------------------------------- /csrc/ops.h: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | void paged_attention_v1( 4 | torch::Tensor& out, 5 | torch::Tensor& query, 6 | torch::Tensor& key_cache, 7 | torch::Tensor& value_cache, 8 | torch::Tensor& head_mapping, 9 | float scale, 10 | torch::Tensor& block_tables, 11 | torch::Tensor& context_lens, 12 | int block_size, 13 | int max_context_len, 14 | const c10::optional& alibi_slopes); 15 | 16 | void paged_attention_v2( 17 | torch::Tensor& out, 18 | torch::Tensor& exp_sums, 19 | torch::Tensor& max_logits, 20 | torch::Tensor& tmp_out, 21 | torch::Tensor& query, 22 | torch::Tensor& key_cache, 23 | torch::Tensor& value_cache, 24 | torch::Tensor& head_mapping, 25 | float scale, 26 | torch::Tensor& block_tables, 27 | torch::Tensor& context_lens, 28 | int block_size, 29 | int max_context_len, 30 | const c10::optional& alibi_slopes); 31 | 32 | void rms_norm( 33 | torch::Tensor& out, 34 | torch::Tensor& input, 35 | torch::Tensor& weight, 36 | float epsilon); 37 | 38 | void fused_add_rms_norm( 39 | torch::Tensor& input, 40 | torch::Tensor& residual, 41 | torch::Tensor& weight, 42 | float epsilon); 43 | 44 | void rotary_embedding( 45 | torch::Tensor& positions, 46 | torch::Tensor& query, 47 | torch::Tensor& key, 48 | int head_size, 49 | torch::Tensor& cos_sin_cache, 50 | bool is_neox); 51 | 52 | void silu_and_mul( 53 | torch::Tensor& out, 54 | torch::Tensor& input); 55 | 56 | void gelu_new( 57 | torch::Tensor& out, 58 | torch::Tensor& input); 59 | 60 | void gelu_fast( 61 | torch::Tensor& out, 62 | torch::Tensor& input); 63 | 64 | torch::Tensor awq_gemm( 65 | torch::Tensor _in_feats, 66 | torch::Tensor _kernel, 67 | torch::Tensor _scaling_factors, 68 | torch::Tensor _zeros, 69 | int split_k_iters); 70 | 71 | void squeezellm_gemm( 72 | torch::Tensor vec, 73 | torch::Tensor mat, 74 | torch::Tensor mul, 75 | torch::Tensor lookup_table); 76 | -------------------------------------------------------------------------------- /csrc/pos_encoding_kernels.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include "dispatch_utils.h" 5 | 6 | namespace vllm { 7 | 8 | template 9 | inline __device__ void apply_rotary_embedding( 10 | scalar_t* __restrict__ arr, 11 | const scalar_t* __restrict__ cos_ptr, 12 | const scalar_t* __restrict__ sin_ptr, 13 | int rot_offset, 14 | int embed_dim) 15 | { 16 | int x_index, y_index; 17 | scalar_t cos, sin; 18 | if (IS_NEOX) { 19 | // GPT-NeoX style rotary embedding. 20 | x_index = rot_offset; 21 | y_index = embed_dim + rot_offset; 22 | cos = __ldg(cos_ptr + x_index); 23 | sin = __ldg(sin_ptr + x_index); 24 | } else { 25 | // GPT-J style rotary embedding. 
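    // Interleaved layout: elements (2*i, 2*i+1) form the rotated pair and share
    // the i-th cos/sin entry, whereas the NeoX branch above pairs element i with
    // element i + embed_dim.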
26 | x_index = 2 * rot_offset; 27 | y_index = 2 * rot_offset + 1; 28 | cos = __ldg(cos_ptr + x_index / 2); 29 | sin = __ldg(sin_ptr + x_index / 2); 30 | } 31 | 32 | const scalar_t x = arr[x_index]; 33 | const scalar_t y = arr[y_index]; 34 | arr[x_index] = x * cos - y * sin; 35 | arr[y_index] = y * cos + x * sin; 36 | } 37 | 38 | template 39 | __global__ void rotary_embedding_kernel( 40 | const int64_t* __restrict__ positions, // [batch_size, seq_len] or [num_tokens] 41 | scalar_t* __restrict__ query, // [batch_size, seq_len, num_heads, head_size] or [num_tokens, num_heads, head_size] 42 | scalar_t* __restrict__ key, // [batch_size, seq_len, num_kv_heads, head_size] or [num_tokens, num_kv_heads, head_size] 43 | const scalar_t* __restrict__ cos_sin_cache, // [max_position, 2, rot_dim // 2] 44 | const int rot_dim, 45 | const int query_stride, 46 | const int key_stride, 47 | const int num_heads, 48 | const int num_kv_heads, 49 | const int head_size) { 50 | // Each thread block is responsible for one token. 51 | const int token_idx = blockIdx.x; 52 | int64_t pos = positions[token_idx]; 53 | const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim; 54 | 55 | const int embed_dim = rot_dim / 2; 56 | const scalar_t* cos_ptr = cache_ptr; 57 | const scalar_t* sin_ptr = cache_ptr + embed_dim; 58 | 59 | const int nq = num_heads * embed_dim; 60 | for (int i = threadIdx.x; i < nq; i += blockDim.x) { 61 | const int head_idx = i / embed_dim; 62 | const int token_head = token_idx * query_stride + head_idx * head_size; 63 | const int rot_offset = i % embed_dim; 64 | apply_rotary_embedding(query + token_head, cos_ptr, 65 | sin_ptr, rot_offset, embed_dim); 66 | } 67 | 68 | const int nk = num_kv_heads * embed_dim; 69 | for (int i = threadIdx.x; i < nk; i += blockDim.x) { 70 | const int head_idx = i / embed_dim; 71 | const int token_head = token_idx * key_stride + head_idx * head_size; 72 | const int rot_offset = i % embed_dim; 73 | apply_rotary_embedding(key + token_head, cos_ptr, 74 | sin_ptr, rot_offset, embed_dim); 75 | } 76 | } 77 | 78 | } // namespace vllm 79 | 80 | void rotary_embedding( 81 | torch::Tensor& positions, // [batch_size, seq_len] or [num_tokens] 82 | torch::Tensor& query, // [batch_size, seq_len, num_heads * head_size] or [num_tokens, num_heads * head_size] 83 | torch::Tensor& key, // [batch_size, seq_len, num_kv_heads * head_size] or [num_tokens, num_kv_heads * head_size] 84 | int head_size, 85 | torch::Tensor& cos_sin_cache, // [max_position, rot_dim] 86 | bool is_neox) { 87 | int64_t num_tokens = query.numel() / query.size(-1); 88 | int rot_dim = cos_sin_cache.size(1); 89 | int num_heads = query.size(-1) / head_size; 90 | int num_kv_heads = key.size(-1) / head_size; 91 | int query_stride = query.stride(-2); 92 | int key_stride = key.stride(-2); 93 | 94 | dim3 grid(num_tokens); 95 | dim3 block(std::min(num_heads * rot_dim / 2, 512)); 96 | const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); 97 | VLLM_DISPATCH_FLOATING_TYPES( 98 | query.scalar_type(), 99 | "rotary_embedding", 100 | [&] { 101 | if (is_neox) { 102 | vllm::rotary_embedding_kernel<<>>( 103 | positions.data_ptr(), 104 | query.data_ptr(), 105 | key.data_ptr(), 106 | cos_sin_cache.data_ptr(), 107 | rot_dim, 108 | query_stride, 109 | key_stride, 110 | num_heads, 111 | num_kv_heads, 112 | head_size); 113 | } else { 114 | vllm::rotary_embedding_kernel<<>>( 115 | positions.data_ptr(), 116 | query.data_ptr(), 117 | key.data_ptr(), 118 | cos_sin_cache.data_ptr(), 119 | rot_dim, 120 | query_stride, 121 | key_stride, 122 | 
num_heads, 123 | num_kv_heads, 124 | head_size); 125 | } 126 | }); 127 | } 128 | -------------------------------------------------------------------------------- /csrc/pybind.cpp: -------------------------------------------------------------------------------- 1 | #include "cache.h" 2 | #include "cuda_utils.h" 3 | #include "ops.h" 4 | #include 5 | 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 7 | // vLLM custom ops 8 | pybind11::module ops = m.def_submodule("ops", "vLLM custom operators"); 9 | 10 | // Attention ops 11 | ops.def( 12 | "paged_attention_v1", 13 | &paged_attention_v1, 14 | "Compute the attention between an input query and the cached keys/values using PagedAttention."); 15 | ops.def( 16 | "paged_attention_v2", 17 | &paged_attention_v2, 18 | "PagedAttention V2."); 19 | 20 | // Activation ops 21 | ops.def( 22 | "silu_and_mul", 23 | &silu_and_mul, 24 | "Activation function used in SwiGLU."); 25 | ops.def( 26 | "gelu_new", 27 | &gelu_new, 28 | "GELU implementation used in GPT-2."); 29 | ops.def( 30 | "gelu_fast", 31 | &gelu_fast, 32 | "Approximate GELU implementation."); 33 | 34 | // Layernorm 35 | ops.def( 36 | "rms_norm", 37 | &rms_norm, 38 | "Apply Root Mean Square (RMS) Normalization to the input tensor."); 39 | 40 | ops.def( 41 | "fused_add_rms_norm", 42 | &fused_add_rms_norm, 43 | "In-place fused Add and RMS Normalization"); 44 | 45 | // Rotary embedding 46 | ops.def( 47 | "rotary_embedding", 48 | &rotary_embedding, 49 | "Apply GPT-NeoX or GPT-J style rotary embedding to query and key"); 50 | 51 | // Quantization ops 52 | ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); 53 | ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); 54 | 55 | // Cache ops 56 | pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops"); 57 | cache_ops.def( 58 | "swap_blocks", 59 | &swap_blocks, 60 | "Swap in (out) the cache blocks from src to dst"); 61 | cache_ops.def( 62 | "copy_blocks", 63 | ©_blocks, 64 | "Copy the cache blocks from src to dst"); 65 | cache_ops.def( 66 | "reshape_and_cache", 67 | &reshape_and_cache, 68 | "Reshape the key and value tensors and cache them"); 69 | cache_ops.def( 70 | "gather_cached_kv", 71 | &gather_cached_kv, 72 | "Gather key and value from the cache into contiguous QKV tensors"); 73 | 74 | // Cuda utils 75 | pybind11::module cuda_utils = m.def_submodule("cuda_utils", "vLLM cuda utils"); 76 | cuda_utils.def( 77 | "get_device_attribute", 78 | &get_device_attribute, 79 | "Gets the specified device attribute."); 80 | } 81 | -------------------------------------------------------------------------------- /csrc/quantization/awq/dequantize.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | Adapted from https://github.com/mit-han-lab/llm-awq 3 | Modified from NVIDIA FasterTransformer: https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h 4 | @article{lin2023awq, 5 | title={AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration}, 6 | author={Lin, Ji and Tang, Jiaming and Tang, Haotian and Yang, Shang and Dang, Xingyu and Han, Song}, 7 | journal={arXiv}, 8 | year={2023} 9 | } 10 | */ 11 | 12 | #pragma once 13 | 14 | namespace vllm { 15 | namespace awq { 16 | 17 | __device__ uint4 dequantize_s4_to_fp16x2(uint32_t const& source) 18 | { 19 | #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 750 20 | assert(false); 21 | #else 22 | uint4 result; 23 | 24 | 
uint32_t* h = reinterpret_cast(&result); 25 | uint32_t const i4s = reinterpret_cast(source); 26 | 27 | // First, we extract the i4s and construct an intermediate fp16 number. 28 | static constexpr uint32_t immLut = (0xf0 & 0xcc) | 0xaa; 29 | static constexpr uint32_t BOTTOM_MASK = 0x000f000f; 30 | static constexpr uint32_t TOP_MASK = 0x00f000f0; 31 | static constexpr uint32_t I4s_TO_F16s_MAGIC_NUM = 0x64006400; 32 | 33 | // Note that the entire sequence only requires 1 shift instruction. This is thanks to the register packing 34 | // format and the fact that we force our integers to be unsigned, and account for this in the fp16 subtractions. 35 | // In addition, I exploit the fact that sub and fma have the same throughput in order to convert elt_23 and 36 | // elt_67 to fp16 without having to shift them to the bottom bits before hand. 37 | 38 | // Shift right by 8 to now consider elt_45 and elt_67. Issue first to hide RAW dependency if we issue 39 | // immediately before required. 40 | const uint32_t top_i4s = i4s >> 8; 41 | // Extract elt_01 - (i4s & 0x000f000f) | 0x64006400 42 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 43 | : "=r"(h[0]) 44 | : "r"(i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 45 | // Extract elt_23 (i4s & 0x00f000f0) | 0x64006400 46 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 47 | : "=r"(h[1]) 48 | : "r"(i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 49 | // Extract elt_45 (top_i4s & 0x000f000f) | 0x64006400 50 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 51 | : "=r"(h[2]) 52 | : "r"(top_i4s), "n"(BOTTOM_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 53 | // Extract elt_67 (top_i4s & 0x00f000f0) | 0x64006400 54 | asm volatile("lop3.b32 %0, %1, %2, %3, %4;\n" 55 | : "=r"(h[3]) 56 | : "r"(top_i4s), "n"(TOP_MASK), "n"(I4s_TO_F16s_MAGIC_NUM), "n"(immLut)); 57 | 58 | // I use inline PTX below because I am not sure if the compiler will emit float2half instructions if I use the 59 | // half2 ctor. In this case, I chose performance reliability over code readability. 60 | 61 | // This is the half2 {1032, 1032} represented as an integer. 62 | // static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64086408; 63 | // Haotian: subtract {1024, 1024} instead, we do not need to map to [-8, 7] 64 | static constexpr uint32_t FP16_TOP_MAGIC_NUM = 0x64006400; 65 | // This is the half2 {1 / 16, 1 / 16} represented as an integer. 66 | static constexpr uint32_t ONE_SIXTEENTH = 0x2c002c00; 67 | // This is the half2 {-72, -72} represented as an integer. 68 | // static constexpr uint32_t NEG_72 = 0xd480d480; 69 | // Haotian: Let's use {-64, -64}. 70 | static constexpr uint32_t NEG_64 = 0xd400d400; 71 | 72 | // Finally, we construct the output numbers. 
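// At this point each fp16 lane holds (1024 + x) for the low-nibble elements and
// (1024 + 16 * x) for the high-nibble elements, where x is the original unsigned 4-bit
// value. A plain subtract of FP16_TOP_MAGIC_NUM (1024) recovers x for the former; for
// the latter, a single fma by 1/16 with an addend of -64 both divides out the extra
// factor of 16 and removes the bias (1024 / 16 = 64).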
73 | // Convert elt_01 74 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[0]) : "r"(h[0]), "r"(FP16_TOP_MAGIC_NUM)); 75 | // Convert elt_23 76 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[1]) : "r"(h[1]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 77 | // Convert elt_45 78 | asm volatile("sub.f16x2 %0, %1, %2;\n" : "=r"(h[2]) : "r"(h[2]), "r"(FP16_TOP_MAGIC_NUM)); 79 | // Convert elt_67 80 | asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(h[3]) : "r"(h[3]), "r"(ONE_SIXTEENTH), "r"(NEG_64)); 81 | 82 | return result; 83 | #endif 84 | } 85 | 86 | } // namespace awq 87 | } // namespace vllm 88 | -------------------------------------------------------------------------------- /csrc/quantization/squeezellm/quant_cuda_kernel.cu: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | 7 | // half-tensor 8 | #include 9 | #include 10 | 11 | #define BLOCKWIDTH 128 12 | #define BLOCKHEIGHT4 16 13 | 14 | namespace vllm { 15 | namespace squeezellm { 16 | 17 | __device__ inline unsigned int as_unsigned(int i) { 18 | return *reinterpret_cast(&i); 19 | } 20 | 21 | // 4-bit matvec kernel (LUT-based) 22 | __global__ void NUQ4MatMulKernel( 23 | const half2* __restrict__ vec, 24 | const int* __restrict__ mat, 25 | half2* __restrict__ mul, 26 | const __half* __restrict__ lookup_table, 27 | int height, 28 | int width, 29 | int batch, 30 | int vec_height 31 | ) { 32 | 33 | const int blockwidth2 = BLOCKWIDTH / 2; 34 | 35 | int row = BLOCKHEIGHT4 * blockIdx.x; 36 | int col = BLOCKWIDTH * blockIdx.y + threadIdx.x; 37 | 38 | __shared__ half2 blockvec[blockwidth2]; 39 | 40 | __shared__ __half deq2[16][BLOCKWIDTH]; 41 | int off = threadIdx.x; 42 | int column_offset = col * 16; 43 | for (int val = 0; val < 16; val += 1) { 44 | int lut_index = column_offset + val; 45 | deq2[val][off] = lookup_table[lut_index]; 46 | } 47 | 48 | __half res; 49 | half2 res2; 50 | half2 tmp2; 51 | 52 | int i; 53 | int k; 54 | 55 | unsigned int tmp1; 56 | unsigned int lut_index1, lut_index2; 57 | 58 | for (int b = 0; b < batch; ++b){ 59 | i = width * row + col; 60 | res = __int2half_rd(0); 61 | k = 0; 62 | 63 | __syncthreads(); 64 | if (threadIdx.x < blockwidth2) 65 | blockvec[threadIdx.x] = vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 + threadIdx.x]; 66 | __syncthreads(); 67 | 68 | while (k < blockwidth2) { 69 | tmp1 = as_unsigned(mat[i]); 70 | 71 | res2 = {}; 72 | tmp2 = {}; 73 | 74 | lut_index1 = tmp1 & 0xF; 75 | lut_index2 = (tmp1 >> 4) & 0xF; 76 | tmp2.x = deq2[lut_index1][off]; 77 | tmp2.y = deq2[lut_index2][off]; 78 | res2 = __hfma2(tmp2, blockvec[k + 0], res2); 79 | 80 | lut_index1 = (tmp1 >> 8) & 0xF; 81 | lut_index2 = (tmp1 >> 12) & 0xF; 82 | tmp2.x = deq2[lut_index1][off]; 83 | tmp2.y = deq2[lut_index2][off]; 84 | res2 = __hfma2(tmp2, blockvec[k + 1], res2); 85 | 86 | lut_index1 = (tmp1 >> 16) & 0xF; 87 | lut_index2 = (tmp1 >> 20) & 0xF; 88 | tmp2.x = deq2[lut_index1][off]; 89 | tmp2.y = deq2[lut_index2][off]; 90 | res2 = __hfma2(tmp2, blockvec[k + 2], res2); 91 | 92 | lut_index1 = (tmp1 >> 24) & 0xF; 93 | lut_index2 = (tmp1 >> 28) & 0xF; 94 | tmp2.x = deq2[lut_index1][off]; 95 | tmp2.y = deq2[lut_index2][off]; 96 | res2 = __hfma2(tmp2, blockvec[k + 3], res2); 97 | 98 | res = __hadd(__hadd(res2.x, res2.y), res); 99 | 100 | i += width; 101 | k += 4; 102 | } 103 | 104 | // col%2 -> only set one of the two values 105 | half2 res3 = {}; 106 | if (col % 2 == 0) { 107 | res3.x = res; 108 | } else { 109 | res3.y = res; 
110 | } 111 | 112 | atomicAdd(&mul[b * width / 2 + col / 2], res3); 113 | } 114 | } 115 | 116 | } // namespace squeezellm 117 | } // namespace vllm 118 | 119 | // 4-bit matvec kernel (LUT-based) 120 | void squeezellm_gemm( 121 | torch::Tensor vec, 122 | torch::Tensor mat, 123 | torch::Tensor mul, 124 | torch::Tensor lookup_table 125 | ) { 126 | int height = mat.size(0); 127 | int width = mat.size(1); 128 | 129 | int batch = vec.size(0); 130 | int vec_height = vec.size(1); 131 | 132 | dim3 blocks( 133 | (height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4, 134 | (width + BLOCKWIDTH - 1) / BLOCKWIDTH 135 | ); 136 | dim3 threads(BLOCKWIDTH); 137 | 138 | vllm::squeezellm::NUQ4MatMulKernel<<>>( 139 | (half2*) vec.data(), 140 | mat.data_ptr(), 141 | (half2*) mul.data(), 142 | (__half*) lookup_table.data(), 143 | height, width, batch, vec_height 144 | ); 145 | } 146 | 147 | #undef BLOCKWIDTH 148 | #undef BLOCKHEIGHT4 149 | -------------------------------------------------------------------------------- /csrc/reduction_utils.cuh: -------------------------------------------------------------------------------- 1 | /* 2 | * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/reduce_kernel_utils.cuh 3 | * Copyright (c) 2023, The vLLM team. 4 | * Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved. 5 | * 6 | * Licensed under the Apache License, Version 2.0 (the "License"); 7 | * you may not use this file except in compliance with the License. 8 | * You may obtain a copy of the License at 9 | * 10 | * http://www.apache.org/licenses/LICENSE-2.0 11 | * 12 | * Unless required by applicable law or agreed to in writing, software 13 | * distributed under the License is distributed on an "AS IS" BASIS, 14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | * See the License for the specific language governing permissions and 16 | * limitations under the License. 17 | */ 18 | #pragma once 19 | 20 | namespace vllm { 21 | 22 | template 23 | __inline__ __device__ T warpReduceSum(T val) { 24 | #pragma unroll 25 | for (int mask = 16; mask > 0; mask >>= 1) 26 | val += __shfl_xor_sync(0xffffffff, val, mask, 32); 27 | return val; 28 | } 29 | 30 | /* Calculate the sum of all elements in a block */ 31 | template 32 | __inline__ __device__ T blockReduceSum(T val) { 33 | static __shared__ T shared[32]; 34 | int lane = threadIdx.x & 0x1f; 35 | int wid = threadIdx.x >> 5; 36 | 37 | val = warpReduceSum(val); 38 | 39 | if (lane == 0) 40 | shared[wid] = val; 41 | 42 | __syncthreads(); 43 | 44 | // Modify from blockDim.x << 5 to blockDim.x / 32. to prevent 45 | // blockDim.x is not divided by 32 46 | val = (threadIdx.x < (blockDim.x / 32.f)) ? shared[lane] : (T)(0.0f); 47 | val = warpReduceSum(val); 48 | return val; 49 | } 50 | 51 | } // namespace vllm 52 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 
12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | # vLLM documents 2 | 3 | ## Build the docs 4 | 5 | ```bash 6 | # Install dependencies. 7 | pip install -r requirements-docs.txt 8 | 9 | # Build the docs. 10 | make clean 11 | make html 12 | ``` 13 | 14 | ## Open the docs with your browser 15 | 16 | ```bash 17 | python -m http.server -d build/html/ 18 | ``` 19 | Launch your browser and open localhost:8000. 20 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | %SPHINXBUILD% >NUL 2>NUL 14 | if errorlevel 9009 ( 15 | echo. 16 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 17 | echo.installed, then set the SPHINXBUILD environment variable to point 18 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 19 | echo.may add the Sphinx directory to PATH. 20 | echo. 21 | echo.If you don't have Sphinx installed, grab it from 22 | echo.https://www.sphinx-doc.org/ 23 | exit /b 1 24 | ) 25 | 26 | if "%1" == "" goto help 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements-docs.txt: -------------------------------------------------------------------------------- 1 | sphinx == 6.2.1 2 | sphinx-book-theme == 1.0.1 3 | sphinx-copybutton == 0.5.2 4 | -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-only-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/docs/source/assets/logos/vllm-logo-only-light.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/docs/source/assets/logos/vllm-logo-text-dark.png -------------------------------------------------------------------------------- /docs/source/assets/logos/vllm-logo-text-light.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/docs/source/assets/logos/vllm-logo-text-light.png -------------------------------------------------------------------------------- /docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # 
Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'vLLM' 21 | copyright = '2023, vLLM Team' 22 | author = 'the vLLM Team' 23 | 24 | 25 | # -- General configuration --------------------------------------------------- 26 | 27 | # Add any Sphinx extension module names here, as strings. They can be 28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 29 | # ones. 30 | extensions = [ 31 | "sphinx.ext.napoleon", 32 | "sphinx.ext.viewcode", 33 | "sphinx.ext.intersphinx", 34 | "sphinx_copybutton", 35 | ] 36 | 37 | # Add any paths that contain templates here, relative to this directory. 38 | templates_path = ['_templates'] 39 | 40 | # List of patterns, relative to source directory, that match files and 41 | # directories to ignore when looking for source files. 42 | # This pattern also affects html_static_path and html_extra_path. 43 | exclude_patterns = [] 44 | 45 | # Exclude the prompt "$" when copying code 46 | copybutton_prompt_text = r"\$ " 47 | copybutton_prompt_is_regexp = True 48 | 49 | # -- Options for HTML output ------------------------------------------------- 50 | 51 | # The theme to use for HTML and HTML Help pages. See the documentation for 52 | # a list of builtin themes. 53 | # 54 | html_title = project 55 | html_theme = 'sphinx_book_theme' 56 | html_logo = 'assets/logos/vllm-logo-text-light.png' 57 | html_theme_options = { 58 | 'logo_only': True, 59 | 'path_to_docs': 'docs/source', 60 | 'repository_url': 'https://github.com/vllm-project/vllm', 61 | 'use_repository_button': True, 62 | } 63 | 64 | # Add any paths that contain custom static files (such as style sheets) here, 65 | # relative to this directory. They are copied after the builtin static files, 66 | # so a file named "default.css" will overwrite the builtin "default.css". 67 | html_static_path = ['_static'] 68 | -------------------------------------------------------------------------------- /docs/source/getting_started/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | 3 | Installation 4 | ============ 5 | 6 | vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. 7 | 8 | Requirements 9 | ------------ 10 | 11 | * OS: Linux 12 | * Python: 3.8 -- 3.11 13 | * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) 14 | 15 | Install with pip 16 | ---------------- 17 | 18 | You can install vLLM using pip: 19 | 20 | .. code-block:: console 21 | 22 | $ # (Optional) Create a new conda environment. 23 | $ conda create -n myenv python=3.8 -y 24 | $ conda activate myenv 25 | 26 | $ # Install vLLM with CUDA 12.1. 27 | $ pip install vllm 28 | 29 | .. note:: 30 | 31 | As of now, vLLM's binaries are compiled on CUDA 12.1 by default. 
32 | However, you can install vLLM with CUDA 11.8 by running: 33 | 34 | .. code-block:: console 35 | 36 | $ # Install vLLM with CUDA 11.8. 37 | $ # Replace `cp310` with your Python version (e.g., `cp38`, `cp39`, `cp311`). 38 | $ pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp310-cp310-manylinux1_x86_64.whl 39 | 40 | $ # Re-install PyTorch with CUDA 11.8. 41 | $ pip uninstall torch -y 42 | $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118 43 | 44 | 45 | .. _build_from_source: 46 | 47 | Build from source 48 | ----------------- 49 | 50 | You can also build and install vLLM from source: 51 | 52 | .. code-block:: console 53 | 54 | $ git clone https://github.com/vllm-project/vllm.git 55 | $ cd vllm 56 | $ pip install -e . # This may take 5-10 minutes. 57 | 58 | .. tip:: 59 | If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image. 60 | 61 | .. code-block:: console 62 | 63 | $ # Use `--ipc=host` to make sure the shared memory is large enough. 64 | $ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3 65 | -------------------------------------------------------------------------------- /docs/source/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to vLLM! 2 | ================ 3 | 4 | .. figure:: ./assets/logos/vllm-logo-text-light.png 5 | :width: 60% 6 | :align: center 7 | :alt: vLLM 8 | :class: no-scaled-link 9 | 10 | .. raw:: html 11 | 12 |

13 | Easy, fast, and cheap LLM serving for everyone 14 | 15 | 16 | 17 | 18 | 19 | Star 20 | Watch 21 | Fork 22 |
23 | 24 | 25 | 26 | vLLM is a fast and easy-to-use library for LLM inference and serving. 27 | 28 | vLLM is fast with: 29 | 30 | * State-of-the-art serving throughput 31 | * Efficient management of attention key and value memory with **PagedAttention** 32 | * Continuous batching of incoming requests 33 | * Optimized CUDA kernels 34 | 35 | vLLM is flexible and easy to use with: 36 | 37 | * Seamless integration with popular HuggingFace models 38 | * High-throughput serving with various decoding algorithms, including *parallel sampling*, *beam search*, and more 39 | * Tensor parallelism support for distributed inference 40 | * Streaming outputs 41 | * OpenAI-compatible API server 42 | 43 | For more information, check out the following: 44 | 45 | * `vLLM announcing blog post `_ (intro to PagedAttention) 46 | * `vLLM paper `_ (SOSP 2023) 47 | * `How continuous batching enables 23x throughput in LLM inference while reducing p50 latency `_ by Cade Daniel et al. 48 | 49 | 50 | 51 | Documentation 52 | ------------- 53 | 54 | .. toctree:: 55 | :maxdepth: 1 56 | :caption: Getting Started 57 | 58 | getting_started/installation 59 | getting_started/quickstart 60 | 61 | .. toctree:: 62 | :maxdepth: 1 63 | :caption: Serving 64 | 65 | serving/distributed_serving 66 | serving/run_on_sky 67 | serving/deploying_with_triton 68 | serving/deploying_with_docker 69 | 70 | .. toctree:: 71 | :maxdepth: 1 72 | :caption: Models 73 | 74 | models/supported_models 75 | models/adding_model 76 | models/engine_args 77 | 78 | .. toctree:: 79 | :maxdepth: 1 80 | :caption: Quantization 81 | 82 | quantization/auto_awq -------------------------------------------------------------------------------- /docs/source/models/engine_args.rst: -------------------------------------------------------------------------------- 1 | .. _engine_args: 2 | 3 | Engine Arguments 4 | ================ 5 | 6 | Below, you can find an explanation of every engine argument for vLLM: 7 | 8 | .. option:: --model 9 | 10 | Name or path of the huggingface model to use. 11 | 12 | .. option:: --tokenizer 13 | 14 | Name or path of the huggingface tokenizer to use. 15 | 16 | .. option:: --revision 17 | 18 | The specific model version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. 19 | 20 | .. option:: --tokenizer-revision 21 | 22 | The specific tokenizer version to use. It can be a branch name, a tag name, or a commit id. If unspecified, will use the default version. 23 | 24 | .. option:: --tokenizer-mode {auto,slow} 25 | 26 | The tokenizer mode. 27 | 28 | * "auto" will use the fast tokenizer if available. 29 | * "slow" will always use the slow tokenizer. 30 | 31 | .. option:: --trust-remote-code 32 | 33 | Trust remote code from huggingface. 34 | 35 | .. option:: --download-dir 36 | 37 | Directory to download and load the weights, default to the default cache dir of huggingface. 38 | 39 | .. option:: --load-format {auto,pt,safetensors,npcache,dummy} 40 | 41 | The format of the model weights to load. 42 | 43 | * "auto" will try to load the weights in the safetensors format and fall back to the pytorch bin format if safetensors format is not available. 44 | * "pt" will load the weights in the pytorch bin format. 45 | * "safetensors" will load the weights in the safetensors format. 46 | * "npcache" will load the weights in pytorch format and store a numpy cache to speed up the loading. 47 | * "dummy" will initialize the weights with random values, mainly for profiling. 48 | 49 | .. 
option:: --dtype {auto,half,float16,bfloat16,float,float32} 50 | 51 | Data type for model weights and activations. 52 | 53 | * "auto" will use FP16 precision for FP32 and FP16 models, and BF16 precision for BF16 models. 54 | * "half" for FP16. Recommended for AWQ quantization. 55 | * "float16" is the same as "half". 56 | * "bfloat16" for a balance between precision and range. 57 | * "float" is shorthand for FP32 precision. 58 | * "float32" for FP32 precision. 59 | 60 | .. option:: --max-model-len 61 | 62 | Model context length. If unspecified, will be automatically derived from the model config. 63 | 64 | .. option:: --worker-use-ray 65 | 66 | Use Ray for distributed serving, will be automatically set when using more than 1 GPU. 67 | 68 | .. option:: --pipeline-parallel-size (-pp) 69 | 70 | Number of pipeline stages. 71 | 72 | .. option:: --tensor-parallel-size (-tp) 73 | 74 | Number of tensor parallel replicas. 75 | 76 | .. option:: --max-parallel-loading-workers 77 | 78 | Load model sequentially in multiple batches, to avoid RAM OOM when using tensor parallel and large models. 79 | 80 | .. option:: --block-size {8,16,32} 81 | 82 | Token block size for contiguous chunks of tokens. 83 | 84 | .. option:: --seed 85 | 86 | Random seed for operations. 87 | 88 | .. option:: --swap-space 89 | 90 | CPU swap space size (GiB) per GPU. 91 | 92 | .. option:: --gpu-memory-utilization 93 | 94 | The percentage of GPU memory to be used for the model executor. 95 | 96 | .. option:: --max-num-batched-tokens 97 | 98 | Maximum number of batched tokens per iteration. 99 | 100 | .. option:: --max-num-seqs 101 | 102 | Maximum number of sequences per iteration. 103 | 104 | .. option:: --max-paddings 105 | 106 | Maximum number of paddings in a batch. 107 | 108 | .. option:: --disable-log-stats 109 | 110 | Disable logging statistics. 111 | 112 | .. option:: --quantization (-q) {awq,squeezellm,None} 113 | 114 | Method used to quantize the weights. 115 | -------------------------------------------------------------------------------- /docs/source/models/supported_models.rst: -------------------------------------------------------------------------------- 1 | .. _supported_models: 2 | 3 | Supported Models 4 | ================ 5 | 6 | vLLM supports a variety of generative Transformer models in `HuggingFace Transformers `_. 7 | The following is the list of model architectures that are currently supported by vLLM. 8 | Alongside each architecture, we include some popular models that use it. 9 | 10 | .. list-table:: 11 | :widths: 25 25 50 12 | :header-rows: 1 13 | 14 | * - Architecture 15 | - Models 16 | - Example HuggingFace Models 17 | * - :code:`AquilaForCausalLM` 18 | - Aquila 19 | - :code:`BAAI/Aquila-7B`, :code:`BAAI/AquilaChat-7B`, etc. 20 | * - :code:`BaiChuanForCausalLM` 21 | - Baichuan 22 | - :code:`baichuan-inc/Baichuan-7B`, :code:`baichuan-inc/Baichuan-13B-Chat`, etc. 23 | * - :code:`ChatGLMModel` 24 | - ChatGLM 25 | - :code:`THUDM/chatglm2-6b`, :code:`THUDM/chatglm3-6b`, etc. 26 | * - :code:`BloomForCausalLM` 27 | - BLOOM, BLOOMZ, BLOOMChat 28 | - :code:`bigscience/bloom`, :code:`bigscience/bloomz`, etc. 29 | * - :code:`FalconForCausalLM` 30 | - Falcon 31 | - :code:`tiiuae/falcon-7b`, :code:`tiiuae/falcon-40b`, :code:`tiiuae/falcon-rw-7b`, etc. 32 | * - :code:`GPT2LMHeadModel` 33 | - GPT-2 34 | - :code:`gpt2`, :code:`gpt2-xl`, etc. 
35 | * - :code:`GPTBigCodeForCausalLM` 36 | - StarCoder, SantaCoder, WizardCoder 37 | - :code:`bigcode/starcoder`, :code:`bigcode/gpt_bigcode-santacoder`, :code:`WizardLM/WizardCoder-15B-V1.0`, etc. 38 | * - :code:`GPTJForCausalLM` 39 | - GPT-J 40 | - :code:`EleutherAI/gpt-j-6b`, :code:`nomic-ai/gpt4all-j`, etc. 41 | * - :code:`GPTNeoXForCausalLM` 42 | - GPT-NeoX, Pythia, OpenAssistant, Dolly V2, StableLM 43 | - :code:`EleutherAI/gpt-neox-20b`, :code:`EleutherAI/pythia-12b`, :code:`OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5`, :code:`databricks/dolly-v2-12b`, :code:`stabilityai/stablelm-tuned-alpha-7b`, etc. 44 | * - :code:`InternLMForCausalLM` 45 | - InternLM 46 | - :code:`internlm/internlm-7b`, :code:`internlm/internlm-chat-7b`, etc. 47 | * - :code:`LlamaForCausalLM` 48 | - LLaMA, LLaMA-2, Vicuna, Alpaca, Koala, Guanaco 49 | - :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`young-geng/koala`, etc. 50 | * - :code:`MistralForCausalLM` 51 | - Mistral, Mistral-Instruct 52 | - :code:`mistralai/Mistral-7B-v0.1`, :code:`mistralai/Mistral-7B-Instruct-v0.1`, etc. 53 | * - :code:`MPTForCausalLM` 54 | - MPT, MPT-Instruct, MPT-Chat, MPT-StoryWriter 55 | - :code:`mosaicml/mpt-7b`, :code:`mosaicml/mpt-7b-storywriter`, :code:`mosaicml/mpt-30b`, etc. 56 | * - :code:`OPTForCausalLM` 57 | - OPT, OPT-IML 58 | - :code:`facebook/opt-66b`, :code:`facebook/opt-iml-max-30b`, etc. 59 | * - :code:`PhiForCausalLM` 60 | - Phi-1.5 61 | - :code:`microsoft/phi-1_5`, etc. 62 | * - :code:`QWenLMHeadModel` 63 | - Qwen 64 | - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. 65 | * - :code:`YiForCausalLM` 66 | - Yi 67 | - :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc. 68 | 69 | If your model uses one of the above model architectures, you can seamlessly run your model with vLLM. 70 | Otherwise, please refer to :ref:`Adding a New Model ` for instructions on how to implement support for your model. 71 | Alternatively, you can raise an issue on our `GitHub `_ project. 72 | 73 | .. tip:: 74 | The easiest way to check if your model is supported is to run the program below: 75 | 76 | .. code-block:: python 77 | 78 | from vllm import LLM 79 | 80 | llm = LLM(model=...) # Name or path of your model 81 | output = llm.generate("Hello, my name is") 82 | print(output) 83 | 84 | To use model from www.modelscope.cn 85 | 86 | .. code-block:: shell 87 | 88 | $ export VLLM_USE_MODELSCOPE=True 89 | 90 | .. code-block:: python 91 | 92 | from vllm import LLM 93 | 94 | llm = LLM(model=..., revision=..., trust_remote_code=True) # Name or path of your model 95 | output = llm.generate("Hello, my name is") 96 | print(output) 97 | 98 | If vLLM successfully generates text, it indicates that your model is supported. 99 | -------------------------------------------------------------------------------- /docs/source/quantization/auto_awq.rst: -------------------------------------------------------------------------------- 1 | .. _auto_awq: 2 | 3 | AutoAWQ 4 | ================== 5 | 6 | To create a new 4-bit quantized model, you can leverage `AutoAWQ `_. 7 | Quantizing reduces the model's precision from FP16 to INT4 which effectively reduces the file size by ~70%. 8 | The main benefits are lower latency and memory usage. 9 | 10 | You can quantize your own models by installing AutoAWQ or picking one of the `400+ models on Huggingface `_. 11 | 12 | .. 
code-block:: console 13 | 14 | $ pip install autoawq 15 | 16 | After installing AutoAWQ, you are ready to quantize a model. Here is an example of how to quantize Vicuna 7B v1.5: 17 | 18 | .. code-block:: python 19 | 20 | from awq import AutoAWQForCausalLM 21 | from transformers import AutoTokenizer 22 | 23 | model_path = 'lmsys/vicuna-7b-v1.5' 24 | quant_path = 'vicuna-7b-v1.5-awq' 25 | quant_config = { "zero_point": True, "q_group_size": 128, "w_bit": 4, "version": "GEMM" } 26 | 27 | # Load model 28 | model = AutoAWQForCausalLM.from_pretrained(model_path, **{"low_cpu_mem_usage": True}) 29 | tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 30 | 31 | # Quantize 32 | model.quantize(tokenizer, quant_config=quant_config) 33 | 34 | # Save quantized model 35 | model.save_quantized(quant_path) 36 | tokenizer.save_pretrained(quant_path) 37 | 38 | To run an AWQ model with vLLM, you can use `TheBloke/Llama-2-7b-Chat-AWQ `_ with the following command: 39 | 40 | .. code-block:: console 41 | 42 | $ python examples/llm_engine_example.py --model TheBloke/Llama-2-7b-Chat-AWQ --quantization awq 43 | 44 | AWQ models are also supported directly through the LLM entrypoint: 45 | 46 | .. code-block:: python 47 | 48 | from vllm import LLM, SamplingParams 49 | 50 | # Sample prompts. 51 | prompts = [ 52 | "Hello, my name is", 53 | "The president of the United States is", 54 | "The capital of France is", 55 | "The future of AI is", 56 | ] 57 | # Create a sampling params object. 58 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 59 | 60 | # Create an LLM. 61 | llm = LLM(model="TheBloke/Llama-2-7b-Chat-AWQ", quantization="AWQ") 62 | # Generate texts from the prompts. The output is a list of RequestOutput objects 63 | # that contain the prompt, generated text, and other information. 64 | outputs = llm.generate(prompts, sampling_params) 65 | # Print the outputs. 66 | for output in outputs: 67 | prompt = output.prompt 68 | generated_text = output.outputs[0].text 69 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 70 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_docker.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_docker: 2 | 3 | Deploying with Docker 4 | ============================ 5 | 6 | vLLM offers official docker image for deployment. 7 | The image can be used to run OpenAI compatible server. 8 | The image is available on Docker Hub as `vllm/vllm-openai `_. 9 | 10 | .. code-block:: console 11 | 12 | $ docker run --runtime nvidia --gpus all \ 13 | -v ~/.cache/huggingface:/root/.cache/huggingface \ 14 | -p 8000:8000 \ 15 | --env "HUGGING_FACE_HUB_TOKEN=" \ 16 | vllm/vllm-openai:latest \ 17 | --model mistralai/Mistral-7B-v0.1 18 | 19 | 20 | You can build and run vLLM from source via the provided dockerfile. To build vLLM: 21 | 22 | .. code-block:: console 23 | 24 | $ DOCKER_BUILDKIT=1 docker build . --target vllm-openai --tag vllm/vllm-openai --build-arg max_jobs=8 25 | 26 | To run vLLM: 27 | 28 | .. 
code-block:: console 29 | 30 | $ docker run --runtime nvidia --gpus all \ 31 | -v ~/.cache/huggingface:/root/.cache/huggingface \ 32 | -p 8000:8000 \ 33 | --env "HUGGING_FACE_HUB_TOKEN=" \ 34 | vllm/vllm-openai 35 | 36 | -------------------------------------------------------------------------------- /docs/source/serving/deploying_with_triton.rst: -------------------------------------------------------------------------------- 1 | .. _deploying_with_triton: 2 | 3 | Deploying with NVIDIA Triton 4 | ============================ 5 | 6 | The `Triton Inference Server `_ hosts a tutorial demonstrating how to quickly deploy a simple `facebook/opt-125m `_ model using vLLM. Please see `Deploying a vLLM model in Triton `_ for more details. 7 | -------------------------------------------------------------------------------- /docs/source/serving/distributed_serving.rst: -------------------------------------------------------------------------------- 1 | .. _distributed_serving: 2 | 3 | Distributed Inference and Serving 4 | ================================= 5 | 6 | vLLM supports distributed tensor-parallel inference and serving. Currently, we support `Megatron-LM's tensor parallel algorithm `_. We manage the distributed runtime with `Ray `_. To run distributed inference, install Ray with: 7 | 8 | .. code-block:: console 9 | 10 | $ pip install ray 11 | 12 | To run multi-GPU inference with the :code:`LLM` class, set the :code:`tensor_parallel_size` argument to the number of GPUs you want to use. For example, to run inference on 4 GPUs: 13 | 14 | .. code-block:: python 15 | 16 | from vllm import LLM 17 | llm = LLM("facebook/opt-13b", tensor_parallel_size=4) 18 | output = llm.generate("San Franciso is a") 19 | 20 | To run multi-GPU serving, pass in the :code:`--tensor-parallel-size` argument when starting the server. For example, to run API server on 4 GPUs: 21 | 22 | .. code-block:: console 23 | 24 | $ python -m vllm.entrypoints.api_server \ 25 | $ --model facebook/opt-13b \ 26 | $ --tensor-parallel-size 4 27 | 28 | To scale vLLM beyond a single machine, start a `Ray runtime `_ via CLI before running vLLM: 29 | 30 | .. code-block:: console 31 | 32 | $ # On head node 33 | $ ray start --head 34 | 35 | $ # On worker nodes 36 | $ ray start --address= 37 | 38 | After that, you can run inference and serving on multiple machines by launching the vLLM process on the head node by setting :code:`tensor_parallel_size` to the number of GPUs to be the total number of GPUs across all machines. -------------------------------------------------------------------------------- /docs/source/serving/run_on_sky.rst: -------------------------------------------------------------------------------- 1 | .. _on_cloud: 2 | 3 | Running on clouds with SkyPilot 4 | =============================== 5 | 6 | .. raw:: html 7 | 8 |

9 | vLLM 10 |
11 | 12 | vLLM can be run on the cloud to scale to multiple GPUs with `SkyPilot `__, an open-source framework for running LLMs on any cloud. 13 | 14 | To install SkyPilot and setup your cloud credentials, run: 15 | 16 | .. code-block:: console 17 | 18 | $ pip install skypilot 19 | $ sky check 20 | 21 | See the vLLM SkyPilot YAML for serving, `serving.yaml `__. 22 | 23 | .. code-block:: yaml 24 | 25 | resources: 26 | accelerators: A100 27 | 28 | envs: 29 | MODEL_NAME: decapoda-research/llama-13b-hf 30 | TOKENIZER: hf-internal-testing/llama-tokenizer 31 | 32 | setup: | 33 | conda create -n vllm python=3.9 -y 34 | conda activate vllm 35 | git clone https://github.com/vllm-project/vllm.git 36 | cd vllm 37 | pip install . 38 | pip install gradio 39 | 40 | run: | 41 | conda activate vllm 42 | echo 'Starting vllm api server...' 43 | python -u -m vllm.entrypoints.api_server \ 44 | --model $MODEL_NAME \ 45 | --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \ 46 | --tokenizer $TOKENIZER 2>&1 | tee api_server.log & 47 | echo 'Waiting for vllm api server to start...' 48 | while ! `cat api_server.log | grep -q 'Uvicorn running on'`; do sleep 1; done 49 | echo 'Starting gradio server...' 50 | python vllm/examples/gradio_webserver.py 51 | 52 | Start the serving the LLaMA-13B model on an A100 GPU: 53 | 54 | .. code-block:: console 55 | 56 | $ sky launch serving.yaml 57 | 58 | Check the output of the command. There will be a sharable gradio link (like the last line of the following). Open it in your browser to use the LLaMA model to do the text completion. 59 | 60 | .. code-block:: console 61 | 62 | (task, pid=7431) Running on public URL: https://.gradio.live 63 | 64 | **Optional**: Serve the 65B model instead of the default 13B and use more GPU: 65 | 66 | .. code-block:: console 67 | 68 | sky launch -c vllm-serve-new -s serve.yaml --gpus A100:8 --env MODEL_NAME=decapoda-research/llama-65b-hf 69 | 70 | -------------------------------------------------------------------------------- /examples/api_client.py: -------------------------------------------------------------------------------- 1 | """Example Python client for vllm.entrypoints.api_server""" 2 | 3 | import argparse 4 | import json 5 | from typing import Iterable, List 6 | 7 | import requests 8 | 9 | 10 | def clear_line(n: int = 1) -> None: 11 | LINE_UP = '\033[1A' 12 | LINE_CLEAR = '\x1b[2K' 13 | for _ in range(n): 14 | print(LINE_UP, end=LINE_CLEAR, flush=True) 15 | 16 | 17 | def post_http_request(prompt: str, 18 | api_url: str, 19 | n: int = 1, 20 | stream: bool = False) -> requests.Response: 21 | headers = {"User-Agent": "Test Client"} 22 | pload = { 23 | "prompt": prompt, 24 | "n": n, 25 | "use_beam_search": True, 26 | "temperature": 0.0, 27 | "max_tokens": 16, 28 | "stream": stream, 29 | } 30 | response = requests.post(api_url, headers=headers, json=pload, stream=True) 31 | return response 32 | 33 | 34 | def get_streaming_response(response: requests.Response) -> Iterable[List[str]]: 35 | for chunk in response.iter_lines(chunk_size=8192, 36 | decode_unicode=False, 37 | delimiter=b"\0"): 38 | if chunk: 39 | data = json.loads(chunk.decode("utf-8")) 40 | output = data["text"] 41 | yield output 42 | 43 | 44 | def get_response(response: requests.Response) -> List[str]: 45 | data = json.loads(response.content) 46 | output = data["text"] 47 | return output 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--host", type=str, default="localhost") 53 | parser.add_argument("--port", type=int, 
default=8000) 54 | parser.add_argument("--n", type=int, default=4) 55 | parser.add_argument("--prompt", type=str, default="San Francisco is a") 56 | parser.add_argument("--stream", action="store_true") 57 | args = parser.parse_args() 58 | prompt = args.prompt 59 | api_url = f"http://{args.host}:{args.port}/generate" 60 | n = args.n 61 | stream = args.stream 62 | 63 | print(f"Prompt: {prompt!r}\n", flush=True) 64 | response = post_http_request(prompt, api_url, n, stream) 65 | 66 | if stream: 67 | num_printed_lines = 0 68 | for h in get_streaming_response(response): 69 | clear_line(num_printed_lines) 70 | num_printed_lines = 0 71 | for i, line in enumerate(h): 72 | num_printed_lines += 1 73 | print(f"Beam candidate {i}: {line!r}", flush=True) 74 | else: 75 | output = get_response(response) 76 | for i, line in enumerate(output): 77 | print(f"Beam candidate {i}: {line!r}", flush=True) 78 | -------------------------------------------------------------------------------- /examples/gradio_webserver.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import gradio as gr 5 | import requests 6 | 7 | 8 | def http_bot(prompt): 9 | headers = {"User-Agent": "vLLM Client"} 10 | pload = { 11 | "prompt": prompt, 12 | "stream": True, 13 | "max_tokens": 128, 14 | } 15 | response = requests.post(args.model_url, 16 | headers=headers, 17 | json=pload, 18 | stream=True) 19 | 20 | for chunk in response.iter_lines(chunk_size=8192, 21 | decode_unicode=False, 22 | delimiter=b"\0"): 23 | if chunk: 24 | data = json.loads(chunk.decode("utf-8")) 25 | output = data["text"][0] 26 | yield output 27 | 28 | 29 | def build_demo(): 30 | with gr.Blocks() as demo: 31 | gr.Markdown("# vLLM text completion demo\n") 32 | inputbox = gr.Textbox(label="Input", 33 | placeholder="Enter text and press ENTER") 34 | outputbox = gr.Textbox(label="Output", 35 | placeholder="Generated result from the model") 36 | inputbox.submit(http_bot, [inputbox], [outputbox]) 37 | return demo 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--host", type=str, default=None) 43 | parser.add_argument("--port", type=int, default=8001) 44 | parser.add_argument("--model-url", 45 | type=str, 46 | default="http://localhost:8000/generate") 47 | args = parser.parse_args() 48 | 49 | demo = build_demo() 50 | demo.queue(concurrency_count=100).launch(server_name=args.host, 51 | server_port=args.port, 52 | share=True) 53 | -------------------------------------------------------------------------------- /examples/llm_engine_example.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List, Tuple 3 | 4 | from vllm import EngineArgs, LLMEngine, SamplingParams, RequestOutput 5 | 6 | 7 | def create_test_prompts() -> List[Tuple[str, SamplingParams]]: 8 | """Create a list of test prompts with their sampling parameters.""" 9 | return [ 10 | ("A robot may not injure a human being", 11 | SamplingParams(temperature=0.0, logprobs=1, prompt_logprobs=1)), 12 | ("To be or not to be,", 13 | SamplingParams(temperature=0.8, top_k=5, presence_penalty=0.2)), 14 | ("What is the meaning of life?", 15 | SamplingParams(n=2, 16 | best_of=5, 17 | temperature=0.8, 18 | top_p=0.95, 19 | frequency_penalty=0.1)), 20 | ("It is only with the heart that one can see rightly", 21 | SamplingParams(n=3, best_of=3, use_beam_search=True, 22 | temperature=0.0)), 23 | ] 24 | 25 | 26 | def 
process_requests(engine: LLMEngine, 27 | test_prompts: List[Tuple[str, SamplingParams]]): 28 | """Continuously process a list of prompts and handle the outputs.""" 29 | request_id = 0 30 | 31 | while test_prompts or engine.has_unfinished_requests(): 32 | if test_prompts: 33 | prompt, sampling_params = test_prompts.pop(0) 34 | engine.add_request(str(request_id), prompt, sampling_params) 35 | request_id += 1 36 | 37 | request_outputs: List[RequestOutput] = engine.step() 38 | 39 | for request_output in request_outputs: 40 | if request_output.finished: 41 | print(request_output) 42 | 43 | 44 | def initialize_engine(args: argparse.Namespace) -> LLMEngine: 45 | """Initialize the LLMEngine from the command line arguments.""" 46 | engine_args = EngineArgs.from_cli_args(args) 47 | return LLMEngine.from_engine_args(engine_args) 48 | 49 | 50 | def main(args: argparse.Namespace): 51 | """Main function that sets up and runs the prompt processing.""" 52 | engine = initialize_engine(args) 53 | test_prompts = create_test_prompts() 54 | process_requests(engine, test_prompts) 55 | 56 | 57 | if __name__ == '__main__': 58 | parser = argparse.ArgumentParser( 59 | description='Demo on using the LLMEngine class directly') 60 | parser = EngineArgs.add_cli_args(parser) 61 | args = parser.parse_args() 62 | main(args) 63 | -------------------------------------------------------------------------------- /examples/offline_inference.py: -------------------------------------------------------------------------------- 1 | from vllm import LLM, SamplingParams 2 | 3 | # Sample prompts. 4 | prompts = [ 5 | "Hello, my name is", 6 | "The president of the United States is", 7 | "The capital of France is", 8 | "The future of AI is", 9 | ] 10 | # Create a sampling params object. 11 | sampling_params = SamplingParams(temperature=0.8, top_p=0.95) 12 | 13 | # Create an LLM. 14 | llm = LLM(model="facebook/opt-125m") 15 | # Generate texts from the prompts. The output is a list of RequestOutput objects 16 | # that contain the prompt, generated text, and other information. 17 | outputs = llm.generate(prompts, sampling_params) 18 | # Print the outputs. 19 | for output in outputs: 20 | prompt = output.prompt 21 | generated_text = output.outputs[0].text 22 | print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") 23 | -------------------------------------------------------------------------------- /examples/openai_chatcompletion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Chat completion API 14 | chat_completion = openai.ChatCompletion.create( 15 | model=model, 16 | messages=[{ 17 | "role": "system", 18 | "content": "You are a helpful assistant." 19 | }, { 20 | "role": "user", 21 | "content": "Who won the world series in 2020?" 22 | }, { 23 | "role": 24 | "assistant", 25 | "content": 26 | "The Los Angeles Dodgers won the World Series in 2020." 27 | }, { 28 | "role": "user", 29 | "content": "Where was it played?" 
30 | }]) 31 | 32 | print("Chat completion results:") 33 | print(chat_completion) 34 | -------------------------------------------------------------------------------- /examples/openai_completion_client.py: -------------------------------------------------------------------------------- 1 | import openai 2 | 3 | # Modify OpenAI's API key and API base to use vLLM's API server. 4 | openai.api_key = "EMPTY" 5 | openai.api_base = "http://localhost:8000/v1" 6 | 7 | # List models API 8 | models = openai.Model.list() 9 | print("Models:", models) 10 | 11 | model = models["data"][0]["id"] 12 | 13 | # Completion API 14 | stream = False 15 | completion = openai.Completion.create( 16 | model=model, 17 | prompt="A robot may not injure a human being", 18 | echo=False, 19 | n=2, 20 | stream=stream, 21 | logprobs=3) 22 | 23 | print("Completion results:") 24 | if stream: 25 | for c in completion: 26 | print(c) 27 | else: 28 | print(completion) 29 | -------------------------------------------------------------------------------- /format.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # YAPF formatter, adapted from ray and skypilot. 3 | # 4 | # Usage: 5 | # # Do work and commit your work. 6 | 7 | # # Format files that differ from origin/main. 8 | # bash format.sh 9 | 10 | # # Commit changed files with message 'Run yapf and ruff' 11 | # 12 | # 13 | # YAPF + Clang formatter (if installed). This script formats all changed files from the last mergebase. 14 | # You are encouraged to run this locally before pushing changes for review. 15 | 16 | # Cause the script to exit if a single command fails 17 | set -eo pipefail 18 | 19 | # this stops git rev-parse from failing if we run this from the .git directory 20 | builtin cd "$(dirname "${BASH_SOURCE:-$0}")" 21 | ROOT="$(git rev-parse --show-toplevel)" 22 | builtin cd "$ROOT" || exit 1 23 | 24 | YAPF_VERSION=$(yapf --version | awk '{print $2}') 25 | RUFF_VERSION=$(ruff --version | awk '{print $2}') 26 | MYPY_VERSION=$(mypy --version | awk '{print $2}') 27 | 28 | # # params: tool name, tool version, required version 29 | tool_version_check() { 30 | if [[ $2 != $3 ]]; then 31 | echo "Wrong $1 version installed: $3 is required, not $2." 32 | exit 1 33 | fi 34 | } 35 | 36 | tool_version_check "yapf" $YAPF_VERSION "$(grep yapf requirements-dev.txt | cut -d'=' -f3)" 37 | tool_version_check "ruff" $RUFF_VERSION "$(grep "ruff==" requirements-dev.txt | cut -d'=' -f3)" 38 | tool_version_check "mypy" "$MYPY_VERSION" "$(grep mypy requirements-dev.txt | cut -d'=' -f3)" 39 | 40 | YAPF_FLAGS=( 41 | '--recursive' 42 | '--parallel' 43 | ) 44 | 45 | YAPF_EXCLUDES=( 46 | '--exclude' 'build/**' 47 | ) 48 | 49 | # Format specified files 50 | format() { 51 | yapf --in-place "${YAPF_FLAGS[@]}" "$@" 52 | } 53 | 54 | # Format files that differ from main branch. Ignores dirs that are not slated 55 | # for autoformat yet. 56 | format_changed() { 57 | # The `if` guard ensures that the list of filenames is not empty, which 58 | # could cause yapf to receive 0 positional arguments, making it hang 59 | # waiting for STDIN. 60 | # 61 | # `diff-filter=ACM` and $MERGEBASE is to ensure we only format files that 62 | # exist on both branches. 63 | MERGEBASE="$(git merge-base origin/main HEAD)" 64 | 65 | if ! 
git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then 66 | git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs -P 5 \ 67 | yapf --in-place "${YAPF_EXCLUDES[@]}" "${YAPF_FLAGS[@]}" 68 | fi 69 | 70 | } 71 | 72 | # Format all files 73 | format_all() { 74 | yapf --in-place "${YAPF_FLAGS[@]}" "${YAPF_EXCLUDES[@]}" vllm tests 75 | } 76 | 77 | ## This flag formats individual files. --files *must* be the first command line 78 | ## arg to use this option. 79 | if [[ "$1" == '--files' ]]; then 80 | format "${@:2}" 81 | # If `--all` is passed, then any further arguments are ignored and the 82 | # entire python directory is formatted. 83 | elif [[ "$1" == '--all' ]]; then 84 | format_all 85 | else 86 | # Format only the files that changed in last commit. 87 | format_changed 88 | fi 89 | echo 'vLLM yapf: Done' 90 | 91 | # Run mypy 92 | # TODO(zhuohan): Enable mypy 93 | # echo 'vLLM mypy:' 94 | # mypy 95 | 96 | # Lint specified files 97 | lint() { 98 | ruff "$@" 99 | } 100 | 101 | # Lint files that differ from main branch. Ignores dirs that are not slated 102 | # for autolint yet. 103 | lint_changed() { 104 | # The `if` guard ensures that the list of filenames is not empty, which 105 | # could cause ruff to receive 0 positional arguments, making it hang 106 | # waiting for STDIN. 107 | # 108 | # `diff-filter=ACM` and $MERGEBASE is to ensure we only lint files that 109 | # exist on both branches. 110 | MERGEBASE="$(git merge-base origin/main HEAD)" 111 | 112 | if ! git diff --diff-filter=ACM --quiet --exit-code "$MERGEBASE" -- '*.py' '*.pyi' &>/dev/null; then 113 | git diff --name-only --diff-filter=ACM "$MERGEBASE" -- '*.py' '*.pyi' | xargs \ 114 | ruff 115 | fi 116 | 117 | } 118 | 119 | # Run Ruff 120 | echo 'vLLM Ruff:' 121 | ## This flag lints individual files. --files *must* be the first command line 122 | ## arg to use this option. 123 | if [[ "$1" == '--files' ]]; then 124 | lint "${@:2}" 125 | # If `--all` is passed, then any further arguments are ignored and the 126 | # entire python directory is linted. 127 | elif [[ "$1" == '--all' ]]; then 128 | lint vllm tests 129 | else 130 | # Format only the files that changed in last commit. 131 | lint_changed 132 | fi 133 | 134 | if ! git diff --quiet &>/dev/null; then 135 | echo 'Reformatted files. Please review and stage the changes.' 136 | echo 'Changes not staged for commit:' 137 | echo 138 | git --no-pager diff --name-only 139 | 140 | exit 1 141 | fi 142 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | python_version = 3.8 3 | 4 | ignore_missing_imports = True 5 | 6 | files = vllm 7 | # TODO(woosuk): Include the code from Megatron and HuggingFace. 
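# `exclude` takes a single regular expression; the `|` alternation below skips both directories.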
8 | exclude = vllm/model_executor/parallel_utils/|vllm/model_executor/models/ 9 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = [ 3 | "ninja", 4 | "packaging", 5 | "setuptools", 6 | "torch >= 2.1.0", 7 | "wheel", 8 | ] 9 | build-backend = "setuptools.build_meta" 10 | 11 | [tool.ruff.lint] 12 | select = [ 13 | # pycodestyle 14 | "E", 15 | # Pyflakes 16 | "F", 17 | # pyupgrade 18 | # "UP", 19 | # flake8-bugbear 20 | "B", 21 | # flake8-simplify 22 | "SIM", 23 | # isort 24 | # "I", 25 | ] 26 | ignore = [ 27 | # star imports 28 | "F405", "F403", 29 | # lambda expression assignment 30 | "E731", 31 | # line too long, handled by black formatting 32 | "E501", 33 | ] 34 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | # formatting 2 | yapf==0.32.0 3 | ruff==0.1.5 4 | 5 | # type checking 6 | mypy==0.991 7 | types-PyYAML 8 | types-requests 9 | types-setuptools 10 | 11 | # testing 12 | pytest 13 | pytest-forked 14 | pytest-asyncio 15 | 16 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | ninja # For faster builds. 2 | psutil 3 | ray >= 2.5.1 4 | pandas # Required for Ray data. 5 | pyarrow # Required for Ray data. 6 | sentencepiece # Required for LLaMA tokenizer. 7 | numpy 8 | einops # Required for phi-1_5 9 | torch >= 2.1.0 10 | transformers >= 4.34.0 # Required for Mistral. 11 | xformers >= 0.0.22.post7 # Required for CUDA 12.1. 12 | fastapi 13 | uvicorn[standard] 14 | pydantic == 1.10.13 # Required for OpenAI server. 
15 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/tests/__init__.py -------------------------------------------------------------------------------- /tests/async_engine/api_server_async_engine.py: -------------------------------------------------------------------------------- 1 | """vllm.entrypoints.api_server with some extra logging for testing.""" 2 | import argparse 3 | from typing import Any, Dict 4 | 5 | import uvicorn 6 | from fastapi.responses import JSONResponse, Response 7 | 8 | import vllm.entrypoints.api_server 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | 12 | app = vllm.entrypoints.api_server.app 13 | 14 | 15 | class AsyncLLMEngineWithStats(AsyncLLMEngine): 16 | 17 | def __init__(self, *args, **kwargs): 18 | super().__init__(*args, **kwargs) 19 | self._num_aborts = 0 20 | 21 | async def abort(self, request_id: str) -> None: 22 | await super().abort(request_id) 23 | self._num_aborts += 1 24 | 25 | def testing_stats(self) -> Dict[str, Any]: 26 | return {"num_aborted_requests": self._num_aborts} 27 | 28 | 29 | @app.get("/stats") 30 | def stats() -> Response: 31 | """Get the statistics of the engine.""" 32 | return JSONResponse(engine.testing_stats()) 33 | 34 | 35 | if __name__ == "__main__": 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument("--host", type=str, default="localhost") 38 | parser.add_argument("--port", type=int, default=8000) 39 | parser = AsyncEngineArgs.add_cli_args(parser) 40 | args = parser.parse_args() 41 | 42 | engine_args = AsyncEngineArgs.from_cli_args(args) 43 | engine = AsyncLLMEngineWithStats.from_engine_args(engine_args) 44 | vllm.entrypoints.api_server.engine = engine 45 | uvicorn.run( 46 | app, 47 | host=args.host, 48 | port=args.port, 49 | log_level="debug", 50 | timeout_keep_alive=vllm.entrypoints.api_server.TIMEOUT_KEEP_ALIVE) 51 | -------------------------------------------------------------------------------- /tests/async_engine/test_api_server.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | import time 4 | from multiprocessing import Pool 5 | from pathlib import Path 6 | 7 | import pytest 8 | import requests 9 | 10 | 11 | def _query_server(prompt: str) -> dict: 12 | response = requests.post("http://localhost:8000/generate", 13 | json={ 14 | "prompt": prompt, 15 | "max_tokens": 100, 16 | "temperature": 0, 17 | "ignore_eos": True 18 | }) 19 | response.raise_for_status() 20 | return response.json() 21 | 22 | 23 | @pytest.fixture 24 | def api_server(): 25 | script_path = Path(__file__).parent.joinpath( 26 | "api_server_async_engine.py").absolute() 27 | uvicorn_process = subprocess.Popen([ 28 | sys.executable, "-u", 29 | str(script_path), "--model", "facebook/opt-125m" 30 | ]) 31 | yield 32 | uvicorn_process.terminate() 33 | 34 | 35 | def test_api_server(api_server): 36 | """ 37 | Run the API server and test it. 38 | 39 | We run both the server and requests in separate processes. 40 | 41 | We test that the server can handle incoming requests, including 42 | multiple requests at the same time, and that it can handle requests 43 | being cancelled without crashing. 
44 | """ 45 | with Pool(32) as pool: 46 | # Wait until the server is ready 47 | prompts = ["Hello world"] * 1 48 | result = None 49 | while not result: 50 | try: 51 | for _ in pool.map(_query_server, prompts): 52 | break 53 | except Exception: 54 | time.sleep(1) 55 | 56 | # Actual tests start here 57 | # Try with 1 prompt 58 | for result in pool.map(_query_server, prompts): 59 | assert result 60 | 61 | num_aborted_requests = requests.get( 62 | "http://localhost:8000/stats").json()["num_aborted_requests"] 63 | assert num_aborted_requests == 0 64 | 65 | # Try with 100 prompts 66 | prompts = ["Hello world"] * 100 67 | for result in pool.map(_query_server, prompts): 68 | assert result 69 | 70 | # Cancel requests 71 | pool.map_async(_query_server, prompts) 72 | time.sleep(0.01) 73 | pool.terminate() 74 | pool.join() 75 | 76 | # check cancellation stats 77 | num_aborted_requests = requests.get( 78 | "http://localhost:8000/stats").json()["num_aborted_requests"] 79 | assert num_aborted_requests > 0 80 | 81 | # check that server still runs after cancellations 82 | with Pool(32) as pool: 83 | # Try with 100 prompts 84 | prompts = ["Hello world"] * 100 85 | for result in pool.map(_query_server, prompts): 86 | assert result 87 | -------------------------------------------------------------------------------- /tests/async_engine/test_async_llm_engine.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from dataclasses import dataclass 3 | 4 | import pytest 5 | 6 | from vllm.engine.async_llm_engine import AsyncLLMEngine 7 | 8 | 9 | @dataclass 10 | class RequestOutput: 11 | request_id: int 12 | finished: bool = False 13 | 14 | 15 | class MockEngine: 16 | 17 | def __init__(self): 18 | self.step_calls = 0 19 | self.add_request_calls = 0 20 | self.abort_request_calls = 0 21 | self.request_id = None 22 | 23 | async def step_async(self): 24 | self.step_calls += 1 25 | return [RequestOutput( 26 | request_id=self.request_id)] if self.request_id else [] 27 | 28 | def generate(self, request_id): 29 | self.request_id = request_id 30 | 31 | def stop_generating(self): 32 | self.request_id = None 33 | 34 | def add_request(self, **kwargs): 35 | del kwargs # Unused 36 | self.add_request_calls += 1 37 | 38 | def abort_request(self, request_id): 39 | del request_id # Unused 40 | self.abort_request_calls += 1 41 | 42 | 43 | class MockAsyncLLMEngine(AsyncLLMEngine): 44 | 45 | def _init_engine(self, *args, **kwargs): 46 | return MockEngine() 47 | 48 | 49 | @pytest.mark.asyncio 50 | async def test_new_requests_event(): 51 | engine = MockAsyncLLMEngine(worker_use_ray=False, engine_use_ray=False) 52 | engine.start_background_loop() 53 | await asyncio.sleep(0.01) 54 | assert engine.engine.step_calls == 0 55 | 56 | await engine.add_request("1", "", None) 57 | await asyncio.sleep(0.01) 58 | assert engine.engine.add_request_calls == 1 59 | assert engine.engine.step_calls == 1 60 | 61 | await engine.add_request("2", "", None) 62 | engine.engine.generate("2") 63 | await asyncio.sleep(0) 64 | assert engine.engine.add_request_calls == 2 65 | assert engine.engine.step_calls == 2 66 | await asyncio.sleep(0) 67 | assert engine.engine.step_calls == 3 68 | engine.engine.stop_generating() 69 | await asyncio.sleep(0) 70 | assert engine.engine.step_calls == 4 71 | await asyncio.sleep(0) 72 | assert engine.engine.step_calls == 4 73 | 74 | await engine.add_request("3", "", None) 75 | await asyncio.sleep(0.01) 76 | assert engine.engine.add_request_calls == 3 77 | assert 
engine.engine.step_calls == 5 78 | await asyncio.sleep(0.01) 79 | assert engine.engine.add_request_calls == 3 80 | assert engine.engine.step_calls == 5 81 | -------------------------------------------------------------------------------- /tests/async_engine/test_request_tracker.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from vllm.engine.async_llm_engine import RequestTracker 4 | from vllm.outputs import RequestOutput 5 | 6 | 7 | class DummyEvent: 8 | 9 | def __init__(self): 10 | self.flag = False 11 | 12 | def set(self): 13 | self.flag = True 14 | 15 | def clear(self): 16 | self.flag = False 17 | 18 | 19 | def test_request_tracker(): 20 | tracker = RequestTracker() 21 | tracker.new_requests_event = DummyEvent() 22 | stream_1 = tracker.add_request("1") 23 | assert tracker.new_requests_event.flag 24 | new, finished = tracker.get_new_and_finished_requests() 25 | assert not tracker.new_requests_event.flag 26 | assert len(new) == 1 27 | assert new[0]["request_id"] == "1" 28 | assert not finished 29 | assert not stream_1.finished 30 | 31 | stream_2 = tracker.add_request("2") 32 | stream_3 = tracker.add_request("3") 33 | assert tracker.new_requests_event.flag 34 | new, finished = tracker.get_new_and_finished_requests() 35 | assert not tracker.new_requests_event.flag 36 | assert len(new) == 2 37 | assert new[0]["request_id"] == "2" 38 | assert new[1]["request_id"] == "3" 39 | assert not finished 40 | assert not stream_2.finished 41 | assert not stream_3.finished 42 | 43 | # request_ids must be unique 44 | with pytest.raises(KeyError): 45 | tracker.add_request("1") 46 | assert not tracker.new_requests_event.flag 47 | 48 | tracker.abort_request("1") 49 | new, finished = tracker.get_new_and_finished_requests() 50 | assert len(finished) == 1 51 | assert "1" in finished 52 | assert not new 53 | assert stream_1.finished 54 | 55 | stream_4 = tracker.add_request("4") 56 | tracker.abort_request("4") 57 | assert tracker.new_requests_event.flag 58 | new, finished = tracker.get_new_and_finished_requests() 59 | assert len(finished) == 1 60 | assert "4" in finished 61 | assert not new 62 | assert stream_4.finished 63 | 64 | stream_5 = tracker.add_request("5") 65 | assert tracker.new_requests_event.flag 66 | tracker.process_request_output( 67 | RequestOutput("2", "output", [], [], [], finished=True)) 68 | new, finished = tracker.get_new_and_finished_requests() 69 | assert not tracker.new_requests_event.flag 70 | assert len(finished) == 1 71 | assert "2" in finished 72 | assert len(new) == 1 73 | assert new[0]["request_id"] == "5" 74 | assert stream_2.finished 75 | assert not stream_5.finished 76 | -------------------------------------------------------------------------------- /tests/distributed/test_comm_ops.py: -------------------------------------------------------------------------------- 1 | """Test the communication operators. 2 | 3 | Run `pytest tests/distributed/test_comm_ops.py --forked`. 
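Each test spawns one worker process per tensor-parallel rank and checks the
collective result against a plain PyTorch reference.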
4 | """ 5 | from multiprocessing import Process, set_start_method 6 | 7 | import pytest 8 | import torch 9 | 10 | from vllm.config import ParallelConfig 11 | from vllm.engine.ray_utils import get_open_port 12 | from vllm.model_executor.parallel_utils.communication_op import ( 13 | tensor_model_parallel_all_reduce, 14 | tensor_model_parallel_all_gather, 15 | ) 16 | from vllm.worker.worker import _init_distributed_environment 17 | 18 | 19 | def init_test_distributed_environment(pipeline_parallel_size: int, 20 | tensor_parallel_size: int, rank: int, 21 | distributed_init_port: str): 22 | parallel_config = ParallelConfig(pipeline_parallel_size, 23 | tensor_parallel_size, 24 | worker_use_ray=True) 25 | distributed_init_method = f"tcp://localhost:{distributed_init_port}" 26 | torch.cuda.set_device(rank) 27 | _init_distributed_environment(parallel_config, rank, 28 | distributed_init_method) 29 | 30 | 31 | def all_reduce_test_worker(tensor_parallel_size: int, rank: int, 32 | distributed_init_port: str): 33 | init_test_distributed_environment(1, tensor_parallel_size, rank, 34 | distributed_init_port) 35 | num_elements = 8 36 | all_tensors = [ 37 | torch.arange(num_elements, dtype=torch.float32, device="cuda") * 38 | (r + 1) for r in range(tensor_parallel_size) 39 | ] 40 | expected = torch.sum(torch.stack(all_tensors, dim=0), dim=0) 41 | t = all_tensors[rank] 42 | t = tensor_model_parallel_all_reduce(t) 43 | assert torch.allclose(t, expected) 44 | 45 | 46 | def all_gather_test_worker(tensor_parallel_size: int, rank: int, 47 | distributed_init_port: str): 48 | init_test_distributed_environment(1, tensor_parallel_size, rank, 49 | distributed_init_port) 50 | num_dimensions = 3 51 | tensor_size = list(range(2, num_dimensions + 2)) 52 | total_size = 1 53 | for s in tensor_size: 54 | total_size *= s 55 | for all_gather_dimension in range(num_dimensions): 56 | all_tensors = [ 57 | torch.arange(total_size, dtype=torch.float32, 58 | device="cuda").reshape(tensor_size) * (r + 1) 59 | for r in range(tensor_parallel_size) 60 | ] 61 | expected = torch.cat(all_tensors, dim=all_gather_dimension) 62 | t = all_tensors[rank] 63 | t = tensor_model_parallel_all_gather(t, all_gather_dimension) 64 | assert torch.allclose(t, expected) 65 | 66 | 67 | @pytest.mark.skipif(torch.cuda.device_count() < 2, 68 | reason="Need at least 2 GPUs to run the test.") 69 | @pytest.mark.parametrize("tensor_parallel_size", [2]) 70 | @pytest.mark.parametrize("test_target", 71 | [all_reduce_test_worker, all_gather_test_worker]) 72 | def test_multi_process_tensor_parallel(tensor_parallel_size, test_target): 73 | set_start_method("spawn", force=True) 74 | distributed_init_port = get_open_port() 75 | processes = [] 76 | for rank in range(tensor_parallel_size): 77 | p = Process(target=test_target, 78 | args=(tensor_parallel_size, rank, distributed_init_port)) 79 | p.start() 80 | processes.append(p) 81 | for p in processes: 82 | p.join() 83 | assert all(p.exitcode == 0 for p in processes) 84 | -------------------------------------------------------------------------------- /tests/engine/test_detokenize.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from transformers import AutoTokenizer 4 | 5 | from vllm.transformers_utils.tokenizer import detokenize_incrementally 6 | 7 | TRUTH = [ 8 | "Hello here, this is a simple test", # noqa: E501 9 | "vLLM is a high-throughput and memory-efficient inference and serving engine for LLMs. 
It is designed to be used in production environments, where inference and serving", # noqa: E501 10 | "我很感谢你的热情" # noqa: E501 11 | ] 12 | TOKENIZERS = [ 13 | "facebook/opt-125m", 14 | "gpt2", 15 | "bigcode/tiny_starcoder_py", 16 | "EleutherAI/gpt-j-6b", 17 | "EleutherAI/pythia-70m", 18 | "bigscience/bloom-560m", 19 | "mosaicml/mpt-7b", 20 | "tiiuae/falcon-7b", 21 | "meta-llama/Llama-2-7b-hf", 22 | "codellama/CodeLlama-7b-hf", 23 | ] 24 | 25 | 26 | def _run_incremental_decode(tokenizer, all_input_ids, 27 | skip_special_tokens: bool): 28 | decoded_text = "" 29 | offset = 0 30 | token_offset = 0 31 | prev_tokens = None 32 | for i in range(len(all_input_ids)): 33 | new_tokens, text, offset, token_offset = detokenize_incrementally( 34 | tokenizer, 35 | all_input_ids[:i + 1], 36 | prev_tokens, 37 | offset, 38 | token_offset, 39 | skip_special_tokens=skip_special_tokens) 40 | decoded_text += text 41 | if prev_tokens is None: 42 | prev_tokens = new_tokens 43 | else: 44 | prev_tokens += new_tokens 45 | return decoded_text 46 | 47 | 48 | @pytest.mark.parametrize("truth", TRUTH) 49 | @pytest.mark.parametrize("tokenizer_id", TOKENIZERS) 50 | @pytest.mark.parametrize("skip_special_tokens", (True, False)) 51 | def test_decode_streaming(tokenizer_id, truth, skip_special_tokens): 52 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_id) 53 | all_input_ids = tokenizer(truth, add_special_tokens=False)["input_ids"] 54 | if skip_special_tokens: 55 | all_input_ids = ([tokenizer.bos_token_id] 56 | if tokenizer.bos_token_id is not None else 57 | []) + all_input_ids + [tokenizer.eos_token_id] 58 | 59 | decoded_text = _run_incremental_decode( 60 | tokenizer, all_input_ids, skip_special_tokens=skip_special_tokens) 61 | 62 | assert decoded_text == truth 63 | -------------------------------------------------------------------------------- /tests/kernels/conftest.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple 2 | 3 | import pytest 4 | import torch 5 | 6 | 7 | def create_kv_caches( 8 | num_blocks: int, 9 | block_size: int, 10 | num_layers: int, 11 | num_heads: int, 12 | head_size: int, 13 | dtype: torch.dtype, 14 | seed: int, 15 | ) -> Tuple[List[torch.Tensor], List[torch.Tensor]]: 16 | torch.random.manual_seed(seed) 17 | torch.cuda.manual_seed(seed) 18 | 19 | scale = head_size**-0.5 20 | x = 16 // torch.tensor([], dtype=dtype).element_size() 21 | key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x) 22 | key_caches = [] 23 | for _ in range(num_layers): 24 | key_cache = torch.empty(size=key_cache_shape, 25 | dtype=dtype, 26 | device='cuda') 27 | key_cache.uniform_(-scale, scale) 28 | key_caches.append(key_cache) 29 | 30 | value_cache_shape = (num_blocks, num_heads, head_size, block_size) 31 | value_caches = [] 32 | for _ in range(num_layers): 33 | value_cache = torch.empty(size=value_cache_shape, 34 | dtype=dtype, 35 | device='cuda') 36 | value_cache.uniform_(-scale, scale) 37 | value_caches.append(value_cache) 38 | return key_caches, value_caches 39 | 40 | 41 | @pytest.fixture() 42 | def kv_cache_factory(): 43 | return create_kv_caches 44 | -------------------------------------------------------------------------------- /tests/kernels/test_activation.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn.functional as F 4 | from transformers.activations import get_activation 5 | 6 | from vllm._C import ops 7 | 8 | DTYPES = [torch.half, 
torch.bfloat16, torch.float] 9 | NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing 10 | D = [512, 4096, 5120, 13824] # Arbitrary values for testing 11 | SEEDS = [0] 12 | 13 | 14 | def ref_silu_and_mul(x: torch.Tensor) -> torch.Tensor: 15 | x1, x2 = x.chunk(chunks=2, dim=1) 16 | return F.silu(x1) * x2 17 | 18 | 19 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 20 | @pytest.mark.parametrize("d", D) 21 | @pytest.mark.parametrize("dtype", DTYPES) 22 | @pytest.mark.parametrize("seed", SEEDS) 23 | @torch.inference_mode() 24 | def test_silu_and_mul( 25 | num_tokens: int, 26 | d: int, 27 | dtype: torch.dtype, 28 | seed: int, 29 | ) -> None: 30 | torch.random.manual_seed(seed) 31 | torch.cuda.manual_seed(seed) 32 | x = torch.randn(num_tokens, 2 * d, dtype=dtype, device="cuda") 33 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 34 | ops.silu_and_mul(out, x) 35 | ref_out = ref_silu_and_mul(x) 36 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 37 | 38 | 39 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 40 | @pytest.mark.parametrize("d", D) 41 | @pytest.mark.parametrize("dtype", DTYPES) 42 | @pytest.mark.parametrize("seed", SEEDS) 43 | @torch.inference_mode() 44 | def test_gelu_new( 45 | num_tokens: int, 46 | d: int, 47 | dtype: torch.dtype, 48 | seed: int, 49 | ) -> None: 50 | torch.random.manual_seed(seed) 51 | torch.cuda.manual_seed(seed) 52 | x = torch.randn(num_tokens, d, dtype=dtype, device="cuda") 53 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 54 | ops.gelu_new(out, x) 55 | ref_out = get_activation("gelu_new")(x) 56 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 57 | 58 | 59 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 60 | @pytest.mark.parametrize("d", D) 61 | @pytest.mark.parametrize("dtype", DTYPES) 62 | @pytest.mark.parametrize("seed", SEEDS) 63 | def test_gelu_fast( 64 | num_tokens: int, 65 | d: int, 66 | dtype: torch.dtype, 67 | seed: int, 68 | ) -> None: 69 | torch.random.manual_seed(seed) 70 | torch.cuda.manual_seed(seed) 71 | x = torch.randn(num_tokens, d, dtype=dtype, device="cuda") 72 | out = torch.empty(num_tokens, d, dtype=dtype, device="cuda") 73 | ops.gelu_fast(out, x) 74 | ref_out = get_activation("gelu_fast")(x) 75 | assert torch.allclose(out, ref_out, atol=1e-5, rtol=1e-5) 76 | -------------------------------------------------------------------------------- /tests/kernels/test_cache.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import pytest 4 | import torch 5 | 6 | from vllm._C import cache_ops 7 | 8 | DTYPES = [torch.half, torch.bfloat16, torch.float] 9 | NUM_TOKENS = [83] # Arbitrary values for testing 10 | NUM_LAYERS = [1] # Arbitrary values for testing 11 | NUM_HEADS = [8] # Arbitrary values for testing 12 | HEAD_SIZES = [64, 80, 96, 112, 128, 256] 13 | BLOCK_SIZES = [8, 16, 32] 14 | NUM_BLOCKS = [1024, 36000] # Arbitrary values for testing 15 | NUM_MAPPINGS = [256] # Arbitrary values for testing 16 | SEEDS = [0] 17 | 18 | 19 | @pytest.mark.parametrize("num_mappings", NUM_MAPPINGS) 20 | @pytest.mark.parametrize("num_layers", NUM_LAYERS) 21 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 22 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 23 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 24 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 25 | @pytest.mark.parametrize("dtype", DTYPES) 26 | @pytest.mark.parametrize("seed", SEEDS) 27 | @torch.inference_mode() 28 | def test_copy_blocks( 29 | kv_cache_factory, 
30 | num_mappings: int, 31 | num_layers: int, 32 | num_heads: int, 33 | head_size: int, 34 | block_size: int, 35 | num_blocks: int, 36 | dtype: torch.dtype, 37 | seed: int, 38 | ) -> None: 39 | random.seed(seed) 40 | torch.random.manual_seed(seed) 41 | torch.cuda.manual_seed(seed) 42 | 43 | # Generate random block mappings where each source block is mapped to two 44 | # destination blocks. 45 | assert 2 * num_mappings <= num_blocks 46 | src_blocks = random.sample(range(num_blocks), num_mappings) 47 | remainig_blocks = list(set(range(num_blocks)) - set(src_blocks)) 48 | dst_blocks = random.sample(remainig_blocks, 2 * num_mappings) 49 | block_mapping = {} 50 | for i in range(num_mappings): 51 | src = src_blocks[i] 52 | dst1 = dst_blocks[2 * i] 53 | dst2 = dst_blocks[2 * i + 1] 54 | block_mapping[src] = [dst1, dst2] 55 | 56 | # Create the KV caches. 57 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 58 | num_layers, num_heads, 59 | head_size, dtype, seed) 60 | 61 | # Clone the KV caches. 62 | cloned_key_caches = [key_cache.clone() for key_cache in key_caches] 63 | cloned_value_caches = [value_cache.clone() for value_cache in value_caches] 64 | 65 | # Call the copy blocks kernel. 66 | cache_ops.copy_blocks(key_caches, value_caches, block_mapping) 67 | 68 | # Run the reference implementation. 69 | for src, dsts in block_mapping.items(): 70 | for dst in dsts: 71 | for cloned_key_cache in cloned_key_caches: 72 | cloned_key_cache[dst].copy_(cloned_key_cache[src]) 73 | for cloned_value_cache in cloned_value_caches: 74 | cloned_value_cache[dst].copy_(cloned_value_cache[src]) 75 | 76 | # Compare the results. 77 | for key_cache, cloned_key_cache in zip(key_caches, cloned_key_caches): 78 | assert torch.allclose(key_cache, cloned_key_cache) 79 | for value_cache, cloned_value_cache in zip(value_caches, 80 | cloned_value_caches): 81 | assert torch.allclose(value_cache, cloned_value_cache) 82 | 83 | 84 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 85 | @pytest.mark.parametrize("num_heads", NUM_HEADS) 86 | @pytest.mark.parametrize("head_size", HEAD_SIZES) 87 | @pytest.mark.parametrize("block_size", BLOCK_SIZES) 88 | @pytest.mark.parametrize("num_blocks", NUM_BLOCKS) 89 | @pytest.mark.parametrize("dtype", DTYPES) 90 | @pytest.mark.parametrize("seed", SEEDS) 91 | @torch.inference_mode() 92 | def test_reshape_and_cache( 93 | kv_cache_factory, 94 | num_tokens: int, 95 | num_heads: int, 96 | head_size: int, 97 | block_size: int, 98 | num_blocks: int, 99 | dtype: torch.dtype, 100 | seed: int, 101 | ) -> None: 102 | random.seed(seed) 103 | torch.random.manual_seed(seed) 104 | torch.cuda.manual_seed(seed) 105 | 106 | # Create a random slot mapping. 107 | num_slots = block_size * num_blocks 108 | slot_mapping = random.sample(range(num_slots), num_tokens) 109 | slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device="cuda") 110 | 111 | qkv = torch.randn(num_tokens, 112 | 3, 113 | num_heads, 114 | head_size, 115 | dtype=dtype, 116 | device="cuda") 117 | _, key, value = qkv.unbind(dim=1) 118 | 119 | # Create the KV caches. 120 | key_caches, value_caches = kv_cache_factory(num_blocks, block_size, 1, 121 | num_heads, head_size, dtype, 122 | seed) 123 | key_cache, value_cache = key_caches[0], value_caches[0] 124 | 125 | # Clone the KV caches. 126 | cloned_key_cache = key_cache.clone() 127 | cloned_value_cache = value_cache.clone() 128 | 129 | # Call the reshape_and_cache kernel. 
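    # The kernel scatters each token's key/value into the paged caches at the
    # positions given by `slot_mapping`. Per `create_kv_caches` in
    # tests/kernels/conftest.py, the key cache has shape
    # (num_blocks, num_heads, head_size // x, block_size, x) and the value
    # cache has shape (num_blocks, num_heads, head_size, block_size).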
130 | cache_ops.reshape_and_cache(key, value, key_cache, value_cache, 131 | slot_mapping) 132 | 133 | # Run the reference implementation. 134 | reshaped_key = key.reshape(num_tokens, *key_cache[0, :, :, 0, :].shape) 135 | block_indicies = torch.div(slot_mapping, block_size, rounding_mode="floor") 136 | block_indicies = block_indicies.cpu().tolist() 137 | block_offsets = slot_mapping % block_size 138 | block_offsets = block_offsets.cpu().tolist() 139 | for i in range(num_tokens): 140 | block_idx = block_indicies[i] 141 | block_offset = block_offsets[i] 142 | cloned_key_cache[block_idx, :, :, block_offset, :] = reshaped_key[i] 143 | cloned_value_cache[block_idx, :, :, block_offset] = value[i] 144 | 145 | assert torch.allclose(key_cache, cloned_key_cache) 146 | assert torch.allclose(value_cache, cloned_value_cache) 147 | -------------------------------------------------------------------------------- /tests/kernels/test_layernorm.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | import torch.nn as nn 4 | 5 | from vllm._C import ops 6 | 7 | DTYPES = [torch.half, torch.bfloat16, torch.float] 8 | HIDDEN_SIZES = [67, 768, 2048, 5120, 8192] # Arbitrary values for testing 9 | NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing 10 | SEEDS = [0] 11 | 12 | 13 | class RefRMSNorm(nn.Module): 14 | 15 | def __init__(self, hidden_size, eps=1e-6): 16 | super().__init__() 17 | weight = torch.empty(hidden_size) 18 | weight.normal_(mean=1.0, std=0.1) 19 | self.weight = nn.Parameter(weight) 20 | self.variance_epsilon = eps 21 | 22 | def forward(self, hidden_states): 23 | input_dtype = hidden_states.dtype 24 | hidden_states = hidden_states.to(torch.float32) 25 | variance = hidden_states.pow(2).mean(-1, keepdim=True) 26 | hidden_states = hidden_states * torch.rsqrt(variance + 27 | self.variance_epsilon) 28 | return self.weight * hidden_states.to(input_dtype) 29 | 30 | 31 | @pytest.mark.parametrize("num_tokens", NUM_TOKENS) 32 | @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) 33 | @pytest.mark.parametrize("dtype", DTYPES) 34 | @pytest.mark.parametrize("seed", SEEDS) 35 | @torch.inference_mode() 36 | def test_rms_norm( 37 | num_tokens: int, 38 | hidden_size: int, 39 | dtype: torch.dtype, 40 | seed: int, 41 | ) -> None: 42 | torch.random.manual_seed(seed) 43 | torch.cuda.manual_seed(seed) 44 | 45 | scale = float(hidden_size**-0.5) 46 | x = torch.empty(num_tokens, hidden_size, dtype=dtype, device="cuda") 47 | x.uniform_(-scale, scale) 48 | ref = RefRMSNorm(hidden_size).to(dtype).cuda() 49 | 50 | out = torch.empty_like(x) 51 | ops.rms_norm( 52 | out, 53 | x, 54 | ref.weight.data, 55 | ref.variance_epsilon, 56 | ) 57 | ref_out = ref(x) 58 | assert torch.allclose(out, ref_out, atol=1e-2, rtol=1e-5) 59 | -------------------------------------------------------------------------------- /tests/models/test_models.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using greedy sampling. 2 | 3 | Run `pytest tests/models/test_models.py --forked`. 
4 | """ 5 | import pytest 6 | 7 | MODELS = [ 8 | "facebook/opt-125m", 9 | "meta-llama/Llama-2-7b-hf", 10 | "mistralai/Mistral-7B-v0.1", 11 | "tiiuae/falcon-7b", 12 | "gpt2", 13 | "bigcode/tiny_starcoder_py", 14 | "EleutherAI/gpt-j-6b", 15 | "EleutherAI/pythia-70m", 16 | "bigscience/bloom-560m", 17 | "mosaicml/mpt-7b", 18 | "microsoft/phi-1_5", 19 | ] 20 | 21 | 22 | @pytest.mark.parametrize("model", MODELS) 23 | @pytest.mark.parametrize("dtype", ["half"]) 24 | @pytest.mark.parametrize("max_tokens", [128]) 25 | def test_models( 26 | hf_runner, 27 | vllm_runner, 28 | example_prompts, 29 | model: str, 30 | dtype: str, 31 | max_tokens: int, 32 | ) -> None: 33 | hf_model = hf_runner(model, dtype=dtype) 34 | hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens) 35 | del hf_model 36 | 37 | vllm_model = vllm_runner(model, dtype=dtype) 38 | vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens) 39 | del vllm_model 40 | 41 | for i in range(len(example_prompts)): 42 | hf_output_ids, hf_output_str = hf_outputs[i] 43 | vllm_output_ids, vllm_output_str = vllm_outputs[i] 44 | assert hf_output_str == vllm_output_str, ( 45 | f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}") 46 | assert hf_output_ids == vllm_output_ids, ( 47 | f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}") 48 | -------------------------------------------------------------------------------- /tests/samplers/test_beam_search.py: -------------------------------------------------------------------------------- 1 | """Compare the outputs of HF and vLLM when using beam search. 2 | 3 | Run `pytest tests/samplers/test_beam_search.py --forked`. 4 | """ 5 | import pytest 6 | 7 | # FIXME(zhuohan): The test can not pass if we: 8 | # 1. Increase max_tokens to 256. 9 | # 2. Increase beam_width to 8. 10 | # 3. Use the model "huggyllama/llama-7b". 
11 | MAX_TOKENS = [128] 12 | BEAM_WIDTHS = [4] 13 | MODELS = ["facebook/opt-125m"] 14 | 15 | 16 | @pytest.mark.parametrize("model", MODELS) 17 | @pytest.mark.parametrize("dtype", ["half"]) 18 | @pytest.mark.parametrize("max_tokens", MAX_TOKENS) 19 | @pytest.mark.parametrize("beam_width", BEAM_WIDTHS) 20 | def test_beam_search_single_input( 21 | hf_runner, 22 | vllm_runner, 23 | example_prompts, 24 | model: str, 25 | dtype: str, 26 | max_tokens: int, 27 | beam_width: int, 28 | ) -> None: 29 | hf_model = hf_runner(model, dtype=dtype) 30 | hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width, 31 | max_tokens) 32 | del hf_model 33 | 34 | vllm_model = vllm_runner(model, dtype=dtype) 35 | vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width, 36 | max_tokens) 37 | del vllm_model 38 | 39 | for i in range(len(example_prompts)): 40 | hf_output_ids, _ = hf_outputs[i] 41 | vllm_output_ids, _ = vllm_outputs[i] 42 | assert len(hf_output_ids) == len(vllm_output_ids) 43 | for j in range(len(hf_output_ids)): 44 | assert hf_output_ids[j] == vllm_output_ids[j], ( 45 | f"Test{i} output{j}:\nHF: {hf_output_ids}\n" 46 | f"vLLM: {vllm_output_ids}") 47 | -------------------------------------------------------------------------------- /tests/samplers/test_logprobs.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import torch 3 | 4 | from vllm import SamplingParams 5 | 6 | MODELS = ["facebook/opt-125m"] 7 | 8 | 9 | @pytest.mark.parametrize("model", MODELS) 10 | @pytest.mark.parametrize("dtype", ["half"]) 11 | def test_get_prompt_logprobs( 12 | hf_runner, 13 | vllm_runner, 14 | model, 15 | dtype, 16 | example_prompts, 17 | ): 18 | max_tokens = 5 19 | hf_model = hf_runner(model, dtype=dtype) 20 | hf_logprobs = hf_model.generate_greedy_logprobs( 21 | example_prompts, 22 | max_tokens=max_tokens, 23 | ) 24 | del hf_model 25 | 26 | vllm_model = vllm_runner(model, dtype=dtype) 27 | vllm_sampling_params = SamplingParams(max_tokens=max_tokens, 28 | logprobs=5, 29 | prompt_logprobs=5, 30 | temperature=0.0) 31 | vllm_results = vllm_model.model.generate( 32 | example_prompts, sampling_params=vllm_sampling_params) 33 | 34 | # Test whether logprobs are included in the results. 35 | for result in vllm_results: 36 | assert result.prompt_logprobs is not None 37 | assert result.outputs[0].logprobs is not None 38 | 39 | # Test whether prompt logprobs are consistent with HF 40 | for vllm_result, hf_logprob in zip(vllm_results, hf_logprobs): 41 | # Check prompt logprobs 42 | vllm_prompt_logprobs = vllm_result.prompt_logprobs[1:] 43 | for i, vllm_prompt_logprob_dict in enumerate(vllm_prompt_logprobs): 44 | for token_id, logprob in vllm_prompt_logprob_dict.items(): 45 | torch.testing.assert_close(logprob, 46 | hf_logprob[0][i][token_id].item(), 47 | atol=1e-2, 48 | rtol=1e-2) 49 | vllm_sample_logprobs = vllm_result.outputs[0].logprobs 50 | for i, vllm_sample_logprob_dict in enumerate(vllm_sample_logprobs): 51 | for token_id, logprob in vllm_sample_logprob_dict.items(): 52 | torch.testing.assert_close(logprob, 53 | hf_logprob[i][-1][token_id].item(), 54 | atol=1e-2, 55 | rtol=1e-2) 56 | -------------------------------------------------------------------------------- /tests/test_regression.py: -------------------------------------------------------------------------------- 1 | """Containing tests that check for regressions in vLLM's behavior. 
2 | 3 | It should include tests that are reported by users and making sure they 4 | will never happen again. 5 | 6 | """ 7 | from vllm import LLM, SamplingParams 8 | 9 | 10 | def test_duplicated_ignored_sequence_group(): 11 | """https://github.com/vllm-project/vllm/issues/1655""" 12 | 13 | sampling_params = SamplingParams(temperature=0.01, 14 | top_p=0.1, 15 | max_tokens=256) 16 | llm = LLM(model="facebook/opt-125m", 17 | max_num_batched_tokens=4096, 18 | tensor_parallel_size=1) 19 | prompts = ["This is a short prompt", "This is a very long prompt " * 1000] 20 | outputs = llm.generate(prompts, sampling_params=sampling_params) 21 | 22 | assert len(prompts) == len(outputs) 23 | 24 | 25 | if __name__ == "__main__": 26 | import pytest 27 | pytest.main([__file__]) 28 | -------------------------------------------------------------------------------- /tests/worker/test_worker.py: -------------------------------------------------------------------------------- 1 | import random 2 | import torch 3 | 4 | from vllm.sequence import SamplingParams, SequenceData, SequenceGroupMetadata 5 | from vllm.worker.worker import Worker 6 | 7 | 8 | def test_worker_prepare_inputs_for_prompt(): 9 | worker = Worker(None, None, None) 10 | worker.block_size = 16 11 | batch_size = random.randint(1, 256) 12 | prompt_lens = [] 13 | seq_group_metadata_list = [] 14 | for i in range(batch_size): 15 | # make sure all tokens fit into one block 16 | prompt_len = i % (worker.block_size - 1) + 1 17 | prompt_lens.append(prompt_len) 18 | seq_data = list(range(prompt_len)) 19 | seq_group_metadata_list.append( 20 | SequenceGroupMetadata( 21 | request_id=f"test_{i}", 22 | is_prompt=True, 23 | seq_data={0: SequenceData(seq_data)}, 24 | sampling_params=SamplingParams(temperature=0), 25 | block_tables={0: [1]}, 26 | )) 27 | expected_selected_token_indices = [] 28 | selected_token_start_idx = 0 29 | max_seq_len = max(prompt_lens) 30 | for prompt_len in prompt_lens: 31 | expected_selected_token_indices.append(selected_token_start_idx + 32 | prompt_len - 1) 33 | selected_token_start_idx += max_seq_len 34 | input_tokens, input_positions, input_metadata = worker._prepare_inputs( 35 | seq_group_metadata_list) 36 | assert input_tokens.shape == input_positions.shape == (batch_size, 37 | max_seq_len) 38 | torch.testing.assert_close(input_tokens, input_positions) 39 | actual = input_metadata.selected_token_indices 40 | expected = torch.tensor(expected_selected_token_indices, 41 | device=actual.device, 42 | dtype=actual.dtype) 43 | torch.testing.assert_close(actual, expected) 44 | -------------------------------------------------------------------------------- /vllm/__init__.py: -------------------------------------------------------------------------------- 1 | """vLLM: a high-throughput and memory-efficient inference engine for LLMs""" 2 | 3 | from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs 4 | from vllm.engine.async_llm_engine import AsyncLLMEngine 5 | from vllm.engine.llm_engine import LLMEngine 6 | from vllm.engine.ray_utils import initialize_cluster 7 | from vllm.entrypoints.llm import LLM 8 | from vllm.outputs import CompletionOutput, RequestOutput 9 | from vllm.sampling_params import SamplingParams 10 | 11 | __version__ = "0.2.2" 12 | 13 | __all__ = [ 14 | "LLM", 15 | "SamplingParams", 16 | "RequestOutput", 17 | "CompletionOutput", 18 | "LLMEngine", 19 | "EngineArgs", 20 | "AsyncLLMEngine", 21 | "AsyncEngineArgs", 22 | "initialize_cluster", 23 | ] 24 | 
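For orientation, here is a minimal offline-generation sketch using the public API exported above; the model name and sampling values are illustrative and follow the same pattern as tests/test_regression.py:

from vllm import LLM, SamplingParams

# Illustrative model and sampling settings; any supported model name works.
llm = LLM(model="facebook/opt-125m")
sampling_params = SamplingParams(temperature=0.0, max_tokens=16)
outputs = llm.generate(["Hello, my name is"], sampling_params=sampling_params)
for output in outputs:
    # Each RequestOutput holds one or more CompletionOutput objects.
    print(output.outputs[0].text)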
-------------------------------------------------------------------------------- /vllm/block.py: -------------------------------------------------------------------------------- 1 | """Token blocks.""" 2 | from typing import List 3 | 4 | from vllm.utils import Device 5 | 6 | _BLANK_TOKEN_ID = -1 7 | 8 | 9 | class LogicalTokenBlock: 10 | """A block that stores a contiguous chunk of tokens from left to right. 11 | 12 | Logical blocks are used to represent the states of the corresponding 13 | physical blocks in the KV cache. 14 | """ 15 | 16 | def __init__( 17 | self, 18 | block_number: int, 19 | block_size: int, 20 | ) -> None: 21 | self.block_number = block_number 22 | self.block_size = block_size 23 | 24 | self.token_ids = [_BLANK_TOKEN_ID] * block_size 25 | self.num_tokens = 0 26 | 27 | def is_empty(self) -> bool: 28 | return self.num_tokens == 0 29 | 30 | def get_num_empty_slots(self) -> int: 31 | return self.block_size - self.num_tokens 32 | 33 | def is_full(self) -> bool: 34 | return self.num_tokens == self.block_size 35 | 36 | def append_tokens(self, token_ids: List[int]) -> None: 37 | assert len(token_ids) <= self.get_num_empty_slots() 38 | curr_idx = self.num_tokens 39 | self.token_ids[curr_idx:curr_idx + len(token_ids)] = token_ids 40 | self.num_tokens += len(token_ids) 41 | 42 | def get_token_ids(self) -> List[int]: 43 | return self.token_ids[:self.num_tokens] 44 | 45 | def get_last_token_id(self) -> int: 46 | assert self.num_tokens > 0 47 | return self.token_ids[self.num_tokens - 1] 48 | 49 | 50 | class PhysicalTokenBlock: 51 | """Represents the state of a block in the KV cache.""" 52 | 53 | def __init__( 54 | self, 55 | device: Device, 56 | block_number: int, 57 | block_size: int, 58 | ) -> None: 59 | self.device = device 60 | self.block_number = block_number 61 | self.block_size = block_size 62 | 63 | self.ref_count = 0 64 | 65 | def __repr__(self) -> str: 66 | return (f'PhysicalTokenBlock(device={self.device}, ' 67 | f'block_number={self.block_number}, ' 68 | f'ref_count={self.ref_count})') 69 | -------------------------------------------------------------------------------- /vllm/core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/core/__init__.py -------------------------------------------------------------------------------- /vllm/core/policy.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | 3 | from vllm.sequence import SequenceGroup 4 | 5 | 6 | class Policy: 7 | 8 | def get_priority( 9 | self, 10 | now: float, 11 | seq_group: SequenceGroup, 12 | ) -> float: 13 | raise NotImplementedError 14 | 15 | def sort_by_priority( 16 | self, 17 | now: float, 18 | seq_groups: List[SequenceGroup], 19 | ) -> List[SequenceGroup]: 20 | return sorted( 21 | seq_groups, 22 | key=lambda seq_group: self.get_priority(now, seq_group), 23 | reverse=True, 24 | ) 25 | 26 | 27 | class FCFS(Policy): 28 | 29 | def get_priority( 30 | self, 31 | now: float, 32 | seq_group: SequenceGroup, 33 | ) -> float: 34 | return now - seq_group.arrival_time 35 | 36 | 37 | class PolicyFactory: 38 | 39 | _POLICY_REGISTRY = { 40 | 'fcfs': FCFS, 41 | } 42 | 43 | @classmethod 44 | def get_policy(cls, policy_name: str, **kwargs) -> Policy: 45 | return cls._POLICY_REGISTRY[policy_name](**kwargs) 46 | -------------------------------------------------------------------------------- /vllm/engine/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/engine/__init__.py -------------------------------------------------------------------------------- /vllm/engine/ray_utils.py: -------------------------------------------------------------------------------- 1 | import socket 2 | from typing import Optional, Tuple, TYPE_CHECKING 3 | 4 | from vllm.config import ParallelConfig 5 | from vllm.logger import init_logger 6 | 7 | logger = init_logger(__name__) 8 | 9 | try: 10 | import ray 11 | from ray.air.util.torch_dist import TorchDistributedWorker 12 | 13 | class RayWorker(TorchDistributedWorker): 14 | """Ray wrapper for vllm.worker.Worker, allowing Worker to be 15 | lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES.""" 16 | 17 | def __init__(self, init_cached_hf_modules=False) -> None: 18 | if init_cached_hf_modules: 19 | from transformers.dynamic_module_utils import init_hf_modules 20 | init_hf_modules() 21 | self.worker = None 22 | 23 | def init_worker(self, worker_init_fn): 24 | self.worker = worker_init_fn() 25 | 26 | def __getattr__(self, name): 27 | return getattr(self.worker, name) 28 | 29 | def execute_method(self, method, *args, **kwargs): 30 | executor = getattr(self, method) 31 | return executor(*args, **kwargs) 32 | 33 | except ImportError as e: 34 | logger.warning(f"Failed to import Ray with {e!r}. " 35 | "For distributed inference, please install Ray with " 36 | "`pip install ray pandas pyarrow`.") 37 | ray = None 38 | TorchDistributedWorker = None 39 | RayWorker = None 40 | 41 | if TYPE_CHECKING: 42 | from ray.util.placement_group import PlacementGroup 43 | 44 | 45 | def get_open_port(): 46 | with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: 47 | s.bind(("", 0)) 48 | return s.getsockname()[1] 49 | 50 | 51 | def initialize_cluster( 52 | parallel_config: ParallelConfig, 53 | engine_use_ray: bool = False, 54 | ray_address: Optional[str] = None, 55 | ) -> Tuple[str, Optional["PlacementGroup"]]: 56 | """Initialize the distributed cluster probably with Ray. 57 | 58 | Args: 59 | parallel_config: The configurations for parallel execution. 60 | engine_use_ray: Whether to use Ray for async engine. 61 | ray_address: The address of the Ray cluster. If None, uses 62 | the default Ray cluster address. 63 | 64 | Returns: 65 | A tuple of (`distributed_init_method`, `placement_group`). The 66 | `distributed_init_method` is the address for initializing the 67 | distributed backend. `placement_group` includes the specification 68 | of the resources for each distributed worker. 69 | """ 70 | if parallel_config.worker_use_ray or engine_use_ray: 71 | if ray is None: 72 | raise ImportError( 73 | "Ray is not installed. Please install Ray to use distributed " 74 | "serving.") 75 | # Connect to a ray cluster. 76 | ray.init(address=ray_address, ignore_reinit_error=True) 77 | 78 | if not parallel_config.worker_use_ray: 79 | # Initialize cluster locally. 80 | port = get_open_port() 81 | # We need to setup the distributed init method to make sure 82 | # the distributed megatron code (e.g., get world size) works correctly. 83 | distributed_init_method = f"tcp://localhost:{port}" 84 | return distributed_init_method, None 85 | 86 | current_placement_group = ray.util.get_current_placement_group() 87 | if current_placement_group: 88 | # We are in a placement group 89 | bundles = current_placement_group.bundle_specs 90 | # Verify that we can use the placement group. 
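        # Each bundle may reserve at most one GPU, and one GPU bundle is needed
        # per worker, so world_size must not exceed the GPU bundle count.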
91 | gpu_bundles = 0 92 | for bundle in bundles: 93 | bundle_gpus = bundle.get("GPU", 0) 94 | if bundle_gpus > 1: 95 | raise ValueError( 96 | "Placement group bundle cannot have more than 1 GPU.") 97 | if bundle_gpus: 98 | gpu_bundles += 1 99 | if parallel_config.world_size > gpu_bundles: 100 | raise ValueError( 101 | "The number of required GPUs exceeds the total number of " 102 | "available GPUs in the placement group.") 103 | else: 104 | num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0) 105 | if parallel_config.world_size > num_gpus_in_cluster: 106 | raise ValueError( 107 | "The number of required GPUs exceeds the total number of " 108 | "available GPUs in the cluster.") 109 | # Create a new placement group 110 | current_placement_group = ray.util.placement_group([{ 111 | "GPU": 1 112 | }] * parallel_config.world_size) 113 | # Wait until PG is ready - this will block until all 114 | # requested resources are available, and will timeout 115 | # if they cannot be provisioned. 116 | ray.get(current_placement_group.ready(), timeout=1800) 117 | 118 | return None, current_placement_group 119 | -------------------------------------------------------------------------------- /vllm/entrypoints/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/entrypoints/__init__.py -------------------------------------------------------------------------------- /vllm/entrypoints/api_server.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from typing import AsyncGenerator 4 | 5 | from fastapi import FastAPI, Request 6 | from fastapi.responses import JSONResponse, Response, StreamingResponse 7 | import uvicorn 8 | 9 | from vllm.engine.arg_utils import AsyncEngineArgs 10 | from vllm.engine.async_llm_engine import AsyncLLMEngine 11 | from vllm.sampling_params import SamplingParams 12 | from vllm.utils import random_uuid 13 | 14 | TIMEOUT_KEEP_ALIVE = 5 # seconds. 15 | TIMEOUT_TO_PREVENT_DEADLOCK = 1 # seconds. 16 | app = FastAPI() 17 | engine = None 18 | 19 | 20 | @app.get("/health") 21 | async def health() -> Response: 22 | """Health check.""" 23 | return Response(status_code=200) 24 | 25 | 26 | @app.post("/generate") 27 | async def generate(request: Request) -> Response: 28 | """Generate completion for the request. 29 | 30 | The request should be a JSON object with the following fields: 31 | - prompt: the prompt to use for the generation. 32 | - stream: whether to stream the results or not. 33 | - other fields: the sampling parameters (See `SamplingParams` for details). 
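    Example request body (illustrative; any other `SamplingParams` field,
    e.g. `top_p` or `ignore_eos`, may be included as well):

        {"prompt": "Hello world", "stream": false, "max_tokens": 100, "temperature": 0}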
34 | """ 35 | request_dict = await request.json() 36 | prompt = request_dict.pop("prompt") 37 | stream = request_dict.pop("stream", False) 38 | sampling_params = SamplingParams(**request_dict) 39 | request_id = random_uuid() 40 | 41 | results_generator = engine.generate(prompt, sampling_params, request_id) 42 | 43 | # Streaming case 44 | async def stream_results() -> AsyncGenerator[bytes, None]: 45 | async for request_output in results_generator: 46 | prompt = request_output.prompt 47 | text_outputs = [ 48 | prompt + output.text for output in request_output.outputs 49 | ] 50 | ret = {"text": text_outputs} 51 | yield (json.dumps(ret) + "\0").encode("utf-8") 52 | 53 | if stream: 54 | return StreamingResponse(stream_results()) 55 | 56 | # Non-streaming case 57 | final_output = None 58 | async for request_output in results_generator: 59 | if await request.is_disconnected(): 60 | # Abort the request if the client disconnects. 61 | await engine.abort(request_id) 62 | return Response(status_code=499) 63 | final_output = request_output 64 | 65 | assert final_output is not None 66 | prompt = final_output.prompt 67 | text_outputs = [prompt + output.text for output in final_output.outputs] 68 | ret = {"text": text_outputs} 69 | return JSONResponse(ret) 70 | 71 | 72 | if __name__ == "__main__": 73 | parser = argparse.ArgumentParser() 74 | parser.add_argument("--host", type=str, default=None) 75 | parser.add_argument("--port", type=int, default=8000) 76 | parser = AsyncEngineArgs.add_cli_args(parser) 77 | args = parser.parse_args() 78 | 79 | engine_args = AsyncEngineArgs.from_cli_args(args) 80 | engine = AsyncLLMEngine.from_engine_args(engine_args) 81 | 82 | uvicorn.run(app, 83 | host=args.host, 84 | port=args.port, 85 | log_level="debug", 86 | timeout_keep_alive=TIMEOUT_KEEP_ALIVE) 87 | -------------------------------------------------------------------------------- /vllm/entrypoints/openai/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/entrypoints/openai/__init__.py -------------------------------------------------------------------------------- /vllm/logger.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://github.com/skypilot-org/skypilot/blob/86dc0f6283a335e4aa37b3c10716f90999f48ab6/sky/sky_logging.py 3 | """Logging configuration for vLLM.""" 4 | import logging 5 | import sys 6 | 7 | _FORMAT = "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s" 8 | _DATE_FORMAT = "%m-%d %H:%M:%S" 9 | 10 | 11 | class NewLineFormatter(logging.Formatter): 12 | """Adds logging prefix to newlines to align multi-line messages.""" 13 | 14 | def __init__(self, fmt, datefmt=None): 15 | logging.Formatter.__init__(self, fmt, datefmt) 16 | 17 | def format(self, record): 18 | msg = logging.Formatter.format(self, record) 19 | if record.message != "": 20 | parts = msg.split(record.message) 21 | msg = msg.replace("\n", "\r\n" + parts[0]) 22 | return msg 23 | 24 | 25 | _root_logger = logging.getLogger("vllm") 26 | _default_handler = None 27 | 28 | 29 | def _setup_logger(): 30 | _root_logger.setLevel(logging.DEBUG) 31 | global _default_handler 32 | if _default_handler is None: 33 | _default_handler = logging.StreamHandler(sys.stdout) 34 | _default_handler.flush = sys.stdout.flush # type: ignore 35 | _default_handler.setLevel(logging.INFO) 36 | _root_logger.addHandler(_default_handler) 37 | fmt = 
NewLineFormatter(_FORMAT, datefmt=_DATE_FORMAT) 38 | _default_handler.setFormatter(fmt) 39 | # Setting this will avoid the message 40 | # being propagated to the parent logger. 41 | _root_logger.propagate = False 42 | 43 | 44 | # The logger is initialized when the module is imported. 45 | # This is thread-safe as the module is only imported once, 46 | # guaranteed by the Python GIL. 47 | _setup_logger() 48 | 49 | 50 | def init_logger(name: str): 51 | # Use the same settings as above for root logger 52 | logger = logging.getLogger(name) 53 | logger.setLevel(logging.DEBUG) 54 | logger.addHandler(_default_handler) 55 | logger.propagate = False 56 | return logger 57 | -------------------------------------------------------------------------------- /vllm/model_executor/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.input_metadata import InputMetadata 2 | from vllm.model_executor.model_loader import get_model 3 | from vllm.model_executor.utils import set_random_seed 4 | 5 | __all__ = [ 6 | "InputMetadata", 7 | "get_model", 8 | "set_random_seed", 9 | ] 10 | -------------------------------------------------------------------------------- /vllm/model_executor/input_metadata.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, List, Optional, Tuple 2 | 3 | import torch 4 | from xformers.ops import AttentionBias 5 | 6 | from vllm.sampling_params import SamplingParams, SamplingType 7 | from vllm.sequence import SequenceData 8 | 9 | 10 | class InputMetadata: 11 | """Metadata for input sequences. Used for PagedAttention. 12 | 13 | Args: 14 | seq_groups: List of (seq_ids, sampling_params). 15 | seq_data: Seq_id -> SequenceData. 16 | prompt_lens: Lengths of prompts. 17 | slot_mapping: The address to write the new KV to of each token. 18 | context_lens: the length of attention context for each generation token. 19 | max_context_len: The maximum context length. 20 | block_tables: The block tables. (Seq id -> list of physical block) 21 | """ 22 | 23 | def __init__( 24 | self, 25 | seq_groups: List[Tuple[List[int], SamplingParams]], 26 | seq_data: Dict[int, SequenceData], 27 | prompt_lens: List[int], 28 | slot_mapping: torch.Tensor, 29 | context_lens: torch.Tensor, 30 | max_context_len: int, 31 | block_tables: torch.Tensor, 32 | selected_token_indices: torch.Tensor, 33 | categorized_sample_indices: Dict[SamplingType, torch.Tensor], 34 | sliding_window: Optional[int] = None, 35 | ) -> None: 36 | self.seq_groups = seq_groups 37 | self.seq_data = seq_data 38 | self.prompt_lens = prompt_lens 39 | self.slot_mapping = slot_mapping 40 | self.context_lens = context_lens 41 | self.max_context_len = max_context_len 42 | self.block_tables = block_tables 43 | self.selected_token_indices = selected_token_indices 44 | self.categorized_sample_indices = categorized_sample_indices 45 | 46 | self.max_prompt_len = max(prompt_lens) if prompt_lens else 0 47 | self.to_cache = None 48 | if sliding_window is not None: 49 | # We need to keep the positions of sliding windows within 50 | # the key / value tables, this is helpful to know which 51 | # elements we need to cache. 
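            # Only the last `sliding_window` tokens of each (padded) prompt are
            # written to the cache; every slot after the prompt region, i.e.
            # the generation tokens, is always cached.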
52 | to_cache, start_idx = [], 0 53 | for prompt_len in self.prompt_lens: 54 | to_cache.extend( 55 | range( 56 | start_idx + max(0, prompt_len - sliding_window), 57 | start_idx + prompt_len, 58 | )) 59 | start_idx += self.max_prompt_len 60 | to_cache.extend(range(start_idx, slot_mapping.shape[0])) 61 | self.to_cache = torch.tensor(to_cache, 62 | dtype=torch.int32, 63 | device=self.slot_mapping.device) 64 | 65 | self.num_prompts = len(prompt_lens) 66 | self.num_prompt_tokens = self.num_prompts * self.max_prompt_len 67 | self.num_generation_tokens = context_lens.shape[0] 68 | if block_tables.numel() > 0: 69 | self.max_num_blocks_per_seq = block_tables.shape[1] 70 | else: 71 | self.max_num_blocks_per_seq = 0 72 | assert block_tables.shape[0] == self.num_generation_tokens 73 | 74 | # Set during the execution of the first attention op. 75 | self.attn_bias: Optional[AttentionBias] = None 76 | 77 | def __repr__(self) -> str: 78 | # Print only useful metadata. 79 | return ( 80 | f'InputMetadata(' 81 | f'num_prompt_tokens={self.num_prompt_tokens}, ' 82 | f'num_prompts={self.num_prompts}, ' 83 | f'prompt_lens={self.prompt_lens}, ' 84 | f'num_generation_tokens={self.num_generation_tokens}, ' 85 | f'context_lens={self.context_lens}, ' 86 | f'max_context_len={self.max_context_len}), ' 87 | f'max_num_blocks_per_seq={self.max_num_blocks_per_seq}, ' 88 | f'block_tables={self.block_tables}, ' 89 | f'selected_token_indices={self.selected_token_indices}, ' 90 | f'categorized_sample_indices={self.categorized_sample_indices}, ' 91 | f'slot_mapping={self.slot_mapping})') 92 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/model_executor/layers/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/layers/activation.py: -------------------------------------------------------------------------------- 1 | """Custom activation functions.""" 2 | from typing import Optional 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from vllm._C import ops 8 | from vllm.model_executor.layers.quantization import QuantizationConfig 9 | from vllm.model_executor.parallel_utils.parallel_state import ( 10 | get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size) 11 | from vllm.model_executor.parallel_utils.utils import divide 12 | from vllm.model_executor.utils import set_weight_attrs 13 | 14 | 15 | class SiluAndMul(nn.Module): 16 | """An activation function for SwiGLU. 17 | 18 | The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2. 
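    An eager-mode reference (cf. `ref_silu_and_mul` in
    tests/kernels/test_activation.py) splits x in half along the last
    dimension and returns F.silu(x1) * x2.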
19 | 20 | Shapes: 21 | x: (batch_size, seq_len, 2 * d) or (num_tokens, 2 * d) 22 | return: (batch_size, seq_len, d) or (num_tokens, d) 23 | """ 24 | 25 | def forward(self, x: torch.Tensor) -> torch.Tensor: 26 | d = x.shape[-1] // 2 27 | output_shape = (x.shape[:-1] + (d, )) 28 | out = torch.empty(output_shape, dtype=x.dtype, device=x.device) 29 | ops.silu_and_mul(out, x) 30 | return out 31 | 32 | 33 | class NewGELU(nn.Module): 34 | 35 | def forward(self, x: torch.Tensor) -> torch.Tensor: 36 | out = torch.empty_like(x) 37 | ops.gelu_new(out, x) 38 | return out 39 | 40 | 41 | class FastGELU(nn.Module): 42 | 43 | def forward(self, x: torch.Tensor) -> torch.Tensor: 44 | out = torch.empty_like(x) 45 | ops.gelu_fast(out, x) 46 | return out 47 | 48 | 49 | class ScaledActivation(nn.Module): 50 | """An activation function with post-scale parameters. 51 | 52 | This is used for some quantization methods like AWQ. 53 | """ 54 | 55 | def __init__( 56 | self, 57 | act_module: nn.Module, 58 | intermediate_size: int, 59 | input_is_parallel: bool = True, 60 | params_dtype: Optional[torch.dtype] = None, 61 | ): 62 | super().__init__() 63 | self.act = act_module 64 | self.input_is_parallel = input_is_parallel 65 | if input_is_parallel: 66 | tp_size = get_tensor_model_parallel_world_size() 67 | intermediate_size_per_partition = divide(intermediate_size, 68 | tp_size) 69 | else: 70 | intermediate_size_per_partition = intermediate_size 71 | if params_dtype is None: 72 | params_dtype = torch.get_default_dtype() 73 | self.scales = nn.Parameter( 74 | torch.empty(intermediate_size_per_partition, 75 | dtype=params_dtype, 76 | device="cuda")) 77 | set_weight_attrs(self.scales, {"weight_loader": self.weight_loader}) 78 | 79 | def forward(self, x: torch.Tensor) -> torch.Tensor: 80 | return self.act(x) / self.scales 81 | 82 | def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor): 83 | param_data = param.data 84 | if self.input_is_parallel: 85 | tp_rank = get_tensor_model_parallel_rank() 86 | shard_size = param_data.shape[0] 87 | start_idx = tp_rank * shard_size 88 | loaded_weight = loaded_weight.narrow(0, start_idx, shard_size) 89 | assert param_data.shape == loaded_weight.shape 90 | param_data.copy_(loaded_weight) 91 | 92 | 93 | _ACTIVATION_REGISTRY = { 94 | "gelu": nn.GELU(), 95 | "gelu_fast": FastGELU(), 96 | "gelu_new": NewGELU(), 97 | "gelu_pytorch_tanh": nn.GELU(approximate="tanh"), 98 | "relu": nn.ReLU(), 99 | } 100 | 101 | 102 | def get_act_fn( 103 | act_fn_name: str, 104 | quant_config: Optional[QuantizationConfig] = None, 105 | intermediate_size: Optional[int] = None, 106 | input_is_parallel: bool = True, 107 | params_dtype: Optional[torch.dtype] = None, 108 | ) -> nn.Module: 109 | """Get an activation function by name.""" 110 | act_fn_name = act_fn_name.lower() 111 | if act_fn_name not in _ACTIVATION_REGISTRY: 112 | raise ValueError( 113 | f"Activation function {act_fn_name!r} is not supported.") 114 | 115 | act_fn = _ACTIVATION_REGISTRY[act_fn_name] 116 | if (quant_config is not None 117 | and act_fn_name in quant_config.get_scaled_act_names()): 118 | if intermediate_size is None: 119 | raise ValueError("intermediate_size must be specified for scaled " 120 | "activation functions.") 121 | return ScaledActivation(act_fn, intermediate_size, input_is_parallel, 122 | params_dtype) 123 | return act_fn 124 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/layernorm.py: 
-------------------------------------------------------------------------------- 1 | """Custom normalization layers.""" 2 | from typing import Optional, Tuple, Union 3 | 4 | import torch 5 | import torch.nn as nn 6 | 7 | from vllm._C import ops 8 | 9 | 10 | class RMSNorm(nn.Module): 11 | """Root mean square normalization. 12 | 13 | Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight. 14 | Refer to https://arxiv.org/abs/1910.07467 15 | """ 16 | 17 | def __init__( 18 | self, 19 | hidden_size: int, 20 | eps: float = 1e-6, 21 | ) -> None: 22 | super().__init__() 23 | self.weight = nn.Parameter(torch.ones(hidden_size)) 24 | self.variance_epsilon = eps 25 | 26 | def forward( 27 | self, 28 | x: torch.Tensor, 29 | residual: Optional[torch.Tensor] = None, 30 | ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: 31 | if residual is not None: 32 | ops.fused_add_rms_norm( 33 | x, 34 | residual, 35 | self.weight.data, 36 | self.variance_epsilon, 37 | ) 38 | return x, residual 39 | out = torch.empty_like(x) 40 | ops.rms_norm( 41 | out, 42 | x, 43 | self.weight.data, 44 | self.variance_epsilon, 45 | ) 46 | return out 47 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/__init__.py: -------------------------------------------------------------------------------- 1 | from typing import Type 2 | 3 | from vllm.model_executor.layers.quantization.awq import AWQConfig 4 | from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig 5 | from vllm.model_executor.layers.quantization.base_config import QuantizationConfig 6 | 7 | _QUANTIZATION_CONFIG_REGISTRY = { 8 | "awq": AWQConfig, 9 | "squeezellm": SqueezeLLMConfig, 10 | } 11 | 12 | 13 | def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: 14 | if quantization not in _QUANTIZATION_CONFIG_REGISTRY: 15 | raise ValueError(f"Invalid quantization method: {quantization}") 16 | return _QUANTIZATION_CONFIG_REGISTRY[quantization] 17 | 18 | 19 | __all__ = [ 20 | "QuantizationConfig", 21 | "get_quantization_config", 22 | ] 23 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/base_config.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Any, Dict, List 3 | 4 | import torch 5 | 6 | from vllm.model_executor.layers.linear import LinearMethodBase 7 | 8 | 9 | class QuantizationConfig(ABC): 10 | """Base class for quantization configs.""" 11 | 12 | @abstractmethod 13 | def get_name(self) -> str: 14 | """Name of the quantization method.""" 15 | raise NotImplementedError 16 | 17 | @abstractmethod 18 | def get_supported_act_dtypes(self) -> List[torch.dtype]: 19 | """List of supported activation dtypes.""" 20 | raise NotImplementedError 21 | 22 | @abstractmethod 23 | def get_min_capability(self) -> int: 24 | """Minimum GPU capability to support the quantization method. 25 | 26 | E.g., 70 for Volta, 75 for Turing, 80 for Ampere. 27 | This requirement is due to the custom CUDA kernels used by the 28 | quantization method. 
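        At load time, `get_model` in vllm/model_executor/model_loader.py
        compares this value against torch.cuda.get_device_capability(),
        encoded as major * 10 + minor.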
29 | """ 30 | raise NotImplementedError 31 | 32 | @staticmethod 33 | @abstractmethod 34 | def get_config_filenames() -> List[str]: 35 | """List of filenames to search for in the model directory.""" 36 | raise NotImplementedError 37 | 38 | @classmethod 39 | @abstractmethod 40 | def from_config(cls, config: Dict[str, Any]) -> "QuantizationConfig": 41 | """Create a config class from the model's quantization config.""" 42 | raise NotImplementedError 43 | 44 | @staticmethod 45 | def get_from_keys(config: Dict[str, Any], keys: List[str]) -> Any: 46 | """Get a value from the model's quantization config.""" 47 | for key in keys: 48 | if key in config: 49 | return config[key] 50 | raise ValueError(f"Cannot find any of {keys} in the model's " 51 | "quantization config.") 52 | 53 | @abstractmethod 54 | def get_linear_method(self) -> LinearMethodBase: 55 | """Get the linear method to use for the quantized linear layer.""" 56 | raise NotImplementedError 57 | 58 | @abstractmethod 59 | def get_scaled_act_names(self) -> List[str]: 60 | """Returns the activation function names that should be post-scaled. 61 | 62 | For now, this is only used by AWQ. 63 | """ 64 | raise NotImplementedError 65 | -------------------------------------------------------------------------------- /vllm/model_executor/layers/quantization/squeezellm.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Dict, List, Optional 2 | 3 | import torch 4 | from torch.nn.parameter import Parameter 5 | 6 | from vllm._C import ops 7 | from vllm.model_executor.layers.linear import (LinearMethodBase, 8 | set_weight_attrs) 9 | from vllm.model_executor.layers.quantization.base_config import QuantizationConfig 10 | 11 | 12 | class SqueezeLLMConfig(QuantizationConfig): 13 | """Config class for SqueezeLLM. 14 | 15 | Reference: https://arxiv.org/pdf/2306.07629 16 | """ 17 | 18 | def __init__( 19 | self, 20 | weight_bits: int, 21 | ) -> None: 22 | self.weight_bits = weight_bits 23 | 24 | if self.weight_bits != 4: 25 | raise ValueError( 26 | "Currently, only 4-bit weight quantization is supported for " 27 | f"SqueezeLLM, but got {self.weight_bits} bits.") 28 | 29 | self.pack_factor = 32 // self.weight_bits 30 | 31 | def __repr__(self) -> str: 32 | return f"SqueezeLLMConfig(weight_bits={self.weight_bits})" 33 | 34 | def get_name(self) -> str: 35 | return "squeezellm" 36 | 37 | def get_supported_act_dtypes(self) -> List[torch.dtype]: 38 | return [torch.half] 39 | 40 | def get_min_capability(self) -> int: 41 | return 70 42 | 43 | @staticmethod 44 | def get_config_filenames() -> List[str]: 45 | return ["quant_config.json"] 46 | 47 | @classmethod 48 | def from_config(cls, config: Dict[str, Any]) -> "SqueezeLLMConfig": 49 | weight_bits = cls.get_from_keys(config, ["wbits"]) 50 | return cls(weight_bits) 51 | 52 | def get_linear_method(self) -> "SqueezeLLMLinearMethod": 53 | return SqueezeLLMLinearMethod(self) 54 | 55 | def get_scaled_act_names(self) -> List[str]: 56 | return [] 57 | 58 | 59 | class SqueezeLLMLinearMethod(LinearMethodBase): 60 | """Linear method for SqueezeLLM. 61 | 62 | Args: 63 | quant_config: The SqueezeLLM quantization config. 
64 | """ 65 | 66 | def __init__(self, quant_config: SqueezeLLMConfig): 67 | self.quant_config = quant_config 68 | 69 | def create_weights(self, input_size: int, output_size: int, 70 | params_dtype: torch.dtype) -> Dict[str, torch.Tensor]: 71 | if input_size % self.quant_config.pack_factor != 0: 72 | raise ValueError( 73 | "The input size is not aligned with the quantized " 74 | "weight shape. This can be caused by too large " 75 | "tensor parallel size.") 76 | qweight = Parameter( 77 | torch.empty( 78 | input_size // self.quant_config.pack_factor, 79 | output_size, 80 | device="cuda", 81 | dtype=torch.int32, 82 | ), 83 | requires_grad=False, 84 | ) 85 | set_weight_attrs( 86 | qweight, { 87 | "input_dim": 0, 88 | "output_dim": 1, 89 | "packed_dim": 0, 90 | "pack_factor": self.quant_config.pack_factor, 91 | }) 92 | lookup_table = Parameter( 93 | torch.empty( 94 | output_size, 95 | self.quant_config.weight_bits**2, 96 | device="cuda", 97 | dtype=params_dtype, 98 | ), 99 | requires_grad=False, 100 | ) 101 | set_weight_attrs(lookup_table, { 102 | "output_dim": 0, 103 | }) 104 | return { 105 | "qweight": qweight, 106 | "lookup_table": lookup_table, 107 | } 108 | 109 | def apply_weights(self, 110 | weights: Dict[str, torch.Tensor], 111 | x: torch.Tensor, 112 | bias: Optional[torch.Tensor] = None) -> torch.Tensor: 113 | qweight = weights["qweight"] 114 | lookup_table = weights["lookup_table"] 115 | out_shape = x.shape[:-1] + (qweight.shape[-1], ) 116 | reshaped_x = x.reshape(-1, x.shape[-1]) 117 | # NOTE: The output tensor should be zero-initialized. 118 | out = torch.zeros(out_shape, device="cuda", dtype=torch.float16) 119 | ops.squeezellm_gemm(reshaped_x, qweight, out, lookup_table) 120 | 121 | if bias is not None: 122 | out = out + bias 123 | return out.reshape(out_shape) 124 | -------------------------------------------------------------------------------- /vllm/model_executor/model_loader.py: -------------------------------------------------------------------------------- 1 | """Utilities for selecting and loading models.""" 2 | import contextlib 3 | from typing import Type 4 | 5 | import torch 6 | import torch.nn as nn 7 | from transformers import PretrainedConfig 8 | 9 | from vllm.config import ModelConfig 10 | from vllm.model_executor.models import * 11 | from vllm.model_executor.weight_utils import (get_quant_config, 12 | initialize_dummy_weights) 13 | 14 | # TODO(woosuk): Lazy-load the model classes. 
15 | _MODEL_REGISTRY = { 16 | "AquilaModel": AquilaForCausalLM, 17 | "AquilaForCausalLM": AquilaForCausalLM, # AquilaChat2 18 | "BaiChuanForCausalLM": BaiChuanForCausalLM, # baichuan-7b 19 | "BaichuanForCausalLM": BaichuanForCausalLM, # baichuan-13b 20 | "BloomForCausalLM": BloomForCausalLM, 21 | "ChatGLMModel": ChatGLMForCausalLM, 22 | "FalconForCausalLM": FalconForCausalLM, 23 | "GPT2LMHeadModel": GPT2LMHeadModel, 24 | "GPTBigCodeForCausalLM": GPTBigCodeForCausalLM, 25 | "GPTJForCausalLM": GPTJForCausalLM, 26 | "GPTNeoXForCausalLM": GPTNeoXForCausalLM, 27 | "InternLMForCausalLM": InternLMForCausalLM, 28 | "LlamaForCausalLM": LlamaForCausalLM, 29 | "LLaMAForCausalLM": LlamaForCausalLM, # For decapoda-research/llama-* 30 | "MistralForCausalLM": MistralForCausalLM, 31 | # transformers's mpt class has lower case 32 | "MptForCausalLM": MPTForCausalLM, 33 | "MPTForCausalLM": MPTForCausalLM, 34 | "OPTForCausalLM": OPTForCausalLM, 35 | "PhiForCausalLM": PhiForCausalLM, 36 | "QWenLMHeadModel": QWenLMHeadModel, 37 | "RWForCausalLM": FalconForCausalLM, 38 | "YiForCausalLM": YiForCausalLM, 39 | } 40 | 41 | 42 | @contextlib.contextmanager 43 | def _set_default_torch_dtype(dtype: torch.dtype): 44 | """Sets the default torch dtype to the given dtype.""" 45 | old_dtype = torch.get_default_dtype() 46 | torch.set_default_dtype(dtype) 47 | yield 48 | torch.set_default_dtype(old_dtype) 49 | 50 | 51 | def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]: 52 | architectures = getattr(config, "architectures", []) 53 | for arch in architectures: 54 | if arch in _MODEL_REGISTRY: 55 | return _MODEL_REGISTRY[arch] 56 | raise ValueError( 57 | f"Model architectures {architectures} are not supported for now. " 58 | f"Supported architectures: {list(_MODEL_REGISTRY.keys())}") 59 | 60 | 61 | def get_model(model_config: ModelConfig) -> nn.Module: 62 | model_class = _get_model_architecture(model_config.hf_config) 63 | 64 | # Get the (maybe quantized) linear method. 65 | linear_method = None 66 | if model_config.quantization is not None: 67 | quant_config = get_quant_config(model_config.quantization, 68 | model_config.model, 69 | model_config.hf_config, 70 | model_config.download_dir) 71 | capability = torch.cuda.get_device_capability() 72 | capability = capability[0] * 10 + capability[1] 73 | if capability < quant_config.get_min_capability(): 74 | raise ValueError( 75 | f"The quantization method {model_config.quantization} is not " 76 | "supported for the current GPU. " 77 | f"Minimum capability: {quant_config.get_min_capability()}. " 78 | f"Current capability: {capability}.") 79 | supported_dtypes = quant_config.get_supported_act_dtypes() 80 | if model_config.dtype not in supported_dtypes: 81 | raise ValueError( 82 | f"{model_config.dtype} is not supported for quantization " 83 | f"method {model_config.quantization}. Supported dtypes: " 84 | f"{supported_dtypes}") 85 | linear_method = quant_config.get_linear_method() 86 | 87 | with _set_default_torch_dtype(model_config.dtype): 88 | # Create a model instance. 89 | # The weights will be initialized as empty tensors. 90 | model = model_class(model_config.hf_config, linear_method) 91 | if model_config.load_format == "dummy": 92 | model = model.cuda() 93 | # NOTE(woosuk): For accurate performance evaluation, we assign 94 | # random values to the weights. 95 | initialize_dummy_weights(model) 96 | else: 97 | # Load the weights from the cached or downloaded files. 
98 | model.load_weights(model_config.model, model_config.download_dir, 99 | model_config.load_format, model_config.revision) 100 | model = model.cuda() 101 | return model.eval() 102 | -------------------------------------------------------------------------------- /vllm/model_executor/models/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.model_executor.models.aquila import AquilaForCausalLM 2 | from vllm.model_executor.models.baichuan import (BaiChuanForCausalLM, 3 | BaichuanForCausalLM) 4 | from vllm.model_executor.models.bloom import BloomForCausalLM 5 | from vllm.model_executor.models.falcon import FalconForCausalLM 6 | from vllm.model_executor.models.gpt2 import GPT2LMHeadModel 7 | from vllm.model_executor.models.gpt_bigcode import GPTBigCodeForCausalLM 8 | from vllm.model_executor.models.gpt_j import GPTJForCausalLM 9 | from vllm.model_executor.models.gpt_neox import GPTNeoXForCausalLM 10 | from vllm.model_executor.models.internlm import InternLMForCausalLM 11 | from vllm.model_executor.models.llama import LlamaForCausalLM 12 | from vllm.model_executor.models.mistral import MistralForCausalLM 13 | from vllm.model_executor.models.mpt import MPTForCausalLM 14 | from vllm.model_executor.models.opt import OPTForCausalLM 15 | from vllm.model_executor.models.phi_1_5 import PhiForCausalLM 16 | from vllm.model_executor.models.qwen import QWenLMHeadModel 17 | from vllm.model_executor.models.chatglm import ChatGLMForCausalLM 18 | from vllm.model_executor.models.yi import YiForCausalLM 19 | 20 | __all__ = [ 21 | "AquilaForCausalLM", 22 | "BaiChuanForCausalLM", 23 | "BaichuanForCausalLM", 24 | "BloomForCausalLM", 25 | "ChatGLMForCausalLM", 26 | "FalconForCausalLM", 27 | "GPT2LMHeadModel", 28 | "GPTBigCodeForCausalLM", 29 | "GPTJForCausalLM", 30 | "GPTNeoXForCausalLM", 31 | "InternLMForCausalLM", 32 | "LlamaForCausalLM", 33 | "MPTForCausalLM", 34 | "OPTForCausalLM", 35 | "PhiForCausalLM", 36 | "QWenLMHeadModel", 37 | "MistralForCausalLM", 38 | "YiForCausalLM", 39 | ] 40 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/README.md: -------------------------------------------------------------------------------- 1 | The files in this folder are ported from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM/tree/main/megatron/core). We only keep the codes that are used in inference. -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/model_executor/parallel_utils/__init__.py -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/communication_op.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from vllm.model_executor.parallel_utils.parallel_state import ( 4 | get_tensor_model_parallel_world_size, 5 | get_tensor_model_parallel_group, 6 | ) 7 | 8 | 9 | def tensor_model_parallel_all_reduce(input_): 10 | """All-reduce the input tensor across model parallel group. 11 | 12 | NOTE: This operation is applied in-place on the input tensor. 13 | """ 14 | # Bypass the function if we are using only 1 GPU. 15 | if get_tensor_model_parallel_world_size() == 1: 16 | return input_ 17 | # All-reduce. 
18 | torch.distributed.all_reduce(input_, 19 | group=get_tensor_model_parallel_group()) 20 | return input_ 21 | 22 | 23 | def tensor_model_parallel_all_gather(input_, dim=-1): 24 | """All-gather the input tensor across model parallel group.""" 25 | world_size = get_tensor_model_parallel_world_size() 26 | # Bypass the function if we are using only 1 GPU. 27 | if world_size == 1: 28 | return input_ 29 | assert -input_.dim() <= dim < input_.dim(), ( 30 | f"Invalid dim ({dim}) for input tensor with shape {input_.size()}") 31 | if dim < 0: 32 | # Convert negative dim to positive. 33 | dim += input_.dim() 34 | input_size = input_.size() 35 | # Allocate output tensor. 36 | output_tensor = torch.empty((world_size, ) + input_size, 37 | dtype=input_.dtype, 38 | device=input_.device) 39 | # All-gather. 40 | torch.distributed.all_gather_into_tensor( 41 | output_tensor, input_, group=get_tensor_model_parallel_group()) 42 | # Reshape 43 | output_tensor = output_tensor.movedim(0, dim) 44 | output_tensor = output_tensor.reshape(input_size[:dim] + 45 | (world_size * input_size[dim], ) + 46 | input_size[dim + 1:]) 47 | return output_tensor 48 | -------------------------------------------------------------------------------- /vllm/model_executor/parallel_utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The vLLM team. 2 | # Adapted from 3 | # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/tensor_parallel/utils.py 4 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 5 | from typing import Sequence 6 | 7 | import torch 8 | 9 | 10 | def ensure_divisibility(numerator, denominator): 11 | """Ensure that numerator is divisible by the denominator.""" 12 | assert numerator % denominator == 0, "{} is not divisible by {}".format( 13 | numerator, denominator) 14 | 15 | 16 | def divide(numerator, denominator): 17 | """Ensure that numerator is divisible by the denominator and return 18 | the division value.""" 19 | ensure_divisibility(numerator, denominator) 20 | return numerator // denominator 21 | 22 | 23 | def split_tensor_along_last_dim( 24 | tensor: torch.Tensor, 25 | num_partitions: int, 26 | contiguous_split_chunks: bool = False, 27 | ) -> Sequence[torch.Tensor]: 28 | """ Split a tensor along its last dimension. 29 | 30 | Arguments: 31 | tensor: input tensor. 32 | num_partitions: number of partitions to split the tensor 33 | contiguous_split_chunks: If True, make each chunk contiguous 34 | in memory. 35 | 36 | Returns: 37 | A list of Tensors 38 | """ 39 | # Get the size and dimension. 40 | last_dim = tensor.dim() - 1 41 | last_dim_size = divide(tensor.size()[last_dim], num_partitions) 42 | # Split. 43 | tensor_list = torch.split(tensor, last_dim_size, dim=last_dim) 44 | # NOTE: torch.split does not create contiguous tensors by default. 
45 | if contiguous_split_chunks: 46 | return tuple(chunk.contiguous() for chunk in tensor_list) 47 | 48 | return tensor_list 49 | -------------------------------------------------------------------------------- /vllm/model_executor/utils.py: -------------------------------------------------------------------------------- 1 | """Utils for model executor.""" 2 | import random 3 | from typing import Any, Dict, Optional 4 | 5 | import numpy as np 6 | import torch 7 | 8 | 9 | def set_random_seed(seed: int) -> None: 10 | random.seed(seed) 11 | np.random.seed(seed) 12 | torch.manual_seed(seed) 13 | if torch.cuda.is_available(): 14 | torch.cuda.manual_seed_all(seed) 15 | 16 | 17 | def set_weight_attrs( 18 | weight: torch.Tensor, 19 | weight_attrs: Optional[Dict[str, Any]], 20 | ): 21 | """Set attributes on a weight tensor. 22 | 23 | This method is used to set attributes on a weight tensor. This method 24 | will not overwrite existing attributes. 25 | 26 | Args: 27 | weight: The weight tensor. 28 | weight_attrs: A dictionary of attributes to set on the weight tensor. 29 | """ 30 | if weight_attrs is None: 31 | return 32 | for key, value in weight_attrs.items(): 33 | assert not hasattr( 34 | weight, key), (f"Overwriting existing tensor attribute: {key}") 35 | setattr(weight, key, value) 36 | -------------------------------------------------------------------------------- /vllm/outputs.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | 3 | from vllm.sequence import (PromptLogprobs, SampleLogprobs, SequenceGroup, 4 | SequenceStatus) 5 | 6 | 7 | class CompletionOutput: 8 | """The output data of one completion output of a request. 9 | 10 | Args: 11 | index: The index of the output in the request. 12 | text: The generated output text. 13 | token_ids: The token IDs of the generated output text. 14 | cumulative_logprob: The cumulative log probability of the generated 15 | output text. 16 | logprobs: The log probabilities of the top probability words at each 17 | position if the logprobs are requested. 18 | finish_reason: The reason why the sequence is finished. 19 | """ 20 | 21 | def __init__( 22 | self, 23 | index: int, 24 | text: str, 25 | token_ids: List[int], 26 | cumulative_logprob: float, 27 | logprobs: Optional[SampleLogprobs], 28 | finish_reason: Optional[str] = None, 29 | ) -> None: 30 | self.index = index 31 | self.text = text 32 | self.token_ids = token_ids 33 | self.cumulative_logprob = cumulative_logprob 34 | self.logprobs = logprobs 35 | self.finish_reason = finish_reason 36 | 37 | def finished(self) -> bool: 38 | return self.finish_reason is not None 39 | 40 | def __repr__(self) -> str: 41 | return (f"CompletionOutput(index={self.index}, " 42 | f"text={self.text!r}, " 43 | f"token_ids={self.token_ids}, " 44 | f"cumulative_logprob={self.cumulative_logprob}, " 45 | f"logprobs={self.logprobs}, " 46 | f"finish_reason={self.finish_reason})") 47 | 48 | 49 | class RequestOutput: 50 | """The output data of a request to the LLM. 51 | 52 | Args: 53 | request_id: The unique ID of the request. 54 | prompt: The prompt string of the request. 55 | prompt_token_ids: The token IDs of the prompt. 56 | prompt_logprobs: The log probabilities to return per prompt token. 57 | outputs: The output sequences of the request. 58 | finished: Whether the whole request is finished. 
59 | """ 60 | 61 | def __init__( 62 | self, 63 | request_id: str, 64 | prompt: str, 65 | prompt_token_ids: List[int], 66 | prompt_logprobs: Optional[PromptLogprobs], 67 | outputs: List[CompletionOutput], 68 | finished: bool, 69 | ) -> None: 70 | self.request_id = request_id 71 | self.prompt = prompt 72 | self.prompt_token_ids = prompt_token_ids 73 | self.prompt_logprobs = prompt_logprobs 74 | self.outputs = outputs 75 | self.finished = finished 76 | 77 | @classmethod 78 | def from_seq_group(cls, seq_group: SequenceGroup) -> "RequestOutput": 79 | # Get the top-n sequences. 80 | n = seq_group.sampling_params.n 81 | seqs = seq_group.get_seqs() 82 | if seq_group.sampling_params.use_beam_search: 83 | sorting_key = lambda seq: seq.get_beam_search_score( 84 | seq_group.sampling_params.length_penalty) 85 | else: 86 | sorting_key = lambda seq: seq.get_cumulative_logprob() 87 | sorted_seqs = sorted(seqs, key=sorting_key, reverse=True) 88 | top_n_seqs = sorted_seqs[:n] 89 | 90 | # Create the outputs. 91 | outputs: List[CompletionOutput] = [] 92 | for seq in top_n_seqs: 93 | logprobs = seq.output_logprobs 94 | if seq_group.sampling_params.logprobs is None: 95 | # NOTE: We need to take care of this case because the sequence 96 | # always has the logprobs of the sampled tokens even if the 97 | # logprobs are not requested. 98 | logprobs = None 99 | finshed_reason = SequenceStatus.get_finished_reason(seq.status) 100 | output = CompletionOutput(seqs.index(seq), seq.output_text, 101 | seq.get_output_token_ids(), 102 | seq.get_cumulative_logprob(), logprobs, 103 | finshed_reason) 104 | outputs.append(output) 105 | 106 | # Every sequence in the sequence group should have the same prompt. 107 | prompt = seq_group.prompt 108 | prompt_token_ids = seq_group.prompt_token_ids 109 | prompt_logprobs = seq_group.prompt_logprobs 110 | finished = seq_group.is_finished() 111 | return cls(seq_group.request_id, prompt, prompt_token_ids, 112 | prompt_logprobs, outputs, finished) 113 | 114 | def __repr__(self) -> str: 115 | return (f"RequestOutput(request_id={self.request_id}, " 116 | f"prompt={self.prompt!r}, " 117 | f"prompt_token_ids={self.prompt_token_ids}, " 118 | f"prompt_logprobs={self.prompt_logprobs}, " 119 | f"outputs={self.outputs}, " 120 | f"finished={self.finished})") 121 | -------------------------------------------------------------------------------- /vllm/py.typed: -------------------------------------------------------------------------------- 1 | # Marker file for PEP 561. 2 | # The vllm package uses inline types. 
3 | -------------------------------------------------------------------------------- /vllm/transformers_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/transformers_utils/__init__.py -------------------------------------------------------------------------------- /vllm/transformers_utils/config.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from transformers import AutoConfig, PretrainedConfig 4 | 5 | from vllm.transformers_utils.configs import * 6 | 7 | _CONFIG_REGISTRY = { 8 | "aquila": AquilaConfig, 9 | "baichuan": BaiChuanConfig, 10 | "chatglm": ChatGLMConfig, 11 | "mpt": MPTConfig, 12 | "qwen": QWenConfig, 13 | "RefinedWeb": RWConfig, # For tiiuae/falcon-40b(-instruct) 14 | "RefinedWebModel": RWConfig, # For tiiuae/falcon-7b(-instruct) 15 | "yi": YiConfig, 16 | } 17 | 18 | 19 | def get_config(model: str, 20 | trust_remote_code: bool, 21 | revision: Optional[str] = None) -> PretrainedConfig: 22 | try: 23 | config = AutoConfig.from_pretrained( 24 | model, trust_remote_code=trust_remote_code, revision=revision) 25 | except ValueError as e: 26 | if (not trust_remote_code and 27 | "requires you to execute the configuration file" in str(e)): 28 | err_msg = ( 29 | "Failed to load the model config. If the model is a custom " 30 | "model not yet available in the HuggingFace transformers " 31 | "library, consider setting `trust_remote_code=True` in LLM " 32 | "or using the `--trust-remote-code` flag in the CLI.") 33 | raise RuntimeError(err_msg) from e 34 | else: 35 | raise e 36 | if config.model_type in _CONFIG_REGISTRY: 37 | config_class = _CONFIG_REGISTRY[config.model_type] 38 | config = config_class.from_pretrained(model, revision=revision) 39 | return config 40 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/__init__.py: -------------------------------------------------------------------------------- 1 | from vllm.transformers_utils.configs.aquila import AquilaConfig 2 | from vllm.transformers_utils.configs.baichuan import BaiChuanConfig 3 | from vllm.transformers_utils.configs.chatglm import ChatGLMConfig 4 | from vllm.transformers_utils.configs.mpt import MPTConfig 5 | from vllm.transformers_utils.configs.qwen import QWenConfig 6 | # RWConfig is for the original tiiuae/falcon-40b(-instruct) and 7 | # tiiuae/falcon-7b(-instruct) models. Newer Falcon models will use the 8 | # `FalconConfig` class from the official HuggingFace transformers library. 9 | from vllm.transformers_utils.configs.falcon import RWConfig 10 | from vllm.transformers_utils.configs.yi import YiConfig 11 | 12 | __all__ = [ 13 | "AquilaConfig", 14 | "BaiChuanConfig", 15 | "ChatGLMConfig", 16 | "MPTConfig", 17 | "QWenConfig", 18 | "RWConfig", 19 | "YiConfig", 20 | ] 21 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/aquila.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. 
It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 20 | """ Aquila model configuration""" 21 | 22 | from transformers import PretrainedConfig 23 | 24 | 25 | class AquilaConfig(PretrainedConfig): 26 | model_type = "aquila" 27 | keys_to_ignore_at_inference = ["past_key_values"] 28 | 29 | def __init__( 30 | self, 31 | vocab_size=100008, 32 | hidden_size=4096, 33 | intermediate_size=11008, 34 | num_hidden_layers=32, 35 | num_attention_heads=32, 36 | num_key_value_heads=None, 37 | hidden_act="silu", 38 | max_position_embeddings=2048, 39 | initializer_range=0.006, 40 | rms_norm_eps=1e-5, 41 | use_cache=True, 42 | pad_token_id=0, 43 | bos_token_id=1, 44 | eos_token_id=2, 45 | tie_word_embeddings=False, 46 | **kwargs, 47 | ): 48 | self.vocab_size = vocab_size 49 | self.max_position_embeddings = max_position_embeddings 50 | self.hidden_size = hidden_size 51 | self.intermediate_size = intermediate_size 52 | self.num_hidden_layers = num_hidden_layers 53 | # for backward compatibility 54 | if num_key_value_heads is None: 55 | num_key_value_heads = num_attention_heads 56 | 57 | self.num_key_value_heads = num_key_value_heads 58 | self.num_attention_heads = num_attention_heads 59 | self.hidden_act = hidden_act 60 | self.initializer_range = initializer_range 61 | self.rms_norm_eps = rms_norm_eps 62 | self.use_cache = use_cache 63 | super().__init__( 64 | pad_token_id=pad_token_id, 65 | bos_token_id=bos_token_id, 66 | eos_token_id=eos_token_id, 67 | tie_word_embeddings=tie_word_embeddings, 68 | **kwargs, 69 | ) 70 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/baichuan.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. 3 | # 4 | # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX 5 | # and OPT implementations in this library. It has been modified from its 6 | # original forms to accommodate minor architectural differences compared 7 | # to GPT-NeoX and OPT used by the Meta AI team that trained the model. 8 | # 9 | # Licensed under the Apache License, Version 2.0 (the "License"); 10 | # you may not use this file except in compliance with the License. 11 | # You may obtain a copy of the License at 12 | # 13 | # http://www.apache.org/licenses/LICENSE-2.0 14 | # 15 | # Unless required by applicable law or agreed to in writing, software 16 | # distributed under the License is distributed on an "AS IS" BASIS, 17 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 18 | # See the License for the specific language governing permissions and 19 | # limitations under the License. 
20 | 21 | from transformers.configuration_utils import PretrainedConfig 22 | 23 | 24 | class BaiChuanConfig(PretrainedConfig): 25 | model_type = "baichuan" 26 | keys_to_ignore_at_inference = ["past_key_values"] 27 | 28 | def __init__( 29 | self, 30 | vocab_size=64000, 31 | hidden_size=4096, 32 | intermediate_size=11008, 33 | num_hidden_layers=32, 34 | num_attention_heads=32, 35 | hidden_act="silu", 36 | max_position_embeddings=4096, 37 | initializer_range=0.02, 38 | rms_norm_eps=1e-6, 39 | use_cache=True, 40 | pad_token_id=0, 41 | bos_token_id=1, 42 | eos_token_id=2, 43 | tie_word_embeddings=False, 44 | **kwargs, 45 | ): 46 | self.vocab_size = vocab_size 47 | self.max_position_embeddings = max_position_embeddings 48 | self.hidden_size = hidden_size 49 | self.intermediate_size = intermediate_size 50 | self.num_hidden_layers = num_hidden_layers 51 | self.num_attention_heads = num_attention_heads 52 | self.hidden_act = hidden_act 53 | self.initializer_range = initializer_range 54 | self.rms_norm_eps = rms_norm_eps 55 | self.use_cache = use_cache 56 | super().__init__( 57 | pad_token_id=pad_token_id, 58 | bos_token_id=bos_token_id, 59 | eos_token_id=eos_token_id, 60 | tie_word_embeddings=tie_word_embeddings, 61 | **kwargs, 62 | ) 63 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/chatglm.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Adapted from 3 | # https://github.com/THUDM/ChatGLM2-6B 4 | from transformers import PretrainedConfig 5 | 6 | 7 | class ChatGLMConfig(PretrainedConfig): 8 | model_type = "chatglm" 9 | attribute_map = { 10 | "num_hidden_layers": "num_layers", 11 | "n_head_kv": "multi_query_group_num", 12 | } 13 | 14 | def __init__(self, 15 | num_layers=28, 16 | padded_vocab_size=65024, 17 | hidden_size=4096, 18 | ffn_hidden_size=13696, 19 | kv_channels=128, 20 | num_attention_heads=32, 21 | seq_length=2048, 22 | hidden_dropout=0.0, 23 | attention_dropout=0.0, 24 | layernorm_epsilon=1e-5, 25 | rmsnorm=True, 26 | apply_residual_connection_post_layernorm=False, 27 | post_layer_norm=True, 28 | add_bias_linear=False, 29 | add_qkv_bias=False, 30 | interleaved_qkv=False, 31 | bias_dropout_fusion=True, 32 | multi_query_attention=False, 33 | multi_query_group_num=1, 34 | apply_query_key_layer_scaling=True, 35 | attention_softmax_in_fp32=True, 36 | fp32_residual_connection=False, 37 | quantization_bit=0, 38 | pre_seq_len=None, 39 | prefix_projection=False, 40 | **kwargs): 41 | self.num_layers = num_layers 42 | self.vocab_size = padded_vocab_size 43 | self.padded_vocab_size = padded_vocab_size 44 | self.hidden_size = hidden_size 45 | self.ffn_hidden_size = ffn_hidden_size 46 | self.kv_channels = kv_channels 47 | self.num_attention_heads = num_attention_heads 48 | self.seq_length = seq_length 49 | self.hidden_dropout = hidden_dropout 50 | self.attention_dropout = attention_dropout 51 | self.layernorm_epsilon = layernorm_epsilon 52 | self.rmsnorm = rmsnorm 53 | self.apply_residual_connection_post_layernorm = ( 54 | apply_residual_connection_post_layernorm) 55 | self.post_layer_norm = post_layer_norm 56 | self.add_bias_linear = add_bias_linear 57 | self.add_qkv_bias = add_qkv_bias 58 | self.bias_dropout_fusion = bias_dropout_fusion 59 | self.multi_query_attention = multi_query_attention 60 | self.multi_query_group_num = multi_query_group_num 61 | self.apply_query_key_layer_scaling = apply_query_key_layer_scaling 62 | self.attention_softmax_in_fp32 = 
attention_softmax_in_fp32 63 | self.fp32_residual_connection = fp32_residual_connection 64 | self.quantization_bit = quantization_bit 65 | self.pre_seq_len = pre_seq_len 66 | self.prefix_projection = prefix_projection 67 | self.interleaved_qkv = interleaved_qkv 68 | super().__init__(**kwargs) 69 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/falcon.py: -------------------------------------------------------------------------------- 1 | # Adapted from 2 | # https://huggingface.co/tiiuae/falcon-7b/blob/main/configuration_RW.py 3 | # Copyright 2023 The vLLM team. 4 | # Copyright 2022 the Big Science Workshop and HuggingFace Inc. team. 5 | # All rights reserved. 6 | # 7 | # Licensed under the Apache License, Version 2.0 (the "License"); 8 | # you may not use this file except in compliance with the License. 9 | # You may obtain a copy of the License at 10 | # 11 | # http://www.apache.org/licenses/LICENSE-2.0 12 | # 13 | # Unless required by applicable law or agreed to in writing, software 14 | # distributed under the License is distributed on an "AS IS" BASIS, 15 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 16 | # See the License for the specific language governing permissions and 17 | # limitations under the License. 18 | """Falcon configuration""" 19 | from transformers.configuration_utils import PretrainedConfig 20 | 21 | 22 | class RWConfig(PretrainedConfig): 23 | model_type = "falcon" 24 | keys_to_ignore_at_inference = ["past_key_values"] 25 | attribute_map = { 26 | "num_hidden_layers": "n_layer", 27 | "num_attention_heads": "n_head", 28 | "num_kv_heads": "n_head_kv", 29 | } 30 | 31 | def __init__( 32 | self, 33 | vocab_size=250880, 34 | hidden_size=64, 35 | n_layer=2, 36 | n_head=8, 37 | layer_norm_epsilon=1e-5, 38 | initializer_range=0.02, 39 | use_cache=True, 40 | bos_token_id=1, 41 | eos_token_id=2, 42 | hidden_dropout=0.0, 43 | attention_dropout=0.0, 44 | multi_query=True, 45 | n_head_kv=None, 46 | alibi=False, 47 | bias=False, 48 | parallel_attn=False, 49 | new_decoder_architecture=False, 50 | **kwargs, 51 | ) -> None: 52 | self.vocab_size = vocab_size 53 | # Backward compatibility with n_embed kwarg 54 | n_embed = kwargs.pop("n_embed", None) 55 | self.hidden_size = hidden_size if n_embed is None else n_embed 56 | self.n_layer = n_layer 57 | self.n_head = n_head 58 | self.layer_norm_epsilon = layer_norm_epsilon 59 | self.initializer_range = initializer_range 60 | self.use_cache = use_cache 61 | self.hidden_dropout = hidden_dropout 62 | self.attention_dropout = attention_dropout 63 | 64 | self.bos_token_id = bos_token_id 65 | self.eos_token_id = eos_token_id 66 | self.multi_query = multi_query 67 | self.n_head_kv = 1 if n_head_kv is None else n_head_kv 68 | self.alibi = alibi 69 | self.bias = bias 70 | self.parallel_attn = parallel_attn 71 | self.new_decoder_architecture = new_decoder_architecture 72 | 73 | if self.hidden_size == 8192: 74 | # Hack for falcon-40b 75 | self.new_decoder_architecture = True 76 | 77 | super().__init__(bos_token_id=bos_token_id, 78 | eos_token_id=eos_token_id, 79 | **kwargs) 80 | 81 | @property 82 | def head_dim(self): 83 | return self.hidden_size // self.n_head 84 | 85 | @property 86 | def rotary(self): 87 | return not self.alibi 88 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/qwen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 
Alibaba Cloud. 2 | # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE 3 | 4 | from transformers import PretrainedConfig 5 | 6 | 7 | class QWenConfig(PretrainedConfig): 8 | model_type = "qwen" 9 | keys_to_ignore_at_inference = ["past_key_values"] 10 | 11 | def __init__( 12 | self, 13 | vocab_size=151936, 14 | hidden_size=4096, 15 | num_hidden_layers=32, 16 | num_attention_heads=32, 17 | emb_dropout_prob=0.0, 18 | attn_dropout_prob=0.0, 19 | layer_norm_epsilon=1e-6, 20 | initializer_range=0.02, 21 | max_position_embeddings=8192, 22 | scale_attn_weights=True, 23 | use_cache=True, 24 | bf16=False, 25 | fp16=False, 26 | fp32=False, 27 | kv_channels=128, 28 | rotary_pct=1.0, 29 | rotary_emb_base=10000, 30 | use_dynamic_ntk=True, 31 | use_logn_attn=True, 32 | use_flash_attn="auto", 33 | intermediate_size=22016, 34 | no_bias=True, 35 | tie_word_embeddings=False, 36 | **kwargs, 37 | ): 38 | self.vocab_size = vocab_size 39 | self.hidden_size = hidden_size 40 | self.intermediate_size = intermediate_size 41 | self.num_hidden_layers = num_hidden_layers 42 | self.num_attention_heads = num_attention_heads 43 | self.emb_dropout_prob = emb_dropout_prob 44 | self.attn_dropout_prob = attn_dropout_prob 45 | self.layer_norm_epsilon = layer_norm_epsilon 46 | self.initializer_range = initializer_range 47 | self.scale_attn_weights = scale_attn_weights 48 | self.use_cache = use_cache 49 | self.max_position_embeddings = max_position_embeddings 50 | self.bf16 = bf16 51 | self.fp16 = fp16 52 | self.fp32 = fp32 53 | self.kv_channels = kv_channels 54 | self.rotary_pct = rotary_pct 55 | self.rotary_emb_base = rotary_emb_base 56 | self.use_dynamic_ntk = use_dynamic_ntk 57 | self.use_logn_attn = use_logn_attn 58 | self.use_flash_attn = use_flash_attn 59 | self.no_bias = no_bias 60 | super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) 61 | -------------------------------------------------------------------------------- /vllm/transformers_utils/configs/yi.py: -------------------------------------------------------------------------------- 1 | """ Yi model configuration""" 2 | from transformers.configuration_utils import PretrainedConfig 3 | from transformers.utils import logging 4 | 5 | logger = logging.get_logger(__name__) 6 | 7 | Yi_PRETRAINED_CONFIG_ARCHIVE_MAP = {} 8 | 9 | 10 | class YiConfig(PretrainedConfig): 11 | r""" 12 | Reference: 13 | https://huggingface.co/01-ai/Yi-6B/blob/main/configuration_yi.py 14 | """ 15 | model_type = "Yi" 16 | keys_to_ignore_at_inference = ["past_key_values"] 17 | 18 | def __init__( 19 | self, 20 | vocab_size=64000, 21 | hidden_size=4096, 22 | intermediate_size=11008, 23 | num_hidden_layers=32, 24 | num_attention_heads=32, 25 | num_key_value_heads=4, 26 | hidden_act="silu", 27 | max_position_embeddings=4096, 28 | initializer_range=0.02, 29 | rms_norm_eps=1e-5, 30 | use_cache=True, 31 | pad_token_id=0, 32 | bos_token_id=1, 33 | eos_token_id=2, 34 | tie_word_embeddings=False, 35 | output_attentions=False, 36 | rope_theta=5000000.0, 37 | **kwargs, 38 | ): 39 | self.vocab_size = vocab_size 40 | self.max_position_embeddings = max_position_embeddings 41 | self.hidden_size = hidden_size 42 | self.intermediate_size = intermediate_size 43 | self.num_hidden_layers = num_hidden_layers 44 | self.num_attention_heads = num_attention_heads 45 | 46 | # for backward compatibility 47 | if num_key_value_heads is None: 48 | num_key_value_heads = num_attention_heads 49 | 50 | self.num_key_value_heads = num_key_value_heads 51 | self.hidden_act = hidden_act 52 | 
self.initializer_range = initializer_range 53 | self.rms_norm_eps = rms_norm_eps 54 | self.use_cache = use_cache 55 | self.output_attentions = output_attentions 56 | self.rope_theta = rope_theta 57 | 58 | super().__init__( 59 | pad_token_id=pad_token_id, 60 | bos_token_id=bos_token_id, 61 | eos_token_id=eos_token_id, 62 | tie_word_embeddings=tie_word_embeddings, 63 | **kwargs, 64 | ) 65 | -------------------------------------------------------------------------------- /vllm/utils.py: -------------------------------------------------------------------------------- 1 | import enum 2 | import uuid 3 | from platform import uname 4 | 5 | import psutil 6 | import torch 7 | 8 | from vllm._C import cuda_utils 9 | 10 | 11 | class Device(enum.Enum): 12 | GPU = enum.auto() 13 | CPU = enum.auto() 14 | 15 | 16 | class Counter: 17 | 18 | def __init__(self, start: int = 0) -> None: 19 | self.counter = start 20 | 21 | def __next__(self) -> int: 22 | i = self.counter 23 | self.counter += 1 24 | return i 25 | 26 | def reset(self) -> None: 27 | self.counter = 0 28 | 29 | 30 | def get_max_shared_memory_bytes(gpu: int = 0) -> int: 31 | """Returns the maximum shared memory per thread block in bytes.""" 32 | # https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html 33 | cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 34 | max_shared_mem = cuda_utils.get_device_attribute( 35 | cudaDevAttrMaxSharedMemoryPerBlockOptin, gpu) 36 | return int(max_shared_mem) 37 | 38 | 39 | def get_gpu_memory(gpu: int = 0) -> int: 40 | """Returns the total memory of the GPU in bytes.""" 41 | return torch.cuda.get_device_properties(gpu).total_memory 42 | 43 | 44 | def get_cpu_memory() -> int: 45 | """Returns the total CPU memory of the node in bytes.""" 46 | return psutil.virtual_memory().total 47 | 48 | 49 | def random_uuid() -> str: 50 | return str(uuid.uuid4().hex) 51 | 52 | 53 | def in_wsl() -> bool: 54 | # Reference: https://github.com/microsoft/WSL/issues/4071 55 | return "microsoft" in " ".join(uname()).lower() 56 | -------------------------------------------------------------------------------- /vllm/worker/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/OthersideAI/vllm/4b498312ccc7b2804dfe26a4d902750e49937183/vllm/worker/__init__.py --------------------------------------------------------------------------------
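For reference, the RMSNorm docstring in vllm/model_executor/layers/layernorm.py above gives the formula x -> w * x / sqrt(E[x^2] + eps). The sketch below restates that math in plain PyTorch as an illustrative, unoptimized equivalent of what the custom ops.rms_norm kernel is expected to compute; the function name rms_norm_reference and the example tensors are assumptions for illustration, not code from the repository.

import torch

def rms_norm_reference(x: torch.Tensor,
                       weight: torch.Tensor,
                       eps: float = 1e-6) -> torch.Tensor:
    # E[x^2] taken over the hidden (last) dimension.
    variance = x.pow(2).mean(dim=-1, keepdim=True)
    # w * x / sqrt(E[x^2] + eps), matching the docstring in layernorm.py.
    return weight * (x * torch.rsqrt(variance + eps))

# Example: normalize a small batch of hidden states with a unit weight vector.
hidden = torch.randn(2, 8)
weight = torch.ones(8)
out = rms_norm_reference(hidden, weight)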