├── .github ├── ISSUE_TEMPLATE │ ├── blank_issue.md │ ├── bug_request.md │ ├── config.yml │ └── feature_request.md ├── changelog-config.json └── workflows │ ├── docker-build.yml │ ├── format.yml │ ├── release.yml │ └── unit_test.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── Makefile ├── OWNERS ├── OWNERS_ALIASES ├── README.md ├── RELEASE.md ├── SECURITY.md ├── SECURITY_CONTACTS ├── code-of-conduct.md ├── config.yml ├── deploy ├── README.md └── manifests.yaml ├── docs ├── design.md └── images │ └── design.png ├── examples └── vllm │ ├── config-random.yml │ ├── config-shared-prefix.yml │ ├── config-synthetic.yml │ ├── config.yml │ └── vllm_server.ipynb ├── inference_perf ├── __init__.py ├── apis │ ├── __init__.py │ ├── base.py │ ├── chat.py │ └── completion.py ├── client │ ├── filestorage │ │ ├── __init__.py │ │ ├── base.py │ │ └── gcs.py │ ├── metricsclient │ │ ├── README.md │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mock_client.py │ │ └── prometheus_client.py │ ├── modelserver │ │ ├── __init__.py │ │ ├── base.py │ │ ├── mock_client.py │ │ └── vllm_client.py │ └── requestdatacollector │ │ ├── __init__.py │ │ ├── base.py │ │ └── local.py ├── config.py ├── datagen │ ├── __init__.py │ ├── base.py │ ├── hf_sharegpt_datagen.py │ ├── mock_datagen.py │ ├── random_datagen.py │ ├── shared_prefix_datagen.py │ └── synthetic_datagen.py ├── loadgen │ ├── __init__.py │ ├── load_generator.py │ └── load_timer.py ├── main.py ├── reportgen │ ├── __init__.py │ └── base.py └── utils │ ├── __init__.py │ ├── custom_tokenizer.py │ ├── distribution.py │ └── report_file.py ├── pdm.lock ├── pyproject.toml ├── requirements.txt └── tests ├── apis ├── test_chat.py └── test_completion.py └── test_config.py /.github/ISSUE_TEMPLATE/blank_issue.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Blank Issue 3 | about: Create a new issue from scratch 4 | title: '' 5 | labels: needs-triage 6 | assignees: '' 7 | 8 | --- -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug Report 3 | about: Report a bug you encountered 4 | title: '' 5 | labels: kind/bug, needs-triage 6 | assignees: '' 7 | 8 | --- 9 | 10 | 13 | 14 | **What happened**: 15 | 16 | **What you expected to happen**: 17 | 18 | **How to reproduce it (as minimally and precisely as possible)**: 19 | 20 | **Anything else we need to know?**: 21 | 22 | **Environment**: 23 | - inference-perf version: 24 | - config.yml (entire one printed by the benchmark run): 25 | - cloud provider or hardware configuration: 26 | - others: -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: needs-triage 6 | assignees: '' 7 | 8 | --- 9 | 10 | 11 | 12 | **What would you like to be added**: 13 | 14 | **Why is this needed**: -------------------------------------------------------------------------------- /.github/changelog-config.json: -------------------------------------------------------------------------------- 1 | { 2 | "categories": [ 3 | { 4 | "title": "🚀 Features", 5 | "labels": ["feature", "enhancement"] 6 | }, 7 | { 8 | "title": "🐛 Bug Fixes", 9 | "labels": ["bug", "fix"] 10 | }, 11 | { 12 | "title": "📚 Documentation", 13 | "labels": ["documentation", "docs"] 14 | }, 15 | { 16 | "title": "⚡️ Performance", 17 | "labels": ["performance", "perf"] 18 | }, 19 | { 20 | "title": "🔧 Dependencies", 21 | "labels": ["dependencies", "deps"] 22 | } 23 | ], 24 | "template": "${{CHANGELOG}}\n\n## Docker Image\n\n${{DOCKER_IMAGE}}\n\n## Contributors\n\n${{CONTRIBUTORS}}", 25 | "pr-template": "- ${{TITLE}} (#${{NUMBER}})" 26 | } -------------------------------------------------------------------------------- /.github/workflows/docker-build.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Push 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | docker: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - name: Checkout code 13 | uses: actions/checkout@v4 14 | 15 | - name: Set up Docker Buildx 16 | uses: docker/setup-buildx-action@v3 17 | 18 | - name: Login to Quay.io 19 | uses: docker/login-action@v3 20 | with: 21 | registry: quay.io 22 | username: ${{ secrets.QUAY_USERNAME }} 23 | password: ${{ secrets.QUAY_PASSWORD }} 24 | 25 | - name: Extract metadata (tags, labels) for Docker 26 | id: meta 27 | uses: docker/metadata-action@v5 28 | with: 29 | images: quay.io/inference-perf/inference-perf 30 | tags: | 31 | type=ref,event=branch 32 | type=sha,format=short 33 | type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }} 34 | 35 | - name: Build and push Docker image 36 | uses: docker/build-push-action@v5 37 | with: 38 | context: . 39 | platforms: linux/amd64 40 | push: true 41 | tags: ${{ steps.meta.outputs.tags }} 42 | labels: ${{ steps.meta.outputs.labels }} 43 | cache-from: type=gha 44 | cache-to: type=gha,mode=max -------------------------------------------------------------------------------- /.github/workflows/format.yml: -------------------------------------------------------------------------------- 1 | name: Python Linting and Type Checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - 'feature/**' 8 | pull_request: 9 | 10 | jobs: 11 | format-check: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout Code 15 | uses: actions/checkout@v4 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: '3.13' 20 | - name: Do Linting and Type Checks 21 | run: | 22 | make validate 23 | -------------------------------------------------------------------------------- /.github/workflows/release.yml: -------------------------------------------------------------------------------- 1 | name: Release Processing 2 | on: 3 | push: 4 | tags: 5 | - 'v*.*.*' # Matches semantic versioning tags like v1.0.0 6 | permissions: 7 | contents: write 8 | pull-requests: read 9 | jobs: 10 | build-and-publish: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 # Get full history for changelog 16 | 17 | - name: Set env variable for tag name 18 | run: echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV 19 | 20 | - name: Generate changelog 21 | id: github_release 22 | uses: mikepenz/release-changelog-builder-action@v3 23 | with: 24 | configuration: ".github/changelog-config.json" 25 | ignorePreReleases: false 26 | env: 27 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 28 | 29 | - name: Create GitHub Release 30 | uses: softprops/action-gh-release@v1 31 | with: 32 | name: Release ${{ env.RELEASE_VERSION }} 33 | body: | 34 | # Release ${{ env.RELEASE_VERSION }} 35 | 36 | ## What's Changed 37 | ${{ steps.github_release.outputs.changelog }} 38 | env: 39 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 40 | 41 | docker: 42 | needs: build-and-publish # Run after the release is created 43 | runs-on: ubuntu-latest 44 | steps: 45 | - uses: actions/checkout@v4 46 | 47 | - name: Set up Docker Buildx 48 | uses: docker/setup-buildx-action@v3 49 | 50 | - name: Login to Quay.io 51 | uses: docker/login-action@v3 52 | with: 53 | registry: quay.io 54 | username: ${{ secrets.QUAY_USERNAME }} 55 | password: ${{ secrets.QUAY_PASSWORD }} 56 | 57 | - name: Extract metadata (tags, labels) for Docker 58 | id: meta 59 | uses: docker/metadata-action@v5 60 | with: 61 | images: quay.io/${{ secrets.QUAY_USERNAME }}/inference-perf 62 | tags: | 63 | type=raw,value=${{ github.ref_name }},enable=true 64 | type=raw,value=latest,enable=true 65 | 66 | - name: Build and push Docker image 67 | uses: docker/build-push-action@v5 68 | with: 69 | context: . 70 | platforms: linux/amd64 71 | push: true 72 | tags: ${{ steps.meta.outputs.tags }} 73 | labels: ${{ steps.meta.outputs.labels }} 74 | cache-from: type=gha 75 | cache-to: type=gha,mode=max -------------------------------------------------------------------------------- /.github/workflows/unit_test.yml: -------------------------------------------------------------------------------- 1 | name: Unit Tests 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | - 'feature/**' 8 | pull_request: 9 | 10 | jobs: 11 | format-check: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - name: Checkout Code 15 | uses: actions/checkout@v4 16 | - name: Set up Python 17 | uses: actions/setup-python@v5 18 | with: 19 | python-version: '3.13' 20 | - name: Do Linting and Type Checks 21 | run: | 22 | make test 23 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | .python-version 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc 175 | 176 | # Test Reports 177 | *.json -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | repos: 2 | - repo: https://github.com/pdm-project/pdm 3 | rev: 2.22.3 4 | hooks: 5 | - id: pdm-lock-check 6 | name: check lock file matches pyproject 7 | - repo: https://github.com/astral-sh/ruff-pre-commit 8 | rev: v0.9.4 9 | hooks: 10 | - id: ruff 11 | name: run the linter 12 | args: [ --fix ] 13 | - id: ruff-format 14 | name: run the formatter 15 | - repo: https://github.com/pre-commit/mirrors-mypy 16 | rev: v1.14.1 17 | hooks: 18 | - id: mypy 19 | name: run static type check 20 | args: [--strict, --ignore-missing-imports] 21 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Welcome to Kubernetes. We are excited about the prospect of you joining our [community](https://git.k8s.io/community)! The Kubernetes community abides by the CNCF [code of conduct](code-of-conduct.md). Here is an excerpt: 4 | 5 | _As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities._ 6 | 7 | ## Getting Started 8 | 9 | We have full documentation on how to get started contributing here: 10 | 11 | 14 | 15 | - [Contributor License Agreement](https://git.k8s.io/community/CLA.md) - Kubernetes projects require that you sign a Contributor License Agreement (CLA) before we can accept your pull requests 16 | - [Kubernetes Contributor Guide](https://k8s.dev/guide) - Main contributor documentation, or you can just jump directly to the [contributing page](https://k8s.dev/docs/guide/contributing/) 17 | - [Contributor Cheat Sheet](https://k8s.dev/cheatsheet) - Common resources for existing developers 18 | 19 | ## Mentorship 20 | 21 | - [Mentoring Initiatives](https://k8s.dev/community/mentoring) - We have a diverse set of mentorship programs available that are always looking for volunteers! 22 | 23 | ## Contact Information 24 | 25 | - [Slack](https://kubernetes.slack.com/messages/sig-scalability) 26 | - [Mailing List](https://groups.google.com/forum/#!forum/kubernetes-sig-scale) 27 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12.9-slim-bookworm as dev 2 | 3 | RUN apt-get update -y \ 4 | && apt-get install -y python3-pip 5 | 6 | # Upgrade pip 7 | RUN pip3 install --upgrade pip 8 | 9 | # Set working directory 10 | WORKDIR /workspace 11 | 12 | # Copy project files 13 | COPY . /workspace 14 | 15 | # Install dependencies 16 | RUN pip install -e . 17 | 18 | # Run inference-perf 19 | CMD ["inference-perf", "--config_file", "config.yml"] 20 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | VENV := .venv 2 | 3 | # Format Python code with ruff format 4 | .PHONY: format 5 | format: 6 | @echo "Formatting Python files with ruff format..." 7 | $(VENV)/bin/ruff format 8 | 9 | # Run ruff check to lint Python code in the whole repository 10 | .PHONY: lint 11 | lint: 12 | @echo "Linting Python files with ruff check..." 13 | $(VENV)/bin/ruff check 14 | 15 | # Perform type checking 16 | .PHONY: type-check 17 | type-check: 18 | @echo "Running type checking with mypy..." 19 | $(VENV)/bin/mypy --strict ./inference_perf ./tests 20 | 21 | # Check for and install dependencies 22 | .PHONY: all-deps 23 | all-deps: install-deps install-dev-deps 24 | 25 | .PHONY: 26 | install-deps: 27 | @echo "Creating virtual environment if it doesn't exist..." 28 | @if [ ! -d $(VENV) ]; then \ 29 | python3 -m venv $(VENV); \ 30 | fi 31 | @echo "Activating virtual environment and installing dependencies..." 32 | $(VENV)/bin/pip install --upgrade pip 33 | $(VENV)/bin/pip install -e . 34 | 35 | .PHONY: 36 | install-dev-deps: install-deps 37 | @echo "Installing development dependencies..." 38 | $(VENV)/bin/pip install -e .[dev] 39 | 40 | .PHONY: unit-test 41 | unit-test: 42 | $(VENV)/bin/pytest 43 | 44 | .PHONY: validate 45 | validate: install-dev-deps format lint type-check 46 | 47 | .PHONY: test 48 | test: install-dev-deps unit-test 49 | 50 | -------------------------------------------------------------------------------- /OWNERS: -------------------------------------------------------------------------------- 1 | # See the OWNERS docs at https://go.k8s.io/owners 2 | 3 | approvers: 4 | - inference-perf-maintainers 5 | - wg-serving-leads 6 | -------------------------------------------------------------------------------- /OWNERS_ALIASES: -------------------------------------------------------------------------------- 1 | # See the OWNERS docs: https://git.k8s.io/community/contributors/guide/owners.md 2 | # This file should be kept in sync with k/org. 3 | 4 | aliases: 5 | inference-perf-maintainers: 6 | - achandrasekar 7 | - wangchen615 8 | - SachinVarghese 9 | 10 | wg-serving-leads: 11 | - ArangoGutierrez 12 | - Jeffwan 13 | - SergeyKanzhelev 14 | - terrytangyuan 15 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Inference Perf 2 | 3 | The Inference Perf project aims to provide GenAI inference performance benchmarking tool. It came out of [wg-serving](https://github.com/kubernetes/community/tree/master/wg-serving) and is sponsored by [SIG Scalability](https://github.com/kubernetes/community/blob/master/sig-scalability/README.md#inference-perf). See the [proposal](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf) for more info. 4 | 5 | ## Status 6 | 7 | This project is currently in development. 8 | 9 | ## Getting Started 10 | 11 | ### Configuration 12 | 13 | You can configure inference-perf to run with different data generation and load generation configurations today. Please see `config.yml` and examples in `/examples`. 14 | 15 | Supported datasets include the following: 16 | - ShareGPT (for a real world conversational dataset) 17 | - Synthetic (for specific input / output distributions) 18 | - Mock (for testing) 19 | 20 | Similarly load generation can be configured to run with different request rates and durations. You can also run multiple stages with different request rates and durations within a single run. 21 | 22 | ### Run locally 23 | 24 | - Setup a virtual environment and install inference-perf 25 | 26 | ``` 27 | pip install . 28 | ``` 29 | 30 | - Run inference-perf CLI with a configuration file 31 | 32 | ``` 33 | inference-perf --config_file config.yml 34 | ``` 35 | 36 | - See more [examples](./examples/) 37 | 38 | ### Run in a Docker container 39 | 40 | - Build the container 41 | 42 | ``` 43 | docker build -t inference-perf . 44 | ``` 45 | 46 | - Run the container 47 | 48 | ``` 49 | docker run -it --rm -v $(pwd)/config.yml:/workspace/config.yml inference-perf 50 | 51 | ``` 52 | 53 | ### Run in Kubernetes cluster 54 | 55 | Refer to the [guide](./deploy/README.md) in `/deploy`. 56 | 57 | ## Contributing 58 | 59 | Our community meeting is weekly on Thursdays alternating betweem 09:00 and 11:30 PDT ([Zoom Link](https://zoom.us/j/9955436256?pwd=Z2FQWU1jeDZkVC9RRTN4TlZyZTBHZz09), [Meeting Notes](https://docs.google.com/document/d/15XSF8q4DShcXIiExDfyiXxAYQslCmOmO2ARSJErVTak/edit?usp=sharing), [Meeting Recordings](https://www.youtube.com/playlist?list=PL69nYSiGNLP30qNanabU75ayPK7OPNAAS)). 60 | 61 | We currently utilize the [#inference-perf](https://kubernetes.slack.com/?redir=%2Fmessages%2Finference-perf) channel in Kubernetes Slack workspace for communications. 62 | 63 | Contributions are welcomed, thanks for joining us! 64 | 65 | ### Code of conduct 66 | 67 | Participation in the Kubernetes community is governed by the [Kubernetes Code of Conduct](code-of-conduct.md). 68 | -------------------------------------------------------------------------------- /RELEASE.md: -------------------------------------------------------------------------------- 1 | # Release Process 2 | 3 | The Kubernetes Template Project is released on an as-needed basis. The process is as follows: 4 | 5 | 1. An issue is proposing a new release with a changelog since the last release 6 | 1. All [OWNERS](OWNERS) must LGTM this release 7 | 1. An OWNER runs `git tag -s $VERSION` and inserts the changelog and pushes the tag with `git push $VERSION` 8 | 1. The release issue is closed 9 | 1. An announcement email is sent to `dev@kubernetes.io` with the subject `[ANNOUNCE] kubernetes-template-project $VERSION is released` 10 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Security Announcements 4 | 5 | Join the [kubernetes-security-announce] group for security and vulnerability announcements. 6 | 7 | ## Reporting a Vulnerability 8 | 9 | Instructions for reporting a vulnerability can be found on the 10 | [Kubernetes Security and Disclosure Information] page. 11 | 12 | ## Supported Versions 13 | 14 | Information about supported Kubernetes versions can be found on the 15 | [Kubernetes version and version skew support policy] page on the Kubernetes website. 16 | 17 | [kubernetes-security-announce]: https://groups.google.com/forum/#!forum/kubernetes-security-announce 18 | [Kubernetes version and version skew support policy]: https://kubernetes.io/docs/setup/release/version-skew-policy/#supported-versions 19 | [Kubernetes Security and Disclosure Information]: https://kubernetes.io/docs/reference/issues-security/security/#report-a-vulnerability 20 | -------------------------------------------------------------------------------- /SECURITY_CONTACTS: -------------------------------------------------------------------------------- 1 | # Defined below are the security contacts for this repo. 2 | # 3 | # They are the contact point for the Security Response Committee to reach out 4 | # to for triaging and handling of incoming issues. 5 | # 6 | # The below names agree to abide by the 7 | # [Embargo Policy](https://git.k8s.io/security/private-distributors-list.md#embargo-policy) 8 | # and will be removed and replaced if they violate that agreement. 9 | # 10 | # DO NOT REPORT SECURITY VULNERABILITIES DIRECTLY TO THESE NAMES, FOLLOW THE 11 | # INSTRUCTIONS AT https://kubernetes.io/security/ 12 | 13 | ArangoGutierrez 14 | Jeffwan 15 | SergeyKanzhelev 16 | terrytangyuan 17 | -------------------------------------------------------------------------------- /code-of-conduct.md: -------------------------------------------------------------------------------- 1 | # Kubernetes Community Code of Conduct 2 | 3 | Please refer to our [Kubernetes Community Code of Conduct](https://git.k8s.io/community/code-of-conduct.md) 4 | -------------------------------------------------------------------------------- /config.yml: -------------------------------------------------------------------------------- 1 | load: 2 | type: constant 3 | stages: 4 | - rate: 1 5 | duration: 30 6 | api: chat 7 | server: 8 | type: vllm 9 | model_name: HuggingFaceTB/SmolLM2-135M-Instruct 10 | base_url: http://0.0.0.0:8000 11 | ignore_eos: true 12 | tokenizer: 13 | pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct 14 | data: 15 | type: shareGPT 16 | metrics: 17 | type: prometheus 18 | prometheus: 19 | url: http://localhost:9090 20 | scrape_interval: 15 21 | report: 22 | request_lifecycle: 23 | summary: true 24 | per_stage: true 25 | per_request: false 26 | prometheus: 27 | summary: true 28 | per_stage: false -------------------------------------------------------------------------------- /deploy/README.md: -------------------------------------------------------------------------------- 1 | ## Run `inference-perf` as a Job in a Kubernetes cluster 2 | 3 | This guide explains how to deploy `inference-perf` to a Kubernetes cluster as a job. 4 | 5 | > [!NOTE] 6 | > There is currently no support for persisting output reports, all outputs are currently printed to standard output, please refer to issue [#59](https://github.com/kubernetes-sigs/inference-perf/issues/59) 7 | 8 | ### Setup 9 | 10 | Since public container images are not actively being published, you'll need to build the `inference-perf` image yourself. Follow the [official guide](https://github.com/kubernetes-sigs/inference-perf?tab=readme-ov-file#run-in-a-docker-container) to build the container. 11 | 12 | Once built, push the image to your preferred container registry: 13 | - [Artifact Registry (Google Cloud)](https://cloud.google.com/artifact-registry/docs/docker/pushing-and-pulling) 14 | - [Docker Hub](https://docs.docker.com/get-started/introduction/build-and-push-first-image/) 15 | 16 | Take note of the image name once successfully pushed, replace `` in `manifests.yaml` with this image name. 17 | 18 | Running `inference-perf` requires an input file. This should be provided via a Kubernetes ConfigMap. Update the `config.yml` as needed then create the ConfigMap by running at the root of this repo: 19 | 20 | ```bash 21 | kubectl create configmap inference-perf-config --from-file=config.yml 22 | ``` 23 | 24 | ### Instructions 25 | 26 | Apply the job by running the following: 27 | ```bash 28 | kubectl apply -f manifests.yaml 29 | ``` 30 | 31 | ### Viewing Results 32 | 33 | Currently, inference-perf outputs benchmark results to standard output only. To view the results after the job completes, run: 34 | ```bash 35 | kubectl wait --for=condition=complete job/inference-perf && kubectl logs jobs/inference-perf 36 | ``` -------------------------------------------------------------------------------- /deploy/manifests.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: inference-perf 5 | labels: 6 | app: inference-perf 7 | spec: 8 | template: 9 | metadata: 10 | labels: 11 | app: inference-perf 12 | spec: 13 | containers: 14 | - name: inference-perf 15 | image: 16 | imagePullPolicy: Always 17 | command: ["inference-perf"] 18 | args: ["--config_file", "/etc/config/config.yml"] 19 | volumeMounts: 20 | - name: config-volume 21 | mountPath: /etc/config 22 | readOnly: true 23 | restartPolicy: Never 24 | volumes: 25 | - name: config-volume 26 | configMap: 27 | name: inference-perf-config -------------------------------------------------------------------------------- /docs/design.md: -------------------------------------------------------------------------------- 1 | # Design 2 | 3 | This document describes the high level design for the tool. It includes the 4 | following components. 5 | 6 | ## Dataset Preprocessor 7 | 8 | Dataset Preprocessor takes in a known dataset like ShareGPT or OpenOrca as the 9 | input and pre-processes them by making sure the prompt length and generation 10 | length are aligned with the user input to support different options like fixed 11 | input / output length tests, variable length tests (larger input / smaller 12 | output and the vice versa). This allows us to support different GenAI use cases 13 | like chat completion, summarization, code completion, etc. depending on the 14 | dataset and the benchmarking user’s inputs. 15 | 16 | ## Load Generator 17 | 18 | Load Generator is the component which generates different traffic patterns based 19 | on user input. This can include a fixed RPS test for a predetermined amount of 20 | time or include a way to generate bursts in traffic or other traffic patterns as 21 | desired for autoscaling and other use cases. 22 | 23 | ## Request Processor 24 | 25 | Request Processor provides a way to support different model servers and their 26 | corresponding request payload with different configurable parameters. This makes 27 | our tool model server agnostic and provides a generic way to benchmark different 28 | model servers and produce apples to apples comparison between them. This 29 | component will also support different protocols like http and grpc and options 30 | like request streaming which is important to produce time to first token (TTFT) 31 | metric. 32 | 33 | ## Response Processor / Data Collector 34 | 35 | Response Processor / Data Collector component allows us to process the response 36 | and measure the actual performance of the model server in terms of request 37 | latency, TPOT, TTFT and throughput. 38 | 39 | ## Report Generator / Metrics Exporter 40 | 41 | Report Generator / Metrics Exporter generates a report based on the data 42 | collected during benchmarking. It can also export the different metrics that we 43 | collected during benchmarking as metrics into Prometheus which can then be 44 | consumed by other monitoring or visualization solutions. 45 | 46 | ![benchmarking-tool-architecture](./images/design.png) 47 | 48 | ## Metrics to Collect 49 | 50 | The following are the essential metrics that we want to collect using the 51 | benchmarking tool. 52 | 53 | * Throughput 54 | * Output tokens / second 55 | * Input tokens / second 56 | * Requests / second 57 | * Latency at different percentiles (mean, median, p90, p99) 58 | * Time per output token (TPOT) 59 | * Inter-token latency (ITL) 60 | * Time to first token (TTFT) 61 | * Time per request 62 | * Request metrics (mean, median, p90, p99) 63 | * Prompt tokens 64 | * Output tokens 65 | 66 | Optionally we also want to collect specific accelerator and model server metrics. 67 | 68 | * Accelerator metrics (mean, median, p90, p99) 69 | * Accelerator utilization (duty cycle) 70 | * Accelerator memory utilization 71 | * Accelerator memory bandwidth utilization 72 | * Accelerator power usage 73 | * Model server metrics (mean, median, p90, p99) 74 | * Batch size 75 | * Queue size 76 | * KV cache usage 77 | -------------------------------------------------------------------------------- /docs/images/design.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/kubernetes-sigs/inference-perf/14ea94163c5942119ca7e3fc5633e396a523b06e/docs/images/design.png -------------------------------------------------------------------------------- /examples/vllm/config-random.yml: -------------------------------------------------------------------------------- 1 | load: 2 | type: constant 3 | stages: 4 | - rate: 1 5 | duration: 30 6 | api: completion 7 | server: 8 | type: vllm 9 | model_name: HuggingFaceTB/SmolLM2-135M-Instruct 10 | base_url: http://0.0.0.0:8000 11 | ignore_eos: true 12 | tokenizer: 13 | pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct 14 | data: 15 | type: random 16 | input_distribution: 17 | min: 10 18 | max: 100 19 | mean: 50 20 | std: 10 21 | total_count: 100 22 | output_distribution: 23 | min: 10 24 | max: 512 25 | mean: 256 26 | std: 100 27 | total_count: 100 28 | metrics: 29 | type: prometheus 30 | prometheus: 31 | url: http://localhost:9090 32 | scrape_interval: 15 33 | report: 34 | request_lifecycle: 35 | summary: true 36 | per_stage: true 37 | per_request: true -------------------------------------------------------------------------------- /examples/vllm/config-shared-prefix.yml: -------------------------------------------------------------------------------- 1 | load: 2 | type: constant 3 | interval: 15 4 | stages: 5 | - rate: 1 6 | duration: 30 7 | - rate: 2 8 | duration: 30 9 | api: completion 10 | server: 11 | type: vllm 12 | model_name: HuggingFaceTB/SmolLM2-135M-Instruct 13 | base_url: http://0.0.0.0:8000 14 | ignore_eos: true 15 | tokenizer: 16 | pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct 17 | data: 18 | type: shared_prefix 19 | shared_prefix: 20 | num_groups: 10 # Number of distinct shared prefixes 21 | num_prompts_per_group: 10 # Number of unique questions per shared prefix 22 | system_prompt_len: 100 # Length of the shared prefix (in tokens) 23 | question_len: 50 # Length of the unique question part (in tokens) 24 | output_len: 50 # Target length for the model's generated output (in tokens) 25 | metrics: 26 | type: prometheus 27 | prometheus: 28 | url: http://localhost:9090 29 | scrape_interval: 15 30 | report: 31 | request_lifecycle: 32 | summary: true 33 | per_stage: true 34 | per_request: true -------------------------------------------------------------------------------- /examples/vllm/config-synthetic.yml: -------------------------------------------------------------------------------- 1 | load: 2 | type: constant 3 | interval: 15 4 | stages: 5 | - rate: 1 6 | duration: 30 7 | - rate: 2 8 | duration: 30 9 | api: completion 10 | server: 11 | type: vllm 12 | model_name: HuggingFaceTB/SmolLM2-135M-Instruct 13 | base_url: http://0.0.0.0:8000 14 | tokenizer: 15 | pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct 16 | data: 17 | type: synthetic 18 | input_distribution: 19 | min: 10 20 | max: 100 21 | mean: 50 22 | std: 10 23 | total_count: 100 24 | output_distribution: 25 | min: 10 26 | max: 100 27 | mean: 50 28 | std: 10 29 | total_count: 100 30 | metrics: 31 | type: prometheus 32 | prometheus: 33 | url: http://localhost:9090 34 | scrape_interval: 15 35 | report: 36 | request_lifecycle: 37 | summary: true 38 | per_stage: true 39 | per_request: true 40 | -------------------------------------------------------------------------------- /examples/vllm/config.yml: -------------------------------------------------------------------------------- 1 | data: 2 | type: shareGPT 3 | load: 4 | type: constant 5 | stages: 6 | - rate: 1 7 | duration: 30 8 | api: chat 9 | server: 10 | type: vllm 11 | model_name: HuggingFaceTB/SmolLM2-135M-Instruct 12 | base_url: http://0.0.0.0:8000 -------------------------------------------------------------------------------- /examples/vllm/vllm_server.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Benchmark vLLM Server with inference-perf" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "### Local vLLM Setup using docker" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "Run vLLM Server as a docker container with the model HuggingFace `HuggingFaceTB/SmolLM2-135M-Instruct`" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 1, 27 | "metadata": {}, 28 | "outputs": [ 29 | { 30 | "name": "stdout", 31 | "output_type": "stream", 32 | "text": [ 33 | "71c1f998ef3488239cf88c97e0084e6287c87df3f3de3842e47c3751acc43329\n" 34 | ] 35 | } 36 | ], 37 | "source": [ 38 | "!export MODEL_NAME=\"HuggingFaceTB/SmolLM2-135M-Instruct\" && \\\n", 39 | " docker run --name vllm-server -d --runtime nvidia --gpus all \\\n", 40 | " -v ~/.cache/huggingface:/root/.cache/huggingface \\\n", 41 | " -p 8000:8000 vllm/vllm-openai:latest \\\n", 42 | " --model ${MODEL_NAME}" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "**Note**: Configure [vLLM engine arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#engine-args) like `--max-model-len` and `--max-num-seqs` according to local compute capacity" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "### Benchmark with inference_perf" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Create a configuration file for the test using `shareGPT` data and run the constant rate test for `30s`. You can also use the synthetic dataset if you prefer by running with the `config-synthetic.yml` file instead." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 2, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "data:\n", 76 | " type: shareGPT\n", 77 | "load:\n", 78 | " type: constant\n", 79 | " rate: 1\n", 80 | " duration: 30\n", 81 | "vllm:\n", 82 | " api: chat\n", 83 | " model_name: HuggingFaceTB/SmolLM2-135M-Instruct\n", 84 | " url: http://0.0.0.0:8000" 85 | ] 86 | } 87 | ], 88 | "source": [ 89 | "!cat config.yml" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 3, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "Using configuration from: config.yml\n", 102 | "Run started\n", 103 | "Run completed\n", 104 | "\n", 105 | "\n", 106 | "Generating Report ..\n", 107 | "total_requests: 38\n", 108 | "avg_prompt_tokens: 2.763157894736842\n", 109 | "avg_output_tokens: 28.94736842105263\n", 110 | "avg_time_per_request: 0.11538009351045873\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "!inference-perf --config_file config.yml" 116 | ] 117 | }, 118 | { 119 | "cell_type": "markdown", 120 | "metadata": {}, 121 | "source": [ 122 | "### Cleanup" 123 | ] 124 | }, 125 | { 126 | "cell_type": "markdown", 127 | "metadata": {}, 128 | "source": [ 129 | "Delete vLLM Server docker processes" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 4, 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stdout", 139 | "output_type": "stream", 140 | "text": [ 141 | "vllm-server\n", 142 | "vllm-server\n" 143 | ] 144 | } 145 | ], 146 | "source": [ 147 | "!docker stop vllm-server && docker rm vllm-server" 148 | ] 149 | } 150 | ], 151 | "metadata": { 152 | "kernelspec": { 153 | "display_name": ".venv", 154 | "language": "python", 155 | "name": "python3" 156 | }, 157 | "language_info": { 158 | "codemirror_mode": { 159 | "name": "ipython", 160 | "version": 3 161 | }, 162 | "file_extension": ".py", 163 | "mimetype": "text/x-python", 164 | "name": "python", 165 | "nbconvert_exporter": "python", 166 | "pygments_lexer": "ipython3", 167 | "version": "3.11.2" 168 | } 169 | }, 170 | "nbformat": 4, 171 | "nbformat_minor": 2 172 | } 173 | -------------------------------------------------------------------------------- /inference_perf/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .main import main_cli 15 | 16 | __all__ = ["main_cli"] 17 | -------------------------------------------------------------------------------- /inference_perf/apis/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .base import InferenceAPIData, InferenceInfo, RequestLifecycleMetric, ErrorResponseInfo 15 | from .chat import ChatCompletionAPIData, ChatMessage 16 | from .completion import CompletionAPIData 17 | 18 | __all__ = [ 19 | "InferenceAPIData", 20 | "InferenceInfo", 21 | "RequestLifecycleMetric", 22 | "ErrorResponseInfo", 23 | "ChatCompletionAPIData", 24 | "ChatMessage", 25 | "CompletionAPIData", 26 | ] 27 | -------------------------------------------------------------------------------- /inference_perf/apis/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import abstractmethod 16 | from typing import Any, Optional 17 | from pydantic import BaseModel 18 | from inference_perf.utils.custom_tokenizer import CustomTokenizer 19 | from inference_perf.config import APIType 20 | 21 | 22 | class InferenceInfo(BaseModel): 23 | input_tokens: int = 0 24 | output_tokens: int = 0 25 | 26 | 27 | class ErrorResponseInfo(BaseModel): 28 | error_type: str 29 | error_msg: str 30 | 31 | 32 | class RequestLifecycleMetric(BaseModel): 33 | stage_id: Optional[int] = None 34 | start_time: float 35 | end_time: float 36 | request_data: str 37 | response_data: Optional[str] = None 38 | info: InferenceInfo 39 | error: Optional[ErrorResponseInfo] 40 | 41 | 42 | class InferenceAPIData(BaseModel): 43 | @abstractmethod 44 | def get_api_type(self) -> APIType: 45 | raise NotImplementedError 46 | 47 | @abstractmethod 48 | def get_route(self) -> str: 49 | raise NotImplementedError 50 | 51 | @abstractmethod 52 | def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool) -> dict[str, Any]: 53 | raise NotImplementedError 54 | 55 | @abstractmethod 56 | def process_response(self, data: dict[str, Any], tokenizer: CustomTokenizer) -> InferenceInfo: 57 | raise NotImplementedError 58 | -------------------------------------------------------------------------------- /inference_perf/apis/chat.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import Any, List 16 | from pydantic import BaseModel 17 | from inference_perf.apis import InferenceAPIData, InferenceInfo 18 | from inference_perf.utils.custom_tokenizer import CustomTokenizer 19 | from inference_perf.config import APIType 20 | 21 | 22 | class ChatMessage(BaseModel): 23 | role: str 24 | content: str 25 | 26 | 27 | class ChatCompletionAPIData(InferenceAPIData): 28 | messages: List[ChatMessage] 29 | max_tokens: int = 0 30 | 31 | def get_api_type(self) -> APIType: 32 | return APIType.Chat 33 | 34 | def get_route(self) -> str: 35 | return "/v1/chat/completions" 36 | 37 | def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool) -> dict[str, Any]: 38 | if self.max_tokens == 0: 39 | self.max_tokens = max_tokens 40 | return { 41 | "model": model_name, 42 | "messages": [{"role": m.role, "content": m.content} for m in self.messages], 43 | "max_tokens": self.max_tokens, 44 | "ignore_eos": ignore_eos, 45 | } 46 | 47 | def process_response(self, data: dict[str, Any], tokenizer: CustomTokenizer) -> InferenceInfo: 48 | choices = data.get("choices", []) 49 | output_text = choices[0].get("message", {}).get("content", "") 50 | output_len = tokenizer.count_tokens(output_text) 51 | return InferenceInfo( 52 | input_tokens=0, 53 | output_tokens=output_len, 54 | ) 55 | -------------------------------------------------------------------------------- /inference_perf/apis/completion.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | 16 | from typing import Any 17 | from inference_perf.apis import InferenceAPIData, InferenceInfo 18 | from inference_perf.utils.custom_tokenizer import CustomTokenizer 19 | from inference_perf.config import APIType 20 | 21 | 22 | class CompletionAPIData(InferenceAPIData): 23 | prompt: str 24 | max_tokens: int = 0 25 | 26 | def get_api_type(self) -> APIType: 27 | return APIType.Completion 28 | 29 | def get_route(self) -> str: 30 | return "/v1/completions" 31 | 32 | def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool) -> dict[str, Any]: 33 | if self.max_tokens == 0: 34 | self.max_tokens = max_tokens 35 | return { 36 | "model": model_name, 37 | "prompt": self.prompt, 38 | "max_tokens": self.max_tokens, 39 | "ignore_eos": ignore_eos, 40 | } 41 | 42 | def process_response(self, data: dict[str, Any], tokenizer: CustomTokenizer) -> InferenceInfo: 43 | choices = data.get("choices", []) 44 | prompt_len = tokenizer.count_tokens(self.prompt) 45 | output_text = choices[0].get("text", "") 46 | output_len = tokenizer.count_tokens(output_text) 47 | return InferenceInfo(input_tokens=prompt_len, output_tokens=output_len) 48 | -------------------------------------------------------------------------------- /inference_perf/client/filestorage/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .base import StorageClient 15 | from .gcs import GoogleCloudStorageClient 16 | 17 | 18 | __all__ = ["StorageClient", "GoogleCloudStorageClient"] 19 | -------------------------------------------------------------------------------- /inference_perf/client/filestorage/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from abc import ABC, abstractmethod 16 | from typing import List 17 | from inference_perf.config import StorageConfigBase 18 | from inference_perf.utils import ReportFile 19 | 20 | 21 | class StorageClient(ABC): 22 | def __init__(self, config: StorageConfigBase) -> None: 23 | self.config = config 24 | print(f"Report files will be stored at: {self.config.path}") 25 | 26 | @abstractmethod 27 | def save_report(self, reports: List[ReportFile]) -> None: 28 | raise NotImplementedError() 29 | -------------------------------------------------------------------------------- /inference_perf/client/filestorage/gcs.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import json 15 | from typing import List 16 | from google.cloud import storage 17 | from google.cloud.exceptions import GoogleCloudError 18 | from inference_perf.client.filestorage import StorageClient 19 | from inference_perf.config import GoogleCloudStorageConfig 20 | from inference_perf.utils import ReportFile 21 | 22 | 23 | class GoogleCloudStorageClient(StorageClient): 24 | def __init__(self, config: GoogleCloudStorageConfig) -> None: 25 | super().__init__(config=config) 26 | print("Created new GCS client") 27 | self.output_bucket = config.bucket_name 28 | self.client = storage.Client() 29 | 30 | self.bucket = self.client.lookup_bucket(config.bucket_name) 31 | if self.bucket is None: 32 | raise ValueError(f"GCS bucket '{config.bucket_name}' does not exist or is inaccessible.") 33 | 34 | def save_report(self, reports: List[ReportFile]) -> None: 35 | filenames = [report.get_filename() for report in reports] 36 | if len(filenames) != len(set(filenames)): 37 | raise ValueError("Duplicate filenames detected", filenames) 38 | 39 | for _, report in enumerate(reports): 40 | filename = report.get_filename() 41 | blob_path = f"{self.config.path if self.config.path else ''}/{self.config.report_file_prefix if self.config.report_file_prefix else ''}{filename}" 42 | blob = self.bucket.blob(blob_path) 43 | 44 | if blob.exists(): 45 | print(f"Skipping upload: gs://{self.output_bucket}/{blob_path} already exists") 46 | continue 47 | 48 | try: 49 | blob.upload_from_string(json.dumps(report.get_contents()), content_type="application/json") 50 | print(f"Uploaded gs://{self.output_bucket}/{blob_path}") 51 | except GoogleCloudError as e: 52 | print(f"Failed to upload {blob_path}: {e}") 53 | -------------------------------------------------------------------------------- /inference_perf/client/metricsclient/README.md: -------------------------------------------------------------------------------- 1 | # Model Server Metrics Query Clients 2 | 3 | This repository provides clients to query performance metrics from various monitoring platforms. Each model server exposes a list of relevant performance metrics, and these clients are designed to retrieve and process that data effectively. 4 | 5 | ## Supported Monitoring Platforms 6 | 7 | **Available now**: 8 | - Self Deployed Prometheus 9 | 10 | **Todo**: 11 | - Google Cloud Monitoring 12 | - AWS CloudWatch 13 | - Azure Monitor -------------------------------------------------------------------------------- /inference_perf/client/metricsclient/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .base import MetricsClient, PerfRuntimeParameters, ModelServerMetrics 15 | from .mock_client import MockMetricsClient 16 | from .prometheus_client import PrometheusMetricsClient 17 | 18 | __all__ = ["MetricsClient", "MockMetricsClient", "PerfRuntimeParameters", "PrometheusMetricsClient", "ModelServerMetrics"] 19 | -------------------------------------------------------------------------------- /inference_perf/client/metricsclient/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from abc import ABC, abstractmethod 15 | from inference_perf.client.modelserver.base import ModelServerClient 16 | from inference_perf.loadgen.load_generator import StageRuntimeInfo 17 | from pydantic import BaseModel 18 | 19 | 20 | class PerfRuntimeParameters: 21 | def __init__( 22 | self, start_time: float, duration: float, model_server_client: ModelServerClient, stages: dict[int, StageRuntimeInfo] 23 | ) -> None: 24 | self.start_time = start_time 25 | self.duration = duration 26 | self.stages = stages 27 | self.model_server_client = model_server_client 28 | 29 | 30 | class ModelServerMetrics(BaseModel): 31 | # Throughput 32 | prompt_tokens_per_second: float = 0.0 33 | output_tokens_per_second: float = 0.0 34 | requests_per_second: float = 0.0 35 | 36 | # Latency 37 | avg_request_latency: float = 0.0 38 | median_request_latency: float = 0.0 39 | p90_request_latency: float = 0.0 40 | p99_request_latency: float = 0.0 41 | avg_time_to_first_token: float = 0.0 42 | median_time_to_first_token: float = 0.0 43 | p90_time_to_first_token: float = 0.0 44 | p99_time_to_first_token: float = 0.0 45 | avg_time_per_output_token: float = 0.0 46 | median_time_per_output_token: float = 0.0 47 | p90_time_per_output_token: float = 0.0 48 | p99_time_per_output_token: float = 0.0 49 | 50 | # Request 51 | total_requests: int = 0 52 | avg_prompt_tokens: int = 0 53 | avg_output_tokens: int = 0 54 | avg_queue_length: int = 0 55 | 56 | 57 | class MetricsClient(ABC): 58 | @abstractmethod 59 | def __init__(self) -> None: 60 | pass 61 | 62 | @abstractmethod 63 | def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> ModelServerMetrics | None: 64 | raise NotImplementedError 65 | 66 | @abstractmethod 67 | def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> ModelServerMetrics | None: 68 | raise NotImplementedError 69 | 70 | @abstractmethod 71 | def wait(self) -> None: 72 | raise NotImplementedError 73 | -------------------------------------------------------------------------------- /inference_perf/client/metricsclient/mock_client.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .base import MetricsClient, PerfRuntimeParameters, ModelServerMetrics 15 | 16 | 17 | class MockMetricsClient(MetricsClient): 18 | def __init__(self) -> None: 19 | pass 20 | 21 | def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> ModelServerMetrics | None: 22 | return None 23 | 24 | def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> ModelServerMetrics | None: 25 | return None 26 | 27 | def wait(self) -> None: 28 | pass 29 | -------------------------------------------------------------------------------- /inference_perf/client/metricsclient/prometheus_client.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import time 15 | from typing import cast 16 | import requests 17 | from inference_perf.client.modelserver.base import ModelServerClient, ModelServerPrometheusMetric 18 | from inference_perf.config import PrometheusClientConfig 19 | from .base import MetricsClient, PerfRuntimeParameters, ModelServerMetrics 20 | 21 | PROMETHEUS_SCRAPE_BUFFER_SEC = 2 22 | 23 | 24 | class PrometheusQueryBuilder: 25 | def __init__(self, model_server_metric: ModelServerPrometheusMetric, duration: float): 26 | self.model_server_metric = model_server_metric 27 | self.duration = duration 28 | 29 | def get_queries(self) -> dict[str, dict[str, str]]: 30 | """ 31 | Returns a dictionary of queries for each metric type. 32 | """ 33 | metric_name = self.model_server_metric.name 34 | filter = self.model_server_metric.filters 35 | return { 36 | "gauge": { 37 | "mean": "avg_over_time(%s{%s}[%.0fs])" % (metric_name, filter, self.duration), 38 | "median": "quantile_over_time(0.5, %s{%s}[%.0fs])" % (metric_name, filter, self.duration), 39 | "sd": "stddev_over_time(%s{%s}[%.0fs])" % (metric_name, filter, self.duration), 40 | "min": "min_over_time(%s{%s}[%.0fs])" % (metric_name, filter, self.duration), 41 | "max": "max_over_time(%s{%s}[%.0fs])" % (metric_name, filter, self.duration), 42 | "p90": "quantile_over_time(0.9, %s{%s}[%.0fs])" % (metric_name, filter, self.duration), 43 | "p99": "quantile_over_time(0.99, %s{%s}[%.0fs])" % (metric_name, filter, self.duration), 44 | }, 45 | "histogram": { 46 | "mean": "sum(rate(%s_sum{%s}[%.0fs])) / (sum(rate(%s_count{%s}[%.0fs])) > 0)" 47 | % (metric_name, filter, self.duration, metric_name, filter, self.duration), 48 | "median": "histogram_quantile(0.5, sum(rate(%s_bucket{%s}[%.0fs])) by (le))" 49 | % (metric_name, filter, self.duration), 50 | "min": "histogram_quantile(0, sum(rate(%s_bucket{%s}[%.0fs])) by (le))" % (metric_name, filter, self.duration), 51 | "max": "histogram_quantile(1, sum(rate(%s_bucket{%s}[%.0fs])) by (le))" % (metric_name, filter, self.duration), 52 | "p90": "histogram_quantile(0.9, sum(rate(%s_bucket{%s}[%.0fs])) by (le))" 53 | % (metric_name, filter, self.duration), 54 | "p99": "histogram_quantile(0.99, sum(rate(%s_bucket{%s}[%.0fs])) by (le))" 55 | % (metric_name, filter, self.duration), 56 | }, 57 | "counter": { 58 | "rate": "sum(rate(%s{%s}[%.0fs]))" % (metric_name, filter, self.duration), 59 | "increase": "sum(increase(%s{%s}[%.0fs]))" % (metric_name, filter, self.duration), 60 | "mean": "avg_over_time(rate(%s{%s}[%.0fs])[%.0fs:%.0fs])" 61 | % (metric_name, filter, self.duration, self.duration, self.duration), 62 | "max": "max_over_time(rate(%s{%s}[%.0fs])[%.0fs:%.0fs])" 63 | % (metric_name, filter, self.duration, self.duration, self.duration), 64 | "min": "min_over_time(rate(%s{%s}[%.0fs])[%.0fs:%.0fs])" 65 | % (metric_name, filter, self.duration, self.duration, self.duration), 66 | "p90": "quantile_over_time(0.9, rate(%s{%s}[%.0fs])[%.0fs:%.0fs])" 67 | % (metric_name, filter, self.duration, self.duration, self.duration), 68 | "p99": "quantile_over_time(0.99, rate(%s{%s}[%.0fs])[%.0fs:%.0fs])" 69 | % (metric_name, filter, self.duration, self.duration, self.duration), 70 | }, 71 | } 72 | 73 | def build_query(self) -> str: 74 | """ 75 | Builds the PromQL query for the given metric type and query operation. 76 | 77 | Returns: 78 | The PromQL query. 79 | """ 80 | metric_type = self.model_server_metric.type 81 | query_op = self.model_server_metric.op 82 | 83 | queries = self.get_queries() 84 | if metric_type not in queries: 85 | print("Invalid metric type: %s" % (metric_type)) 86 | return "" 87 | if query_op not in queries[metric_type]: 88 | print("Invalid query operation: %s" % (query_op)) 89 | return "" 90 | return queries[metric_type][query_op] 91 | 92 | 93 | class PrometheusMetricsClient(MetricsClient): 94 | def __init__(self, config: PrometheusClientConfig) -> None: 95 | if config: 96 | self.url = config.url 97 | if not self.url: 98 | raise Exception("prometheus url missing") 99 | self.scrape_interval = config.scrape_interval or 30 100 | else: 101 | raise Exception("prometheus config missing") 102 | 103 | def wait(self) -> None: 104 | """ 105 | Waits for the Prometheus server to scrape the metrics. 106 | We have added a buffer of 5 seconds to the scrape interval to ensure that metrics for even the last request are collected. 107 | """ 108 | wait_time = self.scrape_interval + PROMETHEUS_SCRAPE_BUFFER_SEC 109 | time.sleep(wait_time) 110 | 111 | def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> ModelServerMetrics | None: 112 | """ 113 | Collects the summary metrics for the given Perf Benchmark run. 114 | 115 | Args: 116 | runtime_parameters: The runtime parameters containing details about the Perf Benchmark like the duration and model server client 117 | 118 | Returns: 119 | A ModelServerMetrics object containing the summary metrics. 120 | """ 121 | if runtime_parameters is None: 122 | print("Perf Runtime parameters are not set, skipping metrics collection") 123 | return None 124 | 125 | # Get the duration and model server client from the runtime parameters 126 | query_eval_time = time.time() 127 | query_duration = query_eval_time - runtime_parameters.start_time 128 | 129 | return self.get_model_server_metrics(runtime_parameters.model_server_client, query_duration, query_eval_time) 130 | 131 | def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> ModelServerMetrics | None: 132 | """ 133 | Collects the summary metrics for a specific stage. 134 | 135 | Args: 136 | runtime_parameters: The runtime parameters containing details about the Perf Benchmark like the duration and model server client 137 | stage_id: The ID of the stage for which to collect metrics 138 | 139 | Returns: 140 | A ModelServerMetrics object containing the summary metrics for the specified stage. 141 | """ 142 | if runtime_parameters is None: 143 | print("Perf Runtime parameters are not set, skipping metrics collection") 144 | return None 145 | 146 | if runtime_parameters.stages is None or stage_id not in runtime_parameters.stages: 147 | print(f"Stage ID {stage_id} is not present in the runtime parameters, skipping metrics collection for this stage") 148 | return None 149 | 150 | # Get the query evaluation time and duration for the stage 151 | # The query evaluation time is the end time of the stage plus the scrape interval and a buffer to ensure metrics are collected 152 | # Duration is calculated as the difference between the eval time and start time of the stage 153 | query_eval_time = runtime_parameters.stages[stage_id].end_time + self.scrape_interval + PROMETHEUS_SCRAPE_BUFFER_SEC 154 | query_duration = query_eval_time - runtime_parameters.stages[stage_id].start_time 155 | return self.get_model_server_metrics(runtime_parameters.model_server_client, query_duration, query_eval_time) 156 | 157 | def get_model_server_metrics( 158 | self, model_server_client: ModelServerClient, query_duration: float, query_eval_time: float 159 | ) -> ModelServerMetrics | None: 160 | """ 161 | Collects the summary metrics for the given Model Server Client and query duration. 162 | 163 | Args: 164 | model_server_client: The model server client to use for collecting metrics 165 | query_duration: The duration for which to collect metrics 166 | query_eval_time: The time at which the query is evaluated, used to ensure we are querying the correct time range 167 | 168 | Returns: 169 | A ModelServerMetrics object containing the summary metrics. 170 | """ 171 | model_server_metrics: ModelServerMetrics = ModelServerMetrics() 172 | 173 | # Get the engine and model from the model server client 174 | if not model_server_client: 175 | print("Model server client is not set") 176 | return None 177 | 178 | metrics_metadata = model_server_client.get_prometheus_metric_metadata() 179 | if not metrics_metadata: 180 | print("Metrics metadata is not present for the runtime") 181 | return None 182 | for summary_metric_name in metrics_metadata: 183 | summary_metric_metadata = metrics_metadata.get(summary_metric_name) 184 | if summary_metric_metadata is None: 185 | print("Metric metadata is not present for metric: %s. Skipping this metric." % (summary_metric_name)) 186 | continue 187 | summary_metric_metadata = cast(ModelServerPrometheusMetric, summary_metric_metadata) 188 | if summary_metric_metadata is None: 189 | print( 190 | "Metric metadata for %s is missing or has an incorrect format. Skipping this metric." 191 | % (summary_metric_name) 192 | ) 193 | continue 194 | 195 | query_builder = PrometheusQueryBuilder(summary_metric_metadata, query_duration) 196 | query = query_builder.build_query() 197 | if not query: 198 | print("No query found for metric: %s. Skipping metric." % (summary_metric_name)) 199 | continue 200 | 201 | # Execute the query and get the result 202 | result = self.execute_query(query, str(query_eval_time)) 203 | if result is None: 204 | print("Error executing query: %s" % (query)) 205 | continue 206 | # Set the result in metrics summary 207 | attr = getattr(model_server_metrics, summary_metric_name) 208 | if attr is not None: 209 | target_type = type(attr) 210 | setattr(model_server_metrics, summary_metric_name, target_type(result)) 211 | 212 | return model_server_metrics 213 | 214 | def execute_query(self, query: str, eval_time: str) -> float: 215 | """ 216 | Executes the given query on the Prometheus server and returns the result. 217 | 218 | Args: 219 | query: the PromQL query to execute 220 | eval_time: the time at which the query is evaluated, used to ensure we are querying the correct time range 221 | 222 | Returns: 223 | The result of the query. 224 | """ 225 | query_result = 0.0 226 | try: 227 | response = requests.get(f"{self.url}/api/v1/query", params={"query": query, "time": eval_time}) 228 | if response is None: 229 | print("Error executing query: %s" % (query)) 230 | return query_result 231 | 232 | response.raise_for_status() 233 | except Exception as e: 234 | print("Error executing query: %s" % (e)) 235 | return query_result 236 | 237 | # Check if the response is valid 238 | # Sample response: 239 | # { 240 | # "status": "success", 241 | # "data": { 242 | # "resultType": "vector", 243 | # "result": [ 244 | # { 245 | # "metric": {}, 246 | # "value": [ 247 | # 1632741820.781, 248 | # "0.0000000000000000" 249 | # ] 250 | # } 251 | # ] 252 | # } 253 | # } 254 | response_obj = response.json() 255 | if response_obj.get("status") != "success": 256 | print("Error executing query: %s" % (response_obj)) 257 | return query_result 258 | 259 | data = response_obj.get("data", {}) 260 | result = data.get("result", []) 261 | if len(result) > 0 and "value" in result[0]: 262 | if isinstance(result[0]["value"], list) and len(result[0]["value"]) > 1: 263 | # Return the value of the first result 264 | # The value is in the second element of the list 265 | # e.g. [1632741820.781, "0.0000000000000000"] 266 | # We need to convert it to float 267 | # and return it 268 | # Convert the value to float 269 | try: 270 | query_result = round(float(result[0]["value"][1]), 6) 271 | except ValueError: 272 | print("Error converting value to float: %s" % (result[0]["value"][1])) 273 | return query_result 274 | return query_result 275 | -------------------------------------------------------------------------------- /inference_perf/client/modelserver/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .base import ModelServerClient 15 | from .mock_client import MockModelServerClient 16 | from .vllm_client import vLLMModelServerClient 17 | 18 | 19 | __all__ = ["ModelServerClient", "MockModelServerClient", "vLLMModelServerClient"] 20 | -------------------------------------------------------------------------------- /inference_perf/client/modelserver/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from abc import ABC, abstractmethod 15 | from typing import List, Tuple, TypedDict 16 | from inference_perf.config import APIType 17 | 18 | from inference_perf.apis import InferenceAPIData 19 | 20 | 21 | class ModelServerPrometheusMetric: 22 | def __init__(self, name: str, op: str, type: str, filters: str) -> None: 23 | self.name = name 24 | self.op = op 25 | self.type = type 26 | self.filters = filters 27 | 28 | 29 | # PrometheusMetricMetadata stores the mapping of metrics to their model server names and types 30 | # and the filters to be applied to them. 31 | # This is used to generate Prometheus query for the metrics. 32 | class PrometheusMetricMetadata(TypedDict): 33 | # Throughput 34 | prompt_tokens_per_second: ModelServerPrometheusMetric 35 | output_tokens_per_second: ModelServerPrometheusMetric 36 | requests_per_second: ModelServerPrometheusMetric 37 | 38 | # Latency 39 | avg_request_latency: ModelServerPrometheusMetric 40 | median_request_latency: ModelServerPrometheusMetric 41 | p90_request_latency: ModelServerPrometheusMetric 42 | p99_request_latency: ModelServerPrometheusMetric 43 | avg_time_to_first_token: ModelServerPrometheusMetric 44 | median_time_to_first_token: ModelServerPrometheusMetric 45 | p90_time_to_first_token: ModelServerPrometheusMetric 46 | p99_time_to_first_token: ModelServerPrometheusMetric 47 | avg_time_per_output_token: ModelServerPrometheusMetric 48 | median_time_per_output_token: ModelServerPrometheusMetric 49 | p90_time_per_output_token: ModelServerPrometheusMetric 50 | p99_time_per_output_token: ModelServerPrometheusMetric 51 | 52 | # Request 53 | total_requests: ModelServerPrometheusMetric 54 | avg_prompt_tokens: ModelServerPrometheusMetric 55 | avg_output_tokens: ModelServerPrometheusMetric 56 | avg_queue_length: ModelServerPrometheusMetric 57 | 58 | 59 | class ModelServerClient(ABC): 60 | @abstractmethod 61 | def __init__(self, api_type: APIType, *args: Tuple[int, ...]) -> None: 62 | if api_type not in self.get_supported_apis(): 63 | raise Exception(f"Unsupported API type {api_type}") 64 | 65 | self.apiType = api_type 66 | 67 | @abstractmethod 68 | def get_supported_apis(self) -> List[APIType]: 69 | raise NotImplementedError 70 | 71 | @abstractmethod 72 | async def process_request(self, data: InferenceAPIData, stage_id: int) -> None: 73 | raise NotImplementedError 74 | 75 | @abstractmethod 76 | def get_prometheus_metric_metadata(self) -> PrometheusMetricMetadata: 77 | # assumption: all metrics clients have metrics exported in Prometheus format 78 | raise NotImplementedError 79 | -------------------------------------------------------------------------------- /inference_perf/client/modelserver/mock_client.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from inference_perf.client.requestdatacollector import RequestDataCollector 16 | from typing import List 17 | from inference_perf.config import APIType 18 | from inference_perf.apis import InferenceAPIData, InferenceInfo, RequestLifecycleMetric 19 | from .base import ModelServerClient 20 | import asyncio 21 | import time 22 | 23 | 24 | class MockModelServerClient(ModelServerClient): 25 | def __init__(self, metrics_collector: RequestDataCollector, api_type: APIType) -> None: 26 | super().__init__(api_type) 27 | self.metrics_collector = metrics_collector 28 | 29 | async def process_request(self, data: InferenceAPIData, stage_id: int) -> None: 30 | start = time.monotonic() 31 | print("Processing mock request for stage - " + str(stage_id)) 32 | await asyncio.sleep(3) 33 | self.metrics_collector.record_metric( 34 | RequestLifecycleMetric( 35 | stage_id=stage_id, 36 | request_data=str(data.to_payload("mock_model", 3, False)), 37 | info=InferenceInfo( 38 | input_tokens=0, 39 | output_tokens=0, 40 | ), 41 | error=None, 42 | start_time=start, 43 | end_time=time.monotonic(), 44 | ) 45 | ) 46 | 47 | def get_supported_apis(self) -> List[APIType]: 48 | return [APIType.Completion, APIType.Chat] 49 | -------------------------------------------------------------------------------- /inference_perf/client/modelserver/vllm_client.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from inference_perf.client.requestdatacollector import RequestDataCollector 16 | from inference_perf.config import APIType 17 | from inference_perf.apis import InferenceAPIData, InferenceInfo, RequestLifecycleMetric, ErrorResponseInfo 18 | from inference_perf.utils import CustomTokenizer 19 | from .base import ModelServerClient, PrometheusMetricMetadata, ModelServerPrometheusMetric 20 | from typing import List 21 | import aiohttp 22 | import json 23 | import time 24 | 25 | 26 | class vLLMModelServerClient(ModelServerClient): 27 | def __init__( 28 | self, 29 | metrics_collector: RequestDataCollector, 30 | api_type: APIType, 31 | uri: str, 32 | model_name: str, 33 | tokenizer: CustomTokenizer, 34 | ignore_eos: bool = True, 35 | ) -> None: 36 | super().__init__(api_type) 37 | self.model_name = model_name 38 | self.uri = uri 39 | self.max_completion_tokens = 30 # default to use when not set at the request level 40 | self.ignore_eos = ignore_eos 41 | self.tokenizer = tokenizer 42 | self.metrics_collector = metrics_collector 43 | 44 | self.prometheus_metric_metadata: PrometheusMetricMetadata = { 45 | "avg_queue_length": ModelServerPrometheusMetric( 46 | "vllm:num_requests_waiting", "mean", "gauge", "model_name='%s'" % self.model_name 47 | ), 48 | "avg_time_to_first_token": ModelServerPrometheusMetric( 49 | "vllm:time_to_first_token_seconds", "mean", "histogram", "model_name='%s'" % self.model_name 50 | ), 51 | "median_time_to_first_token": ModelServerPrometheusMetric( 52 | "vllm:time_to_first_token_seconds", "median", "histogram", "model_name='%s'" % self.model_name 53 | ), 54 | "p90_time_to_first_token": ModelServerPrometheusMetric( 55 | "vllm:time_to_first_token_seconds", "p90", "histogram", "model_name='%s'" % self.model_name 56 | ), 57 | "p99_time_to_first_token": ModelServerPrometheusMetric( 58 | "vllm:time_to_first_token_seconds", "p99", "histogram", "model_name='%s'" % self.model_name 59 | ), 60 | "avg_time_per_output_token": ModelServerPrometheusMetric( 61 | "vllm:time_per_output_token_seconds", "mean", "histogram", "model_name='%s'" % self.model_name 62 | ), 63 | "median_time_per_output_token": ModelServerPrometheusMetric( 64 | "vllm:time_per_output_token_seconds", "median", "histogram", "model_name='%s'" % self.model_name 65 | ), 66 | "p90_time_per_output_token": ModelServerPrometheusMetric( 67 | "vllm:time_per_output_token_seconds", "p90", "histogram", "model_name='%s'" % self.model_name 68 | ), 69 | "p99_time_per_output_token": ModelServerPrometheusMetric( 70 | "vllm:time_per_output_token_seconds", "p99", "histogram", "model_name='%s'" % self.model_name 71 | ), 72 | "avg_prompt_tokens": ModelServerPrometheusMetric( 73 | "vllm:prompt_tokens_total", "mean", "counter", "model_name='%s'" % self.model_name 74 | ), 75 | "prompt_tokens_per_second": ModelServerPrometheusMetric( 76 | "vllm:prompt_tokens_total", "rate", "counter", "model_name='%s'" % self.model_name 77 | ), 78 | "avg_output_tokens": ModelServerPrometheusMetric( 79 | "vllm:generation_tokens_total", "mean", "counter", "model_name='%s'" % self.model_name 80 | ), 81 | "output_tokens_per_second": ModelServerPrometheusMetric( 82 | "vllm:generation_tokens_total", "rate", "counter", "model_name='%s'" % self.model_name 83 | ), 84 | "total_requests": ModelServerPrometheusMetric( 85 | "vllm:e2e_request_latency_seconds_count", "increase", "counter", "model_name='%s'" % self.model_name 86 | ), 87 | "requests_per_second": ModelServerPrometheusMetric( 88 | "vllm:e2e_request_latency_seconds_count", "rate", "counter", "model_name='%s'" % self.model_name 89 | ), 90 | "avg_request_latency": ModelServerPrometheusMetric( 91 | "vllm:e2e_request_latency_seconds", "mean", "histogram", "model_name='%s'" % self.model_name 92 | ), 93 | "median_request_latency": ModelServerPrometheusMetric( 94 | "vllm:e2e_request_latency_seconds", "median", "histogram", "model_name='%s'" % self.model_name 95 | ), 96 | "p90_request_latency": ModelServerPrometheusMetric( 97 | "vllm:e2e_request_latency_seconds", "p90", "histogram", "model_name='%s'" % self.model_name 98 | ), 99 | "p99_request_latency": ModelServerPrometheusMetric( 100 | "vllm:e2e_request_latency_seconds", "p99", "histogram", "model_name='%s'" % self.model_name 101 | ), 102 | } 103 | 104 | async def process_request(self, data: InferenceAPIData, stage_id: int) -> None: 105 | payload = data.to_payload( 106 | model_name=self.model_name, max_tokens=self.max_completion_tokens, ignore_eos=self.ignore_eos 107 | ) 108 | headers = {"Content-Type": "application/json"} 109 | async with aiohttp.ClientSession() as session: 110 | start = time.monotonic() 111 | try: 112 | async with session.post(self.uri + data.get_route(), headers=headers, data=json.dumps(payload)) as response: 113 | if response.status == 200: 114 | content = await response.json() 115 | response_info = data.process_response(data=content, tokenizer=self.tokenizer) 116 | self.metrics_collector.record_metric( 117 | RequestLifecycleMetric( 118 | stage_id=stage_id, 119 | request_data=json.dumps(payload), 120 | response_data=json.dumps(content), 121 | info=response_info, 122 | error=None, 123 | start_time=start, 124 | end_time=time.monotonic(), 125 | ) 126 | ) 127 | else: 128 | content = await response.text() 129 | self.metrics_collector.record_metric( 130 | RequestLifecycleMetric( 131 | stage_id=stage_id, 132 | request_data=json.dumps(payload), 133 | response_data=content, 134 | info=InferenceInfo(), 135 | error=ErrorResponseInfo(error_msg=content, error_type="Non 200 reponse"), 136 | start_time=start, 137 | end_time=time.monotonic(), 138 | ) 139 | ) 140 | except Exception as e: 141 | self.metrics_collector.record_metric( 142 | RequestLifecycleMetric( 143 | stage_id=stage_id, 144 | request_data=json.dumps(payload), 145 | info=InferenceInfo(), 146 | error=ErrorResponseInfo( 147 | error_msg=str(e), 148 | error_type=type(e).__name__, 149 | ), 150 | start_time=start, 151 | end_time=time.monotonic(), 152 | ) 153 | ) 154 | 155 | def get_supported_apis(self) -> List[APIType]: 156 | return [APIType.Completion, APIType.Chat] 157 | 158 | def get_prometheus_metric_metadata(self) -> PrometheusMetricMetadata: 159 | return self.prometheus_metric_metadata 160 | -------------------------------------------------------------------------------- /inference_perf/client/requestdatacollector/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .base import RequestDataCollector 15 | from .local import LocalRequestDataCollector 16 | 17 | 18 | __all__ = [ 19 | "RequestDataCollector", 20 | "LocalRequestDataCollector", 21 | ] 22 | -------------------------------------------------------------------------------- /inference_perf/client/requestdatacollector/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from abc import ABC, abstractmethod 15 | from typing import List 16 | 17 | from inference_perf.apis import RequestLifecycleMetric 18 | 19 | 20 | class RequestDataCollector(ABC): 21 | """ 22 | Responsible for collecting request information 23 | """ 24 | 25 | @abstractmethod 26 | def record_metric(self, metric: RequestLifecycleMetric) -> None: 27 | raise NotImplementedError 28 | 29 | @abstractmethod 30 | def get_metrics(self) -> List[RequestLifecycleMetric]: 31 | raise NotImplementedError 32 | -------------------------------------------------------------------------------- /inference_perf/client/requestdatacollector/local.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from typing import List 16 | from inference_perf.client.requestdatacollector import RequestDataCollector 17 | from inference_perf.apis import RequestLifecycleMetric 18 | 19 | 20 | class LocalRequestDataCollector(RequestDataCollector): 21 | """Responsible for accumulating client request metrics""" 22 | 23 | def __init__(self) -> None: 24 | self.metrics: List[RequestLifecycleMetric] = [] 25 | 26 | def record_metric(self, metric: RequestLifecycleMetric) -> None: 27 | self.metrics.append(metric) 28 | 29 | def get_metrics(self) -> List[RequestLifecycleMetric]: 30 | return self.metrics 31 | -------------------------------------------------------------------------------- /inference_perf/config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from datetime import datetime 15 | from pydantic import BaseModel, HttpUrl 16 | from typing import Any, Optional, List 17 | from argparse import ArgumentParser 18 | from enum import Enum 19 | import yaml 20 | 21 | 22 | class APIType(Enum): 23 | Completion = "completion" 24 | Chat = "chat" 25 | 26 | 27 | class DataGenType(Enum): 28 | Mock = "mock" 29 | ShareGPT = "shareGPT" 30 | Synthetic = "synthetic" 31 | Random = "random" 32 | SharedPrefix = "shared_prefix" 33 | 34 | 35 | # Represents the distribution for input prompts and output generations. 36 | class Distribution(BaseModel): 37 | min: int = 10 38 | max: int = 1024 39 | mean: float = 512 40 | std_dev: float = 200 41 | total_count: int = 1000 42 | 43 | 44 | # Configuration for shared prefix datagen which allows users to specify shared prefixes. 45 | class SharedPrefix(BaseModel): 46 | num_groups: int = 10 47 | num_prompts_per_group: int = 10 48 | system_prompt_len: int = 100 49 | question_len: int = 50 50 | output_len: int = 50 51 | 52 | 53 | class DataConfig(BaseModel): 54 | type: DataGenType = DataGenType.Mock 55 | # Distributions are only supported for synthetic/random dataset at this moment 56 | input_distribution: Optional[Distribution] = None 57 | output_distribution: Optional[Distribution] = None 58 | shared_prefix: Optional[SharedPrefix] = None 59 | 60 | 61 | class ModelServerType(Enum): 62 | VLLM = "vllm" 63 | 64 | 65 | class LoadType(Enum): 66 | CONSTANT = "constant" 67 | POISSON = "poisson" 68 | 69 | 70 | class MetricsClientType(Enum): 71 | PROMETHEUS = "prometheus" 72 | DEFAULT = "default" 73 | 74 | 75 | class LoadStage(BaseModel): 76 | rate: int 77 | duration: int 78 | 79 | 80 | class LoadConfig(BaseModel): 81 | type: LoadType = LoadType.CONSTANT 82 | interval: float = 1.0 83 | stages: List[LoadStage] = [] 84 | 85 | 86 | class StorageConfigBase(BaseModel): 87 | path: str = f"reports-{datetime.now().strftime('%Y%m%d-%H%M%S')}" 88 | report_file_prefix: Optional[str] = None 89 | 90 | 91 | class GoogleCloudStorageConfig(StorageConfigBase): 92 | bucket_name: str 93 | 94 | 95 | class StorageConfig(BaseModel): 96 | google_cloud_storage: Optional[GoogleCloudStorageConfig] = None 97 | 98 | 99 | class RequestLifecycleMetricsReportConfig(BaseModel): 100 | summary: Optional[bool] = True 101 | per_stage: Optional[bool] = True 102 | per_request: Optional[bool] = False 103 | 104 | 105 | class PrometheusMetricsReportConfig(BaseModel): 106 | summary: Optional[bool] = True 107 | per_stage: Optional[bool] = False 108 | 109 | 110 | class ReportConfig(BaseModel): 111 | request_lifecycle: RequestLifecycleMetricsReportConfig = RequestLifecycleMetricsReportConfig() 112 | prometheus: PrometheusMetricsReportConfig = PrometheusMetricsReportConfig() 113 | 114 | 115 | class PrometheusClientConfig(BaseModel): 116 | scrape_interval: int = 15 117 | url: HttpUrl = HttpUrl(url="http://localhost:9090") 118 | 119 | 120 | class MetricsClientConfig(BaseModel): 121 | type: MetricsClientType 122 | prometheus: Optional[PrometheusClientConfig] = None 123 | 124 | 125 | class ModelServerClientConfig(BaseModel): 126 | type: ModelServerType = ModelServerType.VLLM 127 | model_name: str 128 | base_url: str 129 | ignore_eos: bool = True 130 | 131 | 132 | class CustomTokenizerConfig(BaseModel): 133 | pretrained_model_name_or_path: str 134 | trust_remote_code: Optional[bool] = None 135 | token: Optional[str] = None 136 | 137 | 138 | class Config(BaseModel): 139 | api: APIType = APIType.Completion 140 | data: DataConfig = DataConfig() 141 | load: LoadConfig = LoadConfig() 142 | metrics: Optional[MetricsClientConfig] = None 143 | report: ReportConfig = ReportConfig() 144 | storage: Optional[StorageConfig] = StorageConfig() 145 | server: Optional[ModelServerClientConfig] = None 146 | tokenizer: Optional[CustomTokenizerConfig] = None 147 | 148 | 149 | def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]: 150 | result = base.copy() 151 | for k, v in override.items(): 152 | if k in result and isinstance(result[k], dict) and isinstance(v, dict): 153 | result[k] = deep_merge(result[k], v) 154 | else: 155 | result[k] = v 156 | return result 157 | 158 | 159 | def read_config(arg_list: Optional[list[str]] = None) -> Config: 160 | parser = ArgumentParser() 161 | 162 | parser.add_argument("-c", "--config_file", help="Config File", required=True) 163 | 164 | args = parser.parse_args(arg_list) 165 | if args.config_file: 166 | print("Using configuration from: %s" % args.config_file) 167 | with open(args.config_file, "r") as stream: 168 | cfg = yaml.safe_load(stream) 169 | 170 | default_cfg = Config().model_dump(mode="json") 171 | merged_cfg = deep_merge(default_cfg, cfg) 172 | 173 | print( 174 | f"Benchmarking with the following config:\n\n{yaml.dump(merged_cfg, sort_keys=False, default_flow_style=False)}\n" 175 | ) 176 | return Config(**merged_cfg) 177 | return Config() 178 | -------------------------------------------------------------------------------- /inference_perf/datagen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .base import DataGenerator 15 | from .mock_datagen import MockDataGenerator 16 | from .hf_sharegpt_datagen import HFShareGPTDataGenerator 17 | from .synthetic_datagen import SyntheticDataGenerator 18 | from .random_datagen import RandomDataGenerator 19 | from .shared_prefix_datagen import SharedPrefixDataGenerator 20 | 21 | __all__ = [ 22 | "DataGenerator", 23 | "MockDataGenerator", 24 | "HFShareGPTDataGenerator", 25 | "SyntheticDataGenerator", 26 | "RandomDataGenerator", 27 | "SharedPrefixDataGenerator", 28 | ] 29 | -------------------------------------------------------------------------------- /inference_perf/datagen/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from inference_perf.apis import InferenceAPIData 15 | from inference_perf.utils.custom_tokenizer import CustomTokenizer 16 | from inference_perf.config import APIType, DataConfig, Distribution, SharedPrefix 17 | from abc import ABC, abstractmethod 18 | from typing import Generator, Optional, List 19 | 20 | 21 | class DataGenerator(ABC): 22 | """Abstract base class for data generators.""" 23 | 24 | apiType: APIType 25 | input_distribution: Optional[Distribution] 26 | output_distribution: Optional[Distribution] 27 | shared_prefix: Optional[SharedPrefix] 28 | tokenizer: Optional[CustomTokenizer] 29 | 30 | """Abstract base class for data generators.""" 31 | 32 | def __init__(self, apiType: APIType, config: DataConfig, tokenizer: Optional[CustomTokenizer]) -> None: 33 | if apiType not in self.get_supported_apis(): 34 | raise Exception(f"Unsupported API type {apiType}") 35 | 36 | if ( 37 | config.input_distribution is not None or config.output_distribution is not None 38 | ) and not self.is_io_distribution_supported(): 39 | raise Exception("IO distribution not supported for this data generator") 40 | 41 | if config.shared_prefix is not None and not self.is_shared_prefix_supported(): 42 | raise Exception("Shared prefix not supported for this data generator") 43 | 44 | if tokenizer is not None: 45 | self.tokenizer = tokenizer 46 | 47 | self.apiType = apiType 48 | self.input_distribution = config.input_distribution 49 | self.output_distribution = config.output_distribution 50 | self.shared_prefix = config.shared_prefix 51 | 52 | @abstractmethod 53 | def get_supported_apis(self) -> List[APIType]: 54 | raise NotImplementedError 55 | 56 | @abstractmethod 57 | def get_data(self) -> Generator[InferenceAPIData, None, None]: 58 | raise NotImplementedError 59 | 60 | @abstractmethod 61 | def is_io_distribution_supported(self) -> bool: 62 | raise NotImplementedError 63 | 64 | @abstractmethod 65 | def is_shared_prefix_supported(self) -> bool: 66 | raise NotImplementedError 67 | -------------------------------------------------------------------------------- /inference_perf/datagen/hf_sharegpt_datagen.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from inference_perf.apis import InferenceAPIData, CompletionAPIData, ChatCompletionAPIData, ChatMessage 15 | from inference_perf.utils.custom_tokenizer import CustomTokenizer 16 | from .base import DataGenerator 17 | from inference_perf.config import APIType, DataConfig 18 | from typing import Generator, List 19 | from datasets import load_dataset 20 | 21 | 22 | class HFShareGPTDataGenerator(DataGenerator): 23 | def __init__(self, apiType: APIType, config: DataConfig, tokenizer: CustomTokenizer) -> None: 24 | super().__init__(apiType, config, tokenizer) 25 | self.sharegpt_dataset = iter( 26 | load_dataset( 27 | "anon8231489123/ShareGPT_Vicuna_unfiltered", 28 | data_files="ShareGPT_V3_unfiltered_cleaned_split.json", 29 | streaming=True, 30 | split="train", 31 | ) 32 | ) 33 | self.min_num_turns = 2 34 | self.data_key = "conversations" 35 | self.role_key = "from" 36 | self.content_key = "value" 37 | # initialize data collection 38 | next(self.sharegpt_dataset) 39 | 40 | def get_supported_apis(self) -> List[APIType]: 41 | return [APIType.Chat, APIType.Completion] 42 | 43 | def get_data(self) -> Generator[InferenceAPIData, None, None]: 44 | if self.sharegpt_dataset is not None: 45 | while True: 46 | data = next(self.sharegpt_dataset) 47 | if ( 48 | data is None 49 | or data[self.data_key] is None 50 | or len(data[self.data_key]) < self.min_num_turns 51 | or len(data[self.data_key]) == 0 52 | ): 53 | continue 54 | 55 | if self.apiType == APIType.Completion: 56 | try: 57 | prompt = data[self.data_key][0].get(self.content_key) 58 | if not prompt: 59 | continue 60 | yield CompletionAPIData(prompt=prompt) 61 | except (KeyError, TypeError) as e: 62 | print(f"Skipping invalid completion data: {e}") 63 | continue 64 | elif self.apiType == APIType.Chat: 65 | yield ChatCompletionAPIData( 66 | messages=[ 67 | ChatMessage(role=conversation[self.role_key], content=conversation[self.content_key]) 68 | for conversation in data[self.data_key] 69 | ] 70 | ) 71 | else: 72 | raise Exception("Unsupported API type") 73 | 74 | def is_io_distribution_supported(self) -> bool: 75 | return False 76 | 77 | def is_shared_prefix_supported(self) -> bool: 78 | return False 79 | -------------------------------------------------------------------------------- /inference_perf/datagen/mock_datagen.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import Generator, List, Optional 15 | from inference_perf.config import APIType, DataConfig 16 | from inference_perf.datagen.base import DataGenerator 17 | from inference_perf.apis import InferenceAPIData, CompletionAPIData, ChatCompletionAPIData, ChatMessage 18 | from inference_perf.utils.custom_tokenizer import CustomTokenizer 19 | 20 | 21 | class MockDataGenerator(DataGenerator): 22 | def __init__(self, apiType: APIType, config: DataConfig, tokenizer: Optional[CustomTokenizer]) -> None: 23 | super().__init__(apiType, config, tokenizer) 24 | 25 | def get_supported_apis(self) -> List[APIType]: 26 | return [APIType.Completion, APIType.Chat] 27 | 28 | def get_data(self) -> Generator[InferenceAPIData, None, None]: 29 | i = 0 30 | while True: 31 | i += 1 32 | if self.apiType == APIType.Completion: 33 | yield CompletionAPIData(prompt="text" + str(i)) 34 | elif self.apiType == APIType.Chat: 35 | yield ChatCompletionAPIData(messages=[ChatMessage(role="user", content="text" + str(i))]) 36 | else: 37 | raise Exception("Unsupported API type") 38 | 39 | def is_io_distribution_supported(self) -> bool: 40 | return False 41 | 42 | def is_shared_prefix_supported(self) -> bool: 43 | return False 44 | -------------------------------------------------------------------------------- /inference_perf/datagen/random_datagen.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import numpy as np 15 | from inference_perf.apis import InferenceAPIData, CompletionAPIData 16 | from inference_perf.utils.custom_tokenizer import CustomTokenizer 17 | from inference_perf.utils.distribution import generate_distribution 18 | from .base import DataGenerator 19 | from typing import Generator, List 20 | from inference_perf.config import APIType, DataConfig 21 | 22 | 23 | 24 | 25 | # Random data generator generates random tokens from the model's 26 | # vocabulary for the desired input and output distribution. 27 | class RandomDataGenerator(DataGenerator): 28 | def __init__( 29 | self, 30 | apiType: APIType, 31 | config: DataConfig, 32 | tokenizer: CustomTokenizer, 33 | ) -> None: 34 | super().__init__(apiType, config, tokenizer) 35 | 36 | if self.input_distribution is None or self.output_distribution is None: 37 | raise ValueError("Input and Output Distribution are required for RandomDataGenerator") 38 | 39 | self.input_lengths = generate_distribution( 40 | self.input_distribution.min, 41 | self.input_distribution.max, 42 | self.input_distribution.mean, 43 | self.input_distribution.std_dev, 44 | self.input_distribution.total_count, 45 | ) 46 | self.output_lengths = generate_distribution( 47 | self.output_distribution.min, 48 | self.output_distribution.max, 49 | self.output_distribution.mean, 50 | self.output_distribution.std_dev, 51 | self.output_distribution.total_count, 52 | ) 53 | 54 | if self.tokenizer is None: 55 | raise ValueError("Tokenizer is required for RandomDataGenerator") 56 | 57 | hf_tokenizer = self.tokenizer.get_tokenizer() 58 | if hasattr(hf_tokenizer, "vocab_size") and hf_tokenizer.vocab_size is not None: 59 | self.vocab_size: int = hf_tokenizer.vocab_size 60 | elif hasattr(hf_tokenizer, "get_vocab") and callable(hf_tokenizer.get_vocab): 61 | self.vocab_size = len(hf_tokenizer.get_vocab()) 62 | else: 63 | try: 64 | self.vocab_size = len(hf_tokenizer) 65 | except TypeError as e: 66 | raise ValueError( 67 | "Tokenizer does not have a 'vocab_size' attribute, 'get_vocab()' method, " 68 | "or support len() for vocabulary size. Cannot use random token generation." 69 | ) from e 70 | if self.vocab_size <= 0: 71 | raise ValueError(f"Tokenizer vocabulary size must be positive, got {self.vocab_size}.") 72 | 73 | def get_supported_apis(self) -> List[APIType]: 74 | return [APIType.Completion] 75 | 76 | def is_io_distribution_supported(self) -> bool: 77 | return True 78 | 79 | def is_shared_prefix_supported(self) -> bool: 80 | return False 81 | 82 | def get_data(self) -> Generator[InferenceAPIData, None, None]: 83 | i = 0 84 | 85 | while True: 86 | if self.tokenizer is None: 87 | raise ValueError("Tokenizer is required for RandomDataGenerator") 88 | 89 | if self.apiType == APIType.Completion: 90 | prompt_text: str 91 | if self.input_lengths[i] <= 0: 92 | random_token_ids_list = [] 93 | else: 94 | random_token_ids = np.random.randint(0, self.vocab_size, size=self.input_lengths[i], dtype=np.int64) 95 | random_token_ids_list = random_token_ids.tolist() 96 | prompt_text = self.tokenizer.get_tokenizer().decode(random_token_ids_list) 97 | 98 | yield CompletionAPIData( 99 | prompt=prompt_text, 100 | max_tokens=self.output_lengths[i], 101 | ) 102 | i += 1 103 | else: 104 | raise Exception(f"Unsupported API type: {self.apiType}. RandomDataGenerator only supports Completion.") 105 | -------------------------------------------------------------------------------- /inference_perf/datagen/shared_prefix_datagen.py: -------------------------------------------------------------------------------- 1 | import random 2 | from typing import Generator, List 3 | import numpy as np 4 | 5 | from inference_perf.apis.base import InferenceAPIData 6 | from inference_perf.apis.completion import CompletionAPIData 7 | from inference_perf.config import APIType, DataConfig 8 | from inference_perf.utils.custom_tokenizer import CustomTokenizer 9 | from .base import DataGenerator 10 | 11 | 12 | # Shared Prefix Generator generates shared prefix in the prompts that are sent. 13 | # This can be used to benchmark prefix caching cases. 14 | class SharedPrefixDataGenerator(DataGenerator): 15 | def __init__(self, apiType: APIType, config: DataConfig, tokenizer: CustomTokenizer) -> None: 16 | super().__init__(apiType, config, tokenizer) 17 | 18 | if self.tokenizer is None: 19 | raise ValueError("Tokenizer is required for SharedPrefixDataGenerator but was not initialized.") 20 | 21 | # Initialize vocab_size 22 | hf_tokenizer = self.tokenizer.get_tokenizer() 23 | if hasattr(hf_tokenizer, "vocab_size") and hf_tokenizer.vocab_size is not None: 24 | self.vocab_size: int = hf_tokenizer.vocab_size 25 | elif hasattr(hf_tokenizer, "get_vocab") and callable(hf_tokenizer.get_vocab): 26 | self.vocab_size = len(hf_tokenizer.get_vocab()) 27 | else: 28 | try: 29 | self.vocab_size = len(hf_tokenizer) 30 | except TypeError as e: 31 | raise ValueError( 32 | "Tokenizer does not have a 'vocab_size' attribute, 'get_vocab()' method, " 33 | "or support len() for vocabulary size. Cannot use random token generation." 34 | ) from e 35 | if self.vocab_size <= 0: 36 | raise ValueError(f"Tokenizer vocabulary size must be positive, got {self.vocab_size}.") 37 | 38 | if self.shared_prefix is None: 39 | raise ValueError("Shared Prefix config is required for SharedPrefixDataGenerator") 40 | 41 | self.num_groups: int = self.shared_prefix.num_groups 42 | self.num_prompts_per_group: int = self.shared_prefix.num_prompts_per_group 43 | self.system_prompt_len: int = self.shared_prefix.system_prompt_len 44 | self.question_len: int = self.shared_prefix.question_len 45 | self.output_len: int = self.shared_prefix.output_len 46 | 47 | self.prompts: List[str] = [] 48 | self._generate_prompts() 49 | 50 | def get_supported_apis(self) -> List[APIType]: 51 | return [APIType.Completion] 52 | 53 | def is_io_distribution_supported(self) -> bool: 54 | return True 55 | 56 | def is_shared_prefix_supported(self) -> bool: 57 | return True 58 | 59 | def get_data(self) -> Generator[InferenceAPIData, None, None]: 60 | if not self.prompts: 61 | return 62 | 63 | i = 0 64 | while True: 65 | yield CompletionAPIData(prompt=self.prompts[i], max_tokens=self.output_len) 66 | i = (i + 1) % len(self.prompts) 67 | 68 | def _generate_random_token_ids(self, length: int) -> List[int]: 69 | """Generates a list of random token IDs of a specified length.""" 70 | if length == 0: 71 | return [] 72 | # np.random.randint's high parameter is exclusive 73 | return np.random.randint(0, self.vocab_size, size=length, dtype=np.int64).tolist() # type: ignore[no-any-return] 74 | 75 | def _generate_prompts(self) -> None: 76 | """Pre-generates all prompts based on the configuration.""" 77 | if self.tokenizer is None: 78 | # This check is defensive; __init__ should have already validated this. 79 | raise ValueError("Tokenizer is not available for generating prompts.") 80 | 81 | hf_tokenizer = self.tokenizer.get_tokenizer() 82 | 83 | for _ in range(self.num_groups): 84 | # Generate a shared prefix (system prompt) 85 | shared_prefix_token_ids = self._generate_random_token_ids(self.system_prompt_len) 86 | shared_prefix_text = hf_tokenizer.decode(shared_prefix_token_ids, skip_special_tokens=True) 87 | 88 | for _ in range(self.num_prompts_per_group): 89 | # Generate a unique question 90 | question_token_ids = self._generate_random_token_ids(self.question_len) 91 | question_text = hf_tokenizer.decode(question_token_ids, skip_special_tokens=True) 92 | 93 | # Combine shared prefix and question 94 | full_prompt_text = shared_prefix_text + " " + question_text 95 | 96 | self.prompts.append(full_prompt_text) 97 | 98 | # Shuffle the generated prompts to ensure randomness if served sequentially by different workers 99 | random.shuffle(self.prompts) 100 | -------------------------------------------------------------------------------- /inference_perf/datagen/synthetic_datagen.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from inference_perf.apis import InferenceAPIData, CompletionAPIData 15 | from inference_perf.utils.custom_tokenizer import CustomTokenizer 16 | from inference_perf.utils.distribution import generate_distribution 17 | from .base import DataGenerator 18 | from typing import Generator, List 19 | from inference_perf.config import APIType, DataConfig 20 | 21 | 22 | class SyntheticDataGenerator(DataGenerator): 23 | def __init__(self, apiType: APIType, config: DataConfig, tokenizer: CustomTokenizer) -> None: 24 | super().__init__(apiType, config, tokenizer) 25 | 26 | if self.input_distribution is None or self.output_distribution is None or self.tokenizer is None: 27 | raise ValueError("IODistribution and tokenizer are required for SyntheticDataGenerator") 28 | 29 | self.input_lengths = generate_distribution( 30 | self.input_distribution.min, 31 | self.input_distribution.max, 32 | self.input_distribution.mean, 33 | self.input_distribution.std_dev, 34 | self.input_distribution.total_count, 35 | ) 36 | self.output_lengths = generate_distribution( 37 | self.output_distribution.min, 38 | self.output_distribution.max, 39 | self.output_distribution.mean, 40 | self.output_distribution.std_dev, 41 | self.output_distribution.total_count, 42 | ) 43 | base_prompt = "Pick as many lines as you can from these poem lines:\n" 44 | self.token_ids = self.tokenizer.get_tokenizer().encode(base_prompt + self.get_sonnet_data()) 45 | 46 | def get_supported_apis(self) -> List[APIType]: 47 | return [APIType.Completion] 48 | 49 | def is_io_distribution_supported(self) -> bool: 50 | return True 51 | 52 | def is_shared_prefix_supported(self) -> bool: 53 | return False 54 | 55 | def get_data(self) -> Generator[InferenceAPIData, None, None]: 56 | i = 0 57 | while True: 58 | if self.tokenizer is None: 59 | raise ValueError("Tokenizer is required for SyntheticDataGenerator") 60 | if self.apiType == APIType.Completion: 61 | yield CompletionAPIData( 62 | prompt=self.tokenizer.get_tokenizer().decode(self.token_ids[: self.input_lengths[i]]), 63 | max_tokens=self.output_lengths[i], 64 | ) 65 | i += 1 66 | else: 67 | raise Exception("Unsupported API type") 68 | 69 | # Hardcoded sonnet data that we can use for synthetic benchmarks. 70 | def get_sonnet_data(self) -> str: 71 | return """FROM fairest creatures we desire increase, 72 | That thereby beauty's rose might never die, 73 | But as the riper should by time decease, 74 | His tender heir might bear his memory: 75 | But thou, contracted to thine own bright eyes, 76 | Feed'st thy light'st flame with self-substantial fuel, 77 | Making a famine where abundance lies, 78 | Thyself thy foe, to thy sweet self too cruel. 79 | Thou that art now the world's fresh ornament 80 | And only herald to the gaudy spring, 81 | Within thine own bud buriest thy content 82 | And, tender churl, makest waste in niggarding. 83 | Pity the world, or else this glutton be, 84 | To eat the world's due, by the grave and thee. 85 | When forty winters shall beseige thy brow, 86 | And dig deep trenches in thy beauty's field, 87 | Thy youth's proud livery, so gazed on now, 88 | Will be a tatter'd weed, of small worth held: 89 | Then being ask'd where all thy beauty lies, 90 | Where all the treasure of thy lusty days, 91 | To say, within thine own deep-sunken eyes, 92 | Were an all-eating shame and thriftless praise. 93 | How much more praise deserved thy beauty's use, 94 | If thou couldst answer 'This fair child of mine 95 | Shall sum my count and make my old excuse,' 96 | Proving his beauty by succession thine! 97 | This were to be new made when thou art old, 98 | And see thy blood warm when thou feel'st it cold. 99 | Look in thy glass, and tell the face thou viewest 100 | Now is the time that face should form another; 101 | Whose fresh repair if now thou not renewest, 102 | Thou dost beguile the world, unbless some mother. 103 | For where is she so fair whose unear'd womb 104 | Disdains the tillage of thy husbandry? 105 | Or who is he so fond will be the tomb 106 | Of his self-love, to stop posterity? 107 | Thou art thy mother's glass, and she in thee 108 | Calls back the lovely April of her prime: 109 | So thou through windows of thine age shall see 110 | Despite of wrinkles this thy golden time. 111 | But if thou live, remember'd not to be, 112 | Die single, and thine image dies with thee. 113 | Unthrifty loveliness, why dost thou spend 114 | Upon thyself thy beauty's legacy? 115 | Nature's bequest gives nothing but doth lend, 116 | And being frank she lends to those are free. 117 | Then, beauteous niggard, why dost thou abuse 118 | The bounteous largess given thee to give? 119 | Profitless usurer, why dost thou use 120 | So great a sum of sums, yet canst not live? 121 | For having traffic with thyself alone, 122 | Thou of thyself thy sweet self dost deceive. 123 | Then how, when nature calls thee to be gone, 124 | What acceptable audit canst thou leave? 125 | Thy unused beauty must be tomb'd with thee, 126 | Which, used, lives th' executor to be. 127 | Those hours, that with gentle work did frame 128 | The lovely gaze where every eye doth dwell, 129 | Will play the tyrants to the very same 130 | And that unfair which fairly doth excel: 131 | For never-resting time leads summer on 132 | To hideous winter and confounds him there; 133 | Sap cheque'd with frost and lusty leaves quite gone, 134 | Beauty o'ersnow'd and bareness every where: 135 | Then, were not summer's distillation left, 136 | A liquid prisoner pent in walls of glass, 137 | Beauty's effect with beauty were bereft, 138 | Nor it nor no remembrance what it was: 139 | But flowers distill'd though they with winter meet, 140 | Leese but their show; their substance still lives sweet. 141 | Then let not winter's ragged hand deface 142 | In thee thy summer, ere thou be distill'd: 143 | Make sweet some vial; treasure thou some place 144 | With beauty's treasure, ere it be self-kill'd. 145 | That use is not forbidden usury, 146 | Which happies those that pay the willing loan; 147 | That's for thyself to breed another thee, 148 | Or ten times happier, be it ten for one; 149 | Ten times thyself were happier than thou art, 150 | If ten of thine ten times refigured thee: 151 | Then what could death do, if thou shouldst depart, 152 | Leaving thee living in posterity? 153 | Be not self-will'd, for thou art much too fair 154 | To be death's conquest and make worms thine heir. 155 | Lo! in the orient when the gracious light 156 | Lifts up his burning head, each under eye 157 | Doth homage to his new-appearing sight, 158 | Serving with looks his sacred majesty; 159 | And having climb'd the steep-up heavenly hill, 160 | Resembling strong youth in his middle age, 161 | yet mortal looks adore his beauty still, 162 | Attending on his golden pilgrimage; 163 | But when from highmost pitch, with weary car, 164 | Like feeble age, he reeleth from the day, 165 | The eyes, 'fore duteous, now converted are 166 | From his low tract and look another way: 167 | So thou, thyself out-going in thy noon, 168 | Unlook'd on diest, unless thou get a son. 169 | Music to hear, why hear'st thou music sadly? 170 | Sweets with sweets war not, joy delights in joy. 171 | Why lovest thou that which thou receivest not gladly, 172 | Or else receivest with pleasure thine annoy? 173 | If the true concord of well-tuned sounds, 174 | By unions married, do offend thine ear, 175 | They do but sweetly chide thee, who confounds 176 | In singleness the parts that thou shouldst bear. 177 | Mark how one string, sweet husband to another, 178 | Strikes each in each by mutual ordering, 179 | Resembling sire and child and happy mother 180 | Who all in one, one pleasing note do sing: 181 | Whose speechless song, being many, seeming one, 182 | Sings this to thee: 'thou single wilt prove none.' 183 | Is it for fear to wet a widow's eye 184 | That thou consumest thyself in single life? 185 | Ah! if thou issueless shalt hap to die. 186 | The world will wail thee, like a makeless wife; 187 | The world will be thy widow and still weep 188 | That thou no form of thee hast left behind, 189 | When every private widow well may keep 190 | By children's eyes her husband's shape in mind. 191 | Look, what an unthrift in the world doth spend 192 | Shifts but his place, for still the world enjoys it; 193 | But beauty's waste hath in the world an end, 194 | And kept unused, the user so destroys it. 195 | No love toward others in that bosom sits 196 | That on himself such murderous shame commits. 197 | For shame! deny that thou bear'st love to any, 198 | Who for thyself art so unprovident. 199 | Grant, if thou wilt, thou art beloved of many, 200 | But that thou none lovest is most evident; 201 | For thou art so possess'd with murderous hate 202 | That 'gainst thyself thou stick'st not to conspire. 203 | Seeking that beauteous roof to ruinate 204 | Which to repair should be thy chief desire. 205 | O, change thy thought, that I may change my mind! 206 | Shall hate be fairer lodged than gentle love? 207 | Be, as thy presence is, gracious and kind, 208 | Or to thyself at least kind-hearted prove: 209 | Make thee another self, for love of me, 210 | That beauty still may live in thine or thee. 211 | As fast as thou shalt wane, so fast thou growest 212 | In one of thine, from that which thou departest; 213 | And that fresh blood which youngly thou bestowest 214 | Thou mayst call thine when thou from youth convertest. 215 | Herein lives wisdom, beauty and increase: 216 | Without this, folly, age and cold decay: 217 | If all were minded so, the times should cease 218 | And threescore year would make the world away. 219 | Let those whom Nature hath not made for store, 220 | Harsh featureless and rude, barrenly perish: 221 | Look, whom she best endow'd she gave the more; 222 | Which bounteous gift thou shouldst in bounty cherish: 223 | She carved thee for her seal, and meant thereby 224 | Thou shouldst print more, not let that copy die. 225 | When I do count the clock that tells the time, 226 | And see the brave day sunk in hideous night; 227 | When I behold the violet past prime, 228 | And sable curls all silver'd o'er with white; 229 | When lofty trees I see barren of leaves 230 | Which erst from heat did canopy the herd, 231 | And summer's green all girded up in sheaves 232 | Borne on the bier with white and bristly beard, 233 | Then of thy beauty do I question make, 234 | That thou among the wastes of time must go, 235 | Since sweets and beauties do themselves forsake 236 | And die as fast as they see others grow; 237 | And nothing 'gainst Time's scythe can make defence 238 | Save breed, to brave him when he takes thee hence. 239 | O, that you were yourself! but, love, you are 240 | No longer yours than you yourself here live: 241 | Against this coming end you should prepare, 242 | And your sweet semblance to some other give. 243 | So should that beauty which you hold in lease 244 | Find no determination: then you were 245 | Yourself again after yourself's decease, 246 | When your sweet issue your sweet form should bear. 247 | Who lets so fair a house fall to decay, 248 | Which husbandry in honour might uphold 249 | Against the stormy gusts of winter's day 250 | And barren rage of death's eternal cold? 251 | O, none but unthrifts! Dear my love, you know 252 | You had a father: let your son say so. 253 | Not from the stars do I my judgment pluck; 254 | And yet methinks I have astronomy, 255 | But not to tell of good or evil luck, 256 | Of plagues, of dearths, or seasons' quality; 257 | Nor can I fortune to brief minutes tell, 258 | Pointing to each his thunder, rain and wind, 259 | Or say with princes if it shall go well, 260 | By oft predict that I in heaven find: 261 | But from thine eyes my knowledge I derive, 262 | And, constant stars, in them I read such art 263 | As truth and beauty shall together thrive, 264 | If from thyself to store thou wouldst convert; 265 | Or else of thee this I prognosticate: 266 | Thy end is truth's and beauty's doom and date. 267 | When I consider every thing that grows 268 | Holds in perfection but a little moment, 269 | That this huge stage presenteth nought but shows 270 | Whereon the stars in secret influence comment; 271 | When I perceive that men as plants increase, 272 | Cheered and cheque'd even by the self-same sky, 273 | Vaunt in their youthful sap, at height decrease, 274 | And wear their brave state out of memory; 275 | Then the conceit of this inconstant stay 276 | Sets you most rich in youth before my sight, 277 | Where wasteful Time debateth with Decay, 278 | To change your day of youth to sullied night; 279 | And all in war with Time for love of you, 280 | As he takes from you, I engraft you new. 281 | But wherefore do not you a mightier way 282 | Make war upon this bloody tyrant, Time? 283 | And fortify yourself in your decay 284 | With means more blessed than my barren rhyme? 285 | Now stand you on the top of happy hours, 286 | And many maiden gardens yet unset 287 | With virtuous wish would bear your living flowers, 288 | Much liker than your painted counterfeit: 289 | So should the lines of life that life repair, 290 | Which this, Time's pencil, or my pupil pen, 291 | Neither in inward worth nor outward fair, 292 | Can make you live yourself in eyes of men. 293 | To give away yourself keeps yourself still, 294 | And you must live, drawn by your own sweet skill. 295 | Who will believe my verse in time to come, 296 | If it were fill'd with your most high deserts? 297 | Though yet, heaven knows, it is but as a tomb 298 | Which hides your life and shows not half your parts. 299 | If I could write the beauty of your eyes 300 | And in fresh numbers number all your graces, 301 | The age to come would say 'This poet lies: 302 | Such heavenly touches ne'er touch'd earthly faces.' 303 | So should my papers yellow'd with their age 304 | Be scorn'd like old men of less truth than tongue, 305 | And your true rights be term'd a poet's rage 306 | And stretched metre of an antique song: 307 | But were some child of yours alive that time, 308 | You should live twice; in it and in my rhyme. 309 | Shall I compare thee to a summer's day? 310 | Thou art more lovely and more temperate: 311 | Rough winds do shake the darling buds of May, 312 | And summer's lease hath all too short a date: 313 | Sometime too hot the eye of heaven shines, 314 | And often is his gold complexion dimm'd; 315 | And every fair from fair sometime declines, 316 | By chance or nature's changing course untrimm'd; 317 | But thy eternal summer shall not fade 318 | Nor lose possession of that fair thou owest; 319 | Nor shall Death brag thou wander'st in his shade, 320 | When in eternal lines to time thou growest: 321 | So long as men can breathe or eyes can see, 322 | So long lives this and this gives life to thee. 323 | Devouring Time, blunt thou the lion's paws, 324 | And make the earth devour her own sweet brood; 325 | Pluck the keen teeth from the fierce tiger's jaws, 326 | And burn the long-lived phoenix in her blood; 327 | Make glad and sorry seasons as thou fleets, 328 | And do whate'er thou wilt, swift-footed Time, 329 | To the wide world and all her fading sweets; 330 | But I forbid thee one most heinous crime: 331 | O, carve not with thy hours my love's fair brow, 332 | Nor draw no lines there with thine antique pen; 333 | Him in thy course untainted do allow 334 | For beauty's pattern to succeeding men. 335 | Yet, do thy worst, old Time: despite thy wrong, 336 | My love shall in my verse ever live young. 337 | A woman's face with Nature's own hand painted 338 | Hast thou, the master-mistress of my passion; 339 | A woman's gentle heart, but not acquainted 340 | With shifting change, as is false women's fashion; 341 | An eye more bright than theirs, less false in rolling, 342 | Gilding the object whereupon it gazeth; 343 | A man in hue, all 'hues' in his controlling, 344 | Much steals men's eyes and women's souls amazeth. 345 | And for a woman wert thou first created; 346 | Till Nature, as she wrought thee, fell a-doting, 347 | And by addition me of thee defeated, 348 | By adding one thing to my purpose nothing. 349 | But since she prick'd thee out for women's pleasure, 350 | Mine be thy love and thy love's use their treasure. 351 | So is it not with me as with that Muse 352 | Stirr'd by a painted beauty to his verse, 353 | Who heaven itself for ornament doth use 354 | And every fair with his fair doth rehearse 355 | Making a couplement of proud compare, 356 | With sun and moon, with earth and sea's rich gems, 357 | With April's first-born flowers, and all things rare 358 | That heaven's air in this huge rondure hems. 359 | O' let me, true in love, but truly write, 360 | And then believe me, my love is as fair 361 | As any mother's child, though not so bright 362 | As those gold candles fix'd in heaven's air: 363 | Let them say more than like of hearsay well; 364 | I will not praise that purpose not to sell. 365 | My glass shall not persuade me I am old, 366 | So long as youth and thou are of one date; 367 | But when in thee time's furrows I behold, 368 | Then look I death my days should expiate. 369 | For all that beauty that doth cover thee 370 | Is but the seemly raiment of my heart, 371 | Which in thy breast doth live, as thine in me: 372 | How can I then be elder than thou art? 373 | O, therefore, love, be of thyself so wary 374 | As I, not for myself, but for thee will; 375 | Bearing thy heart, which I will keep so chary 376 | As tender nurse her babe from faring ill. 377 | Presume not on thy heart when mine is slain; 378 | Thou gavest me thine, not to give back again. 379 | As an unperfect actor on the stage 380 | Who with his fear is put besides his part, 381 | Or some fierce thing replete with too much rage, 382 | Whose strength's abundance weakens his own heart. 383 | So I, for fear of trust, forget to say 384 | The perfect ceremony of love's rite, 385 | And in mine own love's strength seem to decay, 386 | O'ercharged with burden of mine own love's might. 387 | O, let my books be then the eloquence 388 | And dumb presagers of my speaking breast, 389 | Who plead for love and look for recompense 390 | More than that tongue that more hath more express'd. 391 | O, learn to read what silent love hath writ: 392 | To hear with eyes belongs to love's fine wit. 393 | Mine eye hath play'd the painter and hath stell'd 394 | Thy beauty's form in table of my heart; 395 | My body is the frame wherein 'tis held, 396 | And perspective it is the painter's art. 397 | For through the painter must you see his skill, 398 | To find where your true image pictured lies; 399 | Which in my bosom's shop is hanging still, 400 | That hath his windows glazed with thine eyes. 401 | Now see what good turns eyes for eyes have done: 402 | Mine eyes have drawn thy shape, and thine for me 403 | Are windows to my breast, where-through the sun 404 | Delights to peep, to gaze therein on thee; 405 | Yet eyes this cunning want to grace their art; 406 | They draw but what they see, know not the heart. 407 | Let those who are in favour with their stars 408 | Of public honour and proud titles boast, 409 | Whilst I, whom fortune of such triumph bars, 410 | Unlook'd for joy in that I honour most. 411 | Great princes' favourites their fair leaves spread 412 | But as the marigold at the sun's eye, 413 | And in themselves their pride lies buried, 414 | For at a frown they in their glory die. 415 | The painful warrior famoused for fight, 416 | After a thousand victories once foil'd, 417 | Is from the book of honour razed quite, 418 | And all the rest forgot for which he toil'd: 419 | Then happy I, that love and am beloved 420 | Where I may not remove nor be removed. 421 | Lord of my love, to whom in vassalage 422 | Thy merit hath my duty strongly knit, 423 | To thee I send this written embassage, 424 | To witness duty, not to show my wit: 425 | Duty so great, which wit so poor as mine 426 | May make seem bare, in wanting words to show it, 427 | But that I hope some good conceit of thine 428 | In thy soul's thought, all naked, will bestow it; 429 | Till whatsoever star that guides my moving 430 | Points on me graciously with fair aspect 431 | And puts apparel on my tatter'd loving, 432 | To show me worthy of thy sweet respect: 433 | Then may I dare to boast how I do love thee; 434 | Till then not show my head where thou mayst prove me. 435 | Weary with toil, I haste me to my bed, 436 | The dear repose for limbs with travel tired; 437 | But then begins a journey in my head, 438 | To work my mind, when body's work's expired: 439 | For then my thoughts, from far where I abide, 440 | Intend a zealous pilgrimage to thee, 441 | And keep my drooping eyelids open wide, 442 | Looking on darkness which the blind do see 443 | Save that my soul's imaginary sight 444 | Presents thy shadow to my sightless view, 445 | Which, like a jewel hung in ghastly night, 446 | Makes black night beauteous and her old face new. 447 | Lo! thus, by day my limbs, by night my mind, 448 | For thee and for myself no quiet find. 449 | How can I then return in happy plight, 450 | That am debarr'd the benefit of rest? 451 | When day's oppression is not eased by night, 452 | But day by night, and night by day, oppress'd? 453 | And each, though enemies to either's reign, 454 | Do in consent shake hands to torture me; 455 | The one by toil, the other to complain 456 | How far I toil, still farther off from thee. 457 | I tell the day, to please them thou art bright 458 | And dost him grace when clouds do blot the heaven: 459 | So flatter I the swart-complexion'd night, 460 | When sparkling stars twire not thou gild'st the even. 461 | But day doth daily draw my sorrows longer 462 | And night doth nightly make grief's strength seem stronger. 463 | When, in disgrace with fortune and men's eyes, 464 | I all alone beweep my outcast state 465 | And trouble deal heaven with my bootless cries 466 | And look upon myself and curse my fate, 467 | Wishing me like to one more rich in hope, 468 | Featured like him, like him with friends possess'd, 469 | Desiring this man's art and that man's scope, 470 | With what I most enjoy contented least; 471 | Yet in these thoughts myself almost despising, 472 | Haply I think on thee, and then my state, 473 | Like to the lark at break of day arising 474 | From sullen earth, sings hymns at heaven's gate; 475 | For thy sweet love remember'd such wealth brings 476 | That then I scorn to change my state with kings. 477 | When to the sessions of sweet silent thought 478 | I summon up remembrance of things past, 479 | I sigh the lack of many a thing I sought, 480 | And with old woes new wail my dear time's waste: 481 | Then can I drown an eye, unused to flow, 482 | For precious friends hid in death's dateless night, 483 | And weep afresh love's long since cancell'd woe, 484 | And moan the expense of many a vanish'd sight: 485 | Then can I grieve at grievances foregone, 486 | And heavily from woe to woe tell o'er 487 | The sad account of fore-bemoaned moan, 488 | Which I new pay as if not paid before. 489 | But if the while I think on thee, dear friend, 490 | All losses are restored and sorrows end. 491 | Thy bosom is endeared with all hearts, 492 | Which I by lacking have supposed dead, 493 | And there reigns love and all love's loving parts, 494 | And all those friends which I thought buried. 495 | How many a holy and obsequious tear 496 | Hath dear religious love stol'n from mine eye 497 | As interest of the dead, which now appear 498 | But things removed that hidden in thee lie! 499 | Thou art the grave where buried love doth live, 500 | Hung with the trophies of my lovers gone, 501 | Who all their parts of me to thee did give; 502 | That due of many now is thine alone: 503 | Their images I loved I view in thee, 504 | And thou, all they, hast all the all of me. 505 | If thou survive my well-contented day, 506 | When that churl Death my bones with dust shall cover, 507 | And shalt by fortune once more re-survey 508 | These poor rude lines of thy deceased lover, 509 | Compare them with the bettering of the time, 510 | And though they be outstripp'd by every pen, 511 | Reserve them for my love, not for their rhyme, 512 | Exceeded by the height of happier men. 513 | O, then vouchsafe me but this loving thought: 514 | 'Had my friend's Muse grown with this growing age, 515 | A dearer birth than this his love had brought, 516 | To march in ranks of better equipage: 517 | But since he died and poets better prove, 518 | Theirs for their style I'll read, his for his love.' 519 | Full many a glorious morning have I seen 520 | Flatter the mountain-tops with sovereign eye, 521 | Kissing with golden face the meadows green, 522 | Gilding pale streams with heavenly alchemy; 523 | Anon permit the basest clouds to ride 524 | With ugly rack on his celestial face, 525 | And from the forlorn world his visage hide, 526 | Stealing unseen to west with this disgrace: 527 | Even so my sun one early morn did shine 528 | With all triumphant splendor on my brow; 529 | But out, alack! he was but one hour mine; 530 | The region cloud hath mask'd him from me now. 531 | Yet him for this my love no whit disdaineth; 532 | Suns of the world may stain when heaven's sun staineth. 533 | Why didst thou promise such a beauteous day, 534 | And make me travel forth without my cloak, 535 | To let base clouds o'ertake me in my way, 536 | Hiding thy bravery in their rotten smoke? 537 | 'Tis not enough that through the cloud thou break, 538 | To dry the rain on my storm-beaten face, 539 | For no man well of such a salve can speak 540 | That heals the wound and cures not the disgrace: 541 | Nor can thy shame give physic to my grief; 542 | Though thou repent, yet I have still the loss: 543 | The offender's sorrow lends but weak relief 544 | To him that bears the strong offence's cross. 545 | Ah! but those tears are pearl which thy love sheds, 546 | And they are rich and ransom all ill deeds. 547 | No more be grieved at that which thou hast done: 548 | Roses have thorns, and silver fountains mud; 549 | Clouds and eclipses stain both moon and sun, 550 | And loathsome canker lives in sweetest bud. 551 | All men make faults, and even I in this, 552 | Authorizing thy trespass with compare, 553 | Myself corrupting, salving thy amiss, 554 | Excusing thy sins more than thy sins are; 555 | For to thy sensual fault I bring in sense-- 556 | Thy adverse party is thy advocate-- 557 | And 'gainst myself a lawful plea commence: 558 | Such civil war is in my love and hate 559 | That I an accessary needs must be 560 | To that sweet thief which sourly robs from me. 561 | Let me confess that we two must be twain, 562 | Although our undivided loves are one: 563 | So shall those blots that do with me remain 564 | Without thy help by me be borne alone. 565 | In our two loves there is but one respect, 566 | Though in our lives a separable spite, 567 | Which though it alter not love's sole effect, 568 | Yet doth it steal sweet hours from love's delight. 569 | I may not evermore acknowledge thee, 570 | Lest my bewailed guilt should do thee shame, 571 | Nor thou with public kindness honour me, 572 | Unless thou take that honour from thy name: 573 | But do not so; I love thee in such sort 574 | As, thou being mine, mine is thy good report. 575 | As a decrepit father takes delight 576 | To see his active child do deeds of youth, 577 | So I, made lame by fortune's dearest spite, 578 | Take all my comfort of thy worth and truth. 579 | For whether beauty, birth, or wealth, or wit, 580 | Or any of these all, or all, or more, 581 | Entitled in thy parts do crowned sit, 582 | I make my love engrafted to this store: 583 | So then I am not lame, poor, nor despised, 584 | Whilst that this shadow doth such substance give 585 | That I in thy abundance am sufficed 586 | And by a part of all thy glory live. 587 | Look, what is best, that best I wish in thee: 588 | This wish I have; then ten times happy me!""" 589 | -------------------------------------------------------------------------------- /inference_perf/loadgen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .load_generator import LoadGenerator 15 | 16 | __all__ = ["LoadGenerator"] 17 | -------------------------------------------------------------------------------- /inference_perf/loadgen/load_generator.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from pydantic import BaseModel 15 | from .load_timer import LoadTimer, ConstantLoadTimer, PoissonLoadTimer 16 | from inference_perf.datagen import DataGenerator 17 | from inference_perf.client.modelserver import ModelServerClient 18 | from inference_perf.config import LoadType, LoadConfig 19 | from asyncio import TaskGroup, sleep 20 | import time 21 | 22 | 23 | class StageRuntimeInfo(BaseModel): 24 | stage_id: int 25 | end_time: float 26 | start_time: float 27 | 28 | 29 | class LoadGenerator: 30 | def __init__(self, datagen: DataGenerator, load_config: LoadConfig) -> None: 31 | self.datagen = datagen 32 | self.stageInterval = load_config.interval 33 | self.load_type = load_config.type 34 | self.stages = load_config.stages 35 | self.stage_runtime_info = dict[int, StageRuntimeInfo]() 36 | 37 | def get_timer(self, rate: float) -> LoadTimer: 38 | if self.load_type == LoadType.POISSON: 39 | return PoissonLoadTimer(rate=rate) 40 | return ConstantLoadTimer(rate=rate) 41 | 42 | async def run(self, client: ModelServerClient) -> None: 43 | for stage_id, stage in enumerate(self.stages): 44 | timer = self.get_timer(stage.rate) 45 | start_time = time.time() 46 | end_time = start_time + stage.duration 47 | print(f"Stage {stage_id} - run started") 48 | async with TaskGroup() as tg: 49 | for _, (data, time_index) in enumerate( 50 | zip(self.datagen.get_data(), timer.start_timer(start_time), strict=True) 51 | ): 52 | now = time.time() 53 | if time_index < end_time and now < end_time: 54 | if time_index > now: 55 | await sleep(time_index - time.time()) 56 | tg.create_task(client.process_request(data, stage_id)) 57 | continue 58 | else: 59 | break 60 | self.stage_runtime_info[stage_id] = StageRuntimeInfo( 61 | stage_id=stage_id, start_time=start_time, end_time=time.time() 62 | ) 63 | print(f"Stage {stage_id} - run completed") 64 | if self.stageInterval and stage_id < len(self.stages) - 1: 65 | await sleep(self.stageInterval) 66 | -------------------------------------------------------------------------------- /inference_perf/loadgen/load_timer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import time 15 | from abc import ABC, abstractmethod 16 | from typing import Generator, Optional, Tuple 17 | import numpy as np 18 | 19 | 20 | class LoadTimer(ABC): 21 | """Abstract base class for load generators.""" 22 | 23 | @abstractmethod 24 | def __init__(self, *args: Tuple[int, ...]) -> None: 25 | # TODO: Commmon functionallity 26 | pass 27 | 28 | @abstractmethod 29 | def start_timer(self, initial: Optional[float] = None) -> Generator[float, None, None]: 30 | """Yield the times at which requests should be made.""" 31 | raise NotImplementedError 32 | 33 | 34 | class ConstantLoadTimer(LoadTimer): 35 | """ 36 | A load generator that generates requests at a constant rate. 37 | Introduces a small amount of random noise in timing. 38 | """ 39 | 40 | def __init__(self, rate: float) -> None: 41 | self._rate = rate 42 | # TODO: Make random state a global seed 43 | self._rand = np.random.default_rng() 44 | 45 | def start_timer(self, initial: Optional[float] = None) -> Generator[float, None, None]: 46 | # Set start time 47 | next_time = time.monotonic() if initial is None else initial 48 | 49 | # Given a rate, yield a time to wait before the next request 50 | while True: 51 | next_time += self._rand.exponential(1 / self._rate) 52 | yield next_time 53 | 54 | 55 | class PoissonLoadTimer(LoadTimer): 56 | def __init__(self, rate: float) -> None: 57 | self._rate = rate 58 | self._rand = np.random.default_rng() 59 | 60 | def start_timer(self, initial: Optional[float] = None) -> Generator[float, None, None]: 61 | # Set start time 62 | next_time = time.monotonic() if initial is None else initial 63 | 64 | # Given a rate, yield a time to wait before the next request 65 | while True: 66 | # How many requests in the next second 67 | req_count = self._rand.poisson(self._rate) 68 | 69 | # If no requests, wait for 1 second 70 | if req_count < 1: 71 | yield next_time + 1.0 72 | continue 73 | 74 | # Schedule the requests over the next second 75 | timer = ConstantLoadTimer(req_count) 76 | for _ in range(req_count): 77 | next_time = next(timer.start_timer(next_time)) 78 | yield next_time 79 | -------------------------------------------------------------------------------- /inference_perf/main.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import List, Optional 15 | from inference_perf.loadgen import LoadGenerator 16 | from inference_perf.config import ( 17 | DataGenType, 18 | MetricsClientType, 19 | ModelServerType, 20 | ReportConfig, 21 | read_config, 22 | ) 23 | from inference_perf.datagen import ( 24 | DataGenerator, 25 | MockDataGenerator, 26 | HFShareGPTDataGenerator, 27 | SyntheticDataGenerator, 28 | RandomDataGenerator, 29 | SharedPrefixDataGenerator, 30 | ) 31 | from inference_perf.client.modelserver import ModelServerClient, vLLMModelServerClient 32 | from inference_perf.client.metricsclient import MetricsClient, PerfRuntimeParameters, PrometheusMetricsClient 33 | from inference_perf.client.filestorage import StorageClient, GoogleCloudStorageClient 34 | from inference_perf.reportgen import ReportGenerator 35 | from inference_perf.utils import CustomTokenizer, ReportFile 36 | import asyncio 37 | import time 38 | 39 | 40 | class InferencePerfRunner: 41 | def __init__( 42 | self, 43 | client: ModelServerClient, 44 | loadgen: LoadGenerator, 45 | reportgen: ReportGenerator, 46 | storage_clients: List[StorageClient], 47 | ) -> None: 48 | self.client = client 49 | self.loadgen = loadgen 50 | self.reportgen = reportgen 51 | self.storage_clients = storage_clients 52 | 53 | def run(self) -> None: 54 | asyncio.run(self.loadgen.run(self.client)) 55 | 56 | def generate_reports(self, report_config: ReportConfig, runtime_parameters: PerfRuntimeParameters) -> List[ReportFile]: 57 | return asyncio.run(self.reportgen.generate_reports(report_config=report_config, runtime_parameters=runtime_parameters)) 58 | 59 | def save_reports(self, reports: List[ReportFile]) -> None: 60 | for storage_client in self.storage_clients: 61 | storage_client.save_report(reports) 62 | 63 | 64 | def main_cli() -> None: 65 | config = read_config() 66 | 67 | # Define Metrics Client 68 | metrics_client: Optional[MetricsClient] = None 69 | if config.metrics: 70 | if config.metrics.type == MetricsClientType.PROMETHEUS and config.metrics.prometheus: 71 | metrics_client = PrometheusMetricsClient(config=config.metrics.prometheus) 72 | 73 | # Define Storage Clients 74 | storage_clients: List[StorageClient] = [] 75 | if config.storage: 76 | if config.storage.google_cloud_storage: 77 | storage_clients.append(GoogleCloudStorageClient(config=config.storage.google_cloud_storage)) 78 | 79 | # Define Report Generator 80 | reportgen = ReportGenerator(metrics_client) 81 | 82 | # Create tokenizer based on tokenizer config 83 | tokenizer: Optional[CustomTokenizer] = None 84 | if config.tokenizer and config.tokenizer.pretrained_model_name_or_path: 85 | try: 86 | tokenizer = CustomTokenizer(config.tokenizer) 87 | except Exception as e: 88 | raise Exception("Tokenizer initialization failed") from e 89 | 90 | # Define Model Server Client 91 | model_server_client: ModelServerClient 92 | if config.server: 93 | if config.server.type == ModelServerType.VLLM: 94 | # The type error for vLLMModelServerClient's tokenizer argument indicates it expects CustomTokenizer, not Optional. 95 | if tokenizer is None: 96 | raise Exception( 97 | "vLLM client is configured, but it requires a custom tokenizer which was not provided or initialized successfully. " 98 | "Please ensure a valid tokenizer is configured in the 'tokenizer' section of your config file." 99 | ) 100 | model_server_client = vLLMModelServerClient( 101 | reportgen.get_metrics_collector(), 102 | api_type=config.api, 103 | uri=config.server.base_url, 104 | model_name=config.server.model_name, 105 | tokenizer=tokenizer, 106 | ignore_eos=config.server.ignore_eos, 107 | ) 108 | else: 109 | raise Exception("model server client config missing") 110 | 111 | # Define DataGenerator 112 | datagen: DataGenerator 113 | if config.data: 114 | # Common checks for generators that require a tokenizer / distribution 115 | if config.data.type in [DataGenType.ShareGPT, DataGenType.Synthetic, DataGenType.Random]: 116 | if tokenizer is None: 117 | raise Exception( 118 | f"{config.data.type.value} data generator requires a configured tokenizer. " 119 | "Please ensure a valid tokenizer is configured in the 'tokenizer' section of your config file." 120 | ) 121 | if config.data.type in [DataGenType.Synthetic, DataGenType.Random]: 122 | if config.data.input_distribution is None: 123 | raise Exception(f"{config.data.type.value} data generator requires 'input_distribution' to be configured") 124 | if config.data.output_distribution is None: 125 | raise Exception(f"{config.data.type.value} data generator requires 'output_distribution' to be configured") 126 | if config.data.type == DataGenType.SharedPrefix and config.data.shared_prefix is None: 127 | raise Exception(f"{config.data.type.value} data generator requires 'shared_prefix' to be configured") 128 | 129 | if config.data.type == DataGenType.ShareGPT: 130 | datagen = HFShareGPTDataGenerator(config.api, config.data, tokenizer) 131 | elif config.data.type == DataGenType.Synthetic: 132 | datagen = SyntheticDataGenerator(config.api, config.data, tokenizer) 133 | elif config.data.type == DataGenType.Random: 134 | datagen = RandomDataGenerator(config.api, config.data, tokenizer) 135 | elif config.data.type == DataGenType.SharedPrefix: 136 | datagen = SharedPrefixDataGenerator(config.api, config.data, tokenizer) 137 | else: 138 | datagen = MockDataGenerator(config.api, config.data, tokenizer) 139 | else: 140 | raise Exception("data config missing") 141 | 142 | # Define LoadGenerator 143 | if config.load: 144 | if isinstance(metrics_client, PrometheusMetricsClient) and config.report.prometheus.per_stage: 145 | config.load.interval = max(config.load.interval, metrics_client.scrape_interval) 146 | loadgen = LoadGenerator(datagen, config.load) 147 | else: 148 | raise Exception("load config missing") 149 | 150 | # Setup Perf Test Runner 151 | perfrunner = InferencePerfRunner(model_server_client, loadgen, reportgen, storage_clients) 152 | 153 | start_time = time.time() 154 | 155 | # Run Perf Test 156 | perfrunner.run() 157 | 158 | end_time = time.time() 159 | duration = end_time - start_time # Calculate the duration of the test 160 | 161 | # Generate Reports after the tests 162 | reports = perfrunner.generate_reports( 163 | report_config=config.report, 164 | runtime_parameters=PerfRuntimeParameters( 165 | start_time=start_time, 166 | duration=duration, 167 | model_server_client=model_server_client, 168 | stages=loadgen.stage_runtime_info, 169 | ), 170 | ) 171 | 172 | # Save Reports 173 | perfrunner.save_reports(reports=reports) 174 | 175 | 176 | if __name__ == "__main__": 177 | main_cli() 178 | -------------------------------------------------------------------------------- /inference_perf/reportgen/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .base import ReportGenerator 15 | 16 | __all__ = ["ReportGenerator"] 17 | -------------------------------------------------------------------------------- /inference_perf/reportgen/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from typing import List, Optional, Any 15 | from pydantic import BaseModel 16 | from collections import defaultdict 17 | from inference_perf.client.metricsclient.base import ModelServerMetrics 18 | from inference_perf.client.metricsclient.prometheus_client import PrometheusMetricsClient 19 | from inference_perf.config import ReportConfig, PrometheusMetricsReportConfig 20 | from inference_perf.client.metricsclient import MetricsClient, PerfRuntimeParameters 21 | from inference_perf.utils import ReportFile 22 | from inference_perf.client.requestdatacollector import LocalRequestDataCollector, RequestDataCollector 23 | from inference_perf.apis import RequestLifecycleMetric 24 | import numpy as np 25 | 26 | 27 | def safe_float(value: Any) -> float: 28 | """NOTE: Only for use in summarize_requests after validating safe access""" 29 | try: 30 | return float(value) 31 | except (TypeError, ValueError): 32 | return 0 33 | 34 | 35 | def summarize(items: List[float]) -> Optional[dict[str, float]]: 36 | return ( 37 | { 38 | "mean": float(np.mean(items)), 39 | "min": float(np.min(items)), 40 | "p10": float(np.percentile(items, 10)), 41 | "p50": float(np.percentile(items, 50)), 42 | "p90": float(np.percentile(items, 90)), 43 | "max": float(np.max(items)), 44 | } 45 | if len(items) != 0 46 | else None 47 | ) 48 | 49 | 50 | class ResponsesSummary(BaseModel): 51 | load_summary: dict[str, Any] 52 | successes: dict[str, Any] 53 | failures: dict[str, Any] 54 | 55 | 56 | def summarize_prometheus_metrics(metrics: ModelServerMetrics) -> ResponsesSummary: 57 | return ResponsesSummary( 58 | load_summary={}, # model server doesn't report failed requests 59 | failures={}, 60 | successes={ 61 | "count": metrics.total_requests, 62 | "rate": metrics.requests_per_second, 63 | "prompt_len": { 64 | "mean": metrics.avg_prompt_tokens, 65 | "rate": metrics.prompt_tokens_per_second, 66 | }, 67 | "output_len": { 68 | "mean": metrics.avg_output_tokens, 69 | "rate": metrics.output_tokens_per_second, 70 | }, 71 | "queue_len": { 72 | "mean": metrics.avg_queue_length, 73 | }, 74 | "request_latency": { 75 | "mean": metrics.avg_request_latency, 76 | "p50": metrics.median_request_latency, 77 | "p90": metrics.p90_request_latency, 78 | "p99": metrics.p99_request_latency, 79 | }, 80 | "time_to_first_token": { 81 | "mean": metrics.avg_time_to_first_token, 82 | "p50": metrics.median_time_to_first_token, 83 | "p90": metrics.p90_time_to_first_token, 84 | "p99": metrics.p99_time_to_first_token, 85 | }, 86 | "time_per_output_token": { 87 | "mean": metrics.avg_time_per_output_token, 88 | "p50": metrics.median_time_per_output_token, 89 | "p90": metrics.p90_time_per_output_token, 90 | "p99": metrics.p99_time_per_output_token, 91 | }, 92 | }, 93 | ) 94 | 95 | 96 | def summarize_requests(metrics: List[RequestLifecycleMetric]) -> ResponsesSummary: 97 | all_successful: List[RequestLifecycleMetric] = [x for x in metrics if x.error is None] 98 | all_failed: List[RequestLifecycleMetric] = [x for x in metrics if x.error is not None] 99 | 100 | total_time = max(x.end_time for x in metrics) - min(x.start_time for x in metrics) 101 | 102 | return ResponsesSummary( 103 | load_summary={ 104 | "count": len(metrics), 105 | }, 106 | successes={ 107 | "count": len(all_successful), 108 | "throughput": { 109 | "input_tokens_per_sec": sum(x.info.input_tokens for x in all_successful) / total_time, 110 | "output_tokens_per_sec": sum(x.info.output_tokens for x in all_successful) / total_time, 111 | "total_tokens_per_sec": sum((x.info.input_tokens + x.info.output_tokens) for x in all_successful) / total_time, 112 | "requests_per_sec": len(all_successful) / total_time, 113 | }, 114 | "request_latency": summarize([(successful.end_time - successful.start_time) for successful in all_successful]), 115 | "prompt_len": summarize([safe_float(success.info.input_tokens) for success in all_successful]), 116 | "output_len": summarize([float(v) for success in all_successful if (v := success.info.output_tokens) is not None]), 117 | "normalized_time_per_output_token": summarize( 118 | [ 119 | ((metric.end_time - metric.start_time) / output_len) if output_len and output_len != 0 else 0 120 | for metric in all_successful 121 | for output_len in [safe_float(metric.info.output_tokens)] 122 | ] 123 | ), 124 | }, 125 | failures={ 126 | "count": len(all_failed), 127 | "request_latency": summarize([(failed.end_time - failed.start_time) for failed in all_failed]), 128 | }, 129 | ) 130 | 131 | 132 | class ReportGenerator: 133 | def __init__( 134 | self, 135 | metrics_client: Optional[MetricsClient], 136 | ) -> None: 137 | self.metrics_collector = LocalRequestDataCollector() 138 | self.metrics_client = metrics_client 139 | 140 | def get_metrics_collector(self) -> RequestDataCollector: 141 | """ 142 | Returns the metrics collector. 143 | """ 144 | return self.metrics_collector 145 | 146 | async def generate_reports( 147 | self, report_config: ReportConfig, runtime_parameters: PerfRuntimeParameters 148 | ) -> List[ReportFile]: 149 | print("\n\nGenerating Reports ..") 150 | lifecycle_reports = [] 151 | request_metrics = self.metrics_collector.get_metrics() 152 | if report_config.request_lifecycle.summary: 153 | if len(request_metrics) != 0: 154 | report_file = ReportFile( 155 | name="summary_lifecycle_metrics", 156 | contents=summarize_requests(request_metrics).model_dump(), 157 | ) 158 | lifecycle_reports.append(report_file) 159 | if report_file.path is not None: 160 | print(f"Successfully saved summary report of request lifecycle metrics to {report_file.path}") 161 | 162 | if report_config.request_lifecycle.per_stage: 163 | stage_buckets: dict[int, List[RequestLifecycleMetric]] = defaultdict(list) 164 | for metric in request_metrics: 165 | if metric.stage_id is not None: 166 | stage_buckets[metric.stage_id].append(metric) 167 | for stage_id, metrics in stage_buckets.items(): 168 | report_file = ReportFile( 169 | name=f"stage_{stage_id}_lifecycle_metrics", 170 | contents=summarize_requests(metrics).model_dump(), 171 | ) 172 | lifecycle_reports.append(report_file) 173 | if report_file is not None: 174 | print(f"Successfully saved stage {stage_id} report of request lifecycle metrics to {report_file.path}") 175 | 176 | if report_config.request_lifecycle.per_request: 177 | report_file = ReportFile( 178 | name="per_request_lifecycle_metrics", 179 | contents=[ 180 | { 181 | "start_time": metric.start_time, 182 | "end_time": metric.end_time, 183 | "request": metric.request_data, 184 | "response": metric.response_data, 185 | } 186 | for metric in request_metrics 187 | ], 188 | ) 189 | lifecycle_reports.append(report_file) 190 | if report_file is not None: 191 | print(f"Successfully saved per request report of request lifecycle metrics to {report_file.path}") 192 | 193 | lifecycle_reports.extend(self.generate_prometheus_metrics_report(runtime_parameters, report_config.prometheus)) 194 | return lifecycle_reports 195 | 196 | def generate_prometheus_metrics_report( 197 | self, runtime_parameters: PerfRuntimeParameters, report_config: PrometheusMetricsReportConfig 198 | ) -> List[ReportFile]: 199 | """ 200 | Report summary of the metrics collected by the metrics client during the run. 201 | Args: 202 | runtime_parameters (PerfRuntimeParameters): The runtime parameters containing the model server client, query eval time in the metrics db, duration. 203 | """ 204 | prometheus_metrics_reports: List[ReportFile] = [] 205 | 206 | if self.metrics_client is None or not isinstance(self.metrics_client, PrometheusMetricsClient): 207 | print("Prometheus Metrics Client is not configured or not of type PrometheusMetricsClient") 208 | return prometheus_metrics_reports 209 | 210 | # Wait for Prometheus to collect metrics for the last stage 211 | self.metrics_client.wait() 212 | 213 | if report_config.summary: 214 | collected_metrics = self.metrics_client.collect_metrics_summary(runtime_parameters) 215 | if collected_metrics is not None: 216 | report_file = ReportFile( 217 | name="summary_prometheus_metrics", 218 | contents=summarize_prometheus_metrics(collected_metrics).model_dump(), 219 | ) 220 | if report_file is not None: 221 | print(f"Successfully saved summary report of prometheus metrics to {report_file.path}") 222 | prometheus_metrics_reports.append(report_file) 223 | else: 224 | print("Report generation failed - no metrics collected by metrics client") 225 | 226 | if report_config.per_stage: 227 | for stage_id, _stage_info in runtime_parameters.stages.items(): 228 | collected_metrics = self.metrics_client.collect_metrics_for_stage(runtime_parameters, stage_id) 229 | if collected_metrics is not None: 230 | report_file = ReportFile( 231 | name=f"stage_{stage_id}_prometheus_metrics", 232 | contents=summarize_prometheus_metrics(collected_metrics).model_dump(), 233 | ) 234 | if report_file is not None: 235 | print(f"Successfully saved stage {stage_id} report of prometheus metrics to {report_file.path}") 236 | prometheus_metrics_reports.append(report_file) 237 | else: 238 | print(f"No metrics collected for Stage {stage_id}") 239 | 240 | return prometheus_metrics_reports 241 | -------------------------------------------------------------------------------- /inference_perf/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from .custom_tokenizer import CustomTokenizer 15 | from .report_file import ReportFile 16 | 17 | __all__ = ["CustomTokenizer", "ReportFile"] 18 | -------------------------------------------------------------------------------- /inference_perf/utils/custom_tokenizer.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | from transformers import AutoTokenizer, PreTrainedTokenizerBase 15 | from inference_perf.config import CustomTokenizerConfig 16 | 17 | 18 | class CustomTokenizer: 19 | def __init__(self, config: CustomTokenizerConfig): 20 | self.tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained( 21 | config.pretrained_model_name_or_path, token=config.token, trust_remote_code=config.trust_remote_code 22 | ) 23 | 24 | def count_tokens(self, text: str) -> int: 25 | if text == "": 26 | return 0 27 | return len(self.tokenizer(text).input_ids) 28 | 29 | def get_tokenizer(self) -> PreTrainedTokenizerBase: 30 | return self.tokenizer 31 | -------------------------------------------------------------------------------- /inference_perf/utils/distribution.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import numpy as np 15 | from numpy.typing import NDArray 16 | 17 | 18 | def generate_distribution(min: int, max: int, mean: float, std_dev: float, total_count: int) -> NDArray[np.int_]: 19 | """ 20 | Generates an array of lengths in integer adhering to the specified distribution constraints. 21 | 22 | Args: 23 | min: The minimum allowed length. 24 | max: The maximum allowed length. 25 | mean: The target mean of the distribution. 26 | std_dev: The target standard deviation of the distribution. 27 | total_count: The total number of lengths to generate. 28 | 29 | Returns: 30 | A numpy array of integers representing lengths for input prompts or output generations. 31 | 32 | Raises: 33 | ValueError: If constraints are impossible (e.g., min_val > max_val). 34 | """ 35 | if min > max: 36 | raise ValueError("Minimum value cannot be greater than maximum value.") 37 | if total_count <= 0: 38 | raise ValueError("Total count must be a positive integer.") 39 | if std_dev < 0: 40 | raise ValueError("Standard deviation cannot be negative.") 41 | if mean < min or mean > max: 42 | raise ValueError("Mean cannot be outside min and max range.") 43 | 44 | # Generate floating-point numbers from a normal distribution 45 | # Use a large enough intermediate pool if std_dev is high relative to range 46 | # to increase chances of getting values within bounds after generation. 47 | # This is a heuristic; perfect adherence isn't guaranteed. 48 | generated_numbers = np.random.normal(loc=mean, scale=std_dev, size=total_count) 49 | 50 | # Clip the numbers to the specified min/max range 51 | clipped_numbers = np.clip(generated_numbers, min, max) 52 | 53 | # Round to the nearest integer and convert type 54 | generated_lengths = np.round(clipped_numbers).astype(int) 55 | 56 | # Ensure integer values are strictly within bounds after rounding 57 | # (e.g., rounding 4.6 when max is 4 could result in 5 without this) 58 | generated_lengths = np.clip(generated_lengths, min, max) 59 | 60 | return generated_lengths 61 | -------------------------------------------------------------------------------- /inference_perf/utils/report_file.py: -------------------------------------------------------------------------------- 1 | # Copyright 2025 The Kubernetes Authors. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import json 16 | import os 17 | from typing import Any, Optional 18 | 19 | 20 | class ReportFile: 21 | name: str 22 | contents: Any 23 | path: Optional[str] = None 24 | 25 | def __init__(self, name: str, contents: Any): 26 | self.name = f"{name}.json" 27 | self.contents = contents 28 | self._store_locally() 29 | 30 | def _store_locally(self) -> None: 31 | filename = self.get_filename() 32 | contents = self.get_contents() 33 | with open(filename, "w", encoding="utf-8") as f: 34 | f.write(json.dumps(contents, indent=2)) 35 | self.path = os.path.abspath(filename) 36 | 37 | def get_filename(self) -> str: 38 | return self.name 39 | 40 | def get_contents(self) -> Any: 41 | return self.contents 42 | -------------------------------------------------------------------------------- /pdm.lock: -------------------------------------------------------------------------------- 1 | # This file is @generated by PDM. 2 | # It is not intended for manual editing. 3 | 4 | [metadata] 5 | groups = ["default", "dev"] 6 | strategy = ["inherit_metadata"] 7 | targets = [] 8 | lock_version = "4.5.0" 9 | content_hash = "sha256:94dc44bb0ca871cbddd7f324cc3d79c8a503ad6cd6e18f147cc2867ca93d73c7" 10 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "inference-perf" 3 | version = "0.0.1" 4 | description = "A GenAI inference performance benchmarking tool." 5 | authors = [] 6 | dependencies = [ 7 | "aiohttp>=3.11.11", 8 | "pydantic>=2.10.6", 9 | "numpy>=2.2.2", 10 | "datasets>=3.3.2", 11 | "transformers>=4.50.2", 12 | "google-cloud-storage>=3.1.0", 13 | ] 14 | requires-python = ">=3.12" 15 | readme = "README.md" 16 | license = {text = "Apache-2.0"} 17 | 18 | [project.scripts] 19 | inference-perf = "inference_perf:main_cli" 20 | 21 | [project.optional-dependencies] 22 | dev = [ 23 | "mypy>=1.14.1", 24 | "ruff>=0.9.4", 25 | "pre-commit>=4.1.0", 26 | "pytest>=8.3.4", 27 | "types-PyYAML>=6.0.12.20241230", 28 | "ipykernel>=6.29.5", 29 | "types-requests>=2.32.0.20250328", 30 | ] 31 | 32 | [tool.ruff] 33 | # The GitHub editor is 127 chars wide 34 | line-length = 127 35 | indent-width = 4 36 | 37 | [tool.ruff.lint] 38 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`) codes by default. 39 | # On top of the defaults (`E4`, E7`, `E9`, and `F`), enable flake8-bugbear (`B`) 40 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or 41 | # McCabe complexity (`C901`) by default. 42 | select = ["E4", "E7", "E9", "F", "B"] 43 | ignore = [] 44 | 45 | # Allow fix for all enabled rules (when `--fix`) is provided. 46 | fixable = ["ALL"] 47 | unfixable = [] 48 | 49 | [tool.ruff.format] 50 | # Like Black, use double quotes for strings. 51 | quote-style = "double" 52 | 53 | # Like Black, indent with spaces, rather than tabs. 54 | indent-style = "space" 55 | 56 | # Like Black, respect magic trailing commas. 57 | skip-magic-trailing-comma = false 58 | 59 | # Like Black, automatically detect the appropriate line ending. 60 | line-ending = "auto" 61 | 62 | # Enable auto-formatting of code examples in docstrings. Markdown, 63 | # reStructuredText code/literal blocks and doctests are all supported. 64 | # 65 | # This is currently disabled by default, but it is planned for this 66 | # to be opt-out in the future. 67 | docstring-code-format = false 68 | 69 | # Set the line length limit used when formatting code snippets in 70 | # docstrings. 71 | # 72 | # This only has an effect when the `docstring-code-format` setting is 73 | # enabled. 74 | docstring-code-line-length = "dynamic" 75 | 76 | [tool.pdm] 77 | distribution = true 78 | 79 | [tool.pytest.ini_options] 80 | testpaths = ["."] 81 | python_files = ["test_*.py"] 82 | python_classes = ["Test*"] 83 | python_functions = ["test_*"] 84 | 85 | [tool.setuptools.packages.find] 86 | where = ["."] 87 | include = ["inference_perf*", "deploy*"] 88 | 89 | [[tool.mypy.overrides]] 90 | module = ["datasets.*"] 91 | ignore_missing_imports = true 92 | 93 | [tool.mypy] 94 | disable_error_code = ["attr-defined"] 95 | 96 | [build-system] 97 | requires = ["setuptools>=61"] 98 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | -e . # Install requirements from pyproject 2 | -------------------------------------------------------------------------------- /tests/apis/test_chat.py: -------------------------------------------------------------------------------- 1 | from inference_perf.apis.chat import ChatCompletionAPIData, ChatMessage 2 | from inference_perf.config import APIType 3 | 4 | 5 | def test_chat_completion_api_data() -> None: 6 | data = ChatCompletionAPIData(messages=[ChatMessage(role="user", content="Hello, world!")]) 7 | assert data.get_api_type() == APIType.Chat 8 | assert len(data.messages) == 1 9 | assert data.to_payload("test-model", 100, False) == { 10 | "model": "test-model", 11 | "messages": [{"role": "user", "content": "Hello, world!"}], 12 | "max_tokens": 100, 13 | "ignore_eos": False, 14 | } 15 | -------------------------------------------------------------------------------- /tests/apis/test_completion.py: -------------------------------------------------------------------------------- 1 | from inference_perf.apis.completion import CompletionAPIData 2 | from inference_perf.config import APIType 3 | 4 | 5 | def test_completion_api_data() -> None: 6 | data = CompletionAPIData(prompt="Hello, world!") 7 | assert data.get_api_type() == APIType.Completion 8 | assert data.prompt == "Hello, world!" 9 | assert data.to_payload("test-model", 100, False) == { 10 | "model": "test-model", 11 | "prompt": "Hello, world!", 12 | "max_tokens": 100, 13 | "ignore_eos": False, 14 | } 15 | -------------------------------------------------------------------------------- /tests/test_config.py: -------------------------------------------------------------------------------- 1 | from inference_perf.config import read_config, deep_merge, Config, APIType, DataGenType, LoadType, MetricsClientType 2 | import os 3 | 4 | 5 | def test_read_config() -> None: 6 | config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "config.yml")) 7 | config = read_config(["-c", config_path]) 8 | 9 | assert isinstance(config, Config) 10 | assert config.api == APIType.Chat 11 | assert config.data.type == DataGenType.ShareGPT 12 | assert config.load.type == LoadType.CONSTANT 13 | if config.metrics: 14 | assert config.metrics.type == MetricsClientType.PROMETHEUS 15 | assert config.report.request_lifecycle.summary is True 16 | 17 | 18 | def test_deep_merge() -> None: 19 | base = { 20 | "api": APIType.Chat, 21 | "data": {"type": DataGenType.ShareGPT}, 22 | "load": {"type": LoadType.CONSTANT}, 23 | "metrics": {"type": MetricsClientType.PROMETHEUS}, 24 | } 25 | override = { 26 | "data": {"type": DataGenType.Mock}, 27 | "load": {"type": LoadType.POISSON}, 28 | } 29 | merged = deep_merge(base, override) 30 | 31 | assert merged["api"] == APIType.Chat 32 | assert merged["data"]["type"] == DataGenType.Mock 33 | assert merged["load"]["type"] == LoadType.POISSON 34 | assert merged["metrics"]["type"] == MetricsClientType.PROMETHEUS 35 | --------------------------------------------------------------------------------