├── .github
    ├── ISSUE_TEMPLATE
    │   ├── blank_issue.md
    │   ├── bug_request.md
    │   ├── config.yml
    │   └── feature_request.md
    ├── changelog-config.json
    └── workflows
    │   ├── docker-build.yml
    │   ├── format.yml
    │   ├── release.yml
    │   └── unit_test.yml
├── .gitignore
├── .pre-commit-config.yaml
├── CONTRIBUTING.md
├── Dockerfile
├── LICENSE
├── Makefile
├── OWNERS
├── OWNERS_ALIASES
├── README.md
├── RELEASE.md
├── SECURITY.md
├── SECURITY_CONTACTS
├── code-of-conduct.md
├── config.yml
├── deploy
    ├── README.md
    └── manifests.yaml
├── docs
    ├── design.md
    └── images
    │   └── design.png
├── examples
    └── vllm
    │   ├── config-random.yml
    │   ├── config-shared-prefix.yml
    │   ├── config-synthetic.yml
    │   ├── config.yml
    │   └── vllm_server.ipynb
├── inference_perf
    ├── __init__.py
    ├── apis
    │   ├── __init__.py
    │   ├── base.py
    │   ├── chat.py
    │   └── completion.py
    ├── client
    │   ├── filestorage
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   └── gcs.py
    │   ├── metricsclient
    │   │   ├── README.md
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   ├── mock_client.py
    │   │   └── prometheus_client.py
    │   ├── modelserver
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   ├── mock_client.py
    │   │   └── vllm_client.py
    │   └── requestdatacollector
    │   │   ├── __init__.py
    │   │   ├── base.py
    │   │   └── local.py
    ├── config.py
    ├── datagen
    │   ├── __init__.py
    │   ├── base.py
    │   ├── hf_sharegpt_datagen.py
    │   ├── mock_datagen.py
    │   ├── random_datagen.py
    │   ├── shared_prefix_datagen.py
    │   └── synthetic_datagen.py
    ├── loadgen
    │   ├── __init__.py
    │   ├── load_generator.py
    │   └── load_timer.py
    ├── main.py
    ├── reportgen
    │   ├── __init__.py
    │   └── base.py
    └── utils
    │   ├── __init__.py
    │   ├── custom_tokenizer.py
    │   ├── distribution.py
    │   └── report_file.py
├── pdm.lock
├── pyproject.toml
├── requirements.txt
└── tests
    ├── apis
        ├── test_chat.py
        └── test_completion.py
    └── test_config.py


/.github/ISSUE_TEMPLATE/blank_issue.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Blank Issue
3 | about: Create a new issue from scratch
4 | title: ''
5 | labels: needs-triage
6 | assignees: ''
7 | 
8 | ---


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug Report
 3 | about: Report a bug you encountered
 4 | title: ''
 5 | labels: kind/bug, needs-triage
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | <!-- 
11 | Please use this template while reporting a bug and provide as much info as possible.
12 | -->
13 | 
14 | **What happened**:
15 | 
16 | **What you expected to happen**:
17 | 
18 | **How to reproduce it (as minimally and precisely as possible)**:
19 | 
20 | **Anything else we need to know?**:
21 | 
22 | **Environment**:
23 | - inference-perf version:
24 | - config.yml (entire one printed by the benchmark run): 
25 | - cloud provider or hardware configuration:
26 | - others:


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: ''
 5 | labels: needs-triage
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | <!-- Please only use this template for submitting enhancement requests -->
11 | 
12 | **What would you like to be added**:
13 | 
14 | **Why is this needed**:


--------------------------------------------------------------------------------
/.github/changelog-config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "categories": [
 3 |     {
 4 |       "title": "🚀 Features",
 5 |       "labels": ["feature", "enhancement"]
 6 |     },
 7 |     {
 8 |       "title": "🐛 Bug Fixes",
 9 |       "labels": ["bug", "fix"]
10 |     },
11 |     {
12 |       "title": "📚 Documentation",
13 |       "labels": ["documentation", "docs"]
14 |     },
15 |     {
16 |       "title": "⚡️ Performance",
17 |       "labels": ["performance", "perf"]
18 |     },
19 |     {
20 |       "title": "🔧 Dependencies",
21 |       "labels": ["dependencies", "deps"]
22 |     }
23 |   ],
24 |   "template": "${{CHANGELOG}}\n\n## Docker Image\n\n${{DOCKER_IMAGE}}\n\n## Contributors\n\n${{CONTRIBUTORS}}",
25 |   "pr-template": "- ${{TITLE}} (#${{NUMBER}})"
26 | } 


--------------------------------------------------------------------------------
/.github/workflows/docker-build.yml:
--------------------------------------------------------------------------------
 1 | name: Docker Build and Push
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 | 
 8 | jobs:
 9 |   docker:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - name: Checkout code
13 |         uses: actions/checkout@v4
14 | 
15 |       - name: Set up Docker Buildx
16 |         uses: docker/setup-buildx-action@v3
17 | 
18 |       - name: Login to Quay.io
19 |         uses: docker/login-action@v3
20 |         with:
21 |           registry: quay.io
22 |           username: ${{ secrets.QUAY_USERNAME }}
23 |           password: ${{ secrets.QUAY_PASSWORD }}
24 | 
25 |       - name: Extract metadata (tags, labels) for Docker
26 |         id: meta
27 |         uses: docker/metadata-action@v5
28 |         with:
29 |           images: quay.io/inference-perf/inference-perf
30 |           tags: |
31 |             type=ref,event=branch
32 |             type=sha,format=short
33 |             type=raw,value=latest,enable=${{ github.ref == 'refs/heads/main' }}
34 | 
35 |       - name: Build and push Docker image
36 |         uses: docker/build-push-action@v5
37 |         with:
38 |           context: .
39 |           platforms: linux/amd64
40 |           push: true
41 |           tags: ${{ steps.meta.outputs.tags }}
42 |           labels: ${{ steps.meta.outputs.labels }}
43 |           cache-from: type=gha
44 |           cache-to: type=gha,mode=max 


--------------------------------------------------------------------------------
/.github/workflows/format.yml:
--------------------------------------------------------------------------------
 1 | name: Python Linting and Type Checks
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - 'feature/**'
 8 |   pull_request:
 9 | 
10 | jobs:
11 |   format-check:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - name: Checkout Code
15 |         uses: actions/checkout@v4
16 |       - name: Set up Python
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: '3.13'
20 |       - name: Do Linting and Type Checks
21 |         run: |
22 |           make validate
23 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yml:
--------------------------------------------------------------------------------
 1 | name: Release Processing
 2 | on:
 3 |   push:
 4 |     tags:
 5 |       - 'v*.*.*'  # Matches semantic versioning tags like v1.0.0
 6 | permissions:
 7 |   contents: write
 8 |   pull-requests: read
 9 | jobs:
10 |   build-and-publish:
11 |     runs-on: ubuntu-latest
12 |     steps:
13 |     - uses: actions/checkout@v4
14 |       with:
15 |         fetch-depth: 0  # Get full history for changelog
16 | 
17 |     - name: Set env variable for tag name
18 |       run: echo "RELEASE_VERSION=${GITHUB_REF#refs/tags/}" >> $GITHUB_ENV
19 | 
20 |     - name: Generate changelog
21 |       id: github_release
22 |       uses: mikepenz/release-changelog-builder-action@v3
23 |       with:
24 |         configuration: ".github/changelog-config.json"
25 |         ignorePreReleases: false
26 |       env:
27 |         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
28 | 
29 |     - name: Create GitHub Release
30 |       uses: softprops/action-gh-release@v1
31 |       with:
32 |         name: Release ${{ env.RELEASE_VERSION }}
33 |         body: |
34 |           # Release ${{ env.RELEASE_VERSION }}
35 | 
36 |           ## What's Changed
37 |           ${{ steps.github_release.outputs.changelog }}
38 |       env:
39 |         GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
40 | 
41 |   docker:
42 |     needs: build-and-publish  # Run after the release is created
43 |     runs-on: ubuntu-latest
44 |     steps:
45 |       - uses: actions/checkout@v4
46 | 
47 |       - name: Set up Docker Buildx
48 |         uses: docker/setup-buildx-action@v3
49 | 
50 |       - name: Login to Quay.io
51 |         uses: docker/login-action@v3
52 |         with:
53 |           registry: quay.io
54 |           username: ${{ secrets.QUAY_USERNAME }}
55 |           password: ${{ secrets.QUAY_PASSWORD }}
56 | 
57 |       - name: Extract metadata (tags, labels) for Docker
58 |         id: meta
59 |         uses: docker/metadata-action@v5
60 |         with:
61 |           images: quay.io/${{ secrets.QUAY_USERNAME }}/inference-perf
62 |           tags: |
63 |             type=raw,value=${{ github.ref_name }},enable=true
64 |             type=raw,value=latest,enable=true
65 | 
66 |       - name: Build and push Docker image
67 |         uses: docker/build-push-action@v5
68 |         with:
69 |           context: .
70 |           platforms: linux/amd64
71 |           push: true
72 |           tags: ${{ steps.meta.outputs.tags }}
73 |           labels: ${{ steps.meta.outputs.labels }}
74 |           cache-from: type=gha
75 |           cache-to: type=gha,mode=max 


--------------------------------------------------------------------------------
/.github/workflows/unit_test.yml:
--------------------------------------------------------------------------------
 1 | name: Unit Tests
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |       - main
 7 |       - 'feature/**'
 8 |   pull_request:
 9 | 
10 | jobs:
11 |   format-check:
12 |     runs-on: ubuntu-latest
13 |     steps:
14 |       - name: Checkout Code
15 |         uses: actions/checkout@v4
16 |       - name: Set up Python
17 |         uses: actions/setup-python@v5
18 |         with:
19 |           python-version: '3.13'
20 |       - name: Do Linting and Type Checks
21 |         run: |
22 |           make test
23 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # UV
 98 | #   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #uv.lock
102 | 
103 | # poetry
104 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
106 | #   commonly ignored for libraries.
107 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108 | #poetry.lock
109 | 
110 | # pdm
111 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112 | #pdm.lock
113 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114 | #   in version control.
115 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116 | .pdm.toml
117 | .pdm-python
118 | .pdm-build/
119 | .python-version
120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121 | __pypackages__/
122 | 
123 | # Celery stuff
124 | celerybeat-schedule
125 | celerybeat.pid
126 | 
127 | # SageMath parsed files
128 | *.sage.py
129 | 
130 | # Environments
131 | .env
132 | .venv
133 | env/
134 | venv/
135 | ENV/
136 | env.bak/
137 | venv.bak/
138 | 
139 | # Spyder project settings
140 | .spyderproject
141 | .spyproject
142 | 
143 | # Rope project settings
144 | .ropeproject
145 | 
146 | # mkdocs documentation
147 | /site
148 | 
149 | # mypy
150 | .mypy_cache/
151 | .dmypy.json
152 | dmypy.json
153 | 
154 | # Pyre type checker
155 | .pyre/
156 | 
157 | # pytype static type analyzer
158 | .pytype/
159 | 
160 | # Cython debug symbols
161 | cython_debug/
162 | 
163 | # PyCharm
164 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
167 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
168 | #.idea/
169 | 
170 | # Ruff stuff:
171 | .ruff_cache/
172 | 
173 | # PyPI configuration file
174 | .pypirc
175 | 
176 | # Test Reports
177 | *.json


--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
 1 | repos:
 2 | - repo: https://github.com/pdm-project/pdm
 3 |   rev: 2.22.3
 4 |   hooks:
 5 |     - id: pdm-lock-check
 6 |       name: check lock file matches pyproject
 7 | - repo: https://github.com/astral-sh/ruff-pre-commit
 8 |   rev: v0.9.4
 9 |   hooks:
10 |     - id: ruff
11 |       name: run the linter
12 |       args: [ --fix ]
13 |     - id: ruff-format
14 |       name: run the formatter
15 | - repo: https://github.com/pre-commit/mirrors-mypy
16 |   rev: v1.14.1
17 |   hooks:
18 |     - id: mypy
19 |       name: run static type check
20 |       args: [--strict, --ignore-missing-imports]
21 | 


--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
 1 | # Contributing Guidelines
 2 | 
 3 | Welcome to Kubernetes. We are excited about the prospect of you joining our [community](https://git.k8s.io/community)! The Kubernetes community abides by the CNCF [code of conduct](code-of-conduct.md). Here is an excerpt:
 4 | 
 5 | _As contributors and maintainers of this project, and in the interest of fostering an open and welcoming community, we pledge to respect all people who contribute through reporting issues, posting feature requests, updating documentation, submitting pull requests or patches, and other activities._
 6 | 
 7 | ## Getting Started
 8 | 
 9 | We have full documentation on how to get started contributing here:
10 | 
11 | <!---
12 | If your repo has certain guidelines for contribution, put them here ahead of the general k8s resources
13 | -->
14 | 
15 | - [Contributor License Agreement](https://git.k8s.io/community/CLA.md) - Kubernetes projects require that you sign a Contributor License Agreement (CLA) before we can accept your pull requests
16 | - [Kubernetes Contributor Guide](https://k8s.dev/guide) - Main contributor documentation, or you can just jump directly to the [contributing page](https://k8s.dev/docs/guide/contributing/)
17 | - [Contributor Cheat Sheet](https://k8s.dev/cheatsheet) - Common resources for existing developers
18 | 
19 | ## Mentorship
20 | 
21 | - [Mentoring Initiatives](https://k8s.dev/community/mentoring) - We have a diverse set of mentorship programs available that are always looking for volunteers!
22 | 
23 | ## Contact Information
24 | 
25 | - [Slack](https://kubernetes.slack.com/messages/sig-scalability)
26 | - [Mailing List](https://groups.google.com/forum/#!forum/kubernetes-sig-scale)
27 | 


--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
 1 | FROM python:3.12.9-slim-bookworm as dev
 2 | 
 3 | RUN apt-get update -y \
 4 |     && apt-get install -y python3-pip
 5 | 
 6 | # Upgrade pip
 7 | RUN pip3 install --upgrade pip
 8 | 
 9 | # Set working directory
10 | WORKDIR /workspace
11 | 
12 | # Copy project files
13 | COPY . /workspace
14 | 
15 | # Install dependencies
16 | RUN pip install -e .
17 | 
18 | # Run inference-perf
19 | CMD ["inference-perf", "--config_file", "config.yml"]
20 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "{}"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright {yyyy} {name of copyright owner}
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | VENV := .venv
 2 | 
 3 | # Format Python code with ruff format
 4 | .PHONY: format
 5 | format:
 6 | 	@echo "Formatting Python files with ruff format..."
 7 | 	$(VENV)/bin/ruff format
 8 | 
 9 | # Run ruff check to lint Python code in the whole repository
10 | .PHONY: lint
11 | lint:
12 | 	@echo "Linting Python files with ruff check..."
13 | 	$(VENV)/bin/ruff check
14 | 
15 | # Perform type checking
16 | .PHONY: type-check
17 | type-check:
18 | 	@echo "Running type checking with mypy..."
19 | 	$(VENV)/bin/mypy --strict ./inference_perf ./tests
20 | 
21 | # Check for and install dependencies
22 | .PHONY: all-deps
23 | all-deps: install-deps install-dev-deps
24 | 
25 | .PHONY:
26 | install-deps:
27 | 	@echo "Creating virtual environment if it doesn't exist..."
28 | 	@if [ ! -d $(VENV) ]; then \
29 | 	    python3 -m venv $(VENV); \
30 | 	fi
31 | 	@echo "Activating virtual environment and installing dependencies..."
32 | 	$(VENV)/bin/pip install --upgrade pip
33 | 	$(VENV)/bin/pip install -e .
34 | 
35 | .PHONY:
36 | install-dev-deps: install-deps
37 | 	@echo "Installing development dependencies..."
38 | 	$(VENV)/bin/pip install -e .[dev]
39 | 
40 | .PHONY: unit-test
41 | unit-test:
42 | 	$(VENV)/bin/pytest
43 | 
44 | .PHONY: validate
45 | validate: install-dev-deps format lint type-check
46 | 
47 | .PHONY: test
48 | test: install-dev-deps unit-test
49 | 
50 | 


--------------------------------------------------------------------------------
/OWNERS:
--------------------------------------------------------------------------------
1 | # See the OWNERS docs at https://go.k8s.io/owners
2 | 
3 | approvers:
4 | - inference-perf-maintainers
5 | - wg-serving-leads
6 | 


--------------------------------------------------------------------------------
/OWNERS_ALIASES:
--------------------------------------------------------------------------------
 1 | # See the OWNERS docs: https://git.k8s.io/community/contributors/guide/owners.md
 2 | # This file should be kept in sync with k/org.
 3 | 
 4 | aliases:
 5 |   inference-perf-maintainers:
 6 |   - achandrasekar
 7 |   - wangchen615
 8 |   - SachinVarghese
 9 | 
10 |   wg-serving-leads:
11 |   - ArangoGutierrez
12 |   - Jeffwan
13 |   - SergeyKanzhelev
14 |   - terrytangyuan
15 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Inference Perf
 2 | 
 3 | The Inference Perf project aims to provide GenAI inference performance benchmarking tool. It came out of [wg-serving](https://github.com/kubernetes/community/tree/master/wg-serving) and is sponsored by [SIG Scalability](https://github.com/kubernetes/community/blob/master/sig-scalability/README.md#inference-perf). See the [proposal](https://github.com/kubernetes-sigs/wg-serving/tree/main/proposals/013-inference-perf) for more info.
 4 | 
 5 | ## Status
 6 | 
 7 | This project is currently in development.
 8 | 
 9 | ## Getting Started
10 | 
11 | ### Configuration
12 | 
13 | You can configure inference-perf to run with different data generation and load generation configurations today. Please see `config.yml` and examples in `/examples`.
14 | 
15 | Supported datasets include the following:
16 | - ShareGPT (for a real world conversational dataset)
17 | - Synthetic (for specific input / output distributions)
18 | - Mock (for testing)
19 | 
20 | Similarly load generation can be configured to run with different request rates and durations. You can also run multiple stages with different request rates and durations within a single run.
21 | 
22 | ### Run locally
23 | 
24 | - Setup a virtual environment and install inference-perf
25 | 
26 |     ```
27 |     pip install .
28 |     ```
29 | 
30 | - Run inference-perf CLI with a configuration file
31 | 
32 |     ```
33 |     inference-perf --config_file config.yml
34 |     ```
35 | 
36 | - See more [examples](./examples/)
37 | 
38 | ### Run in a Docker container
39 | 
40 | - Build the container
41 | 
42 |     ```
43 |     docker build -t inference-perf .
44 |     ```
45 | 
46 | - Run the container
47 | 
48 |     ```
49 |     docker run -it --rm -v $(pwd)/config.yml:/workspace/config.yml inference-perf
50 | 
51 |     ```
52 | 
53 | ### Run in Kubernetes cluster
54 | 
55 | Refer to the [guide](./deploy/README.md) in `/deploy`.
56 | 
57 | ## Contributing
58 | 
59 | Our community meeting is weekly on Thursdays alternating betweem 09:00 and 11:30 PDT ([Zoom Link](https://zoom.us/j/9955436256?pwd=Z2FQWU1jeDZkVC9RRTN4TlZyZTBHZz09), [Meeting Notes](https://docs.google.com/document/d/15XSF8q4DShcXIiExDfyiXxAYQslCmOmO2ARSJErVTak/edit?usp=sharing), [Meeting Recordings](https://www.youtube.com/playlist?list=PL69nYSiGNLP30qNanabU75ayPK7OPNAAS)). 
60 | 
61 | We currently utilize the [#inference-perf](https://kubernetes.slack.com/?redir=%2Fmessages%2Finference-perf) channel in Kubernetes Slack workspace for communications.
62 | 
63 | Contributions are welcomed, thanks for joining us!
64 | 
65 | ### Code of conduct
66 | 
67 | Participation in the Kubernetes community is governed by the [Kubernetes Code of Conduct](code-of-conduct.md).
68 | 


--------------------------------------------------------------------------------
/RELEASE.md:
--------------------------------------------------------------------------------
 1 | # Release Process
 2 | 
 3 | The Kubernetes Template Project is released on an as-needed basis. The process is as follows:
 4 | 
 5 | 1. An issue is proposing a new release with a changelog since the last release
 6 | 1. All [OWNERS](OWNERS) must LGTM this release
 7 | 1. An OWNER runs `git tag -s $VERSION` and inserts the changelog and pushes the tag with `git push $VERSION`
 8 | 1. The release issue is closed
 9 | 1. An announcement email is sent to `dev@kubernetes.io` with the subject `[ANNOUNCE] kubernetes-template-project $VERSION is released`
10 | 


--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
 1 | # Security Policy
 2 | 
 3 | ## Security Announcements
 4 | 
 5 | Join the [kubernetes-security-announce] group for security and vulnerability announcements.
 6 | 
 7 | ## Reporting a Vulnerability
 8 | 
 9 | Instructions for reporting a vulnerability can be found on the
10 | [Kubernetes Security and Disclosure Information] page.
11 | 
12 | ## Supported Versions
13 | 
14 | Information about supported Kubernetes versions can be found on the
15 | [Kubernetes version and version skew support policy] page on the Kubernetes website.
16 | 
17 | [kubernetes-security-announce]: https://groups.google.com/forum/#!forum/kubernetes-security-announce
18 | [Kubernetes version and version skew support policy]: https://kubernetes.io/docs/setup/release/version-skew-policy/#supported-versions
19 | [Kubernetes Security and Disclosure Information]: https://kubernetes.io/docs/reference/issues-security/security/#report-a-vulnerability
20 | 


--------------------------------------------------------------------------------
/SECURITY_CONTACTS:
--------------------------------------------------------------------------------
 1 | # Defined below are the security contacts for this repo.
 2 | #
 3 | # They are the contact point for the Security Response Committee to reach out
 4 | # to for triaging and handling of incoming issues.
 5 | #
 6 | # The below names agree to abide by the
 7 | # [Embargo Policy](https://git.k8s.io/security/private-distributors-list.md#embargo-policy)
 8 | # and will be removed and replaced if they violate that agreement.
 9 | #
10 | # DO NOT REPORT SECURITY VULNERABILITIES DIRECTLY TO THESE NAMES, FOLLOW THE
11 | # INSTRUCTIONS AT https://kubernetes.io/security/
12 | 
13 | ArangoGutierrez
14 | Jeffwan
15 | SergeyKanzhelev
16 | terrytangyuan
17 | 


--------------------------------------------------------------------------------
/code-of-conduct.md:
--------------------------------------------------------------------------------
1 | # Kubernetes Community Code of Conduct
2 | 
3 | Please refer to our [Kubernetes Community Code of Conduct](https://git.k8s.io/community/code-of-conduct.md)
4 | 


--------------------------------------------------------------------------------
/config.yml:
--------------------------------------------------------------------------------
 1 | load:
 2 |   type: constant
 3 |   stages:
 4 |   - rate: 1
 5 |     duration: 30
 6 | api: chat
 7 | server:
 8 |   type: vllm
 9 |   model_name: HuggingFaceTB/SmolLM2-135M-Instruct
10 |   base_url: http://0.0.0.0:8000
11 |   ignore_eos: true
12 | tokenizer:
13 |   pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
14 | data:
15 |   type: shareGPT
16 | metrics:
17 |   type: prometheus
18 |   prometheus:
19 |     url: http://localhost:9090
20 |     scrape_interval: 15
21 | report:
22 |   request_lifecycle:
23 |     summary: true
24 |     per_stage: true
25 |     per_request: false
26 |   prometheus:
27 |     summary: true
28 |     per_stage: false


--------------------------------------------------------------------------------
/deploy/README.md:
--------------------------------------------------------------------------------
 1 | ## Run `inference-perf` as a Job in a Kubernetes cluster
 2 | 
 3 | This guide explains how to deploy `inference-perf` to a Kubernetes cluster as a job.
 4 | 
 5 | > [!NOTE]
 6 | > There is currently no support for persisting output reports, all outputs are currently printed to standard output, please refer to issue [#59](https://github.com/kubernetes-sigs/inference-perf/issues/59)
 7 | 
 8 | ### Setup
 9 | 
10 | Since public container images are not actively being published, you'll need to build the `inference-perf` image yourself. Follow the [official guide](https://github.com/kubernetes-sigs/inference-perf?tab=readme-ov-file#run-in-a-docker-container) to build the container.
11 | 
12 | Once built, push the image to your preferred container registry:
13 | - [Artifact Registry (Google Cloud)](https://cloud.google.com/artifact-registry/docs/docker/pushing-and-pulling)
14 | - [Docker Hub](https://docs.docker.com/get-started/introduction/build-and-push-first-image/)
15 | 
16 | Take note of the image name once successfully pushed, replace `<your image here>` in `manifests.yaml` with this image name.
17 | 
18 | Running `inference-perf` requires an input file. This should be provided via a Kubernetes ConfigMap. Update the `config.yml` as needed then create the ConfigMap by running at the root of this repo:
19 | 
20 | ```bash
21 | kubectl create configmap inference-perf-config --from-file=config.yml
22 | ```
23 | 
24 | ### Instructions
25 | 
26 | Apply the job by running the following:
27 | ```bash
28 | kubectl apply -f manifests.yaml
29 | ```
30 | 
31 | ### Viewing Results
32 | 
33 | Currently, inference-perf outputs benchmark results to standard output only. To view the results after the job completes, run:
34 | ```bash
35 | kubectl wait --for=condition=complete job/inference-perf && kubectl logs jobs/inference-perf
36 | ```


--------------------------------------------------------------------------------
/deploy/manifests.yaml:
--------------------------------------------------------------------------------
 1 | apiVersion: batch/v1
 2 | kind: Job
 3 | metadata:
 4 |   name: inference-perf
 5 |   labels:
 6 |     app: inference-perf
 7 | spec:
 8 |   template:
 9 |     metadata:
10 |       labels:
11 |         app: inference-perf
12 |     spec:
13 |       containers:
14 |         - name: inference-perf
15 |           image: <your image here>
16 |           imagePullPolicy: Always
17 |           command: ["inference-perf"]
18 |           args: ["--config_file", "/etc/config/config.yml"]
19 |           volumeMounts:
20 |             - name: config-volume
21 |               mountPath: /etc/config
22 |               readOnly: true
23 |       restartPolicy: Never
24 |       volumes:
25 |         - name: config-volume
26 |           configMap:
27 |             name: inference-perf-config


--------------------------------------------------------------------------------
/docs/design.md:
--------------------------------------------------------------------------------
 1 | # Design
 2 | 
 3 | This document describes the high level design for the tool. It includes the
 4 | following components.
 5 | 
 6 | ## Dataset Preprocessor
 7 | 
 8 | Dataset Preprocessor takes in a known dataset like ShareGPT or OpenOrca as the
 9 | input and pre-processes them by making sure the prompt length and generation
10 | length are aligned with the user input to support different options like fixed
11 | input / output length tests, variable length tests (larger input / smaller
12 | output and the vice versa). This allows us to support different GenAI use cases
13 | like chat completion, summarization, code completion, etc. depending on the
14 | dataset and the benchmarking user’s inputs.
15 | 
16 | ## Load Generator
17 | 
18 | Load Generator is the component which generates different traffic patterns based
19 | on user input. This can include a fixed RPS test for a predetermined amount of
20 | time or include a way to generate bursts in traffic or other traffic patterns as
21 | desired for autoscaling and other use cases.
22 | 
23 | ## Request Processor
24 | 
25 | Request Processor provides a way to support different model servers and their
26 | corresponding request payload with different configurable parameters. This makes
27 | our tool model server agnostic and provides a generic way to benchmark different
28 | model servers and produce apples to apples comparison between them. This
29 | component will also support different protocols like http and grpc and options
30 | like request streaming which is important to produce time to first token (TTFT)
31 | metric.
32 | 
33 | ## Response Processor / Data Collector
34 | 
35 | Response Processor / Data Collector component allows us to process the response
36 | and measure the actual performance of the model server in terms of request
37 | latency, TPOT, TTFT and throughput.
38 | 
39 | ## Report Generator / Metrics Exporter
40 | 
41 | Report Generator / Metrics Exporter generates a report based on the data
42 | collected during benchmarking. It can also export the different metrics that we
43 | collected during benchmarking as metrics into Prometheus which can then be
44 | consumed by other monitoring or visualization solutions.
45 | 
46 | ![benchmarking-tool-architecture](./images/design.png)
47 | 
48 | ## Metrics to Collect
49 | 
50 | The following are the essential metrics that we want to collect using the
51 | benchmarking tool.
52 | 
53 | *   Throughput
54 |     *   Output tokens / second
55 |     *   Input tokens / second
56 |     *   Requests / second
57 | *   Latency at different percentiles (mean, median, p90, p99)
58 |     *   Time per output token (TPOT)
59 |     *   Inter-token latency (ITL)
60 |     *   Time to first token (TTFT)
61 |     *   Time per request
62 | *   Request metrics (mean, median, p90, p99)
63 |     *   Prompt tokens
64 |     *   Output tokens
65 | 
66 | Optionally we also want to collect specific accelerator and model server metrics.
67 | 
68 | *   Accelerator metrics (mean, median, p90, p99)
69 |     *   Accelerator utilization (duty cycle)
70 |     *   Accelerator memory utilization
71 |     *   Accelerator memory bandwidth utilization
72 |     *   Accelerator power usage
73 | *   Model server metrics (mean, median, p90, p99)
74 |     *   Batch size
75 |     *   Queue size
76 |     *   KV cache usage
77 | 


--------------------------------------------------------------------------------
/docs/images/design.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/kubernetes-sigs/inference-perf/14ea94163c5942119ca7e3fc5633e396a523b06e/docs/images/design.png


--------------------------------------------------------------------------------
/examples/vllm/config-random.yml:
--------------------------------------------------------------------------------
 1 | load:
 2 |   type: constant
 3 |   stages:
 4 |   - rate: 1
 5 |     duration: 30
 6 | api: completion
 7 | server:
 8 |   type: vllm
 9 |   model_name: HuggingFaceTB/SmolLM2-135M-Instruct
10 |   base_url: http://0.0.0.0:8000
11 |   ignore_eos: true
12 | tokenizer:
13 |   pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
14 | data:
15 |   type: random
16 |   input_distribution:
17 |     min: 10
18 |     max: 100
19 |     mean: 50
20 |     std: 10
21 |     total_count: 100
22 |   output_distribution:
23 |     min: 10
24 |     max: 512
25 |     mean: 256
26 |     std: 100
27 |     total_count: 100
28 | metrics:
29 |   type: prometheus
30 |   prometheus:
31 |     url: http://localhost:9090
32 |     scrape_interval: 15
33 | report:
34 |   request_lifecycle:
35 |     summary: true
36 |     per_stage: true
37 |     per_request: true


--------------------------------------------------------------------------------
/examples/vllm/config-shared-prefix.yml:
--------------------------------------------------------------------------------
 1 | load:
 2 |   type: constant
 3 |   interval: 15
 4 |   stages:
 5 |   - rate: 1
 6 |     duration: 30
 7 |   - rate: 2
 8 |     duration: 30
 9 | api: completion
10 | server:
11 |   type: vllm
12 |   model_name: HuggingFaceTB/SmolLM2-135M-Instruct
13 |   base_url: http://0.0.0.0:8000
14 |   ignore_eos: true
15 | tokenizer:
16 |   pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
17 | data:
18 |   type: shared_prefix
19 |   shared_prefix:
20 |     num_groups: 10                # Number of distinct shared prefixes
21 |     num_prompts_per_group: 10     # Number of unique questions per shared prefix
22 |     system_prompt_len: 100        # Length of the shared prefix (in tokens)
23 |     question_len: 50              # Length of the unique question part (in tokens)
24 |     output_len: 50                # Target length for the model's generated output (in tokens)
25 | metrics:
26 |   type: prometheus
27 |   prometheus:
28 |     url: http://localhost:9090
29 |     scrape_interval: 15
30 | report:
31 |   request_lifecycle:
32 |     summary: true
33 |     per_stage: true
34 |     per_request: true


--------------------------------------------------------------------------------
/examples/vllm/config-synthetic.yml:
--------------------------------------------------------------------------------
 1 | load:
 2 |   type: constant
 3 |   interval: 15
 4 |   stages:
 5 |   - rate: 1
 6 |     duration: 30
 7 |   - rate: 2
 8 |     duration: 30
 9 | api: completion
10 | server:
11 |   type: vllm
12 |   model_name: HuggingFaceTB/SmolLM2-135M-Instruct
13 |   base_url: http://0.0.0.0:8000
14 | tokenizer:
15 |   pretrained_model_name_or_path: HuggingFaceTB/SmolLM2-135M-Instruct
16 | data:
17 |   type: synthetic
18 |   input_distribution:
19 |     min: 10
20 |     max: 100
21 |     mean: 50
22 |     std: 10
23 |     total_count: 100
24 |   output_distribution:
25 |     min: 10
26 |     max: 100
27 |     mean: 50
28 |     std: 10
29 |     total_count: 100
30 | metrics:
31 |   type: prometheus
32 |   prometheus:
33 |     url: http://localhost:9090
34 |     scrape_interval: 15
35 | report:
36 |   request_lifecycle:
37 |     summary: true
38 |     per_stage: true
39 |     per_request: true
40 | 


--------------------------------------------------------------------------------
/examples/vllm/config.yml:
--------------------------------------------------------------------------------
 1 | data:
 2 |   type: shareGPT
 3 | load:
 4 |   type: constant
 5 |   stages:
 6 |   - rate: 1
 7 |     duration: 30
 8 | api: chat
 9 | server:
10 |   type: vllm
11 |   model_name: HuggingFaceTB/SmolLM2-135M-Instruct
12 |   base_url: http://0.0.0.0:8000


--------------------------------------------------------------------------------
/examples/vllm/vllm_server.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Benchmark vLLM Server with inference-perf"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {},
 13 |    "source": [
 14 |     "### Local vLLM Setup using docker"
 15 |    ]
 16 |   },
 17 |   {
 18 |    "cell_type": "markdown",
 19 |    "metadata": {},
 20 |    "source": [
 21 |     "Run vLLM Server as a docker container with the model HuggingFace `HuggingFaceTB/SmolLM2-135M-Instruct`"
 22 |    ]
 23 |   },
 24 |   {
 25 |    "cell_type": "code",
 26 |    "execution_count": 1,
 27 |    "metadata": {},
 28 |    "outputs": [
 29 |     {
 30 |      "name": "stdout",
 31 |      "output_type": "stream",
 32 |      "text": [
 33 |       "71c1f998ef3488239cf88c97e0084e6287c87df3f3de3842e47c3751acc43329\n"
 34 |      ]
 35 |     }
 36 |    ],
 37 |    "source": [
 38 |     "!export MODEL_NAME=\"HuggingFaceTB/SmolLM2-135M-Instruct\" && \\\n",
 39 |     "    docker run --name vllm-server -d --runtime nvidia --gpus all \\\n",
 40 |     "    -v ~/.cache/huggingface:/root/.cache/huggingface \\\n",
 41 |     "    -p 8000:8000 vllm/vllm-openai:latest \\\n",
 42 |     "    --model ${MODEL_NAME}"
 43 |    ]
 44 |   },
 45 |   {
 46 |    "cell_type": "markdown",
 47 |    "metadata": {},
 48 |    "source": [
 49 |     "**Note**: Configure [vLLM engine arguments](https://docs.vllm.ai/en/latest/serving/engine_args.html#engine-args) like `--max-model-len` and  `--max-num-seqs` according to local compute capacity"
 50 |    ]
 51 |   },
 52 |   {
 53 |    "cell_type": "markdown",
 54 |    "metadata": {},
 55 |    "source": [
 56 |     "### Benchmark with inference_perf"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "Create a configuration file for the test using `shareGPT` data and run the constant rate test for `30s`. You can also use the synthetic dataset if you prefer by running with the `config-synthetic.yml` file instead."
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 2,
 69 |    "metadata": {},
 70 |    "outputs": [
 71 |     {
 72 |      "name": "stdout",
 73 |      "output_type": "stream",
 74 |      "text": [
 75 |       "data:\n",
 76 |       "  type: shareGPT\n",
 77 |       "load:\n",
 78 |       "  type: constant\n",
 79 |       "  rate: 1\n",
 80 |       "  duration: 30\n",
 81 |       "vllm:\n",
 82 |       "  api: chat\n",
 83 |       "  model_name: HuggingFaceTB/SmolLM2-135M-Instruct\n",
 84 |       "  url: http://0.0.0.0:8000"
 85 |      ]
 86 |     }
 87 |    ],
 88 |    "source": [
 89 |     "!cat config.yml"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "code",
 94 |    "execution_count": 3,
 95 |    "metadata": {},
 96 |    "outputs": [
 97 |     {
 98 |      "name": "stdout",
 99 |      "output_type": "stream",
100 |      "text": [
101 |       "Using configuration from: config.yml\n",
102 |       "Run started\n",
103 |       "Run completed\n",
104 |       "\n",
105 |       "\n",
106 |       "Generating Report ..\n",
107 |       "total_requests: 38\n",
108 |       "avg_prompt_tokens: 2.763157894736842\n",
109 |       "avg_output_tokens: 28.94736842105263\n",
110 |       "avg_time_per_request: 0.11538009351045873\n"
111 |      ]
112 |     }
113 |    ],
114 |    "source": [
115 |     "!inference-perf --config_file config.yml"
116 |    ]
117 |   },
118 |   {
119 |    "cell_type": "markdown",
120 |    "metadata": {},
121 |    "source": [
122 |     "### Cleanup"
123 |    ]
124 |   },
125 |   {
126 |    "cell_type": "markdown",
127 |    "metadata": {},
128 |    "source": [
129 |     "Delete vLLM Server docker processes"
130 |    ]
131 |   },
132 |   {
133 |    "cell_type": "code",
134 |    "execution_count": 4,
135 |    "metadata": {},
136 |    "outputs": [
137 |     {
138 |      "name": "stdout",
139 |      "output_type": "stream",
140 |      "text": [
141 |       "vllm-server\n",
142 |       "vllm-server\n"
143 |      ]
144 |     }
145 |    ],
146 |    "source": [
147 |     "!docker stop vllm-server && docker rm vllm-server"
148 |    ]
149 |   }
150 |  ],
151 |  "metadata": {
152 |   "kernelspec": {
153 |    "display_name": ".venv",
154 |    "language": "python",
155 |    "name": "python3"
156 |   },
157 |   "language_info": {
158 |    "codemirror_mode": {
159 |     "name": "ipython",
160 |     "version": 3
161 |    },
162 |    "file_extension": ".py",
163 |    "mimetype": "text/x-python",
164 |    "name": "python",
165 |    "nbconvert_exporter": "python",
166 |    "pygments_lexer": "ipython3",
167 |    "version": "3.11.2"
168 |   }
169 |  },
170 |  "nbformat": 4,
171 |  "nbformat_minor": 2
172 | }
173 | 


--------------------------------------------------------------------------------
/inference_perf/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .main import main_cli
15 | 
16 | __all__ = ["main_cli"]
17 | 


--------------------------------------------------------------------------------
/inference_perf/apis/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .base import InferenceAPIData, InferenceInfo, RequestLifecycleMetric, ErrorResponseInfo
15 | from .chat import ChatCompletionAPIData, ChatMessage
16 | from .completion import CompletionAPIData
17 | 
18 | __all__ = [
19 |     "InferenceAPIData",
20 |     "InferenceInfo",
21 |     "RequestLifecycleMetric",
22 |     "ErrorResponseInfo",
23 |     "ChatCompletionAPIData",
24 |     "ChatMessage",
25 |     "CompletionAPIData",
26 | ]
27 | 


--------------------------------------------------------------------------------
/inference_perf/apis/base.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from abc import abstractmethod
16 | from typing import Any, Optional
17 | from pydantic import BaseModel
18 | from inference_perf.utils.custom_tokenizer import CustomTokenizer
19 | from inference_perf.config import APIType
20 | 
21 | 
22 | class InferenceInfo(BaseModel):
23 |     input_tokens: int = 0
24 |     output_tokens: int = 0
25 | 
26 | 
27 | class ErrorResponseInfo(BaseModel):
28 |     error_type: str
29 |     error_msg: str
30 | 
31 | 
32 | class RequestLifecycleMetric(BaseModel):
33 |     stage_id: Optional[int] = None
34 |     start_time: float
35 |     end_time: float
36 |     request_data: str
37 |     response_data: Optional[str] = None
38 |     info: InferenceInfo
39 |     error: Optional[ErrorResponseInfo]
40 | 
41 | 
42 | class InferenceAPIData(BaseModel):
43 |     @abstractmethod
44 |     def get_api_type(self) -> APIType:
45 |         raise NotImplementedError
46 | 
47 |     @abstractmethod
48 |     def get_route(self) -> str:
49 |         raise NotImplementedError
50 | 
51 |     @abstractmethod
52 |     def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool) -> dict[str, Any]:
53 |         raise NotImplementedError
54 | 
55 |     @abstractmethod
56 |     def process_response(self, data: dict[str, Any], tokenizer: CustomTokenizer) -> InferenceInfo:
57 |         raise NotImplementedError
58 | 


--------------------------------------------------------------------------------
/inference_perf/apis/chat.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from typing import Any, List
16 | from pydantic import BaseModel
17 | from inference_perf.apis import InferenceAPIData, InferenceInfo
18 | from inference_perf.utils.custom_tokenizer import CustomTokenizer
19 | from inference_perf.config import APIType
20 | 
21 | 
22 | class ChatMessage(BaseModel):
23 |     role: str
24 |     content: str
25 | 
26 | 
27 | class ChatCompletionAPIData(InferenceAPIData):
28 |     messages: List[ChatMessage]
29 |     max_tokens: int = 0
30 | 
31 |     def get_api_type(self) -> APIType:
32 |         return APIType.Chat
33 | 
34 |     def get_route(self) -> str:
35 |         return "/v1/chat/completions"
36 | 
37 |     def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool) -> dict[str, Any]:
38 |         if self.max_tokens == 0:
39 |             self.max_tokens = max_tokens
40 |         return {
41 |             "model": model_name,
42 |             "messages": [{"role": m.role, "content": m.content} for m in self.messages],
43 |             "max_tokens": self.max_tokens,
44 |             "ignore_eos": ignore_eos,
45 |         }
46 | 
47 |     def process_response(self, data: dict[str, Any], tokenizer: CustomTokenizer) -> InferenceInfo:
48 |         choices = data.get("choices", [])
49 |         output_text = choices[0].get("message", {}).get("content", "")
50 |         output_len = tokenizer.count_tokens(output_text)
51 |         return InferenceInfo(
52 |             input_tokens=0,
53 |             output_tokens=output_len,
54 |         )
55 | 


--------------------------------------------------------------------------------
/inference_perf/apis/completion.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | 
16 | from typing import Any
17 | from inference_perf.apis import InferenceAPIData, InferenceInfo
18 | from inference_perf.utils.custom_tokenizer import CustomTokenizer
19 | from inference_perf.config import APIType
20 | 
21 | 
22 | class CompletionAPIData(InferenceAPIData):
23 |     prompt: str
24 |     max_tokens: int = 0
25 | 
26 |     def get_api_type(self) -> APIType:
27 |         return APIType.Completion
28 | 
29 |     def get_route(self) -> str:
30 |         return "/v1/completions"
31 | 
32 |     def to_payload(self, model_name: str, max_tokens: int, ignore_eos: bool) -> dict[str, Any]:
33 |         if self.max_tokens == 0:
34 |             self.max_tokens = max_tokens
35 |         return {
36 |             "model": model_name,
37 |             "prompt": self.prompt,
38 |             "max_tokens": self.max_tokens,
39 |             "ignore_eos": ignore_eos,
40 |         }
41 | 
42 |     def process_response(self, data: dict[str, Any], tokenizer: CustomTokenizer) -> InferenceInfo:
43 |         choices = data.get("choices", [])
44 |         prompt_len = tokenizer.count_tokens(self.prompt)
45 |         output_text = choices[0].get("text", "")
46 |         output_len = tokenizer.count_tokens(output_text)
47 |         return InferenceInfo(input_tokens=prompt_len, output_tokens=output_len)
48 | 


--------------------------------------------------------------------------------
/inference_perf/client/filestorage/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .base import StorageClient
15 | from .gcs import GoogleCloudStorageClient
16 | 
17 | 
18 | __all__ = ["StorageClient", "GoogleCloudStorageClient"]
19 | 


--------------------------------------------------------------------------------
/inference_perf/client/filestorage/base.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from abc import ABC, abstractmethod
16 | from typing import List
17 | from inference_perf.config import StorageConfigBase
18 | from inference_perf.utils import ReportFile
19 | 
20 | 
21 | class StorageClient(ABC):
22 |     def __init__(self, config: StorageConfigBase) -> None:
23 |         self.config = config
24 |         print(f"Report files will be stored at: {self.config.path}")
25 | 
26 |     @abstractmethod
27 |     def save_report(self, reports: List[ReportFile]) -> None:
28 |         raise NotImplementedError()
29 | 


--------------------------------------------------------------------------------
/inference_perf/client/filestorage/gcs.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import json
15 | from typing import List
16 | from google.cloud import storage
17 | from google.cloud.exceptions import GoogleCloudError
18 | from inference_perf.client.filestorage import StorageClient
19 | from inference_perf.config import GoogleCloudStorageConfig
20 | from inference_perf.utils import ReportFile
21 | 
22 | 
23 | class GoogleCloudStorageClient(StorageClient):
24 |     def __init__(self, config: GoogleCloudStorageConfig) -> None:
25 |         super().__init__(config=config)
26 |         print("Created new GCS client")
27 |         self.output_bucket = config.bucket_name
28 |         self.client = storage.Client()
29 | 
30 |         self.bucket = self.client.lookup_bucket(config.bucket_name)
31 |         if self.bucket is None:
32 |             raise ValueError(f"GCS bucket '{config.bucket_name}' does not exist or is inaccessible.")
33 | 
34 |     def save_report(self, reports: List[ReportFile]) -> None:
35 |         filenames = [report.get_filename() for report in reports]
36 |         if len(filenames) != len(set(filenames)):
37 |             raise ValueError("Duplicate filenames detected", filenames)
38 | 
39 |         for _, report in enumerate(reports):
40 |             filename = report.get_filename()
41 |             blob_path = f"{self.config.path if self.config.path else ''}/{self.config.report_file_prefix if self.config.report_file_prefix else ''}{filename}"
42 |             blob = self.bucket.blob(blob_path)
43 | 
44 |             if blob.exists():
45 |                 print(f"Skipping upload: gs://{self.output_bucket}/{blob_path} already exists")
46 |                 continue
47 | 
48 |             try:
49 |                 blob.upload_from_string(json.dumps(report.get_contents()), content_type="application/json")
50 |                 print(f"Uploaded gs://{self.output_bucket}/{blob_path}")
51 |             except GoogleCloudError as e:
52 |                 print(f"Failed to upload {blob_path}: {e}")
53 | 


--------------------------------------------------------------------------------
/inference_perf/client/metricsclient/README.md:
--------------------------------------------------------------------------------
 1 | # Model Server Metrics Query Clients
 2 | 
 3 | This repository provides clients to query performance metrics from various monitoring platforms. Each model server exposes a list of relevant performance metrics, and these clients are designed to retrieve and process that data effectively.
 4 | 
 5 | ## Supported Monitoring Platforms
 6 | 
 7 | **Available now**:
 8 | - Self Deployed Prometheus
 9 | 
10 | **Todo**:
11 | - Google Cloud Monitoring
12 | - AWS CloudWatch
13 | - Azure Monitor


--------------------------------------------------------------------------------
/inference_perf/client/metricsclient/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .base import MetricsClient, PerfRuntimeParameters, ModelServerMetrics
15 | from .mock_client import MockMetricsClient
16 | from .prometheus_client import PrometheusMetricsClient
17 | 
18 | __all__ = ["MetricsClient", "MockMetricsClient", "PerfRuntimeParameters", "PrometheusMetricsClient", "ModelServerMetrics"]
19 | 


--------------------------------------------------------------------------------
/inference_perf/client/metricsclient/base.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from abc import ABC, abstractmethod
15 | from inference_perf.client.modelserver.base import ModelServerClient
16 | from inference_perf.loadgen.load_generator import StageRuntimeInfo
17 | from pydantic import BaseModel
18 | 
19 | 
20 | class PerfRuntimeParameters:
21 |     def __init__(
22 |         self, start_time: float, duration: float, model_server_client: ModelServerClient, stages: dict[int, StageRuntimeInfo]
23 |     ) -> None:
24 |         self.start_time = start_time
25 |         self.duration = duration
26 |         self.stages = stages
27 |         self.model_server_client = model_server_client
28 | 
29 | 
30 | class ModelServerMetrics(BaseModel):
31 |     # Throughput
32 |     prompt_tokens_per_second: float = 0.0
33 |     output_tokens_per_second: float = 0.0
34 |     requests_per_second: float = 0.0
35 | 
36 |     # Latency
37 |     avg_request_latency: float = 0.0
38 |     median_request_latency: float = 0.0
39 |     p90_request_latency: float = 0.0
40 |     p99_request_latency: float = 0.0
41 |     avg_time_to_first_token: float = 0.0
42 |     median_time_to_first_token: float = 0.0
43 |     p90_time_to_first_token: float = 0.0
44 |     p99_time_to_first_token: float = 0.0
45 |     avg_time_per_output_token: float = 0.0
46 |     median_time_per_output_token: float = 0.0
47 |     p90_time_per_output_token: float = 0.0
48 |     p99_time_per_output_token: float = 0.0
49 | 
50 |     # Request
51 |     total_requests: int = 0
52 |     avg_prompt_tokens: int = 0
53 |     avg_output_tokens: int = 0
54 |     avg_queue_length: int = 0
55 | 
56 | 
57 | class MetricsClient(ABC):
58 |     @abstractmethod
59 |     def __init__(self) -> None:
60 |         pass
61 | 
62 |     @abstractmethod
63 |     def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> ModelServerMetrics | None:
64 |         raise NotImplementedError
65 | 
66 |     @abstractmethod
67 |     def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> ModelServerMetrics | None:
68 |         raise NotImplementedError
69 | 
70 |     @abstractmethod
71 |     def wait(self) -> None:
72 |         raise NotImplementedError
73 | 


--------------------------------------------------------------------------------
/inference_perf/client/metricsclient/mock_client.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .base import MetricsClient, PerfRuntimeParameters, ModelServerMetrics
15 | 
16 | 
17 | class MockMetricsClient(MetricsClient):
18 |     def __init__(self) -> None:
19 |         pass
20 | 
21 |     def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> ModelServerMetrics | None:
22 |         return None
23 | 
24 |     def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> ModelServerMetrics | None:
25 |         return None
26 | 
27 |     def wait(self) -> None:
28 |         pass
29 | 


--------------------------------------------------------------------------------
/inference_perf/client/metricsclient/prometheus_client.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 The Kubernetes Authors.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | import time
 15 | from typing import cast
 16 | import requests
 17 | from inference_perf.client.modelserver.base import ModelServerClient, ModelServerPrometheusMetric
 18 | from inference_perf.config import PrometheusClientConfig
 19 | from .base import MetricsClient, PerfRuntimeParameters, ModelServerMetrics
 20 | 
 21 | PROMETHEUS_SCRAPE_BUFFER_SEC = 2
 22 | 
 23 | 
 24 | class PrometheusQueryBuilder:
 25 |     def __init__(self, model_server_metric: ModelServerPrometheusMetric, duration: float):
 26 |         self.model_server_metric = model_server_metric
 27 |         self.duration = duration
 28 | 
 29 |     def get_queries(self) -> dict[str, dict[str, str]]:
 30 |         """
 31 |         Returns a dictionary of queries for each metric type.
 32 |         """
 33 |         metric_name = self.model_server_metric.name
 34 |         filter = self.model_server_metric.filters
 35 |         return {
 36 |             "gauge": {
 37 |                 "mean": "avg_over_time(%s{%s}[%.0fs])" % (metric_name, filter, self.duration),
 38 |                 "median": "quantile_over_time(0.5, %s{%s}[%.0fs])" % (metric_name, filter, self.duration),
 39 |                 "sd": "stddev_over_time(%s{%s}[%.0fs])" % (metric_name, filter, self.duration),
 40 |                 "min": "min_over_time(%s{%s}[%.0fs])" % (metric_name, filter, self.duration),
 41 |                 "max": "max_over_time(%s{%s}[%.0fs])" % (metric_name, filter, self.duration),
 42 |                 "p90": "quantile_over_time(0.9, %s{%s}[%.0fs])" % (metric_name, filter, self.duration),
 43 |                 "p99": "quantile_over_time(0.99, %s{%s}[%.0fs])" % (metric_name, filter, self.duration),
 44 |             },
 45 |             "histogram": {
 46 |                 "mean": "sum(rate(%s_sum{%s}[%.0fs])) / (sum(rate(%s_count{%s}[%.0fs])) > 0)"
 47 |                 % (metric_name, filter, self.duration, metric_name, filter, self.duration),
 48 |                 "median": "histogram_quantile(0.5, sum(rate(%s_bucket{%s}[%.0fs])) by (le))"
 49 |                 % (metric_name, filter, self.duration),
 50 |                 "min": "histogram_quantile(0, sum(rate(%s_bucket{%s}[%.0fs])) by (le))" % (metric_name, filter, self.duration),
 51 |                 "max": "histogram_quantile(1, sum(rate(%s_bucket{%s}[%.0fs])) by (le))" % (metric_name, filter, self.duration),
 52 |                 "p90": "histogram_quantile(0.9, sum(rate(%s_bucket{%s}[%.0fs])) by (le))"
 53 |                 % (metric_name, filter, self.duration),
 54 |                 "p99": "histogram_quantile(0.99, sum(rate(%s_bucket{%s}[%.0fs])) by (le))"
 55 |                 % (metric_name, filter, self.duration),
 56 |             },
 57 |             "counter": {
 58 |                 "rate": "sum(rate(%s{%s}[%.0fs]))" % (metric_name, filter, self.duration),
 59 |                 "increase": "sum(increase(%s{%s}[%.0fs]))" % (metric_name, filter, self.duration),
 60 |                 "mean": "avg_over_time(rate(%s{%s}[%.0fs])[%.0fs:%.0fs])"
 61 |                 % (metric_name, filter, self.duration, self.duration, self.duration),
 62 |                 "max": "max_over_time(rate(%s{%s}[%.0fs])[%.0fs:%.0fs])"
 63 |                 % (metric_name, filter, self.duration, self.duration, self.duration),
 64 |                 "min": "min_over_time(rate(%s{%s}[%.0fs])[%.0fs:%.0fs])"
 65 |                 % (metric_name, filter, self.duration, self.duration, self.duration),
 66 |                 "p90": "quantile_over_time(0.9, rate(%s{%s}[%.0fs])[%.0fs:%.0fs])"
 67 |                 % (metric_name, filter, self.duration, self.duration, self.duration),
 68 |                 "p99": "quantile_over_time(0.99, rate(%s{%s}[%.0fs])[%.0fs:%.0fs])"
 69 |                 % (metric_name, filter, self.duration, self.duration, self.duration),
 70 |             },
 71 |         }
 72 | 
 73 |     def build_query(self) -> str:
 74 |         """
 75 |         Builds the PromQL query for the given metric type and query operation.
 76 | 
 77 |         Returns:
 78 |         The PromQL query.
 79 |         """
 80 |         metric_type = self.model_server_metric.type
 81 |         query_op = self.model_server_metric.op
 82 | 
 83 |         queries = self.get_queries()
 84 |         if metric_type not in queries:
 85 |             print("Invalid metric type: %s" % (metric_type))
 86 |             return ""
 87 |         if query_op not in queries[metric_type]:
 88 |             print("Invalid query operation: %s" % (query_op))
 89 |             return ""
 90 |         return queries[metric_type][query_op]
 91 | 
 92 | 
 93 | class PrometheusMetricsClient(MetricsClient):
 94 |     def __init__(self, config: PrometheusClientConfig) -> None:
 95 |         if config:
 96 |             self.url = config.url
 97 |             if not self.url:
 98 |                 raise Exception("prometheus url missing")
 99 |             self.scrape_interval = config.scrape_interval or 30
100 |         else:
101 |             raise Exception("prometheus config missing")
102 | 
103 |     def wait(self) -> None:
104 |         """
105 |         Waits for the Prometheus server to scrape the metrics.
106 |         We have added a buffer of 5 seconds to the scrape interval to ensure that metrics for even the last request are collected.
107 |         """
108 |         wait_time = self.scrape_interval + PROMETHEUS_SCRAPE_BUFFER_SEC
109 |         time.sleep(wait_time)
110 | 
111 |     def collect_metrics_summary(self, runtime_parameters: PerfRuntimeParameters) -> ModelServerMetrics | None:
112 |         """
113 |         Collects the summary metrics for the given Perf Benchmark run.
114 | 
115 |         Args:
116 |         runtime_parameters: The runtime parameters containing details about the Perf Benchmark like the duration and model server client
117 | 
118 |         Returns:
119 |         A ModelServerMetrics object containing the summary metrics.
120 |         """
121 |         if runtime_parameters is None:
122 |             print("Perf Runtime parameters are not set, skipping metrics collection")
123 |             return None
124 | 
125 |         # Get the duration and model server client from the runtime parameters
126 |         query_eval_time = time.time()
127 |         query_duration = query_eval_time - runtime_parameters.start_time
128 | 
129 |         return self.get_model_server_metrics(runtime_parameters.model_server_client, query_duration, query_eval_time)
130 | 
131 |     def collect_metrics_for_stage(self, runtime_parameters: PerfRuntimeParameters, stage_id: int) -> ModelServerMetrics | None:
132 |         """
133 |         Collects the summary metrics for a specific stage.
134 | 
135 |         Args:
136 |         runtime_parameters: The runtime parameters containing details about the Perf Benchmark like the duration and model server client
137 |         stage_id: The ID of the stage for which to collect metrics
138 | 
139 |         Returns:
140 |         A ModelServerMetrics object containing the summary metrics for the specified stage.
141 |         """
142 |         if runtime_parameters is None:
143 |             print("Perf Runtime parameters are not set, skipping metrics collection")
144 |             return None
145 | 
146 |         if runtime_parameters.stages is None or stage_id not in runtime_parameters.stages:
147 |             print(f"Stage ID {stage_id} is not present in the runtime parameters, skipping metrics collection for this stage")
148 |             return None
149 | 
150 |         # Get the query evaluation time and duration for the stage
151 |         # The query evaluation time is the end time of the stage plus the scrape interval and a buffer to ensure metrics are collected
152 |         # Duration is calculated as the difference between the eval time and start time of the stage
153 |         query_eval_time = runtime_parameters.stages[stage_id].end_time + self.scrape_interval + PROMETHEUS_SCRAPE_BUFFER_SEC
154 |         query_duration = query_eval_time - runtime_parameters.stages[stage_id].start_time
155 |         return self.get_model_server_metrics(runtime_parameters.model_server_client, query_duration, query_eval_time)
156 | 
157 |     def get_model_server_metrics(
158 |         self, model_server_client: ModelServerClient, query_duration: float, query_eval_time: float
159 |     ) -> ModelServerMetrics | None:
160 |         """
161 |         Collects the summary metrics for the given Model Server Client and query duration.
162 | 
163 |         Args:
164 |         model_server_client: The model server client to use for collecting metrics
165 |         query_duration: The duration for which to collect metrics
166 |         query_eval_time: The time at which the query is evaluated, used to ensure we are querying the correct time range
167 | 
168 |         Returns:
169 |         A ModelServerMetrics object containing the summary metrics.
170 |         """
171 |         model_server_metrics: ModelServerMetrics = ModelServerMetrics()
172 | 
173 |         # Get the engine and model from the model server client
174 |         if not model_server_client:
175 |             print("Model server client is not set")
176 |             return None
177 | 
178 |         metrics_metadata = model_server_client.get_prometheus_metric_metadata()
179 |         if not metrics_metadata:
180 |             print("Metrics metadata is not present for the runtime")
181 |             return None
182 |         for summary_metric_name in metrics_metadata:
183 |             summary_metric_metadata = metrics_metadata.get(summary_metric_name)
184 |             if summary_metric_metadata is None:
185 |                 print("Metric metadata is not present for metric: %s. Skipping this metric." % (summary_metric_name))
186 |                 continue
187 |             summary_metric_metadata = cast(ModelServerPrometheusMetric, summary_metric_metadata)
188 |             if summary_metric_metadata is None:
189 |                 print(
190 |                     "Metric metadata for %s is missing or has an incorrect format. Skipping this metric."
191 |                     % (summary_metric_name)
192 |                 )
193 |                 continue
194 | 
195 |             query_builder = PrometheusQueryBuilder(summary_metric_metadata, query_duration)
196 |             query = query_builder.build_query()
197 |             if not query:
198 |                 print("No query found for metric: %s. Skipping metric." % (summary_metric_name))
199 |                 continue
200 | 
201 |             # Execute the query and get the result
202 |             result = self.execute_query(query, str(query_eval_time))
203 |             if result is None:
204 |                 print("Error executing query: %s" % (query))
205 |                 continue
206 |             # Set the result in metrics summary
207 |             attr = getattr(model_server_metrics, summary_metric_name)
208 |             if attr is not None:
209 |                 target_type = type(attr)
210 |                 setattr(model_server_metrics, summary_metric_name, target_type(result))
211 | 
212 |         return model_server_metrics
213 | 
214 |     def execute_query(self, query: str, eval_time: str) -> float:
215 |         """
216 |         Executes the given query on the Prometheus server and returns the result.
217 | 
218 |         Args:
219 |         query: the PromQL query to execute
220 |         eval_time: the time at which the query is evaluated, used to ensure we are querying the correct time range
221 | 
222 |         Returns:
223 |         The result of the query.
224 |         """
225 |         query_result = 0.0
226 |         try:
227 |             response = requests.get(f"{self.url}/api/v1/query", params={"query": query, "time": eval_time})
228 |             if response is None:
229 |                 print("Error executing query: %s" % (query))
230 |                 return query_result
231 | 
232 |             response.raise_for_status()
233 |         except Exception as e:
234 |             print("Error executing query: %s" % (e))
235 |             return query_result
236 | 
237 |         # Check if the response is valid
238 |         # Sample response:
239 |         # {
240 |         #     "status": "success",
241 |         #     "data": {
242 |         #         "resultType": "vector",
243 |         #         "result": [
244 |         #             {
245 |         #                 "metric": {},
246 |         #                 "value": [
247 |         #                     1632741820.781,
248 |         #                     "0.0000000000000000"
249 |         #                 ]
250 |         #             }
251 |         #         ]
252 |         #     }
253 |         # }
254 |         response_obj = response.json()
255 |         if response_obj.get("status") != "success":
256 |             print("Error executing query: %s" % (response_obj))
257 |             return query_result
258 | 
259 |         data = response_obj.get("data", {})
260 |         result = data.get("result", [])
261 |         if len(result) > 0 and "value" in result[0]:
262 |             if isinstance(result[0]["value"], list) and len(result[0]["value"]) > 1:
263 |                 # Return the value of the first result
264 |                 # The value is in the second element of the list
265 |                 # e.g. [1632741820.781, "0.0000000000000000"]
266 |                 # We need to convert it to float
267 |                 # and return it
268 |                 # Convert the value to float
269 |                 try:
270 |                     query_result = round(float(result[0]["value"][1]), 6)
271 |                 except ValueError:
272 |                     print("Error converting value to float: %s" % (result[0]["value"][1]))
273 |                     return query_result
274 |         return query_result
275 | 


--------------------------------------------------------------------------------
/inference_perf/client/modelserver/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .base import ModelServerClient
15 | from .mock_client import MockModelServerClient
16 | from .vllm_client import vLLMModelServerClient
17 | 
18 | 
19 | __all__ = ["ModelServerClient", "MockModelServerClient", "vLLMModelServerClient"]
20 | 


--------------------------------------------------------------------------------
/inference_perf/client/modelserver/base.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from abc import ABC, abstractmethod
15 | from typing import List, Tuple, TypedDict
16 | from inference_perf.config import APIType
17 | 
18 | from inference_perf.apis import InferenceAPIData
19 | 
20 | 
21 | class ModelServerPrometheusMetric:
22 |     def __init__(self, name: str, op: str, type: str, filters: str) -> None:
23 |         self.name = name
24 |         self.op = op
25 |         self.type = type
26 |         self.filters = filters
27 | 
28 | 
29 | # PrometheusMetricMetadata stores the mapping of metrics to their model server names and types
30 | # and the filters to be applied to them.
31 | # This is used to generate Prometheus query for the metrics.
32 | class PrometheusMetricMetadata(TypedDict):
33 |     # Throughput
34 |     prompt_tokens_per_second: ModelServerPrometheusMetric
35 |     output_tokens_per_second: ModelServerPrometheusMetric
36 |     requests_per_second: ModelServerPrometheusMetric
37 | 
38 |     # Latency
39 |     avg_request_latency: ModelServerPrometheusMetric
40 |     median_request_latency: ModelServerPrometheusMetric
41 |     p90_request_latency: ModelServerPrometheusMetric
42 |     p99_request_latency: ModelServerPrometheusMetric
43 |     avg_time_to_first_token: ModelServerPrometheusMetric
44 |     median_time_to_first_token: ModelServerPrometheusMetric
45 |     p90_time_to_first_token: ModelServerPrometheusMetric
46 |     p99_time_to_first_token: ModelServerPrometheusMetric
47 |     avg_time_per_output_token: ModelServerPrometheusMetric
48 |     median_time_per_output_token: ModelServerPrometheusMetric
49 |     p90_time_per_output_token: ModelServerPrometheusMetric
50 |     p99_time_per_output_token: ModelServerPrometheusMetric
51 | 
52 |     # Request
53 |     total_requests: ModelServerPrometheusMetric
54 |     avg_prompt_tokens: ModelServerPrometheusMetric
55 |     avg_output_tokens: ModelServerPrometheusMetric
56 |     avg_queue_length: ModelServerPrometheusMetric
57 | 
58 | 
59 | class ModelServerClient(ABC):
60 |     @abstractmethod
61 |     def __init__(self, api_type: APIType, *args: Tuple[int, ...]) -> None:
62 |         if api_type not in self.get_supported_apis():
63 |             raise Exception(f"Unsupported API type {api_type}")
64 | 
65 |         self.apiType = api_type
66 | 
67 |     @abstractmethod
68 |     def get_supported_apis(self) -> List[APIType]:
69 |         raise NotImplementedError
70 | 
71 |     @abstractmethod
72 |     async def process_request(self, data: InferenceAPIData, stage_id: int) -> None:
73 |         raise NotImplementedError
74 | 
75 |     @abstractmethod
76 |     def get_prometheus_metric_metadata(self) -> PrometheusMetricMetadata:
77 |         # assumption: all metrics clients have metrics exported in Prometheus format
78 |         raise NotImplementedError
79 | 


--------------------------------------------------------------------------------
/inference_perf/client/modelserver/mock_client.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from inference_perf.client.requestdatacollector import RequestDataCollector
16 | from typing import List
17 | from inference_perf.config import APIType
18 | from inference_perf.apis import InferenceAPIData, InferenceInfo, RequestLifecycleMetric
19 | from .base import ModelServerClient
20 | import asyncio
21 | import time
22 | 
23 | 
24 | class MockModelServerClient(ModelServerClient):
25 |     def __init__(self, metrics_collector: RequestDataCollector, api_type: APIType) -> None:
26 |         super().__init__(api_type)
27 |         self.metrics_collector = metrics_collector
28 | 
29 |     async def process_request(self, data: InferenceAPIData, stage_id: int) -> None:
30 |         start = time.monotonic()
31 |         print("Processing mock request for stage - " + str(stage_id))
32 |         await asyncio.sleep(3)
33 |         self.metrics_collector.record_metric(
34 |             RequestLifecycleMetric(
35 |                 stage_id=stage_id,
36 |                 request_data=str(data.to_payload("mock_model", 3, False)),
37 |                 info=InferenceInfo(
38 |                     input_tokens=0,
39 |                     output_tokens=0,
40 |                 ),
41 |                 error=None,
42 |                 start_time=start,
43 |                 end_time=time.monotonic(),
44 |             )
45 |         )
46 | 
47 |     def get_supported_apis(self) -> List[APIType]:
48 |         return [APIType.Completion, APIType.Chat]
49 | 


--------------------------------------------------------------------------------
/inference_perf/client/modelserver/vllm_client.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 The Kubernetes Authors.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | 
 15 | from inference_perf.client.requestdatacollector import RequestDataCollector
 16 | from inference_perf.config import APIType
 17 | from inference_perf.apis import InferenceAPIData, InferenceInfo, RequestLifecycleMetric, ErrorResponseInfo
 18 | from inference_perf.utils import CustomTokenizer
 19 | from .base import ModelServerClient, PrometheusMetricMetadata, ModelServerPrometheusMetric
 20 | from typing import List
 21 | import aiohttp
 22 | import json
 23 | import time
 24 | 
 25 | 
 26 | class vLLMModelServerClient(ModelServerClient):
 27 |     def __init__(
 28 |         self,
 29 |         metrics_collector: RequestDataCollector,
 30 |         api_type: APIType,
 31 |         uri: str,
 32 |         model_name: str,
 33 |         tokenizer: CustomTokenizer,
 34 |         ignore_eos: bool = True,
 35 |     ) -> None:
 36 |         super().__init__(api_type)
 37 |         self.model_name = model_name
 38 |         self.uri = uri
 39 |         self.max_completion_tokens = 30  # default to use when not set at the request level
 40 |         self.ignore_eos = ignore_eos
 41 |         self.tokenizer = tokenizer
 42 |         self.metrics_collector = metrics_collector
 43 | 
 44 |         self.prometheus_metric_metadata: PrometheusMetricMetadata = {
 45 |             "avg_queue_length": ModelServerPrometheusMetric(
 46 |                 "vllm:num_requests_waiting", "mean", "gauge", "model_name='%s'" % self.model_name
 47 |             ),
 48 |             "avg_time_to_first_token": ModelServerPrometheusMetric(
 49 |                 "vllm:time_to_first_token_seconds", "mean", "histogram", "model_name='%s'" % self.model_name
 50 |             ),
 51 |             "median_time_to_first_token": ModelServerPrometheusMetric(
 52 |                 "vllm:time_to_first_token_seconds", "median", "histogram", "model_name='%s'" % self.model_name
 53 |             ),
 54 |             "p90_time_to_first_token": ModelServerPrometheusMetric(
 55 |                 "vllm:time_to_first_token_seconds", "p90", "histogram", "model_name='%s'" % self.model_name
 56 |             ),
 57 |             "p99_time_to_first_token": ModelServerPrometheusMetric(
 58 |                 "vllm:time_to_first_token_seconds", "p99", "histogram", "model_name='%s'" % self.model_name
 59 |             ),
 60 |             "avg_time_per_output_token": ModelServerPrometheusMetric(
 61 |                 "vllm:time_per_output_token_seconds", "mean", "histogram", "model_name='%s'" % self.model_name
 62 |             ),
 63 |             "median_time_per_output_token": ModelServerPrometheusMetric(
 64 |                 "vllm:time_per_output_token_seconds", "median", "histogram", "model_name='%s'" % self.model_name
 65 |             ),
 66 |             "p90_time_per_output_token": ModelServerPrometheusMetric(
 67 |                 "vllm:time_per_output_token_seconds", "p90", "histogram", "model_name='%s'" % self.model_name
 68 |             ),
 69 |             "p99_time_per_output_token": ModelServerPrometheusMetric(
 70 |                 "vllm:time_per_output_token_seconds", "p99", "histogram", "model_name='%s'" % self.model_name
 71 |             ),
 72 |             "avg_prompt_tokens": ModelServerPrometheusMetric(
 73 |                 "vllm:prompt_tokens_total", "mean", "counter", "model_name='%s'" % self.model_name
 74 |             ),
 75 |             "prompt_tokens_per_second": ModelServerPrometheusMetric(
 76 |                 "vllm:prompt_tokens_total", "rate", "counter", "model_name='%s'" % self.model_name
 77 |             ),
 78 |             "avg_output_tokens": ModelServerPrometheusMetric(
 79 |                 "vllm:generation_tokens_total", "mean", "counter", "model_name='%s'" % self.model_name
 80 |             ),
 81 |             "output_tokens_per_second": ModelServerPrometheusMetric(
 82 |                 "vllm:generation_tokens_total", "rate", "counter", "model_name='%s'" % self.model_name
 83 |             ),
 84 |             "total_requests": ModelServerPrometheusMetric(
 85 |                 "vllm:e2e_request_latency_seconds_count", "increase", "counter", "model_name='%s'" % self.model_name
 86 |             ),
 87 |             "requests_per_second": ModelServerPrometheusMetric(
 88 |                 "vllm:e2e_request_latency_seconds_count", "rate", "counter", "model_name='%s'" % self.model_name
 89 |             ),
 90 |             "avg_request_latency": ModelServerPrometheusMetric(
 91 |                 "vllm:e2e_request_latency_seconds", "mean", "histogram", "model_name='%s'" % self.model_name
 92 |             ),
 93 |             "median_request_latency": ModelServerPrometheusMetric(
 94 |                 "vllm:e2e_request_latency_seconds", "median", "histogram", "model_name='%s'" % self.model_name
 95 |             ),
 96 |             "p90_request_latency": ModelServerPrometheusMetric(
 97 |                 "vllm:e2e_request_latency_seconds", "p90", "histogram", "model_name='%s'" % self.model_name
 98 |             ),
 99 |             "p99_request_latency": ModelServerPrometheusMetric(
100 |                 "vllm:e2e_request_latency_seconds", "p99", "histogram", "model_name='%s'" % self.model_name
101 |             ),
102 |         }
103 | 
104 |     async def process_request(self, data: InferenceAPIData, stage_id: int) -> None:
105 |         payload = data.to_payload(
106 |             model_name=self.model_name, max_tokens=self.max_completion_tokens, ignore_eos=self.ignore_eos
107 |         )
108 |         headers = {"Content-Type": "application/json"}
109 |         async with aiohttp.ClientSession() as session:
110 |             start = time.monotonic()
111 |             try:
112 |                 async with session.post(self.uri + data.get_route(), headers=headers, data=json.dumps(payload)) as response:
113 |                     if response.status == 200:
114 |                         content = await response.json()
115 |                         response_info = data.process_response(data=content, tokenizer=self.tokenizer)
116 |                         self.metrics_collector.record_metric(
117 |                             RequestLifecycleMetric(
118 |                                 stage_id=stage_id,
119 |                                 request_data=json.dumps(payload),
120 |                                 response_data=json.dumps(content),
121 |                                 info=response_info,
122 |                                 error=None,
123 |                                 start_time=start,
124 |                                 end_time=time.monotonic(),
125 |                             )
126 |                         )
127 |                     else:
128 |                         content = await response.text()
129 |                         self.metrics_collector.record_metric(
130 |                             RequestLifecycleMetric(
131 |                                 stage_id=stage_id,
132 |                                 request_data=json.dumps(payload),
133 |                                 response_data=content,
134 |                                 info=InferenceInfo(),
135 |                                 error=ErrorResponseInfo(error_msg=content, error_type="Non 200 reponse"),
136 |                                 start_time=start,
137 |                                 end_time=time.monotonic(),
138 |                             )
139 |                         )
140 |             except Exception as e:
141 |                 self.metrics_collector.record_metric(
142 |                     RequestLifecycleMetric(
143 |                         stage_id=stage_id,
144 |                         request_data=json.dumps(payload),
145 |                         info=InferenceInfo(),
146 |                         error=ErrorResponseInfo(
147 |                             error_msg=str(e),
148 |                             error_type=type(e).__name__,
149 |                         ),
150 |                         start_time=start,
151 |                         end_time=time.monotonic(),
152 |                     )
153 |                 )
154 | 
155 |     def get_supported_apis(self) -> List[APIType]:
156 |         return [APIType.Completion, APIType.Chat]
157 | 
158 |     def get_prometheus_metric_metadata(self) -> PrometheusMetricMetadata:
159 |         return self.prometheus_metric_metadata
160 | 


--------------------------------------------------------------------------------
/inference_perf/client/requestdatacollector/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .base import RequestDataCollector
15 | from .local import LocalRequestDataCollector
16 | 
17 | 
18 | __all__ = [
19 |     "RequestDataCollector",
20 |     "LocalRequestDataCollector",
21 | ]
22 | 


--------------------------------------------------------------------------------
/inference_perf/client/requestdatacollector/base.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from abc import ABC, abstractmethod
15 | from typing import List
16 | 
17 | from inference_perf.apis import RequestLifecycleMetric
18 | 
19 | 
20 | class RequestDataCollector(ABC):
21 |     """
22 |     Responsible for collecting request information
23 |     """
24 | 
25 |     @abstractmethod
26 |     def record_metric(self, metric: RequestLifecycleMetric) -> None:
27 |         raise NotImplementedError
28 | 
29 |     @abstractmethod
30 |     def get_metrics(self) -> List[RequestLifecycleMetric]:
31 |         raise NotImplementedError
32 | 


--------------------------------------------------------------------------------
/inference_perf/client/requestdatacollector/local.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | from typing import List
16 | from inference_perf.client.requestdatacollector import RequestDataCollector
17 | from inference_perf.apis import RequestLifecycleMetric
18 | 
19 | 
20 | class LocalRequestDataCollector(RequestDataCollector):
21 |     """Responsible for accumulating client request metrics"""
22 | 
23 |     def __init__(self) -> None:
24 |         self.metrics: List[RequestLifecycleMetric] = []
25 | 
26 |     def record_metric(self, metric: RequestLifecycleMetric) -> None:
27 |         self.metrics.append(metric)
28 | 
29 |     def get_metrics(self) -> List[RequestLifecycleMetric]:
30 |         return self.metrics
31 | 


--------------------------------------------------------------------------------
/inference_perf/config.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 The Kubernetes Authors.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | from datetime import datetime
 15 | from pydantic import BaseModel, HttpUrl
 16 | from typing import Any, Optional, List
 17 | from argparse import ArgumentParser
 18 | from enum import Enum
 19 | import yaml
 20 | 
 21 | 
 22 | class APIType(Enum):
 23 |     Completion = "completion"
 24 |     Chat = "chat"
 25 | 
 26 | 
 27 | class DataGenType(Enum):
 28 |     Mock = "mock"
 29 |     ShareGPT = "shareGPT"
 30 |     Synthetic = "synthetic"
 31 |     Random = "random"
 32 |     SharedPrefix = "shared_prefix"
 33 | 
 34 | 
 35 | # Represents the distribution for input prompts and output generations.
 36 | class Distribution(BaseModel):
 37 |     min: int = 10
 38 |     max: int = 1024
 39 |     mean: float = 512
 40 |     std_dev: float = 200
 41 |     total_count: int = 1000
 42 | 
 43 | 
 44 | # Configuration for shared prefix datagen which allows users to specify shared prefixes.
 45 | class SharedPrefix(BaseModel):
 46 |     num_groups: int = 10
 47 |     num_prompts_per_group: int = 10
 48 |     system_prompt_len: int = 100
 49 |     question_len: int = 50
 50 |     output_len: int = 50
 51 | 
 52 | 
 53 | class DataConfig(BaseModel):
 54 |     type: DataGenType = DataGenType.Mock
 55 |     # Distributions are only supported for synthetic/random dataset at this moment
 56 |     input_distribution: Optional[Distribution] = None
 57 |     output_distribution: Optional[Distribution] = None
 58 |     shared_prefix: Optional[SharedPrefix] = None
 59 | 
 60 | 
 61 | class ModelServerType(Enum):
 62 |     VLLM = "vllm"
 63 | 
 64 | 
 65 | class LoadType(Enum):
 66 |     CONSTANT = "constant"
 67 |     POISSON = "poisson"
 68 | 
 69 | 
 70 | class MetricsClientType(Enum):
 71 |     PROMETHEUS = "prometheus"
 72 |     DEFAULT = "default"
 73 | 
 74 | 
 75 | class LoadStage(BaseModel):
 76 |     rate: int
 77 |     duration: int
 78 | 
 79 | 
 80 | class LoadConfig(BaseModel):
 81 |     type: LoadType = LoadType.CONSTANT
 82 |     interval: float = 1.0
 83 |     stages: List[LoadStage] = []
 84 | 
 85 | 
 86 | class StorageConfigBase(BaseModel):
 87 |     path: str = f"reports-{datetime.now().strftime('%Y%m%d-%H%M%S')}"
 88 |     report_file_prefix: Optional[str] = None
 89 | 
 90 | 
 91 | class GoogleCloudStorageConfig(StorageConfigBase):
 92 |     bucket_name: str
 93 | 
 94 | 
 95 | class StorageConfig(BaseModel):
 96 |     google_cloud_storage: Optional[GoogleCloudStorageConfig] = None
 97 | 
 98 | 
 99 | class RequestLifecycleMetricsReportConfig(BaseModel):
100 |     summary: Optional[bool] = True
101 |     per_stage: Optional[bool] = True
102 |     per_request: Optional[bool] = False
103 | 
104 | 
105 | class PrometheusMetricsReportConfig(BaseModel):
106 |     summary: Optional[bool] = True
107 |     per_stage: Optional[bool] = False
108 | 
109 | 
110 | class ReportConfig(BaseModel):
111 |     request_lifecycle: RequestLifecycleMetricsReportConfig = RequestLifecycleMetricsReportConfig()
112 |     prometheus: PrometheusMetricsReportConfig = PrometheusMetricsReportConfig()
113 | 
114 | 
115 | class PrometheusClientConfig(BaseModel):
116 |     scrape_interval: int = 15
117 |     url: HttpUrl = HttpUrl(url="http://localhost:9090")
118 | 
119 | 
120 | class MetricsClientConfig(BaseModel):
121 |     type: MetricsClientType
122 |     prometheus: Optional[PrometheusClientConfig] = None
123 | 
124 | 
125 | class ModelServerClientConfig(BaseModel):
126 |     type: ModelServerType = ModelServerType.VLLM
127 |     model_name: str
128 |     base_url: str
129 |     ignore_eos: bool = True
130 | 
131 | 
132 | class CustomTokenizerConfig(BaseModel):
133 |     pretrained_model_name_or_path: str
134 |     trust_remote_code: Optional[bool] = None
135 |     token: Optional[str] = None
136 | 
137 | 
138 | class Config(BaseModel):
139 |     api: APIType = APIType.Completion
140 |     data: DataConfig = DataConfig()
141 |     load: LoadConfig = LoadConfig()
142 |     metrics: Optional[MetricsClientConfig] = None
143 |     report: ReportConfig = ReportConfig()
144 |     storage: Optional[StorageConfig] = StorageConfig()
145 |     server: Optional[ModelServerClientConfig] = None
146 |     tokenizer: Optional[CustomTokenizerConfig] = None
147 | 
148 | 
149 | def deep_merge(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
150 |     result = base.copy()
151 |     for k, v in override.items():
152 |         if k in result and isinstance(result[k], dict) and isinstance(v, dict):
153 |             result[k] = deep_merge(result[k], v)
154 |         else:
155 |             result[k] = v
156 |     return result
157 | 
158 | 
159 | def read_config(arg_list: Optional[list[str]] = None) -> Config:
160 |     parser = ArgumentParser()
161 | 
162 |     parser.add_argument("-c", "--config_file", help="Config File", required=True)
163 | 
164 |     args = parser.parse_args(arg_list)
165 |     if args.config_file:
166 |         print("Using configuration from: %s" % args.config_file)
167 |         with open(args.config_file, "r") as stream:
168 |             cfg = yaml.safe_load(stream)
169 | 
170 |         default_cfg = Config().model_dump(mode="json")
171 |         merged_cfg = deep_merge(default_cfg, cfg)
172 | 
173 |         print(
174 |             f"Benchmarking with the following config:\n\n{yaml.dump(merged_cfg, sort_keys=False, default_flow_style=False)}\n"
175 |         )
176 |         return Config(**merged_cfg)
177 |     return Config()
178 | 


--------------------------------------------------------------------------------
/inference_perf/datagen/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .base import DataGenerator
15 | from .mock_datagen import MockDataGenerator
16 | from .hf_sharegpt_datagen import HFShareGPTDataGenerator
17 | from .synthetic_datagen import SyntheticDataGenerator
18 | from .random_datagen import RandomDataGenerator
19 | from .shared_prefix_datagen import SharedPrefixDataGenerator
20 | 
21 | __all__ = [
22 |     "DataGenerator",
23 |     "MockDataGenerator",
24 |     "HFShareGPTDataGenerator",
25 |     "SyntheticDataGenerator",
26 |     "RandomDataGenerator",
27 |     "SharedPrefixDataGenerator",
28 | ]
29 | 


--------------------------------------------------------------------------------
/inference_perf/datagen/base.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from inference_perf.apis import InferenceAPIData
15 | from inference_perf.utils.custom_tokenizer import CustomTokenizer
16 | from inference_perf.config import APIType, DataConfig, Distribution, SharedPrefix
17 | from abc import ABC, abstractmethod
18 | from typing import Generator, Optional, List
19 | 
20 | 
21 | class DataGenerator(ABC):
22 |     """Abstract base class for data generators."""
23 | 
24 |     apiType: APIType
25 |     input_distribution: Optional[Distribution]
26 |     output_distribution: Optional[Distribution]
27 |     shared_prefix: Optional[SharedPrefix]
28 |     tokenizer: Optional[CustomTokenizer]
29 | 
30 |     """Abstract base class for data generators."""
31 | 
32 |     def __init__(self, apiType: APIType, config: DataConfig, tokenizer: Optional[CustomTokenizer]) -> None:
33 |         if apiType not in self.get_supported_apis():
34 |             raise Exception(f"Unsupported API type {apiType}")
35 | 
36 |         if (
37 |             config.input_distribution is not None or config.output_distribution is not None
38 |         ) and not self.is_io_distribution_supported():
39 |             raise Exception("IO distribution not supported for this data generator")
40 | 
41 |         if config.shared_prefix is not None and not self.is_shared_prefix_supported():
42 |             raise Exception("Shared prefix not supported for this data generator")
43 | 
44 |         if tokenizer is not None:
45 |             self.tokenizer = tokenizer
46 | 
47 |         self.apiType = apiType
48 |         self.input_distribution = config.input_distribution
49 |         self.output_distribution = config.output_distribution
50 |         self.shared_prefix = config.shared_prefix
51 | 
52 |     @abstractmethod
53 |     def get_supported_apis(self) -> List[APIType]:
54 |         raise NotImplementedError
55 | 
56 |     @abstractmethod
57 |     def get_data(self) -> Generator[InferenceAPIData, None, None]:
58 |         raise NotImplementedError
59 | 
60 |     @abstractmethod
61 |     def is_io_distribution_supported(self) -> bool:
62 |         raise NotImplementedError
63 | 
64 |     @abstractmethod
65 |     def is_shared_prefix_supported(self) -> bool:
66 |         raise NotImplementedError
67 | 


--------------------------------------------------------------------------------
/inference_perf/datagen/hf_sharegpt_datagen.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from inference_perf.apis import InferenceAPIData, CompletionAPIData, ChatCompletionAPIData, ChatMessage
15 | from inference_perf.utils.custom_tokenizer import CustomTokenizer
16 | from .base import DataGenerator
17 | from inference_perf.config import APIType, DataConfig
18 | from typing import Generator, List
19 | from datasets import load_dataset
20 | 
21 | 
22 | class HFShareGPTDataGenerator(DataGenerator):
23 |     def __init__(self, apiType: APIType, config: DataConfig, tokenizer: CustomTokenizer) -> None:
24 |         super().__init__(apiType, config, tokenizer)
25 |         self.sharegpt_dataset = iter(
26 |             load_dataset(
27 |                 "anon8231489123/ShareGPT_Vicuna_unfiltered",
28 |                 data_files="ShareGPT_V3_unfiltered_cleaned_split.json",
29 |                 streaming=True,
30 |                 split="train",
31 |             )
32 |         )
33 |         self.min_num_turns = 2
34 |         self.data_key = "conversations"
35 |         self.role_key = "from"
36 |         self.content_key = "value"
37 |         # initialize data collection
38 |         next(self.sharegpt_dataset)
39 | 
40 |     def get_supported_apis(self) -> List[APIType]:
41 |         return [APIType.Chat, APIType.Completion]
42 | 
43 |     def get_data(self) -> Generator[InferenceAPIData, None, None]:
44 |         if self.sharegpt_dataset is not None:
45 |             while True:
46 |                 data = next(self.sharegpt_dataset)
47 |                 if (
48 |                     data is None
49 |                     or data[self.data_key] is None
50 |                     or len(data[self.data_key]) < self.min_num_turns
51 |                     or len(data[self.data_key]) == 0
52 |                 ):
53 |                     continue
54 | 
55 |                 if self.apiType == APIType.Completion:
56 |                     try:
57 |                         prompt = data[self.data_key][0].get(self.content_key)
58 |                         if not prompt:
59 |                             continue
60 |                         yield CompletionAPIData(prompt=prompt)
61 |                     except (KeyError, TypeError) as e:
62 |                         print(f"Skipping invalid completion data: {e}")
63 |                         continue
64 |                 elif self.apiType == APIType.Chat:
65 |                     yield ChatCompletionAPIData(
66 |                         messages=[
67 |                             ChatMessage(role=conversation[self.role_key], content=conversation[self.content_key])
68 |                             for conversation in data[self.data_key]
69 |                         ]
70 |                     )
71 |                 else:
72 |                     raise Exception("Unsupported API type")
73 | 
74 |     def is_io_distribution_supported(self) -> bool:
75 |         return False
76 | 
77 |     def is_shared_prefix_supported(self) -> bool:
78 |         return False
79 | 


--------------------------------------------------------------------------------
/inference_perf/datagen/mock_datagen.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import Generator, List, Optional
15 | from inference_perf.config import APIType, DataConfig
16 | from inference_perf.datagen.base import DataGenerator
17 | from inference_perf.apis import InferenceAPIData, CompletionAPIData, ChatCompletionAPIData, ChatMessage
18 | from inference_perf.utils.custom_tokenizer import CustomTokenizer
19 | 
20 | 
21 | class MockDataGenerator(DataGenerator):
22 |     def __init__(self, apiType: APIType, config: DataConfig, tokenizer: Optional[CustomTokenizer]) -> None:
23 |         super().__init__(apiType, config, tokenizer)
24 | 
25 |     def get_supported_apis(self) -> List[APIType]:
26 |         return [APIType.Completion, APIType.Chat]
27 | 
28 |     def get_data(self) -> Generator[InferenceAPIData, None, None]:
29 |         i = 0
30 |         while True:
31 |             i += 1
32 |             if self.apiType == APIType.Completion:
33 |                 yield CompletionAPIData(prompt="text" + str(i))
34 |             elif self.apiType == APIType.Chat:
35 |                 yield ChatCompletionAPIData(messages=[ChatMessage(role="user", content="text" + str(i))])
36 |             else:
37 |                 raise Exception("Unsupported API type")
38 | 
39 |     def is_io_distribution_supported(self) -> bool:
40 |         return False
41 | 
42 |     def is_shared_prefix_supported(self) -> bool:
43 |         return False
44 | 


--------------------------------------------------------------------------------
/inference_perf/datagen/random_datagen.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 The Kubernetes Authors.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | import numpy as np
 15 | from inference_perf.apis import InferenceAPIData, CompletionAPIData
 16 | from inference_perf.utils.custom_tokenizer import CustomTokenizer
 17 | from inference_perf.utils.distribution import generate_distribution
 18 | from .base import DataGenerator
 19 | from typing import Generator, List
 20 | from inference_perf.config import APIType, DataConfig
 21 | 
 22 | 
 23 | 
 24 | 
 25 | # Random data generator generates random tokens from the model's
 26 | # vocabulary for the desired input and output distribution.
 27 | class RandomDataGenerator(DataGenerator):
 28 |     def __init__(
 29 |         self,
 30 |         apiType: APIType,
 31 |         config: DataConfig,
 32 |         tokenizer: CustomTokenizer,
 33 |     ) -> None:
 34 |         super().__init__(apiType, config, tokenizer)
 35 | 
 36 |         if self.input_distribution is None or self.output_distribution is None:
 37 |             raise ValueError("Input and Output Distribution are required for RandomDataGenerator")
 38 | 
 39 |         self.input_lengths = generate_distribution(
 40 |             self.input_distribution.min,
 41 |             self.input_distribution.max,
 42 |             self.input_distribution.mean,
 43 |             self.input_distribution.std_dev,
 44 |             self.input_distribution.total_count,
 45 |         )
 46 |         self.output_lengths = generate_distribution(
 47 |             self.output_distribution.min,
 48 |             self.output_distribution.max,
 49 |             self.output_distribution.mean,
 50 |             self.output_distribution.std_dev,
 51 |             self.output_distribution.total_count,
 52 |         )
 53 | 
 54 |         if self.tokenizer is None:
 55 |             raise ValueError("Tokenizer is required for RandomDataGenerator")
 56 | 
 57 |         hf_tokenizer = self.tokenizer.get_tokenizer()
 58 |         if hasattr(hf_tokenizer, "vocab_size") and hf_tokenizer.vocab_size is not None:
 59 |             self.vocab_size: int = hf_tokenizer.vocab_size
 60 |         elif hasattr(hf_tokenizer, "get_vocab") and callable(hf_tokenizer.get_vocab):
 61 |             self.vocab_size = len(hf_tokenizer.get_vocab())
 62 |         else:
 63 |             try:
 64 |                 self.vocab_size = len(hf_tokenizer)
 65 |             except TypeError as e:
 66 |                 raise ValueError(
 67 |                     "Tokenizer does not have a 'vocab_size' attribute, 'get_vocab()' method, "
 68 |                     "or support len() for vocabulary size. Cannot use random token generation."
 69 |                 ) from e
 70 |         if self.vocab_size <= 0:
 71 |             raise ValueError(f"Tokenizer vocabulary size must be positive, got {self.vocab_size}.")
 72 | 
 73 |     def get_supported_apis(self) -> List[APIType]:
 74 |         return [APIType.Completion]
 75 | 
 76 |     def is_io_distribution_supported(self) -> bool:
 77 |         return True
 78 | 
 79 |     def is_shared_prefix_supported(self) -> bool:
 80 |         return False
 81 | 
 82 |     def get_data(self) -> Generator[InferenceAPIData, None, None]:
 83 |         i = 0
 84 | 
 85 |         while True:
 86 |             if self.tokenizer is None:
 87 |                 raise ValueError("Tokenizer is required for RandomDataGenerator")
 88 | 
 89 |             if self.apiType == APIType.Completion:
 90 |                 prompt_text: str
 91 |                 if self.input_lengths[i] <= 0:
 92 |                     random_token_ids_list = []
 93 |                 else:
 94 |                     random_token_ids = np.random.randint(0, self.vocab_size, size=self.input_lengths[i], dtype=np.int64)
 95 |                     random_token_ids_list = random_token_ids.tolist()
 96 |                 prompt_text = self.tokenizer.get_tokenizer().decode(random_token_ids_list)
 97 | 
 98 |                 yield CompletionAPIData(
 99 |                     prompt=prompt_text,
100 |                     max_tokens=self.output_lengths[i],
101 |                 )
102 |                 i += 1
103 |             else:
104 |                 raise Exception(f"Unsupported API type: {self.apiType}. RandomDataGenerator only supports Completion.")
105 | 


--------------------------------------------------------------------------------
/inference_perf/datagen/shared_prefix_datagen.py:
--------------------------------------------------------------------------------
  1 | import random
  2 | from typing import Generator, List
  3 | import numpy as np
  4 | 
  5 | from inference_perf.apis.base import InferenceAPIData
  6 | from inference_perf.apis.completion import CompletionAPIData
  7 | from inference_perf.config import APIType, DataConfig
  8 | from inference_perf.utils.custom_tokenizer import CustomTokenizer
  9 | from .base import DataGenerator
 10 | 
 11 | 
 12 | # Shared Prefix Generator generates shared prefix in the prompts that are sent.
 13 | # This can be used to benchmark prefix caching cases.
 14 | class SharedPrefixDataGenerator(DataGenerator):
 15 |     def __init__(self, apiType: APIType, config: DataConfig, tokenizer: CustomTokenizer) -> None:
 16 |         super().__init__(apiType, config, tokenizer)
 17 | 
 18 |         if self.tokenizer is None:
 19 |             raise ValueError("Tokenizer is required for SharedPrefixDataGenerator but was not initialized.")
 20 | 
 21 |         # Initialize vocab_size
 22 |         hf_tokenizer = self.tokenizer.get_tokenizer()
 23 |         if hasattr(hf_tokenizer, "vocab_size") and hf_tokenizer.vocab_size is not None:
 24 |             self.vocab_size: int = hf_tokenizer.vocab_size
 25 |         elif hasattr(hf_tokenizer, "get_vocab") and callable(hf_tokenizer.get_vocab):
 26 |             self.vocab_size = len(hf_tokenizer.get_vocab())
 27 |         else:
 28 |             try:
 29 |                 self.vocab_size = len(hf_tokenizer)
 30 |             except TypeError as e:
 31 |                 raise ValueError(
 32 |                     "Tokenizer does not have a 'vocab_size' attribute, 'get_vocab()' method, "
 33 |                     "or support len() for vocabulary size. Cannot use random token generation."
 34 |                 ) from e
 35 |         if self.vocab_size <= 0:
 36 |             raise ValueError(f"Tokenizer vocabulary size must be positive, got {self.vocab_size}.")
 37 | 
 38 |         if self.shared_prefix is None:
 39 |             raise ValueError("Shared Prefix config is required for SharedPrefixDataGenerator")
 40 | 
 41 |         self.num_groups: int = self.shared_prefix.num_groups
 42 |         self.num_prompts_per_group: int = self.shared_prefix.num_prompts_per_group
 43 |         self.system_prompt_len: int = self.shared_prefix.system_prompt_len
 44 |         self.question_len: int = self.shared_prefix.question_len
 45 |         self.output_len: int = self.shared_prefix.output_len
 46 | 
 47 |         self.prompts: List[str] = []
 48 |         self._generate_prompts()
 49 | 
 50 |     def get_supported_apis(self) -> List[APIType]:
 51 |         return [APIType.Completion]
 52 | 
 53 |     def is_io_distribution_supported(self) -> bool:
 54 |         return True
 55 | 
 56 |     def is_shared_prefix_supported(self) -> bool:
 57 |         return True
 58 | 
 59 |     def get_data(self) -> Generator[InferenceAPIData, None, None]:
 60 |         if not self.prompts:
 61 |             return
 62 | 
 63 |         i = 0
 64 |         while True:
 65 |             yield CompletionAPIData(prompt=self.prompts[i], max_tokens=self.output_len)
 66 |             i = (i + 1) % len(self.prompts)
 67 | 
 68 |     def _generate_random_token_ids(self, length: int) -> List[int]:
 69 |         """Generates a list of random token IDs of a specified length."""
 70 |         if length == 0:
 71 |             return []
 72 |         # np.random.randint's high parameter is exclusive
 73 |         return np.random.randint(0, self.vocab_size, size=length, dtype=np.int64).tolist()  # type: ignore[no-any-return]
 74 | 
 75 |     def _generate_prompts(self) -> None:
 76 |         """Pre-generates all prompts based on the configuration."""
 77 |         if self.tokenizer is None:
 78 |             # This check is defensive; __init__ should have already validated this.
 79 |             raise ValueError("Tokenizer is not available for generating prompts.")
 80 | 
 81 |         hf_tokenizer = self.tokenizer.get_tokenizer()
 82 | 
 83 |         for _ in range(self.num_groups):
 84 |             # Generate a shared prefix (system prompt)
 85 |             shared_prefix_token_ids = self._generate_random_token_ids(self.system_prompt_len)
 86 |             shared_prefix_text = hf_tokenizer.decode(shared_prefix_token_ids, skip_special_tokens=True)
 87 | 
 88 |             for _ in range(self.num_prompts_per_group):
 89 |                 # Generate a unique question
 90 |                 question_token_ids = self._generate_random_token_ids(self.question_len)
 91 |                 question_text = hf_tokenizer.decode(question_token_ids, skip_special_tokens=True)
 92 | 
 93 |                 # Combine shared prefix and question
 94 |                 full_prompt_text = shared_prefix_text + " " + question_text
 95 | 
 96 |                 self.prompts.append(full_prompt_text)
 97 | 
 98 |         # Shuffle the generated prompts to ensure randomness if served sequentially by different workers
 99 |         random.shuffle(self.prompts)
100 | 


--------------------------------------------------------------------------------
/inference_perf/datagen/synthetic_datagen.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 The Kubernetes Authors.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | from inference_perf.apis import InferenceAPIData, CompletionAPIData
 15 | from inference_perf.utils.custom_tokenizer import CustomTokenizer
 16 | from inference_perf.utils.distribution import generate_distribution
 17 | from .base import DataGenerator
 18 | from typing import Generator, List
 19 | from inference_perf.config import APIType, DataConfig
 20 | 
 21 | 
 22 | class SyntheticDataGenerator(DataGenerator):
 23 |     def __init__(self, apiType: APIType, config: DataConfig, tokenizer: CustomTokenizer) -> None:
 24 |         super().__init__(apiType, config, tokenizer)
 25 | 
 26 |         if self.input_distribution is None or self.output_distribution is None or self.tokenizer is None:
 27 |             raise ValueError("IODistribution and tokenizer are required for SyntheticDataGenerator")
 28 | 
 29 |         self.input_lengths = generate_distribution(
 30 |             self.input_distribution.min,
 31 |             self.input_distribution.max,
 32 |             self.input_distribution.mean,
 33 |             self.input_distribution.std_dev,
 34 |             self.input_distribution.total_count,
 35 |         )
 36 |         self.output_lengths = generate_distribution(
 37 |             self.output_distribution.min,
 38 |             self.output_distribution.max,
 39 |             self.output_distribution.mean,
 40 |             self.output_distribution.std_dev,
 41 |             self.output_distribution.total_count,
 42 |         )
 43 |         base_prompt = "Pick as many lines as you can from these poem lines:\n"
 44 |         self.token_ids = self.tokenizer.get_tokenizer().encode(base_prompt + self.get_sonnet_data())
 45 | 
 46 |     def get_supported_apis(self) -> List[APIType]:
 47 |         return [APIType.Completion]
 48 | 
 49 |     def is_io_distribution_supported(self) -> bool:
 50 |         return True
 51 | 
 52 |     def is_shared_prefix_supported(self) -> bool:
 53 |         return False
 54 | 
 55 |     def get_data(self) -> Generator[InferenceAPIData, None, None]:
 56 |         i = 0
 57 |         while True:
 58 |             if self.tokenizer is None:
 59 |                 raise ValueError("Tokenizer is required for SyntheticDataGenerator")
 60 |             if self.apiType == APIType.Completion:
 61 |                 yield CompletionAPIData(
 62 |                     prompt=self.tokenizer.get_tokenizer().decode(self.token_ids[: self.input_lengths[i]]),
 63 |                     max_tokens=self.output_lengths[i],
 64 |                 )
 65 |                 i += 1
 66 |             else:
 67 |                 raise Exception("Unsupported API type")
 68 | 
 69 |     # Hardcoded sonnet data that we can use for synthetic benchmarks.
 70 |     def get_sonnet_data(self) -> str:
 71 |         return """FROM fairest creatures we desire increase,
 72 | That thereby beauty's rose might never die,
 73 | But as the riper should by time decease,
 74 | His tender heir might bear his memory:
 75 | But thou, contracted to thine own bright eyes,
 76 | Feed'st thy light'st flame with self-substantial fuel,
 77 | Making a famine where abundance lies,
 78 | Thyself thy foe, to thy sweet self too cruel.
 79 | Thou that art now the world's fresh ornament
 80 | And only herald to the gaudy spring,
 81 | Within thine own bud buriest thy content
 82 | And, tender churl, makest waste in niggarding.
 83 | Pity the world, or else this glutton be,
 84 | To eat the world's due, by the grave and thee.
 85 | When forty winters shall beseige thy brow,
 86 | And dig deep trenches in thy beauty's field,
 87 | Thy youth's proud livery, so gazed on now,
 88 | Will be a tatter'd weed, of small worth held:
 89 | Then being ask'd where all thy beauty lies,
 90 | Where all the treasure of thy lusty days,
 91 | To say, within thine own deep-sunken eyes,
 92 | Were an all-eating shame and thriftless praise.
 93 | How much more praise deserved thy beauty's use,
 94 | If thou couldst answer 'This fair child of mine
 95 | Shall sum my count and make my old excuse,'
 96 | Proving his beauty by succession thine!
 97 | This were to be new made when thou art old,
 98 | And see thy blood warm when thou feel'st it cold.
 99 | Look in thy glass, and tell the face thou viewest
100 | Now is the time that face should form another;
101 | Whose fresh repair if now thou not renewest,
102 | Thou dost beguile the world, unbless some mother.
103 | For where is she so fair whose unear'd womb
104 | Disdains the tillage of thy husbandry?
105 | Or who is he so fond will be the tomb
106 | Of his self-love, to stop posterity?
107 | Thou art thy mother's glass, and she in thee
108 | Calls back the lovely April of her prime:
109 | So thou through windows of thine age shall see
110 | Despite of wrinkles this thy golden time.
111 | But if thou live, remember'd not to be,
112 | Die single, and thine image dies with thee.
113 | Unthrifty loveliness, why dost thou spend
114 | Upon thyself thy beauty's legacy?
115 | Nature's bequest gives nothing but doth lend,
116 | And being frank she lends to those are free.
117 | Then, beauteous niggard, why dost thou abuse
118 | The bounteous largess given thee to give?
119 | Profitless usurer, why dost thou use
120 | So great a sum of sums, yet canst not live?
121 | For having traffic with thyself alone,
122 | Thou of thyself thy sweet self dost deceive.
123 | Then how, when nature calls thee to be gone,
124 | What acceptable audit canst thou leave?
125 | Thy unused beauty must be tomb'd with thee,
126 | Which, used, lives th' executor to be.
127 | Those hours, that with gentle work did frame
128 | The lovely gaze where every eye doth dwell,
129 | Will play the tyrants to the very same
130 | And that unfair which fairly doth excel:
131 | For never-resting time leads summer on
132 | To hideous winter and confounds him there;
133 | Sap cheque'd with frost and lusty leaves quite gone,
134 | Beauty o'ersnow'd and bareness every where:
135 | Then, were not summer's distillation left,
136 | A liquid prisoner pent in walls of glass,
137 | Beauty's effect with beauty were bereft,
138 | Nor it nor no remembrance what it was:
139 | But flowers distill'd though they with winter meet,
140 | Leese but their show; their substance still lives sweet.
141 | Then let not winter's ragged hand deface
142 | In thee thy summer, ere thou be distill'd:
143 | Make sweet some vial; treasure thou some place
144 | With beauty's treasure, ere it be self-kill'd.
145 | That use is not forbidden usury,
146 | Which happies those that pay the willing loan;
147 | That's for thyself to breed another thee,
148 | Or ten times happier, be it ten for one;
149 | Ten times thyself were happier than thou art,
150 | If ten of thine ten times refigured thee:
151 | Then what could death do, if thou shouldst depart,
152 | Leaving thee living in posterity?
153 | Be not self-will'd, for thou art much too fair
154 | To be death's conquest and make worms thine heir.
155 | Lo! in the orient when the gracious light
156 | Lifts up his burning head, each under eye
157 | Doth homage to his new-appearing sight,
158 | Serving with looks his sacred majesty;
159 | And having climb'd the steep-up heavenly hill,
160 | Resembling strong youth in his middle age,
161 | yet mortal looks adore his beauty still,
162 | Attending on his golden pilgrimage;
163 | But when from highmost pitch, with weary car,
164 | Like feeble age, he reeleth from the day,
165 | The eyes, 'fore duteous, now converted are
166 | From his low tract and look another way:
167 | So thou, thyself out-going in thy noon,
168 | Unlook'd on diest, unless thou get a son.
169 | Music to hear, why hear'st thou music sadly?
170 | Sweets with sweets war not, joy delights in joy.
171 | Why lovest thou that which thou receivest not gladly,
172 | Or else receivest with pleasure thine annoy?
173 | If the true concord of well-tuned sounds,
174 | By unions married, do offend thine ear,
175 | They do but sweetly chide thee, who confounds
176 | In singleness the parts that thou shouldst bear.
177 | Mark how one string, sweet husband to another,
178 | Strikes each in each by mutual ordering,
179 | Resembling sire and child and happy mother
180 | Who all in one, one pleasing note do sing:
181 | Whose speechless song, being many, seeming one,
182 | Sings this to thee: 'thou single wilt prove none.'
183 | Is it for fear to wet a widow's eye
184 | That thou consumest thyself in single life?
185 | Ah! if thou issueless shalt hap to die.
186 | The world will wail thee, like a makeless wife;
187 | The world will be thy widow and still weep
188 | That thou no form of thee hast left behind,
189 | When every private widow well may keep
190 | By children's eyes her husband's shape in mind.
191 | Look, what an unthrift in the world doth spend
192 | Shifts but his place, for still the world enjoys it;
193 | But beauty's waste hath in the world an end,
194 | And kept unused, the user so destroys it.
195 | No love toward others in that bosom sits
196 | That on himself such murderous shame commits.
197 | For shame! deny that thou bear'st love to any,
198 | Who for thyself art so unprovident.
199 | Grant, if thou wilt, thou art beloved of many,
200 | But that thou none lovest is most evident;
201 | For thou art so possess'd with murderous hate
202 | That 'gainst thyself thou stick'st not to conspire.
203 | Seeking that beauteous roof to ruinate
204 | Which to repair should be thy chief desire.
205 | O, change thy thought, that I may change my mind!
206 | Shall hate be fairer lodged than gentle love?
207 | Be, as thy presence is, gracious and kind,
208 | Or to thyself at least kind-hearted prove:
209 | Make thee another self, for love of me,
210 | That beauty still may live in thine or thee.
211 | As fast as thou shalt wane, so fast thou growest
212 | In one of thine, from that which thou departest;
213 | And that fresh blood which youngly thou bestowest
214 | Thou mayst call thine when thou from youth convertest.
215 | Herein lives wisdom, beauty and increase:
216 | Without this, folly, age and cold decay:
217 | If all were minded so, the times should cease
218 | And threescore year would make the world away.
219 | Let those whom Nature hath not made for store,
220 | Harsh featureless and rude, barrenly perish:
221 | Look, whom she best endow'd she gave the more;
222 | Which bounteous gift thou shouldst in bounty cherish:
223 | She carved thee for her seal, and meant thereby
224 | Thou shouldst print more, not let that copy die.
225 | When I do count the clock that tells the time,
226 | And see the brave day sunk in hideous night;
227 | When I behold the violet past prime,
228 | And sable curls all silver'd o'er with white;
229 | When lofty trees I see barren of leaves
230 | Which erst from heat did canopy the herd,
231 | And summer's green all girded up in sheaves
232 | Borne on the bier with white and bristly beard,
233 | Then of thy beauty do I question make,
234 | That thou among the wastes of time must go,
235 | Since sweets and beauties do themselves forsake
236 | And die as fast as they see others grow;
237 | And nothing 'gainst Time's scythe can make defence
238 | Save breed, to brave him when he takes thee hence.
239 | O, that you were yourself! but, love, you are
240 | No longer yours than you yourself here live:
241 | Against this coming end you should prepare,
242 | And your sweet semblance to some other give.
243 | So should that beauty which you hold in lease
244 | Find no determination: then you were
245 | Yourself again after yourself's decease,
246 | When your sweet issue your sweet form should bear.
247 | Who lets so fair a house fall to decay,
248 | Which husbandry in honour might uphold
249 | Against the stormy gusts of winter's day
250 | And barren rage of death's eternal cold?
251 | O, none but unthrifts! Dear my love, you know
252 | You had a father: let your son say so.
253 | Not from the stars do I my judgment pluck;
254 | And yet methinks I have astronomy,
255 | But not to tell of good or evil luck,
256 | Of plagues, of dearths, or seasons' quality;
257 | Nor can I fortune to brief minutes tell,
258 | Pointing to each his thunder, rain and wind,
259 | Or say with princes if it shall go well,
260 | By oft predict that I in heaven find:
261 | But from thine eyes my knowledge I derive,
262 | And, constant stars, in them I read such art
263 | As truth and beauty shall together thrive,
264 | If from thyself to store thou wouldst convert;
265 | Or else of thee this I prognosticate:
266 | Thy end is truth's and beauty's doom and date.
267 | When I consider every thing that grows
268 | Holds in perfection but a little moment,
269 | That this huge stage presenteth nought but shows
270 | Whereon the stars in secret influence comment;
271 | When I perceive that men as plants increase,
272 | Cheered and cheque'd even by the self-same sky,
273 | Vaunt in their youthful sap, at height decrease,
274 | And wear their brave state out of memory;
275 | Then the conceit of this inconstant stay
276 | Sets you most rich in youth before my sight,
277 | Where wasteful Time debateth with Decay,
278 | To change your day of youth to sullied night;
279 | And all in war with Time for love of you,
280 | As he takes from you, I engraft you new.
281 | But wherefore do not you a mightier way
282 | Make war upon this bloody tyrant, Time?
283 | And fortify yourself in your decay
284 | With means more blessed than my barren rhyme?
285 | Now stand you on the top of happy hours,
286 | And many maiden gardens yet unset
287 | With virtuous wish would bear your living flowers,
288 | Much liker than your painted counterfeit:
289 | So should the lines of life that life repair,
290 | Which this, Time's pencil, or my pupil pen,
291 | Neither in inward worth nor outward fair,
292 | Can make you live yourself in eyes of men.
293 | To give away yourself keeps yourself still,
294 | And you must live, drawn by your own sweet skill.
295 | Who will believe my verse in time to come,
296 | If it were fill'd with your most high deserts?
297 | Though yet, heaven knows, it is but as a tomb
298 | Which hides your life and shows not half your parts.
299 | If I could write the beauty of your eyes
300 | And in fresh numbers number all your graces,
301 | The age to come would say 'This poet lies:
302 | Such heavenly touches ne'er touch'd earthly faces.'
303 | So should my papers yellow'd with their age
304 | Be scorn'd like old men of less truth than tongue,
305 | And your true rights be term'd a poet's rage
306 | And stretched metre of an antique song:
307 | But were some child of yours alive that time,
308 | You should live twice; in it and in my rhyme.
309 | Shall I compare thee to a summer's day?
310 | Thou art more lovely and more temperate:
311 | Rough winds do shake the darling buds of May,
312 | And summer's lease hath all too short a date:
313 | Sometime too hot the eye of heaven shines,
314 | And often is his gold complexion dimm'd;
315 | And every fair from fair sometime declines,
316 | By chance or nature's changing course untrimm'd;
317 | But thy eternal summer shall not fade
318 | Nor lose possession of that fair thou owest;
319 | Nor shall Death brag thou wander'st in his shade,
320 | When in eternal lines to time thou growest:
321 | So long as men can breathe or eyes can see,
322 | So long lives this and this gives life to thee.
323 | Devouring Time, blunt thou the lion's paws,
324 | And make the earth devour her own sweet brood;
325 | Pluck the keen teeth from the fierce tiger's jaws,
326 | And burn the long-lived phoenix in her blood;
327 | Make glad and sorry seasons as thou fleets,
328 | And do whate'er thou wilt, swift-footed Time,
329 | To the wide world and all her fading sweets;
330 | But I forbid thee one most heinous crime:
331 | O, carve not with thy hours my love's fair brow,
332 | Nor draw no lines there with thine antique pen;
333 | Him in thy course untainted do allow
334 | For beauty's pattern to succeeding men.
335 | Yet, do thy worst, old Time: despite thy wrong,
336 | My love shall in my verse ever live young.
337 | A woman's face with Nature's own hand painted
338 | Hast thou, the master-mistress of my passion;
339 | A woman's gentle heart, but not acquainted
340 | With shifting change, as is false women's fashion;
341 | An eye more bright than theirs, less false in rolling,
342 | Gilding the object whereupon it gazeth;
343 | A man in hue, all 'hues' in his controlling,
344 | Much steals men's eyes and women's souls amazeth.
345 | And for a woman wert thou first created;
346 | Till Nature, as she wrought thee, fell a-doting,
347 | And by addition me of thee defeated,
348 | By adding one thing to my purpose nothing.
349 | But since she prick'd thee out for women's pleasure,
350 | Mine be thy love and thy love's use their treasure.
351 | So is it not with me as with that Muse
352 | Stirr'd by a painted beauty to his verse,
353 | Who heaven itself for ornament doth use
354 | And every fair with his fair doth rehearse
355 | Making a couplement of proud compare,
356 | With sun and moon, with earth and sea's rich gems,
357 | With April's first-born flowers, and all things rare
358 | That heaven's air in this huge rondure hems.
359 | O' let me, true in love, but truly write,
360 | And then believe me, my love is as fair
361 | As any mother's child, though not so bright
362 | As those gold candles fix'd in heaven's air:
363 | Let them say more than like of hearsay well;
364 | I will not praise that purpose not to sell.
365 | My glass shall not persuade me I am old,
366 | So long as youth and thou are of one date;
367 | But when in thee time's furrows I behold,
368 | Then look I death my days should expiate.
369 | For all that beauty that doth cover thee
370 | Is but the seemly raiment of my heart,
371 | Which in thy breast doth live, as thine in me:
372 | How can I then be elder than thou art?
373 | O, therefore, love, be of thyself so wary
374 | As I, not for myself, but for thee will;
375 | Bearing thy heart, which I will keep so chary
376 | As tender nurse her babe from faring ill.
377 | Presume not on thy heart when mine is slain;
378 | Thou gavest me thine, not to give back again.
379 | As an unperfect actor on the stage
380 | Who with his fear is put besides his part,
381 | Or some fierce thing replete with too much rage,
382 | Whose strength's abundance weakens his own heart.
383 | So I, for fear of trust, forget to say
384 | The perfect ceremony of love's rite,
385 | And in mine own love's strength seem to decay,
386 | O'ercharged with burden of mine own love's might.
387 | O, let my books be then the eloquence
388 | And dumb presagers of my speaking breast,
389 | Who plead for love and look for recompense
390 | More than that tongue that more hath more express'd.
391 | O, learn to read what silent love hath writ:
392 | To hear with eyes belongs to love's fine wit.
393 | Mine eye hath play'd the painter and hath stell'd
394 | Thy beauty's form in table of my heart;
395 | My body is the frame wherein 'tis held,
396 | And perspective it is the painter's art.
397 | For through the painter must you see his skill,
398 | To find where your true image pictured lies;
399 | Which in my bosom's shop is hanging still,
400 | That hath his windows glazed with thine eyes.
401 | Now see what good turns eyes for eyes have done:
402 | Mine eyes have drawn thy shape, and thine for me
403 | Are windows to my breast, where-through the sun
404 | Delights to peep, to gaze therein on thee;
405 | Yet eyes this cunning want to grace their art;
406 | They draw but what they see, know not the heart.
407 | Let those who are in favour with their stars
408 | Of public honour and proud titles boast,
409 | Whilst I, whom fortune of such triumph bars,
410 | Unlook'd for joy in that I honour most.
411 | Great princes' favourites their fair leaves spread
412 | But as the marigold at the sun's eye,
413 | And in themselves their pride lies buried,
414 | For at a frown they in their glory die.
415 | The painful warrior famoused for fight,
416 | After a thousand victories once foil'd,
417 | Is from the book of honour razed quite,
418 | And all the rest forgot for which he toil'd:
419 | Then happy I, that love and am beloved
420 | Where I may not remove nor be removed.
421 | Lord of my love, to whom in vassalage
422 | Thy merit hath my duty strongly knit,
423 | To thee I send this written embassage,
424 | To witness duty, not to show my wit:
425 | Duty so great, which wit so poor as mine
426 | May make seem bare, in wanting words to show it,
427 | But that I hope some good conceit of thine
428 | In thy soul's thought, all naked, will bestow it;
429 | Till whatsoever star that guides my moving
430 | Points on me graciously with fair aspect
431 | And puts apparel on my tatter'd loving,
432 | To show me worthy of thy sweet respect:
433 | Then may I dare to boast how I do love thee;
434 | Till then not show my head where thou mayst prove me.
435 | Weary with toil, I haste me to my bed,
436 | The dear repose for limbs with travel tired;
437 | But then begins a journey in my head,
438 | To work my mind, when body's work's expired:
439 | For then my thoughts, from far where I abide,
440 | Intend a zealous pilgrimage to thee,
441 | And keep my drooping eyelids open wide,
442 | Looking on darkness which the blind do see
443 | Save that my soul's imaginary sight
444 | Presents thy shadow to my sightless view,
445 | Which, like a jewel hung in ghastly night,
446 | Makes black night beauteous and her old face new.
447 | Lo! thus, by day my limbs, by night my mind,
448 | For thee and for myself no quiet find.
449 | How can I then return in happy plight,
450 | That am debarr'd the benefit of rest?
451 | When day's oppression is not eased by night,
452 | But day by night, and night by day, oppress'd?
453 | And each, though enemies to either's reign,
454 | Do in consent shake hands to torture me;
455 | The one by toil, the other to complain
456 | How far I toil, still farther off from thee.
457 | I tell the day, to please them thou art bright
458 | And dost him grace when clouds do blot the heaven:
459 | So flatter I the swart-complexion'd night,
460 | When sparkling stars twire not thou gild'st the even.
461 | But day doth daily draw my sorrows longer
462 | And night doth nightly make grief's strength seem stronger.
463 | When, in disgrace with fortune and men's eyes,
464 | I all alone beweep my outcast state
465 | And trouble deal heaven with my bootless cries
466 | And look upon myself and curse my fate,
467 | Wishing me like to one more rich in hope,
468 | Featured like him, like him with friends possess'd,
469 | Desiring this man's art and that man's scope,
470 | With what I most enjoy contented least;
471 | Yet in these thoughts myself almost despising,
472 | Haply I think on thee, and then my state,
473 | Like to the lark at break of day arising
474 | From sullen earth, sings hymns at heaven's gate;
475 | For thy sweet love remember'd such wealth brings
476 | That then I scorn to change my state with kings.
477 | When to the sessions of sweet silent thought
478 | I summon up remembrance of things past,
479 | I sigh the lack of many a thing I sought,
480 | And with old woes new wail my dear time's waste:
481 | Then can I drown an eye, unused to flow,
482 | For precious friends hid in death's dateless night,
483 | And weep afresh love's long since cancell'd woe,
484 | And moan the expense of many a vanish'd sight:
485 | Then can I grieve at grievances foregone,
486 | And heavily from woe to woe tell o'er
487 | The sad account of fore-bemoaned moan,
488 | Which I new pay as if not paid before.
489 | But if the while I think on thee, dear friend,
490 | All losses are restored and sorrows end.
491 | Thy bosom is endeared with all hearts,
492 | Which I by lacking have supposed dead,
493 | And there reigns love and all love's loving parts,
494 | And all those friends which I thought buried.
495 | How many a holy and obsequious tear
496 | Hath dear religious love stol'n from mine eye
497 | As interest of the dead, which now appear
498 | But things removed that hidden in thee lie!
499 | Thou art the grave where buried love doth live,
500 | Hung with the trophies of my lovers gone,
501 | Who all their parts of me to thee did give;
502 | That due of many now is thine alone:
503 | Their images I loved I view in thee,
504 | And thou, all they, hast all the all of me.
505 | If thou survive my well-contented day,
506 | When that churl Death my bones with dust shall cover,
507 | And shalt by fortune once more re-survey
508 | These poor rude lines of thy deceased lover,
509 | Compare them with the bettering of the time,
510 | And though they be outstripp'd by every pen,
511 | Reserve them for my love, not for their rhyme,
512 | Exceeded by the height of happier men.
513 | O, then vouchsafe me but this loving thought:
514 | 'Had my friend's Muse grown with this growing age,
515 | A dearer birth than this his love had brought,
516 | To march in ranks of better equipage:
517 | But since he died and poets better prove,
518 | Theirs for their style I'll read, his for his love.'
519 | Full many a glorious morning have I seen
520 | Flatter the mountain-tops with sovereign eye,
521 | Kissing with golden face the meadows green,
522 | Gilding pale streams with heavenly alchemy;
523 | Anon permit the basest clouds to ride
524 | With ugly rack on his celestial face,
525 | And from the forlorn world his visage hide,
526 | Stealing unseen to west with this disgrace:
527 | Even so my sun one early morn did shine
528 | With all triumphant splendor on my brow;
529 | But out, alack! he was but one hour mine;
530 | The region cloud hath mask'd him from me now.
531 | Yet him for this my love no whit disdaineth;
532 | Suns of the world may stain when heaven's sun staineth.
533 | Why didst thou promise such a beauteous day,
534 | And make me travel forth without my cloak,
535 | To let base clouds o'ertake me in my way,
536 | Hiding thy bravery in their rotten smoke?
537 | 'Tis not enough that through the cloud thou break,
538 | To dry the rain on my storm-beaten face,
539 | For no man well of such a salve can speak
540 | That heals the wound and cures not the disgrace:
541 | Nor can thy shame give physic to my grief;
542 | Though thou repent, yet I have still the loss:
543 | The offender's sorrow lends but weak relief
544 | To him that bears the strong offence's cross.
545 | Ah! but those tears are pearl which thy love sheds,
546 | And they are rich and ransom all ill deeds.
547 | No more be grieved at that which thou hast done:
548 | Roses have thorns, and silver fountains mud;
549 | Clouds and eclipses stain both moon and sun,
550 | And loathsome canker lives in sweetest bud.
551 | All men make faults, and even I in this,
552 | Authorizing thy trespass with compare,
553 | Myself corrupting, salving thy amiss,
554 | Excusing thy sins more than thy sins are;
555 | For to thy sensual fault I bring in sense--
556 | Thy adverse party is thy advocate--
557 | And 'gainst myself a lawful plea commence:
558 | Such civil war is in my love and hate
559 | That I an accessary needs must be
560 | To that sweet thief which sourly robs from me.
561 | Let me confess that we two must be twain,
562 | Although our undivided loves are one:
563 | So shall those blots that do with me remain
564 | Without thy help by me be borne alone.
565 | In our two loves there is but one respect,
566 | Though in our lives a separable spite,
567 | Which though it alter not love's sole effect,
568 | Yet doth it steal sweet hours from love's delight.
569 | I may not evermore acknowledge thee,
570 | Lest my bewailed guilt should do thee shame,
571 | Nor thou with public kindness honour me,
572 | Unless thou take that honour from thy name:
573 | But do not so; I love thee in such sort
574 | As, thou being mine, mine is thy good report.
575 | As a decrepit father takes delight
576 | To see his active child do deeds of youth,
577 | So I, made lame by fortune's dearest spite,
578 | Take all my comfort of thy worth and truth.
579 | For whether beauty, birth, or wealth, or wit,
580 | Or any of these all, or all, or more,
581 | Entitled in thy parts do crowned sit,
582 | I make my love engrafted to this store:
583 | So then I am not lame, poor, nor despised,
584 | Whilst that this shadow doth such substance give
585 | That I in thy abundance am sufficed
586 | And by a part of all thy glory live.
587 | Look, what is best, that best I wish in thee:
588 | This wish I have; then ten times happy me!"""
589 | 


--------------------------------------------------------------------------------
/inference_perf/loadgen/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .load_generator import LoadGenerator
15 | 
16 | __all__ = ["LoadGenerator"]
17 | 


--------------------------------------------------------------------------------
/inference_perf/loadgen/load_generator.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from pydantic import BaseModel
15 | from .load_timer import LoadTimer, ConstantLoadTimer, PoissonLoadTimer
16 | from inference_perf.datagen import DataGenerator
17 | from inference_perf.client.modelserver import ModelServerClient
18 | from inference_perf.config import LoadType, LoadConfig
19 | from asyncio import TaskGroup, sleep
20 | import time
21 | 
22 | 
23 | class StageRuntimeInfo(BaseModel):
24 |     stage_id: int
25 |     end_time: float
26 |     start_time: float
27 | 
28 | 
29 | class LoadGenerator:
30 |     def __init__(self, datagen: DataGenerator, load_config: LoadConfig) -> None:
31 |         self.datagen = datagen
32 |         self.stageInterval = load_config.interval
33 |         self.load_type = load_config.type
34 |         self.stages = load_config.stages
35 |         self.stage_runtime_info = dict[int, StageRuntimeInfo]()
36 | 
37 |     def get_timer(self, rate: float) -> LoadTimer:
38 |         if self.load_type == LoadType.POISSON:
39 |             return PoissonLoadTimer(rate=rate)
40 |         return ConstantLoadTimer(rate=rate)
41 | 
42 |     async def run(self, client: ModelServerClient) -> None:
43 |         for stage_id, stage in enumerate(self.stages):
44 |             timer = self.get_timer(stage.rate)
45 |             start_time = time.time()
46 |             end_time = start_time + stage.duration
47 |             print(f"Stage {stage_id} - run started")
48 |             async with TaskGroup() as tg:
49 |                 for _, (data, time_index) in enumerate(
50 |                     zip(self.datagen.get_data(), timer.start_timer(start_time), strict=True)
51 |                 ):
52 |                     now = time.time()
53 |                     if time_index < end_time and now < end_time:
54 |                         if time_index > now:
55 |                             await sleep(time_index - time.time())
56 |                         tg.create_task(client.process_request(data, stage_id))
57 |                         continue
58 |                     else:
59 |                         break
60 |             self.stage_runtime_info[stage_id] = StageRuntimeInfo(
61 |                 stage_id=stage_id, start_time=start_time, end_time=time.time()
62 |             )
63 |             print(f"Stage {stage_id} - run completed")
64 |             if self.stageInterval and stage_id < len(self.stages) - 1:
65 |                 await sleep(self.stageInterval)
66 | 


--------------------------------------------------------------------------------
/inference_perf/loadgen/load_timer.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import time
15 | from abc import ABC, abstractmethod
16 | from typing import Generator, Optional, Tuple
17 | import numpy as np
18 | 
19 | 
20 | class LoadTimer(ABC):
21 |     """Abstract base class for load generators."""
22 | 
23 |     @abstractmethod
24 |     def __init__(self, *args: Tuple[int, ...]) -> None:
25 |         # TODO: Commmon functionallity
26 |         pass
27 | 
28 |     @abstractmethod
29 |     def start_timer(self, initial: Optional[float] = None) -> Generator[float, None, None]:
30 |         """Yield the times at which requests should be made."""
31 |         raise NotImplementedError
32 | 
33 | 
34 | class ConstantLoadTimer(LoadTimer):
35 |     """
36 |     A load generator that generates requests at a constant rate.
37 |     Introduces a small amount of random noise in timing.
38 |     """
39 | 
40 |     def __init__(self, rate: float) -> None:
41 |         self._rate = rate
42 |         # TODO: Make random state a global seed
43 |         self._rand = np.random.default_rng()
44 | 
45 |     def start_timer(self, initial: Optional[float] = None) -> Generator[float, None, None]:
46 |         # Set start time
47 |         next_time = time.monotonic() if initial is None else initial
48 | 
49 |         # Given a rate, yield a time to wait before the next request
50 |         while True:
51 |             next_time += self._rand.exponential(1 / self._rate)
52 |             yield next_time
53 | 
54 | 
55 | class PoissonLoadTimer(LoadTimer):
56 |     def __init__(self, rate: float) -> None:
57 |         self._rate = rate
58 |         self._rand = np.random.default_rng()
59 | 
60 |     def start_timer(self, initial: Optional[float] = None) -> Generator[float, None, None]:
61 |         # Set start time
62 |         next_time = time.monotonic() if initial is None else initial
63 | 
64 |         # Given a rate, yield a time to wait before the next request
65 |         while True:
66 |             # How many requests in the next second
67 |             req_count = self._rand.poisson(self._rate)
68 | 
69 |             # If no requests, wait for 1 second
70 |             if req_count < 1:
71 |                 yield next_time + 1.0
72 |                 continue
73 | 
74 |             # Schedule the requests over the next second
75 |             timer = ConstantLoadTimer(req_count)
76 |             for _ in range(req_count):
77 |                 next_time = next(timer.start_timer(next_time))
78 |                 yield next_time
79 | 


--------------------------------------------------------------------------------
/inference_perf/main.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 The Kubernetes Authors.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | from typing import List, Optional
 15 | from inference_perf.loadgen import LoadGenerator
 16 | from inference_perf.config import (
 17 |     DataGenType,
 18 |     MetricsClientType,
 19 |     ModelServerType,
 20 |     ReportConfig,
 21 |     read_config,
 22 | )
 23 | from inference_perf.datagen import (
 24 |     DataGenerator,
 25 |     MockDataGenerator,
 26 |     HFShareGPTDataGenerator,
 27 |     SyntheticDataGenerator,
 28 |     RandomDataGenerator,
 29 |     SharedPrefixDataGenerator,
 30 | )
 31 | from inference_perf.client.modelserver import ModelServerClient, vLLMModelServerClient
 32 | from inference_perf.client.metricsclient import MetricsClient, PerfRuntimeParameters, PrometheusMetricsClient
 33 | from inference_perf.client.filestorage import StorageClient, GoogleCloudStorageClient
 34 | from inference_perf.reportgen import ReportGenerator
 35 | from inference_perf.utils import CustomTokenizer, ReportFile
 36 | import asyncio
 37 | import time
 38 | 
 39 | 
 40 | class InferencePerfRunner:
 41 |     def __init__(
 42 |         self,
 43 |         client: ModelServerClient,
 44 |         loadgen: LoadGenerator,
 45 |         reportgen: ReportGenerator,
 46 |         storage_clients: List[StorageClient],
 47 |     ) -> None:
 48 |         self.client = client
 49 |         self.loadgen = loadgen
 50 |         self.reportgen = reportgen
 51 |         self.storage_clients = storage_clients
 52 | 
 53 |     def run(self) -> None:
 54 |         asyncio.run(self.loadgen.run(self.client))
 55 | 
 56 |     def generate_reports(self, report_config: ReportConfig, runtime_parameters: PerfRuntimeParameters) -> List[ReportFile]:
 57 |         return asyncio.run(self.reportgen.generate_reports(report_config=report_config, runtime_parameters=runtime_parameters))
 58 | 
 59 |     def save_reports(self, reports: List[ReportFile]) -> None:
 60 |         for storage_client in self.storage_clients:
 61 |             storage_client.save_report(reports)
 62 | 
 63 | 
 64 | def main_cli() -> None:
 65 |     config = read_config()
 66 | 
 67 |     # Define Metrics Client
 68 |     metrics_client: Optional[MetricsClient] = None
 69 |     if config.metrics:
 70 |         if config.metrics.type == MetricsClientType.PROMETHEUS and config.metrics.prometheus:
 71 |             metrics_client = PrometheusMetricsClient(config=config.metrics.prometheus)
 72 | 
 73 |     # Define Storage Clients
 74 |     storage_clients: List[StorageClient] = []
 75 |     if config.storage:
 76 |         if config.storage.google_cloud_storage:
 77 |             storage_clients.append(GoogleCloudStorageClient(config=config.storage.google_cloud_storage))
 78 | 
 79 |     # Define Report Generator
 80 |     reportgen = ReportGenerator(metrics_client)
 81 | 
 82 |     # Create tokenizer based on tokenizer config
 83 |     tokenizer: Optional[CustomTokenizer] = None
 84 |     if config.tokenizer and config.tokenizer.pretrained_model_name_or_path:
 85 |         try:
 86 |             tokenizer = CustomTokenizer(config.tokenizer)
 87 |         except Exception as e:
 88 |             raise Exception("Tokenizer initialization failed") from e
 89 | 
 90 |     # Define Model Server Client
 91 |     model_server_client: ModelServerClient
 92 |     if config.server:
 93 |         if config.server.type == ModelServerType.VLLM:
 94 |             # The type error for vLLMModelServerClient's tokenizer argument indicates it expects CustomTokenizer, not Optional.
 95 |             if tokenizer is None:
 96 |                 raise Exception(
 97 |                     "vLLM client is configured, but it requires a custom tokenizer which was not provided or initialized successfully. "
 98 |                     "Please ensure a valid tokenizer is configured in the 'tokenizer' section of your config file."
 99 |                 )
100 |             model_server_client = vLLMModelServerClient(
101 |                 reportgen.get_metrics_collector(),
102 |                 api_type=config.api,
103 |                 uri=config.server.base_url,
104 |                 model_name=config.server.model_name,
105 |                 tokenizer=tokenizer,
106 |                 ignore_eos=config.server.ignore_eos,
107 |             )
108 |     else:
109 |         raise Exception("model server client config missing")
110 | 
111 |     # Define DataGenerator
112 |     datagen: DataGenerator
113 |     if config.data:
114 |         # Common checks for generators that require a tokenizer / distribution
115 |         if config.data.type in [DataGenType.ShareGPT, DataGenType.Synthetic, DataGenType.Random]:
116 |             if tokenizer is None:
117 |                 raise Exception(
118 |                     f"{config.data.type.value} data generator requires a configured tokenizer. "
119 |                     "Please ensure a valid tokenizer is configured in the 'tokenizer' section of your config file."
120 |                 )
121 |         if config.data.type in [DataGenType.Synthetic, DataGenType.Random]:
122 |             if config.data.input_distribution is None:
123 |                 raise Exception(f"{config.data.type.value} data generator requires 'input_distribution' to be configured")
124 |             if config.data.output_distribution is None:
125 |                 raise Exception(f"{config.data.type.value} data generator requires 'output_distribution' to be configured")
126 |         if config.data.type == DataGenType.SharedPrefix and config.data.shared_prefix is None:
127 |             raise Exception(f"{config.data.type.value} data generator requires 'shared_prefix' to be configured")
128 | 
129 |         if config.data.type == DataGenType.ShareGPT:
130 |             datagen = HFShareGPTDataGenerator(config.api, config.data, tokenizer)
131 |         elif config.data.type == DataGenType.Synthetic:
132 |             datagen = SyntheticDataGenerator(config.api, config.data, tokenizer)
133 |         elif config.data.type == DataGenType.Random:
134 |             datagen = RandomDataGenerator(config.api, config.data, tokenizer)
135 |         elif config.data.type == DataGenType.SharedPrefix:
136 |             datagen = SharedPrefixDataGenerator(config.api, config.data, tokenizer)
137 |         else:
138 |             datagen = MockDataGenerator(config.api, config.data, tokenizer)
139 |     else:
140 |         raise Exception("data config missing")
141 | 
142 |     # Define LoadGenerator
143 |     if config.load:
144 |         if isinstance(metrics_client, PrometheusMetricsClient) and config.report.prometheus.per_stage:
145 |             config.load.interval = max(config.load.interval, metrics_client.scrape_interval)
146 |         loadgen = LoadGenerator(datagen, config.load)
147 |     else:
148 |         raise Exception("load config missing")
149 | 
150 |     # Setup Perf Test Runner
151 |     perfrunner = InferencePerfRunner(model_server_client, loadgen, reportgen, storage_clients)
152 | 
153 |     start_time = time.time()
154 | 
155 |     # Run Perf Test
156 |     perfrunner.run()
157 | 
158 |     end_time = time.time()
159 |     duration = end_time - start_time  # Calculate the duration of the test
160 | 
161 |     # Generate Reports after the tests
162 |     reports = perfrunner.generate_reports(
163 |         report_config=config.report,
164 |         runtime_parameters=PerfRuntimeParameters(
165 |             start_time=start_time,
166 |             duration=duration,
167 |             model_server_client=model_server_client,
168 |             stages=loadgen.stage_runtime_info,
169 |         ),
170 |     )
171 | 
172 |     # Save Reports
173 |     perfrunner.save_reports(reports=reports)
174 | 
175 | 
176 | if __name__ == "__main__":
177 |     main_cli()
178 | 


--------------------------------------------------------------------------------
/inference_perf/reportgen/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .base import ReportGenerator
15 | 
16 | __all__ = ["ReportGenerator"]
17 | 


--------------------------------------------------------------------------------
/inference_perf/reportgen/base.py:
--------------------------------------------------------------------------------
  1 | # Copyright 2025 The Kubernetes Authors.
  2 | #
  3 | # Licensed under the Apache License, Version 2.0 (the "License");
  4 | # you may not use this file except in compliance with the License.
  5 | # You may obtain a copy of the License at
  6 | #
  7 | # http://www.apache.org/licenses/LICENSE-2.0
  8 | #
  9 | # Unless required by applicable law or agreed to in writing, software
 10 | # distributed under the License is distributed on an "AS IS" BASIS,
 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 12 | # See the License for the specific language governing permissions and
 13 | # limitations under the License.
 14 | from typing import List, Optional, Any
 15 | from pydantic import BaseModel
 16 | from collections import defaultdict
 17 | from inference_perf.client.metricsclient.base import ModelServerMetrics
 18 | from inference_perf.client.metricsclient.prometheus_client import PrometheusMetricsClient
 19 | from inference_perf.config import ReportConfig, PrometheusMetricsReportConfig
 20 | from inference_perf.client.metricsclient import MetricsClient, PerfRuntimeParameters
 21 | from inference_perf.utils import ReportFile
 22 | from inference_perf.client.requestdatacollector import LocalRequestDataCollector, RequestDataCollector
 23 | from inference_perf.apis import RequestLifecycleMetric
 24 | import numpy as np
 25 | 
 26 | 
 27 | def safe_float(value: Any) -> float:
 28 |     """NOTE: Only for use in summarize_requests after validating safe access"""
 29 |     try:
 30 |         return float(value)
 31 |     except (TypeError, ValueError):
 32 |         return 0
 33 | 
 34 | 
 35 | def summarize(items: List[float]) -> Optional[dict[str, float]]:
 36 |     return (
 37 |         {
 38 |             "mean": float(np.mean(items)),
 39 |             "min": float(np.min(items)),
 40 |             "p10": float(np.percentile(items, 10)),
 41 |             "p50": float(np.percentile(items, 50)),
 42 |             "p90": float(np.percentile(items, 90)),
 43 |             "max": float(np.max(items)),
 44 |         }
 45 |         if len(items) != 0
 46 |         else None
 47 |     )
 48 | 
 49 | 
 50 | class ResponsesSummary(BaseModel):
 51 |     load_summary: dict[str, Any]
 52 |     successes: dict[str, Any]
 53 |     failures: dict[str, Any]
 54 | 
 55 | 
 56 | def summarize_prometheus_metrics(metrics: ModelServerMetrics) -> ResponsesSummary:
 57 |     return ResponsesSummary(
 58 |         load_summary={},  # model server doesn't report failed requests
 59 |         failures={},
 60 |         successes={
 61 |             "count": metrics.total_requests,
 62 |             "rate": metrics.requests_per_second,
 63 |             "prompt_len": {
 64 |                 "mean": metrics.avg_prompt_tokens,
 65 |                 "rate": metrics.prompt_tokens_per_second,
 66 |             },
 67 |             "output_len": {
 68 |                 "mean": metrics.avg_output_tokens,
 69 |                 "rate": metrics.output_tokens_per_second,
 70 |             },
 71 |             "queue_len": {
 72 |                 "mean": metrics.avg_queue_length,
 73 |             },
 74 |             "request_latency": {
 75 |                 "mean": metrics.avg_request_latency,
 76 |                 "p50": metrics.median_request_latency,
 77 |                 "p90": metrics.p90_request_latency,
 78 |                 "p99": metrics.p99_request_latency,
 79 |             },
 80 |             "time_to_first_token": {
 81 |                 "mean": metrics.avg_time_to_first_token,
 82 |                 "p50": metrics.median_time_to_first_token,
 83 |                 "p90": metrics.p90_time_to_first_token,
 84 |                 "p99": metrics.p99_time_to_first_token,
 85 |             },
 86 |             "time_per_output_token": {
 87 |                 "mean": metrics.avg_time_per_output_token,
 88 |                 "p50": metrics.median_time_per_output_token,
 89 |                 "p90": metrics.p90_time_per_output_token,
 90 |                 "p99": metrics.p99_time_per_output_token,
 91 |             },
 92 |         },
 93 |     )
 94 | 
 95 | 
 96 | def summarize_requests(metrics: List[RequestLifecycleMetric]) -> ResponsesSummary:
 97 |     all_successful: List[RequestLifecycleMetric] = [x for x in metrics if x.error is None]
 98 |     all_failed: List[RequestLifecycleMetric] = [x for x in metrics if x.error is not None]
 99 | 
100 |     total_time = max(x.end_time for x in metrics) - min(x.start_time for x in metrics)
101 | 
102 |     return ResponsesSummary(
103 |         load_summary={
104 |             "count": len(metrics),
105 |         },
106 |         successes={
107 |             "count": len(all_successful),
108 |             "throughput": {
109 |                 "input_tokens_per_sec": sum(x.info.input_tokens for x in all_successful) / total_time,
110 |                 "output_tokens_per_sec": sum(x.info.output_tokens for x in all_successful) / total_time,
111 |                 "total_tokens_per_sec": sum((x.info.input_tokens + x.info.output_tokens) for x in all_successful) / total_time,
112 |                 "requests_per_sec": len(all_successful) / total_time,
113 |             },
114 |             "request_latency": summarize([(successful.end_time - successful.start_time) for successful in all_successful]),
115 |             "prompt_len": summarize([safe_float(success.info.input_tokens) for success in all_successful]),
116 |             "output_len": summarize([float(v) for success in all_successful if (v := success.info.output_tokens) is not None]),
117 |             "normalized_time_per_output_token": summarize(
118 |                 [
119 |                     ((metric.end_time - metric.start_time) / output_len) if output_len and output_len != 0 else 0
120 |                     for metric in all_successful
121 |                     for output_len in [safe_float(metric.info.output_tokens)]
122 |                 ]
123 |             ),
124 |         },
125 |         failures={
126 |             "count": len(all_failed),
127 |             "request_latency": summarize([(failed.end_time - failed.start_time) for failed in all_failed]),
128 |         },
129 |     )
130 | 
131 | 
132 | class ReportGenerator:
133 |     def __init__(
134 |         self,
135 |         metrics_client: Optional[MetricsClient],
136 |     ) -> None:
137 |         self.metrics_collector = LocalRequestDataCollector()
138 |         self.metrics_client = metrics_client
139 | 
140 |     def get_metrics_collector(self) -> RequestDataCollector:
141 |         """
142 |         Returns the metrics collector.
143 |         """
144 |         return self.metrics_collector
145 | 
146 |     async def generate_reports(
147 |         self, report_config: ReportConfig, runtime_parameters: PerfRuntimeParameters
148 |     ) -> List[ReportFile]:
149 |         print("\n\nGenerating Reports ..")
150 |         lifecycle_reports = []
151 |         request_metrics = self.metrics_collector.get_metrics()
152 |         if report_config.request_lifecycle.summary:
153 |             if len(request_metrics) != 0:
154 |                 report_file = ReportFile(
155 |                     name="summary_lifecycle_metrics",
156 |                     contents=summarize_requests(request_metrics).model_dump(),
157 |                 )
158 |                 lifecycle_reports.append(report_file)
159 |                 if report_file.path is not None:
160 |                     print(f"Successfully saved summary report of request lifecycle metrics to {report_file.path}")
161 | 
162 |         if report_config.request_lifecycle.per_stage:
163 |             stage_buckets: dict[int, List[RequestLifecycleMetric]] = defaultdict(list)
164 |             for metric in request_metrics:
165 |                 if metric.stage_id is not None:
166 |                     stage_buckets[metric.stage_id].append(metric)
167 |             for stage_id, metrics in stage_buckets.items():
168 |                 report_file = ReportFile(
169 |                     name=f"stage_{stage_id}_lifecycle_metrics",
170 |                     contents=summarize_requests(metrics).model_dump(),
171 |                 )
172 |                 lifecycle_reports.append(report_file)
173 |                 if report_file is not None:
174 |                     print(f"Successfully saved stage {stage_id} report of request lifecycle metrics to {report_file.path}")
175 | 
176 |         if report_config.request_lifecycle.per_request:
177 |             report_file = ReportFile(
178 |                 name="per_request_lifecycle_metrics",
179 |                 contents=[
180 |                     {
181 |                         "start_time": metric.start_time,
182 |                         "end_time": metric.end_time,
183 |                         "request": metric.request_data,
184 |                         "response": metric.response_data,
185 |                     }
186 |                     for metric in request_metrics
187 |                 ],
188 |             )
189 |             lifecycle_reports.append(report_file)
190 |             if report_file is not None:
191 |                 print(f"Successfully saved per request report of request lifecycle metrics to {report_file.path}")
192 | 
193 |         lifecycle_reports.extend(self.generate_prometheus_metrics_report(runtime_parameters, report_config.prometheus))
194 |         return lifecycle_reports
195 | 
196 |     def generate_prometheus_metrics_report(
197 |         self, runtime_parameters: PerfRuntimeParameters, report_config: PrometheusMetricsReportConfig
198 |     ) -> List[ReportFile]:
199 |         """
200 |         Report summary of the metrics collected by the metrics client during the run.
201 |         Args:
202 |             runtime_parameters (PerfRuntimeParameters): The runtime parameters containing the model server client, query eval time in the metrics db, duration.
203 |         """
204 |         prometheus_metrics_reports: List[ReportFile] = []
205 | 
206 |         if self.metrics_client is None or not isinstance(self.metrics_client, PrometheusMetricsClient):
207 |             print("Prometheus Metrics Client is not configured or not of type PrometheusMetricsClient")
208 |             return prometheus_metrics_reports
209 | 
210 |         # Wait for Prometheus to collect metrics for the last stage
211 |         self.metrics_client.wait()
212 | 
213 |         if report_config.summary:
214 |             collected_metrics = self.metrics_client.collect_metrics_summary(runtime_parameters)
215 |             if collected_metrics is not None:
216 |                 report_file = ReportFile(
217 |                     name="summary_prometheus_metrics",
218 |                     contents=summarize_prometheus_metrics(collected_metrics).model_dump(),
219 |                 )
220 |                 if report_file is not None:
221 |                     print(f"Successfully saved summary report of prometheus metrics to {report_file.path}")
222 |                 prometheus_metrics_reports.append(report_file)
223 |             else:
224 |                 print("Report generation failed - no metrics collected by metrics client")
225 | 
226 |         if report_config.per_stage:
227 |             for stage_id, _stage_info in runtime_parameters.stages.items():
228 |                 collected_metrics = self.metrics_client.collect_metrics_for_stage(runtime_parameters, stage_id)
229 |                 if collected_metrics is not None:
230 |                     report_file = ReportFile(
231 |                         name=f"stage_{stage_id}_prometheus_metrics",
232 |                         contents=summarize_prometheus_metrics(collected_metrics).model_dump(),
233 |                     )
234 |                     if report_file is not None:
235 |                         print(f"Successfully saved stage {stage_id} report of prometheus metrics to {report_file.path}")
236 |                     prometheus_metrics_reports.append(report_file)
237 |                 else:
238 |                     print(f"No metrics collected for Stage {stage_id}")
239 | 
240 |         return prometheus_metrics_reports
241 | 


--------------------------------------------------------------------------------
/inference_perf/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from .custom_tokenizer import CustomTokenizer
15 | from .report_file import ReportFile
16 | 
17 | __all__ = ["CustomTokenizer", "ReportFile"]
18 | 


--------------------------------------------------------------------------------
/inference_perf/utils/custom_tokenizer.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from transformers import AutoTokenizer, PreTrainedTokenizerBase
15 | from inference_perf.config import CustomTokenizerConfig
16 | 
17 | 
18 | class CustomTokenizer:
19 |     def __init__(self, config: CustomTokenizerConfig):
20 |         self.tokenizer: PreTrainedTokenizerBase = AutoTokenizer.from_pretrained(
21 |             config.pretrained_model_name_or_path, token=config.token, trust_remote_code=config.trust_remote_code
22 |         )
23 | 
24 |     def count_tokens(self, text: str) -> int:
25 |         if text == "":
26 |             return 0
27 |         return len(self.tokenizer(text).input_ids)
28 | 
29 |     def get_tokenizer(self) -> PreTrainedTokenizerBase:
30 |         return self.tokenizer
31 | 


--------------------------------------------------------------------------------
/inference_perf/utils/distribution.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import numpy as np
15 | from numpy.typing import NDArray
16 | 
17 | 
18 | def generate_distribution(min: int, max: int, mean: float, std_dev: float, total_count: int) -> NDArray[np.int_]:
19 |     """
20 |     Generates an array of lengths in integer adhering to the specified distribution constraints.
21 | 
22 |     Args:
23 |         min: The minimum allowed length.
24 |         max: The maximum allowed length.
25 |         mean: The target mean of the distribution.
26 |         std_dev: The target standard deviation of the distribution.
27 |         total_count: The total number of lengths to generate.
28 | 
29 |     Returns:
30 |         A numpy array of integers representing lengths for input prompts or output generations.
31 | 
32 |     Raises:
33 |         ValueError: If constraints are impossible (e.g., min_val > max_val).
34 |     """
35 |     if min > max:
36 |         raise ValueError("Minimum value cannot be greater than maximum value.")
37 |     if total_count <= 0:
38 |         raise ValueError("Total count must be a positive integer.")
39 |     if std_dev < 0:
40 |         raise ValueError("Standard deviation cannot be negative.")
41 |     if mean < min or mean > max:
42 |         raise ValueError("Mean cannot be outside min and max range.")
43 | 
44 |     # Generate floating-point numbers from a normal distribution
45 |     # Use a large enough intermediate pool if std_dev is high relative to range
46 |     # to increase chances of getting values within bounds after generation.
47 |     # This is a heuristic; perfect adherence isn't guaranteed.
48 |     generated_numbers = np.random.normal(loc=mean, scale=std_dev, size=total_count)
49 | 
50 |     # Clip the numbers to the specified min/max range
51 |     clipped_numbers = np.clip(generated_numbers, min, max)
52 | 
53 |     # Round to the nearest integer and convert type
54 |     generated_lengths = np.round(clipped_numbers).astype(int)
55 | 
56 |     # Ensure integer values are strictly within bounds after rounding
57 |     # (e.g., rounding 4.6 when max is 4 could result in 5 without this)
58 |     generated_lengths = np.clip(generated_lengths, min, max)
59 | 
60 |     return generated_lengths
61 | 


--------------------------------------------------------------------------------
/inference_perf/utils/report_file.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2025 The Kubernetes Authors.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | # http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | 
15 | import json
16 | import os
17 | from typing import Any, Optional
18 | 
19 | 
20 | class ReportFile:
21 |     name: str
22 |     contents: Any
23 |     path: Optional[str] = None
24 | 
25 |     def __init__(self, name: str, contents: Any):
26 |         self.name = f"{name}.json"
27 |         self.contents = contents
28 |         self._store_locally()
29 | 
30 |     def _store_locally(self) -> None:
31 |         filename = self.get_filename()
32 |         contents = self.get_contents()
33 |         with open(filename, "w", encoding="utf-8") as f:
34 |             f.write(json.dumps(contents, indent=2))
35 |             self.path = os.path.abspath(filename)
36 | 
37 |     def get_filename(self) -> str:
38 |         return self.name
39 | 
40 |     def get_contents(self) -> Any:
41 |         return self.contents
42 | 


--------------------------------------------------------------------------------
/pdm.lock:
--------------------------------------------------------------------------------
 1 | # This file is @generated by PDM.
 2 | # It is not intended for manual editing.
 3 | 
 4 | [metadata]
 5 | groups = ["default", "dev"]
 6 | strategy = ["inherit_metadata"]
 7 | targets = []
 8 | lock_version = "4.5.0"
 9 | content_hash = "sha256:94dc44bb0ca871cbddd7f324cc3d79c8a503ad6cd6e18f147cc2867ca93d73c7"
10 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [project]
 2 | name = "inference-perf"
 3 | version = "0.0.1"
 4 | description = "A GenAI inference performance benchmarking tool."
 5 | authors = []
 6 | dependencies = [
 7 |     "aiohttp>=3.11.11",
 8 |     "pydantic>=2.10.6",
 9 |     "numpy>=2.2.2",
10 |     "datasets>=3.3.2",
11 |     "transformers>=4.50.2",
12 |     "google-cloud-storage>=3.1.0",
13 | ]
14 | requires-python = ">=3.12"
15 | readme = "README.md"
16 | license = {text = "Apache-2.0"}
17 | 
18 | [project.scripts]
19 | inference-perf = "inference_perf:main_cli"
20 | 
21 | [project.optional-dependencies]
22 | dev = [
23 |     "mypy>=1.14.1",
24 |     "ruff>=0.9.4",
25 |     "pre-commit>=4.1.0",
26 |     "pytest>=8.3.4",
27 |     "types-PyYAML>=6.0.12.20241230",
28 |     "ipykernel>=6.29.5",
29 |     "types-requests>=2.32.0.20250328",
30 | ]
31 | 
32 | [tool.ruff]
33 | # The GitHub editor is 127 chars wide
34 | line-length = 127
35 | indent-width = 4
36 | 
37 | [tool.ruff.lint]
38 | # Enable Pyflakes (`F`) and a subset of the pycodestyle (`E`)  codes by default.
39 | # On top of the defaults (`E4`, E7`, `E9`, and `F`), enable flake8-bugbear (`B`)
40 | # Unlike Flake8, Ruff doesn't enable pycodestyle warnings (`W`) or
41 | # McCabe complexity (`C901`) by default.
42 | select = ["E4", "E7", "E9", "F", "B"]
43 | ignore = []
44 | 
45 | # Allow fix for all enabled rules (when `--fix`) is provided.
46 | fixable = ["ALL"]
47 | unfixable = []
48 | 
49 | [tool.ruff.format]
50 | # Like Black, use double quotes for strings.
51 | quote-style = "double"
52 | 
53 | # Like Black, indent with spaces, rather than tabs.
54 | indent-style = "space"
55 | 
56 | # Like Black, respect magic trailing commas.
57 | skip-magic-trailing-comma = false
58 | 
59 | # Like Black, automatically detect the appropriate line ending.
60 | line-ending = "auto"
61 | 
62 | # Enable auto-formatting of code examples in docstrings. Markdown,
63 | # reStructuredText code/literal blocks and doctests are all supported.
64 | #
65 | # This is currently disabled by default, but it is planned for this
66 | # to be opt-out in the future.
67 | docstring-code-format = false
68 | 
69 | # Set the line length limit used when formatting code snippets in
70 | # docstrings.
71 | #
72 | # This only has an effect when the `docstring-code-format` setting is
73 | # enabled.
74 | docstring-code-line-length = "dynamic"
75 | 
76 | [tool.pdm]
77 | distribution = true
78 | 
79 | [tool.pytest.ini_options]
80 | testpaths = ["."]
81 | python_files = ["test_*.py"]
82 | python_classes = ["Test*"]
83 | python_functions = ["test_*"]
84 | 
85 | [tool.setuptools.packages.find]
86 | where = ["."]
87 | include = ["inference_perf*", "deploy*"]
88 | 
89 | [[tool.mypy.overrides]]
90 | module = ["datasets.*"]
91 | ignore_missing_imports = true
92 | 
93 | [tool.mypy]
94 | disable_error_code = ["attr-defined"]
95 | 
96 | [build-system]
97 | requires = ["setuptools>=61"]
98 | build-backend = "setuptools.build_meta"


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | -e . # Install requirements from pyproject
2 | 


--------------------------------------------------------------------------------
/tests/apis/test_chat.py:
--------------------------------------------------------------------------------
 1 | from inference_perf.apis.chat import ChatCompletionAPIData, ChatMessage
 2 | from inference_perf.config import APIType
 3 | 
 4 | 
 5 | def test_chat_completion_api_data() -> None:
 6 |     data = ChatCompletionAPIData(messages=[ChatMessage(role="user", content="Hello, world!")])
 7 |     assert data.get_api_type() == APIType.Chat
 8 |     assert len(data.messages) == 1
 9 |     assert data.to_payload("test-model", 100, False) == {
10 |         "model": "test-model",
11 |         "messages": [{"role": "user", "content": "Hello, world!"}],
12 |         "max_tokens": 100,
13 |         "ignore_eos": False,
14 |     }
15 | 


--------------------------------------------------------------------------------
/tests/apis/test_completion.py:
--------------------------------------------------------------------------------
 1 | from inference_perf.apis.completion import CompletionAPIData
 2 | from inference_perf.config import APIType
 3 | 
 4 | 
 5 | def test_completion_api_data() -> None:
 6 |     data = CompletionAPIData(prompt="Hello, world!")
 7 |     assert data.get_api_type() == APIType.Completion
 8 |     assert data.prompt == "Hello, world!"
 9 |     assert data.to_payload("test-model", 100, False) == {
10 |         "model": "test-model",
11 |         "prompt": "Hello, world!",
12 |         "max_tokens": 100,
13 |         "ignore_eos": False,
14 |     }
15 | 


--------------------------------------------------------------------------------
/tests/test_config.py:
--------------------------------------------------------------------------------
 1 | from inference_perf.config import read_config, deep_merge, Config, APIType, DataGenType, LoadType, MetricsClientType
 2 | import os
 3 | 
 4 | 
 5 | def test_read_config() -> None:
 6 |     config_path = os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "config.yml"))
 7 |     config = read_config(["-c", config_path])
 8 | 
 9 |     assert isinstance(config, Config)
10 |     assert config.api == APIType.Chat
11 |     assert config.data.type == DataGenType.ShareGPT
12 |     assert config.load.type == LoadType.CONSTANT
13 |     if config.metrics:
14 |         assert config.metrics.type == MetricsClientType.PROMETHEUS
15 |     assert config.report.request_lifecycle.summary is True
16 | 
17 | 
18 | def test_deep_merge() -> None:
19 |     base = {
20 |         "api": APIType.Chat,
21 |         "data": {"type": DataGenType.ShareGPT},
22 |         "load": {"type": LoadType.CONSTANT},
23 |         "metrics": {"type": MetricsClientType.PROMETHEUS},
24 |     }
25 |     override = {
26 |         "data": {"type": DataGenType.Mock},
27 |         "load": {"type": LoadType.POISSON},
28 |     }
29 |     merged = deep_merge(base, override)
30 | 
31 |     assert merged["api"] == APIType.Chat
32 |     assert merged["data"]["type"] == DataGenType.Mock
33 |     assert merged["load"]["type"] == LoadType.POISSON
34 |     assert merged["metrics"]["type"] == MetricsClientType.PROMETHEUS
35 | 


--------------------------------------------------------------------------------