├── .dockerignore ├── .github └── workflows │ ├── build.yaml │ ├── build_ci.yaml │ ├── rust.yaml │ └── trufflehog.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Cargo.toml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── assets ├── dashboard.png └── ui.png ├── build.rs ├── extra ├── dashboard │ ├── app.py │ └── parse_results.py ├── k8s │ └── inference-benchmarker │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── _helpers.tpl │ │ ├── benchmark.yaml │ │ ├── checks.yaml │ │ ├── configmap.yml │ │ ├── secret.yaml │ │ ├── service.yaml │ │ ├── tgi.yaml │ │ └── vllm.yaml │ │ └── values.yaml └── slurm │ ├── benchmark.py │ ├── tgi.slurm │ └── vllm.slurm ├── poetry.lock ├── pyproject.toml ├── src ├── app.rs ├── benchmark.rs ├── event.rs ├── executors.rs ├── flux.rs ├── lib.rs ├── main.rs ├── profiles.rs ├── requests.rs ├── results.rs ├── scheduler.rs ├── table.rs └── writers.rs └── test_data ├── chat.json └── conversations.json /.dockerignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.txt 3 | .git 4 | .idea 5 | target -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: Build and push docker image to registry 2 | 3 | on: 4 | workflow_call: 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }}-build 7 | cancel-in-progress: true 8 | jobs: 9 | build-and-push: 10 | runs-on: 11 | group: aws-general-8-plus 12 | permissions: 13 | packages: write 14 | contents: read 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | - name: Inject slug/short variables 19 | uses: rlespinasse/github-slug-action@v4.4.1 20 | - name: Initialize Docker Buildx 21 | uses: docker/setup-buildx-action@v3 22 | with: 23 | install: true 24 | buildkitd-config: /tmp/buildkitd.toml 25 | - name: Login to GitHub Container Registry 26 | uses: docker/login-action@v3 27 | with: 28 | registry: ghcr.io 29 | username: ${{ github.actor }} 30 | password: ${{ secrets.GITHUB_TOKEN }} 31 | # If pull request 32 | - name: Extract metadata (tags, labels) for Docker 33 | if: ${{ github.event_name == 'pull_request' }} 34 | id: meta-pr 35 | uses: docker/metadata-action@v5 36 | with: 37 | images: | 38 | ghcr.io/huggingface/inference-benchmarker 39 | tags: | 40 | type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }} 41 | # If main, release or tag 42 | - name: Extract metadata (tags, labels) for Docker 43 | if: ${{ github.event_name != 'pull_request' }} 44 | id: meta 45 | uses: docker/metadata-action@v4.3.0 46 | with: 47 | flavor: | 48 | latest=auto 49 | images: | 50 | ghcr.io/huggingface/inference-benchmarker 51 | tags: | 52 | type=semver,pattern={{version}} 53 | type=semver,pattern={{major}}.{{minor}} 54 | type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }} 55 | type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }} 56 | - name: Build and push Docker image 57 | id: build-and-push 58 | uses: docker/build-push-action@v4 59 | with: 60 | context: . 
61 | file: ${{ env.DOCKERFILE }} 62 | push: true 63 | platforms: 'linux/amd64' 64 | build-args: | 65 | GIT_SHA=${{ env.GITHUB_SHA }} 66 | DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }} 67 | PLATFORM=${{ env.PLATFORM }} 68 | tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} 69 | labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} 70 | - name: Extract binary from Docker 71 | if: ${{ github.event_name != 'pull_request' }} 72 | run: | 73 | docker create --name extract ghcr.io/huggingface/inference-benchmarker:sha-${{ env.GITHUB_SHA_SHORT }} 74 | docker cp extract:/usr/local/bin/inference-benchmarker ${{ github.workspace }}/inference-benchmarker 75 | docker rm -f extract 76 | - name: Upload binary 77 | if: ${{ github.event_name != 'pull_request' }} 78 | uses: actions/upload-artifact@v4 79 | with: 80 | name: inference-benchmarker_x86_64 81 | path: ${{ github.workspace }}/inference-benchmarker 82 | 83 | -------------------------------------------------------------------------------- /.github/workflows/build_ci.yaml: -------------------------------------------------------------------------------- 1 | name: Build inference-benchmarker 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_call: 6 | push: 7 | branches: 8 | - 'main' 9 | tags: 10 | - 'v*' 11 | pull_request: 12 | branches: 13 | - "main" 14 | 15 | concurrency: 16 | group: ${{ github.workflow }}-${{ github.ref }} 17 | cancel-in-progress: true 18 | 19 | jobs: 20 | rust: 21 | name: Rust checks 22 | permissions: 23 | pull-requests: write 24 | contents: read 25 | uses: ./.github/workflows/rust.yaml 26 | build: 27 | permissions: 28 | packages: write 29 | contents: read 30 | name: Build and push docker image 31 | uses: ./.github/workflows/build.yaml 32 | needs: rust -------------------------------------------------------------------------------- /.github/workflows/rust.yaml: -------------------------------------------------------------------------------- 1 | name: Rust checks 2 | 3 | on: 4 | workflow_call: 5 | 6 | permissions: 7 | pull-requests: write 8 | contents: read 9 | 10 | jobs: 11 | fmt: 12 | name: Rustfmt 13 | runs-on: 14 | group: aws-general-8-plus 15 | steps: 16 | - uses: actions/checkout@v4 17 | - uses: dtolnay/rust-toolchain@stable 18 | with: 19 | toolchain: nightly 20 | components: rustfmt 21 | - run: cargo fmt --all -- --check 22 | 23 | clippy: 24 | name: Clippy 25 | runs-on: 26 | group: aws-general-8-plus 27 | steps: 28 | - uses: actions/checkout@v4 29 | - uses: dtolnay/rust-toolchain@stable 30 | with: 31 | toolchain: stable 32 | components: clippy 33 | - name: Cache dependencies 34 | uses: actions/cache@v4 35 | env: 36 | cache-name: cache-dependencies 37 | with: 38 | path: | 39 | ~/.cargo/.crates.toml 40 | ~/.cargo/.crates2.json 41 | ~/.cargo/bin 42 | ~/.cargo/registry/index 43 | ~/.cargo/registry/cache 44 | ./target 45 | key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('app/Cargo.lock') }} 46 | - uses: giraffate/clippy-action@v1 47 | with: 48 | github_token: ${{ secrets.GITHUB_TOKEN }} 49 | clippy_flags: -- -Dwarnings 50 | tool_name: clippy-action-output 51 | fail_on_error: true 52 | filter_mode: file 53 | level: info 54 | test: 55 | name: Test 56 | runs-on: 57 | group: aws-general-8-plus 58 | steps: 59 | - uses: actions/checkout@v4 60 | - uses: dtolnay/rust-toolchain@stable 61 | with: 62 | toolchain: stable 63 | - name: Cache dependencies 64 | uses: actions/cache@v4 65 | env: 66 | cache-name: cache-dependencies-test 67 | with: 68 | path: | 69 | ~/.cargo/.crates.toml 70 | 
~/.cargo/.crates2.json 71 | ~/.cargo/bin 72 | ~/.cargo/registry/index 73 | ~/.cargo/registry/cache 74 | ./target 75 | key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('app/Cargo.lock') }} 76 | - name: Run unit tests 77 | run: | 78 | cargo test --package inference-benchmarker 79 | 80 | -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yaml: -------------------------------------------------------------------------------- 1 | name: Secret Leaks 2 | 3 | on: 4 | push: 5 | 6 | jobs: 7 | trufflehog: 8 | runs-on: 9 | group: aws-general-8-plus 10 | steps: 11 | - name: Checkout code 12 | uses: actions/checkout@v4 13 | with: 14 | fetch-depth: 0 15 | - name: Secret Scanning 16 | uses: trufflesecurity/trufflehog@main 17 | with: 18 | extra_args: --exclude-globs='test_data/.*' --results=verified,unknown --exclude-detectors=postgres 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | debug/ 2 | target/ 3 | Cargo.lock 4 | **/*.rs.bk 5 | *.pdb 6 | __pycache__/ 7 | 8 | *.json 9 | *.txt 10 | results 11 | *.parquet 12 | *.dev.* -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | feedback@huggingface.co. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "inference-benchmarker" 3 | version = "1.1.0" 4 | edition = "2021" 5 | authors = ["Hugo Larcher "] 6 | repository = "https://github.com/huggingface/inference-benchmarker" 7 | readme = "README.md" 8 | license = "Apache-2.0" 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | reqwest-eventsource = "0.6.0" 14 | log = "0.4.22" 15 | serde_json = "1.0.127" 16 | serde = { version = "1.0.209", features = ["derive"] } 17 | tokio = { version = "1.40.0", features = ["rt", "rt-multi-thread", "macros", "signal"] } 18 | anyhow = "1.0.86" 19 | tokenizers = { version = "0.21.1", features = ["http"] } 20 | rand_distr = "0.5.1" 21 | rand = "0.9.0" 22 | futures-util = "0.3.30" 23 | env_logger = "0.11.5" 24 | async-trait = "0.1.82" 25 | reqwest = { version = "0.12.7", features = ["json"] } 26 | strum_macros = "0.27.1" 27 | clap = { version = "4.5.17", features = ["derive", "env"] } 28 | humantime = "2.1.0" 29 | ratatui = { version = "0.29.0", features = ["all-widgets"] } 30 | crossterm = "0.28.1" 31 | chrono = "0.4.38" 32 | hf-hub = { version = "0.4.2", features = ["tokio"] } 33 | indicatif = "0.17.8" 34 | rayon = "1.10.0" 35 | serde_with = "3.9.0" 36 | sysinfo = "0.33.1" 37 | mockito = "1.5.0" 38 | tabled = "=0.14" 39 | uuid = { version = "1.11.0", features = ["v4", "fast-rng"] } 40 | 41 | [build-dependencies] 42 | vergen-gitcl = { version = "1.0.1" } 43 | 44 | [[bin]] 45 | name = "inference-benchmarker" 46 | path = "src/main.rs" 47 | test = false 48 | bench = false -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:1-bullseye AS builder 2 | LABEL org.opencontainers.image.source=https://github.com/huggingface/inference-benchmarker 3 | LABEL org.opencontainers.image.description="A benchmark tool for LLM inference engines" 4 | LABEL org.opencontainers.image.licenses="Apache-2.0" 5 | ARG GIT_SHA 6 | WORKDIR /usr/src/inference-benchmarker 7 | COPY . . 8 | RUN cargo install --path . 9 | FROM debian:bullseye-slim 10 | RUN apt update && apt install -y ca-certificates && rm -rf /var/lib/apt/lists/* 11 | RUN mkdir -p /opt/inference-benchmarker/results 12 | WORKDIR /opt/inference-benchmarker 13 | COPY --from=builder /usr/local/cargo/bin/inference-benchmarker /usr/local/bin/inference-benchmarker 14 | CMD ["inference-benchmarker"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2024 Hugging Face 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | cargo build --release --package inference-benchmarker --bin inference-benchmarker 3 | 4 | run: build 5 | cargo run --package inference-benchmarker --bin inference-benchmarker -- $@ 6 | 7 | test: 8 | cargo test --package inference-benchmarker 9 | 10 | lint: 11 | cargo +nightly fmt 12 | cargo clippy --package inference-benchmarker 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Inference Benchmarker 2 | **The best tool for benchmarking inference engines and LLM performance** 3 | 4 | 5 | Benchmarking inference servers for text generation models presents unique challenges. 6 | The performance of LLMs can vary greatly depending on factors like input prompts, 7 | decoding strategies, hardware specifications, and server configurations. 8 | 9 | **Inference Benchmarker** is designed to streamline this process by providing a comprehensive benchmarking tool 10 | that evaluates the real-world performance of text generation models and servers. 11 | With **Inference Benchmarker**, you can easily test your model's throughput and efficiency under various workloads, 12 | identify performance bottlenecks, and optimize your deployment for production environments. 13 | 14 | It can be used to benchmark any text generation server that exposes an OpenAI-compliant API. 15 | 16 | ## Features 17 | 18 | * Broad Compatibility: Benchmarks any text generation server with an OpenAI-compliant chat API. 19 | * Automatic Sweep Mode: Detects the maximum throughput and sweeps QPS values up to it. 20 | * Open-Loop Benchmarking: Uses constant arrival rates to simulate real-world workloads. 21 | * Benchmark profiles: Presets to benchmark for different model use cases (e.g. chat, summarization, code completion...). 22 | * High-Performance: Built with Rust 🦀 for high-performance benchmarking. 23 | * JSON Output: Delivers performance results in a structured, easy-to-analyze format. 24 | 25 | ![ui.png](assets/ui.png) 26 | 27 | ## Table of contents 28 | 29 | 30 | * [Inference Benchmarker](#inference-benchmarker) 31 | * [Features](#features) 32 | * [Table of contents](#table-of-contents) 33 | * [Get started](#get-started) 34 | * [Run a benchmark](#run-a-benchmark) 35 | * [1. Start an inference server](#1-start-an-inference-server) 36 | * [2. 
Run a benchmark](#2-run-a-benchmark) 37 | * [Configure your benchmark](#configure-your-benchmark) 38 | * [Profiles](#profiles) 39 | * [Benchmark mode](#benchmark-mode) 40 | * [Dataset configuration](#dataset-configuration) 41 | * [Prompt configuration](#prompt-configuration) 42 | * [Decode options](#decode-options) 43 | * [Deploy on Kubernetes](#deploy-on-kubernetes) 44 | * [Deploy on Slurm](#deploy-on-slurm) 45 | * [Visualize the results](#visualize-the-results) 46 | * [Development](#development) 47 | * [Frequently Asked Questions](#frequently-asked-questions) 48 | 49 | 50 | ## Get started 51 | 52 | ### Install 53 | 54 | If you have [cargo](https://rustup.rs/) already installed: 55 | ```bash 56 | cargo install --git https://github.com/huggingface/inference-benchmarker/ 57 | ``` 58 | 59 | Or download the [latest released binary](https://github.com/huggingface/inference-benchmarker/releases/latest) 60 | 61 | Or you can run the Docker image. 62 | 63 | ### Run a benchmark 64 | 65 | #### 1. Start an inference server 66 | 67 | **TGI** 68 | 69 | ```bash 70 | MODEL=meta-llama/Llama-3.1-8B-Instruct 71 | HF_TOKEN= 72 | 73 | docker run --gpus all --shm-size 1g -p 8080:80 -e "HF_TOKEN=$HF_TOKEN" \ 74 | ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $MODEL 75 | ``` 76 | 77 | **vLLM** 78 | 79 | ```bash 80 | MODEL=meta-llama/Llama-3.1-8B-Instruct 81 | HF_TOKEN= 82 | docker run --runtime nvidia --gpus all \ 83 | --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ 84 | -p 8080:8000 \ 85 | --ipc=host \ 86 | vllm/vllm-openai:latest \ 87 | --model $MODEL 88 | ``` 89 | 90 | 91 | #### 2. Run a benchmark 92 | 93 | ```shell 94 | inference-benchmarker \ 95 | --tokenizer-name "meta-llama/Llama-3.1-8B-Instruct" \ 96 | --url http://localhost:8080 \ 97 | --profile chat 98 | ``` 99 | 100 | Results will be saved in JSON format in the current directory. 101 | 102 | ### Configure your benchmark 103 | 104 | #### Profiles 105 | 106 | Profiles are presets to benchmark for different model use cases. Available profiles: 107 | - `chat` 108 | Simulates a multi-turn chat scenario in which the model answers successive user prompts. 109 | The model is prompted with the whole conversation history at each turn. Prefix caching will have a significant impact 110 | on the performance of this benchmark. 111 | - `code-generation` 112 | Simulates code-completion scenarios. The model is given large code snippets and asked to complete them with a few tokens 113 | (e.g. a function name, a few code lines). 114 | - `classification` 115 | Simulates cases where the model is repeatedly fed large chunks of business data or documents and users 116 | ask simple questions about the content (summarization, classification...). 117 | Those use cases benefit a lot from prefix caching and chunked prefill. 118 | - `fixed-length` 119 | The model is sent fixed-length prompts to avoid the impact of variable-length tokenization on the benchmark. 120 | This is a technical benchmark to evaluate the raw throughput of the model. 121 | 122 | #### Benchmark mode 123 | 124 | In the default mode, the tool runs a `sweep` benchmark. It first runs a throughput test to find the maximum throughput, then 125 | sweeps QPS values up to the maximum throughput. 
126 | 127 | Available modes: 128 | 129 | - `sweep`: runs a sweep benchmark 130 | - `rate`: runs a benchmark at a fixed request rate 131 | - `throughput`: runs a benchmark at a fixed throughput (constant VUs) 132 | 133 | Example running a benchmark at fixed request rates: 134 | 135 | ```shell 136 | inference-benchmarker \ 137 | --tokenizer-name "meta-llama/Llama-3.1-8B-Instruct" \ 138 | --max-vus 800 \ 139 | --duration 120s \ 140 | --url http://localhost:8080 \ 141 | --warmup 30s \ 142 | --benchmark-kind rate \ 143 | --rates 1.0 \ 144 | --rates 5.0 \ 145 | --rates 10.0 \ 146 | --prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \ 147 | --decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" 148 | ``` 149 | 150 | #### Dataset configuration 151 | 152 | Prompts are sampled from a Hugging Face dataset file, using a [subset of ShareGPT 153 | by default](https://huggingface.co/datasets/hlarcher/share_gpt_small). You can specify a different dataset file using 154 | the 155 | `--dataset` and `--dataset-file` options. 156 | 157 | The dataset is expected to be JSON with the following format: 158 | 159 | ```json 160 | [ 161 | { 162 | "conversations": [ 163 | { 164 | "role": "user", 165 | "content": "rewrite that entire paragraph in the same style like this one: " 166 | } 167 | ] 168 | } 169 | ] 170 | ``` 171 | 172 | To benchmark with prefix caching, you can add a system prompt that will be sent with each request of a conversation. 173 | 174 | ```json 175 | [ 176 | { 177 | "conversations": [ 178 | { 179 | "role": "system", 180 | "content": "You are a helpful assistant that makes jokes at each response." 181 | }, 182 | { 183 | "role": "user", 184 | "content": "rewrite that entire paragraph in the same style like this one:" 185 | } 186 | ] 187 | } 188 | ] 189 | ``` 190 | 191 | #### Prompt configuration 192 | 193 | For consistent results you can configure the token count and variance. The tool will sample prompts with the specified 194 | values, sampling token counts from a normal distribution with the specified variance. 195 | 196 | ```shell 197 | --prompt-options "num_tokens=50,max_tokens=60,min_tokens=40,variance=10" 198 | ``` 199 | 200 | #### Decode options 201 | 202 | You can also configure the decoding options for the model. The tool will sample decoding options with the specified 203 | values, sampling token counts from a normal distribution with the specified variance. 204 | 205 | ```shell 206 | --decode-options "num_tokens=50,max_tokens=60,min_tokens=40,variance=10" 207 | ``` 208 | 209 | ## Deploy on Kubernetes 210 | 211 | You can deploy the benchmarking tool on Kubernetes using the provided Helm chart. 212 | 213 | Review the values (especially model, HF token and resources), and install the chart: 214 | ```shell 215 | $ helm install inference-benchmarker ./extra/k8s/inference-benchmarker 216 | ``` 217 | 218 | ## Deploy on Slurm 219 | 220 | A Slurm example is provided in `extra/slurm`. 221 | 222 | ## Visualize the results 223 | 224 | You can use the provided Gradio app to quickly visualize the results. 225 | Assuming the JSON result files are in `/path/to/results/dir`: 226 | ```shell 227 | $ poetry install 228 | $ cd extra/dashboard 229 | $ poetry run python app.py --from-results-dir /path/to/results/dir 230 | ``` 231 | 232 | ![dashboard.png](assets/dashboard.png) 233 | 234 | ## Development 235 | 236 | You need [Rust](https://rustup.rs/) installed to build the benchmarking tool. 
237 | 238 | ```shell 239 | $ make build 240 | ``` 241 | 242 | ## Frequently Asked Questions 243 | 244 | * **What's the difference between constant arrival rate and constant virtual user count?** 245 | * **Constant virtual user count** means that the number of virtual users is fixed. Each virtual user sends a 246 | single request at a time and waits for the server's response. It's basically simulating a fixed number of users querying the 247 | server. 248 | * **Constant arrival rate** means that the rate of requests is fixed and the number of virtual users is adjusted to 249 | maintain that rate. Requests hit the server independently of how fast responses come back. 250 | 251 | **Constant virtual user count** is a closed-loop model where the server's response time dictates the number of 252 | iterations. **Constant arrival rate** is an open-loop model more representative of real-life workloads. 253 | 254 | 255 | * **Why do I get a high error rate when running the `throughput` benchmark?** 256 | 257 | The throughput benchmark tries to saturate the server with a high request rate. The error rate is high because the server is 258 | not able to handle the request rate or is rate-limiting the requests. 259 | In the case of TGI, this is controlled by the `--max-concurrent-requests` option. 260 | 261 | 262 | * **What is the influence of CUDA graphs?** 263 | 264 | CUDA graphs are used to optimize GPU usage by minimizing the overhead of launching kernels. This can lead to 265 | better performance in some cases, but can also lead to worse performance in others. 266 | If your CUDA graphs are not evenly distributed, you may see a performance drop at some request rates as the batch size may 267 | fall into a larger CUDA graph batch size, leading to a loss of compute due to excessive padding. 268 | 269 | * **I get fewer generated tokens than expected in a benchmark.** 270 | 271 | Inference servers use the `max_tokens` parameter to limit the number of tokens generated. If the model 272 | generates an end-of-sequence token before reaching `max_tokens`, the generation will stop. 273 | There is currently no way to guarantee a fixed number of tokens generated without modifying the inference server. 274 | So you may have `generated tokens < (successful requests) * max_tokens`. 
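To make the first question above more concrete, here is a minimal Rust sketch of the two load models. It is not the benchmarker's actual executor code: the `send_request` stub, the 80 ms sleep standing in for server latency, and the example rates are illustrative assumptions, and the sketch only assumes `tokio` with its timer, macros and multi-threaded runtime features enabled.

```rust
use std::time::Duration;
use tokio::time::{interval, sleep};

// Stand-in for an HTTP call to the inference server; the 80 ms sleep is an
// arbitrary placeholder for the server's response time.
async fn send_request() {
    sleep(Duration::from_millis(80)).await;
}

// Closed loop (constant virtual user count): each virtual user only sends its
// next request after the previous response arrives, so the achieved request
// rate depends on server latency.
async fn closed_loop(num_vus: usize, requests_per_vu: usize) {
    let mut users = Vec::new();
    for _ in 0..num_vus {
        users.push(tokio::spawn(async move {
            for _ in 0..requests_per_vu {
                send_request().await; // wait for the response before the next request
            }
        }));
    }
    for user in users {
        user.await.unwrap();
    }
}

// Open loop (constant arrival rate): requests are fired on a fixed schedule,
// independently of whether earlier responses have come back.
async fn open_loop(rate_per_sec: f64, total_requests: usize) {
    let mut ticker = interval(Duration::from_secs_f64(1.0 / rate_per_sec));
    let mut in_flight = Vec::new();
    for _ in 0..total_requests {
        ticker.tick().await; // fire on schedule, even if the server is backed up
        in_flight.push(tokio::spawn(send_request()));
    }
    for request in in_flight {
        request.await.unwrap();
    }
}

#[tokio::main]
async fn main() {
    closed_loop(10, 5).await;   // 10 users, 5 requests each
    open_loop(20.0, 100).await; // 20 requests per second, 100 requests total
}
```

In the closed loop, throughput is capped by latency (slower responses mean fewer requests per second), while in the open loop the request schedule never slows down, so queues build up and errors surface whenever the server falls behind.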
275 | 276 | -------------------------------------------------------------------------------- /assets/dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/inference-benchmarker/687e477930b387d3c9c787d4953a266f6469f047/assets/dashboard.png -------------------------------------------------------------------------------- /assets/ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/inference-benchmarker/687e477930b387d3c9c787d4953a266f6469f047/assets/ui.png -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use vergen_gitcl::{Emitter, GitclBuilder}; 3 | 4 | fn main() -> Result<(), Box> { 5 | // Try to get the git sha from the local git repository 6 | match GitclBuilder::all_git() { 7 | Ok(gitcl) => match Emitter::default().fail_on_error().add_instructions(&gitcl) { 8 | Ok(emitter) => { 9 | if emitter.emit().is_err() { 10 | fallback_git_sha(); 11 | } 12 | } 13 | Err(_) => { 14 | fallback_git_sha(); 15 | } 16 | }, 17 | Err(_) => { 18 | fallback_git_sha(); 19 | } 20 | }; 21 | Ok(()) 22 | } 23 | 24 | fn fallback_git_sha() { 25 | // Unable to get the git sha 26 | if let Ok(sha) = std::env::var("GIT_SHA") { 27 | // Set it from an env var 28 | println!("cargo:rustc-env=VERGEN_GIT_SHA={sha}"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /extra/dashboard/app.py: -------------------------------------------------------------------------------- 1 | from contextlib import ExitStack 2 | from dataclasses import dataclass 3 | from typing import List 4 | 5 | import click 6 | import gradio as gr 7 | import pandas as pd 8 | 9 | from parse_results import build_results 10 | 11 | 12 | @dataclass 13 | class PlotConfig: 14 | x_title: str 15 | y_title: str 16 | title: str 17 | percentiles: List[float] = None 18 | 19 | 20 | def run(from_results_dir, datasource, port): 21 | css = ''' 22 | .summary span { 23 | font-size: 10px; 24 | padding-top:0; 25 | padding-bottom:0; 26 | } 27 | ''' 28 | 29 | summary_desc = ''' 30 | ## Summary 31 | This table shows the average of the metrics for each model and QPS rate. 32 | 33 | The metrics are: 34 | * Inter token latency: Time to generate a new output token for each user querying the system. 35 | It translates as the “speed” perceived by the end-user. We aim for at least 300 words per minute (average reading speed), so ITL<150ms 36 | * Time to First Token: Time the user has to wait before seeing the first token of its answer. 37 | Lower waiting time are essential for real-time interactions, less so for offline workloads. 38 | * End-to-end latency: The overall time the system took to generate the full response to the user. 39 | * Throughput: The number of tokens per second the system can generate across all requests 40 | * Successful requests: The number of requests the system was able to honor in the benchmark timeframe 41 | * Error rate: The percentage of requests that ended up in error, as the system could not process them in time or failed to process them. 
42 | 43 | ''' 44 | 45 | df_bench = pd.DataFrame() 46 | line_plots_bench = [] 47 | column_mappings = {'inter_token_latency_ms_p90': 'ITL P90 (ms)', 'time_to_first_token_ms_p90': 'TTFT P90 (ms)', 48 | 'e2e_latency_ms_p90': 'E2E P90 (ms)', 'token_throughput_secs': 'Throughput (tokens/s)', 49 | 'successful_requests': 'Successful requests', 'error_rate': 'Error rate (%)', 'model': 'Model', 50 | 'rate': 'QPS', 'run_id': 'Run ID'} 51 | default_df = pd.DataFrame.from_dict( 52 | {"rate": [1, 2], "inter_token_latency_ms_p90": [10, 20], 53 | "version": ["default", "default"], 54 | "model": ["default", "default"]}) 55 | 56 | def load_demo(model_bench, percentiles): 57 | return update_bench(model_bench, percentiles) 58 | 59 | def update_bench(model, percentiles): 60 | res = [] 61 | for plot in line_plots_bench: 62 | if plot['config'].percentiles: 63 | k = plot['metric'] + '_' + str(percentiles) 64 | df_bench[plot['metric']] = df_bench[k] if k in df_bench.columns else 0 65 | res.append(df_bench[(df_bench['model'] == model)]) 66 | 67 | return res + [summary_table()] 68 | 69 | def summary_table() -> pd.DataFrame: 70 | data = df_bench.groupby(['model', 'run_id', 'rate']).agg( 71 | {'inter_token_latency_ms_p90': 'mean', 'time_to_first_token_ms_p90': 'mean', 72 | 'e2e_latency_ms_p90': 'mean', 'token_throughput_secs': 'mean', 73 | 'successful_requests': 'mean', 'error_rate': 'mean'}).reset_index() 74 | data = data[ 75 | ['run_id', 'model', 'rate', 'inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 76 | 'e2e_latency_ms_p90', 77 | 'token_throughput_secs']] 78 | for metric in ['inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 'e2e_latency_ms_p90', 79 | 'token_throughput_secs']: 80 | data[metric] = data[metric].apply(lambda x: f"{x:.2f}") 81 | data = data.rename( 82 | columns=column_mappings) 83 | return data 84 | 85 | def load_bench_results(source) -> pd.DataFrame: 86 | data = pd.read_parquet(source) 87 | # remove warmup and throughput 88 | data = data[(data['id'] != 'warmup') & (data['id'] != 'throughput')] 89 | # only keep constant rate 90 | data = data[data['executor_type'] == 'ConstantArrivalRate'] 91 | return data 92 | 93 | def select_region(selection: gr.SelectData, model): 94 | min_w, max_w = selection.index 95 | data = df_bench[(df_bench['model'] == model) & (df_bench['rate'] >= min_w) & ( 96 | df_bench['rate'] <= max_w)] 97 | res = [] 98 | for plot in line_plots_bench: 99 | # find the y values for the selected region 100 | metric = plot["metric"] 101 | y_min = data[metric].min() 102 | y_max = data[metric].max() 103 | res.append(gr.LinePlot(x_lim=[min_w, max_w], y_lim=[y_min, y_max])) 104 | return res 105 | 106 | def reset_region(): 107 | res = [] 108 | for _ in line_plots_bench: 109 | res.append(gr.LinePlot(x_lim=None, y_lim=None)) 110 | return res 111 | 112 | def load_datasource(datasource, fn): 113 | if datasource.startswith('file://'): 114 | return fn(datasource) 115 | elif datasource.startswith('s3://'): 116 | return fn(datasource) 117 | else: 118 | raise ValueError(f"Unknown datasource: {datasource}") 119 | 120 | if from_results_dir is not None: 121 | build_results(from_results_dir, 'benchmarks.parquet', None) 122 | # Load data 123 | df_bench = load_datasource(datasource, load_bench_results) 124 | 125 | # Define metrics 126 | metrics = { 127 | "inter_token_latency_ms": PlotConfig(title="Inter Token Latency (lower is better)", x_title="QPS", 128 | y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]), 129 | "time_to_first_token_ms": PlotConfig(title="TTFT (lower 
is better)", x_title="QPS", 130 | y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]), 131 | "e2e_latency_ms": PlotConfig(title="End to End Latency (lower is better)", x_title="QPS", 132 | y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]), 133 | "token_throughput_secs": PlotConfig(title="Request Output Throughput (higher is better)", x_title="QPS", 134 | y_title="Tokens/s"), 135 | "successful_requests": PlotConfig(title="Successful requests (higher is better)", x_title="QPS", 136 | y_title="Count"), 137 | "error_rate": PlotConfig(title="Error rate", x_title="QPS", y_title="%"), 138 | "prompt_tokens": PlotConfig(title="Prompt tokens", x_title="QPS", y_title="Count"), 139 | "decoded_tokens": PlotConfig(title="Decoded tokens", x_title="QPS", y_title="Count") 140 | } 141 | 142 | models = df_bench["model"].unique() 143 | run_ids = df_bench["run_id"].unique() 144 | 145 | # get all available percentiles 146 | percentiles = set() 147 | for k, v in metrics.items(): 148 | if v.percentiles: 149 | percentiles.update(v.percentiles) 150 | percentiles = map(lambda p: f'p{int(float(p) * 100)}', percentiles) 151 | percentiles = sorted(list(percentiles)) 152 | percentiles.append('avg') 153 | with gr.Blocks(css=css, title="Inference Benchmarker") as demo: 154 | with gr.Row(): 155 | gr.Markdown("# Inference-benchmarker 🤗\n## Benchmarks results") 156 | with gr.Row(): 157 | gr.Markdown(summary_desc) 158 | with gr.Row(): 159 | table = gr.DataFrame( 160 | pd.DataFrame(), 161 | elem_classes=["summary"], 162 | ) 163 | with gr.Row(): 164 | details_desc = gr.Markdown("## Details") 165 | with gr.Row(): 166 | model = gr.Dropdown(list(models), label="Select model", value=models[0]) 167 | with gr.Row(): 168 | percentiles_bench = gr.Radio(percentiles, label="", value="avg") 169 | i = 0 170 | with ExitStack() as stack: 171 | for k, v in metrics.items(): 172 | if i % 2 == 0: 173 | stack.close() 174 | gs = stack.enter_context(gr.Row()) 175 | line_plots_bench.append( 176 | {"component": gr.LinePlot(default_df, label=f'{v.title}', x="rate", y=k, 177 | y_title=v.y_title, x_title=v.x_title, 178 | color="run_id" 179 | ), 180 | "model": model.value, 181 | "metric": k, 182 | "config": v 183 | }, 184 | ) 185 | i += 1 186 | 187 | for component in [model, percentiles_bench]: 188 | component.change(update_bench, [model, percentiles_bench], 189 | [item["component"] for item in line_plots_bench] + [table]) 190 | gr.on([plot["component"].select for plot in line_plots_bench], select_region, [model], 191 | outputs=[item["component"] for item in line_plots_bench]) 192 | gr.on([plot["component"].double_click for plot in line_plots_bench], reset_region, None, 193 | outputs=[item["component"] for item in line_plots_bench]) 194 | demo.load(load_demo, [model, percentiles_bench], 195 | [item["component"] for item in line_plots_bench] + [table]) 196 | 197 | demo.launch(server_port=port, server_name="0.0.0.0") 198 | 199 | 200 | @click.command() 201 | @click.option('--from-results-dir', default=None, help='Load inference-benchmarker results from a directory') 202 | @click.option('--datasource', default='file://benchmarks.parquet', help='Load a Parquet file already generated') 203 | @click.option('--port', default=7860, help='Port to run the dashboard') 204 | def main(from_results_dir, datasource, port): 205 | run(from_results_dir, datasource, port) 206 | 207 | 208 | if __name__ == '__main__': 209 | main(auto_envvar_prefix='DASHBOARD') 210 | 
-------------------------------------------------------------------------------- /extra/dashboard/parse_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import pandas as pd 6 | 7 | 8 | def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame: 9 | df = pd.DataFrame() 10 | # Load the results 11 | for key, filename in data_files.items(): 12 | with open(filename, 'r') as f: 13 | data = json.load(f) 14 | if data['config']['meta'] is None: 15 | data['config']['meta'] = {} 16 | for result in data['results']: 17 | entry = pd.json_normalize(result).to_dict(orient='records')[0] 18 | if 'engine' in data['config']['meta']: 19 | entry['engine'] = data['config']['meta']['engine'] 20 | if 'tp' in data['config']['meta']: 21 | entry['tp'] = data['config']['meta']['tp'] 22 | if 'version' in data['config']['meta']: 23 | entry['version'] = data['config']['meta']['version'] 24 | if 'device' in data['config']['meta']: 25 | entry['device'] = data['config']['meta']['device'] 26 | entry['model'] = data['config']['model_name'] 27 | entry['run_id'] = data['config']['run_id'] 28 | df_tmp = pd.DataFrame(entry, index=[0]) 29 | # rename columns that start with 'config.' 30 | df_tmp = df_tmp.rename(columns={c: c.split('config.')[-1] for c in df_tmp.columns}) 31 | # replace . with _ in column names 32 | df_tmp.columns = [c.replace('.', '_') for c in df_tmp.columns] 33 | 34 | df = pd.concat([df, df_tmp]) 35 | return df 36 | 37 | 38 | def build_results_df(results_dir) -> pd.DataFrame: 39 | df = pd.DataFrame() 40 | # list directories 41 | directories = [f'{results_dir}/{d}' for d in os.listdir(results_dir) if os.path.isdir(f'{results_dir}/{d}')] + [results_dir] 42 | for directory in directories: 43 | # list json files in results directory 44 | data_files = {} 45 | for filename in os.listdir(directory): 46 | if filename.endswith('.json'): 47 | data_files[filename.split('.')[-2]] = f'{directory}/{filename}' 48 | df = pd.concat([df, build_df(directory.split('/')[-1], data_files)]) 49 | return df 50 | 51 | 52 | def build_results(results_dir, results_file, device): 53 | df = build_results_df(results_dir) 54 | if 'device' not in df.columns: 55 | df['device'] = df['model'].apply(lambda x: device) 56 | df['error_rate'] = df['failed_requests'] / (df['failed_requests'] + df['successful_requests']) * 100.0 57 | df['prompt_tokens'] = df['total_tokens_sent'] / df['successful_requests'] 58 | df['decoded_tokens'] = df['total_tokens'] / df['successful_requests'] 59 | df.to_parquet(results_file) 60 | 61 | 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--results-dir', default='results', type=str, required=True, 65 | help='Path to the source directory containing the results') 66 | parser.add_argument('--results-file', type=str, required=True, 67 | help='Path to the results file to write to. Can be a S3 path') 68 | parser.add_argument('--device', type=str, required=True, help='GPU name used for benchmarking') 69 | args = parser.parse_args() 70 | build_results(args.results_dir, args.results_file, args.device) 71 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). 
Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: inference-benchmarker 3 | description: A Helm chart to run inference-benchmarker 4 | type: application 5 | version: 0.1.0 6 | appVersion: "2.3.0" 7 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "inference-benchmarker.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "inference-benchmarker.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "inference-benchmarker.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "inference-benchmarker.labels" -}} 37 | helm.sh/chart: {{ include "inference-benchmarker.chart" . }} 38 | {{ include "inference-benchmarker.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "inference-benchmarker.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "inference-benchmarker.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "inference-benchmarker.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "inference-benchmarker.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/benchmark.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: {{ include "inference-benchmarker.fullname" . 
}}-benchmark 5 | labels: 6 | app.kubernetes.io/component: benchmark 7 | {{- include "inference-benchmarker.labels" . | nindent 4 }} 8 | spec: 9 | template: 10 | metadata: 11 | {{- with .Values.benchmark.podAnnotations }} 12 | annotations: 13 | {{- toYaml . | nindent 8 }} 14 | {{- end }} 15 | labels: 16 | app.kubernetes.io/component: benchmark 17 | {{- include "inference-benchmarker.labels" . | nindent 8 }} 18 | {{- with .Values.benchmark.podLabels }} 19 | {{- toYaml . | nindent 8 }} 20 | {{- end }} 21 | spec: 22 | {{- with .Values.imagePullSecrets }} 23 | imagePullSecrets: 24 | {{- toYaml . | nindent 8 }} 25 | {{- end }} 26 | securityContext: 27 | {{- toYaml .Values.benchmark.podSecurityContext | nindent 8 }} 28 | restartPolicy: Never 29 | initContainers: 30 | - name: wait-for-text-generation-inference 31 | image: alpine/curl:latest 32 | command: 33 | - sh 34 | - -c 35 | - | 36 | until curl -s http://{{ include "inference-benchmarker.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local:8080/health; do 37 | echo "Waiting for {{ include "inference-benchmarker.fullname" . }} service..." 38 | sleep 1 39 | done 40 | exit 0 41 | containers: 42 | - name: {{ .Chart.Name }} 43 | securityContext: 44 | {{- toYaml .Values.benchmark.securityContext | nindent 12 }} 45 | image: "{{ .Values.benchmark.image.repository }}:{{ .Values.benchmark.image.tag | default "latest" }}" 46 | imagePullPolicy: {{ .Values.benchmark.image.pullPolicy }} 47 | args: 48 | - "inference-benchmarker" 49 | - "--url" 50 | - "http://{{ include "inference-benchmarker.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local:8080" 51 | - "--tokenizer-name" 52 | - "{{ .Values.model_id }}" 53 | - "--no-console" 54 | {{- toYaml .Values.benchmark.extra_args | nindent 12 }} 55 | env: 56 | - name: HF_TOKEN 57 | valueFrom: 58 | secretKeyRef: 59 | name: {{ include "inference-benchmarker.fullname" . }}-hf-token 60 | key: HF_TOKEN 61 | - name: RUST_LOG 62 | value: "inference_benchmarker=info" 63 | resources: 64 | {{- toYaml .Values.benchmark.resources | nindent 12 }} 65 | volumeMounts: 66 | - name: results 67 | mountPath: /opt/inference-benchmarker/results 68 | - name: nginx 69 | image: nginx 70 | ports: 71 | - containerPort: 80 72 | name: http 73 | protocol: TCP 74 | volumeMounts: 75 | - name: results 76 | mountPath: /usr/share/nginx/html 77 | - name: default 78 | mountPath: /etc/nginx/conf.d/default.conf 79 | subPath: default 80 | terminationGracePeriodSeconds: 5 81 | volumes: 82 | - name: results 83 | emptyDir: { } 84 | - name: default 85 | configMap: 86 | name: {{ include "inference-benchmarker.fullname" . }}-nginx-config 87 | {{- with .Values.benchmark.nodeSelector }} 88 | nodeSelector: 89 | {{- toYaml . | nindent 8 }} 90 | {{- end }} 91 | {{- with .Values.benchmark.affinity }} 92 | affinity: 93 | {{- toYaml . | nindent 8 }} 94 | {{- end }} 95 | {{- with .Values.benchmark.tolerations }} 96 | tolerations: 97 | {{- toYaml . | nindent 8 }} 98 | {{- end }} 99 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/checks.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.tgi.enabled) (.Values.vllm.enabled) }} 2 | {{- fail "You cannot enable multiple inference engines at the same time. 
Please check you values" }} 3 | {{- end }} -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/configmap.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "inference-benchmarker.fullname" . }}-nginx-config 5 | data: 6 | default: | 7 | server { 8 | listen 80; 9 | listen [::]:80; 10 | server_name localhost; 11 | 12 | location / { 13 | root /usr/share/nginx/html; 14 | index index.html index.htm; 15 | autoindex on; 16 | } 17 | 18 | error_page 500 502 503 504 /50x.html; 19 | location = /50x.html { 20 | root /usr/share/nginx/html; 21 | } 22 | } -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: {{ include "inference-benchmarker.fullname" . }}-hf-token 5 | type: Opaque 6 | stringData: 7 | HF_TOKEN: "{{ .Values.hf_token }}" -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "inference-benchmarker.fullname" . }} 5 | labels: 6 | app.kubernetes.io/component: text-generation-inference 7 | {{- include "inference-benchmarker.labels" . | nindent 4 }} 8 | spec: 9 | type: ClusterIP 10 | ports: 11 | - port: 8080 12 | targetPort: http 13 | protocol: TCP 14 | name: http 15 | selector: 16 | app.kubernetes.io/component: text-generation-inference 17 | {{- include "inference-benchmarker.selectorLabels" . | nindent 4 }} 18 | --- 19 | apiVersion: v1 20 | kind: Service 21 | metadata: 22 | name: {{ include "inference-benchmarker.fullname" . }}-benchmark 23 | labels: 24 | app.kubernetes.io/component: benchmark 25 | {{- include "inference-benchmarker.labels" . | nindent 4 }} 26 | spec: 27 | type: ClusterIP 28 | ports: 29 | - port: 8080 30 | targetPort: http 31 | protocol: TCP 32 | name: http 33 | publishNotReadyAddresses: true 34 | selector: 35 | app.kubernetes.io/component: benchmark 36 | {{- include "inference-benchmarker.selectorLabels" . | nindent 4 }} 37 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/tgi.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.tgi.enabled }} 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: {{ include "inference-benchmarker.fullname" . }} 6 | labels: 7 | app.kubernetes.io/component: text-generation-inference 8 | {{- include "inference-benchmarker.labels" . | nindent 4 }} 9 | spec: 10 | replicas: {{ .Values.tgi.replicaCount }} 11 | selector: 12 | matchLabels: 13 | app.kubernetes.io/component: text-generation-inference 14 | {{- include "inference-benchmarker.selectorLabels" . | nindent 6 }} 15 | template: 16 | metadata: 17 | {{- with .Values.tgi.podAnnotations }} 18 | annotations: 19 | {{- toYaml . | nindent 8 }} 20 | {{- end }} 21 | labels: 22 | app.kubernetes.io/component: text-generation-inference 23 | {{- include "inference-benchmarker.labels" . | nindent 8 }} 24 | {{- with .Values.tgi.podLabels }} 25 | {{- toYaml . 
| nindent 8 }} 26 | {{- end }} 27 | spec: 28 | {{- with .Values.imagePullSecrets }} 29 | imagePullSecrets: 30 | {{- toYaml . | nindent 8 }} 31 | {{- end }} 32 | securityContext: 33 | {{- toYaml .Values.tgi.podSecurityContext | nindent 8 }} 34 | containers: 35 | - name: text-generation-inference 36 | securityContext: 37 | {{- toYaml .Values.tgi.securityContext | nindent 12 }} 38 | image: "{{ .Values.tgi.image.repository }}:{{ .Values.tgi.image.tag | default "latest" }}" 39 | imagePullPolicy: {{ .Values.tgi.image.pullPolicy }} 40 | env: 41 | - name: HF_TOKEN 42 | valueFrom: 43 | secretKeyRef: 44 | name: {{ include "inference-benchmarker.fullname" . }}-hf-token 45 | key: HF_TOKEN 46 | - name: PORT 47 | value: "8080" 48 | args: 49 | - "--model-id" 50 | - "{{ .Values.model_id }}" 51 | {{- with .Values.tgi.extra_args }} 52 | {{- toYaml . | nindent 12 }} 53 | {{- end }} 54 | ports: 55 | - name: http 56 | containerPort: 8080 57 | protocol: TCP 58 | readinessProbe: 59 | httpGet: 60 | path: /health 61 | port: 8080 62 | initialDelaySeconds: 10 63 | periodSeconds: 3 64 | resources: 65 | {{- toYaml .Values.tgi.resources | nindent 12 }} 66 | volumeMounts: 67 | - name: shm 68 | mountPath: /dev/shm 69 | terminationGracePeriodSeconds: 10 70 | volumes: 71 | - name: shm 72 | emptyDir: 73 | medium: Memory 74 | sizeLimit: 1Gi 75 | {{- with .Values.tgi.nodeSelector }} 76 | nodeSelector: 77 | {{- toYaml . | nindent 8 }} 78 | {{- end }} 79 | {{- with .Values.tgi.affinity }} 80 | affinity: 81 | {{- toYaml . | nindent 8 }} 82 | {{- end }} 83 | {{- with .Values.tgi.tolerations }} 84 | tolerations: 85 | {{- toYaml . | nindent 8 }} 86 | {{- end }} 87 | {{- end }} -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/vllm.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.vllm.enabled }} 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: {{ include "inference-benchmarker.fullname" . }} 6 | labels: 7 | app.kubernetes.io/component: text-generation-inference 8 | {{- include "inference-benchmarker.labels" . | nindent 4 }} 9 | spec: 10 | replicas: {{ .Values.vllm.replicaCount }} 11 | selector: 12 | matchLabels: 13 | app.kubernetes.io/component: text-generation-inference 14 | {{- include "inference-benchmarker.selectorLabels" . | nindent 6 }} 15 | template: 16 | metadata: 17 | {{- with .Values.vllm.podAnnotations }} 18 | annotations: 19 | {{- toYaml . | nindent 8 }} 20 | {{- end }} 21 | labels: 22 | app.kubernetes.io/component: text-generation-inference 23 | {{- include "inference-benchmarker.labels" . | nindent 8 }} 24 | {{- with .Values.vllm.podLabels }} 25 | {{- toYaml . | nindent 8 }} 26 | {{- end }} 27 | spec: 28 | {{- with .Values.imagePullSecrets }} 29 | imagePullSecrets: 30 | {{- toYaml . | nindent 8 }} 31 | {{- end }} 32 | securityContext: 33 | {{- toYaml .Values.vllm.podSecurityContext | nindent 8 }} 34 | containers: 35 | - name: vllm 36 | securityContext: 37 | {{- toYaml .Values.vllm.securityContext | nindent 12 }} 38 | image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default "latest" }}" 39 | imagePullPolicy: {{ .Values.vllm.image.pullPolicy }} 40 | env: 41 | - name: HUGGING_FACE_HUB_TOKEN 42 | valueFrom: 43 | secretKeyRef: 44 | name: {{ include "inference-benchmarker.fullname" . 
}}-hf-token 45 | key: HF_TOKEN 46 | args: 47 | - "--model" 48 | - "{{ .Values.model_id }}" 49 | - "--port" 50 | - "8080" 51 | {{- with .Values.vllm.extra_args }} 52 | {{- toYaml . | nindent 12 }} 53 | {{- end }} 54 | ports: 55 | - name: http 56 | containerPort: 8080 57 | protocol: TCP 58 | readinessProbe: 59 | httpGet: 60 | path: /health 61 | port: 8080 62 | initialDelaySeconds: 10 63 | periodSeconds: 3 64 | resources: 65 | {{- toYaml .Values.vllm.resources | nindent 12 }} 66 | volumeMounts: 67 | - name: shm 68 | mountPath: /dev/shm 69 | terminationGracePeriodSeconds: 10 70 | volumes: 71 | - name: shm 72 | emptyDir: 73 | medium: Memory 74 | sizeLimit: 1Gi 75 | {{- with .Values.vllm.nodeSelector }} 76 | nodeSelector: 77 | {{- toYaml . | nindent 8 }} 78 | {{- end }} 79 | {{- with .Values.vllm.affinity }} 80 | affinity: 81 | {{- toYaml . | nindent 8 }} 82 | {{- end }} 83 | {{- with .Values.vllm.tolerations }} 84 | tolerations: 85 | {{- toYaml . | nindent 8 }} 86 | {{- end }} 87 | {{- end }} -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/values.yaml: -------------------------------------------------------------------------------- 1 | imagePullSecrets: [ ] 2 | nameOverride: "" 3 | fullnameOverride: "" 4 | 5 | hf_token: "" 6 | model_id: "meta-llama/Llama-3.1-8B-Instruct" 7 | server: tgi 8 | 9 | tgi: 10 | enabled: true 11 | extra_args: 12 | - "--max-concurrent-requests" 13 | - "512" 14 | image: 15 | repository: ghcr.io/huggingface/text-generation-inference 16 | pullPolicy: IfNotPresent 17 | tag: "latest" 18 | replicaCount: 1 19 | resources: 20 | limits: 21 | "nvidia.com/gpu": "1" 22 | podAnnotations: { } 23 | podLabels: { } 24 | podSecurityContext: { } 25 | securityContext: { } 26 | nodeSelector: { } 27 | tolerations: [ ] 28 | affinity: { } 29 | 30 | vllm: 31 | enabled: false 32 | extra_args: 33 | image: 34 | repository: vllm/vllm-openai 35 | pullPolicy: IfNotPresent 36 | tag: "latest" 37 | replicaCount: 1 38 | resources: 39 | limits: 40 | "nvidia.com/gpu": "1" 41 | podAnnotations: { } 42 | podLabels: { } 43 | podSecurityContext: { } 44 | securityContext: { } 45 | nodeSelector: { } 46 | tolerations: [ ] 47 | affinity: { } 48 | 49 | benchmark: 50 | extra_args: 51 | - "--profile" 52 | - "chat" 53 | - "800" 54 | image: 55 | repository: ghcr.io/huggingface/inference-benchmarker 56 | pullPolicy: IfNotPresent 57 | tag: "latest" 58 | podAnnotations: { } 59 | podLabels: { } 60 | podSecurityContext: { } 61 | securityContext: { } 62 | resources: { } 63 | nodeSelector: { } 64 | tolerations: [ ] 65 | affinity: { } 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /extra/slurm/benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | CPUS_PER_GPU = 20 5 | MEM_PER_CPU_GB = 11 6 | 7 | 8 | def main(): 9 | models = [ 10 | ('meta-llama/Llama-3.1-8B-Instruct', 1), 11 | ('meta-llama/Llama-3.1-70B-Instruct', 4), 12 | ('meta-llama/Llama-3.1-70B-Instruct', 2), 13 | ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2), 14 | ] 15 | num_passes = 1 16 | engines = ['tgi', 'vllm'] 17 | for i in range(num_passes): 18 | for model in models: 19 | print(f"PASS {i} - Submitting job for {model[0]}") 20 | gpus = model[1] 21 | cpus_per_task = gpus * CPUS_PER_GPU 22 | for engine in engines: 23 | job_name = f'bench_{model[0].replace("/", "_")}_{engine}_pass_{i}' 24 | args = ['sbatch', 25 | '--job-name', job_name, 26 | '--output', 
f'/fsx/%u/logs/%x-%j.log', 27 | '--time', '1:50:00', 28 | '--qos', 'normal', 29 | '--partition', 'hopper-prod', 30 | '--gpus', str(gpus), 31 | '--ntasks', '1', 32 | '--cpus-per-task', str(cpus_per_task), 33 | '--mem-per-cpu', str(MEM_PER_CPU_GB) + 'G', 34 | '--nodes', '1', 35 | ':', 36 | '--gpus', '1', 37 | '--ntasks', '1', 38 | '--cpus-per-task', str(CPUS_PER_GPU), 39 | '--mem-per-cpu', str(MEM_PER_CPU_GB) + 'G', 40 | '--nodes', '1', 41 | f'{engine}.slurm'] 42 | env = os.environ.copy() 43 | env['MODEL'] = model[0] 44 | env['TP'] = str(gpus) 45 | process = subprocess.run(args, capture_output=True, 46 | env=env) 47 | print(process.stdout.decode()) 48 | print(process.stderr.decode()) 49 | if process.returncode != 0: 50 | print(f'Error while submitting :: {args}') 51 | exit(1) 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | -------------------------------------------------------------------------------- /extra/slurm/tgi.slurm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --job-name tgi-benchmark 3 | #SBATCH --output /fsx/%u/logs/%x-%j.log 4 | #SBATCH --time 1:50:00 5 | #SBATCH --qos normal 6 | #SBATCH --partition hopper-prod 7 | #SBATCH --gpus 1 --ntasks 1 --cpus-per-task 11 --mem-per-cpu 20G --nodes=1 8 | #SBATCH hetjob 9 | #SBATCH --gpus 1 --ntasks 1 --cpus-per-task 11 --mem-per-cpu 20G --nodes=1 10 | 11 | if [ -z "$MODEL" ]; then 12 | echo "MODEL environment variable is not set" 13 | exit 1 14 | fi 15 | 16 | if [ -z "$TP" ]; then 17 | echo "TP environment variable is not set" 18 | exit 1 19 | fi 20 | 21 | echo "Starting TGI benchmark for $MODEL" 22 | export RUST_BACKTRACE=full 23 | export RUST_LOG=inference_benchmarker=info 24 | 25 | # set a random available port to avoid conflicts 26 | PORT=$(shuf -i 8000-9999 -n 1) 27 | export PORT 28 | export PREFILL_CHUNKING=1 29 | 30 | echo "Model will run on ${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" 31 | echo "Benchmark will run on ${SLURM_JOB_NODELIST_HET_GROUP_1}" 32 | 33 | # start TGI 34 | srun --het-group=0 \ 35 | -u \ 36 | -n 1 \ 37 | --container-image='ghcr.io#huggingface/text-generation-inference' \ 38 | --container-env=PORT \ 39 | --container-mounts="/scratch:/data" \ 40 | --container-workdir='/usr/src' \ 41 | --no-container-mount-home \ 42 | /usr/local/bin/text-generation-launcher \ 43 | --model-id $MODEL \ 44 | --max-concurrent-requests 1024& 45 | 46 | # wait until /health is available, die after 5 minutes 47 | timeout 600 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for TGI to start...\"; done" || exit 1 48 | exit_code=$? 
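# `timeout` exits with status 124 when the /health wait above hit its 600 s budget,
# so the guard on $exit_code below only starts the benchmark against a server that
# actually came up.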
49 | 50 | RESULTS_DIR="/fsx/$USER/benchmarks_results/tgi" 51 | mkdir -p "${RESULTS_DIR}" 52 | 53 | if [[ $exit_code != 124 ]]; then 54 | # run benchmark 55 | echo "Starting benchmark" 56 | VERSION=$(curl -s http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}/info | jq -r '.sha') 57 | srun --het-group=1 \ 58 | -u \ 59 | -n 1 \ 60 | --container-image="ghcr.io#huggingface/inference-benchmarker:latest" \ 61 | --container-mounts="${RESULTS_DIR}:/opt/inference-benchmarker/results" \ 62 | --no-container-mount-home \ 63 | inference-benchmarker \ 64 | --tokenizer-name "$MODEL" \ 65 | --max-vus 128 \ 66 | --url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \ 67 | --duration 120s \ 68 | --warmup 30s \ 69 | --benchmark-kind rate \ 70 | --rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \ 71 | --extra-meta "version=$VERSION,engine=TGI,tp=$TP" \ 72 | --decode-options "num_tokens=800,max_tokens=800,min_tokens=50,variance=100" \ 73 | --dataset-file share_gpt_turns.json \ 74 | --no-console 75 | fi 76 | 77 | # stop TGI 78 | scancel --signal=TERM "$SLURM_JOB_ID+0" 79 | 80 | echo "End of benchmark" -------------------------------------------------------------------------------- /extra/slurm/vllm.slurm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --job-name tgi-benchmark 3 | #SBATCH --output /fsx/%u/logs/%x-%j.log 4 | #SBATCH --time 1:50:00 5 | #SBATCH --qos normal 6 | #SBATCH --partition hopper-prod 7 | #SBATCH --gpus 1 --ntasks 1 --cpus-per-task 11 --mem-per-cpu 20G --nodes=1 8 | #SBATCH hetjob 9 | #SBATCH --gpus 1 --ntasks 1 --cpus-per-task 11 --mem-per-cpu 20G --nodes=1 10 | 11 | 12 | if [ -z "$MODEL" ]; then 13 | echo "MODEL environment variable is not set" 14 | exit 1 15 | fi 16 | 17 | if [ -z "$TP" ]; then 18 | echo "TP environment variable is not set" 19 | exit 1 20 | fi 21 | 22 | echo "Starting vLLM benchmark for $MODEL" 23 | export RUST_BACKTRACE=full 24 | export RUST_LOG=inference_benchmarker=info 25 | # set a random available port to avoid conflicts 26 | PORT=$(shuf -i 8000-9999 -n 1) 27 | export PORT 28 | 29 | echo "Model will run on ${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" 30 | echo "Benchmark will run on ${SLURM_JOB_NODELIST_HET_GROUP_1}" 31 | 32 | # start vLLM 33 | srun --het-group=0 \ 34 | -u \ 35 | -n 1 \ 36 | --container-image='vllm/vllm-openai:latest' \ 37 | --container-env=PORT \ 38 | --container-mounts="/scratch:/root/.cache/huggingface" \ 39 | --container-workdir='/usr/src' \ 40 | --no-container-mount-home \ 41 | python3 -m vllm.entrypoints.openai.api_server \ 42 | --model "${MODEL}" \ 43 | --port "${PORT}" \ 44 | --enable-chunked-prefill \ 45 | --max-num-batched-tokens 512 \ 46 | --tensor-parallel-size "${SLURM_GPUS_ON_NODE}"& 47 | 48 | # wait until /health is available, die after 5 minutes 49 | timeout 600 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for vLLM to start...\"; done" || exit 1 50 | exit_code=$? 
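# As in tgi.slurm, exit status 124 means the /health wait above timed out, so the
# benchmark below only runs against a live server. Results land in ${RESULTS_DIR}
# on the host and are bind-mounted into the benchmark container at
# /opt/inference-benchmarker/results (see --container-mounts below). They can later
# be aggregated with extra/dashboard/parse_results.py, e.g. (output path and device
# name here are only examples):
#   python extra/dashboard/parse_results.py --results-dir "${RESULTS_DIR}" \
#     --results-file /fsx/$USER/benchmarks.parquet --device H100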
51 | 52 | RESULTS_DIR="/fsx/$USER/benchmarks_results/vllm" 53 | mkdir -p "${RESULTS_DIR}" 54 | 55 | if [[ $exit_code != 124 ]]; then 56 | # run benchmark 57 | echo "Starting benchmark" 58 | VERSION=$(curl -s http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}/version | jq -r '.version') 59 | srun --het-group=1 \ 60 | -u \ 61 | -n 1 \ 62 | --container-image="ghcr.io#huggingface/inference-benchmarker:latest" \ 63 | --container-mounts="${RESULTS_DIR}:/opt/inference-benchmarker/results" \ 64 | --no-container-mount-home \ 65 | inference-benchmarker \ 66 | --tokenizer-name "$MODEL" \ 67 | --max-vus 128 \ 68 | --url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \ 69 | --duration 120s \ 70 | --warmup 30s \ 71 | --benchmark-kind rate \ 72 | --rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \ 73 | --extra-meta "version=$VERSION,engine=vLLM,tp=$TP,max_num_batched_tokens=512" \ 74 | --decode-options "num_tokens=800,max_tokens=800,min_tokens=50,variance=100" \ 75 | --dataset-file share_gpt_turns.json \ 76 | --no-console 77 | fi 78 | 79 | # stop TGI 80 | scancel --signal=TERM "$SLURM_JOB_ID+0" 81 | 82 | echo "End of benchmark" -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "inference-benchmarker" 3 | version = "1.0.0" 4 | description = "" 5 | authors = ["Hugo Larcher "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | matplotlib = "^3.10.1" 11 | scienceplots = "^2.1.1" 12 | pandas = "^2.2.2" 13 | opensearch-py = "^2.7.1" 14 | boto3 = "^1.35.21" 15 | gradio = "^5.23.1" 16 | pyarrow = "^19.0.1" 17 | pygithub = "^2.4.0" 18 | click = "^8.1.7" 19 | 20 | 21 | [build-system] 22 | requires = ["poetry-core"] 23 | build-backend = "poetry.core.masonry.api" 24 | -------------------------------------------------------------------------------- /src/app.rs: -------------------------------------------------------------------------------- 1 | use crate::benchmark::Event as BenchmarkEvent; 2 | use crate::event::{terminal_event_task, AppEvent}; 3 | use crate::flux::{Action, AppState, Dispatcher, Store}; 4 | use crate::scheduler::ExecutorType; 5 | use crate::BenchmarkConfig; 6 | use crossterm::event::{KeyCode, KeyEvent, KeyModifiers}; 7 | use ratatui::layout::{Constraint, Direction, Layout}; 8 | use ratatui::text::Span; 9 | use ratatui::widgets::ListDirection::BottomToTop; 10 | use ratatui::widgets::{Cell, Dataset, List, ListItem, Row, Table}; 11 | use ratatui::{ 12 | buffer::Buffer, 13 | layout::{Alignment, Rect}, 14 | style::Stylize as OtherStylize, 15 | symbols, 16 | symbols::border, 17 | text::{Line, Text}, 18 | widgets::{Block, Paragraph, Widget}, 19 | DefaultTerminal, Frame, 20 | }; 21 | use std::collections::HashMap; 22 | use std::io; 23 | use std::sync::{Arc, Mutex}; 24 | use tokio::sync::broadcast::Sender; 25 | use tokio::sync::mpsc::{Receiver, UnboundedReceiver}; 26 | use tokio::sync::{broadcast, mpsc}; 27 | 28 | pub struct App { 29 | exit: bool, 30 | store: Arc>, 31 | dispatcher: Arc>, 32 | receiver: Receiver, 33 | benchmark_config: BenchmarkConfig, 34 | stop_sender: broadcast::Sender<()>, 35 | } 36 | 37 | pub async fn run_console( 38 | benchmark_config: BenchmarkConfig, 39 | mut receiver: UnboundedReceiver, 40 | stop_sender: broadcast::Sender<()>, 41 | ) { 42 | let 
(app_tx, app_rx) = mpsc::channel(8); 43 | // Create event task 44 | let stop_receiver_signal = stop_sender.subscribe(); 45 | tokio::spawn(terminal_event_task(250, app_tx, stop_receiver_signal)); 46 | 47 | let mut app = App::new(benchmark_config, app_rx, stop_sender.clone()); 48 | app.dispatcher 49 | .lock() 50 | .expect("lock") 51 | .dispatch(Action::LogMessage(LogMessageUI { 52 | message: "Starting benchmark".to_string(), 53 | level: LogLevel::Info, 54 | timestamp: chrono::Utc::now(), 55 | })); 56 | let dispatcher = app.dispatcher.clone(); 57 | let mut stop_receiver_signal = stop_sender.subscribe(); 58 | let event_thread = tokio::spawn(async move { 59 | tokio::select! { 60 | _=async{ 61 | while let Some(event) = receiver.recv().await { 62 | match event { 63 | BenchmarkEvent::BenchmarkStart(event) => { 64 | dispatcher.lock().expect("lock").dispatch(Action::AddBenchmark(BenchmarkUI { 65 | id: event.id, 66 | status: BenchmarkStatus::Running, 67 | progress: 0.0, 68 | throughput: "0".to_string(), 69 | successful_requests: 0, 70 | failed_requests: 0, 71 | })); 72 | } 73 | BenchmarkEvent::BenchmarkProgress(event) => { 74 | let (successful_requests,failed_requests) = (event.successful_requests,event.failed_requests); 75 | dispatcher.lock().expect("lock").dispatch(Action::AddBenchmark(BenchmarkUI { 76 | id: event.id, 77 | status: BenchmarkStatus::Running, 78 | progress: event.progress, 79 | throughput: event.request_throughput.map_or("0".to_string(), |e| format!("{e:.2}")), 80 | successful_requests, 81 | failed_requests, 82 | })); 83 | } 84 | BenchmarkEvent::BenchmarkEnd(event) => { 85 | dispatcher.lock().expect("lock").dispatch(Action::LogMessage(LogMessageUI { 86 | message: format!("Benchmark {} ended", event.id), 87 | level: LogLevel::Info, 88 | timestamp: chrono::Utc::now(), 89 | })); 90 | if let Some(results) = event.results { 91 | let (successful_requests,failed_requests) = (results.successful_requests() as u64,results.failed_requests() as u64); 92 | dispatcher.lock().expect("lock").dispatch(Action::AddBenchmark(BenchmarkUI { 93 | id: event.id, 94 | status: BenchmarkStatus::Completed, 95 | progress: 100.0, 96 | throughput: event.request_throughput.map_or("0".to_string(), |e| format!("{e:.2}")), 97 | successful_requests, 98 | failed_requests, 99 | })); 100 | dispatcher.lock().expect("lock").dispatch(Action::AddBenchmarkResults(results)); 101 | } 102 | } 103 | BenchmarkEvent::Message(event) => { 104 | dispatcher.lock().expect("lock").dispatch(Action::LogMessage(LogMessageUI { 105 | message: event.message, 106 | level: LogLevel::Info, 107 | timestamp: event.timestamp, 108 | })); 109 | } 110 | BenchmarkEvent::BenchmarkReportEnd(path) => { 111 | dispatcher.lock().expect("lock").dispatch(Action::LogMessage(LogMessageUI { 112 | message: format!("Benchmark report saved to {}", path), 113 | level: LogLevel::Info, 114 | timestamp: chrono::Utc::now(), 115 | })); 116 | break; 117 | } 118 | BenchmarkEvent::BenchmarkError(event) => { 119 | dispatcher.lock().expect("lock").dispatch(Action::LogMessage(LogMessageUI { 120 | message: format!("Error running benchmark: {:?}", event), 121 | level: LogLevel::Error, 122 | timestamp: chrono::Utc::now(), 123 | })); 124 | break; 125 | } 126 | } 127 | } 128 | }=>{} 129 | _ = stop_receiver_signal.recv() => {} 130 | } 131 | }); 132 | let mut stop_receiver_signal = stop_sender.subscribe(); 133 | let app_thread = tokio::spawn(async move { 134 | tokio::select! 
{ 135 | _ = async { 136 | let _ = app.run(&mut ratatui::init()).await; 137 | ratatui::restore(); 138 | }=>{} 139 | _ = stop_receiver_signal.recv() => {} 140 | } 141 | }); 142 | let _ = event_thread.await; 143 | let _ = app_thread.await; 144 | } 145 | 146 | impl App { 147 | pub fn new( 148 | benchmark_config: BenchmarkConfig, 149 | receiver: Receiver, 150 | stop_sender: Sender<()>, 151 | ) -> App { 152 | let store = Arc::from(Mutex::new(Store::new())); 153 | let dispatcher = Arc::from(Mutex::new(Dispatcher::new(store.clone()))); 154 | App { 155 | exit: false, 156 | store: store.clone(), 157 | dispatcher: dispatcher.clone(), 158 | receiver, 159 | benchmark_config, 160 | stop_sender, 161 | } 162 | } 163 | pub async fn run(&mut self, terminal: &mut DefaultTerminal) -> io::Result<()> { 164 | while !self.exit { 165 | terminal.draw(|frame| self.draw(frame))?; 166 | self.handle_events().await?; 167 | } 168 | // signal everybody to stop 169 | let _ = self.stop_sender.send(()); 170 | Ok(()) 171 | } 172 | fn draw(&self, frame: &mut Frame) { 173 | frame.render_widget(self, frame.area()) 174 | } 175 | async fn handle_events(&mut self) -> io::Result<()> { 176 | match self.receiver.recv().await { 177 | None => Err(io::Error::new(io::ErrorKind::Other, "No event")), 178 | Some(event) => match event { 179 | AppEvent::Tick => Ok(()), 180 | AppEvent::Key(key_event) => self.handle_key_event(key_event), 181 | AppEvent::Resize => Ok(()), 182 | }, 183 | } 184 | } 185 | 186 | fn handle_key_event(&mut self, key_event: KeyEvent) -> io::Result<()> { 187 | match key_event { 188 | KeyEvent { 189 | code: KeyCode::Char('q'), 190 | .. 191 | } => self.exit(), 192 | KeyEvent { 193 | code: KeyCode::Char('c'), 194 | modifiers: KeyModifiers::CONTROL, 195 | .. 196 | } => self.exit(), 197 | _ => {} 198 | } 199 | Ok(()) 200 | } 201 | fn exit(&mut self) { 202 | self.exit = true; 203 | } 204 | 205 | fn create_datasets(&self, state: AppState) -> HashMap> { 206 | let token_throughput_rate = state 207 | .results 208 | .iter() 209 | .filter_map(|r| match r.executor_type() { 210 | ExecutorType::ConstantArrivalRate => { 211 | let throughput = r.token_throughput_secs().unwrap_or(0.0); 212 | Some((r.executor_config().rate.unwrap(), throughput)) 213 | } 214 | ExecutorType::ConstantVUs => None, 215 | }) 216 | .collect::>(); 217 | let token_throughput_vus = state 218 | .results 219 | .iter() 220 | .filter_map(|r| match r.executor_type() { 221 | ExecutorType::ConstantVUs => { 222 | let throughput = r.token_throughput_secs().unwrap_or(0.0); 223 | Some((r.executor_config().max_vus as f64, throughput)) 224 | } 225 | ExecutorType::ConstantArrivalRate => None, 226 | }) 227 | .collect::>(); 228 | let inter_token_latency_rate = state 229 | .results 230 | .iter() 231 | .filter_map(|r| match r.executor_type() { 232 | ExecutorType::ConstantArrivalRate => { 233 | let latency = r 234 | .inter_token_latency_avg() 235 | .unwrap_or_default() 236 | .as_secs_f64(); 237 | Some((r.executor_config().rate.unwrap(), latency)) 238 | } 239 | ExecutorType::ConstantVUs => None, 240 | }) 241 | .collect::>(); 242 | let inter_token_latency_vus = state 243 | .results 244 | .iter() 245 | .filter_map(|r| match r.executor_type() { 246 | ExecutorType::ConstantVUs => { 247 | let latency = r 248 | .inter_token_latency_avg() 249 | .unwrap_or_default() 250 | .as_secs_f64(); 251 | Some((r.executor_config().max_vus as f64, latency)) 252 | } 253 | ExecutorType::ConstantArrivalRate => None, 254 | }) 255 | .collect::>(); 256 | HashMap::from([ 257 | ("token_throughput_rate".to_string(), 
token_throughput_rate), 258 | ("token_throughput_vus".to_string(), token_throughput_vus), 259 | ( 260 | "inter_token_latency_rate".to_string(), 261 | inter_token_latency_rate, 262 | ), 263 | ( 264 | "inter_token_latency_vus".to_string(), 265 | inter_token_latency_vus, 266 | ), 267 | ]) 268 | } 269 | } 270 | 271 | impl Widget for &App { 272 | fn render(self, area: Rect, buf: &mut Buffer) { 273 | let state = self.store.lock().unwrap().get_state(); 274 | let data = self.create_datasets(state.clone()); 275 | 276 | let main_layout = Layout::default() 277 | .direction(Direction::Vertical) 278 | .constraints([Constraint::Length(1), Constraint::Min(20)]) 279 | .split(area); 280 | let bottom_layout = Layout::default() 281 | .direction(Direction::Vertical) 282 | .constraints([Constraint::Percentage(50), Constraint::Percentage(50)]) 283 | .split(main_layout[1]); 284 | let steps_graph_layout = Layout::default() 285 | .direction(Direction::Horizontal) 286 | .constraints([Constraint::Percentage(35), Constraint::Percentage(65)]) 287 | .split(bottom_layout[0]); 288 | // LOGS 289 | let logs_title = Line::from("Logs".bold()).centered(); 290 | let logs_block = Block::bordered() 291 | .title_top(logs_title) 292 | .border_set(border::THICK); 293 | List::new( 294 | state 295 | .messages 296 | .iter() 297 | .rev() 298 | .map(|m| { 299 | let level_span = match m.level { 300 | LogLevel::Info => { 301 | Span::raw(m.level.to_string().to_uppercase()).green().bold() 302 | } 303 | LogLevel::Warning => Span::raw(m.level.to_string().to_uppercase()) 304 | .yellow() 305 | .bold(), 306 | LogLevel::Error => { 307 | Span::raw(m.level.to_string().to_uppercase()).red().bold() 308 | } 309 | }; 310 | let content = Line::from(vec![ 311 | m.formatted_timestamp().clone().gray(), 312 | Span::raw(" "), 313 | level_span, 314 | Span::raw(" "), 315 | Span::raw(m.message.to_string()).bold(), 316 | ]); 317 | ListItem::new(content) 318 | }) 319 | .collect::>(), 320 | ) 321 | .direction(BottomToTop) 322 | .block(logs_block) 323 | .render(bottom_layout[1], buf); 324 | 325 | // BENCHMARK config 326 | let rate_mode = match self.benchmark_config.rates { 327 | None => "Automatic".to_string(), 328 | Some(_) => "Manual".to_string(), 329 | }; 330 | let config_text = Text::from(vec![Line::from(vec![ 331 | format!("Profile: {profile} | Benchmark: {kind} | Max VUs: {max_vus} | Duration: {duration} sec | Rates: {rates} | Warmup: {warmup} sec", 332 | profile = self.benchmark_config.profile.clone().unwrap_or("N/A".to_string()), 333 | kind = self.benchmark_config.benchmark_kind, 334 | max_vus = self.benchmark_config.max_vus, 335 | duration = self.benchmark_config.duration.as_secs_f64(), 336 | rates = rate_mode, 337 | warmup = self.benchmark_config.warmup_duration.as_secs_f64()).white().bold(), 338 | ])]); 339 | Paragraph::new(config_text.clone()).render(main_layout[0], buf); 340 | 341 | // STEPS 342 | let steps_block_title = Line::from("Benchmark steps".bold()).centered(); 343 | let steps_block = Block::bordered() 344 | .title(steps_block_title.alignment(Alignment::Center)) 345 | .border_set(border::THICK); 346 | let step_rows = state 347 | .benchmarks 348 | .iter() 349 | .map(|b| { 350 | let error_rate = if b.failed_requests > 0 { 351 | format!( 352 | "{:4.0}%", 353 | b.failed_requests as f64 354 | / (b.failed_requests + b.successful_requests) as f64 355 | * 100. 
356 | ) 357 | .light_red() 358 | .bold() 359 | } else { 360 | format!("{:4.0}%", 0).to_string().white() 361 | }; 362 | let cells = vec![ 363 | b.id.clone().white(), 364 | b.status.to_string().white(), 365 | format!("{:4.0}%", b.progress).white(), 366 | error_rate, 367 | format!("{:>6.6} req/sec avg", b.throughput).green().bold(), 368 | ]; 369 | Row::new(cells) 370 | }) 371 | .collect::>(); 372 | let widths = [ 373 | Constraint::Length(30), 374 | Constraint::Length(10), 375 | Constraint::Length(5), 376 | Constraint::Length(5), 377 | Constraint::Length(20), 378 | ]; 379 | // steps table 380 | Table::new(step_rows, widths) 381 | .header(Row::new(vec![ 382 | Cell::new(Line::from("Bench").alignment(Alignment::Left)), 383 | Cell::new(Line::from("Status").alignment(Alignment::Left)), 384 | Cell::new(Line::from("%").alignment(Alignment::Left)), 385 | Cell::new(Line::from("Err").alignment(Alignment::Left)), 386 | Cell::new(Line::from("Throughput").alignment(Alignment::Left)), 387 | ])) 388 | .block(steps_block) 389 | .render(steps_graph_layout[0], buf); 390 | 391 | // CHARTS 392 | let graphs_block_title = Line::from("Token throughput rate".bold()).centered(); 393 | let graphs_block = Block::bordered() 394 | .title(graphs_block_title.alignment(Alignment::Center)) 395 | .border_set(border::THICK); 396 | let binding = data.get("token_throughput_rate").unwrap().clone(); 397 | let datasets = vec![Dataset::default() 398 | .name("Token throughput rate".to_string()) 399 | .marker(symbols::Marker::Dot) 400 | .graph_type(ratatui::widgets::GraphType::Scatter) 401 | .style(ratatui::style::Style::default().fg(ratatui::style::Color::LightMagenta)) 402 | .data(&binding)]; 403 | let (xmax, ymax) = get_max_bounds(&binding, (10.0, 100.0)); 404 | let x_axis = ratatui::widgets::Axis::default() 405 | .title("Arrival rate (req/s)".to_string()) 406 | .style(ratatui::style::Style::default().white()) 407 | .bounds([0.0, xmax]) 408 | .labels(get_axis_labels(0.0, xmax, 5)); 409 | let y_axis = ratatui::widgets::Axis::default() 410 | .title("Throughput (tokens/s)".to_string()) 411 | .style(ratatui::style::Style::default().white()) 412 | .bounds([0.0, ymax]) 413 | .labels(get_axis_labels(0.0, ymax, 5)); 414 | ratatui::widgets::Chart::new(datasets) 415 | .x_axis(x_axis) 416 | .y_axis(y_axis) 417 | .block(graphs_block) 418 | .legend_position(None) 419 | .render(steps_graph_layout[1], buf); 420 | } 421 | } 422 | 423 | fn get_max_bounds(data: &[(f64, f64)], default_max: (f64, f64)) -> (f64, f64) { 424 | let xmax = data 425 | .iter() 426 | .map(|(x, _)| x) 427 | .max_by(|a, b| a.partial_cmp(b).unwrap()) 428 | .unwrap_or(&default_max.0); 429 | let ymax = data 430 | .iter() 431 | .map(|(_, y)| y) 432 | .max_by(|a, b| a.partial_cmp(b).unwrap()) 433 | .unwrap_or(&default_max.1); 434 | (*xmax, *ymax) 435 | } 436 | 437 | fn get_axis_labels(min: f64, max: f64, num_labels: u32) -> Vec { 438 | let step = (max - min) / num_labels as f64; 439 | (0..num_labels) 440 | .map(|i| format!("{:.2}", min + i as f64 * step)) 441 | .collect() 442 | } 443 | 444 | #[allow(dead_code)] 445 | #[derive(Clone, strum_macros::Display)] 446 | enum LogLevel { 447 | Info, 448 | Warning, 449 | Error, 450 | } 451 | 452 | #[derive(Clone)] 453 | pub(crate) struct LogMessageUI { 454 | message: String, 455 | level: LogLevel, 456 | timestamp: chrono::DateTime, 457 | } 458 | 459 | impl LogMessageUI { 460 | fn formatted_timestamp(&self) -> String { 461 | self.timestamp.to_rfc3339() 462 | } 463 | } 464 | 465 | #[derive(Clone)] 466 | pub(crate) struct BenchmarkUI { 467 | 
pub(crate) id: String, 468 | status: BenchmarkStatus, 469 | progress: f64, 470 | throughput: String, 471 | successful_requests: u64, 472 | failed_requests: u64, 473 | } 474 | 475 | #[derive(Clone, strum_macros::Display)] 476 | enum BenchmarkStatus { 477 | Running, 478 | Completed, 479 | } 480 | -------------------------------------------------------------------------------- /src/benchmark.rs: -------------------------------------------------------------------------------- 1 | use crate::requests::{TextGenerationBackend, TextRequestGenerator, TokenizeOptions}; 2 | use crate::results::{BenchmarkReport, BenchmarkResults}; 3 | use crate::scheduler::{ExecutorType, SchedulerProgress}; 4 | use crate::{executors, scheduler}; 5 | use log::{debug, info}; 6 | use serde::Serialize; 7 | use std::collections::HashMap; 8 | use std::sync::Arc; 9 | use std::time::Duration; 10 | use tokio::sync::mpsc::{Receiver, Sender}; 11 | use tokio::sync::{broadcast, mpsc, Mutex}; 12 | 13 | const THROUGHPUT_BUDGET: f64 = 1.2; // sweep up to 120% of max throughput 14 | 15 | #[derive(Clone, Debug, strum_macros::Display, Serialize)] 16 | pub enum BenchmarkKind { 17 | Throughput, 18 | Sweep, 19 | Rate, 20 | } 21 | 22 | pub struct MessageEvent { 23 | pub message: String, 24 | pub timestamp: chrono::DateTime, 25 | pub level: log::Level, 26 | } 27 | 28 | pub struct BenchmarkEvent { 29 | pub id: String, 30 | pub scheduler_type: ExecutorType, 31 | pub request_throughput: Option, 32 | pub progress: f64, 33 | pub results: Option, 34 | pub successful_requests: u64, 35 | pub failed_requests: u64, 36 | } 37 | 38 | pub enum Event { 39 | BenchmarkStart(BenchmarkEvent), 40 | BenchmarkProgress(BenchmarkEvent), 41 | BenchmarkEnd(BenchmarkEvent), 42 | Message(MessageEvent), 43 | BenchmarkReportEnd(String), 44 | BenchmarkError(String), 45 | } 46 | 47 | pub struct Benchmark { 48 | start_time: Option, 49 | end_time: Option, 50 | backend: Box, 51 | requests: Arc>, 52 | report: BenchmarkReport, 53 | pub(crate) config: BenchmarkConfig, 54 | event_bus: mpsc::UnboundedSender, 55 | stop_sender: broadcast::Sender<()>, 56 | } 57 | 58 | #[serde_with::serde_as] 59 | #[derive(Clone, Serialize)] 60 | pub struct BenchmarkConfig { 61 | pub max_vus: u64, 62 | #[serde(rename = "duration_secs")] 63 | #[serde_as(as = "serde_with::DurationSeconds")] 64 | pub duration: Duration, 65 | pub benchmark_kind: BenchmarkKind, 66 | #[serde(rename = "warmup_duration_secs")] 67 | #[serde_as(as = "serde_with::DurationSeconds")] 68 | pub warmup_duration: Duration, 69 | pub rates: Option>, 70 | pub num_rates: u64, 71 | pub prompt_options: Option, 72 | pub decode_options: Option, 73 | pub tokenizer: String, 74 | pub model_name: String, 75 | pub profile: Option, 76 | #[serde(rename = "meta")] 77 | pub extra_metadata: Option>, 78 | pub run_id: String, 79 | } 80 | 81 | impl BenchmarkConfig { 82 | pub fn validate(&self) -> anyhow::Result<()> { 83 | if self.max_vus == 0 { 84 | return Err(anyhow::anyhow!("max_vus must be greater than 0")); 85 | } 86 | if self.duration.as_secs() == 0 { 87 | return Err(anyhow::anyhow!("duration must be greater than 0")); 88 | } 89 | if self.warmup_duration.as_secs() == 0 { 90 | return Err(anyhow::anyhow!("warmup_duration must be greater than 0")); 91 | } 92 | match self.benchmark_kind { 93 | BenchmarkKind::Throughput => { 94 | if self.rates.is_some() { 95 | return Err(anyhow::anyhow!( 96 | "rates must not be specified for throughput benchmark" 97 | )); 98 | } 99 | } 100 | BenchmarkKind::Sweep => { 101 | if self.rates.is_some() { 102 | return 
Err(anyhow::anyhow!( 103 | "rates must not be specified for sweep benchmark" 104 | )); 105 | } 106 | } 107 | BenchmarkKind::Rate => { 108 | if self.rates.is_none() { 109 | return Err(anyhow::anyhow!( 110 | "rates must be specified for rate benchmark" 111 | )); 112 | } 113 | } 114 | } 115 | Ok(()) 116 | } 117 | } 118 | 119 | pub struct BenchmarkProgress { 120 | id: String, 121 | progress: SchedulerProgress, 122 | } 123 | 124 | impl Benchmark { 125 | pub fn new( 126 | config: BenchmarkConfig, 127 | backend: Box, 128 | requests: Arc>, 129 | event_bus: mpsc::UnboundedSender, 130 | stop_sender: broadcast::Sender<()>, 131 | ) -> Benchmark { 132 | Benchmark { 133 | start_time: None, 134 | end_time: None, 135 | report: BenchmarkReport::new(), 136 | config: config.clone(), 137 | backend, 138 | requests, 139 | event_bus, 140 | stop_sender, 141 | } 142 | } 143 | 144 | pub fn get_report(&self) -> BenchmarkReport { 145 | self.report.clone() 146 | } 147 | 148 | pub async fn run(&mut self) -> anyhow::Result { 149 | self.start_time = Some(tokio::time::Instant::now()); 150 | self.report.start(); 151 | info!("Prewarming backend"); 152 | self.warmup().await?; 153 | info!("Prewarm complete"); 154 | match self.config.benchmark_kind { 155 | BenchmarkKind::Throughput => { 156 | self.run_throughput().await?; 157 | } 158 | BenchmarkKind::Sweep => { 159 | self.run_sweep().await?; 160 | } 161 | BenchmarkKind::Rate => { 162 | self.run_rates().await?; 163 | } 164 | } 165 | self.end_time = Some(tokio::time::Instant::now()); 166 | self.event_bus.send(Event::Message(MessageEvent { 167 | message: format!( 168 | "Benchmark complete in {:?}", 169 | self.duration().expect("duration exists") 170 | ), 171 | timestamp: chrono::Utc::now(), 172 | level: log::Level::Info, 173 | }))?; 174 | self.report.end(); 175 | Ok(self.report.clone()) 176 | } 177 | 178 | pub fn duration(&self) -> Option { 179 | match (self.start_time, self.end_time) { 180 | (Some(start), Some(end)) => Some(end.duration_since(start)), 181 | _ => None, 182 | } 183 | } 184 | 185 | async fn handle_progress(&self, id: String) -> Sender> { 186 | let (tx, mut rx): ( 187 | Sender>, 188 | Receiver>, 189 | ) = mpsc::channel(8); 190 | let event_bus = self.event_bus.clone(); 191 | tokio::spawn(async move { 192 | while let Some(event) = rx.recv().await { 193 | match event { 194 | None => { 195 | break; 196 | } 197 | Some(progress) => { 198 | let progress_evt = BenchmarkProgress { 199 | id: id.clone(), 200 | progress, 201 | }; 202 | let _ = event_bus.send(Event::BenchmarkProgress(BenchmarkEvent { 203 | id: progress_evt.id, 204 | scheduler_type: ExecutorType::ConstantVUs, 205 | request_throughput: Some(progress_evt.progress.requests_throughput), 206 | progress: progress_evt.progress.progress, 207 | successful_requests: progress_evt.progress.successful_requests, 208 | failed_requests: progress_evt.progress.failed_requests, 209 | results: None, 210 | })); 211 | } 212 | } 213 | } 214 | }); 215 | tx 216 | } 217 | 218 | pub async fn warmup(&mut self) -> anyhow::Result<()> { 219 | // run a warmup benchmark to prewarm the server 220 | 221 | let id = "warmup".to_string(); 222 | 223 | // notify start event 224 | self.event_bus.send(Event::BenchmarkStart(BenchmarkEvent { 225 | id: id.to_string(), 226 | scheduler_type: ExecutorType::ConstantVUs, 227 | request_throughput: None, 228 | progress: 0.0, 229 | results: None, 230 | successful_requests: 0, 231 | failed_requests: 0, 232 | }))?; 233 | 234 | // create progress handler 235 | let tx = self.handle_progress(id.clone()).await; 236 | 237 
| // start scheduler 238 | let mut scheduler = scheduler::Scheduler::new( 239 | id, 240 | self.backend.clone(), 241 | ExecutorType::ConstantVUs, 242 | executors::ExecutorConfig { 243 | max_vus: 1, 244 | duration: self.config.warmup_duration, 245 | rate: None, 246 | }, 247 | self.requests.clone(), 248 | tx.clone(), 249 | self.stop_sender.clone(), 250 | ); 251 | scheduler.run().await?; 252 | 253 | let results = scheduler.get_results().lock().await.clone(); 254 | self.report.add_benchmark_result(results.clone()); 255 | 256 | // send None to close the progress handler 257 | tx.send(None).await.unwrap(); 258 | 259 | // notify end event 260 | self.event_bus.send(Event::BenchmarkEnd(BenchmarkEvent { 261 | id: "warmup".to_string(), 262 | scheduler_type: ExecutorType::ConstantVUs, 263 | request_throughput: results.successful_request_rate().ok(), 264 | progress: 100.0, 265 | results: Some(results.clone()), 266 | successful_requests: results.successful_requests() as u64, 267 | failed_requests: results.failed_requests() as u64, 268 | }))?; 269 | Ok(()) 270 | } 271 | 272 | pub async fn run_throughput(&mut self) -> anyhow::Result<()> { 273 | info!("Running throughput benchmark"); 274 | 275 | let id = "throughput".to_string(); 276 | 277 | // notify start event 278 | self.event_bus.send(Event::BenchmarkStart(BenchmarkEvent { 279 | id: id.clone(), 280 | scheduler_type: ExecutorType::ConstantVUs, 281 | request_throughput: None, 282 | progress: 0.0, 283 | results: None, 284 | successful_requests: 0, 285 | failed_requests: 0, 286 | }))?; 287 | 288 | // create progress handler 289 | let tx = self.handle_progress(id.clone()).await; 290 | 291 | // start scheduler 292 | let mut scheduler = scheduler::Scheduler::new( 293 | id.clone(), 294 | self.backend.clone(), 295 | ExecutorType::ConstantVUs, 296 | executors::ExecutorConfig { 297 | max_vus: self.config.max_vus, 298 | duration: self.config.duration, 299 | rate: None, 300 | }, 301 | self.requests.clone(), 302 | tx.clone(), 303 | self.stop_sender.clone(), 304 | ); 305 | scheduler.run().await?; 306 | let results = scheduler.get_results().lock().await.clone(); 307 | let rate = results.successful_request_rate().ok(); 308 | self.report.add_benchmark_result(results.clone()); 309 | 310 | // send None to close the progress handler 311 | tx.send(None).await.unwrap(); 312 | 313 | // notify end event 314 | self.event_bus.send(Event::BenchmarkEnd(BenchmarkEvent { 315 | id: id.clone(), 316 | scheduler_type: ExecutorType::ConstantVUs, 317 | request_throughput: rate, 318 | progress: 100.0, 319 | results: Some(results.clone()), 320 | successful_requests: results.successful_requests() as u64, 321 | failed_requests: results.failed_requests() as u64, 322 | }))?; 323 | Ok(()) 324 | } 325 | 326 | pub async fn run_sweep(&mut self) -> anyhow::Result<()> { 327 | // run a throughput benchmark to retrieve the maximum throughput of server 328 | self.run_throughput().await?; 329 | // get the max throughput from the second benchmark result (first is warmup) 330 | let throughput_results = &self.report.get_results()[1]; 331 | let max_throughput = throughput_results.successful_request_rate()?; 332 | let max_tokens_throughput = throughput_results.token_throughput_secs()?; 333 | // notify event bus 334 | self.event_bus.send(Event::Message(MessageEvent { 335 | message: format!( 336 | "Max throughput detected at: {:.2} req/s | {:.2} tokens/s", 337 | max_throughput, max_tokens_throughput 338 | ), 339 | timestamp: chrono::Utc::now(), 340 | level: log::Level::Info, 341 | }))?; 342 | // run a sweep 
benchmark for 10 different rates from 1req/s to max throughput 343 | let mut rates = Vec::new(); 344 | let num_rates = self.config.num_rates; 345 | for i in 1..=num_rates { 346 | rates.push(i as f64 * max_throughput * THROUGHPUT_BUDGET / num_rates as f64); 347 | } 348 | for rate in rates { 349 | self.run_rate(rate).await?; 350 | } 351 | Ok(()) 352 | } 353 | 354 | pub async fn run_rates(&mut self) -> anyhow::Result<()> { 355 | let rates = self.config.rates.clone().expect("config already validated"); 356 | for rate in rates { 357 | self.run_rate(rate).await?; 358 | } 359 | Ok(()) 360 | } 361 | 362 | pub async fn run_rate(&mut self, rate: f64) -> anyhow::Result<()> { 363 | debug!("Running benchmark with rate: {} req/s", rate); 364 | 365 | let id = format!("constant@{:.2}req/s", rate); 366 | 367 | // notify start event 368 | self.event_bus.send(Event::BenchmarkStart(BenchmarkEvent { 369 | id: id.clone(), 370 | scheduler_type: ExecutorType::ConstantArrivalRate, 371 | request_throughput: None, 372 | progress: 0.0, 373 | results: None, 374 | successful_requests: 0, 375 | failed_requests: 0, 376 | }))?; 377 | 378 | // create progress handler 379 | let tx = self.handle_progress(id.clone()).await; 380 | 381 | // start scheduler 382 | let mut scheduler = scheduler::Scheduler::new( 383 | id, 384 | self.backend.clone(), 385 | scheduler::ExecutorType::ConstantArrivalRate, 386 | executors::ExecutorConfig { 387 | max_vus: self.config.max_vus, 388 | duration: self.config.duration, 389 | rate: Some(rate), 390 | }, 391 | self.requests.clone(), 392 | tx.clone(), 393 | self.stop_sender.clone(), 394 | ); 395 | scheduler.run().await?; 396 | let results = scheduler.get_results().lock().await.clone(); 397 | self.report.add_benchmark_result(results.clone()); 398 | 399 | // send None to close the progress handler 400 | tx.send(None).await.unwrap(); 401 | 402 | // notify end event 403 | self.event_bus.send(Event::BenchmarkEnd(BenchmarkEvent { 404 | id: format!("constant@{:.2}req/s", rate), 405 | scheduler_type: ExecutorType::ConstantArrivalRate, 406 | request_throughput: results.successful_request_rate().ok(), 407 | progress: 100.0, 408 | results: Some(results.clone()), 409 | successful_requests: results.successful_requests() as u64, 410 | failed_requests: results.failed_requests() as u64, 411 | }))?; 412 | Ok(()) 413 | } 414 | } 415 | 416 | #[cfg(test)] 417 | mod tests { 418 | use super::*; 419 | use crate::requests::DummyTextGenerationBackend; 420 | use crate::requests::DummyTextRequestGenerator; 421 | use std::time::Duration; 422 | 423 | #[tokio::test] 424 | async fn test_sweep_benchmark_timings() { 425 | let generation_time = Duration::from_secs(2); 426 | let (event_tx, mut _event_rx) = tokio::sync::mpsc::unbounded_channel(); 427 | let (stop_sender, _) = tokio::sync::broadcast::channel(1); 428 | let backend = Box::new(DummyTextGenerationBackend::new(Duration::from_secs( 429 | generation_time.as_secs(), 430 | ))); 431 | let requests_generator = Arc::from(Mutex::from(DummyTextRequestGenerator::new())); 432 | let mut benchmark = Benchmark::new( 433 | BenchmarkConfig { 434 | max_vus: 100, 435 | duration: Duration::from_secs(10), 436 | benchmark_kind: BenchmarkKind::Sweep, 437 | warmup_duration: Duration::from_secs(1), 438 | rates: None, 439 | num_rates: 2, 440 | prompt_options: None, 441 | decode_options: None, 442 | tokenizer: "gpt2".to_string(), 443 | model_name: "gpt2".to_string(), 444 | profile: None, 445 | extra_metadata: None, 446 | run_id: "test".to_string(), 447 | }, 448 | backend, 449 | requests_generator, 
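// event sink (unbounded channel) and broadcast stop handle created above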
450 | event_tx, 451 | stop_sender, 452 | ); 453 | let report = benchmark.run().await.unwrap(); 454 | assert_eq!(report.get_results().len(), 4); 455 | let generation_time_per_token_milli = generation_time.as_millis() as i128 / 10; 456 | for result in report.get_results() { 457 | let delta_ttft = result.time_to_first_token_avg().unwrap().as_millis() as i128 458 | - generation_time_per_token_milli; // Dummy backends generates 10 tokens 459 | let delta_itl = result.inter_token_latency_avg().unwrap().as_millis() as i128 460 | - generation_time_per_token_milli; 461 | let delta_e2e = result.e2e_latency_avg().unwrap().as_millis() as i128 462 | - generation_time.as_millis() as i128; 463 | let allowed_error_ms = 3; // allow error margin for timing tests 464 | assert!( 465 | delta_ttft.abs() <= allowed_error_ms, 466 | "time_to_first_token_delta: {:?}, expected {:?}", 467 | delta_ttft.abs(), 468 | allowed_error_ms 469 | ); 470 | assert!( 471 | delta_itl.abs() <= allowed_error_ms, 472 | "inter_token_latency_delta: {:?}, expected {:?}", 473 | delta_itl.abs(), 474 | allowed_error_ms 475 | ); 476 | assert!( 477 | delta_e2e.abs() <= allowed_error_ms * 10, // Cumulative error for 10 tokens 478 | "e2e_latency_delta: {:?}, expected {:?}", 479 | delta_e2e.abs(), 480 | allowed_error_ms * 10 481 | ); 482 | } 483 | } 484 | } 485 | -------------------------------------------------------------------------------- /src/event.rs: -------------------------------------------------------------------------------- 1 | use crossterm::event; 2 | use crossterm::event::KeyEvent; 3 | use std::time::{Duration, Instant}; 4 | use tokio::sync::{broadcast, mpsc}; 5 | 6 | pub enum AppEvent { 7 | Tick, 8 | Key(KeyEvent), 9 | Resize, 10 | } 11 | 12 | pub async fn terminal_event_task( 13 | fps: u32, 14 | event_sender: mpsc::Sender, 15 | mut shutdown_receiver: broadcast::Receiver<()>, 16 | ) { 17 | // End task if a message is received on shutdown_receiver 18 | // _shutdown_guard_sender will be dropped once the task is finished 19 | tokio::select! 
{ 20 | _ = event_loop(fps, event_sender) => { 21 | }, 22 | _ = shutdown_receiver.recv() => {} 23 | } 24 | } 25 | 26 | async fn event_loop(fps: u32, event_sender: mpsc::Sender) { 27 | // Frame budget 28 | let per_frame = Duration::from_secs(1) / fps; 29 | 30 | // When was last frame executed 31 | let mut last_frame = Instant::now(); 32 | 33 | loop { 34 | // Sleep to avoid blocking the thread for too long 35 | if let Some(sleep) = per_frame.checked_sub(last_frame.elapsed()) { 36 | tokio::time::sleep(sleep).await; 37 | } 38 | 39 | // Get crossterm event and send a new one over the channel 40 | if event::poll(Duration::from_secs(0)).expect("no events available") { 41 | match event::read().expect("unable to read event") { 42 | event::Event::Key(e) => event_sender.send(AppEvent::Key(e)).await.unwrap_or(()), 43 | event::Event::Resize(_w, _h) => { 44 | event_sender.send(AppEvent::Resize).await.unwrap_or(()) 45 | } 46 | _ => (), 47 | } 48 | } 49 | 50 | // Frame budget exceeded 51 | if last_frame.elapsed() >= per_frame { 52 | // Send tick 53 | event_sender.send(AppEvent::Tick).await.unwrap_or(()); 54 | // Rest last_frame time 55 | last_frame = Instant::now(); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/executors.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::AtomicI64; 2 | use std::sync::Arc; 3 | use std::time::Duration; 4 | 5 | use async_trait::async_trait; 6 | use log::{info, trace, warn}; 7 | use serde::Serialize; 8 | use tokio::sync::mpsc::{Receiver, Sender, UnboundedSender}; 9 | use tokio::sync::{broadcast, Mutex}; 10 | use tokio::task::JoinHandle; 11 | 12 | use crate::requests::{ 13 | TextGenerationAggregatedResponse, TextGenerationBackend, TextGenerationRequest, 14 | TextRequestGenerator, 15 | }; 16 | 17 | #[serde_with::serde_as] 18 | #[derive(Clone, Serialize)] 19 | pub struct ExecutorConfig { 20 | pub max_vus: u64, 21 | #[serde(rename = "duration_secs")] 22 | #[serde_as(as = "serde_with::DurationSeconds")] 23 | pub duration: Duration, 24 | pub rate: Option, 25 | } 26 | 27 | #[async_trait] 28 | pub trait Executor { 29 | async fn run( 30 | &self, 31 | requests: Arc>, 32 | responses_tx: UnboundedSender, 33 | stop_sender: broadcast::Sender<()>, 34 | ); 35 | } 36 | 37 | pub struct ConstantVUsExecutor { 38 | config: ExecutorConfig, 39 | backend: Box, 40 | } 41 | 42 | impl ConstantVUsExecutor { 43 | pub fn new( 44 | backend: Box, 45 | max_vus: u64, 46 | duration: Duration, 47 | ) -> ConstantVUsExecutor { 48 | Self { 49 | backend, 50 | config: ExecutorConfig { 51 | max_vus, 52 | duration, 53 | rate: None, 54 | }, 55 | } 56 | } 57 | } 58 | 59 | #[async_trait] 60 | impl Executor for ConstantVUsExecutor { 61 | async fn run( 62 | &self, 63 | requests: Arc>, 64 | responses_tx: UnboundedSender, 65 | stop_sender: broadcast::Sender<()>, 66 | ) { 67 | let start = std::time::Instant::now(); 68 | // channel to handle ending VUs 69 | let (end_tx, mut end_rx): (Sender, Receiver) = 70 | tokio::sync::mpsc::channel(self.config.max_vus as usize); 71 | let active_vus = Arc::new(AtomicI64::new(0)); 72 | // start all VUs 73 | for _ in 0..self.config.max_vus { 74 | let mut requests_guard = requests.lock().await; 75 | let request = Arc::from(requests_guard.generate_request()); 76 | drop(requests_guard); 77 | start_vu( 78 | self.backend.clone(), 79 | request, 80 | responses_tx.clone(), 81 | end_tx.clone(), 82 | stop_sender.clone(), 83 | ) 84 | .await; 85 | active_vus.fetch_add(1, 
std::sync::atomic::Ordering::SeqCst); 86 | } 87 | let mut stop_receiver = stop_sender.subscribe(); 88 | tokio::select! { 89 | _ = stop_receiver.recv() => { 90 | return; 91 | }, 92 | _ = async { 93 | // replenish VUs as they finish 94 | while end_rx.recv().await.is_some() { 95 | active_vus.fetch_sub(1, std::sync::atomic::Ordering::SeqCst); 96 | if start.elapsed() > self.config.duration{ 97 | // signal that the VU work is done 98 | let _ = responses_tx.send(TextGenerationAggregatedResponse::new_as_ended()); 99 | info!("Duration reached, waiting for all VUs to finish..."); 100 | if active_vus.load(std::sync::atomic::Ordering::SeqCst) == 0 { 101 | break; 102 | } 103 | } else { 104 | let mut requests_guard = requests.lock().await; 105 | let request = Arc::from(requests_guard.generate_request()); 106 | drop(requests_guard); 107 | active_vus.fetch_add(1, std::sync::atomic::Ordering::SeqCst); 108 | start_vu(self.backend.clone(), request, responses_tx.clone(), end_tx.clone(), stop_sender.clone()).await; 109 | } 110 | } 111 | }=>{} 112 | } 113 | } 114 | } 115 | 116 | async fn start_vu( 117 | backend: Box, 118 | request: Arc, 119 | responses_tx: UnboundedSender, 120 | end_tx: Sender, 121 | stop_sender: broadcast::Sender<()>, 122 | ) -> JoinHandle<()> { 123 | let mut stop_receiver = stop_sender.subscribe(); 124 | tokio::spawn(async move { 125 | tokio::select! { 126 | _ = stop_receiver.recv() => { 127 | let _ = end_tx.send(true).await; 128 | }, 129 | _ = async{ 130 | let (tx, mut rx): (Sender, Receiver) = tokio::sync::mpsc::channel(1); 131 | trace!("VU started with request: {:?}", request); 132 | let req_thread = tokio::spawn(async move { 133 | backend.generate(request.clone(), tx).await; 134 | }); 135 | let send_thread = tokio::spawn(async move { 136 | while let Some(response) = rx.recv().await { 137 | // ignore errors, if the receiver is gone we want to finish the request 138 | // to leave remote server in clean state 139 | let _ = responses_tx.send(response); 140 | } 141 | }); 142 | req_thread.await.unwrap(); 143 | send_thread.await.unwrap(); 144 | // signal that the VU work is done 145 | let _ = end_tx.send(true).await; 146 | }=>{} 147 | } 148 | }) 149 | } 150 | 151 | pub struct ConstantArrivalRateExecutor { 152 | config: ExecutorConfig, 153 | backend: Box, 154 | } 155 | 156 | impl ConstantArrivalRateExecutor { 157 | pub fn new( 158 | backend: Box, 159 | max_vus: u64, 160 | duration: Duration, 161 | rate: f64, 162 | ) -> ConstantArrivalRateExecutor { 163 | Self { 164 | backend, 165 | config: ExecutorConfig { 166 | max_vus, 167 | duration, 168 | rate: Some(rate), 169 | }, 170 | } 171 | } 172 | } 173 | 174 | #[async_trait] 175 | impl Executor for ConstantArrivalRateExecutor { 176 | async fn run( 177 | &self, 178 | requests: Arc>, 179 | responses_tx: UnboundedSender, 180 | stop_sender: broadcast::Sender<()>, 181 | ) { 182 | let start = std::time::Instant::now(); 183 | let active_vus = Arc::new(AtomicI64::new(0)); 184 | // channel to handle ending VUs 185 | let (end_tx, mut end_rx): (Sender, Receiver) = 186 | tokio::sync::mpsc::channel(self.config.max_vus as usize); 187 | let rate = self.config.rate.expect("checked in new()"); 188 | // spawn new VUs every `tick_ms` to reach the expected `rate` per second, until the duration is reached 189 | let tick_ms = 10; 190 | let mut interval = tokio::time::interval(Duration::from_millis(tick_ms)); 191 | 192 | let backend = self.backend.clone(); 193 | let duration = self.config.duration; 194 | let max_vus = self.config.max_vus; 195 | let active_vus_thread = 
active_vus.clone(); 196 | let mut stop_receiver_signal = stop_sender.subscribe(); 197 | let vu_thread = tokio::spawn(async move { 198 | tokio::select! { 199 | _ = stop_receiver_signal.recv() => {}, 200 | _= async { 201 | let mut spawn_queue = 0.; // start with at least one VU 202 | while start.elapsed() < duration { 203 | spawn_queue += rate * (tick_ms as f64) / 1000.0; 204 | // delay spawning if we can't spawn a full VU yet 205 | if spawn_queue < 1.0 { 206 | interval.tick().await; 207 | continue; 208 | } 209 | // spawn VUs, keep track of the fraction of VU to spawn for the next iteration 210 | let to_spawn = spawn_queue.floor() as u64; 211 | spawn_queue -= to_spawn as f64; 212 | for _ in 0..to_spawn { 213 | if active_vus_thread.load(std::sync::atomic::Ordering::SeqCst) < max_vus as i64 { 214 | let mut requests_guard = requests.lock().await; 215 | let request = Arc::from(requests_guard.generate_request()); 216 | start_vu(backend.clone(), request.clone(), responses_tx.clone(), end_tx.clone(),stop_sender.clone()).await; 217 | active_vus_thread.fetch_add(1, std::sync::atomic::Ordering::SeqCst); 218 | } else { 219 | warn!("Max VUs reached, skipping request"); 220 | break; 221 | } 222 | } 223 | interval.tick().await; 224 | } 225 | // signal that the VU work is done 226 | info!("Duration reached, waiting for all VUs to finish..."); 227 | let _ = responses_tx.send(TextGenerationAggregatedResponse::new_as_ended()); 228 | }=>{} 229 | } 230 | }); 231 | while end_rx.recv().await.is_some() { 232 | active_vus.fetch_sub(1, std::sync::atomic::Ordering::SeqCst); 233 | // wait for all VUs to finish 234 | if start.elapsed() > self.config.duration 235 | && active_vus.load(std::sync::atomic::Ordering::SeqCst) == 0 236 | { 237 | break; 238 | } 239 | } 240 | // wait for the VU thread to finish 241 | vu_thread.await.unwrap(); 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /src/flux.rs: -------------------------------------------------------------------------------- 1 | use crate::results::BenchmarkResults; 2 | use std::sync::{Arc, Mutex}; 3 | 4 | // Flux pattern 5 | #[derive(Clone)] 6 | pub struct Dispatcher { 7 | store: Arc>, 8 | } 9 | 10 | impl Dispatcher { 11 | pub(crate) fn new(store: Arc>) -> Self { 12 | Self { store } 13 | } 14 | pub(crate) fn dispatch(&mut self, action: Action) { 15 | self.store.lock().unwrap().update(action); 16 | } 17 | } 18 | 19 | #[derive(Clone)] 20 | pub struct AppState { 21 | pub(crate) messages: Vec, 22 | pub(crate) benchmarks: Vec, 23 | pub(crate) results: Vec, 24 | } 25 | 26 | impl AppState { 27 | fn new() -> Self { 28 | Self { 29 | messages: Vec::new(), 30 | benchmarks: Vec::new(), 31 | results: Vec::new(), 32 | } 33 | } 34 | } 35 | 36 | pub struct Store { 37 | state: AppState, 38 | } 39 | 40 | impl Store { 41 | pub(crate) fn new() -> Self { 42 | let state = AppState::new(); 43 | Self { state } 44 | } 45 | 46 | fn update(&mut self, action: Action) { 47 | match action { 48 | Action::LogMessage(message) => self.state.messages.push(message), 49 | Action::AddBenchmark(benchmark) => { 50 | // add or update benchmark 51 | let index = self 52 | .state 53 | .benchmarks 54 | .iter() 55 | .position(|b| b.id == benchmark.id); 56 | match index { 57 | Some(i) => { 58 | self.state.benchmarks[i] = benchmark; 59 | } 60 | None => { 61 | self.state.benchmarks.push(benchmark); 62 | } 63 | } 64 | } 65 | Action::AddBenchmarkResults(results) => { 66 | let index = self 67 | .state 68 | .results 69 | .iter_mut() 70 | .position(|b| b.id == 
results.id); 71 | match index { 72 | Some(i) => { 73 | self.state.results[i] = results; 74 | } 75 | None => { 76 | self.state.results.push(results); 77 | } 78 | } 79 | } 80 | } 81 | } 82 | 83 | pub(crate) fn get_state(&self) -> AppState { 84 | self.state.clone() 85 | } 86 | } 87 | 88 | pub enum Action { 89 | LogMessage(crate::app::LogMessageUI), 90 | AddBenchmark(crate::app::BenchmarkUI), 91 | AddBenchmarkResults(BenchmarkResults), 92 | } 93 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::fs::File; 3 | use std::io; 4 | use std::io::Write; 5 | use std::path::Path; 6 | use std::sync::Arc; 7 | 8 | pub use crate::app::run_console; 9 | pub use crate::benchmark::{BenchmarkConfig, BenchmarkKind}; 10 | use crate::benchmark::{Event, MessageEvent}; 11 | pub use crate::profiles::apply_profile; 12 | use crate::requests::OpenAITextGenerationBackend; 13 | pub use crate::requests::TokenizeOptions; 14 | use chrono::Local; 15 | use crossterm::ExecutableCommand; 16 | use log::{debug, error, info, warn, Level, LevelFilter}; 17 | use reqwest::Url; 18 | use tokenizers::{FromPretrainedParameters, Tokenizer}; 19 | use tokio::sync::broadcast::Sender; 20 | use tokio::sync::Mutex; 21 | use writers::BenchmarkReportWriter; 22 | 23 | mod app; 24 | mod benchmark; 25 | mod event; 26 | mod executors; 27 | mod flux; 28 | mod profiles; 29 | mod requests; 30 | mod results; 31 | mod scheduler; 32 | mod table; 33 | mod writers; 34 | 35 | pub struct RunConfiguration { 36 | pub url: Url, 37 | pub api_key: String, 38 | pub tokenizer_name: String, 39 | pub profile: Option, 40 | pub max_vus: u64, 41 | pub duration: std::time::Duration, 42 | pub rates: Option>, 43 | pub num_rates: u64, 44 | pub benchmark_kind: String, 45 | pub warmup_duration: std::time::Duration, 46 | pub interactive: bool, 47 | pub prompt_options: Option, 48 | pub decode_options: Option, 49 | pub dataset: String, 50 | pub dataset_file: String, 51 | pub hf_token: Option, 52 | pub extra_metadata: Option>, 53 | pub model_name: String, 54 | pub run_id: String, 55 | } 56 | 57 | pub async fn run(mut run_config: RunConfiguration, stop_sender: Sender<()>) -> anyhow::Result<()> { 58 | info!("Starting benchmark"); 59 | // set process system limits 60 | sysinfo::set_open_files_limit(0); 61 | // apply profile if needed 62 | run_config = match run_config.profile.clone() { 63 | None => run_config, 64 | Some(profile) => match apply_profile(profile.as_str(), run_config) { 65 | Ok(config) => { 66 | info!("Profile applied: {}", profile); 67 | config 68 | } 69 | Err(e) => { 70 | error!("Failed to apply profile: {:?}", e); 71 | return Err(e); 72 | } 73 | }, 74 | }; 75 | // initialize tokenizer 76 | let params = FromPretrainedParameters { 77 | token: run_config.hf_token.clone(), 78 | ..Default::default() 79 | }; 80 | let tokenizer = 81 | match Tokenizer::from_pretrained(run_config.tokenizer_name.clone(), Some(params)) { 82 | Ok(tokenizer) => tokenizer, 83 | Err(e) => { 84 | return Err(anyhow::anyhow!("Error loading tokenizer: {e}")); 85 | } 86 | }; 87 | let tokenizer = Arc::new(tokenizer); 88 | let backend = OpenAITextGenerationBackend::try_new( 89 | run_config.api_key, 90 | run_config.url, 91 | run_config.model_name.clone(), 92 | tokenizer, 93 | run_config.duration, 94 | )?; 95 | 96 | let config = BenchmarkConfig { 97 | max_vus: run_config.max_vus, 98 | duration: run_config.duration, 99 | benchmark_kind: 
match run_config.benchmark_kind.to_lowercase().as_str() { 100 | "throughput" => BenchmarkKind::Throughput, 101 | "sweep" => BenchmarkKind::Sweep, 102 | "rate" => BenchmarkKind::Rate, 103 | _ => BenchmarkKind::Sweep, 104 | }, 105 | warmup_duration: run_config.warmup_duration, 106 | rates: run_config.rates, 107 | num_rates: run_config.num_rates, 108 | prompt_options: run_config.prompt_options.clone(), 109 | decode_options: run_config.decode_options.clone(), 110 | tokenizer: run_config.tokenizer_name.clone(), 111 | model_name: run_config.model_name.clone(), 112 | profile: run_config.profile.clone(), 113 | extra_metadata: run_config.extra_metadata.clone(), 114 | run_id: run_config.run_id.clone(), 115 | }; 116 | config.validate()?; 117 | let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel(); 118 | if run_config.interactive { 119 | // send logs to file 120 | let target = Box::new(File::create("log.txt").expect("Can't create file")); 121 | env_logger::Builder::new() 122 | .target(env_logger::Target::Pipe(target)) 123 | .filter(Some("inference_benchmarker"), LevelFilter::Debug) 124 | .format(|buf, record| { 125 | writeln!( 126 | buf, 127 | "[{} {} {}:{}] {}", 128 | Local::now().format("%Y-%m-%d %H:%M:%S%.3f"), 129 | record.level(), 130 | record.file().unwrap_or("unknown"), 131 | record.line().unwrap_or(0), 132 | record.args() 133 | ) 134 | }) 135 | .init(); 136 | } else { 137 | env_logger::init(); 138 | } 139 | let config_clone = config.clone(); 140 | let mut stop_receiver = stop_sender.subscribe(); 141 | let stop_sender_clone = stop_sender.clone(); 142 | let ui_thread = tokio::spawn(async move { 143 | tokio::select! { 144 | _ = stop_receiver.recv() => { 145 | debug!("Received stop signal, stopping benchmark"); 146 | } 147 | _ = async{ 148 | if run_config.interactive { 149 | run_console(config_clone, rx, stop_sender_clone).await; 150 | } else { 151 | // consume the channel to avoid closed channel error 152 | while rx.recv().await.is_some() {} 153 | } 154 | } => {} 155 | } 156 | }); 157 | 158 | // download prompts dataset 159 | info!("Downloading dataset"); 160 | let _ = tx.send(Event::Message(MessageEvent { 161 | message: "Downloading dataset".to_string(), 162 | timestamp: chrono::Utc::now(), 163 | level: Level::Info, 164 | })); 165 | let filepath = requests::ConversationTextRequestGenerator::download_dataset( 166 | run_config.dataset, 167 | run_config.dataset_file, 168 | run_config.hf_token.clone(), 169 | ) 170 | .expect("Can't download dataset"); 171 | let requests = requests::ConversationTextRequestGenerator::load( 172 | filepath, 173 | run_config.tokenizer_name.clone(), 174 | run_config.prompt_options, 175 | run_config.decode_options, 176 | run_config.hf_token, 177 | )?; 178 | 179 | let mut benchmark = benchmark::Benchmark::new( 180 | config.clone(), 181 | Box::new(backend), 182 | Arc::from(Mutex::from(requests)), 183 | tx.clone(), 184 | stop_sender.clone(), 185 | ); 186 | let mut stop_receiver = stop_sender.subscribe(); 187 | tokio::select! 
{ 188 | report = benchmark.run() => { 189 | match report { 190 | Ok(_) => { 191 | let report = benchmark.get_report(); 192 | let path = format!("results/{}_{}.json",run_config.tokenizer_name.replace("/","_").replace(".","_"), chrono::Utc::now().format("%Y-%m-%d-%H-%M-%S")); 193 | let path=Path::new(&path); 194 | let writer=BenchmarkReportWriter::try_new(config.clone(), report)?; 195 | writer.json(path).await?; 196 | info!("Report saved to {:?}",path); 197 | let _ = tx.send(Event::BenchmarkReportEnd(format!("{:?}", path))); 198 | }, 199 | Err(e) => { 200 | error!("Error running benchmark: {:?}", e.to_string()); 201 | let _ = tx.send(Event::BenchmarkError(e.to_string())); 202 | } 203 | }; 204 | } 205 | _ = stop_receiver.recv() => { 206 | debug!("Received stop signal, stopping benchmark"); 207 | } 208 | } 209 | info!("Benchmark finished"); 210 | if !run_config.interactive { 211 | // quit app if not interactive 212 | let _ = stop_sender.send(()); 213 | } 214 | ui_thread.await?; 215 | 216 | // Revert terminal to original view 217 | io::stdout().execute(ratatui::crossterm::terminal::LeaveAlternateScreen)?; 218 | ratatui::crossterm::terminal::disable_raw_mode()?; 219 | io::stdout().execute(ratatui::crossterm::cursor::Show)?; 220 | 221 | let report = benchmark.get_report(); 222 | match BenchmarkReportWriter::try_new(config.clone(), report) { 223 | Ok(writer) => { 224 | writer.stdout().await?; 225 | } 226 | Err(_) => { 227 | warn!("No results to report."); 228 | } 229 | }; 230 | 231 | Ok(()) 232 | } 233 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use clap::error::ErrorKind::InvalidValue; 2 | use clap::{ArgGroup, Error, Parser}; 3 | use inference_benchmarker::{run, RunConfiguration, TokenizeOptions}; 4 | use log::{debug, error}; 5 | use reqwest::Url; 6 | use std::collections::HashMap; 7 | use std::time::Duration; 8 | use tokio::sync::broadcast; 9 | 10 | #[derive(Parser, Debug)] 11 | #[clap(author, version, about, long_about = None, group(ArgGroup::new("group_profile").multiple(true)),group(ArgGroup::new("group_manual").multiple(true).conflicts_with("group_profile")) 12 | )] 13 | struct Args { 14 | /// The name of the tokenizer to use 15 | #[clap(short, long, env)] 16 | tokenizer_name: String, 17 | 18 | /// The name of the model to use. If not provided, the same name as the tokenizer will be used. 19 | #[clap(long, env)] 20 | model_name: Option, 21 | 22 | /// The maximum number of virtual users to use 23 | #[clap(default_value = "128", short, long, env, group = "group_manual")] 24 | max_vus: u64, 25 | /// The duration of each benchmark step 26 | #[clap(default_value = "120s", short, long, env, group = "group_manual")] 27 | #[arg(value_parser = parse_duration)] 28 | duration: Duration, 29 | /// A list of rates of requests to send per second (only valid for the ConstantArrivalRate benchmark). 
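// Illustrative example (not from this file): `rates` is a `Vec` field under clap's derive API, so each
// rate is passed as a separate occurrence of the flag, e.g.
//
//     inference-benchmarker --benchmark-kind rate --rates 1.0 --rates 5.0 --rates 10.0 ...
//
// which would parse into `rates == Some(vec![1.0, 5.0, 10.0])`. Flag names follow the field names
// declared below; the numeric values are only an example.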
30 | #[clap(short, long, env)] 31 | rates: Option>, 32 | /// The number of rates to sweep through (only valid for the "sweep" benchmark) 33 | /// The rates will be linearly spaced up to the detected maximum rate 34 | #[clap(default_value = "10", long, env)] 35 | num_rates: u64, 36 | /// A benchmark profile to use 37 | #[clap(long, env, group = "group_profile")] 38 | profile: Option, 39 | /// The kind of benchmark to run (throughput, sweep, rate) 40 | #[clap(default_value = "sweep", short, long, env, group = "group_manual")] 41 | benchmark_kind: String, 42 | /// The duration of the prewarm step run before the benchmark to warm up the backend (JIT, caches, etc.) 43 | #[clap(default_value = "30s", short, long, env, group = "group_manual")] 44 | #[arg(value_parser = parse_duration)] 45 | warmup: Duration, 46 | /// The URL of the backend to benchmark. Must be compatible with the OpenAI Message API 47 | #[clap(default_value = "http://localhost:8000", short, long, env)] 48 | url: Url, 49 | 50 | /// The api key sent to the [`url`] as the header "Authorization: Bearer {API_KEY}". 51 | #[clap(default_value = "", short, long, env)] 52 | api_key: String, 53 | 54 | /// Disable console UI 55 | #[clap(short, long, env)] 56 | no_console: bool, 57 | /// Constraints for prompt length. 58 | /// No value means use the input prompt as defined in the input dataset. 59 | /// We sample the number of tokens to generate from a normal distribution. 60 | /// Specified as a comma-separated list of key=value pairs. 61 | /// * num_tokens: target number of prompt tokens 62 | /// * min_tokens: minimum number of prompt tokens 63 | /// * max_tokens: maximum number of prompt tokens 64 | /// * variance: variance in the number of prompt tokens 65 | /// 66 | /// Example: num_tokens=200,max_tokens=210,min_tokens=190,variance=10 67 | #[clap( 68 | long, 69 | env, 70 | value_parser(parse_tokenizer_options), 71 | group = "group_manual" 72 | )] 73 | prompt_options: Option, 74 | /// Constraints for the generated text. 75 | /// We sample the number of tokens to generate from a normal distribution. 76 | /// Specified as a comma-separated list of key=value pairs. 77 | /// * num_tokens: target number of generated tokens 78 | /// * min_tokens: minimum number of generated tokens 79 | /// * max_tokens: maximum number of generated tokens 80 | /// * variance: variance in the number of generated tokens 81 | /// 82 | /// Example: num_tokens=200,max_tokens=210,min_tokens=190,variance=10 83 | #[clap( 84 | long, 85 | env, 86 | value_parser(parse_tokenizer_options), 87 | group = "group_manual" 88 | )] 89 | decode_options: Option, 90 | /// Hugging Face dataset to use for prompt generation 91 | #[clap( 92 | default_value = "hlarcher/inference-benchmarker", 93 | long, 94 | env, 95 | group = "group_manual" 96 | )] 97 | dataset: String, 98 | /// File to use in the dataset 99 | #[clap( 100 | default_value = "share_gpt_filtered_small.json", 101 | long, 102 | env, 103 | group = "group_manual" 104 | )] 105 | dataset_file: String, 106 | /// Extra metadata to include in the benchmark results file, comma-separated key-value pairs. 107 | /// It can be, for example, used to include information about the configuration of the 108 | /// benched server. 109 | /// Example: --extra-meta "key1=value1,key2=value2" 110 | #[clap(long, env, value_parser(parse_key_val))] 111 | extra_meta: Option>, 112 | // A run identifier to use for the benchmark. This is used to identify the benchmark in the 113 | // results file. 
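// Illustrative example (not from this file): the constraint strings documented above are parsed by
// `parse_tokenizer_options` (defined further below). Assuming the field types match the profiles in
// src/profiles.rs, the documented example would behave roughly like this:
//
//     let opts = parse_tokenizer_options("num_tokens=200,max_tokens=210,min_tokens=190,variance=10").unwrap();
//     assert_eq!(opts.num_tokens, Some(200));
//     assert_eq!((opts.min_tokens, opts.max_tokens, opts.variance), (190, 210, 10));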
114 | #[clap(long, env)] 115 | run_id: Option, 116 | } 117 | 118 | fn parse_duration(s: &str) -> Result { 119 | humantime::parse_duration(s).map_err(|_| Error::new(InvalidValue)) 120 | } 121 | 122 | fn parse_key_val(s: &str) -> Result, Error> { 123 | let mut key_val_map = HashMap::new(); 124 | let items = s.split(",").collect::>(); 125 | for item in items.iter() { 126 | let key_value = item.split("=").collect::>(); 127 | if key_value.len() % 2 != 0 { 128 | return Err(Error::new(InvalidValue)); 129 | } 130 | for i in 0..key_value.len() / 2 { 131 | key_val_map.insert( 132 | key_value[i * 2].to_string(), 133 | key_value[i * 2 + 1].to_string(), 134 | ); 135 | } 136 | } 137 | Ok(key_val_map) 138 | } 139 | 140 | fn parse_tokenizer_options(s: &str) -> Result { 141 | let mut tokenizer_options = TokenizeOptions::new(); 142 | let items = s.split(",").collect::>(); 143 | for item in items.iter() { 144 | let key_value = item.split("=").collect::>(); 145 | if key_value.len() != 2 { 146 | return Err(Error::new(InvalidValue)); 147 | } 148 | match key_value[0] { 149 | "num_tokens" => { 150 | tokenizer_options.num_tokens = Some(key_value[1].parse::().unwrap()) 151 | } 152 | "min_tokens" => tokenizer_options.min_tokens = key_value[1].parse::().unwrap(), 153 | "max_tokens" => tokenizer_options.max_tokens = key_value[1].parse::().unwrap(), 154 | "variance" => tokenizer_options.variance = key_value[1].parse::().unwrap(), 155 | _ => return Err(Error::new(InvalidValue)), 156 | } 157 | } 158 | if tokenizer_options.num_tokens.is_some() 159 | && (tokenizer_options.num_tokens.unwrap() == 0 160 | || tokenizer_options.min_tokens == 0 161 | || tokenizer_options.max_tokens == 0) 162 | { 163 | return Err(Error::new(InvalidValue)); 164 | } 165 | if tokenizer_options.min_tokens > tokenizer_options.max_tokens { 166 | return Err(Error::new(InvalidValue)); 167 | } 168 | Ok(tokenizer_options) 169 | } 170 | 171 | #[tokio::main] 172 | async fn main() { 173 | let args = Args::parse(); 174 | let git_sha = option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"); 175 | println!( 176 | "Text Generation Inference Benchmark {} ({})", 177 | env!("CARGO_PKG_VERSION"), 178 | git_sha 179 | ); 180 | 181 | let (stop_sender, _) = broadcast::channel(1); 182 | // handle ctrl-c 183 | let stop_sender_clone = stop_sender.clone(); 184 | tokio::spawn(async move { 185 | tokio::signal::ctrl_c() 186 | .await 187 | .expect("Failed to listen for ctrl-c"); 188 | debug!("Received stop signal, stopping benchmark"); 189 | stop_sender_clone 190 | .send(()) 191 | .expect("Failed to send stop signal"); 192 | }); 193 | 194 | let stop_sender_clone = stop_sender.clone(); 195 | // get HF token 196 | let token_env_key = "HF_TOKEN".to_string(); 197 | let cache = hf_hub::Cache::from_env(); 198 | let hf_token = match std::env::var(token_env_key).ok() { 199 | Some(token) => Some(token), 200 | None => cache.token(), 201 | }; 202 | let model_name = args 203 | .model_name 204 | .clone() 205 | .unwrap_or(args.tokenizer_name.clone()); 206 | let run_id = args 207 | .run_id 208 | .unwrap_or(uuid::Uuid::new_v4().to_string()[..7].to_string()); 209 | let run_config = RunConfiguration { 210 | url: args.url, 211 | api_key: args.api_key, 212 | profile: args.profile.clone(), 213 | tokenizer_name: args.tokenizer_name.clone(), 214 | max_vus: args.max_vus, 215 | duration: args.duration, 216 | rates: args.rates, 217 | num_rates: args.num_rates, 218 | benchmark_kind: args.benchmark_kind.clone(), 219 | warmup_duration: args.warmup, 220 | interactive: !args.no_console, 221 | prompt_options: 
args.prompt_options.clone(), 222 | decode_options: args.decode_options.clone(), 223 | dataset: args.dataset.clone(), 224 | dataset_file: args.dataset_file.clone(), 225 | extra_metadata: args.extra_meta.clone(), 226 | hf_token, 227 | model_name, 228 | run_id, 229 | }; 230 | let main_thread = tokio::spawn(async move { 231 | match run(run_config, stop_sender_clone).await { 232 | Ok(_) => {} 233 | Err(e) => { 234 | error!("Fatal: {:?}", e); 235 | println!("Fatal: {:?}", e) 236 | } 237 | }; 238 | }); 239 | let _ = main_thread.await; 240 | } 241 | -------------------------------------------------------------------------------- /src/profiles.rs: -------------------------------------------------------------------------------- 1 | use crate::{RunConfiguration, TokenizeOptions}; 2 | use std::string::ToString; 3 | 4 | pub fn apply_profile( 5 | profile: &str, 6 | run_configuration: RunConfiguration, 7 | ) -> anyhow::Result { 8 | match profile { 9 | "fixed-length" => Ok(RunConfiguration { 10 | max_vus: 128, 11 | duration: std::time::Duration::from_secs(120), 12 | rates: None, 13 | num_rates: 10, 14 | benchmark_kind: "sweep".to_string(), 15 | warmup_duration: std::time::Duration::from_secs(30), 16 | prompt_options: Some(TokenizeOptions { 17 | num_tokens: Some(200), 18 | min_tokens: 200, 19 | max_tokens: 200, 20 | variance: 0, 21 | }), 22 | decode_options: Some(TokenizeOptions { 23 | num_tokens: Some(800), 24 | min_tokens: 50, 25 | max_tokens: 800, 26 | variance: 100, 27 | }), 28 | dataset: "hlarcher/inference-benchmarker".to_string(), 29 | dataset_file: "share_gpt_0_turns.json".to_string(), 30 | ..run_configuration 31 | }), 32 | "chat" => { 33 | // Simulates a multi-turn chat scenario 34 | // in which the model answers to successive user prompts. 35 | // The model is prompted with the whole conversation history 36 | // at each turn. Prefix caching will have a significant impact 37 | // on the performance of this benchmark. 38 | Ok(RunConfiguration { 39 | max_vus: 128, 40 | duration: std::time::Duration::from_secs(120), 41 | rates: None, 42 | num_rates: 10, 43 | benchmark_kind: "sweep".to_string(), 44 | warmup_duration: std::time::Duration::from_secs(30), 45 | prompt_options: None, // use prompts from dataset 46 | decode_options: Some(TokenizeOptions { 47 | num_tokens: Some(800), // decode up to 800 tokens 48 | min_tokens: 50, 49 | max_tokens: 800, 50 | variance: 100, 51 | }), 52 | dataset: "hlarcher/inference-benchmarker".to_string(), 53 | dataset_file: "share_gpt_turns.json".to_string(), 54 | ..run_configuration 55 | }) 56 | } 57 | "code-generation" => { 58 | // Simulates code-complete scenarios. Model is given large code snippets and 59 | // asked to complete them with a few tokens (e.g. a function name, a few code lines). 
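// Illustrative example (not from this file): a profile is selected with the `--profile` flag, which clap
// forbids combining with the manual tuning flags (the "group_profile" and "group_manual" argument groups
// are declared as conflicting in src/main.rs), e.g.
//
//     inference-benchmarker --profile code-generation --tokenizer-name <tokenizer> --url http://localhost:8000
//
// Fields the profile does not set (url, tokenizer, api key, run id, ...) are kept from the incoming
// configuration through the struct update syntax (`..run_configuration`) used below.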
60 | Ok(RunConfiguration { 61 | max_vus: 128, 62 | duration: std::time::Duration::from_secs(120), 63 | rates: None, 64 | num_rates: 10, 65 | benchmark_kind: "sweep".to_string(), 66 | warmup_duration: std::time::Duration::from_secs(30), 67 | prompt_options: Some(TokenizeOptions { 68 | num_tokens: Some(4096), 69 | min_tokens: 3000, 70 | max_tokens: 6000, 71 | variance: 1000, 72 | }), 73 | decode_options: Some(TokenizeOptions { 74 | num_tokens: Some(50), 75 | min_tokens: 30, 76 | max_tokens: 80, 77 | variance: 10, 78 | }), 79 | dataset: "hlarcher/inference-benchmarker".to_string(), 80 | dataset_file: "github_code.json".to_string(), 81 | ..run_configuration 82 | }) 83 | } 84 | "classification" => { 85 | // Simulates cases where the model is fed with large chunks 86 | // of business data or document repeatedly and users 87 | // ask simple questions about the content (summarization, classification...) 88 | // Those use cases benefit a lot from prefix caching and chunked prefill. 89 | Ok(RunConfiguration { 90 | max_vus: 128, 91 | duration: std::time::Duration::from_secs(120), 92 | rates: None, 93 | num_rates: 10, 94 | benchmark_kind: "sweep".to_string(), 95 | warmup_duration: std::time::Duration::from_secs(30), 96 | prompt_options: Some(TokenizeOptions { 97 | num_tokens: Some(10000), 98 | min_tokens: 8000, 99 | max_tokens: 12000, 100 | variance: 5000, 101 | }), 102 | decode_options: Some(TokenizeOptions { 103 | num_tokens: Some(50), 104 | min_tokens: 30, 105 | max_tokens: 80, 106 | variance: 10, 107 | }), 108 | dataset: "hlarcher/inference-benchmarker".to_string(), 109 | dataset_file: "classification.json".to_string(), 110 | ..run_configuration 111 | }) 112 | } 113 | _ => Err(anyhow::anyhow!("Unknown profile: {}", profile)), 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/results.rs: -------------------------------------------------------------------------------- 1 | use crate::executors::ExecutorConfig; 2 | use crate::requests::TextGenerationAggregatedResponse; 3 | use crate::results::BenchmarkErrors::NoResponses; 4 | use crate::scheduler::ExecutorType; 5 | use chrono::Utc; 6 | use std::fmt::{Debug, Display, Formatter}; 7 | use std::time::Duration; 8 | 9 | #[derive(Debug)] 10 | pub(crate) enum BenchmarkErrors { 11 | NoResponses, 12 | } 13 | 14 | impl Display for BenchmarkErrors { 15 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 16 | match self { 17 | NoResponses => write!(f, "Backend did not return any valid response. 
It is either not responding or test duration is too short."), 18 | } 19 | } 20 | } 21 | 22 | #[derive(Clone)] 23 | pub struct BenchmarkResults { 24 | pub id: String, 25 | aggregated_responses: Vec, 26 | executor_type: ExecutorType, 27 | executor_config: ExecutorConfig, 28 | } 29 | 30 | impl BenchmarkResults { 31 | pub fn new( 32 | id: String, 33 | executor_type: ExecutorType, 34 | executor_config: ExecutorConfig, 35 | ) -> BenchmarkResults { 36 | BenchmarkResults { 37 | id, 38 | aggregated_responses: Vec::new(), 39 | executor_type, 40 | executor_config, 41 | } 42 | } 43 | 44 | pub fn add_response(&mut self, response: TextGenerationAggregatedResponse) { 45 | self.aggregated_responses.push(response); 46 | } 47 | 48 | pub fn total_requests(&self) -> usize { 49 | self.aggregated_responses.len() 50 | } 51 | 52 | pub fn start_time(&self) -> Option { 53 | self.aggregated_responses 54 | .first() 55 | .and_then(|response| response.start_time) 56 | } 57 | 58 | pub fn end_time(&self) -> Option { 59 | self.aggregated_responses 60 | .last() 61 | .and_then(|response| response.end_time) 62 | } 63 | 64 | fn is_ready(&self) -> bool { 65 | self.start_time().is_some() && self.end_time().is_some() 66 | } 67 | 68 | pub fn failed_requests(&self) -> usize { 69 | self.aggregated_responses 70 | .iter() 71 | .filter(|response| response.failed) 72 | .count() 73 | } 74 | 75 | pub fn successful_requests(&self) -> usize { 76 | self.aggregated_responses 77 | .iter() 78 | .filter(|response| !response.failed) 79 | .count() 80 | } 81 | 82 | pub fn token_throughput_secs(&self) -> anyhow::Result { 83 | if self.is_ready() { 84 | let total_tokens: u64 = self.total_tokens(); 85 | Ok(total_tokens as f64 / self.duration().unwrap_or_default().as_secs_f64()) 86 | } else { 87 | Err(anyhow::anyhow!(NoResponses)) 88 | } 89 | } 90 | 91 | pub fn total_tokens_sent(&self) -> u64 { 92 | self.get_successful_responses() 93 | .iter() 94 | .map(|response| response.request.clone().unwrap().num_prompt_tokens) 95 | .sum() 96 | } 97 | 98 | pub fn total_prompt_tokens(&self) -> u64 { 99 | self.get_successful_responses() 100 | .iter() 101 | .map(|response| response.request.clone().unwrap().num_prompt_tokens) 102 | .sum() 103 | } 104 | 105 | pub fn prompt_tokens_avg(&self) -> anyhow::Result { 106 | if self.is_ready() { 107 | let total_prompt_tokens = self.total_prompt_tokens(); 108 | Ok(total_prompt_tokens as f64 / self.successful_requests() as f64) 109 | } else { 110 | Err(anyhow::anyhow!(NoResponses)) 111 | } 112 | } 113 | 114 | pub fn successful_request_rate(&self) -> anyhow::Result { 115 | if self.is_ready() { 116 | let total_requests = self.successful_requests(); 117 | Ok(total_requests as f64 / self.duration().unwrap_or_default().as_secs_f64()) 118 | } else { 119 | Err(anyhow::anyhow!(NoResponses)) 120 | } 121 | } 122 | 123 | pub fn total_tokens(&self) -> u64 { 124 | self.get_successful_responses() 125 | .iter() 126 | .map(|response| response.num_generated_tokens) 127 | .sum() 128 | } 129 | 130 | pub fn duration(&self) -> anyhow::Result { 131 | if self.is_ready() { 132 | Ok(self 133 | .end_time() 134 | .unwrap() 135 | .duration_since(self.start_time().unwrap())) 136 | } else { 137 | Err(anyhow::anyhow!(NoResponses)) 138 | } 139 | } 140 | 141 | pub fn e2e_latency_avg(&self) -> anyhow::Result { 142 | if self.is_ready() { 143 | if self.successful_requests() == 0 { 144 | return Ok(Duration::from_secs(0)); 145 | } 146 | Ok(self 147 | .get_successful_responses() 148 | .iter() 149 | .map(|response| response.e2e_latency().unwrap_or_default()) 150 | 
.sum::() 151 | / self.successful_requests() as u32) 152 | } else { 153 | Err(anyhow::anyhow!(NoResponses)) 154 | } 155 | } 156 | 157 | pub fn e2e_latency_percentile(&self, percentile: f64) -> anyhow::Result { 158 | let quantile = self.quantile_duration( 159 | self.get_successful_responses() 160 | .iter() 161 | .map(|response| response.e2e_latency().unwrap_or_default()) 162 | .collect(), 163 | percentile, 164 | )?; 165 | Ok(Duration::from_secs_f64(quantile)) 166 | } 167 | 168 | pub fn time_to_first_token_avg(&self) -> anyhow::Result { 169 | if self.is_ready() { 170 | if self.successful_requests() == 0 { 171 | return Ok(Duration::from_secs(0)); 172 | } 173 | Ok(self 174 | .get_successful_responses() 175 | .iter() 176 | .map(|response| response.time_to_first_token().unwrap_or_default()) 177 | .sum::() 178 | / self.successful_requests() as u32) 179 | } else { 180 | Err(anyhow::anyhow!(NoResponses)) 181 | } 182 | } 183 | 184 | pub fn time_to_first_token_percentile(&self, percentile: f64) -> anyhow::Result { 185 | let quantile = self.quantile_duration( 186 | self.get_successful_responses() 187 | .iter() 188 | .map(|response| response.time_to_first_token().unwrap_or_default()) 189 | .collect(), 190 | percentile, 191 | )?; 192 | Ok(Duration::from_secs_f64(quantile)) 193 | } 194 | 195 | pub fn inter_token_latency_avg(&self) -> anyhow::Result { 196 | if self.is_ready() { 197 | if self.successful_requests() == 0 { 198 | return Ok(Duration::from_secs(0)); 199 | } 200 | Ok(self 201 | .get_successful_responses() 202 | .iter() 203 | .map(|response| response.inter_token_latency().unwrap_or_default()) 204 | .sum::() 205 | / self.successful_requests() as u32) 206 | } else { 207 | Err(anyhow::anyhow!(NoResponses)) 208 | } 209 | } 210 | 211 | pub fn inter_token_latency_percentile(&self, percentile: f64) -> anyhow::Result { 212 | let quantile = self.quantile_duration( 213 | self.get_successful_responses() 214 | .iter() 215 | .map(|response| response.inter_token_latency().unwrap_or_default()) 216 | .collect(), 217 | percentile, 218 | )?; 219 | Ok(Duration::from_secs_f64(quantile)) 220 | } 221 | 222 | pub fn executor_type(&self) -> ExecutorType { 223 | self.executor_type.clone() 224 | } 225 | 226 | pub fn executor_config(&self) -> ExecutorConfig { 227 | self.executor_config.clone() 228 | } 229 | 230 | fn get_successful_responses(&self) -> Vec<&TextGenerationAggregatedResponse> { 231 | self.aggregated_responses 232 | .iter() 233 | .filter(|response| !response.failed) 234 | .collect() 235 | } 236 | 237 | pub fn get_responses(&self) -> Vec { 238 | self.aggregated_responses.clone() 239 | } 240 | 241 | /// Calculate the quantile of a given data set using interpolation method 242 | /// Results are similar to `numpy.percentile` 243 | fn quantile_duration(&self, mut data: Vec, quantile: f64) -> anyhow::Result { 244 | if self.is_ready() { 245 | data.sort(); 246 | let i = (quantile * (data.len() - 1) as f64).floor(); 247 | let delta = (data.len() - 1) as f64 * quantile - i; 248 | if i as usize >= data.len() { 249 | return Err(anyhow::anyhow!(NoResponses)); 250 | } 251 | let quantile = (1. 
- delta) * data[i as usize].as_secs_f64() 252 | + delta * data[i as usize + 1].as_secs_f64(); 253 | Ok(quantile) 254 | } else { 255 | Err(anyhow::anyhow!(NoResponses)) 256 | } 257 | } 258 | } 259 | 260 | impl Debug for BenchmarkResults { 261 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 262 | f.debug_struct("BenchmarkResult") 263 | .field("id", &self.id) 264 | .field("executor_type", &self.executor_type.to_string()) 265 | .field("total_requests", &self.total_requests()) 266 | .field("start_time", &self.start_time()) 267 | .field("end_time", &self.end_time()) 268 | .field("total_tokens", &self.total_tokens()) 269 | .field( 270 | "token_throughput_secs", 271 | &self 272 | .token_throughput_secs() 273 | .or::>(Ok(-1.0)), 274 | ) 275 | .field( 276 | "duration_ms", 277 | &self 278 | .duration() 279 | .or::>(Ok(Duration::from_secs(0))), 280 | ) 281 | .field( 282 | "average_time_to_first_token", 283 | &self 284 | .time_to_first_token_avg() 285 | .or::>(Ok(Duration::from_secs(0))), 286 | ) 287 | .field( 288 | "average_inter_token_latency", 289 | &self 290 | .inter_token_latency_avg() 291 | .or::>(Ok(Duration::from_secs(0))), 292 | ) 293 | .field("failed_requests", &self.failed_requests()) 294 | .field("successful_requests", &self.successful_requests()) 295 | .field( 296 | "request_rate", 297 | &self 298 | .successful_request_rate() 299 | .or::>(Ok(-1.0)), 300 | ) 301 | .field("sent_prompt_tokens", &self.total_tokens_sent()) 302 | .field( 303 | "e2e_latency_avg", 304 | &self 305 | .e2e_latency_avg() 306 | .or::>(Ok(Duration::from_secs(0))), 307 | ) 308 | .finish() 309 | } 310 | } 311 | 312 | #[derive(Debug, Clone)] 313 | pub struct BenchmarkReport { 314 | results: Vec, 315 | start_time: Option>, 316 | end_time: Option>, 317 | } 318 | 319 | impl BenchmarkReport { 320 | pub fn new() -> BenchmarkReport { 321 | BenchmarkReport { 322 | results: Vec::new(), 323 | start_time: None, 324 | end_time: None, 325 | } 326 | } 327 | 328 | pub fn start(&mut self) { 329 | self.start_time = Some(Utc::now()); 330 | } 331 | 332 | pub fn end(&mut self) { 333 | self.end_time = Some(Utc::now()); 334 | } 335 | 336 | pub fn add_benchmark_result(&mut self, result: BenchmarkResults) { 337 | self.results.push(result); 338 | } 339 | 340 | pub fn get_results(&self) -> Vec { 341 | self.results.clone() 342 | } 343 | 344 | pub fn start_time(&self) -> Option> { 345 | self.start_time 346 | } 347 | 348 | pub fn end_time(&self) -> Option> { 349 | self.end_time 350 | } 351 | } 352 | 353 | #[cfg(test)] 354 | mod test { 355 | use super::*; 356 | use crate::requests::TextGenerationRequest; 357 | use std::sync::Arc; 358 | #[test] 359 | fn test_time_to_first_token_percentile() { 360 | let request = Arc::from(TextGenerationRequest { 361 | id: None, 362 | prompt: "test".to_string(), 363 | num_prompt_tokens: 10, 364 | num_decode_tokens: None, 365 | }); 366 | let mut response1 = TextGenerationAggregatedResponse::new(request.clone()); 367 | response1.start_time = Some(tokio::time::Instant::now()); 368 | response1.end_time = 369 | Some(tokio::time::Instant::now() + tokio::time::Duration::from_millis(100)); 370 | response1.num_generated_tokens = 100; 371 | response1.failed = false; 372 | response1.times_to_tokens = vec![ 373 | Duration::from_millis(100), 374 | Duration::from_millis(200), 375 | Duration::from_millis(300), 376 | Duration::from_millis(400), 377 | Duration::from_millis(500), 378 | ]; 379 | 380 | let mut response2 = TextGenerationAggregatedResponse::new(request.clone()); 381 | response2.start_time = 
Some(tokio::time::Instant::now()); 382 | response2.end_time = 383 | Some(tokio::time::Instant::now() + tokio::time::Duration::from_millis(200)); 384 | response2.num_generated_tokens = 100; 385 | response2.failed = false; 386 | response2.times_to_tokens = vec![ 387 | Duration::from_millis(600), 388 | Duration::from_millis(700), 389 | Duration::from_millis(800), 390 | Duration::from_millis(900), 391 | Duration::from_millis(1000), 392 | ]; 393 | 394 | let mut response3 = TextGenerationAggregatedResponse::new(request.clone()); 395 | response3.start_time = Some(tokio::time::Instant::now()); 396 | response3.end_time = 397 | Some(tokio::time::Instant::now() + tokio::time::Duration::from_millis(300)); 398 | response3.num_generated_tokens = 100; 399 | response3.failed = false; 400 | response3.times_to_tokens = vec![ 401 | Duration::from_millis(1100), 402 | Duration::from_millis(1200), 403 | Duration::from_millis(1300), 404 | Duration::from_millis(1400), 405 | Duration::from_millis(1500), 406 | ]; 407 | 408 | let mut response4 = TextGenerationAggregatedResponse::new(request.clone()); 409 | response4.start_time = Some(tokio::time::Instant::now()); 410 | response4.end_time = 411 | Some(tokio::time::Instant::now() + tokio::time::Duration::from_millis(300)); 412 | response4.num_generated_tokens = 100; 413 | response4.failed = false; 414 | response4.times_to_tokens = vec![ 415 | Duration::from_millis(1600), 416 | Duration::from_millis(1700), 417 | Duration::from_millis(1800), 418 | Duration::from_millis(1900), 419 | Duration::from_millis(2000), 420 | ]; 421 | 422 | let mut results = BenchmarkResults::new( 423 | "test".to_string(), 424 | ExecutorType::ConstantArrivalRate, 425 | ExecutorConfig { 426 | max_vus: 0, 427 | duration: Default::default(), 428 | rate: None, 429 | }, 430 | ); 431 | results.add_response(response1); 432 | results.add_response(response2); 433 | results.add_response(response3); 434 | results.add_response(response4); 435 | 436 | assert_eq!( 437 | results.time_to_first_token_percentile(0.9).unwrap(), 438 | Duration::from_millis(1450) 439 | ); 440 | assert_eq!( 441 | results.time_to_first_token_percentile(0.5).unwrap(), 442 | Duration::from_millis(850) 443 | ); 444 | } 445 | } 446 | -------------------------------------------------------------------------------- /src/scheduler.rs: -------------------------------------------------------------------------------- 1 | use crate::executors::{ 2 | ConstantArrivalRateExecutor, ConstantVUsExecutor, Executor, ExecutorConfig, 3 | }; 4 | use crate::requests::{ 5 | TextGenerationAggregatedResponse, TextGenerationBackend, TextRequestGenerator, 6 | }; 7 | use crate::results::BenchmarkErrors::NoResponses; 8 | use crate::results::BenchmarkResults; 9 | use log::{debug, trace, warn}; 10 | use std::sync::Arc; 11 | use tokio::sync::mpsc::{Sender, UnboundedReceiver, UnboundedSender}; 12 | use tokio::sync::{broadcast, Mutex}; 13 | 14 | #[derive(Clone, strum_macros::Display)] 15 | pub enum ExecutorType { 16 | ConstantVUs, 17 | ConstantArrivalRate, 18 | } 19 | 20 | pub struct Scheduler { 21 | id: String, 22 | executor: Arc>, 23 | requests_generator: Arc>, 24 | results: Arc>, 25 | progress_tx: Sender>, 26 | stop_sender: broadcast::Sender<()>, 27 | } 28 | 29 | pub struct SchedulerProgress { 30 | pub progress: f64, 31 | pub requests_throughput: f64, 32 | pub successful_requests: u64, 33 | pub failed_requests: u64, 34 | } 35 | 36 | impl Scheduler { 37 | pub fn new( 38 | id: String, 39 | backend: Box, 40 | executor_type: ExecutorType, 41 | config: ExecutorConfig, 42 
| requests_generator: Arc>, 43 | progress_tx: Sender>, 44 | stop_sender: broadcast::Sender<()>, 45 | ) -> Scheduler { 46 | match executor_type { 47 | ExecutorType::ConstantVUs => Scheduler { 48 | id: id.clone(), 49 | executor: Arc::from(Mutex::from(ConstantVUsExecutor::new( 50 | backend.clone(), 51 | config.max_vus, 52 | config.duration, 53 | ))), 54 | results: Arc::from(Mutex::from(BenchmarkResults::new( 55 | id.clone(), 56 | ExecutorType::ConstantVUs, 57 | config, 58 | ))), 59 | requests_generator, 60 | progress_tx, 61 | stop_sender, 62 | }, 63 | ExecutorType::ConstantArrivalRate => { 64 | if config.rate.is_none() { 65 | panic!("Rate must be specified for ConstantArrivalRateExecutor"); 66 | } 67 | let rate = config.rate.unwrap(); 68 | Scheduler { 69 | id: id.clone(), 70 | executor: Arc::from(Mutex::from(ConstantArrivalRateExecutor::new( 71 | backend.clone(), 72 | config.max_vus, 73 | config.duration, 74 | rate, 75 | ))), 76 | results: Arc::from(Mutex::from(BenchmarkResults::new( 77 | id.clone(), 78 | ExecutorType::ConstantArrivalRate, 79 | config, 80 | ))), 81 | requests_generator, 82 | progress_tx, 83 | stop_sender, 84 | } 85 | } 86 | } 87 | } 88 | 89 | pub async fn run(&mut self) -> anyhow::Result { 90 | debug!("Starting scheduler '{}'", self.id); 91 | // add responses to the benchmark result as they arrive 92 | let (tx, mut rx): ( 93 | UnboundedSender, 94 | UnboundedReceiver, 95 | ) = tokio::sync::mpsc::unbounded_channel(); 96 | let results = self.results.clone(); 97 | let progress_tx = self.progress_tx.clone(); 98 | let mut stop_receiver = self.stop_sender.subscribe(); 99 | let req_gen = self.requests_generator.clone(); 100 | tokio::spawn(async move { 101 | tokio::select! { 102 | _ = stop_receiver.recv() => { 103 | debug!("Received stop signal, stopping benchmark"); 104 | } 105 | _ = async{ 106 | while let Some(response) = rx.recv().await{ 107 | // call generator callback 108 | let response_txt=response.response.clone(); 109 | if let Some(request)= response.request.clone(){ 110 | req_gen.lock().await.callback(request, response_txt.unwrap_or_default().as_str()); 111 | } 112 | let result = results.clone(); 113 | let progress_tx = progress_tx.clone(); 114 | trace!("Received response: {:?}", response); 115 | if response.ended { 116 | return; 117 | } 118 | let mut result = result.lock().await; 119 | result.add_response(response); 120 | let expected_duration = result.executor_config().duration.as_secs_f64(); 121 | let start_time = result.start_time().unwrap_or(tokio::time::Instant::now()); 122 | let _ = progress_tx.send(Some(SchedulerProgress { 123 | progress: (100.0 * (1.0 - (expected_duration - start_time.elapsed().as_secs_f64()) / expected_duration)).min(100.0), 124 | requests_throughput: result.successful_request_rate().unwrap_or_default(), 125 | successful_requests: result.successful_requests() as u64, 126 | failed_requests: result.failed_requests() as u64, 127 | })).await; 128 | } 129 | }=>{} 130 | } 131 | }); 132 | self.executor 133 | .lock() 134 | .await 135 | .run( 136 | self.requests_generator.clone(), 137 | tx, 138 | self.stop_sender.clone(), 139 | ) 140 | .await; 141 | warn!("{:?}", self.results.clone()); 142 | if self.results.lock().await.successful_requests() == 0 { 143 | Err(anyhow::anyhow!(NoResponses)) 144 | } else { 145 | Ok(self.results.lock().await.clone()) 146 | } 147 | } 148 | 149 | pub fn get_results(&self) -> Arc> { 150 | self.results.clone() 151 | } 152 | } 153 | 154 | #[cfg(test)] 155 | mod tests { 156 | use super::*; 157 | use 
crate::requests::OpenAITextGenerationBackend; 158 | use std::time::Duration; 159 | use tokenizers::Tokenizer; 160 | use tokio::time; 161 | 162 | #[tokio::test] 163 | async fn test_constant_arrival_rate_scheduler() { 164 | let (progress_tx, _) = tokio::sync::mpsc::channel(10000); 165 | let (stop_sender, _) = tokio::sync::broadcast::channel(1); 166 | let backend = Box::new(crate::requests::DummyTextGenerationBackend::new( 167 | Duration::from_secs(1), 168 | )); 169 | let requests_generator = Arc::from(Mutex::from( 170 | crate::requests::DummyTextRequestGenerator::new(), 171 | )); 172 | let mut scheduler = Scheduler::new( 173 | "test".to_string(), 174 | backend, 175 | ExecutorType::ConstantArrivalRate, 176 | ExecutorConfig { 177 | max_vus: 800, 178 | duration: std::time::Duration::from_secs(10), 179 | rate: Some(20.0), 180 | }, 181 | requests_generator, 182 | progress_tx, 183 | stop_sender, 184 | ); 185 | let results = scheduler.run().await.unwrap(); 186 | assert_eq!(results.successful_requests(), 180); // 20 requests per second for 10 seconds - 20 requests for last second as the backend has a 1 second delay 187 | } 188 | 189 | #[tokio::test] 190 | async fn test_constant_vus_scheduler() { 191 | let (progress_tx, _) = tokio::sync::mpsc::channel(10000); 192 | let (stop_sender, _) = broadcast::channel(1); 193 | let backend = Box::new(crate::requests::DummyTextGenerationBackend::new( 194 | Duration::from_secs(1), 195 | )); 196 | let requests_generator = Arc::from(Mutex::from( 197 | crate::requests::DummyTextRequestGenerator::new(), 198 | )); 199 | let mut scheduler = Scheduler::new( 200 | "test".to_string(), 201 | backend, 202 | ExecutorType::ConstantVUs, 203 | ExecutorConfig { 204 | max_vus: 800, 205 | duration: Duration::from_secs(10), 206 | rate: None, 207 | }, 208 | requests_generator, 209 | progress_tx, 210 | stop_sender, 211 | ); 212 | let results = scheduler.run().await.unwrap(); 213 | assert!( 214 | results.successful_requests() > 7200, 215 | "Expected at least 7200 requests, got {}", 216 | results.successful_requests() 217 | ); 218 | } 219 | 220 | #[tokio::test] 221 | async fn test_constant_arrival_rate_openai_backend() { 222 | let (progress_tx, _) = tokio::sync::mpsc::channel(10000); 223 | let (stop_sender, _) = tokio::sync::broadcast::channel(1); 224 | let mut s = mockito::Server::new_async().await; 225 | s.mock("POST", "/v1/chat/completions") 226 | .with_status(200) 227 | .with_header("content-type", "text/event-stream") 228 | .with_chunked_body(|w| { 229 | w.write_all(b"data: {\"choices\": [{\"message\": null, \"finish_reason\": null, \"delta\": {\"content\": \"Hello, world!\"}}]}\n\n").unwrap(); 230 | std::thread::sleep(Duration::from_millis(500)); 231 | w.write_all(b"data: {\"choices\": [{\"message\": {\"content\": \"Hello, world!Hello, world!Hello, world!Hello, world!\", \"role\": \"user\"}, \"finish_reason\": \"stop\", \"delta\": {\"content\": \"Hello, world!\"}}]}\n\n").unwrap(); 232 | w.write_all(b"data: [DONE]\n\n") 233 | }) 234 | .create_async().await; 235 | let url = s.url().parse().unwrap(); 236 | let tokenizer = Arc::new(Tokenizer::from_pretrained("gpt2", None).unwrap()); 237 | let backend = OpenAITextGenerationBackend::try_new( 238 | "".to_string(), 239 | url, 240 | "gpt2".to_string(), 241 | tokenizer, 242 | time::Duration::from_secs(10), 243 | ) 244 | .unwrap(); 245 | let requests_generator = Arc::from(Mutex::from( 246 | crate::requests::DummyTextRequestGenerator::new(), 247 | )); 248 | let mut scheduler = Scheduler::new( 249 | "test".to_string(), 250 | 
Box::new(backend), 251 | ExecutorType::ConstantArrivalRate, 252 | ExecutorConfig { 253 | max_vus: 800, 254 | duration: Duration::from_secs(10), 255 | rate: Some(50.0), 256 | }, 257 | requests_generator, 258 | progress_tx, 259 | stop_sender, 260 | ); 261 | let results = scheduler.run().await.unwrap(); 262 | assert_eq!(results.successful_requests(), 475); // 25 expected missing requests due to the 500ms delay in the backend 263 | } 264 | } 265 | -------------------------------------------------------------------------------- /src/table.rs: -------------------------------------------------------------------------------- 1 | use crate::results::BenchmarkReport; 2 | use crate::BenchmarkConfig; 3 | use tabled::builder::Builder; 4 | 5 | pub fn parameters_table(benchmark: BenchmarkConfig) -> anyhow::Result { 6 | let mut builder = Builder::default(); 7 | let rates = benchmark 8 | .rates 9 | .map_or("N/A".to_string(), |e| format!("{:?}", e)); 10 | let prompt_options = benchmark 11 | .prompt_options 12 | .map_or("N/A".to_string(), |e| format!("{}", e)); 13 | let decode_options = benchmark 14 | .decode_options 15 | .map_or("N/A".to_string(), |e| format!("{}", e)); 16 | let extra_metadata = benchmark 17 | .extra_metadata 18 | .map_or("N/A".to_string(), |e| format!("{:?}", e)); 19 | builder.set_header(vec!["Parameter", "Value"]); 20 | builder.push_record(vec!["Max VUs", benchmark.max_vus.to_string().as_str()]); 21 | builder.push_record(vec![ 22 | "Duration", 23 | benchmark.duration.as_secs().to_string().as_str(), 24 | ]); 25 | builder.push_record(vec![ 26 | "Warmup Duration", 27 | benchmark.warmup_duration.as_secs().to_string().as_str(), 28 | ]); 29 | builder.push_record(vec![ 30 | "Benchmark Kind", 31 | benchmark.benchmark_kind.to_string().as_str(), 32 | ]); 33 | builder.push_record(vec!["Rates", rates.as_str()]); 34 | builder.push_record(vec!["Num Rates", benchmark.num_rates.to_string().as_str()]); 35 | builder.push_record(vec!["Prompt Options", prompt_options.as_str()]); 36 | builder.push_record(vec!["Decode Options", decode_options.as_str()]); 37 | builder.push_record(vec!["Tokenizer", benchmark.tokenizer.to_string().as_str()]); 38 | builder.push_record(vec!["Extra Metadata", extra_metadata.as_str()]); 39 | let mut table = builder.build(); 40 | table.with(tabled::settings::Style::sharp()); 41 | Ok(table) 42 | } 43 | 44 | pub fn results_table(benchmark: BenchmarkReport) -> anyhow::Result { 45 | let mut builder = Builder::default(); 46 | builder.set_header(vec![ 47 | "Benchmark", 48 | "QPS", 49 | "E2E Latency (avg)", 50 | "TTFT (avg)", 51 | "ITL (avg)", 52 | "Throughput", 53 | "Error Rate", 54 | "Successful Requests", 55 | "Prompt tokens per req (avg)", 56 | "Decoded tokens per req (avg)", 57 | ]); 58 | let results = benchmark.get_results(); 59 | for result in results { 60 | let qps = format!("{:.2} req/s", result.successful_request_rate()?); 61 | let e2e = format!("{:.2} sec", result.e2e_latency_avg()?.as_secs_f64()); 62 | let ttft = format!( 63 | "{:.2} ms", 64 | result.time_to_first_token_avg()?.as_micros() as f64 / 1000.0 65 | ); 66 | let itl = format!( 67 | "{:.2} ms", 68 | result.inter_token_latency_avg()?.as_micros() as f64 / 1000.0 69 | ); 70 | let throughput = format!("{:.2} tokens/sec", result.token_throughput_secs()?); 71 | let error_rate = result.failed_requests() as f64 / result.total_requests() as f64 * 100.0; 72 | let error_rate = format!("{:.2}%", error_rate); 73 | builder.push_record(vec![ 74 | result.id.as_str(), 75 | qps.as_str(), 76 | e2e.as_str(), 77 | ttft.as_str(), 78 | 
itl.as_str(), 79 | throughput.as_str(), 80 | error_rate.as_str(), 81 | format!( 82 | "{}/{}", 83 | result.successful_requests(), 84 | result.total_requests() 85 | ) 86 | .as_str(), 87 | format!("{:.2}", result.prompt_tokens_avg()?).as_str(), 88 | format!( 89 | "{:.2}", 90 | result.total_tokens() as f64 / result.successful_requests() as f64 91 | ) 92 | .as_str(), 93 | ]); 94 | } 95 | let mut table = builder.build(); 96 | table.with(tabled::settings::Style::sharp()); 97 | Ok(table) 98 | } 99 | -------------------------------------------------------------------------------- /src/writers.rs: -------------------------------------------------------------------------------- 1 | use crate::results::{BenchmarkReport, BenchmarkResults}; 2 | use crate::{executors, table, BenchmarkConfig}; 3 | use serde::Serialize; 4 | use std::path::Path; 5 | use sysinfo::{CpuRefreshKind, MemoryRefreshKind, System}; 6 | use tokio::fs; 7 | 8 | #[derive(Serialize)] 9 | pub struct PercentilesWriter { 10 | pub p50: f64, 11 | pub p60: f64, 12 | pub p70: f64, 13 | pub p80: f64, 14 | pub p90: f64, 15 | pub p95: f64, 16 | pub p99: f64, 17 | pub avg: f64, 18 | } 19 | 20 | #[derive(Serialize)] 21 | pub struct BenchmarkResultsWriter { 22 | id: String, 23 | executor_type: String, 24 | config: executors::ExecutorConfig, 25 | total_requests: u64, 26 | total_tokens: u64, 27 | token_throughput_secs: f64, 28 | duration_ms: u128, 29 | time_to_first_token_ms: PercentilesWriter, 30 | inter_token_latency_ms: PercentilesWriter, 31 | failed_requests: u64, 32 | successful_requests: u64, 33 | request_rate: f64, 34 | total_tokens_sent: u64, 35 | e2e_latency_ms: PercentilesWriter, 36 | } 37 | 38 | impl BenchmarkResultsWriter { 39 | pub fn new(results: BenchmarkResults) -> anyhow::Result { 40 | Ok(BenchmarkResultsWriter { 41 | id: results.id.clone(), 42 | executor_type: results.executor_type().to_string(), 43 | config: results.executor_config(), 44 | total_requests: results.total_requests() as u64, 45 | total_tokens: results.total_tokens(), 46 | token_throughput_secs: results.token_throughput_secs()?, 47 | duration_ms: results.duration().ok().unwrap().as_micros() / 1000, 48 | time_to_first_token_ms: PercentilesWriter { 49 | p50: results.time_to_first_token_percentile(0.5)?.as_micros() as f64 / 1000., 50 | p60: results.time_to_first_token_percentile(0.6)?.as_micros() as f64 / 1000., 51 | p70: results.time_to_first_token_percentile(0.7)?.as_micros() as f64 / 1000., 52 | p80: results.time_to_first_token_percentile(0.8)?.as_micros() as f64 / 1000., 53 | p90: results.time_to_first_token_percentile(0.9)?.as_micros() as f64 / 1000., 54 | p95: results.time_to_first_token_percentile(0.95)?.as_micros() as f64 / 1000., 55 | p99: results.time_to_first_token_percentile(0.99)?.as_micros() as f64 / 1000., 56 | avg: results.time_to_first_token_avg().ok().unwrap().as_micros() as f64 / 1000., 57 | }, 58 | inter_token_latency_ms: PercentilesWriter { 59 | p50: results.inter_token_latency_percentile(0.5)?.as_micros() as f64 / 1000., 60 | p60: results.inter_token_latency_percentile(0.6)?.as_micros() as f64 / 1000., 61 | p70: results.inter_token_latency_percentile(0.7)?.as_micros() as f64 / 1000., 62 | p80: results.inter_token_latency_percentile(0.8)?.as_micros() as f64 / 1000., 63 | p90: results.inter_token_latency_percentile(0.9)?.as_micros() as f64 / 1000., 64 | p95: results.inter_token_latency_percentile(0.95)?.as_micros() as f64 / 1000., 65 | p99: results.inter_token_latency_percentile(0.99)?.as_micros() as f64 / 1000., 66 | avg: 
results.inter_token_latency_avg().ok().unwrap().as_micros() as f64 / 1000., 67 | }, 68 | failed_requests: results.failed_requests() as u64, 69 | successful_requests: results.successful_requests() as u64, 70 | request_rate: results.successful_request_rate()?, 71 | total_tokens_sent: results.total_tokens_sent(), 72 | e2e_latency_ms: PercentilesWriter { 73 | p50: results.e2e_latency_percentile(0.5)?.as_micros() as f64 / 1000., 74 | p60: results.e2e_latency_percentile(0.6)?.as_micros() as f64 / 1000., 75 | p70: results.e2e_latency_percentile(0.7)?.as_micros() as f64 / 1000., 76 | p80: results.e2e_latency_percentile(0.8)?.as_micros() as f64 / 1000., 77 | p90: results.e2e_latency_percentile(0.9)?.as_micros() as f64 / 1000., 78 | p95: results.e2e_latency_percentile(0.95)?.as_micros() as f64 / 1000., 79 | p99: results.e2e_latency_percentile(0.99)?.as_micros() as f64 / 1000., 80 | avg: results.e2e_latency_avg().ok().unwrap().as_micros() as f64 / 1000., 81 | }, 82 | }) 83 | } 84 | } 85 | 86 | #[derive(Serialize)] 87 | pub struct SystemInfo { 88 | pub cpu: Vec, 89 | pub memory: String, 90 | pub os_name: String, 91 | pub os_version: String, 92 | pub kernel: String, 93 | pub hostname: String, 94 | } 95 | 96 | impl SystemInfo { 97 | pub fn new() -> SystemInfo { 98 | let s = System::new_with_specifics( 99 | sysinfo::RefreshKind::nothing() 100 | .with_memory(MemoryRefreshKind::everything()) 101 | .with_cpu(CpuRefreshKind::everything()), 102 | ); 103 | let cpu_info = s 104 | .cpus() 105 | .iter() 106 | .map(|cpu| format!("{} {}@{:.0}MHz", cpu.brand(), cpu.name(), cpu.frequency())) 107 | .collect::>(); 108 | SystemInfo { 109 | cpu: cpu_info, 110 | memory: format!( 111 | "{:.2} GB", 112 | s.total_memory() as f64 / 1024.0 / 1024.0 / 1024.0 113 | ), 114 | os_name: System::name().ok_or("N/A").unwrap(), 115 | os_version: System::os_version().ok_or("N/A").unwrap(), 116 | kernel: System::kernel_version().ok_or("N/A").unwrap(), 117 | hostname: System::host_name().ok_or("N/A").unwrap(), 118 | } 119 | } 120 | } 121 | 122 | #[derive(Serialize)] 123 | pub struct BenchmarkReportWriter { 124 | config: BenchmarkConfig, 125 | results: Vec, 126 | start_time: String, 127 | end_time: String, 128 | system: SystemInfo, 129 | #[serde(skip)] 130 | report: BenchmarkReport, 131 | } 132 | 133 | impl BenchmarkReportWriter { 134 | pub fn try_new( 135 | config: BenchmarkConfig, 136 | report: BenchmarkReport, 137 | ) -> anyhow::Result { 138 | let mut results: Vec = Vec::new(); 139 | for result in report.get_results() { 140 | let writer = BenchmarkResultsWriter::new(result)?; 141 | results.push(writer); 142 | } 143 | Ok(BenchmarkReportWriter { 144 | config, 145 | results, 146 | start_time: report 147 | .start_time() 148 | .ok_or(anyhow::anyhow!("start_time not set"))? 149 | .to_rfc3339(), 150 | end_time: report 151 | .end_time() 152 | .ok_or(anyhow::anyhow!("end_time not set"))? 
153 | .to_rfc3339(), 154 | system: SystemInfo::new(), 155 | report, 156 | }) 157 | } 158 | pub async fn json(&self, path: &Path) -> anyhow::Result<()> { 159 | // write the benchmark report to json 160 | let report = serde_json::to_string(&self)?; 161 | 162 | // create path hierarchy if it doesn't exist 163 | if !path.exists() { 164 | if let Some(parent) = path.parent() { 165 | fs::create_dir_all(parent).await?; 166 | } 167 | } 168 | fs::write(path, report).await?; 169 | Ok(()) 170 | } 171 | 172 | pub async fn stdout(&self) -> anyhow::Result<()> { 173 | let param_table = table::parameters_table(self.config.clone())?; 174 | println!("\n{param_table}\n"); 175 | let results_table = table::results_table(self.report.clone())?; 176 | println!("\n{results_table}\n"); 177 | Ok(()) 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /test_data/chat.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "QWJhYvA_0", 4 | "conversations": [ 5 | { 6 | "role": "user", 7 | "content": "Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients..." 8 | }, 9 | { 10 | "role": "assistant", 11 | "content": "Here are the main ideas of Jeff Walker's Product Launch Formula that can be applied by a growth marketing agency for their clients:\n\n1. Identify the target audience and their needs: Understand the ideal customer for the product or service, and create a messaging that resonates with them.\n2. Pre-launch: Build anticipation and excitement for the launch by creating buzz, gathering testimonials and case studies, and using social media to create awareness.\n3. Launch: Use a well-crafted launch sequence to maximize sales and conversions. This can include offering bonuses, creating scarcity, and using a deadline to create urgency.\n4. Post-launch: Follow up with customers, gather feedback, and continue to provide value to keep them engaged and loyal.\n5. Create a product suite: Once the initial product is successful, expand the offering by creating additional products or services that address other needs of the same audience.\n6. Continual optimization: Continually monitor and optimize the launch process and product suite to improve results.\n7. Build a community: Use the launch process to build a community of customers who are passionate about the product and can help spread the word.\n8. Use automation: Use technology and automation to streamline the launch process and improve efficiency." 12 | }, 13 | { 14 | "role": "user", 15 | "content": "Summarize the main ideas of Brendon Burchard's Experts Academy into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients..." 16 | }, 17 | { 18 | "role": "assistant", 19 | "content": "Here are the main ideas of Brendon Burchard's Experts Academy that can be applied by a growth marketing agency for their clients:\n\n1. Identify your expertise: Define your area of expertise and create a unique value proposition that differentiates you from others.\n2. Create a content strategy: Develop a content strategy that showcases your expertise and provides value to your audience. This can include blog posts, videos, podcasts, and social media content.\n3. Build your audience: Use social media, advertising, and other channels to build your audience and increase your visibility.\n4. 
Develop your products: Create products and services that align with your expertise and meet the needs of your audience. These can include digital products, courses, coaching, and consulting services.\n5. Create a brand: Develop a strong brand that reflects your expertise and resonates with your audience.\n6. Build authority: Establish yourself as an authority in your industry by speaking at events, publishing a book, or appearing on podcasts or TV shows.\n7. Monetize your expertise: Develop a monetization strategy that leverages your expertise and products to generate revenue.\n8. Build a team: As your business grows, build a team of experts to help you scale your business and provide excellent service to your clients.\n9. Continual improvement: Continually improve your products, services, and marketing strategies to stay ahead of the competition and provide the best possible experience for your clients.\n10. Focus on impact: Ultimately, focus on making a positive impact in the lives of your clients and using your expertise to make a difference in the world." 20 | }, 21 | { 22 | "role": "user", 23 | "content": "What are the mental triggers in Jeff Walker's Product Launch Formula and \"Launch\" book?" 24 | }, 25 | { 26 | "role": "assistant", 27 | "content": "Here are the mental triggers described in Jeff Walker's Product Launch Formula and \"Launch\" book:\n\n1. Scarcity: Creating a sense of urgency by limiting the availability of the product or service being launched.\n2. Authority: Establishing oneself as an expert or authority in the industry to build credibility and trust with the audience.\n3. Social proof: Using testimonials, case studies, and other forms of social proof to demonstrate the effectiveness and value of the product or service.\n4. Reciprocity: Offering something of value to the audience before asking for a sale or commitment.\n5. Likeability: Building a personal connection with the audience and creating a likable persona to increase the chances of a sale.\n6. Anticipation: Building anticipation and excitement leading up to the launch to generate buzz and increase interest in the product or service.\n7. Exclusivity: Creating a feeling of exclusivity by offering early access or special deals to a select group of people.\n8. Belonging: Creating a sense of community or belonging among the audience to increase engagement and loyalty to the brand.\n9. Storytelling: Using storytelling to connect with the audience on an emotional level and make the product or service more relatable and memorable.\n\nBy leveraging these mental triggers, businesses can create an effective product launch strategy that resonates with their target audience and drives sales and growth." 28 | }, 29 | { 30 | "role": "user", 31 | "content": "Write a summary of why scarcity and urgency are the strongest mental triggers and have been the driving force behind many of our best performing campaigns over the last 8 years." 32 | }, 33 | { 34 | "role": "assistant", 35 | "content": "Scarcity and urgency are two of the strongest mental triggers in marketing because they create a sense of exclusivity and importance around a product or service. By creating a sense of limited availability or a deadline for a promotion, businesses can motivate customers to take action and make a purchase before it's too late.\n\nOver the last eight years, many businesses have leveraged these triggers in their marketing campaigns to great success. 
Scarcity and urgency have been used to sell everything from digital products to physical goods and services, and have proven to be highly effective in driving conversions and increasing revenue.\n\nThis is because scarcity and urgency tap into the natural human desire for instant gratification and fear of missing out (FOMO). When customers believe that they may miss out on a valuable opportunity if they don't act quickly, they are more likely to take action and make a purchase.\n\nFurthermore, scarcity and urgency can be combined with other mental triggers, such as social proof and authority, to create even more powerful marketing campaigns. By demonstrating the popularity and effectiveness of a product or service, businesses can increase the perceived value and urgency of a promotion, leading to even more sales and revenue.\n\nOverall, scarcity and urgency are powerful mental triggers that have been proven to drive results in marketing campaigns. By leveraging these triggers, businesses can create a sense of urgency and exclusivity around their products or services, motivating customers to take action and make a purchase before it's too late." 36 | }, 37 | { 38 | "role": "user", 39 | "content": "Summarize Russell Brunson's Perfect Webinar Script..." 40 | }, 41 | { 42 | "role": "assistant", 43 | "content": "Russell Brunson's Perfect Webinar Script is a framework for delivering webinars that are designed to sell a product or service. Here are the key components of the Perfect Webinar Script:\n\n1. Introduction: The webinar starts with an introduction that establishes the presenter's credibility and establishes the topic.\n2. Story: The presenter then tells a personal story that is related to the topic and establishes a connection with the audience.\n3. Content: The presenter provides valuable content related to the topic, demonstrating their expertise and building trust with the audience.\n4. Offer: The presenter then introduces the product or service they are selling, explaining how it solves the audience's problem and providing social proof to support their claims.\n5. Bonuses: The presenter offers bonuses to sweeten the deal and make the offer more enticing.\n6. Scarcity: The presenter creates a sense of urgency by limiting the availability of the offer or including a deadline for purchase.\n7. FAQs: The presenter answers common questions about the offer, addressing objections and building trust with the audience.\n8. Closing: The presenter then delivers a clear call to action, directing the audience to take a specific action to purchase the product or service.\n\nOverall, the Perfect Webinar Script is designed to build trust with the audience, establish the presenter's credibility, and provide valuable content, while also creating a sense of urgency and scarcity around the offer being presented. By following this framework, businesses can use webinars as a powerful tool to sell their products and services to a targeted audience." 44 | }, 45 | { 46 | "role": "user", 47 | "content": "Summarize the 6 human needs as Tony Robbins explains..." 48 | }, 49 | { 50 | "role": "assistant", 51 | "content": "Tony Robbins describes six core human needs that drive our behaviors and motivations. These six needs are:\n\n1. Certainty: The need for safety, stability, and predictability. This includes the need for comfort, security, and control over our environment.\n2. Variety: The need for novelty, excitement, and change. This includes the need for adventure, stimulation, and new experiences.\n3. 
Significance: The need to feel important, special, and unique. This includes the need for recognition, achievement, and respect.\n4. Connection: The need for social connection and love. This includes the need for intimacy, belonging, and friendship.\n5. Growth: The need for personal growth and development. This includes the need for learning, self-improvement, and progress.\n6. Contribution: The need to make a positive impact in the world. This includes the need to give back, help others, and make a difference.\n\nAccording to Tony Robbins, these six needs are universal and apply to all individuals. Each person may prioritize these needs differently, and may fulfill them in different ways, but they are fundamental drivers of human behavior. By understanding these needs, individuals can gain insight into their own motivations and behaviors, and can use this knowledge to create a more fulfilling and meaningful life." 52 | } 53 | ] 54 | }, 55 | { 56 | "id": "i6IyJda_0", 57 | "conversations": [ 58 | { 59 | "role": "system", 60 | "content": "You are a helpful assistant." 61 | }, 62 | { 63 | "role": "user", 64 | "content": "How to tell if a customer segment is well segmented? In 3 bullet points." 65 | }, 66 | { 67 | "role": "assistant", 68 | "content": "1. Homogeneity: The segment should consist of customers who share similar characteristics and behaviors.\n2. Distinctiveness: The segment should be different from other segments in terms of their characteristics and behaviors.\n3. Stability: The segment should remain relatively stable over time and not change drastically. The characteristics and behaviors of customers within the segment should not change significantly." 69 | } 70 | ] 71 | } 72 | ] 73 | --------------------------------------------------------------------------------
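For reference, a minimal usage sketch (not part of the repository) showing how the BenchmarkReportWriter shown above is typically driven: build the writer from a finished report, write the JSON file, then print the stdout tables. The `inference_benchmarker::...` import path and the output file path are assumptions for illustration; only `try_new`, `json`, and `stdout` come from the source above.

// Minimal usage sketch (assumed import paths; adjust to the crate's actual exports).
use std::path::Path;

use inference_benchmarker::{BenchmarkConfig, BenchmarkReport, BenchmarkReportWriter};

// Turn a finished benchmark run into a JSON report file plus stdout tables.
async fn persist_report(config: BenchmarkConfig, report: BenchmarkReport) -> anyhow::Result<()> {
    // try_new converts each result into a serializable writer and captures system info.
    let writer = BenchmarkReportWriter::try_new(config, report)?;
    // json() creates missing parent directories before writing the file (hypothetical output path).
    writer.json(Path::new("results/report.json")).await?;
    // stdout() prints the parameters table followed by the results table.
    writer.stdout().await?;
    Ok(())
}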