├── .dockerignore ├── .github └── workflows │ ├── build.yaml │ ├── build_ci.yaml │ ├── rust.yaml │ └── trufflehog.yaml ├── .gitignore ├── CODE_OF_CONDUCT.md ├── Cargo.toml ├── Dockerfile ├── LICENSE ├── Makefile ├── README.md ├── assets ├── dashboard.png └── ui.png ├── build.rs ├── extra ├── dashboard │ ├── app.py │ └── parse_results.py ├── k8s │ └── inference-benchmarker │ │ ├── .helmignore │ │ ├── Chart.yaml │ │ ├── templates │ │ ├── _helpers.tpl │ │ ├── benchmark.yaml │ │ ├── checks.yaml │ │ ├── configmap.yml │ │ ├── secret.yaml │ │ ├── service.yaml │ │ ├── tgi.yaml │ │ └── vllm.yaml │ │ └── values.yaml └── slurm │ ├── benchmark.py │ ├── tgi.slurm │ └── vllm.slurm ├── poetry.lock ├── pyproject.toml ├── src ├── app.rs ├── benchmark.rs ├── event.rs ├── executors.rs ├── flux.rs ├── lib.rs ├── main.rs ├── profiles.rs ├── requests.rs ├── results.rs ├── scheduler.rs ├── table.rs └── writers.rs └── test_data ├── chat.json └── conversations.json /.dockerignore: -------------------------------------------------------------------------------- 1 | *.json 2 | *.txt 3 | .git 4 | .idea 5 | target -------------------------------------------------------------------------------- /.github/workflows/build.yaml: -------------------------------------------------------------------------------- 1 | name: Build and push docker image to registry 2 | 3 | on: 4 | workflow_call: 5 | concurrency: 6 | group: ${{ github.workflow }}-${{ github.ref }}-build 7 | cancel-in-progress: true 8 | jobs: 9 | build-and-push: 10 | runs-on: 11 | group: aws-general-8-plus 12 | permissions: 13 | packages: write 14 | contents: read 15 | steps: 16 | - name: Checkout repository 17 | uses: actions/checkout@v4 18 | - name: Inject slug/short variables 19 | uses: rlespinasse/github-slug-action@v4.4.1 20 | - name: Initialize Docker Buildx 21 | uses: docker/setup-buildx-action@v3 22 | with: 23 | install: true 24 | buildkitd-config: /tmp/buildkitd.toml 25 | - name: Login to GitHub Container Registry 26 | uses: docker/login-action@v3 27 | with: 28 | registry: ghcr.io 29 | username: ${{ github.actor }} 30 | password: ${{ secrets.GITHUB_TOKEN }} 31 | # If pull request 32 | - name: Extract metadata (tags, labels) for Docker 33 | if: ${{ github.event_name == 'pull_request' }} 34 | id: meta-pr 35 | uses: docker/metadata-action@v5 36 | with: 37 | images: | 38 | ghcr.io/huggingface/inference-benchmarker 39 | tags: | 40 | type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }} 41 | # If main, release or tag 42 | - name: Extract metadata (tags, labels) for Docker 43 | if: ${{ github.event_name != 'pull_request' }} 44 | id: meta 45 | uses: docker/metadata-action@v4.3.0 46 | with: 47 | flavor: | 48 | latest=auto 49 | images: | 50 | ghcr.io/huggingface/inference-benchmarker 51 | tags: | 52 | type=semver,pattern={{version}} 53 | type=semver,pattern={{major}}.{{minor}} 54 | type=raw,value=latest,enable=${{ github.ref == format('refs/heads/{0}', github.event.repository.default_branch) }} 55 | type=raw,value=sha-${{ env.GITHUB_SHA_SHORT }} 56 | - name: Build and push Docker image 57 | id: build-and-push 58 | uses: docker/build-push-action@v4 59 | with: 60 | context: . 
61 | file: ${{ env.DOCKERFILE }} 62 | push: true 63 | platforms: 'linux/amd64' 64 | build-args: | 65 | GIT_SHA=${{ env.GITHUB_SHA }} 66 | DOCKER_LABEL=sha-${{ env.GITHUB_SHA_SHORT }} 67 | PLATFORM=${{ env.PLATFORM }} 68 | tags: ${{ steps.meta.outputs.tags || steps.meta-pr.outputs.tags }} 69 | labels: ${{ steps.meta.outputs.labels || steps.meta-pr.outputs.labels }} 70 | - name: Extract binary from Docker 71 | if: ${{ github.event_name != 'pull_request' }} 72 | run: | 73 | docker create --name extract ghcr.io/huggingface/inference-benchmarker:sha-${{ env.GITHUB_SHA_SHORT }} 74 | docker cp extract:/usr/local/bin/inference-benchmarker ${{ github.workspace }}/inference-benchmarker 75 | docker rm -f extract 76 | - name: Upload binary 77 | if: ${{ github.event_name != 'pull_request' }} 78 | uses: actions/upload-artifact@v4 79 | with: 80 | name: inference-benchmarker_x86_64 81 | path: ${{ github.workspace }}/inference-benchmarker 82 | 83 | -------------------------------------------------------------------------------- /.github/workflows/build_ci.yaml: -------------------------------------------------------------------------------- 1 | name: Build inference-benchmarker 2 | 3 | on: 4 | workflow_dispatch: 5 | workflow_call: 6 | push: 7 | branches: 8 | - 'main' 9 | tags: 10 | - 'v*' 11 | pull_request: 12 | branches: 13 | - "main" 14 | 15 | concurrency: 16 | group: ${{ github.workflow }}-${{ github.ref }} 17 | cancel-in-progress: true 18 | 19 | jobs: 20 | rust: 21 | name: Rust checks 22 | permissions: 23 | pull-requests: write 24 | contents: read 25 | uses: ./.github/workflows/rust.yaml 26 | build: 27 | permissions: 28 | packages: write 29 | contents: read 30 | name: Build and push docker image 31 | uses: ./.github/workflows/build.yaml 32 | needs: rust -------------------------------------------------------------------------------- /.github/workflows/rust.yaml: -------------------------------------------------------------------------------- 1 | name: Rust checks 2 | 3 | on: 4 | workflow_call: 5 | 6 | permissions: 7 | pull-requests: write 8 | contents: read 9 | 10 | jobs: 11 | fmt: 12 | name: Rustfmt 13 | runs-on: 14 | group: aws-general-8-plus 15 | steps: 16 | - uses: actions/checkout@v4 17 | - uses: dtolnay/rust-toolchain@stable 18 | with: 19 | toolchain: nightly 20 | components: rustfmt 21 | - run: cargo fmt --all -- --check 22 | 23 | clippy: 24 | name: Clippy 25 | runs-on: 26 | group: aws-general-8-plus 27 | steps: 28 | - uses: actions/checkout@v4 29 | - uses: dtolnay/rust-toolchain@stable 30 | with: 31 | toolchain: stable 32 | components: clippy 33 | - name: Cache dependencies 34 | uses: actions/cache@v4 35 | env: 36 | cache-name: cache-dependencies 37 | with: 38 | path: | 39 | ~/.cargo/.crates.toml 40 | ~/.cargo/.crates2.json 41 | ~/.cargo/bin 42 | ~/.cargo/registry/index 43 | ~/.cargo/registry/cache 44 | ./target 45 | key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('app/Cargo.lock') }} 46 | - uses: giraffate/clippy-action@v1 47 | with: 48 | github_token: ${{ secrets.GITHUB_TOKEN }} 49 | clippy_flags: -- -Dwarnings 50 | tool_name: clippy-action-output 51 | fail_on_error: true 52 | filter_mode: file 53 | level: info 54 | test: 55 | name: Test 56 | runs-on: 57 | group: aws-general-8-plus 58 | steps: 59 | - uses: actions/checkout@v4 60 | - uses: dtolnay/rust-toolchain@stable 61 | with: 62 | toolchain: stable 63 | - name: Cache dependencies 64 | uses: actions/cache@v4 65 | env: 66 | cache-name: cache-dependencies-test 67 | with: 68 | path: | 69 | ~/.cargo/.crates.toml 70 | 
~/.cargo/.crates2.json 71 | ~/.cargo/bin 72 | ~/.cargo/registry/index 73 | ~/.cargo/registry/cache 74 | ./target 75 | key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('app/Cargo.lock') }} 76 | - name: Run unit tests 77 | run: | 78 | cargo test --package inference-benchmarker 79 | 80 | -------------------------------------------------------------------------------- /.github/workflows/trufflehog.yaml: -------------------------------------------------------------------------------- 1 | name: Secret Leaks 2 | 3 | on: 4 | push: 5 | 6 | jobs: 7 | trufflehog: 8 | runs-on: 9 | group: aws-general-8-plus 10 | steps: 11 | - name: Checkout code 12 | uses: actions/checkout@v4 13 | with: 14 | fetch-depth: 0 15 | - name: Secret Scanning 16 | uses: trufflesecurity/trufflehog@main 17 | with: 18 | extra_args: --exclude-globs='test_data/.*' --results=verified,unknown --exclude-detectors=postgres 19 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | debug/ 2 | target/ 3 | Cargo.lock 4 | **/*.rs.bk 5 | *.pdb 6 | __pycache__/ 7 | 8 | *.json 9 | *.txt 10 | results 11 | *.parquet 12 | *.dev.* -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | We as members, contributors, and leaders pledge to make participation in our 6 | community a harassment-free experience for everyone, regardless of age, body 7 | size, visible or invisible disability, ethnicity, sex characteristics, gender 8 | identity and expression, level of experience, education, socio-economic status, 9 | nationality, personal appearance, race, religion, or sexual identity 10 | and orientation. 11 | 12 | We pledge to act and interact in ways that contribute to an open, welcoming, 13 | diverse, inclusive, and healthy community. 14 | 15 | ## Our Standards 16 | 17 | Examples of behavior that contributes to a positive environment for our 18 | community include: 19 | 20 | * Demonstrating empathy and kindness toward other people 21 | * Being respectful of differing opinions, viewpoints, and experiences 22 | * Giving and gracefully accepting constructive feedback 23 | * Accepting responsibility and apologizing to those affected by our mistakes, 24 | and learning from the experience 25 | * Focusing on what is best not just for us as individuals, but for the 26 | overall community 27 | 28 | Examples of unacceptable behavior include: 29 | 30 | * The use of sexualized language or imagery, and sexual attention or 31 | advances of any kind 32 | * Trolling, insulting or derogatory comments, and personal or political attacks 33 | * Public or private harassment 34 | * Publishing others' private information, such as a physical or email 35 | address, without their explicit permission 36 | * Other conduct which could reasonably be considered inappropriate in a 37 | professional setting 38 | 39 | ## Enforcement Responsibilities 40 | 41 | Community leaders are responsible for clarifying and enforcing our standards of 42 | acceptable behavior and will take appropriate and fair corrective action in 43 | response to any behavior that they deem inappropriate, threatening, offensive, 44 | or harmful. 
45 | 46 | Community leaders have the right and responsibility to remove, edit, or reject 47 | comments, commits, code, wiki edits, issues, and other contributions that are 48 | not aligned to this Code of Conduct, and will communicate reasons for moderation 49 | decisions when appropriate. 50 | 51 | ## Scope 52 | 53 | This Code of Conduct applies within all community spaces, and also applies when 54 | an individual is officially representing the community in public spaces. 55 | Examples of representing our community include using an official e-mail address, 56 | posting via an official social media account, or acting as an appointed 57 | representative at an online or offline event. 58 | 59 | ## Enforcement 60 | 61 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 62 | reported to the community leaders responsible for enforcement at 63 | feedback@huggingface.co. 64 | All complaints will be reviewed and investigated promptly and fairly. 65 | 66 | All community leaders are obligated to respect the privacy and security of the 67 | reporter of any incident. 68 | 69 | ## Enforcement Guidelines 70 | 71 | Community leaders will follow these Community Impact Guidelines in determining 72 | the consequences for any action they deem in violation of this Code of Conduct: 73 | 74 | ### 1. Correction 75 | 76 | **Community Impact**: Use of inappropriate language or other behavior deemed 77 | unprofessional or unwelcome in the community. 78 | 79 | **Consequence**: A private, written warning from community leaders, providing 80 | clarity around the nature of the violation and an explanation of why the 81 | behavior was inappropriate. A public apology may be requested. 82 | 83 | ### 2. Warning 84 | 85 | **Community Impact**: A violation through a single incident or series 86 | of actions. 87 | 88 | **Consequence**: A warning with consequences for continued behavior. No 89 | interaction with the people involved, including unsolicited interaction with 90 | those enforcing the Code of Conduct, for a specified period of time. This 91 | includes avoiding interactions in community spaces as well as external channels 92 | like social media. Violating these terms may lead to a temporary or 93 | permanent ban. 94 | 95 | ### 3. Temporary Ban 96 | 97 | **Community Impact**: A serious violation of community standards, including 98 | sustained inappropriate behavior. 99 | 100 | **Consequence**: A temporary ban from any sort of interaction or public 101 | communication with the community for a specified period of time. No public or 102 | private interaction with the people involved, including unsolicited interaction 103 | with those enforcing the Code of Conduct, is allowed during this period. 104 | Violating these terms may lead to a permanent ban. 105 | 106 | ### 4. Permanent Ban 107 | 108 | **Community Impact**: Demonstrating a pattern of violation of community 109 | standards, including sustained inappropriate behavior, harassment of an 110 | individual, or aggression toward or disparagement of classes of individuals. 111 | 112 | **Consequence**: A permanent ban from any sort of public interaction within 113 | the community. 114 | 115 | ## Attribution 116 | 117 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], 118 | version 2.0, available at 119 | https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. 120 | 121 | Community Impact Guidelines were inspired by [Mozilla's code of conduct 122 | enforcement ladder](https://github.com/mozilla/diversity). 
123 | 124 | [homepage]: https://www.contributor-covenant.org 125 | 126 | For answers to common questions about this code of conduct, see the FAQ at 127 | https://www.contributor-covenant.org/faq. Translations are available at 128 | https://www.contributor-covenant.org/translations. 129 | -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "inference-benchmarker" 3 | version = "1.1.0" 4 | edition = "2021" 5 | authors = ["Hugo Larcher "] 6 | repository = "https://github.com/huggingface/inference-benchmarker" 7 | readme = "README.md" 8 | license = "Apache-2.0" 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | reqwest-eventsource = "0.6.0" 14 | log = "0.4.22" 15 | serde_json = "1.0.127" 16 | serde = { version = "1.0.209", features = ["derive"] } 17 | tokio = { version = "1.40.0", features = ["rt", "rt-multi-thread", "macros", "signal"] } 18 | anyhow = "1.0.86" 19 | tokenizers = { version = "0.21.1", features = ["http"] } 20 | rand_distr = "0.5.1" 21 | rand = "0.9.0" 22 | futures-util = "0.3.30" 23 | env_logger = "0.11.5" 24 | async-trait = "0.1.82" 25 | reqwest = { version = "0.12.7", features = ["json"] } 26 | strum_macros = "0.27.1" 27 | clap = { version = "4.5.17", features = ["derive", "env"] } 28 | humantime = "2.1.0" 29 | ratatui = { version = "0.29.0", features = ["all-widgets"] } 30 | crossterm = "0.28.1" 31 | chrono = "0.4.38" 32 | hf-hub = { version = "0.4.2", features = ["tokio"] } 33 | indicatif = "0.17.8" 34 | rayon = "1.10.0" 35 | serde_with = "3.9.0" 36 | sysinfo = "0.33.1" 37 | mockito = "1.5.0" 38 | tabled = "=0.14" 39 | uuid = { version = "1.11.0", features = ["v4", "fast-rng"] } 40 | 41 | [build-dependencies] 42 | vergen-gitcl = { version = "1.0.1" } 43 | 44 | [[bin]] 45 | name = "inference-benchmarker" 46 | path = "src/main.rs" 47 | test = false 48 | bench = false -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM rust:1-bullseye AS builder 2 | LABEL org.opencontainers.image.source=https://github.com/huggingface/inference-benchmarker 3 | LABEL org.opencontainers.image.description="A benchmark tool for LLM inference engines" 4 | LABEL org.opencontainers.image.licenses="Apache-2.0" 5 | ARG GIT_SHA 6 | WORKDIR /usr/src/inference-benchmarker 7 | COPY . . 8 | RUN cargo install --path . 9 | FROM debian:bullseye-slim 10 | RUN apt update && apt install -y ca-certificates && rm -rf /var/lib/apt/lists/* 11 | RUN mkdir -p /opt/inference-benchmarker/results 12 | WORKDIR /opt/inference-benchmarker 13 | COPY --from=builder /usr/local/cargo/bin/inference-benchmarker /usr/local/bin/inference-benchmarker 14 | CMD ["inference-benchmarker"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright 2024 Hugging Face 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | build: 2 | cargo build --release --package inference-benchmarker --bin inference-benchmarker 3 | 4 | run: build 5 | cargo run --package inference-benchmarker --bin inference-benchmarker -- $@ 6 | 7 | test: 8 | cargo test --package inference-benchmarker 9 | 10 | lint: 11 | cargo +nightly fmt 12 | cargo clippy --package inference-benchmarker 13 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Inference Benchmarker 2 | **The best tool for benchmarking inference engines and LLM performance** 3 | 4 | 5 | Benchmarking inference servers for text generation models presents unique challenges. 6 | The performance of LLMs can vary greatly depending on factors like input prompts, 7 | decoding strategies, hardware specifications, and server configurations. 8 | 9 | **Inference Benchmarker** is designed to streamline this process by providing a comprehensive benchmarking tool 10 | that evaluates the real-world performance of text generation models and servers. 11 | With **Inference Benchmarker**, you can easily test your model's throughput and efficiency under various workloads, 12 | identify performance bottlenecks, and optimize your deployment for production environments. 13 | 14 | It can be used to benchmark any text generation server that exposes an OpenAI-compliant API. 15 | 16 | ## Features 17 | 18 | * Broad Compatibility: Benchmarks any text generation server with an OpenAI-compliant chat API. 19 | * Automatic Sweep Mode: Detects the maximum throughput and sweeps QPS values up to it. 20 | * Open-Loop Benchmarking: Uses constant arrival rates to simulate real-world workloads. 21 | * Benchmark profiles: Presets to benchmark for different model use cases (e.g. chat, summarization, code completion...). 22 | * High-Performance: Built with Rust 🦀 for high-performance benchmarking. 23 | * JSON Output: Delivers performance results in a structured, easy-to-analyze format. 24 | 25 | ![ui.png](assets/ui.png) 26 | 27 | ## Table of contents 28 | 29 | 30 | * [Inference Benchmarker](#inference-benchmarker) 31 | * [Features](#features) 32 | * [Table of contents](#table-of-contents) 33 | * [Get started](#get-started) 34 | * [Run a benchmark](#run-a-benchmark) 35 | * [1. Start an inference server](#1-start-an-inference-server) 36 | * [2. 
Run a benchmark](#2-run-a-benchmark) 37 | * [Configure your benchmark](#configure-your-benchmark) 38 | * [Profiles](#profiles) 39 | * [Benchmark mode](#benchmark-mode) 40 | * [Dataset configuration](#dataset-configuration) 41 | * [Prompt configuration](#prompt-configuration) 42 | * [Decode options](#decode-options) 43 | * [Deploy on Kubernetes](#deploy-on-kubernetes) 44 | * [Deploy on Slurm](#deploy-on-slurm) 45 | * [Visualize the results](#visualize-the-results) 46 | * [Development](#development) 47 | * [Frequently Asked Questions](#frequently-asked-questions) 48 | 49 | 50 | ## Get started 51 | 52 | ### Install 53 | 54 | If you have [cargo](https://rustup.rs/) already installed: 55 | ```bash 56 | cargo install --git https://github.com/huggingface/inference-benchmarker/ 57 | ``` 58 | 59 | Or download the [latest released binary](https://github.com/huggingface/inference-benchmarker/releases/latest) 60 | 61 | Or you can run the Docker image. 62 | 63 | ### Run a benchmark 64 | 65 | #### 1. Start an inference server 66 | 67 | **TGI** 68 | 69 | ```bash 70 | MODEL=meta-llama/Llama-3.1-8B-Instruct 71 | HF_TOKEN= 72 | 73 | docker run --gpus all --shm-size 1g -p 8080:80 -e "HF_TOKEN=$HF_TOKEN" \ 74 | ghcr.io/huggingface/text-generation-inference:3.2.1 --model-id $MODEL 75 | ``` 76 | 77 | **vLLM** 78 | 79 | ```bash 80 | MODEL=meta-llama/Llama-3.1-8B-Instruct 81 | HF_TOKEN= 82 | docker run --runtime nvidia --gpus all \ 83 | --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \ 84 | -p 8080:8000 \ 85 | --ipc=host \ 86 | vllm/vllm-openai:latest \ 87 | --model $MODEL 88 | ``` 89 | 90 | 91 | #### 2. Run a benchmark 92 | 93 | ```shell 94 | inference-benchmarker \ 95 | --tokenizer-name "meta-llama/Llama-3.1-8B-Instruct" \ 96 | --url http://localhost:8080 \ 97 | --profile chat 98 | ``` 99 | 100 | Results will be saved in JSON format in the current directory. 101 | 102 | ### Configure your benchmark 103 | 104 | #### Profiles 105 | 106 | Profiles are presets to benchmark for different model use cases. Available profiles: 107 | - `chat` 108 | Simulates a multi-turn chat scenario in which the model answers successive user prompts. 109 | The model is prompted with the whole conversation history at each turn. Prefix caching will have a significant impact 110 | on the performance of this benchmark. 111 | - `code-generation` 112 | Simulates code-completion scenarios. The model is given large code snippets and asked to complete them with a few tokens 113 | (e.g. a function name, a few code lines). 114 | - `classification` 115 | Simulates cases where the model is repeatedly fed large chunks of business data or documents and users 116 | ask simple questions about the content (summarization, classification...). 117 | Those use cases benefit a lot from prefix caching and chunked prefill. 118 | - `fixed-length` 119 | The model is sent fixed-length prompts to avoid the impact of variable-length tokenization on the benchmark. 120 | This is a technical benchmark to evaluate the raw throughput of the model. 121 | 122 | #### Benchmark mode 123 | 124 | In the default mode, the tool runs a `sweep` benchmark. It first runs a throughput test to find the maximum throughput, then 125 | sweeps QPS values up to the maximum throughput. 
126 | 127 | Available modes: 128 | 129 | - `sweep`: runs a sweep benchmark 130 | - `rate`: runs a benchmark at a fixed request rate 131 | - `throughput`: runs a benchmark at a fixed throughput (constant VUs) 132 | 133 | Example running a benchmark at fixed request rates: 134 | 135 | ```shell 136 | inference-benchmarker \ 137 | --tokenizer-name "meta-llama/Llama-3.1-8B-Instruct" \ 138 | --max-vus 800 \ 139 | --duration 120s \ 140 | --url http://localhost:8080 \ 141 | --warmup 30s \ 142 | --benchmark-kind rate \ 143 | --rates 1.0 \ 144 | --rates 5.0 \ 145 | --rates 10.0 \ 146 | --prompt-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" \ 147 | --decode-options "num_tokens=200,max_tokens=220,min_tokens=180,variance=10" 148 | ``` 149 | 150 | #### Dataset configuration 151 | 152 | Prompts are sampled from a Hugging Face dataset file, using a [subset of ShareGPT 153 | by default](https://huggingface.co/datasets/hlarcher/share_gpt_small). You can specify a different dataset file using 154 | the 155 | `--dataset` and `--dataset-file` options. 156 | 157 | The dataset is expected to be JSON with the following format: 158 | 159 | ```json 160 | [ 161 | { 162 | "conversations": [ 163 | { 164 | "role": "user", 165 | "content": "rewrite that entire paragraph in the same style like this one: " 166 | } 167 | ] 168 | } 169 | ] 170 | ``` 171 | 172 | To benchmark with prefix caching, you can add a system prompt that will be sent with each request of a conversation. 173 | 174 | ```json 175 | [ 176 | { 177 | "conversations": [ 178 | { 179 | "role": "system", 180 | "content": "You are a helpful assistant that makes jokes at each response." 181 | }, 182 | { 183 | "role": "user", 184 | "content": "rewrite that entire paragraph in the same style like this one:" 185 | } 186 | ] 187 | } 188 | ] 189 | ``` 190 | 191 | #### Prompt configuration 192 | 193 | For consistent results you can configure the token count and variance. The tool will sample prompts with the specified 194 | values, sampling token counts from a normal distribution with the specified variance. 195 | 196 | ```shell 197 | --prompt-options "num_tokens=50,max_tokens=60,min_tokens=40,variance=10" 198 | ``` 199 | 200 | #### Decode options 201 | 202 | You can also configure the decoding options for the model. The tool will sample decoding options with the specified 203 | values, sampling token counts from a normal distribution with the specified variance. 204 | 205 | ```shell 206 | --decode-options "num_tokens=50,max_tokens=60,min_tokens=40,variance=10" 207 | ``` 208 | 209 | ## Deploy on Kubernetes 210 | 211 | You can deploy the benchmarking tool on Kubernetes using the provided Helm chart. 212 | 213 | Review the values (especially model, HF token and resources), and install the chart: 214 | ```shell 215 | $ helm install inference-benchmarker ./extra/k8s/inference-benchmarker 216 | ``` 217 | 218 | ## Deploy on Slurm 219 | 220 | A Slurm example is provided in `extra/slurm`. 221 | 222 | ## Visualize the results 223 | 224 | You can use the provided Gradio app to quickly visualize the results. 225 | Assuming the JSON result files are in `/path/to/results/dir`: 226 | ```shell 227 | $ poetry install 228 | $ cd extra/dashboard 229 | $ poetry run python app.py --from-results-dir /path/to/results/dir 230 | ``` 231 | 232 | ![dashboard.png](assets/dashboard.png) 233 | 234 | ## Development 235 | 236 | You need [Rust](https://rustup.rs/) installed to build the benchmarking tool. 
237 | 238 | ```shell 239 | $ make build 240 | ``` 241 | 242 | ## Frequently Asked Questions 243 | 244 | * **What's the difference between constant arrival rate and constant virtual user count?** 245 | * **Constant virtual user count** means that the number of virtual users is fixed. Each virtual user sends a 246 | single request at a time and waits for the server's response. It's basically simulating a fixed number of users querying the 247 | server. 248 | * **Constant arrival rate** means that the rate of requests is fixed and the number of virtual users is adjusted to 249 | maintain that rate. Requests hit the server independently of how fast responses come back. 250 | 251 | **Constant virtual user count** is a closed-loop model where the server's response time dictates the number of 252 | iterations. **Constant arrival rate** is an open-loop model more representative of real-life workloads. 253 | 254 | 255 | * **Why do I get a high error rate when running the `throughput` benchmark?** 256 | 257 | The throughput benchmark tries to saturate the server with a high request rate. The error rate is high because the server is 258 | not able to handle the request rate or is rate-limiting the requests. 259 | In the case of TGI, this is controlled by the `--max-concurrent-requests` option. 260 | 261 | 262 | * **What is the influence of CUDA graphs?** 263 | 264 | CUDA graphs are used to optimize GPU usage by minimizing the overhead of launching kernels. This can lead to 265 | better performance in some cases, but can also lead to worse performance in others. 266 | If your CUDA graphs are not evenly distributed, you may see a performance drop at some request rates as the batch size may 267 | fall into a larger CUDA graph batch size, leading to a loss of compute due to excessive padding. 268 | 269 | * **I get fewer generated tokens than expected in a benchmark.** 270 | 271 | Inference servers use the `max_tokens` parameter to limit the number of tokens generated. If the model 272 | generates an end-of-sequence token before reaching `max_tokens`, the generation will stop. 273 | There is currently no way to guarantee a fixed number of tokens generated without modifying the inference server. 274 | So you may have `generated tokens < (successful requests) * max_tokens`. 
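To make the first question above more concrete, here is a minimal Rust sketch of the two load models. It is not the benchmarker's actual executor code: the `send_request` stub, the 80 ms sleep standing in for server latency, and the example rates are illustrative assumptions, and the sketch only assumes `tokio` with its timer, macros and multi-threaded runtime features enabled.

```rust
use std::time::Duration;
use tokio::time::{interval, sleep};

// Stand-in for an HTTP call to the inference server; the 80 ms sleep is an
// arbitrary placeholder for the server's response time.
async fn send_request() {
    sleep(Duration::from_millis(80)).await;
}

// Closed loop (constant virtual user count): each virtual user only sends its
// next request after the previous response arrives, so the achieved request
// rate depends on server latency.
async fn closed_loop(num_vus: usize, requests_per_vu: usize) {
    let mut users = Vec::new();
    for _ in 0..num_vus {
        users.push(tokio::spawn(async move {
            for _ in 0..requests_per_vu {
                send_request().await; // wait for the response before the next request
            }
        }));
    }
    for user in users {
        user.await.unwrap();
    }
}

// Open loop (constant arrival rate): requests are fired on a fixed schedule,
// independently of whether earlier responses have come back.
async fn open_loop(rate_per_sec: f64, total_requests: usize) {
    let mut ticker = interval(Duration::from_secs_f64(1.0 / rate_per_sec));
    let mut in_flight = Vec::new();
    for _ in 0..total_requests {
        ticker.tick().await; // fire on schedule, even if the server is backed up
        in_flight.push(tokio::spawn(send_request()));
    }
    for request in in_flight {
        request.await.unwrap();
    }
}

#[tokio::main]
async fn main() {
    closed_loop(10, 5).await;   // 10 users, 5 requests each
    open_loop(20.0, 100).await; // 20 requests per second, 100 requests total
}
```

In the closed loop, throughput is capped by latency (slower responses mean fewer requests per second), while in the open loop the request schedule never slows down, so queues build up and errors surface whenever the server falls behind.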
275 | 276 | -------------------------------------------------------------------------------- /assets/dashboard.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/inference-benchmarker/687e477930b387d3c9c787d4953a266f6469f047/assets/dashboard.png -------------------------------------------------------------------------------- /assets/ui.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/inference-benchmarker/687e477930b387d3c9c787d4953a266f6469f047/assets/ui.png -------------------------------------------------------------------------------- /build.rs: -------------------------------------------------------------------------------- 1 | use std::error::Error; 2 | use vergen_gitcl::{Emitter, GitclBuilder}; 3 | 4 | fn main() -> Result<(), Box> { 5 | // Try to get the git sha from the local git repository 6 | match GitclBuilder::all_git() { 7 | Ok(gitcl) => match Emitter::default().fail_on_error().add_instructions(&gitcl) { 8 | Ok(emitter) => { 9 | if emitter.emit().is_err() { 10 | fallback_git_sha(); 11 | } 12 | } 13 | Err(_) => { 14 | fallback_git_sha(); 15 | } 16 | }, 17 | Err(_) => { 18 | fallback_git_sha(); 19 | } 20 | }; 21 | Ok(()) 22 | } 23 | 24 | fn fallback_git_sha() { 25 | // Unable to get the git sha 26 | if let Ok(sha) = std::env::var("GIT_SHA") { 27 | // Set it from an env var 28 | println!("cargo:rustc-env=VERGEN_GIT_SHA={sha}"); 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /extra/dashboard/app.py: -------------------------------------------------------------------------------- 1 | from contextlib import ExitStack 2 | from dataclasses import dataclass 3 | from typing import List 4 | 5 | import click 6 | import gradio as gr 7 | import pandas as pd 8 | 9 | from parse_results import build_results 10 | 11 | 12 | @dataclass 13 | class PlotConfig: 14 | x_title: str 15 | y_title: str 16 | title: str 17 | percentiles: List[float] = None 18 | 19 | 20 | def run(from_results_dir, datasource, port): 21 | css = ''' 22 | .summary span { 23 | font-size: 10px; 24 | padding-top:0; 25 | padding-bottom:0; 26 | } 27 | ''' 28 | 29 | summary_desc = ''' 30 | ## Summary 31 | This table shows the average of the metrics for each model and QPS rate. 32 | 33 | The metrics are: 34 | * Inter token latency: Time to generate a new output token for each user querying the system. 35 | It translates as the “speed” perceived by the end-user. We aim for at least 300 words per minute (average reading speed), so ITL<150ms 36 | * Time to First Token: Time the user has to wait before seeing the first token of its answer. 37 | Lower waiting time are essential for real-time interactions, less so for offline workloads. 38 | * End-to-end latency: The overall time the system took to generate the full response to the user. 39 | * Throughput: The number of tokens per second the system can generate across all requests 40 | * Successful requests: The number of requests the system was able to honor in the benchmark timeframe 41 | * Error rate: The percentage of requests that ended up in error, as the system could not process them in time or failed to process them. 
42 | 43 | ''' 44 | 45 | df_bench = pd.DataFrame() 46 | line_plots_bench = [] 47 | column_mappings = {'inter_token_latency_ms_p90': 'ITL P90 (ms)', 'time_to_first_token_ms_p90': 'TTFT P90 (ms)', 48 | 'e2e_latency_ms_p90': 'E2E P90 (ms)', 'token_throughput_secs': 'Throughput (tokens/s)', 49 | 'successful_requests': 'Successful requests', 'error_rate': 'Error rate (%)', 'model': 'Model', 50 | 'rate': 'QPS', 'run_id': 'Run ID'} 51 | default_df = pd.DataFrame.from_dict( 52 | {"rate": [1, 2], "inter_token_latency_ms_p90": [10, 20], 53 | "version": ["default", "default"], 54 | "model": ["default", "default"]}) 55 | 56 | def load_demo(model_bench, percentiles): 57 | return update_bench(model_bench, percentiles) 58 | 59 | def update_bench(model, percentiles): 60 | res = [] 61 | for plot in line_plots_bench: 62 | if plot['config'].percentiles: 63 | k = plot['metric'] + '_' + str(percentiles) 64 | df_bench[plot['metric']] = df_bench[k] if k in df_bench.columns else 0 65 | res.append(df_bench[(df_bench['model'] == model)]) 66 | 67 | return res + [summary_table()] 68 | 69 | def summary_table() -> pd.DataFrame: 70 | data = df_bench.groupby(['model', 'run_id', 'rate']).agg( 71 | {'inter_token_latency_ms_p90': 'mean', 'time_to_first_token_ms_p90': 'mean', 72 | 'e2e_latency_ms_p90': 'mean', 'token_throughput_secs': 'mean', 73 | 'successful_requests': 'mean', 'error_rate': 'mean'}).reset_index() 74 | data = data[ 75 | ['run_id', 'model', 'rate', 'inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 76 | 'e2e_latency_ms_p90', 77 | 'token_throughput_secs']] 78 | for metric in ['inter_token_latency_ms_p90', 'time_to_first_token_ms_p90', 'e2e_latency_ms_p90', 79 | 'token_throughput_secs']: 80 | data[metric] = data[metric].apply(lambda x: f"{x:.2f}") 81 | data = data.rename( 82 | columns=column_mappings) 83 | return data 84 | 85 | def load_bench_results(source) -> pd.DataFrame: 86 | data = pd.read_parquet(source) 87 | # remove warmup and throughput 88 | data = data[(data['id'] != 'warmup') & (data['id'] != 'throughput')] 89 | # only keep constant rate 90 | data = data[data['executor_type'] == 'ConstantArrivalRate'] 91 | return data 92 | 93 | def select_region(selection: gr.SelectData, model): 94 | min_w, max_w = selection.index 95 | data = df_bench[(df_bench['model'] == model) & (df_bench['rate'] >= min_w) & ( 96 | df_bench['rate'] <= max_w)] 97 | res = [] 98 | for plot in line_plots_bench: 99 | # find the y values for the selected region 100 | metric = plot["metric"] 101 | y_min = data[metric].min() 102 | y_max = data[metric].max() 103 | res.append(gr.LinePlot(x_lim=[min_w, max_w], y_lim=[y_min, y_max])) 104 | return res 105 | 106 | def reset_region(): 107 | res = [] 108 | for _ in line_plots_bench: 109 | res.append(gr.LinePlot(x_lim=None, y_lim=None)) 110 | return res 111 | 112 | def load_datasource(datasource, fn): 113 | if datasource.startswith('file://'): 114 | return fn(datasource) 115 | elif datasource.startswith('s3://'): 116 | return fn(datasource) 117 | else: 118 | raise ValueError(f"Unknown datasource: {datasource}") 119 | 120 | if from_results_dir is not None: 121 | build_results(from_results_dir, 'benchmarks.parquet', None) 122 | # Load data 123 | df_bench = load_datasource(datasource, load_bench_results) 124 | 125 | # Define metrics 126 | metrics = { 127 | "inter_token_latency_ms": PlotConfig(title="Inter Token Latency (lower is better)", x_title="QPS", 128 | y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]), 129 | "time_to_first_token_ms": PlotConfig(title="TTFT (lower 
is better)", x_title="QPS", 130 | y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]), 131 | "e2e_latency_ms": PlotConfig(title="End to End Latency (lower is better)", x_title="QPS", 132 | y_title="Time (ms)", percentiles=[0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99]), 133 | "token_throughput_secs": PlotConfig(title="Request Output Throughput (higher is better)", x_title="QPS", 134 | y_title="Tokens/s"), 135 | "successful_requests": PlotConfig(title="Successful requests (higher is better)", x_title="QPS", 136 | y_title="Count"), 137 | "error_rate": PlotConfig(title="Error rate", x_title="QPS", y_title="%"), 138 | "prompt_tokens": PlotConfig(title="Prompt tokens", x_title="QPS", y_title="Count"), 139 | "decoded_tokens": PlotConfig(title="Decoded tokens", x_title="QPS", y_title="Count") 140 | } 141 | 142 | models = df_bench["model"].unique() 143 | run_ids = df_bench["run_id"].unique() 144 | 145 | # get all available percentiles 146 | percentiles = set() 147 | for k, v in metrics.items(): 148 | if v.percentiles: 149 | percentiles.update(v.percentiles) 150 | percentiles = map(lambda p: f'p{int(float(p) * 100)}', percentiles) 151 | percentiles = sorted(list(percentiles)) 152 | percentiles.append('avg') 153 | with gr.Blocks(css=css, title="Inference Benchmarker") as demo: 154 | with gr.Row(): 155 | gr.Markdown("# Inference-benchmarker 🤗\n## Benchmarks results") 156 | with gr.Row(): 157 | gr.Markdown(summary_desc) 158 | with gr.Row(): 159 | table = gr.DataFrame( 160 | pd.DataFrame(), 161 | elem_classes=["summary"], 162 | ) 163 | with gr.Row(): 164 | details_desc = gr.Markdown("## Details") 165 | with gr.Row(): 166 | model = gr.Dropdown(list(models), label="Select model", value=models[0]) 167 | with gr.Row(): 168 | percentiles_bench = gr.Radio(percentiles, label="", value="avg") 169 | i = 0 170 | with ExitStack() as stack: 171 | for k, v in metrics.items(): 172 | if i % 2 == 0: 173 | stack.close() 174 | gs = stack.enter_context(gr.Row()) 175 | line_plots_bench.append( 176 | {"component": gr.LinePlot(default_df, label=f'{v.title}', x="rate", y=k, 177 | y_title=v.y_title, x_title=v.x_title, 178 | color="run_id" 179 | ), 180 | "model": model.value, 181 | "metric": k, 182 | "config": v 183 | }, 184 | ) 185 | i += 1 186 | 187 | for component in [model, percentiles_bench]: 188 | component.change(update_bench, [model, percentiles_bench], 189 | [item["component"] for item in line_plots_bench] + [table]) 190 | gr.on([plot["component"].select for plot in line_plots_bench], select_region, [model], 191 | outputs=[item["component"] for item in line_plots_bench]) 192 | gr.on([plot["component"].double_click for plot in line_plots_bench], reset_region, None, 193 | outputs=[item["component"] for item in line_plots_bench]) 194 | demo.load(load_demo, [model, percentiles_bench], 195 | [item["component"] for item in line_plots_bench] + [table]) 196 | 197 | demo.launch(server_port=port, server_name="0.0.0.0") 198 | 199 | 200 | @click.command() 201 | @click.option('--from-results-dir', default=None, help='Load inference-benchmarker results from a directory') 202 | @click.option('--datasource', default='file://benchmarks.parquet', help='Load a Parquet file already generated') 203 | @click.option('--port', default=7860, help='Port to run the dashboard') 204 | def main(from_results_dir, datasource, port): 205 | run(from_results_dir, datasource, port) 206 | 207 | 208 | if __name__ == '__main__': 209 | main(auto_envvar_prefix='DASHBOARD') 210 | 
-------------------------------------------------------------------------------- /extra/dashboard/parse_results.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | import os 4 | 5 | import pandas as pd 6 | 7 | 8 | def build_df(model: str, data_files: dict[str, str]) -> pd.DataFrame: 9 | df = pd.DataFrame() 10 | # Load the results 11 | for key, filename in data_files.items(): 12 | with open(filename, 'r') as f: 13 | data = json.load(f) 14 | if data['config']['meta'] is None: 15 | data['config']['meta'] = {} 16 | for result in data['results']: 17 | entry = pd.json_normalize(result).to_dict(orient='records')[0] 18 | if 'engine' in data['config']['meta']: 19 | entry['engine'] = data['config']['meta']['engine'] 20 | if 'tp' in data['config']['meta']: 21 | entry['tp'] = data['config']['meta']['tp'] 22 | if 'version' in data['config']['meta']: 23 | entry['version'] = data['config']['meta']['version'] 24 | if 'device' in data['config']['meta']: 25 | entry['device'] = data['config']['meta']['device'] 26 | entry['model'] = data['config']['model_name'] 27 | entry['run_id'] = data['config']['run_id'] 28 | df_tmp = pd.DataFrame(entry, index=[0]) 29 | # rename columns that start with 'config.' 30 | df_tmp = df_tmp.rename(columns={c: c.split('config.')[-1] for c in df_tmp.columns}) 31 | # replace . with _ in column names 32 | df_tmp.columns = [c.replace('.', '_') for c in df_tmp.columns] 33 | 34 | df = pd.concat([df, df_tmp]) 35 | return df 36 | 37 | 38 | def build_results_df(results_dir) -> pd.DataFrame: 39 | df = pd.DataFrame() 40 | # list directories 41 | directories = [f'{results_dir}/{d}' for d in os.listdir(results_dir) if os.path.isdir(f'{results_dir}/{d}')] + [results_dir] 42 | for directory in directories: 43 | # list json files in results directory 44 | data_files = {} 45 | for filename in os.listdir(directory): 46 | if filename.endswith('.json'): 47 | data_files[filename.split('.')[-2]] = f'{directory}/{filename}' 48 | df = pd.concat([df, build_df(directory.split('/')[-1], data_files)]) 49 | return df 50 | 51 | 52 | def build_results(results_dir, results_file, device): 53 | df = build_results_df(results_dir) 54 | if 'device' not in df.columns: 55 | df['device'] = df['model'].apply(lambda x: device) 56 | df['error_rate'] = df['failed_requests'] / (df['failed_requests'] + df['successful_requests']) * 100.0 57 | df['prompt_tokens'] = df['total_tokens_sent'] / df['successful_requests'] 58 | df['decoded_tokens'] = df['total_tokens'] / df['successful_requests'] 59 | df.to_parquet(results_file) 60 | 61 | 62 | if __name__ == '__main__': 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument('--results-dir', default='results', type=str, required=True, 65 | help='Path to the source directory containing the results') 66 | parser.add_argument('--results-file', type=str, required=True, 67 | help='Path to the results file to write to. Can be a S3 path') 68 | parser.add_argument('--device', type=str, required=True, help='GPU name used for benchmarking') 69 | args = parser.parse_args() 70 | build_results(args.results_dir, args.results_file, args.device) 71 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/.helmignore: -------------------------------------------------------------------------------- 1 | # Patterns to ignore when building packages. 2 | # This supports shell glob matching, relative path matching, and 3 | # negation (prefixed with !). 
Only one pattern per line. 4 | .DS_Store 5 | # Common VCS dirs 6 | .git/ 7 | .gitignore 8 | .bzr/ 9 | .bzrignore 10 | .hg/ 11 | .hgignore 12 | .svn/ 13 | # Common backup files 14 | *.swp 15 | *.bak 16 | *.tmp 17 | *.orig 18 | *~ 19 | # Various IDEs 20 | .project 21 | .idea/ 22 | *.tmproj 23 | .vscode/ 24 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | name: inference-benchmarker 3 | description: A Helm chart to run inference-benchmarker 4 | type: application 5 | version: 0.1.0 6 | appVersion: "2.3.0" 7 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* 2 | Expand the name of the chart. 3 | */}} 4 | {{- define "inference-benchmarker.name" -}} 5 | {{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} 6 | {{- end }} 7 | 8 | {{/* 9 | Create a default fully qualified app name. 10 | We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). 11 | If release name contains chart name it will be used as a full name. 12 | */}} 13 | {{- define "inference-benchmarker.fullname" -}} 14 | {{- if .Values.fullnameOverride }} 15 | {{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} 16 | {{- else }} 17 | {{- $name := default .Chart.Name .Values.nameOverride }} 18 | {{- if contains $name .Release.Name }} 19 | {{- .Release.Name | trunc 63 | trimSuffix "-" }} 20 | {{- else }} 21 | {{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} 22 | {{- end }} 23 | {{- end }} 24 | {{- end }} 25 | 26 | {{/* 27 | Create chart name and version as used by the chart label. 28 | */}} 29 | {{- define "inference-benchmarker.chart" -}} 30 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} 31 | {{- end }} 32 | 33 | {{/* 34 | Common labels 35 | */}} 36 | {{- define "inference-benchmarker.labels" -}} 37 | helm.sh/chart: {{ include "inference-benchmarker.chart" . }} 38 | {{ include "inference-benchmarker.selectorLabels" . }} 39 | {{- if .Chart.AppVersion }} 40 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 41 | {{- end }} 42 | app.kubernetes.io/managed-by: {{ .Release.Service }} 43 | {{- end }} 44 | 45 | {{/* 46 | Selector labels 47 | */}} 48 | {{- define "inference-benchmarker.selectorLabels" -}} 49 | app.kubernetes.io/name: {{ include "inference-benchmarker.name" . }} 50 | app.kubernetes.io/instance: {{ .Release.Name }} 51 | {{- end }} 52 | 53 | {{/* 54 | Create the name of the service account to use 55 | */}} 56 | {{- define "inference-benchmarker.serviceAccountName" -}} 57 | {{- if .Values.serviceAccount.create }} 58 | {{- default (include "inference-benchmarker.fullname" .) .Values.serviceAccount.name }} 59 | {{- else }} 60 | {{- default "default" .Values.serviceAccount.name }} 61 | {{- end }} 62 | {{- end }} 63 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/benchmark.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: batch/v1 2 | kind: Job 3 | metadata: 4 | name: {{ include "inference-benchmarker.fullname" . 
}}-benchmark 5 | labels: 6 | app.kubernetes.io/component: benchmark 7 | {{- include "inference-benchmarker.labels" . | nindent 4 }} 8 | spec: 9 | template: 10 | metadata: 11 | {{- with .Values.benchmark.podAnnotations }} 12 | annotations: 13 | {{- toYaml . | nindent 8 }} 14 | {{- end }} 15 | labels: 16 | app.kubernetes.io/component: benchmark 17 | {{- include "inference-benchmarker.labels" . | nindent 8 }} 18 | {{- with .Values.benchmark.podLabels }} 19 | {{- toYaml . | nindent 8 }} 20 | {{- end }} 21 | spec: 22 | {{- with .Values.imagePullSecrets }} 23 | imagePullSecrets: 24 | {{- toYaml . | nindent 8 }} 25 | {{- end }} 26 | securityContext: 27 | {{- toYaml .Values.benchmark.podSecurityContext | nindent 8 }} 28 | restartPolicy: Never 29 | initContainers: 30 | - name: wait-for-text-generation-inference 31 | image: alpine/curl:latest 32 | command: 33 | - sh 34 | - -c 35 | - | 36 | until curl -s http://{{ include "inference-benchmarker.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local:8080/health; do 37 | echo "Waiting for {{ include "inference-benchmarker.fullname" . }} service..." 38 | sleep 1 39 | done 40 | exit 0 41 | containers: 42 | - name: {{ .Chart.Name }} 43 | securityContext: 44 | {{- toYaml .Values.benchmark.securityContext | nindent 12 }} 45 | image: "{{ .Values.benchmark.image.repository }}:{{ .Values.benchmark.image.tag | default "latest" }}" 46 | imagePullPolicy: {{ .Values.benchmark.image.pullPolicy }} 47 | args: 48 | - "inference-benchmarker" 49 | - "--url" 50 | - "http://{{ include "inference-benchmarker.fullname" . }}.{{ .Release.Namespace }}.svc.cluster.local:8080" 51 | - "--tokenizer-name" 52 | - "{{ .Values.model_id }}" 53 | - "--no-console" 54 | {{- toYaml .Values.benchmark.extra_args | nindent 12 }} 55 | env: 56 | - name: HF_TOKEN 57 | valueFrom: 58 | secretKeyRef: 59 | name: {{ include "inference-benchmarker.fullname" . }}-hf-token 60 | key: HF_TOKEN 61 | - name: RUST_LOG 62 | value: "inference_benchmarker=info" 63 | resources: 64 | {{- toYaml .Values.benchmark.resources | nindent 12 }} 65 | volumeMounts: 66 | - name: results 67 | mountPath: /opt/inference-benchmarker/results 68 | - name: nginx 69 | image: nginx 70 | ports: 71 | - containerPort: 80 72 | name: http 73 | protocol: TCP 74 | volumeMounts: 75 | - name: results 76 | mountPath: /usr/share/nginx/html 77 | - name: default 78 | mountPath: /etc/nginx/conf.d/default.conf 79 | subPath: default 80 | terminationGracePeriodSeconds: 5 81 | volumes: 82 | - name: results 83 | emptyDir: { } 84 | - name: default 85 | configMap: 86 | name: {{ include "inference-benchmarker.fullname" . }}-nginx-config 87 | {{- with .Values.benchmark.nodeSelector }} 88 | nodeSelector: 89 | {{- toYaml . | nindent 8 }} 90 | {{- end }} 91 | {{- with .Values.benchmark.affinity }} 92 | affinity: 93 | {{- toYaml . | nindent 8 }} 94 | {{- end }} 95 | {{- with .Values.benchmark.tolerations }} 96 | tolerations: 97 | {{- toYaml . | nindent 8 }} 98 | {{- end }} 99 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/checks.yaml: -------------------------------------------------------------------------------- 1 | {{- if and (.Values.tgi.enabled) (.Values.vllm.enabled) }} 2 | {{- fail "You cannot enable multiple inference engines at the same time. 
Please check you values" }} 3 | {{- end }} -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/configmap.yml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ConfigMap 3 | metadata: 4 | name: {{ include "inference-benchmarker.fullname" . }}-nginx-config 5 | data: 6 | default: | 7 | server { 8 | listen 80; 9 | listen [::]:80; 10 | server_name localhost; 11 | 12 | location / { 13 | root /usr/share/nginx/html; 14 | index index.html index.htm; 15 | autoindex on; 16 | } 17 | 18 | error_page 500 502 503 504 /50x.html; 19 | location = /50x.html { 20 | root /usr/share/nginx/html; 21 | } 22 | } -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | name: {{ include "inference-benchmarker.fullname" . }}-hf-token 5 | type: Opaque 6 | stringData: 7 | HF_TOKEN: "{{ .Values.hf_token }}" -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/service.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Service 3 | metadata: 4 | name: {{ include "inference-benchmarker.fullname" . }} 5 | labels: 6 | app.kubernetes.io/component: text-generation-inference 7 | {{- include "inference-benchmarker.labels" . | nindent 4 }} 8 | spec: 9 | type: ClusterIP 10 | ports: 11 | - port: 8080 12 | targetPort: http 13 | protocol: TCP 14 | name: http 15 | selector: 16 | app.kubernetes.io/component: text-generation-inference 17 | {{- include "inference-benchmarker.selectorLabels" . | nindent 4 }} 18 | --- 19 | apiVersion: v1 20 | kind: Service 21 | metadata: 22 | name: {{ include "inference-benchmarker.fullname" . }}-benchmark 23 | labels: 24 | app.kubernetes.io/component: benchmark 25 | {{- include "inference-benchmarker.labels" . | nindent 4 }} 26 | spec: 27 | type: ClusterIP 28 | ports: 29 | - port: 8080 30 | targetPort: http 31 | protocol: TCP 32 | name: http 33 | publishNotReadyAddresses: true 34 | selector: 35 | app.kubernetes.io/component: benchmark 36 | {{- include "inference-benchmarker.selectorLabels" . | nindent 4 }} 37 | -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/tgi.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.tgi.enabled }} 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: {{ include "inference-benchmarker.fullname" . }} 6 | labels: 7 | app.kubernetes.io/component: text-generation-inference 8 | {{- include "inference-benchmarker.labels" . | nindent 4 }} 9 | spec: 10 | replicas: {{ .Values.tgi.replicaCount }} 11 | selector: 12 | matchLabels: 13 | app.kubernetes.io/component: text-generation-inference 14 | {{- include "inference-benchmarker.selectorLabels" . | nindent 6 }} 15 | template: 16 | metadata: 17 | {{- with .Values.tgi.podAnnotations }} 18 | annotations: 19 | {{- toYaml . | nindent 8 }} 20 | {{- end }} 21 | labels: 22 | app.kubernetes.io/component: text-generation-inference 23 | {{- include "inference-benchmarker.labels" . | nindent 8 }} 24 | {{- with .Values.tgi.podLabels }} 25 | {{- toYaml . 
| nindent 8 }} 26 | {{- end }} 27 | spec: 28 | {{- with .Values.imagePullSecrets }} 29 | imagePullSecrets: 30 | {{- toYaml . | nindent 8 }} 31 | {{- end }} 32 | securityContext: 33 | {{- toYaml .Values.tgi.podSecurityContext | nindent 8 }} 34 | containers: 35 | - name: text-generation-inference 36 | securityContext: 37 | {{- toYaml .Values.tgi.securityContext | nindent 12 }} 38 | image: "{{ .Values.tgi.image.repository }}:{{ .Values.tgi.image.tag | default "latest" }}" 39 | imagePullPolicy: {{ .Values.tgi.image.pullPolicy }} 40 | env: 41 | - name: HF_TOKEN 42 | valueFrom: 43 | secretKeyRef: 44 | name: {{ include "inference-benchmarker.fullname" . }}-hf-token 45 | key: HF_TOKEN 46 | - name: PORT 47 | value: "8080" 48 | args: 49 | - "--model-id" 50 | - "{{ .Values.model_id }}" 51 | {{- with .Values.tgi.extra_args }} 52 | {{- toYaml . | nindent 12 }} 53 | {{- end }} 54 | ports: 55 | - name: http 56 | containerPort: 8080 57 | protocol: TCP 58 | readinessProbe: 59 | httpGet: 60 | path: /health 61 | port: 8080 62 | initialDelaySeconds: 10 63 | periodSeconds: 3 64 | resources: 65 | {{- toYaml .Values.tgi.resources | nindent 12 }} 66 | volumeMounts: 67 | - name: shm 68 | mountPath: /dev/shm 69 | terminationGracePeriodSeconds: 10 70 | volumes: 71 | - name: shm 72 | emptyDir: 73 | medium: Memory 74 | sizeLimit: 1Gi 75 | {{- with .Values.tgi.nodeSelector }} 76 | nodeSelector: 77 | {{- toYaml . | nindent 8 }} 78 | {{- end }} 79 | {{- with .Values.tgi.affinity }} 80 | affinity: 81 | {{- toYaml . | nindent 8 }} 82 | {{- end }} 83 | {{- with .Values.tgi.tolerations }} 84 | tolerations: 85 | {{- toYaml . | nindent 8 }} 86 | {{- end }} 87 | {{- end }} -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/templates/vllm.yaml: -------------------------------------------------------------------------------- 1 | {{- if .Values.vllm.enabled }} 2 | apiVersion: apps/v1 3 | kind: Deployment 4 | metadata: 5 | name: {{ include "inference-benchmarker.fullname" . }} 6 | labels: 7 | app.kubernetes.io/component: text-generation-inference 8 | {{- include "inference-benchmarker.labels" . | nindent 4 }} 9 | spec: 10 | replicas: {{ .Values.vllm.replicaCount }} 11 | selector: 12 | matchLabels: 13 | app.kubernetes.io/component: text-generation-inference 14 | {{- include "inference-benchmarker.selectorLabels" . | nindent 6 }} 15 | template: 16 | metadata: 17 | {{- with .Values.vllm.podAnnotations }} 18 | annotations: 19 | {{- toYaml . | nindent 8 }} 20 | {{- end }} 21 | labels: 22 | app.kubernetes.io/component: text-generation-inference 23 | {{- include "inference-benchmarker.labels" . | nindent 8 }} 24 | {{- with .Values.vllm.podLabels }} 25 | {{- toYaml . | nindent 8 }} 26 | {{- end }} 27 | spec: 28 | {{- with .Values.imagePullSecrets }} 29 | imagePullSecrets: 30 | {{- toYaml . | nindent 8 }} 31 | {{- end }} 32 | securityContext: 33 | {{- toYaml .Values.vllm.podSecurityContext | nindent 8 }} 34 | containers: 35 | - name: vllm 36 | securityContext: 37 | {{- toYaml .Values.vllm.securityContext | nindent 12 }} 38 | image: "{{ .Values.vllm.image.repository }}:{{ .Values.vllm.image.tag | default "latest" }}" 39 | imagePullPolicy: {{ .Values.vllm.image.pullPolicy }} 40 | env: 41 | - name: HUGGING_FACE_HUB_TOKEN 42 | valueFrom: 43 | secretKeyRef: 44 | name: {{ include "inference-benchmarker.fullname" . 
}}-hf-token 45 | key: HF_TOKEN 46 | args: 47 | - "--model" 48 | - "{{ .Values.model_id }}" 49 | - "--port" 50 | - "8080" 51 | {{- with .Values.vllm.extra_args }} 52 | {{- toYaml . | nindent 12 }} 53 | {{- end }} 54 | ports: 55 | - name: http 56 | containerPort: 8080 57 | protocol: TCP 58 | readinessProbe: 59 | httpGet: 60 | path: /health 61 | port: 8080 62 | initialDelaySeconds: 10 63 | periodSeconds: 3 64 | resources: 65 | {{- toYaml .Values.vllm.resources | nindent 12 }} 66 | volumeMounts: 67 | - name: shm 68 | mountPath: /dev/shm 69 | terminationGracePeriodSeconds: 10 70 | volumes: 71 | - name: shm 72 | emptyDir: 73 | medium: Memory 74 | sizeLimit: 1Gi 75 | {{- with .Values.vllm.nodeSelector }} 76 | nodeSelector: 77 | {{- toYaml . | nindent 8 }} 78 | {{- end }} 79 | {{- with .Values.vllm.affinity }} 80 | affinity: 81 | {{- toYaml . | nindent 8 }} 82 | {{- end }} 83 | {{- with .Values.vllm.tolerations }} 84 | tolerations: 85 | {{- toYaml . | nindent 8 }} 86 | {{- end }} 87 | {{- end }} -------------------------------------------------------------------------------- /extra/k8s/inference-benchmarker/values.yaml: -------------------------------------------------------------------------------- 1 | imagePullSecrets: [ ] 2 | nameOverride: "" 3 | fullnameOverride: "" 4 | 5 | hf_token: "" 6 | model_id: "meta-llama/Llama-3.1-8B-Instruct" 7 | server: tgi 8 | 9 | tgi: 10 | enabled: true 11 | extra_args: 12 | - "--max-concurrent-requests" 13 | - "512" 14 | image: 15 | repository: ghcr.io/huggingface/text-generation-inference 16 | pullPolicy: IfNotPresent 17 | tag: "latest" 18 | replicaCount: 1 19 | resources: 20 | limits: 21 | "nvidia.com/gpu": "1" 22 | podAnnotations: { } 23 | podLabels: { } 24 | podSecurityContext: { } 25 | securityContext: { } 26 | nodeSelector: { } 27 | tolerations: [ ] 28 | affinity: { } 29 | 30 | vllm: 31 | enabled: false 32 | extra_args: 33 | image: 34 | repository: vllm/vllm-openai 35 | pullPolicy: IfNotPresent 36 | tag: "latest" 37 | replicaCount: 1 38 | resources: 39 | limits: 40 | "nvidia.com/gpu": "1" 41 | podAnnotations: { } 42 | podLabels: { } 43 | podSecurityContext: { } 44 | securityContext: { } 45 | nodeSelector: { } 46 | tolerations: [ ] 47 | affinity: { } 48 | 49 | benchmark: 50 | extra_args: 51 | - "--profile" 52 | - "chat" 53 | - "800" 54 | image: 55 | repository: ghcr.io/huggingface/inference-benchmarker 56 | pullPolicy: IfNotPresent 57 | tag: "latest" 58 | podAnnotations: { } 59 | podLabels: { } 60 | podSecurityContext: { } 61 | securityContext: { } 62 | resources: { } 63 | nodeSelector: { } 64 | tolerations: [ ] 65 | affinity: { } 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /extra/slurm/benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import subprocess 3 | 4 | CPUS_PER_GPU = 20 5 | MEM_PER_CPU_GB = 11 6 | 7 | 8 | def main(): 9 | models = [ 10 | ('meta-llama/Llama-3.1-8B-Instruct', 1), 11 | ('meta-llama/Llama-3.1-70B-Instruct', 4), 12 | ('meta-llama/Llama-3.1-70B-Instruct', 2), 13 | ('mistralai/Mixtral-8x7B-Instruct-v0.1', 2), 14 | ] 15 | num_passes = 1 16 | engines = ['tgi', 'vllm'] 17 | for i in range(num_passes): 18 | for model in models: 19 | print(f"PASS {i} - Submitting job for {model[0]}") 20 | gpus = model[1] 21 | cpus_per_task = gpus * CPUS_PER_GPU 22 | for engine in engines: 23 | job_name = f'bench_{model[0].replace("/", "_")}_{engine}_pass_{i}' 24 | args = ['sbatch', 25 | '--job-name', job_name, 26 | '--output', 
f'/fsx/%u/logs/%x-%j.log', 27 | '--time', '1:50:00', 28 | '--qos', 'normal', 29 | '--partition', 'hopper-prod', 30 | '--gpus', str(gpus), 31 | '--ntasks', '1', 32 | '--cpus-per-task', str(cpus_per_task), 33 | '--mem-per-cpu', str(MEM_PER_CPU_GB) + 'G', 34 | '--nodes', '1', 35 | ':', 36 | '--gpus', '1', 37 | '--ntasks', '1', 38 | '--cpus-per-task', str(CPUS_PER_GPU), 39 | '--mem-per-cpu', str(MEM_PER_CPU_GB) + 'G', 40 | '--nodes', '1', 41 | f'{engine}.slurm'] 42 | env = os.environ.copy() 43 | env['MODEL'] = model[0] 44 | env['TP'] = str(gpus) 45 | process = subprocess.run(args, capture_output=True, 46 | env=env) 47 | print(process.stdout.decode()) 48 | print(process.stderr.decode()) 49 | if process.returncode != 0: 50 | print(f'Error while submitting :: {args}') 51 | exit(1) 52 | 53 | 54 | if __name__ == '__main__': 55 | main() 56 | -------------------------------------------------------------------------------- /extra/slurm/tgi.slurm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --job-name tgi-benchmark 3 | #SBATCH --output /fsx/%u/logs/%x-%j.log 4 | #SBATCH --time 1:50:00 5 | #SBATCH --qos normal 6 | #SBATCH --partition hopper-prod 7 | #SBATCH --gpus 1 --ntasks 1 --cpus-per-task 11 --mem-per-cpu 20G --nodes=1 8 | #SBATCH hetjob 9 | #SBATCH --gpus 1 --ntasks 1 --cpus-per-task 11 --mem-per-cpu 20G --nodes=1 10 | 11 | if [ -z "$MODEL" ]; then 12 | echo "MODEL environment variable is not set" 13 | exit 1 14 | fi 15 | 16 | if [ -z "$TP" ]; then 17 | echo "TP environment variable is not set" 18 | exit 1 19 | fi 20 | 21 | echo "Starting TGI benchmark for $MODEL" 22 | export RUST_BACKTRACE=full 23 | export RUST_LOG=inference_benchmarker=info 24 | 25 | # set a random available port to avoid conflicts 26 | PORT=$(shuf -i 8000-9999 -n 1) 27 | export PORT 28 | export PREFILL_CHUNKING=1 29 | 30 | echo "Model will run on ${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" 31 | echo "Benchmark will run on ${SLURM_JOB_NODELIST_HET_GROUP_1}" 32 | 33 | # start TGI 34 | srun --het-group=0 \ 35 | -u \ 36 | -n 1 \ 37 | --container-image='ghcr.io#huggingface/text-generation-inference' \ 38 | --container-env=PORT \ 39 | --container-mounts="/scratch:/data" \ 40 | --container-workdir='/usr/src' \ 41 | --no-container-mount-home \ 42 | /usr/local/bin/text-generation-launcher \ 43 | --model-id $MODEL \ 44 | --max-concurrent-requests 1024& 45 | 46 | # wait until /health is available, die after 5 minutes 47 | timeout 600 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for TGI to start...\"; done" || exit 1 48 | exit_code=$? 
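# `timeout` exits with status 124 when the /health wait above hit its 600 s budget,
# so the guard on $exit_code below only starts the benchmark against a server that
# actually came up.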
49 | 50 | RESULTS_DIR="/fsx/$USER/benchmarks_results/tgi" 51 | mkdir -p "${RESULTS_DIR}" 52 | 53 | if [[ $exit_code != 124 ]]; then 54 | # run benchmark 55 | echo "Starting benchmark" 56 | VERSION=$(curl -s http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}/info | jq -r '.sha') 57 | srun --het-group=1 \ 58 | -u \ 59 | -n 1 \ 60 | --container-image="ghcr.io#huggingface/inference-benchmarker:latest" \ 61 | --container-mounts="${RESULTS_DIR}:/opt/inference-benchmarker/results" \ 62 | --no-container-mount-home \ 63 | inference-benchmarker \ 64 | --tokenizer-name "$MODEL" \ 65 | --max-vus 128 \ 66 | --url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \ 67 | --duration 120s \ 68 | --warmup 30s \ 69 | --benchmark-kind rate \ 70 | --rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \ 71 | --extra-meta "version=$VERSION,engine=TGI,tp=$TP" \ 72 | --decode-options "num_tokens=800,max_tokens=800,min_tokens=50,variance=100" \ 73 | --dataset-file share_gpt_turns.json \ 74 | --no-console 75 | fi 76 | 77 | # stop TGI 78 | scancel --signal=TERM "$SLURM_JOB_ID+0" 79 | 80 | echo "End of benchmark" -------------------------------------------------------------------------------- /extra/slurm/vllm.slurm: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #SBATCH --job-name tgi-benchmark 3 | #SBATCH --output /fsx/%u/logs/%x-%j.log 4 | #SBATCH --time 1:50:00 5 | #SBATCH --qos normal 6 | #SBATCH --partition hopper-prod 7 | #SBATCH --gpus 1 --ntasks 1 --cpus-per-task 11 --mem-per-cpu 20G --nodes=1 8 | #SBATCH hetjob 9 | #SBATCH --gpus 1 --ntasks 1 --cpus-per-task 11 --mem-per-cpu 20G --nodes=1 10 | 11 | 12 | if [ -z "$MODEL" ]; then 13 | echo "MODEL environment variable is not set" 14 | exit 1 15 | fi 16 | 17 | if [ -z "$TP" ]; then 18 | echo "TP environment variable is not set" 19 | exit 1 20 | fi 21 | 22 | echo "Starting vLLM benchmark for $MODEL" 23 | export RUST_BACKTRACE=full 24 | export RUST_LOG=inference_benchmarker=info 25 | # set a random available port to avoid conflicts 26 | PORT=$(shuf -i 8000-9999 -n 1) 27 | export PORT 28 | 29 | echo "Model will run on ${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" 30 | echo "Benchmark will run on ${SLURM_JOB_NODELIST_HET_GROUP_1}" 31 | 32 | # start vLLM 33 | srun --het-group=0 \ 34 | -u \ 35 | -n 1 \ 36 | --container-image='vllm/vllm-openai:latest' \ 37 | --container-env=PORT \ 38 | --container-mounts="/scratch:/root/.cache/huggingface" \ 39 | --container-workdir='/usr/src' \ 40 | --no-container-mount-home \ 41 | python3 -m vllm.entrypoints.openai.api_server \ 42 | --model "${MODEL}" \ 43 | --port "${PORT}" \ 44 | --enable-chunked-prefill \ 45 | --max-num-batched-tokens 512 \ 46 | --tensor-parallel-size "${SLURM_GPUS_ON_NODE}"& 47 | 48 | # wait until /health is available, die after 5 minutes 49 | timeout 600 bash -c "while [[ \"\$(curl -s -o /dev/null -w '%{http_code}' http://localhost:${PORT}/health)\" != \"200\" ]]; do sleep 1 && echo \"Waiting for vLLM to start...\"; done" || exit 1 50 | exit_code=$? 
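# As in tgi.slurm, exit status 124 means the /health wait above timed out, so the
# benchmark below only runs against a live server. Results land in ${RESULTS_DIR}
# on the host and are bind-mounted into the benchmark container at
# /opt/inference-benchmarker/results (see --container-mounts below). They can later
# be aggregated with extra/dashboard/parse_results.py, e.g. (output path and device
# name here are only examples):
#   python extra/dashboard/parse_results.py --results-dir "${RESULTS_DIR}" \
#     --results-file /fsx/$USER/benchmarks.parquet --device H100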
51 | 52 | RESULTS_DIR="/fsx/$USER/benchmarks_results/vllm" 53 | mkdir -p "${RESULTS_DIR}" 54 | 55 | if [[ $exit_code != 124 ]]; then 56 | # run benchmark 57 | echo "Starting benchmark" 58 | VERSION=$(curl -s http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}/version | jq -r '.version') 59 | srun --het-group=1 \ 60 | -u \ 61 | -n 1 \ 62 | --container-image="ghcr.io#huggingface/inference-benchmarker:latest" \ 63 | --container-mounts="${RESULTS_DIR}:/opt/inference-benchmarker/results" \ 64 | --no-container-mount-home \ 65 | inference-benchmarker \ 66 | --tokenizer-name "$MODEL" \ 67 | --max-vus 128 \ 68 | --url "http://${SLURM_JOB_NODELIST_HET_GROUP_0}:${PORT}" \ 69 | --duration 120s \ 70 | --warmup 30s \ 71 | --benchmark-kind rate \ 72 | --rates 0.8 --rates 2.4 --rates 4.0 --rates 5.6 --rates 7.2 --rates 8.8 --rates 10.4 --rates 12.0 --rates 13.6 --rates 15.2 --rates 16.8 --rates 18.4 --rates 20.0 --rates 21.6 --rates 23.2 --rates 24.0 \ 73 | --extra-meta "version=$VERSION,engine=vLLM,tp=$TP,max_num_batched_tokens=512" \ 74 | --decode-options "num_tokens=800,max_tokens=800,min_tokens=50,variance=100" \ 75 | --dataset-file share_gpt_turns.json \ 76 | --no-console 77 | fi 78 | 79 | # stop TGI 80 | scancel --signal=TERM "$SLURM_JOB_ID+0" 81 | 82 | echo "End of benchmark" -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "inference-benchmarker" 3 | version = "1.0.0" 4 | description = "" 5 | authors = ["Hugo Larcher "] 6 | readme = "README.md" 7 | 8 | [tool.poetry.dependencies] 9 | python = "^3.11" 10 | matplotlib = "^3.10.1" 11 | scienceplots = "^2.1.1" 12 | pandas = "^2.2.2" 13 | opensearch-py = "^2.7.1" 14 | boto3 = "^1.35.21" 15 | gradio = "^5.23.1" 16 | pyarrow = "^19.0.1" 17 | pygithub = "^2.4.0" 18 | click = "^8.1.7" 19 | 20 | 21 | [build-system] 22 | requires = ["poetry-core"] 23 | build-backend = "poetry.core.masonry.api" 24 | -------------------------------------------------------------------------------- /src/app.rs: -------------------------------------------------------------------------------- 1 | use crate::benchmark::Event as BenchmarkEvent; 2 | use crate::event::{terminal_event_task, AppEvent}; 3 | use crate::flux::{Action, AppState, Dispatcher, Store}; 4 | use crate::scheduler::ExecutorType; 5 | use crate::BenchmarkConfig; 6 | use crossterm::event::{KeyCode, KeyEvent, KeyModifiers}; 7 | use ratatui::layout::{Constraint, Direction, Layout}; 8 | use ratatui::text::Span; 9 | use ratatui::widgets::ListDirection::BottomToTop; 10 | use ratatui::widgets::{Cell, Dataset, List, ListItem, Row, Table}; 11 | use ratatui::{ 12 | buffer::Buffer, 13 | layout::{Alignment, Rect}, 14 | style::Stylize as OtherStylize, 15 | symbols, 16 | symbols::border, 17 | text::{Line, Text}, 18 | widgets::{Block, Paragraph, Widget}, 19 | DefaultTerminal, Frame, 20 | }; 21 | use std::collections::HashMap; 22 | use std::io; 23 | use std::sync::{Arc, Mutex}; 24 | use tokio::sync::broadcast::Sender; 25 | use tokio::sync::mpsc::{Receiver, UnboundedReceiver}; 26 | use tokio::sync::{broadcast, mpsc}; 27 | 28 | pub struct App { 29 | exit: bool, 30 | store: Arc>, 31 | dispatcher: Arc>, 32 | receiver: Receiver, 33 | benchmark_config: BenchmarkConfig, 34 | stop_sender: broadcast::Sender<()>, 35 | } 36 | 37 | pub async fn run_console( 38 | benchmark_config: BenchmarkConfig, 39 | mut receiver: UnboundedReceiver, 40 | stop_sender: broadcast::Sender<()>, 41 | ) { 42 | let 
(app_tx, app_rx) = mpsc::channel(8); 43 | // Create event task 44 | let stop_receiver_signal = stop_sender.subscribe(); 45 | tokio::spawn(terminal_event_task(250, app_tx, stop_receiver_signal)); 46 | 47 | let mut app = App::new(benchmark_config, app_rx, stop_sender.clone()); 48 | app.dispatcher 49 | .lock() 50 | .expect("lock") 51 | .dispatch(Action::LogMessage(LogMessageUI { 52 | message: "Starting benchmark".to_string(), 53 | level: LogLevel::Info, 54 | timestamp: chrono::Utc::now(), 55 | })); 56 | let dispatcher = app.dispatcher.clone(); 57 | let mut stop_receiver_signal = stop_sender.subscribe(); 58 | let event_thread = tokio::spawn(async move { 59 | tokio::select! { 60 | _=async{ 61 | while let Some(event) = receiver.recv().await { 62 | match event { 63 | BenchmarkEvent::BenchmarkStart(event) => { 64 | dispatcher.lock().expect("lock").dispatch(Action::AddBenchmark(BenchmarkUI { 65 | id: event.id, 66 | status: BenchmarkStatus::Running, 67 | progress: 0.0, 68 | throughput: "0".to_string(), 69 | successful_requests: 0, 70 | failed_requests: 0, 71 | })); 72 | } 73 | BenchmarkEvent::BenchmarkProgress(event) => { 74 | let (successful_requests,failed_requests) = (event.successful_requests,event.failed_requests); 75 | dispatcher.lock().expect("lock").dispatch(Action::AddBenchmark(BenchmarkUI { 76 | id: event.id, 77 | status: BenchmarkStatus::Running, 78 | progress: event.progress, 79 | throughput: event.request_throughput.map_or("0".to_string(), |e| format!("{e:.2}")), 80 | successful_requests, 81 | failed_requests, 82 | })); 83 | } 84 | BenchmarkEvent::BenchmarkEnd(event) => { 85 | dispatcher.lock().expect("lock").dispatch(Action::LogMessage(LogMessageUI { 86 | message: format!("Benchmark {} ended", event.id), 87 | level: LogLevel::Info, 88 | timestamp: chrono::Utc::now(), 89 | })); 90 | if let Some(results) = event.results { 91 | let (successful_requests,failed_requests) = (results.successful_requests() as u64,results.failed_requests() as u64); 92 | dispatcher.lock().expect("lock").dispatch(Action::AddBenchmark(BenchmarkUI { 93 | id: event.id, 94 | status: BenchmarkStatus::Completed, 95 | progress: 100.0, 96 | throughput: event.request_throughput.map_or("0".to_string(), |e| format!("{e:.2}")), 97 | successful_requests, 98 | failed_requests, 99 | })); 100 | dispatcher.lock().expect("lock").dispatch(Action::AddBenchmarkResults(results)); 101 | } 102 | } 103 | BenchmarkEvent::Message(event) => { 104 | dispatcher.lock().expect("lock").dispatch(Action::LogMessage(LogMessageUI { 105 | message: event.message, 106 | level: LogLevel::Info, 107 | timestamp: event.timestamp, 108 | })); 109 | } 110 | BenchmarkEvent::BenchmarkReportEnd(path) => { 111 | dispatcher.lock().expect("lock").dispatch(Action::LogMessage(LogMessageUI { 112 | message: format!("Benchmark report saved to {}", path), 113 | level: LogLevel::Info, 114 | timestamp: chrono::Utc::now(), 115 | })); 116 | break; 117 | } 118 | BenchmarkEvent::BenchmarkError(event) => { 119 | dispatcher.lock().expect("lock").dispatch(Action::LogMessage(LogMessageUI { 120 | message: format!("Error running benchmark: {:?}", event), 121 | level: LogLevel::Error, 122 | timestamp: chrono::Utc::now(), 123 | })); 124 | break; 125 | } 126 | } 127 | } 128 | }=>{} 129 | _ = stop_receiver_signal.recv() => {} 130 | } 131 | }); 132 | let mut stop_receiver_signal = stop_sender.subscribe(); 133 | let app_thread = tokio::spawn(async move { 134 | tokio::select! 
{ 135 | _ = async { 136 | let _ = app.run(&mut ratatui::init()).await; 137 | ratatui::restore(); 138 | }=>{} 139 | _ = stop_receiver_signal.recv() => {} 140 | } 141 | }); 142 | let _ = event_thread.await; 143 | let _ = app_thread.await; 144 | } 145 | 146 | impl App { 147 | pub fn new( 148 | benchmark_config: BenchmarkConfig, 149 | receiver: Receiver, 150 | stop_sender: Sender<()>, 151 | ) -> App { 152 | let store = Arc::from(Mutex::new(Store::new())); 153 | let dispatcher = Arc::from(Mutex::new(Dispatcher::new(store.clone()))); 154 | App { 155 | exit: false, 156 | store: store.clone(), 157 | dispatcher: dispatcher.clone(), 158 | receiver, 159 | benchmark_config, 160 | stop_sender, 161 | } 162 | } 163 | pub async fn run(&mut self, terminal: &mut DefaultTerminal) -> io::Result<()> { 164 | while !self.exit { 165 | terminal.draw(|frame| self.draw(frame))?; 166 | self.handle_events().await?; 167 | } 168 | // signal everybody to stop 169 | let _ = self.stop_sender.send(()); 170 | Ok(()) 171 | } 172 | fn draw(&self, frame: &mut Frame) { 173 | frame.render_widget(self, frame.area()) 174 | } 175 | async fn handle_events(&mut self) -> io::Result<()> { 176 | match self.receiver.recv().await { 177 | None => Err(io::Error::new(io::ErrorKind::Other, "No event")), 178 | Some(event) => match event { 179 | AppEvent::Tick => Ok(()), 180 | AppEvent::Key(key_event) => self.handle_key_event(key_event), 181 | AppEvent::Resize => Ok(()), 182 | }, 183 | } 184 | } 185 | 186 | fn handle_key_event(&mut self, key_event: KeyEvent) -> io::Result<()> { 187 | match key_event { 188 | KeyEvent { 189 | code: KeyCode::Char('q'), 190 | .. 191 | } => self.exit(), 192 | KeyEvent { 193 | code: KeyCode::Char('c'), 194 | modifiers: KeyModifiers::CONTROL, 195 | .. 196 | } => self.exit(), 197 | _ => {} 198 | } 199 | Ok(()) 200 | } 201 | fn exit(&mut self) { 202 | self.exit = true; 203 | } 204 | 205 | fn create_datasets(&self, state: AppState) -> HashMap> { 206 | let token_throughput_rate = state 207 | .results 208 | .iter() 209 | .filter_map(|r| match r.executor_type() { 210 | ExecutorType::ConstantArrivalRate => { 211 | let throughput = r.token_throughput_secs().unwrap_or(0.0); 212 | Some((r.executor_config().rate.unwrap(), throughput)) 213 | } 214 | ExecutorType::ConstantVUs => None, 215 | }) 216 | .collect::>(); 217 | let token_throughput_vus = state 218 | .results 219 | .iter() 220 | .filter_map(|r| match r.executor_type() { 221 | ExecutorType::ConstantVUs => { 222 | let throughput = r.token_throughput_secs().unwrap_or(0.0); 223 | Some((r.executor_config().max_vus as f64, throughput)) 224 | } 225 | ExecutorType::ConstantArrivalRate => None, 226 | }) 227 | .collect::>(); 228 | let inter_token_latency_rate = state 229 | .results 230 | .iter() 231 | .filter_map(|r| match r.executor_type() { 232 | ExecutorType::ConstantArrivalRate => { 233 | let latency = r 234 | .inter_token_latency_avg() 235 | .unwrap_or_default() 236 | .as_secs_f64(); 237 | Some((r.executor_config().rate.unwrap(), latency)) 238 | } 239 | ExecutorType::ConstantVUs => None, 240 | }) 241 | .collect::>(); 242 | let inter_token_latency_vus = state 243 | .results 244 | .iter() 245 | .filter_map(|r| match r.executor_type() { 246 | ExecutorType::ConstantVUs => { 247 | let latency = r 248 | .inter_token_latency_avg() 249 | .unwrap_or_default() 250 | .as_secs_f64(); 251 | Some((r.executor_config().max_vus as f64, latency)) 252 | } 253 | ExecutorType::ConstantArrivalRate => None, 254 | }) 255 | .collect::>(); 256 | HashMap::from([ 257 | ("token_throughput_rate".to_string(), 
token_throughput_rate), 258 | ("token_throughput_vus".to_string(), token_throughput_vus), 259 | ( 260 | "inter_token_latency_rate".to_string(), 261 | inter_token_latency_rate, 262 | ), 263 | ( 264 | "inter_token_latency_vus".to_string(), 265 | inter_token_latency_vus, 266 | ), 267 | ]) 268 | } 269 | } 270 | 271 | impl Widget for &App { 272 | fn render(self, area: Rect, buf: &mut Buffer) { 273 | let state = self.store.lock().unwrap().get_state(); 274 | let data = self.create_datasets(state.clone()); 275 | 276 | let main_layout = Layout::default() 277 | .direction(Direction::Vertical) 278 | .constraints([Constraint::Length(1), Constraint::Min(20)]) 279 | .split(area); 280 | let bottom_layout = Layout::default() 281 | .direction(Direction::Vertical) 282 | .constraints([Constraint::Percentage(50), Constraint::Percentage(50)]) 283 | .split(main_layout[1]); 284 | let steps_graph_layout = Layout::default() 285 | .direction(Direction::Horizontal) 286 | .constraints([Constraint::Percentage(35), Constraint::Percentage(65)]) 287 | .split(bottom_layout[0]); 288 | // LOGS 289 | let logs_title = Line::from("Logs".bold()).centered(); 290 | let logs_block = Block::bordered() 291 | .title_top(logs_title) 292 | .border_set(border::THICK); 293 | List::new( 294 | state 295 | .messages 296 | .iter() 297 | .rev() 298 | .map(|m| { 299 | let level_span = match m.level { 300 | LogLevel::Info => { 301 | Span::raw(m.level.to_string().to_uppercase()).green().bold() 302 | } 303 | LogLevel::Warning => Span::raw(m.level.to_string().to_uppercase()) 304 | .yellow() 305 | .bold(), 306 | LogLevel::Error => { 307 | Span::raw(m.level.to_string().to_uppercase()).red().bold() 308 | } 309 | }; 310 | let content = Line::from(vec![ 311 | m.formatted_timestamp().clone().gray(), 312 | Span::raw(" "), 313 | level_span, 314 | Span::raw(" "), 315 | Span::raw(m.message.to_string()).bold(), 316 | ]); 317 | ListItem::new(content) 318 | }) 319 | .collect::>(), 320 | ) 321 | .direction(BottomToTop) 322 | .block(logs_block) 323 | .render(bottom_layout[1], buf); 324 | 325 | // BENCHMARK config 326 | let rate_mode = match self.benchmark_config.rates { 327 | None => "Automatic".to_string(), 328 | Some(_) => "Manual".to_string(), 329 | }; 330 | let config_text = Text::from(vec![Line::from(vec![ 331 | format!("Profile: {profile} | Benchmark: {kind} | Max VUs: {max_vus} | Duration: {duration} sec | Rates: {rates} | Warmup: {warmup} sec", 332 | profile = self.benchmark_config.profile.clone().unwrap_or("N/A".to_string()), 333 | kind = self.benchmark_config.benchmark_kind, 334 | max_vus = self.benchmark_config.max_vus, 335 | duration = self.benchmark_config.duration.as_secs_f64(), 336 | rates = rate_mode, 337 | warmup = self.benchmark_config.warmup_duration.as_secs_f64()).white().bold(), 338 | ])]); 339 | Paragraph::new(config_text.clone()).render(main_layout[0], buf); 340 | 341 | // STEPS 342 | let steps_block_title = Line::from("Benchmark steps".bold()).centered(); 343 | let steps_block = Block::bordered() 344 | .title(steps_block_title.alignment(Alignment::Center)) 345 | .border_set(border::THICK); 346 | let step_rows = state 347 | .benchmarks 348 | .iter() 349 | .map(|b| { 350 | let error_rate = if b.failed_requests > 0 { 351 | format!( 352 | "{:4.0}%", 353 | b.failed_requests as f64 354 | / (b.failed_requests + b.successful_requests) as f64 355 | * 100. 
356 | ) 357 | .light_red() 358 | .bold() 359 | } else { 360 | format!("{:4.0}%", 0).to_string().white() 361 | }; 362 | let cells = vec![ 363 | b.id.clone().white(), 364 | b.status.to_string().white(), 365 | format!("{:4.0}%", b.progress).white(), 366 | error_rate, 367 | format!("{:>6.6} req/sec avg", b.throughput).green().bold(), 368 | ]; 369 | Row::new(cells) 370 | }) 371 | .collect::>(); 372 | let widths = [ 373 | Constraint::Length(30), 374 | Constraint::Length(10), 375 | Constraint::Length(5), 376 | Constraint::Length(5), 377 | Constraint::Length(20), 378 | ]; 379 | // steps table 380 | Table::new(step_rows, widths) 381 | .header(Row::new(vec![ 382 | Cell::new(Line::from("Bench").alignment(Alignment::Left)), 383 | Cell::new(Line::from("Status").alignment(Alignment::Left)), 384 | Cell::new(Line::from("%").alignment(Alignment::Left)), 385 | Cell::new(Line::from("Err").alignment(Alignment::Left)), 386 | Cell::new(Line::from("Throughput").alignment(Alignment::Left)), 387 | ])) 388 | .block(steps_block) 389 | .render(steps_graph_layout[0], buf); 390 | 391 | // CHARTS 392 | let graphs_block_title = Line::from("Token throughput rate".bold()).centered(); 393 | let graphs_block = Block::bordered() 394 | .title(graphs_block_title.alignment(Alignment::Center)) 395 | .border_set(border::THICK); 396 | let binding = data.get("token_throughput_rate").unwrap().clone(); 397 | let datasets = vec![Dataset::default() 398 | .name("Token throughput rate".to_string()) 399 | .marker(symbols::Marker::Dot) 400 | .graph_type(ratatui::widgets::GraphType::Scatter) 401 | .style(ratatui::style::Style::default().fg(ratatui::style::Color::LightMagenta)) 402 | .data(&binding)]; 403 | let (xmax, ymax) = get_max_bounds(&binding, (10.0, 100.0)); 404 | let x_axis = ratatui::widgets::Axis::default() 405 | .title("Arrival rate (req/s)".to_string()) 406 | .style(ratatui::style::Style::default().white()) 407 | .bounds([0.0, xmax]) 408 | .labels(get_axis_labels(0.0, xmax, 5)); 409 | let y_axis = ratatui::widgets::Axis::default() 410 | .title("Throughput (tokens/s)".to_string()) 411 | .style(ratatui::style::Style::default().white()) 412 | .bounds([0.0, ymax]) 413 | .labels(get_axis_labels(0.0, ymax, 5)); 414 | ratatui::widgets::Chart::new(datasets) 415 | .x_axis(x_axis) 416 | .y_axis(y_axis) 417 | .block(graphs_block) 418 | .legend_position(None) 419 | .render(steps_graph_layout[1], buf); 420 | } 421 | } 422 | 423 | fn get_max_bounds(data: &[(f64, f64)], default_max: (f64, f64)) -> (f64, f64) { 424 | let xmax = data 425 | .iter() 426 | .map(|(x, _)| x) 427 | .max_by(|a, b| a.partial_cmp(b).unwrap()) 428 | .unwrap_or(&default_max.0); 429 | let ymax = data 430 | .iter() 431 | .map(|(_, y)| y) 432 | .max_by(|a, b| a.partial_cmp(b).unwrap()) 433 | .unwrap_or(&default_max.1); 434 | (*xmax, *ymax) 435 | } 436 | 437 | fn get_axis_labels(min: f64, max: f64, num_labels: u32) -> Vec { 438 | let step = (max - min) / num_labels as f64; 439 | (0..num_labels) 440 | .map(|i| format!("{:.2}", min + i as f64 * step)) 441 | .collect() 442 | } 443 | 444 | #[allow(dead_code)] 445 | #[derive(Clone, strum_macros::Display)] 446 | enum LogLevel { 447 | Info, 448 | Warning, 449 | Error, 450 | } 451 | 452 | #[derive(Clone)] 453 | pub(crate) struct LogMessageUI { 454 | message: String, 455 | level: LogLevel, 456 | timestamp: chrono::DateTime, 457 | } 458 | 459 | impl LogMessageUI { 460 | fn formatted_timestamp(&self) -> String { 461 | self.timestamp.to_rfc3339() 462 | } 463 | } 464 | 465 | #[derive(Clone)] 466 | pub(crate) struct BenchmarkUI { 467 | 
pub(crate) id: String, 468 | status: BenchmarkStatus, 469 | progress: f64, 470 | throughput: String, 471 | successful_requests: u64, 472 | failed_requests: u64, 473 | } 474 | 475 | #[derive(Clone, strum_macros::Display)] 476 | enum BenchmarkStatus { 477 | Running, 478 | Completed, 479 | } 480 | -------------------------------------------------------------------------------- /src/benchmark.rs: -------------------------------------------------------------------------------- 1 | use crate::requests::{TextGenerationBackend, TextRequestGenerator, TokenizeOptions}; 2 | use crate::results::{BenchmarkReport, BenchmarkResults}; 3 | use crate::scheduler::{ExecutorType, SchedulerProgress}; 4 | use crate::{executors, scheduler}; 5 | use log::{debug, info}; 6 | use serde::Serialize; 7 | use std::collections::HashMap; 8 | use std::sync::Arc; 9 | use std::time::Duration; 10 | use tokio::sync::mpsc::{Receiver, Sender}; 11 | use tokio::sync::{broadcast, mpsc, Mutex}; 12 | 13 | const THROUGHPUT_BUDGET: f64 = 1.2; // sweep up to 120% of max throughput 14 | 15 | #[derive(Clone, Debug, strum_macros::Display, Serialize)] 16 | pub enum BenchmarkKind { 17 | Throughput, 18 | Sweep, 19 | Rate, 20 | } 21 | 22 | pub struct MessageEvent { 23 | pub message: String, 24 | pub timestamp: chrono::DateTime, 25 | pub level: log::Level, 26 | } 27 | 28 | pub struct BenchmarkEvent { 29 | pub id: String, 30 | pub scheduler_type: ExecutorType, 31 | pub request_throughput: Option, 32 | pub progress: f64, 33 | pub results: Option, 34 | pub successful_requests: u64, 35 | pub failed_requests: u64, 36 | } 37 | 38 | pub enum Event { 39 | BenchmarkStart(BenchmarkEvent), 40 | BenchmarkProgress(BenchmarkEvent), 41 | BenchmarkEnd(BenchmarkEvent), 42 | Message(MessageEvent), 43 | BenchmarkReportEnd(String), 44 | BenchmarkError(String), 45 | } 46 | 47 | pub struct Benchmark { 48 | start_time: Option, 49 | end_time: Option, 50 | backend: Box, 51 | requests: Arc>, 52 | report: BenchmarkReport, 53 | pub(crate) config: BenchmarkConfig, 54 | event_bus: mpsc::UnboundedSender, 55 | stop_sender: broadcast::Sender<()>, 56 | } 57 | 58 | #[serde_with::serde_as] 59 | #[derive(Clone, Serialize)] 60 | pub struct BenchmarkConfig { 61 | pub max_vus: u64, 62 | #[serde(rename = "duration_secs")] 63 | #[serde_as(as = "serde_with::DurationSeconds")] 64 | pub duration: Duration, 65 | pub benchmark_kind: BenchmarkKind, 66 | #[serde(rename = "warmup_duration_secs")] 67 | #[serde_as(as = "serde_with::DurationSeconds")] 68 | pub warmup_duration: Duration, 69 | pub rates: Option>, 70 | pub num_rates: u64, 71 | pub prompt_options: Option, 72 | pub decode_options: Option, 73 | pub tokenizer: String, 74 | pub model_name: String, 75 | pub profile: Option, 76 | #[serde(rename = "meta")] 77 | pub extra_metadata: Option>, 78 | pub run_id: String, 79 | } 80 | 81 | impl BenchmarkConfig { 82 | pub fn validate(&self) -> anyhow::Result<()> { 83 | if self.max_vus == 0 { 84 | return Err(anyhow::anyhow!("max_vus must be greater than 0")); 85 | } 86 | if self.duration.as_secs() == 0 { 87 | return Err(anyhow::anyhow!("duration must be greater than 0")); 88 | } 89 | if self.warmup_duration.as_secs() == 0 { 90 | return Err(anyhow::anyhow!("warmup_duration must be greater than 0")); 91 | } 92 | match self.benchmark_kind { 93 | BenchmarkKind::Throughput => { 94 | if self.rates.is_some() { 95 | return Err(anyhow::anyhow!( 96 | "rates must not be specified for throughput benchmark" 97 | )); 98 | } 99 | } 100 | BenchmarkKind::Sweep => { 101 | if self.rates.is_some() { 102 | return 
Err(anyhow::anyhow!( 103 | "rates must not be specified for sweep benchmark" 104 | )); 105 | } 106 | } 107 | BenchmarkKind::Rate => { 108 | if self.rates.is_none() { 109 | return Err(anyhow::anyhow!( 110 | "rates must be specified for rate benchmark" 111 | )); 112 | } 113 | } 114 | } 115 | Ok(()) 116 | } 117 | } 118 | 119 | pub struct BenchmarkProgress { 120 | id: String, 121 | progress: SchedulerProgress, 122 | } 123 | 124 | impl Benchmark { 125 | pub fn new( 126 | config: BenchmarkConfig, 127 | backend: Box, 128 | requests: Arc>, 129 | event_bus: mpsc::UnboundedSender, 130 | stop_sender: broadcast::Sender<()>, 131 | ) -> Benchmark { 132 | Benchmark { 133 | start_time: None, 134 | end_time: None, 135 | report: BenchmarkReport::new(), 136 | config: config.clone(), 137 | backend, 138 | requests, 139 | event_bus, 140 | stop_sender, 141 | } 142 | } 143 | 144 | pub fn get_report(&self) -> BenchmarkReport { 145 | self.report.clone() 146 | } 147 | 148 | pub async fn run(&mut self) -> anyhow::Result { 149 | self.start_time = Some(tokio::time::Instant::now()); 150 | self.report.start(); 151 | info!("Prewarming backend"); 152 | self.warmup().await?; 153 | info!("Prewarm complete"); 154 | match self.config.benchmark_kind { 155 | BenchmarkKind::Throughput => { 156 | self.run_throughput().await?; 157 | } 158 | BenchmarkKind::Sweep => { 159 | self.run_sweep().await?; 160 | } 161 | BenchmarkKind::Rate => { 162 | self.run_rates().await?; 163 | } 164 | } 165 | self.end_time = Some(tokio::time::Instant::now()); 166 | self.event_bus.send(Event::Message(MessageEvent { 167 | message: format!( 168 | "Benchmark complete in {:?}", 169 | self.duration().expect("duration exists") 170 | ), 171 | timestamp: chrono::Utc::now(), 172 | level: log::Level::Info, 173 | }))?; 174 | self.report.end(); 175 | Ok(self.report.clone()) 176 | } 177 | 178 | pub fn duration(&self) -> Option { 179 | match (self.start_time, self.end_time) { 180 | (Some(start), Some(end)) => Some(end.duration_since(start)), 181 | _ => None, 182 | } 183 | } 184 | 185 | async fn handle_progress(&self, id: String) -> Sender> { 186 | let (tx, mut rx): ( 187 | Sender>, 188 | Receiver>, 189 | ) = mpsc::channel(8); 190 | let event_bus = self.event_bus.clone(); 191 | tokio::spawn(async move { 192 | while let Some(event) = rx.recv().await { 193 | match event { 194 | None => { 195 | break; 196 | } 197 | Some(progress) => { 198 | let progress_evt = BenchmarkProgress { 199 | id: id.clone(), 200 | progress, 201 | }; 202 | let _ = event_bus.send(Event::BenchmarkProgress(BenchmarkEvent { 203 | id: progress_evt.id, 204 | scheduler_type: ExecutorType::ConstantVUs, 205 | request_throughput: Some(progress_evt.progress.requests_throughput), 206 | progress: progress_evt.progress.progress, 207 | successful_requests: progress_evt.progress.successful_requests, 208 | failed_requests: progress_evt.progress.failed_requests, 209 | results: None, 210 | })); 211 | } 212 | } 213 | } 214 | }); 215 | tx 216 | } 217 | 218 | pub async fn warmup(&mut self) -> anyhow::Result<()> { 219 | // run a warmup benchmark to prewarm the server 220 | 221 | let id = "warmup".to_string(); 222 | 223 | // notify start event 224 | self.event_bus.send(Event::BenchmarkStart(BenchmarkEvent { 225 | id: id.to_string(), 226 | scheduler_type: ExecutorType::ConstantVUs, 227 | request_throughput: None, 228 | progress: 0.0, 229 | results: None, 230 | successful_requests: 0, 231 | failed_requests: 0, 232 | }))?; 233 | 234 | // create progress handler 235 | let tx = self.handle_progress(id.clone()).await; 236 | 237 
| // start scheduler 238 | let mut scheduler = scheduler::Scheduler::new( 239 | id, 240 | self.backend.clone(), 241 | ExecutorType::ConstantVUs, 242 | executors::ExecutorConfig { 243 | max_vus: 1, 244 | duration: self.config.warmup_duration, 245 | rate: None, 246 | }, 247 | self.requests.clone(), 248 | tx.clone(), 249 | self.stop_sender.clone(), 250 | ); 251 | scheduler.run().await?; 252 | 253 | let results = scheduler.get_results().lock().await.clone(); 254 | self.report.add_benchmark_result(results.clone()); 255 | 256 | // send None to close the progress handler 257 | tx.send(None).await.unwrap(); 258 | 259 | // notify end event 260 | self.event_bus.send(Event::BenchmarkEnd(BenchmarkEvent { 261 | id: "warmup".to_string(), 262 | scheduler_type: ExecutorType::ConstantVUs, 263 | request_throughput: results.successful_request_rate().ok(), 264 | progress: 100.0, 265 | results: Some(results.clone()), 266 | successful_requests: results.successful_requests() as u64, 267 | failed_requests: results.failed_requests() as u64, 268 | }))?; 269 | Ok(()) 270 | } 271 | 272 | pub async fn run_throughput(&mut self) -> anyhow::Result<()> { 273 | info!("Running throughput benchmark"); 274 | 275 | let id = "throughput".to_string(); 276 | 277 | // notify start event 278 | self.event_bus.send(Event::BenchmarkStart(BenchmarkEvent { 279 | id: id.clone(), 280 | scheduler_type: ExecutorType::ConstantVUs, 281 | request_throughput: None, 282 | progress: 0.0, 283 | results: None, 284 | successful_requests: 0, 285 | failed_requests: 0, 286 | }))?; 287 | 288 | // create progress handler 289 | let tx = self.handle_progress(id.clone()).await; 290 | 291 | // start scheduler 292 | let mut scheduler = scheduler::Scheduler::new( 293 | id.clone(), 294 | self.backend.clone(), 295 | ExecutorType::ConstantVUs, 296 | executors::ExecutorConfig { 297 | max_vus: self.config.max_vus, 298 | duration: self.config.duration, 299 | rate: None, 300 | }, 301 | self.requests.clone(), 302 | tx.clone(), 303 | self.stop_sender.clone(), 304 | ); 305 | scheduler.run().await?; 306 | let results = scheduler.get_results().lock().await.clone(); 307 | let rate = results.successful_request_rate().ok(); 308 | self.report.add_benchmark_result(results.clone()); 309 | 310 | // send None to close the progress handler 311 | tx.send(None).await.unwrap(); 312 | 313 | // notify end event 314 | self.event_bus.send(Event::BenchmarkEnd(BenchmarkEvent { 315 | id: id.clone(), 316 | scheduler_type: ExecutorType::ConstantVUs, 317 | request_throughput: rate, 318 | progress: 100.0, 319 | results: Some(results.clone()), 320 | successful_requests: results.successful_requests() as u64, 321 | failed_requests: results.failed_requests() as u64, 322 | }))?; 323 | Ok(()) 324 | } 325 | 326 | pub async fn run_sweep(&mut self) -> anyhow::Result<()> { 327 | // run a throughput benchmark to retrieve the maximum throughput of server 328 | self.run_throughput().await?; 329 | // get the max throughput from the second benchmark result (first is warmup) 330 | let throughput_results = &self.report.get_results()[1]; 331 | let max_throughput = throughput_results.successful_request_rate()?; 332 | let max_tokens_throughput = throughput_results.token_throughput_secs()?; 333 | // notify event bus 334 | self.event_bus.send(Event::Message(MessageEvent { 335 | message: format!( 336 | "Max throughput detected at: {:.2} req/s | {:.2} tokens/s", 337 | max_throughput, max_tokens_throughput 338 | ), 339 | timestamp: chrono::Utc::now(), 340 | level: log::Level::Info, 341 | }))?; 342 | // run a sweep 
benchmark for 10 different rates from 1req/s to max throughput 343 | let mut rates = Vec::new(); 344 | let num_rates = self.config.num_rates; 345 | for i in 1..=num_rates { 346 | rates.push(i as f64 * max_throughput * THROUGHPUT_BUDGET / num_rates as f64); 347 | } 348 | for rate in rates { 349 | self.run_rate(rate).await?; 350 | } 351 | Ok(()) 352 | } 353 | 354 | pub async fn run_rates(&mut self) -> anyhow::Result<()> { 355 | let rates = self.config.rates.clone().expect("config already validated"); 356 | for rate in rates { 357 | self.run_rate(rate).await?; 358 | } 359 | Ok(()) 360 | } 361 | 362 | pub async fn run_rate(&mut self, rate: f64) -> anyhow::Result<()> { 363 | debug!("Running benchmark with rate: {} req/s", rate); 364 | 365 | let id = format!("constant@{:.2}req/s", rate); 366 | 367 | // notify start event 368 | self.event_bus.send(Event::BenchmarkStart(BenchmarkEvent { 369 | id: id.clone(), 370 | scheduler_type: ExecutorType::ConstantArrivalRate, 371 | request_throughput: None, 372 | progress: 0.0, 373 | results: None, 374 | successful_requests: 0, 375 | failed_requests: 0, 376 | }))?; 377 | 378 | // create progress handler 379 | let tx = self.handle_progress(id.clone()).await; 380 | 381 | // start scheduler 382 | let mut scheduler = scheduler::Scheduler::new( 383 | id, 384 | self.backend.clone(), 385 | scheduler::ExecutorType::ConstantArrivalRate, 386 | executors::ExecutorConfig { 387 | max_vus: self.config.max_vus, 388 | duration: self.config.duration, 389 | rate: Some(rate), 390 | }, 391 | self.requests.clone(), 392 | tx.clone(), 393 | self.stop_sender.clone(), 394 | ); 395 | scheduler.run().await?; 396 | let results = scheduler.get_results().lock().await.clone(); 397 | self.report.add_benchmark_result(results.clone()); 398 | 399 | // send None to close the progress handler 400 | tx.send(None).await.unwrap(); 401 | 402 | // notify end event 403 | self.event_bus.send(Event::BenchmarkEnd(BenchmarkEvent { 404 | id: format!("constant@{:.2}req/s", rate), 405 | scheduler_type: ExecutorType::ConstantArrivalRate, 406 | request_throughput: results.successful_request_rate().ok(), 407 | progress: 100.0, 408 | results: Some(results.clone()), 409 | successful_requests: results.successful_requests() as u64, 410 | failed_requests: results.failed_requests() as u64, 411 | }))?; 412 | Ok(()) 413 | } 414 | } 415 | 416 | #[cfg(test)] 417 | mod tests { 418 | use super::*; 419 | use crate::requests::DummyTextGenerationBackend; 420 | use crate::requests::DummyTextRequestGenerator; 421 | use std::time::Duration; 422 | 423 | #[tokio::test] 424 | async fn test_sweep_benchmark_timings() { 425 | let generation_time = Duration::from_secs(2); 426 | let (event_tx, mut _event_rx) = tokio::sync::mpsc::unbounded_channel(); 427 | let (stop_sender, _) = tokio::sync::broadcast::channel(1); 428 | let backend = Box::new(DummyTextGenerationBackend::new(Duration::from_secs( 429 | generation_time.as_secs(), 430 | ))); 431 | let requests_generator = Arc::from(Mutex::from(DummyTextRequestGenerator::new())); 432 | let mut benchmark = Benchmark::new( 433 | BenchmarkConfig { 434 | max_vus: 100, 435 | duration: Duration::from_secs(10), 436 | benchmark_kind: BenchmarkKind::Sweep, 437 | warmup_duration: Duration::from_secs(1), 438 | rates: None, 439 | num_rates: 2, 440 | prompt_options: None, 441 | decode_options: None, 442 | tokenizer: "gpt2".to_string(), 443 | model_name: "gpt2".to_string(), 444 | profile: None, 445 | extra_metadata: None, 446 | run_id: "test".to_string(), 447 | }, 448 | backend, 449 | requests_generator, 
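// event sink (unbounded channel) and broadcast stop handle created above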
450 | event_tx, 451 | stop_sender, 452 | ); 453 | let report = benchmark.run().await.unwrap(); 454 | assert_eq!(report.get_results().len(), 4); 455 | let generation_time_per_token_milli = generation_time.as_millis() as i128 / 10; 456 | for result in report.get_results() { 457 | let delta_ttft = result.time_to_first_token_avg().unwrap().as_millis() as i128 458 | - generation_time_per_token_milli; // Dummy backends generates 10 tokens 459 | let delta_itl = result.inter_token_latency_avg().unwrap().as_millis() as i128 460 | - generation_time_per_token_milli; 461 | let delta_e2e = result.e2e_latency_avg().unwrap().as_millis() as i128 462 | - generation_time.as_millis() as i128; 463 | let allowed_error_ms = 3; // allow error margin for timing tests 464 | assert!( 465 | delta_ttft.abs() <= allowed_error_ms, 466 | "time_to_first_token_delta: {:?}, expected {:?}", 467 | delta_ttft.abs(), 468 | allowed_error_ms 469 | ); 470 | assert!( 471 | delta_itl.abs() <= allowed_error_ms, 472 | "inter_token_latency_delta: {:?}, expected {:?}", 473 | delta_itl.abs(), 474 | allowed_error_ms 475 | ); 476 | assert!( 477 | delta_e2e.abs() <= allowed_error_ms * 10, // Cumulative error for 10 tokens 478 | "e2e_latency_delta: {:?}, expected {:?}", 479 | delta_e2e.abs(), 480 | allowed_error_ms * 10 481 | ); 482 | } 483 | } 484 | } 485 | -------------------------------------------------------------------------------- /src/event.rs: -------------------------------------------------------------------------------- 1 | use crossterm::event; 2 | use crossterm::event::KeyEvent; 3 | use std::time::{Duration, Instant}; 4 | use tokio::sync::{broadcast, mpsc}; 5 | 6 | pub enum AppEvent { 7 | Tick, 8 | Key(KeyEvent), 9 | Resize, 10 | } 11 | 12 | pub async fn terminal_event_task( 13 | fps: u32, 14 | event_sender: mpsc::Sender, 15 | mut shutdown_receiver: broadcast::Receiver<()>, 16 | ) { 17 | // End task if a message is received on shutdown_receiver 18 | // _shutdown_guard_sender will be dropped once the task is finished 19 | tokio::select! 
{ 20 | _ = event_loop(fps, event_sender) => { 21 | }, 22 | _ = shutdown_receiver.recv() => {} 23 | } 24 | } 25 | 26 | async fn event_loop(fps: u32, event_sender: mpsc::Sender) { 27 | // Frame budget 28 | let per_frame = Duration::from_secs(1) / fps; 29 | 30 | // When was last frame executed 31 | let mut last_frame = Instant::now(); 32 | 33 | loop { 34 | // Sleep to avoid blocking the thread for too long 35 | if let Some(sleep) = per_frame.checked_sub(last_frame.elapsed()) { 36 | tokio::time::sleep(sleep).await; 37 | } 38 | 39 | // Get crossterm event and send a new one over the channel 40 | if event::poll(Duration::from_secs(0)).expect("no events available") { 41 | match event::read().expect("unable to read event") { 42 | event::Event::Key(e) => event_sender.send(AppEvent::Key(e)).await.unwrap_or(()), 43 | event::Event::Resize(_w, _h) => { 44 | event_sender.send(AppEvent::Resize).await.unwrap_or(()) 45 | } 46 | _ => (), 47 | } 48 | } 49 | 50 | // Frame budget exceeded 51 | if last_frame.elapsed() >= per_frame { 52 | // Send tick 53 | event_sender.send(AppEvent::Tick).await.unwrap_or(()); 54 | // Rest last_frame time 55 | last_frame = Instant::now(); 56 | } 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/executors.rs: -------------------------------------------------------------------------------- 1 | use std::sync::atomic::AtomicI64; 2 | use std::sync::Arc; 3 | use std::time::Duration; 4 | 5 | use async_trait::async_trait; 6 | use log::{info, trace, warn}; 7 | use serde::Serialize; 8 | use tokio::sync::mpsc::{Receiver, Sender, UnboundedSender}; 9 | use tokio::sync::{broadcast, Mutex}; 10 | use tokio::task::JoinHandle; 11 | 12 | use crate::requests::{ 13 | TextGenerationAggregatedResponse, TextGenerationBackend, TextGenerationRequest, 14 | TextRequestGenerator, 15 | }; 16 | 17 | #[serde_with::serde_as] 18 | #[derive(Clone, Serialize)] 19 | pub struct ExecutorConfig { 20 | pub max_vus: u64, 21 | #[serde(rename = "duration_secs")] 22 | #[serde_as(as = "serde_with::DurationSeconds")] 23 | pub duration: Duration, 24 | pub rate: Option, 25 | } 26 | 27 | #[async_trait] 28 | pub trait Executor { 29 | async fn run( 30 | &self, 31 | requests: Arc>, 32 | responses_tx: UnboundedSender, 33 | stop_sender: broadcast::Sender<()>, 34 | ); 35 | } 36 | 37 | pub struct ConstantVUsExecutor { 38 | config: ExecutorConfig, 39 | backend: Box, 40 | } 41 | 42 | impl ConstantVUsExecutor { 43 | pub fn new( 44 | backend: Box, 45 | max_vus: u64, 46 | duration: Duration, 47 | ) -> ConstantVUsExecutor { 48 | Self { 49 | backend, 50 | config: ExecutorConfig { 51 | max_vus, 52 | duration, 53 | rate: None, 54 | }, 55 | } 56 | } 57 | } 58 | 59 | #[async_trait] 60 | impl Executor for ConstantVUsExecutor { 61 | async fn run( 62 | &self, 63 | requests: Arc>, 64 | responses_tx: UnboundedSender, 65 | stop_sender: broadcast::Sender<()>, 66 | ) { 67 | let start = std::time::Instant::now(); 68 | // channel to handle ending VUs 69 | let (end_tx, mut end_rx): (Sender, Receiver) = 70 | tokio::sync::mpsc::channel(self.config.max_vus as usize); 71 | let active_vus = Arc::new(AtomicI64::new(0)); 72 | // start all VUs 73 | for _ in 0..self.config.max_vus { 74 | let mut requests_guard = requests.lock().await; 75 | let request = Arc::from(requests_guard.generate_request()); 76 | drop(requests_guard); 77 | start_vu( 78 | self.backend.clone(), 79 | request, 80 | responses_tx.clone(), 81 | end_tx.clone(), 82 | stop_sender.clone(), 83 | ) 84 | .await; 85 | active_vus.fetch_add(1, 
std::sync::atomic::Ordering::SeqCst); 86 | } 87 | let mut stop_receiver = stop_sender.subscribe(); 88 | tokio::select! { 89 | _ = stop_receiver.recv() => { 90 | return; 91 | }, 92 | _ = async { 93 | // replenish VUs as they finish 94 | while end_rx.recv().await.is_some() { 95 | active_vus.fetch_sub(1, std::sync::atomic::Ordering::SeqCst); 96 | if start.elapsed() > self.config.duration{ 97 | // signal that the VU work is done 98 | let _ = responses_tx.send(TextGenerationAggregatedResponse::new_as_ended()); 99 | info!("Duration reached, waiting for all VUs to finish..."); 100 | if active_vus.load(std::sync::atomic::Ordering::SeqCst) == 0 { 101 | break; 102 | } 103 | } else { 104 | let mut requests_guard = requests.lock().await; 105 | let request = Arc::from(requests_guard.generate_request()); 106 | drop(requests_guard); 107 | active_vus.fetch_add(1, std::sync::atomic::Ordering::SeqCst); 108 | start_vu(self.backend.clone(), request, responses_tx.clone(), end_tx.clone(), stop_sender.clone()).await; 109 | } 110 | } 111 | }=>{} 112 | } 113 | } 114 | } 115 | 116 | async fn start_vu( 117 | backend: Box, 118 | request: Arc, 119 | responses_tx: UnboundedSender, 120 | end_tx: Sender, 121 | stop_sender: broadcast::Sender<()>, 122 | ) -> JoinHandle<()> { 123 | let mut stop_receiver = stop_sender.subscribe(); 124 | tokio::spawn(async move { 125 | tokio::select! { 126 | _ = stop_receiver.recv() => { 127 | let _ = end_tx.send(true).await; 128 | }, 129 | _ = async{ 130 | let (tx, mut rx): (Sender, Receiver) = tokio::sync::mpsc::channel(1); 131 | trace!("VU started with request: {:?}", request); 132 | let req_thread = tokio::spawn(async move { 133 | backend.generate(request.clone(), tx).await; 134 | }); 135 | let send_thread = tokio::spawn(async move { 136 | while let Some(response) = rx.recv().await { 137 | // ignore errors, if the receiver is gone we want to finish the request 138 | // to leave remote server in clean state 139 | let _ = responses_tx.send(response); 140 | } 141 | }); 142 | req_thread.await.unwrap(); 143 | send_thread.await.unwrap(); 144 | // signal that the VU work is done 145 | let _ = end_tx.send(true).await; 146 | }=>{} 147 | } 148 | }) 149 | } 150 | 151 | pub struct ConstantArrivalRateExecutor { 152 | config: ExecutorConfig, 153 | backend: Box, 154 | } 155 | 156 | impl ConstantArrivalRateExecutor { 157 | pub fn new( 158 | backend: Box, 159 | max_vus: u64, 160 | duration: Duration, 161 | rate: f64, 162 | ) -> ConstantArrivalRateExecutor { 163 | Self { 164 | backend, 165 | config: ExecutorConfig { 166 | max_vus, 167 | duration, 168 | rate: Some(rate), 169 | }, 170 | } 171 | } 172 | } 173 | 174 | #[async_trait] 175 | impl Executor for ConstantArrivalRateExecutor { 176 | async fn run( 177 | &self, 178 | requests: Arc>, 179 | responses_tx: UnboundedSender, 180 | stop_sender: broadcast::Sender<()>, 181 | ) { 182 | let start = std::time::Instant::now(); 183 | let active_vus = Arc::new(AtomicI64::new(0)); 184 | // channel to handle ending VUs 185 | let (end_tx, mut end_rx): (Sender, Receiver) = 186 | tokio::sync::mpsc::channel(self.config.max_vus as usize); 187 | let rate = self.config.rate.expect("checked in new()"); 188 | // spawn new VUs every `tick_ms` to reach the expected `rate` per second, until the duration is reached 189 | let tick_ms = 10; 190 | let mut interval = tokio::time::interval(Duration::from_millis(tick_ms)); 191 | 192 | let backend = self.backend.clone(); 193 | let duration = self.config.duration; 194 | let max_vus = self.config.max_vus; 195 | let active_vus_thread = 
active_vus.clone(); 196 | let mut stop_receiver_signal = stop_sender.subscribe(); 197 | let vu_thread = tokio::spawn(async move { 198 | tokio::select! { 199 | _ = stop_receiver_signal.recv() => {}, 200 | _= async { 201 | let mut spawn_queue = 0.; // start with at least one VU 202 | while start.elapsed() < duration { 203 | spawn_queue += rate * (tick_ms as f64) / 1000.0; 204 | // delay spawning if we can't spawn a full VU yet 205 | if spawn_queue < 1.0 { 206 | interval.tick().await; 207 | continue; 208 | } 209 | // spawn VUs, keep track of the fraction of VU to spawn for the next iteration 210 | let to_spawn = spawn_queue.floor() as u64; 211 | spawn_queue -= to_spawn as f64; 212 | for _ in 0..to_spawn { 213 | if active_vus_thread.load(std::sync::atomic::Ordering::SeqCst) < max_vus as i64 { 214 | let mut requests_guard = requests.lock().await; 215 | let request = Arc::from(requests_guard.generate_request()); 216 | start_vu(backend.clone(), request.clone(), responses_tx.clone(), end_tx.clone(),stop_sender.clone()).await; 217 | active_vus_thread.fetch_add(1, std::sync::atomic::Ordering::SeqCst); 218 | } else { 219 | warn!("Max VUs reached, skipping request"); 220 | break; 221 | } 222 | } 223 | interval.tick().await; 224 | } 225 | // signal that the VU work is done 226 | info!("Duration reached, waiting for all VUs to finish..."); 227 | let _ = responses_tx.send(TextGenerationAggregatedResponse::new_as_ended()); 228 | }=>{} 229 | } 230 | }); 231 | while end_rx.recv().await.is_some() { 232 | active_vus.fetch_sub(1, std::sync::atomic::Ordering::SeqCst); 233 | // wait for all VUs to finish 234 | if start.elapsed() > self.config.duration 235 | && active_vus.load(std::sync::atomic::Ordering::SeqCst) == 0 236 | { 237 | break; 238 | } 239 | } 240 | // wait for the VU thread to finish 241 | vu_thread.await.unwrap(); 242 | } 243 | } 244 | -------------------------------------------------------------------------------- /src/flux.rs: -------------------------------------------------------------------------------- 1 | use crate::results::BenchmarkResults; 2 | use std::sync::{Arc, Mutex}; 3 | 4 | // Flux pattern 5 | #[derive(Clone)] 6 | pub struct Dispatcher { 7 | store: Arc>, 8 | } 9 | 10 | impl Dispatcher { 11 | pub(crate) fn new(store: Arc>) -> Self { 12 | Self { store } 13 | } 14 | pub(crate) fn dispatch(&mut self, action: Action) { 15 | self.store.lock().unwrap().update(action); 16 | } 17 | } 18 | 19 | #[derive(Clone)] 20 | pub struct AppState { 21 | pub(crate) messages: Vec, 22 | pub(crate) benchmarks: Vec, 23 | pub(crate) results: Vec, 24 | } 25 | 26 | impl AppState { 27 | fn new() -> Self { 28 | Self { 29 | messages: Vec::new(), 30 | benchmarks: Vec::new(), 31 | results: Vec::new(), 32 | } 33 | } 34 | } 35 | 36 | pub struct Store { 37 | state: AppState, 38 | } 39 | 40 | impl Store { 41 | pub(crate) fn new() -> Self { 42 | let state = AppState::new(); 43 | Self { state } 44 | } 45 | 46 | fn update(&mut self, action: Action) { 47 | match action { 48 | Action::LogMessage(message) => self.state.messages.push(message), 49 | Action::AddBenchmark(benchmark) => { 50 | // add or update benchmark 51 | let index = self 52 | .state 53 | .benchmarks 54 | .iter() 55 | .position(|b| b.id == benchmark.id); 56 | match index { 57 | Some(i) => { 58 | self.state.benchmarks[i] = benchmark; 59 | } 60 | None => { 61 | self.state.benchmarks.push(benchmark); 62 | } 63 | } 64 | } 65 | Action::AddBenchmarkResults(results) => { 66 | let index = self 67 | .state 68 | .results 69 | .iter_mut() 70 | .position(|b| b.id == 
results.id); 71 | match index { 72 | Some(i) => { 73 | self.state.results[i] = results; 74 | } 75 | None => { 76 | self.state.results.push(results); 77 | } 78 | } 79 | } 80 | } 81 | } 82 | 83 | pub(crate) fn get_state(&self) -> AppState { 84 | self.state.clone() 85 | } 86 | } 87 | 88 | pub enum Action { 89 | LogMessage(crate::app::LogMessageUI), 90 | AddBenchmark(crate::app::BenchmarkUI), 91 | AddBenchmarkResults(BenchmarkResults), 92 | } 93 | -------------------------------------------------------------------------------- /src/lib.rs: -------------------------------------------------------------------------------- 1 | use std::collections::HashMap; 2 | use std::fs::File; 3 | use std::io; 4 | use std::io::Write; 5 | use std::path::Path; 6 | use std::sync::Arc; 7 | 8 | pub use crate::app::run_console; 9 | pub use crate::benchmark::{BenchmarkConfig, BenchmarkKind}; 10 | use crate::benchmark::{Event, MessageEvent}; 11 | pub use crate::profiles::apply_profile; 12 | use crate::requests::OpenAITextGenerationBackend; 13 | pub use crate::requests::TokenizeOptions; 14 | use chrono::Local; 15 | use crossterm::ExecutableCommand; 16 | use log::{debug, error, info, warn, Level, LevelFilter}; 17 | use reqwest::Url; 18 | use tokenizers::{FromPretrainedParameters, Tokenizer}; 19 | use tokio::sync::broadcast::Sender; 20 | use tokio::sync::Mutex; 21 | use writers::BenchmarkReportWriter; 22 | 23 | mod app; 24 | mod benchmark; 25 | mod event; 26 | mod executors; 27 | mod flux; 28 | mod profiles; 29 | mod requests; 30 | mod results; 31 | mod scheduler; 32 | mod table; 33 | mod writers; 34 | 35 | pub struct RunConfiguration { 36 | pub url: Url, 37 | pub api_key: String, 38 | pub tokenizer_name: String, 39 | pub profile: Option, 40 | pub max_vus: u64, 41 | pub duration: std::time::Duration, 42 | pub rates: Option>, 43 | pub num_rates: u64, 44 | pub benchmark_kind: String, 45 | pub warmup_duration: std::time::Duration, 46 | pub interactive: bool, 47 | pub prompt_options: Option, 48 | pub decode_options: Option, 49 | pub dataset: String, 50 | pub dataset_file: String, 51 | pub hf_token: Option, 52 | pub extra_metadata: Option>, 53 | pub model_name: String, 54 | pub run_id: String, 55 | } 56 | 57 | pub async fn run(mut run_config: RunConfiguration, stop_sender: Sender<()>) -> anyhow::Result<()> { 58 | info!("Starting benchmark"); 59 | // set process system limits 60 | sysinfo::set_open_files_limit(0); 61 | // apply profile if needed 62 | run_config = match run_config.profile.clone() { 63 | None => run_config, 64 | Some(profile) => match apply_profile(profile.as_str(), run_config) { 65 | Ok(config) => { 66 | info!("Profile applied: {}", profile); 67 | config 68 | } 69 | Err(e) => { 70 | error!("Failed to apply profile: {:?}", e); 71 | return Err(e); 72 | } 73 | }, 74 | }; 75 | // initialize tokenizer 76 | let params = FromPretrainedParameters { 77 | token: run_config.hf_token.clone(), 78 | ..Default::default() 79 | }; 80 | let tokenizer = 81 | match Tokenizer::from_pretrained(run_config.tokenizer_name.clone(), Some(params)) { 82 | Ok(tokenizer) => tokenizer, 83 | Err(e) => { 84 | return Err(anyhow::anyhow!("Error loading tokenizer: {e}")); 85 | } 86 | }; 87 | let tokenizer = Arc::new(tokenizer); 88 | let backend = OpenAITextGenerationBackend::try_new( 89 | run_config.api_key, 90 | run_config.url, 91 | run_config.model_name.clone(), 92 | tokenizer, 93 | run_config.duration, 94 | )?; 95 | 96 | let config = BenchmarkConfig { 97 | max_vus: run_config.max_vus, 98 | duration: run_config.duration, 99 | benchmark_kind: 
match run_config.benchmark_kind.to_lowercase().as_str() { 100 | "throughput" => BenchmarkKind::Throughput, 101 | "sweep" => BenchmarkKind::Sweep, 102 | "rate" => BenchmarkKind::Rate, 103 | _ => BenchmarkKind::Sweep, 104 | }, 105 | warmup_duration: run_config.warmup_duration, 106 | rates: run_config.rates, 107 | num_rates: run_config.num_rates, 108 | prompt_options: run_config.prompt_options.clone(), 109 | decode_options: run_config.decode_options.clone(), 110 | tokenizer: run_config.tokenizer_name.clone(), 111 | model_name: run_config.model_name.clone(), 112 | profile: run_config.profile.clone(), 113 | extra_metadata: run_config.extra_metadata.clone(), 114 | run_id: run_config.run_id.clone(), 115 | }; 116 | config.validate()?; 117 | let (tx, mut rx) = tokio::sync::mpsc::unbounded_channel(); 118 | if run_config.interactive { 119 | // send logs to file 120 | let target = Box::new(File::create("log.txt").expect("Can't create file")); 121 | env_logger::Builder::new() 122 | .target(env_logger::Target::Pipe(target)) 123 | .filter(Some("inference_benchmarker"), LevelFilter::Debug) 124 | .format(|buf, record| { 125 | writeln!( 126 | buf, 127 | "[{} {} {}:{}] {}", 128 | Local::now().format("%Y-%m-%d %H:%M:%S%.3f"), 129 | record.level(), 130 | record.file().unwrap_or("unknown"), 131 | record.line().unwrap_or(0), 132 | record.args() 133 | ) 134 | }) 135 | .init(); 136 | } else { 137 | env_logger::init(); 138 | } 139 | let config_clone = config.clone(); 140 | let mut stop_receiver = stop_sender.subscribe(); 141 | let stop_sender_clone = stop_sender.clone(); 142 | let ui_thread = tokio::spawn(async move { 143 | tokio::select! { 144 | _ = stop_receiver.recv() => { 145 | debug!("Received stop signal, stopping benchmark"); 146 | } 147 | _ = async{ 148 | if run_config.interactive { 149 | run_console(config_clone, rx, stop_sender_clone).await; 150 | } else { 151 | // consume the channel to avoid closed channel error 152 | while rx.recv().await.is_some() {} 153 | } 154 | } => {} 155 | } 156 | }); 157 | 158 | // download prompts dataset 159 | info!("Downloading dataset"); 160 | let _ = tx.send(Event::Message(MessageEvent { 161 | message: "Downloading dataset".to_string(), 162 | timestamp: chrono::Utc::now(), 163 | level: Level::Info, 164 | })); 165 | let filepath = requests::ConversationTextRequestGenerator::download_dataset( 166 | run_config.dataset, 167 | run_config.dataset_file, 168 | run_config.hf_token.clone(), 169 | ) 170 | .expect("Can't download dataset"); 171 | let requests = requests::ConversationTextRequestGenerator::load( 172 | filepath, 173 | run_config.tokenizer_name.clone(), 174 | run_config.prompt_options, 175 | run_config.decode_options, 176 | run_config.hf_token, 177 | )?; 178 | 179 | let mut benchmark = benchmark::Benchmark::new( 180 | config.clone(), 181 | Box::new(backend), 182 | Arc::from(Mutex::from(requests)), 183 | tx.clone(), 184 | stop_sender.clone(), 185 | ); 186 | let mut stop_receiver = stop_sender.subscribe(); 187 | tokio::select! 
{ 188 | report = benchmark.run() => { 189 | match report { 190 | Ok(_) => { 191 | let report = benchmark.get_report(); 192 | let path = format!("results/{}_{}.json",run_config.tokenizer_name.replace("/","_").replace(".","_"), chrono::Utc::now().format("%Y-%m-%d-%H-%M-%S")); 193 | let path=Path::new(&path); 194 | let writer=BenchmarkReportWriter::try_new(config.clone(), report)?; 195 | writer.json(path).await?; 196 | info!("Report saved to {:?}",path); 197 | let _ = tx.send(Event::BenchmarkReportEnd(format!("{:?}", path))); 198 | }, 199 | Err(e) => { 200 | error!("Error running benchmark: {:?}", e.to_string()); 201 | let _ = tx.send(Event::BenchmarkError(e.to_string())); 202 | } 203 | }; 204 | } 205 | _ = stop_receiver.recv() => { 206 | debug!("Received stop signal, stopping benchmark"); 207 | } 208 | } 209 | info!("Benchmark finished"); 210 | if !run_config.interactive { 211 | // quit app if not interactive 212 | let _ = stop_sender.send(()); 213 | } 214 | ui_thread.await?; 215 | 216 | // Revert terminal to original view 217 | io::stdout().execute(ratatui::crossterm::terminal::LeaveAlternateScreen)?; 218 | ratatui::crossterm::terminal::disable_raw_mode()?; 219 | io::stdout().execute(ratatui::crossterm::cursor::Show)?; 220 | 221 | let report = benchmark.get_report(); 222 | match BenchmarkReportWriter::try_new(config.clone(), report) { 223 | Ok(writer) => { 224 | writer.stdout().await?; 225 | } 226 | Err(_) => { 227 | warn!("No results to report."); 228 | } 229 | }; 230 | 231 | Ok(()) 232 | } 233 | -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use clap::error::ErrorKind::InvalidValue; 2 | use clap::{ArgGroup, Error, Parser}; 3 | use inference_benchmarker::{run, RunConfiguration, TokenizeOptions}; 4 | use log::{debug, error}; 5 | use reqwest::Url; 6 | use std::collections::HashMap; 7 | use std::time::Duration; 8 | use tokio::sync::broadcast; 9 | 10 | #[derive(Parser, Debug)] 11 | #[clap(author, version, about, long_about = None, group(ArgGroup::new("group_profile").multiple(true)),group(ArgGroup::new("group_manual").multiple(true).conflicts_with("group_profile")) 12 | )] 13 | struct Args { 14 | /// The name of the tokenizer to use 15 | #[clap(short, long, env)] 16 | tokenizer_name: String, 17 | 18 | /// The name of the model to use. If not provided, the same name as the tokenizer will be used. 19 | #[clap(long, env)] 20 | model_name: Option, 21 | 22 | /// The maximum number of virtual users to use 23 | #[clap(default_value = "128", short, long, env, group = "group_manual")] 24 | max_vus: u64, 25 | /// The duration of each benchmark step 26 | #[clap(default_value = "120s", short, long, env, group = "group_manual")] 27 | #[arg(value_parser = parse_duration)] 28 | duration: Duration, 29 | /// A list of rates of requests to send per second (only valid for the ConstantArrivalRate benchmark). 
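// Illustrative example (not from this file): `rates` is a `Vec` field under clap's derive API, so each
// rate is passed as a separate occurrence of the flag, e.g.
//
//     inference-benchmarker --benchmark-kind rate --rates 1.0 --rates 5.0 --rates 10.0 ...
//
// which would parse into `rates == Some(vec![1.0, 5.0, 10.0])`. Flag names follow the field names
// declared below; the numeric values are only an example.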
30 | #[clap(short, long, env)] 31 | rates: Option>, 32 | /// The number of rates to sweep through (only valid for the "sweep" benchmark) 33 | /// The rates will be linearly spaced up to the detected maximum rate 34 | #[clap(default_value = "10", long, env)] 35 | num_rates: u64, 36 | /// A benchmark profile to use 37 | #[clap(long, env, group = "group_profile")] 38 | profile: Option, 39 | /// The kind of benchmark to run (throughput, sweep, rate) 40 | #[clap(default_value = "sweep", short, long, env, group = "group_manual")] 41 | benchmark_kind: String, 42 | /// The duration of the prewarm step run before the benchmark to warm up the backend (JIT, caches, etc.) 43 | #[clap(default_value = "30s", short, long, env, group = "group_manual")] 44 | #[arg(value_parser = parse_duration)] 45 | warmup: Duration, 46 | /// The URL of the backend to benchmark. Must be compatible with the OpenAI Message API 47 | #[clap(default_value = "http://localhost:8000", short, long, env)] 48 | url: Url, 49 | 50 | /// The api key sent to the [`url`] as the header "Authorization: Bearer {API_KEY}". 51 | #[clap(default_value = "", short, long, env)] 52 | api_key: String, 53 | 54 | /// Disable console UI 55 | #[clap(short, long, env)] 56 | no_console: bool, 57 | /// Constraints for prompt length. 58 | /// No value means use the input prompt as defined in the input dataset. 59 | /// We sample the number of tokens to generate from a normal distribution. 60 | /// Specified as a comma-separated list of key=value pairs. 61 | /// * num_tokens: target number of prompt tokens 62 | /// * min_tokens: minimum number of prompt tokens 63 | /// * max_tokens: maximum number of prompt tokens 64 | /// * variance: variance in the number of prompt tokens 65 | /// 66 | /// Example: num_tokens=200,max_tokens=210,min_tokens=190,variance=10 67 | #[clap( 68 | long, 69 | env, 70 | value_parser(parse_tokenizer_options), 71 | group = "group_manual" 72 | )] 73 | prompt_options: Option, 74 | /// Constraints for the generated text. 75 | /// We sample the number of tokens to generate from a normal distribution. 76 | /// Specified as a comma-separated list of key=value pairs. 77 | /// * num_tokens: target number of generated tokens 78 | /// * min_tokens: minimum number of generated tokens 79 | /// * max_tokens: maximum number of generated tokens 80 | /// * variance: variance in the number of generated tokens 81 | /// 82 | /// Example: num_tokens=200,max_tokens=210,min_tokens=190,variance=10 83 | #[clap( 84 | long, 85 | env, 86 | value_parser(parse_tokenizer_options), 87 | group = "group_manual" 88 | )] 89 | decode_options: Option, 90 | /// Hugging Face dataset to use for prompt generation 91 | #[clap( 92 | default_value = "hlarcher/inference-benchmarker", 93 | long, 94 | env, 95 | group = "group_manual" 96 | )] 97 | dataset: String, 98 | /// File to use in the dataset 99 | #[clap( 100 | default_value = "share_gpt_filtered_small.json", 101 | long, 102 | env, 103 | group = "group_manual" 104 | )] 105 | dataset_file: String, 106 | /// Extra metadata to include in the benchmark results file, comma-separated key-value pairs. 107 | /// It can be, for example, used to include information about the configuration of the 108 | /// benched server. 109 | /// Example: --extra-meta "key1=value1,key2=value2" 110 | #[clap(long, env, value_parser(parse_key_val))] 111 | extra_meta: Option>, 112 | // A run identifier to use for the benchmark. This is used to identify the benchmark in the 113 | // results file. 
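// Illustrative example (not from this file): the constraint strings documented above are parsed by
// `parse_tokenizer_options` (defined further below). Assuming the field types match the profiles in
// src/profiles.rs, the documented example would behave roughly like this:
//
//     let opts = parse_tokenizer_options("num_tokens=200,max_tokens=210,min_tokens=190,variance=10").unwrap();
//     assert_eq!(opts.num_tokens, Some(200));
//     assert_eq!((opts.min_tokens, opts.max_tokens, opts.variance), (190, 210, 10));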
114 | #[clap(long, env)] 115 | run_id: Option, 116 | } 117 | 118 | fn parse_duration(s: &str) -> Result { 119 | humantime::parse_duration(s).map_err(|_| Error::new(InvalidValue)) 120 | } 121 | 122 | fn parse_key_val(s: &str) -> Result, Error> { 123 | let mut key_val_map = HashMap::new(); 124 | let items = s.split(",").collect::>(); 125 | for item in items.iter() { 126 | let key_value = item.split("=").collect::>(); 127 | if key_value.len() % 2 != 0 { 128 | return Err(Error::new(InvalidValue)); 129 | } 130 | for i in 0..key_value.len() / 2 { 131 | key_val_map.insert( 132 | key_value[i * 2].to_string(), 133 | key_value[i * 2 + 1].to_string(), 134 | ); 135 | } 136 | } 137 | Ok(key_val_map) 138 | } 139 | 140 | fn parse_tokenizer_options(s: &str) -> Result { 141 | let mut tokenizer_options = TokenizeOptions::new(); 142 | let items = s.split(",").collect::>(); 143 | for item in items.iter() { 144 | let key_value = item.split("=").collect::>(); 145 | if key_value.len() != 2 { 146 | return Err(Error::new(InvalidValue)); 147 | } 148 | match key_value[0] { 149 | "num_tokens" => { 150 | tokenizer_options.num_tokens = Some(key_value[1].parse::().unwrap()) 151 | } 152 | "min_tokens" => tokenizer_options.min_tokens = key_value[1].parse::().unwrap(), 153 | "max_tokens" => tokenizer_options.max_tokens = key_value[1].parse::().unwrap(), 154 | "variance" => tokenizer_options.variance = key_value[1].parse::().unwrap(), 155 | _ => return Err(Error::new(InvalidValue)), 156 | } 157 | } 158 | if tokenizer_options.num_tokens.is_some() 159 | && (tokenizer_options.num_tokens.unwrap() == 0 160 | || tokenizer_options.min_tokens == 0 161 | || tokenizer_options.max_tokens == 0) 162 | { 163 | return Err(Error::new(InvalidValue)); 164 | } 165 | if tokenizer_options.min_tokens > tokenizer_options.max_tokens { 166 | return Err(Error::new(InvalidValue)); 167 | } 168 | Ok(tokenizer_options) 169 | } 170 | 171 | #[tokio::main] 172 | async fn main() { 173 | let args = Args::parse(); 174 | let git_sha = option_env!("VERGEN_GIT_SHA").unwrap_or("unknown"); 175 | println!( 176 | "Text Generation Inference Benchmark {} ({})", 177 | env!("CARGO_PKG_VERSION"), 178 | git_sha 179 | ); 180 | 181 | let (stop_sender, _) = broadcast::channel(1); 182 | // handle ctrl-c 183 | let stop_sender_clone = stop_sender.clone(); 184 | tokio::spawn(async move { 185 | tokio::signal::ctrl_c() 186 | .await 187 | .expect("Failed to listen for ctrl-c"); 188 | debug!("Received stop signal, stopping benchmark"); 189 | stop_sender_clone 190 | .send(()) 191 | .expect("Failed to send stop signal"); 192 | }); 193 | 194 | let stop_sender_clone = stop_sender.clone(); 195 | // get HF token 196 | let token_env_key = "HF_TOKEN".to_string(); 197 | let cache = hf_hub::Cache::from_env(); 198 | let hf_token = match std::env::var(token_env_key).ok() { 199 | Some(token) => Some(token), 200 | None => cache.token(), 201 | }; 202 | let model_name = args 203 | .model_name 204 | .clone() 205 | .unwrap_or(args.tokenizer_name.clone()); 206 | let run_id = args 207 | .run_id 208 | .unwrap_or(uuid::Uuid::new_v4().to_string()[..7].to_string()); 209 | let run_config = RunConfiguration { 210 | url: args.url, 211 | api_key: args.api_key, 212 | profile: args.profile.clone(), 213 | tokenizer_name: args.tokenizer_name.clone(), 214 | max_vus: args.max_vus, 215 | duration: args.duration, 216 | rates: args.rates, 217 | num_rates: args.num_rates, 218 | benchmark_kind: args.benchmark_kind.clone(), 219 | warmup_duration: args.warmup, 220 | interactive: !args.no_console, 221 | prompt_options: 
args.prompt_options.clone(), 222 | decode_options: args.decode_options.clone(), 223 | dataset: args.dataset.clone(), 224 | dataset_file: args.dataset_file.clone(), 225 | extra_metadata: args.extra_meta.clone(), 226 | hf_token, 227 | model_name, 228 | run_id, 229 | }; 230 | let main_thread = tokio::spawn(async move { 231 | match run(run_config, stop_sender_clone).await { 232 | Ok(_) => {} 233 | Err(e) => { 234 | error!("Fatal: {:?}", e); 235 | println!("Fatal: {:?}", e) 236 | } 237 | }; 238 | }); 239 | let _ = main_thread.await; 240 | } 241 | -------------------------------------------------------------------------------- /src/profiles.rs: -------------------------------------------------------------------------------- 1 | use crate::{RunConfiguration, TokenizeOptions}; 2 | use std::string::ToString; 3 | 4 | pub fn apply_profile( 5 | profile: &str, 6 | run_configuration: RunConfiguration, 7 | ) -> anyhow::Result { 8 | match profile { 9 | "fixed-length" => Ok(RunConfiguration { 10 | max_vus: 128, 11 | duration: std::time::Duration::from_secs(120), 12 | rates: None, 13 | num_rates: 10, 14 | benchmark_kind: "sweep".to_string(), 15 | warmup_duration: std::time::Duration::from_secs(30), 16 | prompt_options: Some(TokenizeOptions { 17 | num_tokens: Some(200), 18 | min_tokens: 200, 19 | max_tokens: 200, 20 | variance: 0, 21 | }), 22 | decode_options: Some(TokenizeOptions { 23 | num_tokens: Some(800), 24 | min_tokens: 50, 25 | max_tokens: 800, 26 | variance: 100, 27 | }), 28 | dataset: "hlarcher/inference-benchmarker".to_string(), 29 | dataset_file: "share_gpt_0_turns.json".to_string(), 30 | ..run_configuration 31 | }), 32 | "chat" => { 33 | // Simulates a multi-turn chat scenario 34 | // in which the model answers to successive user prompts. 35 | // The model is prompted with the whole conversation history 36 | // at each turn. Prefix caching will have a significant impact 37 | // on the performance of this benchmark. 38 | Ok(RunConfiguration { 39 | max_vus: 128, 40 | duration: std::time::Duration::from_secs(120), 41 | rates: None, 42 | num_rates: 10, 43 | benchmark_kind: "sweep".to_string(), 44 | warmup_duration: std::time::Duration::from_secs(30), 45 | prompt_options: None, // use prompts from dataset 46 | decode_options: Some(TokenizeOptions { 47 | num_tokens: Some(800), // decode up to 800 tokens 48 | min_tokens: 50, 49 | max_tokens: 800, 50 | variance: 100, 51 | }), 52 | dataset: "hlarcher/inference-benchmarker".to_string(), 53 | dataset_file: "share_gpt_turns.json".to_string(), 54 | ..run_configuration 55 | }) 56 | } 57 | "code-generation" => { 58 | // Simulates code-complete scenarios. Model is given large code snippets and 59 | // asked to complete them with a few tokens (e.g. a function name, a few code lines). 
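// Illustrative example (not from this file): a profile is selected with the `--profile` flag, which clap
// forbids combining with the manual tuning flags (the "group_profile" and "group_manual" argument groups
// are declared as conflicting in src/main.rs), e.g.
//
//     inference-benchmarker --profile code-generation --tokenizer-name <tokenizer> --url http://localhost:8000
//
// Fields the profile does not set (url, tokenizer, api key, run id, ...) are kept from the incoming
// configuration through the struct update syntax (`..run_configuration`) used below.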
60 | Ok(RunConfiguration { 61 | max_vus: 128, 62 | duration: std::time::Duration::from_secs(120), 63 | rates: None, 64 | num_rates: 10, 65 | benchmark_kind: "sweep".to_string(), 66 | warmup_duration: std::time::Duration::from_secs(30), 67 | prompt_options: Some(TokenizeOptions { 68 | num_tokens: Some(4096), 69 | min_tokens: 3000, 70 | max_tokens: 6000, 71 | variance: 1000, 72 | }), 73 | decode_options: Some(TokenizeOptions { 74 | num_tokens: Some(50), 75 | min_tokens: 30, 76 | max_tokens: 80, 77 | variance: 10, 78 | }), 79 | dataset: "hlarcher/inference-benchmarker".to_string(), 80 | dataset_file: "github_code.json".to_string(), 81 | ..run_configuration 82 | }) 83 | } 84 | "classification" => { 85 | // Simulates cases where the model is fed with large chunks 86 | // of business data or document repeatedly and users 87 | // ask simple questions about the content (summarization, classification...) 88 | // Those use cases benefit a lot from prefix caching and chunked prefill. 89 | Ok(RunConfiguration { 90 | max_vus: 128, 91 | duration: std::time::Duration::from_secs(120), 92 | rates: None, 93 | num_rates: 10, 94 | benchmark_kind: "sweep".to_string(), 95 | warmup_duration: std::time::Duration::from_secs(30), 96 | prompt_options: Some(TokenizeOptions { 97 | num_tokens: Some(10000), 98 | min_tokens: 8000, 99 | max_tokens: 12000, 100 | variance: 5000, 101 | }), 102 | decode_options: Some(TokenizeOptions { 103 | num_tokens: Some(50), 104 | min_tokens: 30, 105 | max_tokens: 80, 106 | variance: 10, 107 | }), 108 | dataset: "hlarcher/inference-benchmarker".to_string(), 109 | dataset_file: "classification.json".to_string(), 110 | ..run_configuration 111 | }) 112 | } 113 | _ => Err(anyhow::anyhow!("Unknown profile: {}", profile)), 114 | } 115 | } 116 | -------------------------------------------------------------------------------- /src/results.rs: -------------------------------------------------------------------------------- 1 | use crate::executors::ExecutorConfig; 2 | use crate::requests::TextGenerationAggregatedResponse; 3 | use crate::results::BenchmarkErrors::NoResponses; 4 | use crate::scheduler::ExecutorType; 5 | use chrono::Utc; 6 | use std::fmt::{Debug, Display, Formatter}; 7 | use std::time::Duration; 8 | 9 | #[derive(Debug)] 10 | pub(crate) enum BenchmarkErrors { 11 | NoResponses, 12 | } 13 | 14 | impl Display for BenchmarkErrors { 15 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 16 | match self { 17 | NoResponses => write!(f, "Backend did not return any valid response. 
It is either not responding or test duration is too short."), 18 | } 19 | } 20 | } 21 | 22 | #[derive(Clone)] 23 | pub struct BenchmarkResults { 24 | pub id: String, 25 | aggregated_responses: Vec, 26 | executor_type: ExecutorType, 27 | executor_config: ExecutorConfig, 28 | } 29 | 30 | impl BenchmarkResults { 31 | pub fn new( 32 | id: String, 33 | executor_type: ExecutorType, 34 | executor_config: ExecutorConfig, 35 | ) -> BenchmarkResults { 36 | BenchmarkResults { 37 | id, 38 | aggregated_responses: Vec::new(), 39 | executor_type, 40 | executor_config, 41 | } 42 | } 43 | 44 | pub fn add_response(&mut self, response: TextGenerationAggregatedResponse) { 45 | self.aggregated_responses.push(response); 46 | } 47 | 48 | pub fn total_requests(&self) -> usize { 49 | self.aggregated_responses.len() 50 | } 51 | 52 | pub fn start_time(&self) -> Option { 53 | self.aggregated_responses 54 | .first() 55 | .and_then(|response| response.start_time) 56 | } 57 | 58 | pub fn end_time(&self) -> Option { 59 | self.aggregated_responses 60 | .last() 61 | .and_then(|response| response.end_time) 62 | } 63 | 64 | fn is_ready(&self) -> bool { 65 | self.start_time().is_some() && self.end_time().is_some() 66 | } 67 | 68 | pub fn failed_requests(&self) -> usize { 69 | self.aggregated_responses 70 | .iter() 71 | .filter(|response| response.failed) 72 | .count() 73 | } 74 | 75 | pub fn successful_requests(&self) -> usize { 76 | self.aggregated_responses 77 | .iter() 78 | .filter(|response| !response.failed) 79 | .count() 80 | } 81 | 82 | pub fn token_throughput_secs(&self) -> anyhow::Result { 83 | if self.is_ready() { 84 | let total_tokens: u64 = self.total_tokens(); 85 | Ok(total_tokens as f64 / self.duration().unwrap_or_default().as_secs_f64()) 86 | } else { 87 | Err(anyhow::anyhow!(NoResponses)) 88 | } 89 | } 90 | 91 | pub fn total_tokens_sent(&self) -> u64 { 92 | self.get_successful_responses() 93 | .iter() 94 | .map(|response| response.request.clone().unwrap().num_prompt_tokens) 95 | .sum() 96 | } 97 | 98 | pub fn total_prompt_tokens(&self) -> u64 { 99 | self.get_successful_responses() 100 | .iter() 101 | .map(|response| response.request.clone().unwrap().num_prompt_tokens) 102 | .sum() 103 | } 104 | 105 | pub fn prompt_tokens_avg(&self) -> anyhow::Result { 106 | if self.is_ready() { 107 | let total_prompt_tokens = self.total_prompt_tokens(); 108 | Ok(total_prompt_tokens as f64 / self.successful_requests() as f64) 109 | } else { 110 | Err(anyhow::anyhow!(NoResponses)) 111 | } 112 | } 113 | 114 | pub fn successful_request_rate(&self) -> anyhow::Result { 115 | if self.is_ready() { 116 | let total_requests = self.successful_requests(); 117 | Ok(total_requests as f64 / self.duration().unwrap_or_default().as_secs_f64()) 118 | } else { 119 | Err(anyhow::anyhow!(NoResponses)) 120 | } 121 | } 122 | 123 | pub fn total_tokens(&self) -> u64 { 124 | self.get_successful_responses() 125 | .iter() 126 | .map(|response| response.num_generated_tokens) 127 | .sum() 128 | } 129 | 130 | pub fn duration(&self) -> anyhow::Result { 131 | if self.is_ready() { 132 | Ok(self 133 | .end_time() 134 | .unwrap() 135 | .duration_since(self.start_time().unwrap())) 136 | } else { 137 | Err(anyhow::anyhow!(NoResponses)) 138 | } 139 | } 140 | 141 | pub fn e2e_latency_avg(&self) -> anyhow::Result { 142 | if self.is_ready() { 143 | if self.successful_requests() == 0 { 144 | return Ok(Duration::from_secs(0)); 145 | } 146 | Ok(self 147 | .get_successful_responses() 148 | .iter() 149 | .map(|response| response.e2e_latency().unwrap_or_default()) 150 | 
.sum::() 151 | / self.successful_requests() as u32) 152 | } else { 153 | Err(anyhow::anyhow!(NoResponses)) 154 | } 155 | } 156 | 157 | pub fn e2e_latency_percentile(&self, percentile: f64) -> anyhow::Result { 158 | let quantile = self.quantile_duration( 159 | self.get_successful_responses() 160 | .iter() 161 | .map(|response| response.e2e_latency().unwrap_or_default()) 162 | .collect(), 163 | percentile, 164 | )?; 165 | Ok(Duration::from_secs_f64(quantile)) 166 | } 167 | 168 | pub fn time_to_first_token_avg(&self) -> anyhow::Result { 169 | if self.is_ready() { 170 | if self.successful_requests() == 0 { 171 | return Ok(Duration::from_secs(0)); 172 | } 173 | Ok(self 174 | .get_successful_responses() 175 | .iter() 176 | .map(|response| response.time_to_first_token().unwrap_or_default()) 177 | .sum::() 178 | / self.successful_requests() as u32) 179 | } else { 180 | Err(anyhow::anyhow!(NoResponses)) 181 | } 182 | } 183 | 184 | pub fn time_to_first_token_percentile(&self, percentile: f64) -> anyhow::Result { 185 | let quantile = self.quantile_duration( 186 | self.get_successful_responses() 187 | .iter() 188 | .map(|response| response.time_to_first_token().unwrap_or_default()) 189 | .collect(), 190 | percentile, 191 | )?; 192 | Ok(Duration::from_secs_f64(quantile)) 193 | } 194 | 195 | pub fn inter_token_latency_avg(&self) -> anyhow::Result { 196 | if self.is_ready() { 197 | if self.successful_requests() == 0 { 198 | return Ok(Duration::from_secs(0)); 199 | } 200 | Ok(self 201 | .get_successful_responses() 202 | .iter() 203 | .map(|response| response.inter_token_latency().unwrap_or_default()) 204 | .sum::() 205 | / self.successful_requests() as u32) 206 | } else { 207 | Err(anyhow::anyhow!(NoResponses)) 208 | } 209 | } 210 | 211 | pub fn inter_token_latency_percentile(&self, percentile: f64) -> anyhow::Result { 212 | let quantile = self.quantile_duration( 213 | self.get_successful_responses() 214 | .iter() 215 | .map(|response| response.inter_token_latency().unwrap_or_default()) 216 | .collect(), 217 | percentile, 218 | )?; 219 | Ok(Duration::from_secs_f64(quantile)) 220 | } 221 | 222 | pub fn executor_type(&self) -> ExecutorType { 223 | self.executor_type.clone() 224 | } 225 | 226 | pub fn executor_config(&self) -> ExecutorConfig { 227 | self.executor_config.clone() 228 | } 229 | 230 | fn get_successful_responses(&self) -> Vec<&TextGenerationAggregatedResponse> { 231 | self.aggregated_responses 232 | .iter() 233 | .filter(|response| !response.failed) 234 | .collect() 235 | } 236 | 237 | pub fn get_responses(&self) -> Vec { 238 | self.aggregated_responses.clone() 239 | } 240 | 241 | /// Calculate the quantile of a given data set using interpolation method 242 | /// Results are similar to `numpy.percentile` 243 | fn quantile_duration(&self, mut data: Vec, quantile: f64) -> anyhow::Result { 244 | if self.is_ready() { 245 | data.sort(); 246 | let i = (quantile * (data.len() - 1) as f64).floor(); 247 | let delta = (data.len() - 1) as f64 * quantile - i; 248 | if i as usize >= data.len() { 249 | return Err(anyhow::anyhow!(NoResponses)); 250 | } 251 | let quantile = (1. 
- delta) * data[i as usize].as_secs_f64() 252 | + delta * data[i as usize + 1].as_secs_f64(); 253 | Ok(quantile) 254 | } else { 255 | Err(anyhow::anyhow!(NoResponses)) 256 | } 257 | } 258 | } 259 | 260 | impl Debug for BenchmarkResults { 261 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { 262 | f.debug_struct("BenchmarkResult") 263 | .field("id", &self.id) 264 | .field("executor_type", &self.executor_type.to_string()) 265 | .field("total_requests", &self.total_requests()) 266 | .field("start_time", &self.start_time()) 267 | .field("end_time", &self.end_time()) 268 | .field("total_tokens", &self.total_tokens()) 269 | .field( 270 | "token_throughput_secs", 271 | &self 272 | .token_throughput_secs() 273 | .or::>(Ok(-1.0)), 274 | ) 275 | .field( 276 | "duration_ms", 277 | &self 278 | .duration() 279 | .or::>(Ok(Duration::from_secs(0))), 280 | ) 281 | .field( 282 | "average_time_to_first_token", 283 | &self 284 | .time_to_first_token_avg() 285 | .or::>(Ok(Duration::from_secs(0))), 286 | ) 287 | .field( 288 | "average_inter_token_latency", 289 | &self 290 | .inter_token_latency_avg() 291 | .or::>(Ok(Duration::from_secs(0))), 292 | ) 293 | .field("failed_requests", &self.failed_requests()) 294 | .field("successful_requests", &self.successful_requests()) 295 | .field( 296 | "request_rate", 297 | &self 298 | .successful_request_rate() 299 | .or::>(Ok(-1.0)), 300 | ) 301 | .field("sent_prompt_tokens", &self.total_tokens_sent()) 302 | .field( 303 | "e2e_latency_avg", 304 | &self 305 | .e2e_latency_avg() 306 | .or::>(Ok(Duration::from_secs(0))), 307 | ) 308 | .finish() 309 | } 310 | } 311 | 312 | #[derive(Debug, Clone)] 313 | pub struct BenchmarkReport { 314 | results: Vec, 315 | start_time: Option>, 316 | end_time: Option>, 317 | } 318 | 319 | impl BenchmarkReport { 320 | pub fn new() -> BenchmarkReport { 321 | BenchmarkReport { 322 | results: Vec::new(), 323 | start_time: None, 324 | end_time: None, 325 | } 326 | } 327 | 328 | pub fn start(&mut self) { 329 | self.start_time = Some(Utc::now()); 330 | } 331 | 332 | pub fn end(&mut self) { 333 | self.end_time = Some(Utc::now()); 334 | } 335 | 336 | pub fn add_benchmark_result(&mut self, result: BenchmarkResults) { 337 | self.results.push(result); 338 | } 339 | 340 | pub fn get_results(&self) -> Vec { 341 | self.results.clone() 342 | } 343 | 344 | pub fn start_time(&self) -> Option> { 345 | self.start_time 346 | } 347 | 348 | pub fn end_time(&self) -> Option> { 349 | self.end_time 350 | } 351 | } 352 | 353 | #[cfg(test)] 354 | mod test { 355 | use super::*; 356 | use crate::requests::TextGenerationRequest; 357 | use std::sync::Arc; 358 | #[test] 359 | fn test_time_to_first_token_percentile() { 360 | let request = Arc::from(TextGenerationRequest { 361 | id: None, 362 | prompt: "test".to_string(), 363 | num_prompt_tokens: 10, 364 | num_decode_tokens: None, 365 | }); 366 | let mut response1 = TextGenerationAggregatedResponse::new(request.clone()); 367 | response1.start_time = Some(tokio::time::Instant::now()); 368 | response1.end_time = 369 | Some(tokio::time::Instant::now() + tokio::time::Duration::from_millis(100)); 370 | response1.num_generated_tokens = 100; 371 | response1.failed = false; 372 | response1.times_to_tokens = vec![ 373 | Duration::from_millis(100), 374 | Duration::from_millis(200), 375 | Duration::from_millis(300), 376 | Duration::from_millis(400), 377 | Duration::from_millis(500), 378 | ]; 379 | 380 | let mut response2 = TextGenerationAggregatedResponse::new(request.clone()); 381 | response2.start_time = 
Some(tokio::time::Instant::now()); 382 | response2.end_time = 383 | Some(tokio::time::Instant::now() + tokio::time::Duration::from_millis(200)); 384 | response2.num_generated_tokens = 100; 385 | response2.failed = false; 386 | response2.times_to_tokens = vec![ 387 | Duration::from_millis(600), 388 | Duration::from_millis(700), 389 | Duration::from_millis(800), 390 | Duration::from_millis(900), 391 | Duration::from_millis(1000), 392 | ]; 393 | 394 | let mut response3 = TextGenerationAggregatedResponse::new(request.clone()); 395 | response3.start_time = Some(tokio::time::Instant::now()); 396 | response3.end_time = 397 | Some(tokio::time::Instant::now() + tokio::time::Duration::from_millis(300)); 398 | response3.num_generated_tokens = 100; 399 | response3.failed = false; 400 | response3.times_to_tokens = vec![ 401 | Duration::from_millis(1100), 402 | Duration::from_millis(1200), 403 | Duration::from_millis(1300), 404 | Duration::from_millis(1400), 405 | Duration::from_millis(1500), 406 | ]; 407 | 408 | let mut response4 = TextGenerationAggregatedResponse::new(request.clone()); 409 | response4.start_time = Some(tokio::time::Instant::now()); 410 | response4.end_time = 411 | Some(tokio::time::Instant::now() + tokio::time::Duration::from_millis(300)); 412 | response4.num_generated_tokens = 100; 413 | response4.failed = false; 414 | response4.times_to_tokens = vec![ 415 | Duration::from_millis(1600), 416 | Duration::from_millis(1700), 417 | Duration::from_millis(1800), 418 | Duration::from_millis(1900), 419 | Duration::from_millis(2000), 420 | ]; 421 | 422 | let mut results = BenchmarkResults::new( 423 | "test".to_string(), 424 | ExecutorType::ConstantArrivalRate, 425 | ExecutorConfig { 426 | max_vus: 0, 427 | duration: Default::default(), 428 | rate: None, 429 | }, 430 | ); 431 | results.add_response(response1); 432 | results.add_response(response2); 433 | results.add_response(response3); 434 | results.add_response(response4); 435 | 436 | assert_eq!( 437 | results.time_to_first_token_percentile(0.9).unwrap(), 438 | Duration::from_millis(1450) 439 | ); 440 | assert_eq!( 441 | results.time_to_first_token_percentile(0.5).unwrap(), 442 | Duration::from_millis(850) 443 | ); 444 | } 445 | } 446 | -------------------------------------------------------------------------------- /src/scheduler.rs: -------------------------------------------------------------------------------- 1 | use crate::executors::{ 2 | ConstantArrivalRateExecutor, ConstantVUsExecutor, Executor, ExecutorConfig, 3 | }; 4 | use crate::requests::{ 5 | TextGenerationAggregatedResponse, TextGenerationBackend, TextRequestGenerator, 6 | }; 7 | use crate::results::BenchmarkErrors::NoResponses; 8 | use crate::results::BenchmarkResults; 9 | use log::{debug, trace, warn}; 10 | use std::sync::Arc; 11 | use tokio::sync::mpsc::{Sender, UnboundedReceiver, UnboundedSender}; 12 | use tokio::sync::{broadcast, Mutex}; 13 | 14 | #[derive(Clone, strum_macros::Display)] 15 | pub enum ExecutorType { 16 | ConstantVUs, 17 | ConstantArrivalRate, 18 | } 19 | 20 | pub struct Scheduler { 21 | id: String, 22 | executor: Arc>, 23 | requests_generator: Arc>, 24 | results: Arc>, 25 | progress_tx: Sender>, 26 | stop_sender: broadcast::Sender<()>, 27 | } 28 | 29 | pub struct SchedulerProgress { 30 | pub progress: f64, 31 | pub requests_throughput: f64, 32 | pub successful_requests: u64, 33 | pub failed_requests: u64, 34 | } 35 | 36 | impl Scheduler { 37 | pub fn new( 38 | id: String, 39 | backend: Box, 40 | executor_type: ExecutorType, 41 | config: ExecutorConfig, 42 
| requests_generator: Arc>, 43 | progress_tx: Sender>, 44 | stop_sender: broadcast::Sender<()>, 45 | ) -> Scheduler { 46 | match executor_type { 47 | ExecutorType::ConstantVUs => Scheduler { 48 | id: id.clone(), 49 | executor: Arc::from(Mutex::from(ConstantVUsExecutor::new( 50 | backend.clone(), 51 | config.max_vus, 52 | config.duration, 53 | ))), 54 | results: Arc::from(Mutex::from(BenchmarkResults::new( 55 | id.clone(), 56 | ExecutorType::ConstantVUs, 57 | config, 58 | ))), 59 | requests_generator, 60 | progress_tx, 61 | stop_sender, 62 | }, 63 | ExecutorType::ConstantArrivalRate => { 64 | if config.rate.is_none() { 65 | panic!("Rate must be specified for ConstantArrivalRateExecutor"); 66 | } 67 | let rate = config.rate.unwrap(); 68 | Scheduler { 69 | id: id.clone(), 70 | executor: Arc::from(Mutex::from(ConstantArrivalRateExecutor::new( 71 | backend.clone(), 72 | config.max_vus, 73 | config.duration, 74 | rate, 75 | ))), 76 | results: Arc::from(Mutex::from(BenchmarkResults::new( 77 | id.clone(), 78 | ExecutorType::ConstantArrivalRate, 79 | config, 80 | ))), 81 | requests_generator, 82 | progress_tx, 83 | stop_sender, 84 | } 85 | } 86 | } 87 | } 88 | 89 | pub async fn run(&mut self) -> anyhow::Result { 90 | debug!("Starting scheduler '{}'", self.id); 91 | // add responses to the benchmark result as they arrive 92 | let (tx, mut rx): ( 93 | UnboundedSender, 94 | UnboundedReceiver, 95 | ) = tokio::sync::mpsc::unbounded_channel(); 96 | let results = self.results.clone(); 97 | let progress_tx = self.progress_tx.clone(); 98 | let mut stop_receiver = self.stop_sender.subscribe(); 99 | let req_gen = self.requests_generator.clone(); 100 | tokio::spawn(async move { 101 | tokio::select! { 102 | _ = stop_receiver.recv() => { 103 | debug!("Received stop signal, stopping benchmark"); 104 | } 105 | _ = async{ 106 | while let Some(response) = rx.recv().await{ 107 | // call generator callback 108 | let response_txt=response.response.clone(); 109 | if let Some(request)= response.request.clone(){ 110 | req_gen.lock().await.callback(request, response_txt.unwrap_or_default().as_str()); 111 | } 112 | let result = results.clone(); 113 | let progress_tx = progress_tx.clone(); 114 | trace!("Received response: {:?}", response); 115 | if response.ended { 116 | return; 117 | } 118 | let mut result = result.lock().await; 119 | result.add_response(response); 120 | let expected_duration = result.executor_config().duration.as_secs_f64(); 121 | let start_time = result.start_time().unwrap_or(tokio::time::Instant::now()); 122 | let _ = progress_tx.send(Some(SchedulerProgress { 123 | progress: (100.0 * (1.0 - (expected_duration - start_time.elapsed().as_secs_f64()) / expected_duration)).min(100.0), 124 | requests_throughput: result.successful_request_rate().unwrap_or_default(), 125 | successful_requests: result.successful_requests() as u64, 126 | failed_requests: result.failed_requests() as u64, 127 | })).await; 128 | } 129 | }=>{} 130 | } 131 | }); 132 | self.executor 133 | .lock() 134 | .await 135 | .run( 136 | self.requests_generator.clone(), 137 | tx, 138 | self.stop_sender.clone(), 139 | ) 140 | .await; 141 | warn!("{:?}", self.results.clone()); 142 | if self.results.lock().await.successful_requests() == 0 { 143 | Err(anyhow::anyhow!(NoResponses)) 144 | } else { 145 | Ok(self.results.lock().await.clone()) 146 | } 147 | } 148 | 149 | pub fn get_results(&self) -> Arc> { 150 | self.results.clone() 151 | } 152 | } 153 | 154 | #[cfg(test)] 155 | mod tests { 156 | use super::*; 157 | use 
crate::requests::OpenAITextGenerationBackend; 158 | use std::time::Duration; 159 | use tokenizers::Tokenizer; 160 | use tokio::time; 161 | 162 | #[tokio::test] 163 | async fn test_constant_arrival_rate_scheduler() { 164 | let (progress_tx, _) = tokio::sync::mpsc::channel(10000); 165 | let (stop_sender, _) = tokio::sync::broadcast::channel(1); 166 | let backend = Box::new(crate::requests::DummyTextGenerationBackend::new( 167 | Duration::from_secs(1), 168 | )); 169 | let requests_generator = Arc::from(Mutex::from( 170 | crate::requests::DummyTextRequestGenerator::new(), 171 | )); 172 | let mut scheduler = Scheduler::new( 173 | "test".to_string(), 174 | backend, 175 | ExecutorType::ConstantArrivalRate, 176 | ExecutorConfig { 177 | max_vus: 800, 178 | duration: std::time::Duration::from_secs(10), 179 | rate: Some(20.0), 180 | }, 181 | requests_generator, 182 | progress_tx, 183 | stop_sender, 184 | ); 185 | let results = scheduler.run().await.unwrap(); 186 | assert_eq!(results.successful_requests(), 180); // 20 requests per second for 10 seconds - 20 requests for last second as the backend has a 1 second delay 187 | } 188 | 189 | #[tokio::test] 190 | async fn test_constant_vus_scheduler() { 191 | let (progress_tx, _) = tokio::sync::mpsc::channel(10000); 192 | let (stop_sender, _) = broadcast::channel(1); 193 | let backend = Box::new(crate::requests::DummyTextGenerationBackend::new( 194 | Duration::from_secs(1), 195 | )); 196 | let requests_generator = Arc::from(Mutex::from( 197 | crate::requests::DummyTextRequestGenerator::new(), 198 | )); 199 | let mut scheduler = Scheduler::new( 200 | "test".to_string(), 201 | backend, 202 | ExecutorType::ConstantVUs, 203 | ExecutorConfig { 204 | max_vus: 800, 205 | duration: Duration::from_secs(10), 206 | rate: None, 207 | }, 208 | requests_generator, 209 | progress_tx, 210 | stop_sender, 211 | ); 212 | let results = scheduler.run().await.unwrap(); 213 | assert!( 214 | results.successful_requests() > 7200, 215 | "Expected at least 7200 requests, got {}", 216 | results.successful_requests() 217 | ); 218 | } 219 | 220 | #[tokio::test] 221 | async fn test_constant_arrival_rate_openai_backend() { 222 | let (progress_tx, _) = tokio::sync::mpsc::channel(10000); 223 | let (stop_sender, _) = tokio::sync::broadcast::channel(1); 224 | let mut s = mockito::Server::new_async().await; 225 | s.mock("POST", "/v1/chat/completions") 226 | .with_status(200) 227 | .with_header("content-type", "text/event-stream") 228 | .with_chunked_body(|w| { 229 | w.write_all(b"data: {\"choices\": [{\"message\": null, \"finish_reason\": null, \"delta\": {\"content\": \"Hello, world!\"}}]}\n\n").unwrap(); 230 | std::thread::sleep(Duration::from_millis(500)); 231 | w.write_all(b"data: {\"choices\": [{\"message\": {\"content\": \"Hello, world!Hello, world!Hello, world!Hello, world!\", \"role\": \"user\"}, \"finish_reason\": \"stop\", \"delta\": {\"content\": \"Hello, world!\"}}]}\n\n").unwrap(); 232 | w.write_all(b"data: [DONE]\n\n") 233 | }) 234 | .create_async().await; 235 | let url = s.url().parse().unwrap(); 236 | let tokenizer = Arc::new(Tokenizer::from_pretrained("gpt2", None).unwrap()); 237 | let backend = OpenAITextGenerationBackend::try_new( 238 | "".to_string(), 239 | url, 240 | "gpt2".to_string(), 241 | tokenizer, 242 | time::Duration::from_secs(10), 243 | ) 244 | .unwrap(); 245 | let requests_generator = Arc::from(Mutex::from( 246 | crate::requests::DummyTextRequestGenerator::new(), 247 | )); 248 | let mut scheduler = Scheduler::new( 249 | "test".to_string(), 250 | 
Box::new(backend), 251 | ExecutorType::ConstantArrivalRate, 252 | ExecutorConfig { 253 | max_vus: 800, 254 | duration: Duration::from_secs(10), 255 | rate: Some(50.0), 256 | }, 257 | requests_generator, 258 | progress_tx, 259 | stop_sender, 260 | ); 261 | let results = scheduler.run().await.unwrap(); 262 | assert_eq!(results.successful_requests(), 475); // 25 expected missing requests due to the 500ms delay in the backend 263 | } 264 | } 265 | -------------------------------------------------------------------------------- /src/table.rs: -------------------------------------------------------------------------------- 1 | use crate::results::BenchmarkReport; 2 | use crate::BenchmarkConfig; 3 | use tabled::builder::Builder; 4 | 5 | pub fn parameters_table(benchmark: BenchmarkConfig) -> anyhow::Result { 6 | let mut builder = Builder::default(); 7 | let rates = benchmark 8 | .rates 9 | .map_or("N/A".to_string(), |e| format!("{:?}", e)); 10 | let prompt_options = benchmark 11 | .prompt_options 12 | .map_or("N/A".to_string(), |e| format!("{}", e)); 13 | let decode_options = benchmark 14 | .decode_options 15 | .map_or("N/A".to_string(), |e| format!("{}", e)); 16 | let extra_metadata = benchmark 17 | .extra_metadata 18 | .map_or("N/A".to_string(), |e| format!("{:?}", e)); 19 | builder.set_header(vec!["Parameter", "Value"]); 20 | builder.push_record(vec!["Max VUs", benchmark.max_vus.to_string().as_str()]); 21 | builder.push_record(vec![ 22 | "Duration", 23 | benchmark.duration.as_secs().to_string().as_str(), 24 | ]); 25 | builder.push_record(vec![ 26 | "Warmup Duration", 27 | benchmark.warmup_duration.as_secs().to_string().as_str(), 28 | ]); 29 | builder.push_record(vec![ 30 | "Benchmark Kind", 31 | benchmark.benchmark_kind.to_string().as_str(), 32 | ]); 33 | builder.push_record(vec!["Rates", rates.as_str()]); 34 | builder.push_record(vec!["Num Rates", benchmark.num_rates.to_string().as_str()]); 35 | builder.push_record(vec!["Prompt Options", prompt_options.as_str()]); 36 | builder.push_record(vec!["Decode Options", decode_options.as_str()]); 37 | builder.push_record(vec!["Tokenizer", benchmark.tokenizer.to_string().as_str()]); 38 | builder.push_record(vec!["Extra Metadata", extra_metadata.as_str()]); 39 | let mut table = builder.build(); 40 | table.with(tabled::settings::Style::sharp()); 41 | Ok(table) 42 | } 43 | 44 | pub fn results_table(benchmark: BenchmarkReport) -> anyhow::Result { 45 | let mut builder = Builder::default(); 46 | builder.set_header(vec![ 47 | "Benchmark", 48 | "QPS", 49 | "E2E Latency (avg)", 50 | "TTFT (avg)", 51 | "ITL (avg)", 52 | "Throughput", 53 | "Error Rate", 54 | "Successful Requests", 55 | "Prompt tokens per req (avg)", 56 | "Decoded tokens per req (avg)", 57 | ]); 58 | let results = benchmark.get_results(); 59 | for result in results { 60 | let qps = format!("{:.2} req/s", result.successful_request_rate()?); 61 | let e2e = format!("{:.2} sec", result.e2e_latency_avg()?.as_secs_f64()); 62 | let ttft = format!( 63 | "{:.2} ms", 64 | result.time_to_first_token_avg()?.as_micros() as f64 / 1000.0 65 | ); 66 | let itl = format!( 67 | "{:.2} ms", 68 | result.inter_token_latency_avg()?.as_micros() as f64 / 1000.0 69 | ); 70 | let throughput = format!("{:.2} tokens/sec", result.token_throughput_secs()?); 71 | let error_rate = result.failed_requests() as f64 / result.total_requests() as f64 * 100.0; 72 | let error_rate = format!("{:.2}%", error_rate); 73 | builder.push_record(vec![ 74 | result.id.as_str(), 75 | qps.as_str(), 76 | e2e.as_str(), 77 | ttft.as_str(), 78 | 
itl.as_str(), 79 | throughput.as_str(), 80 | error_rate.as_str(), 81 | format!( 82 | "{}/{}", 83 | result.successful_requests(), 84 | result.total_requests() 85 | ) 86 | .as_str(), 87 | format!("{:.2}", result.prompt_tokens_avg()?).as_str(), 88 | format!( 89 | "{:.2}", 90 | result.total_tokens() as f64 / result.successful_requests() as f64 91 | ) 92 | .as_str(), 93 | ]); 94 | } 95 | let mut table = builder.build(); 96 | table.with(tabled::settings::Style::sharp()); 97 | Ok(table) 98 | } 99 | -------------------------------------------------------------------------------- /src/writers.rs: -------------------------------------------------------------------------------- 1 | use crate::results::{BenchmarkReport, BenchmarkResults}; 2 | use crate::{executors, table, BenchmarkConfig}; 3 | use serde::Serialize; 4 | use std::path::Path; 5 | use sysinfo::{CpuRefreshKind, MemoryRefreshKind, System}; 6 | use tokio::fs; 7 | 8 | #[derive(Serialize)] 9 | pub struct PercentilesWriter { 10 | pub p50: f64, 11 | pub p60: f64, 12 | pub p70: f64, 13 | pub p80: f64, 14 | pub p90: f64, 15 | pub p95: f64, 16 | pub p99: f64, 17 | pub avg: f64, 18 | } 19 | 20 | #[derive(Serialize)] 21 | pub struct BenchmarkResultsWriter { 22 | id: String, 23 | executor_type: String, 24 | config: executors::ExecutorConfig, 25 | total_requests: u64, 26 | total_tokens: u64, 27 | token_throughput_secs: f64, 28 | duration_ms: u128, 29 | time_to_first_token_ms: PercentilesWriter, 30 | inter_token_latency_ms: PercentilesWriter, 31 | failed_requests: u64, 32 | successful_requests: u64, 33 | request_rate: f64, 34 | total_tokens_sent: u64, 35 | e2e_latency_ms: PercentilesWriter, 36 | } 37 | 38 | impl BenchmarkResultsWriter { 39 | pub fn new(results: BenchmarkResults) -> anyhow::Result { 40 | Ok(BenchmarkResultsWriter { 41 | id: results.id.clone(), 42 | executor_type: results.executor_type().to_string(), 43 | config: results.executor_config(), 44 | total_requests: results.total_requests() as u64, 45 | total_tokens: results.total_tokens(), 46 | token_throughput_secs: results.token_throughput_secs()?, 47 | duration_ms: results.duration().ok().unwrap().as_micros() / 1000, 48 | time_to_first_token_ms: PercentilesWriter { 49 | p50: results.time_to_first_token_percentile(0.5)?.as_micros() as f64 / 1000., 50 | p60: results.time_to_first_token_percentile(0.6)?.as_micros() as f64 / 1000., 51 | p70: results.time_to_first_token_percentile(0.7)?.as_micros() as f64 / 1000., 52 | p80: results.time_to_first_token_percentile(0.8)?.as_micros() as f64 / 1000., 53 | p90: results.time_to_first_token_percentile(0.9)?.as_micros() as f64 / 1000., 54 | p95: results.time_to_first_token_percentile(0.95)?.as_micros() as f64 / 1000., 55 | p99: results.time_to_first_token_percentile(0.99)?.as_micros() as f64 / 1000., 56 | avg: results.time_to_first_token_avg().ok().unwrap().as_micros() as f64 / 1000., 57 | }, 58 | inter_token_latency_ms: PercentilesWriter { 59 | p50: results.inter_token_latency_percentile(0.5)?.as_micros() as f64 / 1000., 60 | p60: results.inter_token_latency_percentile(0.6)?.as_micros() as f64 / 1000., 61 | p70: results.inter_token_latency_percentile(0.7)?.as_micros() as f64 / 1000., 62 | p80: results.inter_token_latency_percentile(0.8)?.as_micros() as f64 / 1000., 63 | p90: results.inter_token_latency_percentile(0.9)?.as_micros() as f64 / 1000., 64 | p95: results.inter_token_latency_percentile(0.95)?.as_micros() as f64 / 1000., 65 | p99: results.inter_token_latency_percentile(0.99)?.as_micros() as f64 / 1000., 66 | avg: 
results.inter_token_latency_avg().ok().unwrap().as_micros() as f64 / 1000., 67 | }, 68 | failed_requests: results.failed_requests() as u64, 69 | successful_requests: results.successful_requests() as u64, 70 | request_rate: results.successful_request_rate()?, 71 | total_tokens_sent: results.total_tokens_sent(), 72 | e2e_latency_ms: PercentilesWriter { 73 | p50: results.e2e_latency_percentile(0.5)?.as_micros() as f64 / 1000., 74 | p60: results.e2e_latency_percentile(0.6)?.as_micros() as f64 / 1000., 75 | p70: results.e2e_latency_percentile(0.7)?.as_micros() as f64 / 1000., 76 | p80: results.e2e_latency_percentile(0.8)?.as_micros() as f64 / 1000., 77 | p90: results.e2e_latency_percentile(0.9)?.as_micros() as f64 / 1000., 78 | p95: results.e2e_latency_percentile(0.95)?.as_micros() as f64 / 1000., 79 | p99: results.e2e_latency_percentile(0.99)?.as_micros() as f64 / 1000., 80 | avg: results.e2e_latency_avg().ok().unwrap().as_micros() as f64 / 1000., 81 | }, 82 | }) 83 | } 84 | } 85 | 86 | #[derive(Serialize)] 87 | pub struct SystemInfo { 88 | pub cpu: Vec, 89 | pub memory: String, 90 | pub os_name: String, 91 | pub os_version: String, 92 | pub kernel: String, 93 | pub hostname: String, 94 | } 95 | 96 | impl SystemInfo { 97 | pub fn new() -> SystemInfo { 98 | let s = System::new_with_specifics( 99 | sysinfo::RefreshKind::nothing() 100 | .with_memory(MemoryRefreshKind::everything()) 101 | .with_cpu(CpuRefreshKind::everything()), 102 | ); 103 | let cpu_info = s 104 | .cpus() 105 | .iter() 106 | .map(|cpu| format!("{} {}@{:.0}MHz", cpu.brand(), cpu.name(), cpu.frequency())) 107 | .collect::>(); 108 | SystemInfo { 109 | cpu: cpu_info, 110 | memory: format!( 111 | "{:.2} GB", 112 | s.total_memory() as f64 / 1024.0 / 1024.0 / 1024.0 113 | ), 114 | os_name: System::name().ok_or("N/A").unwrap(), 115 | os_version: System::os_version().ok_or("N/A").unwrap(), 116 | kernel: System::kernel_version().ok_or("N/A").unwrap(), 117 | hostname: System::host_name().ok_or("N/A").unwrap(), 118 | } 119 | } 120 | } 121 | 122 | #[derive(Serialize)] 123 | pub struct BenchmarkReportWriter { 124 | config: BenchmarkConfig, 125 | results: Vec, 126 | start_time: String, 127 | end_time: String, 128 | system: SystemInfo, 129 | #[serde(skip)] 130 | report: BenchmarkReport, 131 | } 132 | 133 | impl BenchmarkReportWriter { 134 | pub fn try_new( 135 | config: BenchmarkConfig, 136 | report: BenchmarkReport, 137 | ) -> anyhow::Result { 138 | let mut results: Vec = Vec::new(); 139 | for result in report.get_results() { 140 | let writer = BenchmarkResultsWriter::new(result)?; 141 | results.push(writer); 142 | } 143 | Ok(BenchmarkReportWriter { 144 | config, 145 | results, 146 | start_time: report 147 | .start_time() 148 | .ok_or(anyhow::anyhow!("start_time not set"))? 149 | .to_rfc3339(), 150 | end_time: report 151 | .end_time() 152 | .ok_or(anyhow::anyhow!("end_time not set"))? 
153 | .to_rfc3339(), 154 | system: SystemInfo::new(), 155 | report, 156 | }) 157 | } 158 | pub async fn json(&self, path: &Path) -> anyhow::Result<()> { 159 | // write the benchmark report to json 160 | let report = serde_json::to_string(&self)?; 161 | 162 | // create path hierarchy if it doesn't exist 163 | if !path.exists() { 164 | if let Some(parent) = path.parent() { 165 | fs::create_dir_all(parent).await?; 166 | } 167 | } 168 | fs::write(path, report).await?; 169 | Ok(()) 170 | } 171 | 172 | pub async fn stdout(&self) -> anyhow::Result<()> { 173 | let param_table = table::parameters_table(self.config.clone())?; 174 | println!("\n{param_table}\n"); 175 | let results_table = table::results_table(self.report.clone())?; 176 | println!("\n{results_table}\n"); 177 | Ok(()) 178 | } 179 | } 180 | -------------------------------------------------------------------------------- /test_data/chat.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "QWJhYvA_0", 4 | "conversations": [ 5 | { 6 | "role": "user", 7 | "content": "Summarize the main ideas of Jeff Walker's Product Launch Formula into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients..." 8 | }, 9 | { 10 | "role": "assistant", 11 | "content": "Here are the main ideas of Jeff Walker's Product Launch Formula that can be applied by a growth marketing agency for their clients:\n\n1. Identify the target audience and their needs: Understand the ideal customer for the product or service, and create a messaging that resonates with them.\n2. Pre-launch: Build anticipation and excitement for the launch by creating buzz, gathering testimonials and case studies, and using social media to create awareness.\n3. Launch: Use a well-crafted launch sequence to maximize sales and conversions. This can include offering bonuses, creating scarcity, and using a deadline to create urgency.\n4. Post-launch: Follow up with customers, gather feedback, and continue to provide value to keep them engaged and loyal.\n5. Create a product suite: Once the initial product is successful, expand the offering by creating additional products or services that address other needs of the same audience.\n6. Continual optimization: Continually monitor and optimize the launch process and product suite to improve results.\n7. Build a community: Use the launch process to build a community of customers who are passionate about the product and can help spread the word.\n8. Use automation: Use technology and automation to streamline the launch process and improve efficiency." 12 | }, 13 | { 14 | "role": "user", 15 | "content": "Summarize the main ideas of Brendon Burchard's Experts Academy into bullet points as it pertains to a growth marketing agency implementing these strategies and tactics for their clients..." 16 | }, 17 | { 18 | "role": "assistant", 19 | "content": "Here are the main ideas of Brendon Burchard's Experts Academy that can be applied by a growth marketing agency for their clients:\n\n1. Identify your expertise: Define your area of expertise and create a unique value proposition that differentiates you from others.\n2. Create a content strategy: Develop a content strategy that showcases your expertise and provides value to your audience. This can include blog posts, videos, podcasts, and social media content.\n3. Build your audience: Use social media, advertising, and other channels to build your audience and increase your visibility.\n4. 
Develop your products: Create products and services that align with your expertise and meet the needs of your audience. These can include digital products, courses, coaching, and consulting services.\n5. Create a brand: Develop a strong brand that reflects your expertise and resonates with your audience.\n6. Build authority: Establish yourself as an authority in your industry by speaking at events, publishing a book, or appearing on podcasts or TV shows.\n7. Monetize your expertise: Develop a monetization strategy that leverages your expertise and products to generate revenue.\n8. Build a team: As your business grows, build a team of experts to help you scale your business and provide excellent service to your clients.\n9. Continual improvement: Continually improve your products, services, and marketing strategies to stay ahead of the competition and provide the best possible experience for your clients.\n10. Focus on impact: Ultimately, focus on making a positive impact in the lives of your clients and using your expertise to make a difference in the world." 20 | }, 21 | { 22 | "role": "user", 23 | "content": "What are the mental triggers in Jeff Walker's Product Launch Formula and \"Launch\" book?" 24 | }, 25 | { 26 | "role": "assistant", 27 | "content": "Here are the mental triggers described in Jeff Walker's Product Launch Formula and \"Launch\" book:\n\n1. Scarcity: Creating a sense of urgency by limiting the availability of the product or service being launched.\n2. Authority: Establishing oneself as an expert or authority in the industry to build credibility and trust with the audience.\n3. Social proof: Using testimonials, case studies, and other forms of social proof to demonstrate the effectiveness and value of the product or service.\n4. Reciprocity: Offering something of value to the audience before asking for a sale or commitment.\n5. Likeability: Building a personal connection with the audience and creating a likable persona to increase the chances of a sale.\n6. Anticipation: Building anticipation and excitement leading up to the launch to generate buzz and increase interest in the product or service.\n7. Exclusivity: Creating a feeling of exclusivity by offering early access or special deals to a select group of people.\n8. Belonging: Creating a sense of community or belonging among the audience to increase engagement and loyalty to the brand.\n9. Storytelling: Using storytelling to connect with the audience on an emotional level and make the product or service more relatable and memorable.\n\nBy leveraging these mental triggers, businesses can create an effective product launch strategy that resonates with their target audience and drives sales and growth." 28 | }, 29 | { 30 | "role": "user", 31 | "content": "Write a summary of why scarcity and urgency are the strongest mental triggers and have been the driving force behind many of our best performing campaigns over the last 8 years." 32 | }, 33 | { 34 | "role": "assistant", 35 | "content": "Scarcity and urgency are two of the strongest mental triggers in marketing because they create a sense of exclusivity and importance around a product or service. By creating a sense of limited availability or a deadline for a promotion, businesses can motivate customers to take action and make a purchase before it's too late.\n\nOver the last eight years, many businesses have leveraged these triggers in their marketing campaigns to great success. 
Scarcity and urgency have been used to sell everything from digital products to physical goods and services, and have proven to be highly effective in driving conversions and increasing revenue.\n\nThis is because scarcity and urgency tap into the natural human desire for instant gratification and fear of missing out (FOMO). When customers believe that they may miss out on a valuable opportunity if they don't act quickly, they are more likely to take action and make a purchase.\n\nFurthermore, scarcity and urgency can be combined with other mental triggers, such as social proof and authority, to create even more powerful marketing campaigns. By demonstrating the popularity and effectiveness of a product or service, businesses can increase the perceived value and urgency of a promotion, leading to even more sales and revenue.\n\nOverall, scarcity and urgency are powerful mental triggers that have been proven to drive results in marketing campaigns. By leveraging these triggers, businesses can create a sense of urgency and exclusivity around their products or services, motivating customers to take action and make a purchase before it's too late." 36 | }, 37 | { 38 | "role": "user", 39 | "content": "Summarize Russell Brunson's Perfect Webinar Script..." 40 | }, 41 | { 42 | "role": "assistant", 43 | "content": "Russell Brunson's Perfect Webinar Script is a framework for delivering webinars that are designed to sell a product or service. Here are the key components of the Perfect Webinar Script:\n\n1. Introduction: The webinar starts with an introduction that establishes the presenter's credibility and establishes the topic.\n2. Story: The presenter then tells a personal story that is related to the topic and establishes a connection with the audience.\n3. Content: The presenter provides valuable content related to the topic, demonstrating their expertise and building trust with the audience.\n4. Offer: The presenter then introduces the product or service they are selling, explaining how it solves the audience's problem and providing social proof to support their claims.\n5. Bonuses: The presenter offers bonuses to sweeten the deal and make the offer more enticing.\n6. Scarcity: The presenter creates a sense of urgency by limiting the availability of the offer or including a deadline for purchase.\n7. FAQs: The presenter answers common questions about the offer, addressing objections and building trust with the audience.\n8. Closing: The presenter then delivers a clear call to action, directing the audience to take a specific action to purchase the product or service.\n\nOverall, the Perfect Webinar Script is designed to build trust with the audience, establish the presenter's credibility, and provide valuable content, while also creating a sense of urgency and scarcity around the offer being presented. By following this framework, businesses can use webinars as a powerful tool to sell their products and services to a targeted audience." 44 | }, 45 | { 46 | "role": "user", 47 | "content": "Summarize the 6 human needs as Tony Robbins explains..." 48 | }, 49 | { 50 | "role": "assistant", 51 | "content": "Tony Robbins describes six core human needs that drive our behaviors and motivations. These six needs are:\n\n1. Certainty: The need for safety, stability, and predictability. This includes the need for comfort, security, and control over our environment.\n2. Variety: The need for novelty, excitement, and change. This includes the need for adventure, stimulation, and new experiences.\n3. 
Significance: The need to feel important, special, and unique. This includes the need for recognition, achievement, and respect.\n4. Connection: The need for social connection and love. This includes the need for intimacy, belonging, and friendship.\n5. Growth: The need for personal growth and development. This includes the need for learning, self-improvement, and progress.\n6. Contribution: The need to make a positive impact in the world. This includes the need to give back, help others, and make a difference.\n\nAccording to Tony Robbins, these six needs are universal and apply to all individuals. Each person may prioritize these needs differently, and may fulfill them in different ways, but they are fundamental drivers of human behavior. By understanding these needs, individuals can gain insight into their own motivations and behaviors, and can use this knowledge to create a more fulfilling and meaningful life." 52 | } 53 | ] 54 | }, 55 | { 56 | "id": "i6IyJda_0", 57 | "conversations": [ 58 | { 59 | "role": "system", 60 | "content": "You are a helpful assistant." 61 | }, 62 | { 63 | "role": "user", 64 | "content": "How to tell if a customer segment is well segmented? In 3 bullet points." 65 | }, 66 | { 67 | "role": "assistant", 68 | "content": "1. Homogeneity: The segment should consist of customers who share similar characteristics and behaviors.\n2. Distinctiveness: The segment should be different from other segments in terms of their characteristics and behaviors.\n3. Stability: The segment should remain relatively stable over time and not change drastically. The characteristics and behaviors of customers within the segment should not change significantly." 69 | } 70 | ] 71 | } 72 | ] 73 | --------------------------------------------------------------------------------
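For reference, a minimal usage sketch (not part of the repository) showing how the BenchmarkReportWriter shown above is typically driven: build the writer from a finished report, write the JSON file, then print the stdout tables. The `inference_benchmarker::...` import path and the output file path are assumptions for illustration; only `try_new`, `json`, and `stdout` come from the source above.

// Minimal usage sketch (assumed import paths; adjust to the crate's actual exports).
use std::path::Path;

use inference_benchmarker::{BenchmarkConfig, BenchmarkReport, BenchmarkReportWriter};

// Turn a finished benchmark run into a JSON report file plus stdout tables.
async fn persist_report(config: BenchmarkConfig, report: BenchmarkReport) -> anyhow::Result<()> {
    // try_new converts each result into a serializable writer and captures system info.
    let writer = BenchmarkReportWriter::try_new(config, report)?;
    // json() creates missing parent directories before writing the file (hypothetical output path).
    writer.json(Path::new("results/report.json")).await?;
    // stdout() prints the parameters table followed by the results table.
    writer.stdout().await?;
    Ok(())
}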