├── .gitignore
├── LICENSE
├── README.md
├── classification-analysis-gradio.ipynb
├── classification-analysis.ipynb
├── classification-optimization.ipynb
├── data
│   └── .gitkeep
├── embedding-analysis-gradio.ipynb
├── embedding-analysis.ipynb
├── embedding-optimization.ipynb
├── generated
│   └── .gitkeep
├── media
│   ├── classification-accuracy.png
│   ├── classification-contour.png
│   ├── classification-image-cost.png
│   ├── classification-latency.png
│   ├── classification-token-distribution.png
│   ├── embedding-contour.png
│   ├── embedding-latency.png
│   ├── embedding-token-distribution.png
│   ├── vision-embedding-contour.png
│   ├── vision-embedding-file-size.png
│   ├── vision-embedding-image-dimensions.png
│   └── vision-embedding-latency.png
├── requirements.txt
├── results
│   └── .gitkeep
├── src
│   ├── deployment.py
│   ├── k6.py
│   └── process_dataset.py
├── templates
│   ├── classification-analysis.js.j2
│   ├── embedding-analysis.js.j2
│   └── vision-embedding-analysis.js.j2
├── vision-embedding-analysis-gradio.ipynb
├── vision-embedding-analysis.ipynb
└── vision-embedding-optimization.ipynb

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.ipynb_checkpoints*
3 | results/*
4 | generated/*
5 | data/*
6 | aggregated_results.csv
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Derek Thomas
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
9 | [![Contributors][contributors-shield]][contributors-url]
10 | [![Stargazers][stars-shield]][stars-url]
11 | [![Issues][issues-shield]][issues-url]
12 | [![MIT License][license-shield]][license-url]
13 | 
14 | # Table of Contents
15 | 
16 | - [Introduction](#introduction)
17 | - [Installation](#installation)
18 | - [Getting Started](#getting-started)
19 | - [Project Structure](#project-structure)
20 | - [How does it work?](#how-does-it-work)
21 | - [Results](#results)
22 | - [References and Links](#references-and-links)
23 | 
24 | # Introduction
25 | 
26 | This repository supports a blog post that helps users estimate costs for large-scale classification, embedding, or
27 | vision embedding tasks. 
It provides benchmarking tools for different GPU types, batch sizes, and inference methods,
28 | using [michaelfeil/infinity](https://github.com/michaelfeil/infinity/)
29 | and [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated).
30 | 
31 | I considered a variety of factors:
32 | 
33 | - GPU type
34 | - Infinity image type
35 | - Varying batch sizes
36 | - Varying VU (virtual user) counts
37 | - Multiple model architectures
38 | 
39 | # Installation
40 | 
41 | I used ![Python](https://img.shields.io/badge/python-3.12-blue)
42 | 
43 | 1. `git clone https://github.com/datavistics/encoder-analysis.git`
44 | 2. `cd encoder-analysis`
45 | 3. `pip install -r requirements.txt`
46 | 4. `pip install jupyterlab`
47 | 5. [Install k6](https://grafana.com/docs/k6/latest/set-up/install-k6/#install-k6) based on your platform
48 | 
49 | ## Getting Started
50 | Make sure you have the ability to [deploy an Inference Endpoint](https://endpoints.huggingface.co/new).
51 | 
52 | 1. Run `jupyter lab`
53 | 2. Choose your task: [`classification`, `embedding`, `vision-embedding`]
54 | 3. Run the matching `*-optimization.ipynb` to find the best configuration
55 | 4. Run `*-analysis.ipynb` to visualize the results
56 | 5. Alternatively, run `*-analysis-gradio.ipynb` for more interactive results
57 | 
58 | # Project Structure
59 | 
60 | - There are notebooks in the top level for convenience. It's probably cleaner to put them in `./notebooks`, but it's
61 |   annoying to add that to the path, so I opted for user convenience rather than aesthetics
62 | - **\*-optimization.ipynb** - These were used for generating and conducting the experiments
63 | - **\*-analysis.ipynb** - These show the analysis in a clean notebook-centric way
64 | - **\*-analysis-gradio.ipynb** - These show the analysis in an interactive gradio-centric way
65 | - `src` - I abstracted a fair amount of code here, but tried to keep the important details in the notebooks
66 | - `templates` - the k6 Jinja2 templates used to generate each experiment
67 | - `data`, `generated`, and `results` store non-version-controlled project files
68 | 
69 | # How does it work?
70 | 
71 | Each of the **\*-optimization.ipynb** notebooks follows this structure:
72 | 
73 | ```mermaid
74 | flowchart TD;
75 |     subgraph Benchmarking Server
76 |         A[k6 Load Testing]
77 |         D[Instance Config]
78 |     end
79 | 
80 |     subgraph Inference Endpoint
81 |         C[Container Running Infinity]
82 |         E[Next Inference Endpoint]
83 |     end
84 | 
85 |     D -->|Defines Test Parameters| A
86 |     D -->|Deploys Inference Endpoint| E
87 |     A -->|Sends Test Data| C
88 |     C -->|Processes and Returns| A
89 | ```
90 | 
91 | 1. Define the benchmarking parameters (GPU, batch size, VUs, etc.)
92 | 2. Deploy the inference server (Infinity on Hugging Face Inference Endpoints)
93 | 3. Run k6 performance tests to evaluate speed, cost, and efficiency
94 | 4. Store and visualize results for optimization
95 | 
96 | # Results
97 | 
98 | Do check out these [notebooks](https://nbviewer.org/github/datavistics/encoder-analysis/tree/main/) in nbviewer, as I
99 | put a lot of effort into making sure they are **interactive**. Unfortunately, they look better in light mode because of
100 | the tables.
101 | But follow your heart.
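
To reproduce any of the numbers below end-to-end, the core loop inside each optimization notebook boils down to roughly the following. This is a minimal sketch built on the helpers in `src/` — `instance_config` is constructed in the notebooks, and every literal value here (endpoint name, file paths, request counts) is an illustrative placeholder rather than the notebooks' exact arguments:

```python
from pathlib import Path

from src.deployment import deploy_endpoint
from src.k6 import call_k6, optimal_vus

# Deploy (or re-use) an Infinity endpoint. instance_config is a dataclass whose
# fields are splatted into huggingface_hub.create_inference_endpoint.
endpoint = deploy_endpoint(instance_config, "encoder-bench-l4-bs64", wait=True)

# Everything call_k6 needs except the VU count, which optimal_vus sweeps.
args = dict(
    endpoint=endpoint,
    text_column="text",
    total_requests=10_000,
    template_file="classification-analysis.js.j2",
    output_file=Path("generated/classification-analysis.js").resolve(),
    dataset_path="data/classification.json",
    k6_bin="~/.local/bin/k6",
)

best_vus = optimal_vus(max_vus=512, args_dict=args)  # exponential then binary search
throughput = call_k6(vus=best_vus, **args)           # req/sec at that VU count
```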
102 | 
103 | - [classification-analysis-gradio.ipynb](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/classification-analysis-gradio.ipynb)
104 | - [embedding-analysis-gradio.ipynb](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/embedding-analysis-gradio.ipynb)
105 | - [vision-embedding-analysis-gradio.ipynb](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/vision-embedding-analysis-gradio.ipynb)
106 | 
107 | ## Classification
108 | 
109 | For [lxyuan/distilbert-base-multilingual-cased-sentiments-student](https://huggingface.co/lxyuan/distilbert-base-multilingual-cased-sentiments-student)
110 | on a dataset like [tyqiangz/multilingual-sentiments](https://huggingface.co/datasets/tyqiangz/multilingual-sentiments)
111 | (using the `text` column) we can do 1 Billion classifications for only `$253.82`.
112 | 
113 | | GPU           | Image         | Batch Size | VUs     | Min Cost    |
114 | |---------------|---------------|------------|---------|-------------|
115 | | **nvidia-l4** | **`default`** | **64**     | **448** | **$253.82** |
116 | 
117 | ![classification-results.png](media/classification-contour.png)
118 | [Interactive Version here](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/classification-analysis-gradio.ipynb)
119 | 
120 | ## Embedding
121 | 
122 | For [Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base) on a dataset
123 | like [sentence-transformers/trivia-qa-triplet](https://huggingface.co/datasets/sentence-transformers/trivia-qa-triplet)
124 | (using the `positive` column) we can do 1 Billion embeddings for only `$409.44`.
125 | 
126 | | GPU       | Batch Size | VUs | Min Cost |
127 | |-----------|------------|-----|----------|
128 | | nvidia-l4 | 256        | 32  | $409.44  |
129 | 
130 | ![embedding-results.png](media/embedding-contour.png)
131 | [Interactive Version here](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/embedding-analysis-gradio.ipynb)
132 | 
133 | ## Vision Embedding
134 | 
135 | For [vidore/colqwen2-v1.0-merged](https://huggingface.co/vidore/colqwen2-v1.0-merged) on a dataset
136 | like [openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)
137 | (using the `image` column) we can do 1 Billion ColBERT style embeddings (late interaction) on images for `$44496.51`.
138 | 
139 | | GPU       | Batch Size | VUs | Min Cost  |
140 | |-----------|------------|-----|-----------|
141 | | nvidia-l4 | 4          | 4   | $44496.51 |
142 | 
143 | ![vision-embedding-results.png](media/vision-embedding-contour.png)
144 | [Interactive Version here](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/vision-embedding-analysis-gradio.ipynb)
145 | 
146 | # References and Links
147 | 
148 | - [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated)
149 | - [michaelfeil/infinity](https://github.com/michaelfeil/infinity/)
150 | - [Infinity Swagger](https://michaelfeil.eu/infinity/0.0.75/swagger_ui/)
151 | - [k6 Docs](https://grafana.com/docs/k6/latest/)
152 | 
153 | 
154 | 
155 | 
156 | [contributors-shield]: https://img.shields.io/github/contributors/datavistics/encoder-analysis.svg?style=for-the-badge
157 | 
158 | [contributors-url]: https://github.com/datavistics/encoder-analysis/graphs/contributors
159 | 
160 | [stars-shield]: https://img.shields.io/github/stars/datavistics/encoder-analysis.svg?style=for-the-badge
161 | 
162 | [stars-url]: https://github.com/datavistics/encoder-analysis/stargazers
163 | 
164 | [issues-shield]: https://img.shields.io/github/issues/datavistics/encoder-analysis.svg?style=for-the-badge
165 | 
166 | [issues-url]: https://github.com/datavistics/encoder-analysis/issues
167 | 
168 | [license-shield]: https://img.shields.io/github/license/datavistics/encoder-analysis.svg?style=for-the-badge
169 | 
170 | [license-url]: https://github.com/datavistics/encoder-analysis/blob/main/LICENSE
171 | 
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/data/.gitkeep
--------------------------------------------------------------------------------
/generated/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/generated/.gitkeep
--------------------------------------------------------------------------------
/media/classification-accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/classification-accuracy.png
--------------------------------------------------------------------------------
/media/classification-contour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/classification-contour.png
--------------------------------------------------------------------------------
/media/classification-image-cost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/classification-image-cost.png
--------------------------------------------------------------------------------
/media/classification-latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/classification-latency.png
--------------------------------------------------------------------------------
/media/classification-token-distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/classification-token-distribution.png
--------------------------------------------------------------------------------
/media/embedding-contour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/embedding-contour.png
--------------------------------------------------------------------------------
/media/embedding-latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/embedding-latency.png
--------------------------------------------------------------------------------
/media/embedding-token-distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/embedding-token-distribution.png
--------------------------------------------------------------------------------
/media/vision-embedding-contour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/vision-embedding-contour.png
--------------------------------------------------------------------------------
/media/vision-embedding-file-size.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/vision-embedding-file-size.png
--------------------------------------------------------------------------------
/media/vision-embedding-image-dimensions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/vision-embedding-image-dimensions.png
--------------------------------------------------------------------------------
/media/vision-embedding-latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/vision-embedding-latency.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | plotly==5.24.1
2 | scipy==1.15.1
3 | huggingface-hub==0.27.1
4 | transformers==4.48.0
5 | pandas==2.2.3
6 | loguru==0.7.3
7 | Jinja2==3.1.5
8 | gradio==5.15.0
9 | datasets==3.2.0
10 | ipywidgets==8.1.5
11 | matplotlib==3.10.0
--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/results/.gitkeep
--------------------------------------------------------------------------------
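Note: the `instance_config` object handed to `deploy_endpoint` in `src/deployment.py` below is built inside the optimization notebooks, so its definition does not appear in this dump. A plausible shape is sketched here for orientation only — the field names mirror keyword arguments of `huggingface_hub.create_inference_endpoint` (which receives them via `asdict`), and every literal value is an assumption:

```python
from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class InstanceConfig:
    # Splatted into create_inference_endpoint via asdict() in deploy_endpoint.
    repository: str = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
    vendor: str = "aws"
    region: str = "us-east-1"
    accelerator: str = "gpu"
    instance_size: str = "x1"
    instance_type: str = "nvidia-l4"
    custom_image: Dict[str, Any] = field(default_factory=lambda: {
        "health_route": "/health",
        "url": "michaelf34/infinity:0.0.75",
        # These env vars are read back off the endpoint by deployment.py and k6.py.
        "env": {"INFINITY_BATCH_SIZE": "64", "INFINITY_ENGINE": "torch"},
    })
```
--------------------------------------------------------------------------------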
/src/deployment.py:
--------------------------------------------------------------------------------
1 | import time
2 | from dataclasses import asdict
3 | from typing import Any, Dict
4 | 
5 | from huggingface_hub import create_inference_endpoint, get_inference_endpoint, whoami
6 | from loguru import logger
7 | 
8 | 
9 | def deploy_endpoint(
10 |         instance_config: Dict[str, Any],
11 |         endpoint_name: str,
12 |         wait: bool = False
13 | ):
14 |     """Creates and deploys an inference endpoint with the given configuration.
15 | 
16 |     Args:
17 |         instance_config (Dict[str, Any]): Configuration for the endpoint.
18 |         endpoint_name (str): Name of the endpoint.
19 |         wait (bool, optional): Whether to wait for deployment. Defaults to False.
20 | 
21 |     Returns:
22 |         Any: The endpoint object or None if creation fails.
23 |     """
24 | 
25 |     # Try to re-use an existing endpoint
26 |     namespace = whoami()['name']
27 |     try:
28 |         endpoint = get_inference_endpoint(endpoint_name, namespace=namespace)
29 |         hw_type = endpoint.__dict__['raw']['compute']['instanceType']
30 |         batch_size = endpoint.__dict__['raw']['model']['env']['INFINITY_BATCH_SIZE']
31 |         logger.success(f"Re-using Endpoint: hw={hw_type}\tbs={batch_size}\t")
32 |         return endpoint
33 |     except Exception as e:
34 |         logger.warning(f"Endpoint not found. Proceeding with creation: {e}")
35 | 
36 |     # If that doesn't work, try to create one
37 |     try:
38 |         logger.info("Creating inference endpoint...")
39 |         start_time = time.time()  # Record start time
40 |         endpoint = create_inference_endpoint(
41 |             endpoint_name,
42 |             namespace=namespace,
43 |             framework="pytorch",
44 |             task='text-classification',
45 |             min_replica=0,
46 |             max_replica=1,
47 |             scale_to_zero_timeout=300,
48 |             type="protected",
49 |             **asdict(instance_config)
50 |         )
51 |     except Exception as e:
52 |         logger.error(f"Failed to create inference endpoint: {e}")
53 |         return None
54 | 
55 |     if not wait:
56 |         logger.info("Endpoint creation started; not waiting for it to become ready.")
57 |         return endpoint
58 | 
59 |     logger.info("Waiting for endpoint to be ready...")
60 |     endpoint.wait()  # Block until the endpoint is ready
61 | 
62 |     # Calculate elapsed time
63 |     elapsed_time = time.time() - start_time
64 |     elapsed_minutes, elapsed_seconds = divmod(elapsed_time, 60)
65 | 
66 |     hw_type = endpoint.__dict__['raw']['compute']['instanceType']
67 |     batch_size = endpoint.__dict__['raw']['model']['env']['INFINITY_BATCH_SIZE']
68 | 
69 |     logger.success(
70 |         f"Endpoint created successfully: hw={hw_type}\tbs={batch_size}\t"
71 |         f"Time taken: {int(elapsed_minutes)}m {elapsed_seconds:.2f}s"
72 |     )
73 |     return endpoint
74 | 
--------------------------------------------------------------------------------
/src/k6.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import subprocess
4 | from pathlib import Path
5 | from typing import Any, Dict
6 | 
7 | from huggingface_hub import InferenceEndpoint, get_token
8 | from jinja2 import Environment, FileSystemLoader
9 | from loguru import logger
10 | 
11 | template_dir = "./templates"
12 | template_file = "classification-analysis.js.j2"
13 | 
14 | output_file = Path("./generated").resolve() / "classification-analysis.js"
15 | 
16 | # I hardcoded this because for most people this is all that will be tested. Do send me a PR if you need more.
17 | image_dict = {
18 |     'michaelf34/infinity:0.0.75-trt-onnx': 'trt-onnx',
19 |     'michaelf34/infinity:0.0.75': 'default',
20 | }
21 | 
22 | 
23 | def call_k6(
24 |         endpoint: InferenceEndpoint,
25 |         text_column: str,
26 |         vus: int,
27 |         total_requests: int,
28 |         template_file: str,
29 |         output_file: Path,
30 |         dataset_path: str,
31 |         k6_bin: str
32 | ) -> float:
33 |     """
34 |     Runs a k6 performance test on a given endpoint using a specified template.
35 | 
36 |     Args:
37 |         endpoint: Endpoint object containing model and compute metadata.
38 |         text_column (str): Name of the text column in the dataset.
39 |         vus (int): Number of virtual users for k6 load testing.
40 |         total_requests (int): Total number of requests to simulate.
41 |         template_file (str): Jinja2 template file used for generating test scripts.
42 |         output_file (Path): Path to the generated JavaScript test script.
43 |         dataset_path (str): Path to the dataset file.
44 |         k6_bin (str): Path to the k6 binary.
45 | 
46 |     Returns:
47 |         float: The measured throughput in requests per second.
48 |     """
49 |     # Determine the task type based on the template file name
50 |     if 'classification' in template_file:
51 |         task = 'classification'
52 |     elif 'vision-embedding' in template_file:
53 |         task = 'vision-embedding'
54 |     elif 'embedding' in template_file:
55 |         task = 'embedding'
56 |     else:
57 |         raise ValueError('Unknown task type in template file')
58 | 
59 |     # Load Jinja2 template for script generation
60 |     env = Environment(loader=FileSystemLoader(template_dir))
61 |     template = env.get_template(template_file)
62 | 
63 |     # Extract relevant metadata from the endpoint
64 |     image = endpoint.__dict__['raw']['model']['image']['custom']['url']
65 |     image_short = image_dict.get(image, 'other_image')
66 |     hw_type = endpoint.__dict__['raw']['compute']['instanceType']
67 |     vendor = endpoint.__dict__['raw']['provider']['vendor']
68 |     batch_size = endpoint.__dict__['raw']['model']['env']['INFINITY_BATCH_SIZE']
69 |     engine = endpoint.__dict__['raw']['model']['env']['INFINITY_ENGINE']
70 | 
71 |     results_file = Path(
72 |         "./results").resolve() / task / f'{hw_type}' / f'{vendor}_{hw_type}_{image_short}_{engine}_{batch_size}_{vus}.json'
73 | 
74 |     if results_file.exists():
75 |         logger.info(f"Results file {results_file} already exists. Loading existing data.")
76 |         with open(results_file) as f:
77 |             data = json.load(f)
78 |         return data.get("throughput_req_per_sec", 0)
79 | 
80 |     # Render the test script from the template
81 |     rendered_script = template.render(
82 |         text_column=text_column,
83 |         host=endpoint.url,
84 |         data_file=str(Path(dataset_path).resolve()),
85 |         results_file=str(results_file),
86 |         pre_allocated_vus=vus,
87 |         total_requests=total_requests,
88 |         hw_type=hw_type,
89 |         batch_size=batch_size,
90 |         vendor=vendor,
91 |         image=image,
92 |         engine=engine,
93 |         duration="1m"
94 |     )
95 | 
96 |     # Ensure necessary directories exist
97 |     os.makedirs(os.path.dirname(output_file), exist_ok=True)
98 |     os.makedirs(os.path.dirname(results_file), exist_ok=True)
99 | 
100 |     # Write the generated script to file
101 |     with open(str(output_file), "w") as f:
102 |         f.write(rendered_script)
103 | 
104 |     # Execute the k6 load test
105 |     logger.info(f"Running k6 test with {vus} VUs")
106 |     K6_BIN = os.path.expanduser(k6_bin)
107 |     process = subprocess.run(
108 |         [K6_BIN, "run", str(output_file)],
109 |         env={'HF_TOKEN': get_token(), **os.environ},
110 |         capture_output=True,
111 |         text=True
112 |     )
113 |     if process.returncode != 0:
114 |         logger.error(f"k6 exited with code {process.returncode}: {process.stderr}")
115 |     logger.info(f"Expecting results at {results_file}")
116 | 
117 |     # Load and return the throughput result
118 |     try:
119 |         with open(results_file) as f:
120 |             data = json.load(f)
121 |         return data.get("throughput_req_per_sec", 0)
122 |     except Exception as e:
123 |         logger.error(f"Failed to read results file: {e}")
124 |         return 0
125 | 
126 | 
127 | def optimal_vus(
128 |         max_vus: int,
129 |         args_dict: Dict[str, Any],
130 |         start_vus: int = 1
131 | ) -> int:
132 |     """
133 |     Finds the optimal number of virtual users (VUs) for maximum throughput.
134 | 
135 |     Args:
136 |         max_vus (int): Maximum number of virtual users to test.
137 |         args_dict (dict): Dictionary of arguments to pass to `call_k6`.
138 |         start_vus (int): Initial number of VUs for testing (default: 1).
139 | 
140 |     Returns:
141 |         int: Optimal number of VUs.
142 |     """
143 |     vus = start_vus
144 |     prev_throughput = 0
145 |     vus_history = []
146 | 
147 |     logger.info("Starting exponential search for optimal VUs")
148 |     while vus <= max_vus:
149 |         logger.info(f"Testing with {vus} VUs")
150 |         throughput = call_k6(vus=vus, **args_dict)
151 |         vus_history.append((vus, throughput))
152 |         logger.info(f"Throughput for {vus} VUs: {throughput:.2f} req/sec")
153 | 
154 |         if throughput < prev_throughput * 1.02:  # Stop if improvement is <2%
155 |             logger.info("Throughput improvement is less than 2%, stopping search.")
156 |             break
157 | 
158 |         prev_throughput = throughput
159 |         vus *= 2  # Double the VUs
160 | 
161 |     if vus > max_vus:
162 |         logger.info(f"Reached maximum VU limit: {max_vus}")
163 | 
164 |     # Guard: binary search needs at least two measurements
165 |     if len(vus_history) < 2:
166 |         return vus_history[0][0] if vus_history else start_vus
167 | 
168 |     # Binary search refinement between the last two tested VU counts
169 |     logger.info("Starting binary search refinement")
170 |     low, high = vus_history[-2][0], vus_history[-1][0]
171 |     while low < high:
172 |         mid = (low + high) // 2
173 |         logger.info(f"Testing with {mid} VUs")
174 |         throughput = call_k6(vus=mid, **args_dict)
175 | 
176 |         if throughput > prev_throughput:
177 |             logger.info(f"Throughput improved to {throughput:.2f} req/sec with {mid} VUs")
178 |             prev_throughput = throughput
179 |             low = mid + 1
180 |         else:
181 |             logger.info(f"Throughput did not improve with {mid} VUs")
182 |             high = mid - 1
183 | 
184 |     best_vus = low
185 |     logger.info(f"Optimal VUs determined: {best_vus}")
186 |     return best_vus
187 | 
--------------------------------------------------------------------------------
/src/process_dataset.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import json
3 | import os
4 | import random
5 | from io import BytesIO
6 | from pathlib import Path
7 | 
8 | import pandas as pd
9 | from PIL import Image
10 | from datasets import Dataset
11 | from loguru import logger
12 | 
13 | 
14 | def tokenize_and_filter(dataset: Dataset, tokenizer, text_column: str, min_tokens: int = None, max_tokens: int = None,
15 |                         num_proc: int = 8):
16 |     """
17 |     Tokenizes a dataset, adds a `num_tokens` column, and filters based on token length constraints.
18 | 
19 |     :param dataset: Dataset to tokenize and filter.
20 |     :param tokenizer: Tokenizer object with an `encode` method.
21 |     :param text_column: Column name containing text data.
22 |     :param min_tokens: Minimum number of tokens for filtering (optional).
23 |     :param max_tokens: Maximum number of tokens for filtering (optional).
24 |     :param num_proc: Number of processes for parallel execution.
25 |     :return: Filtered dataset with token counts.
26 |     """
27 |     logger.info("Tokenizing dataset and applying token count filter")
28 | 
29 |     dataset = dataset.map(
30 |         lambda example: {"num_tokens": len(tokenizer.encode(example[text_column]))},
31 |         num_proc=num_proc,
32 |     )
33 | 
34 |     if min_tokens is not None or max_tokens is not None:  # either bound alone is enough
35 |         lo, hi = (min_tokens or 0), (max_tokens if max_tokens is not None else float("inf"))
36 |         dataset = dataset.filter(lambda x: lo <= x["num_tokens"] <= hi, num_proc=num_proc)
37 |         logger.info(f"Filtered dataset to token range [{lo}, {hi}]")
38 | 
39 |     return dataset
40 | 
41 | 
42 | def sample_dataset(dataset, n_samples: int, seed: int = 42):
43 |     """
44 |     Samples a dataset randomly if it has more than `n_samples`.
45 | 
46 |     :param dataset: Dataset to sample from.
47 |     :param n_samples: Number of samples to retain.
48 |     :param seed: Random seed for reproducibility.
49 |     :return: Sampled dataset.
50 |     """
51 |     total_samples = len(dataset)
52 | 
53 |     if total_samples <= n_samples:
54 |         return dataset
55 | 
56 |     random.seed(seed)
57 |     random_indices = random.sample(range(total_samples), n_samples)
58 |     return_dataset = dataset.select(random_indices)
59 |     logger.success(f"Sampled dataset down to {len(return_dataset)} samples")
60 | 
61 |     return return_dataset
62 | 
63 | 
64 | def save_dataset(data, file_path: str):
65 |     """
66 |     Saves a dataset in JSON or JSONL format based on the file extension.
67 | 
68 |     :param data: Dataset to save.
69 |     :param file_path: Path where the dataset should be saved.
70 |     """
71 |     os.makedirs(os.path.dirname(file_path), exist_ok=True)
72 | 
73 |     # Convert to a list of dictionaries
74 |     data = data.to_list()
75 | 
76 |     if file_path.endswith(".jsonl"):
77 |         # Save as JSONL (one JSON object per line)
78 |         with open(file_path, "w", encoding="utf-8") as f:
79 |             for entry in data:
80 |                 f.write(json.dumps(entry, ensure_ascii=False) + "\n")
81 |         logger.info(f"Saved dataset to {file_path} in JSONL format")
82 |     elif file_path.endswith(".json"):
83 |         # Save as a JSON array
84 |         with open(file_path, "w", encoding="utf-8") as f:
85 |             json.dump(data, f, ensure_ascii=False, indent=4)
86 |         logger.info(f"Saved dataset to {file_path} in JSON format")
87 |     else:
88 |         logger.error("Unsupported file extension. Use '.json' or '.jsonl'.")
89 |         raise ValueError("Unsupported file extension. Use '.json' or '.jsonl'.")
90 | 
91 | 
92 | def load_json_files(folder_path: str) -> pd.DataFrame:
93 |     """
94 |     Loads JSON files from a folder into a Pandas DataFrame.
95 | 
96 |     :param folder_path: Path to the folder containing JSON files.
97 |     :return: DataFrame containing all loaded data.
98 |     """
99 |     all_data = []
100 |     folder = Path(folder_path)
101 | 
102 |     # Iterate over JSON files one directory level down (e.g. results/<task>/<hw_type>/*.json)
103 |     for file_path in folder.glob("*/*.json"):
104 |         try:
105 |             with file_path.open("r", encoding="utf-8") as f:
106 |                 data = json.load(f)
107 | 
108 |             # If data is a list of dicts, extend it
109 |             if isinstance(data, list):
110 |                 all_data.extend(data)
111 |             # If data is a single dict, append it as a row
112 |             elif isinstance(data, dict):
113 |                 all_data.append(data)
114 |             else:
115 |                 logger.warning(f"Skipping {file_path.name}: Unexpected format")
116 |         except json.JSONDecodeError:
117 |             logger.error(f"Skipping {file_path.name}: Invalid JSON")
118 | 
119 |     logger.info(f"Loaded {len(all_data)} entries from {folder_path}")
120 |     return pd.DataFrame(all_data)
121 | 
122 | 
123 | def pil_to_base64(image: Image.Image, format: str = "PNG", modality: str = "image") -> str:
124 |     """
125 |     Converts a PIL image to a base64-encoded data URI.
126 | 
127 |     :param image: PIL Image object
128 |     :param format: Image format (e.g., "PNG", "JPEG")
129 |     :param modality: MIME type category (default: "image")
130 |     :return: Base64-encoded data URI
131 |     """
132 |     buffered = BytesIO()
133 |     image.save(buffered, format=format)
134 |     base64_encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")
135 |     mimetype = f"{modality}/{format.lower()}"
136 |     return f"data:{mimetype};base64,{base64_encoded}"
137 | 
--------------------------------------------------------------------------------
/templates/classification-analysis.js.j2:
--------------------------------------------------------------------------------
1 | import {check, fail} from 'k6';
2 | import http from 'k6/http';
3 | import {scenario} from 'k6/execution';
4 | import {Trend, Counter} from 'k6/metrics';
5 | import {textSummary} from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
6 | 
7 | const host = "{{ host }}";
8 | const apiKey = __ENV.HF_TOKEN;
9 | const dataset = JSON.parse(open("{{ data_file }}"));
10 | const filePath = `{{ results_file }}`;
11 | const hwType = "{{ hw_type }}";
12 | const batchSize = {{ batch_size }};
13 | const image = "{{ image }}";
14 | const engine = "{{ engine }}";
15 | const vendor = "{{ vendor }}";
16 | 
17 | // Metrics definition
18 | const responseLatency = new Trend('response_latency', true);
19 | const requestThroughput = new Counter('request_throughput');
20 | const accuracyCount = new Counter('accuracy_count');
21 | const totalRequests = new Trend('total_requests', true);
22 | 
23 | 
24 | export function generatePayload(example) {
25 |     return {
26 |         "input": [example.{{ text_column }}]
27 |     };
28 | }
29 | 
30 | export const options = {
31 |     scenarios: {
32 |         shared_load_test: {
33 |             executor: 'shared-iterations',
34 |             vus: {{ pre_allocated_vus }}, // Number of VUs
35 |             iterations: {{ total_requests }}, // Total number of requests
36 |             maxDuration: '{{ duration }}', // Time limit
37 |         },
38 |     },
39 | };
40 | 
41 | let localRequestCount = 0; // Track within each VU
42 | 
43 | export default function run() {
44 |     const headers = {
45 |         Accept: "application/json",
46 |         Authorization: "Bearer " + apiKey,
47 |         "Content-Type": "application/json",
48 |     };
49 |     const query = dataset[scenario.iterationInTest % dataset.length];
50 |     const payload = JSON.stringify(generatePayload(query));
51 |     const url = `${host}/classify`;
52 |     const params = {
53 |         method: 'POST',
54 |         body: payload,
55 |         headers,
56 |     };
57 | 
58 |     // Sending the request
59 |     const startTime = Date.now();
60 |     const res = http.post(url, payload, params);
61 |     const endTime = Date.now();
62 |     const deltaMs = endTime - startTime; // Duration in milliseconds
63 | 
64 |     // Add response latency
65 |     responseLatency.add(deltaMs);
66 |     localRequestCount += 1;
67 |     requestThroughput.add(1); // Still use k6 counter
68 | 
69 |     // Check the response
70 |     const passedCheck = check(res, {
71 |         'http_200': (r) => r.status === 200,
72 |     });
73 | 
74 |     if (!passedCheck) {
75 |         fail('Request failed with status ' + res.status);
76 |         return;
77 |     }
78 | 
79 |     // Process the response
80 |     let tokenCount = 0;
81 |     let isCorrectClassification = false;
82 |     try {
83 |         const data = JSON.parse(res.body);
84 |         if (data.object === 'classify' && data.data && data.data[0] && data.data[0].length > 0) {
85 |             const predictedLabel = data.data[0][0].label;
86 |             if (query.hasOwnProperty('label_text')) {
87 |                 if (predictedLabel === query.label_text) {
88 |                     accuracyCount.add(1);
89 |                     isCorrectClassification = true;
90 |                 }
91 |             }
92 | 
93 |             // Add token count to tokens throughput
94 |             if (data.usage && data.usage.total_tokens) {
95 |                 tokenCount = data.usage.total_tokens;
96 |             }
97 |         }
98 |     } catch (e) {
99 |         fail('Failed to parse response body: ' + e);
100 |     }
101 | 
102 |     // Record this VU's running request count (k6 Counter values can't be read back here)
103 |     totalRequests.add(localRequestCount);
104 | }
105 | 
106 | // Writing Summary to a JSON File
107 | export function handleSummary(data) {
108 |     // Extract required metrics
109 |     const totalRequests = data.metrics.iterations.values.count; // Total requests
110 |     const failedRequests = data.metrics.http_req_failed.values.count || 0;
111 |     const successfulRequests = totalRequests - failedRequests;
112 |     const avgLatency = data.metrics.response_latency.values.avg; // Average latency in ms
113 |     const p95Latency = data.metrics.response_latency.values['p(95)']; // 95th percentile latency in ms
114 |     const accuracyPercentage = (data.metrics.accuracy_count.values.count / totalRequests) * 100; // Accuracy %
115 |     const testDuration = data.state.testRunDurationMs / 1000; // Convert from ms to seconds
116 |     const throughput = data.metrics.iterations.values.rate; // Requests per second
117 | 
118 |     // Construct the summary object with units in keys
119 |     const summary = {
120 |         total_requests: totalRequests,
121 |         test_duration_sec: testDuration, // Seconds
122 |         successful_requests: successfulRequests,
123 |         avg_latency_ms: avgLatency, // Milliseconds
124 |         p95_latency_ms: p95Latency, // Milliseconds
125 |         accuracy_percentage: accuracyPercentage, // Percentage
126 |         throughput_req_per_sec: throughput, // Requests per second
127 |         hw_type: hwType,
128 |         batch_size: batchSize,
129 |         image: image,
130 |         engine: engine,
131 |         vendor: vendor,
132 |         vus: {{ pre_allocated_vus }}
133 |     };
134 | 
135 |     // Write the summary to a JSON file
136 |     return {
137 |         [filePath]: JSON.stringify(summary, null, 2), // Save JSON output
138 |         stdout: textSummary(data, { indent: ' ', enableColors: true }), // Show summary in console
139 |     };
140 | }
141 | 
--------------------------------------------------------------------------------
/templates/embedding-analysis.js.j2:
--------------------------------------------------------------------------------
1 | import {check, fail} from 'k6';
2 | import http from 'k6/http';
3 | import {scenario} from 'k6/execution';
4 | import {Trend, Counter} from 'k6/metrics';
5 | import {textSummary} from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
6 | 
7 | const host = "{{ host }}";
8 | const apiKey = __ENV.HF_TOKEN;
9 | const dataset = JSON.parse(open("{{ data_file }}"));
10 | const filePath = `{{ results_file }}`;
11 | const hwType = "{{ hw_type }}";
12 | const batchSize = {{ batch_size }};
13 | const image = "{{ image }}";
14 | const engine = "{{ engine }}";
15 | const vendor = "{{ vendor }}";
16 | 
17 | // Metrics definition
18 | const responseLatency = new Trend('response_latency', true);
19 | const requestThroughput = new Counter('request_throughput');
20 | const totalRequests = new Trend('total_requests', true);
21 | const embeddingSize = new Trend('embedding_size', true);
22 | 
23 | export function generatePayload(example) {
24 |     return {
25 |         "input": [example.{{ text_column }}]
26 |     };
27 | }
28 | 
29 | export const options = {
30 |     scenarios: {
31 |         shared_load_test: {
32 |             executor: 'shared-iterations',
33 |             vus: {{ pre_allocated_vus }}, // Number of VUs
34 |             iterations: {{ total_requests }}, // Total number of requests
35 |             maxDuration: '{{ duration }}', // Time limit
36 |         },
37 |     },
38 | };
39 | 
40 | let localRequestCount = 0; // Track within each VU
41 | 
42 | export default function run() {
43 |     const headers = {
44 |         Accept: "application/json",
45 |         Authorization: "Bearer " + apiKey,
46 |         "Content-Type": "application/json",
47 |     };
48 |     const query = dataset[scenario.iterationInTest % dataset.length];
49 |     const payload = JSON.stringify(generatePayload(query));
50 |     const url = `${host}/embeddings`;
51 |     const params = {
52 |         method: 'POST',
53 |         body: payload,
54 |         headers,
55 |     };
56 | 
57 |     // Sending the request
58 |     const startTime = Date.now();
59 |     const res = http.post(url, payload, params);
60 |     const endTime = Date.now();
61 |     const deltaMs = endTime - startTime; // Duration in milliseconds
62 | 
63 |     // Add response latency
64 |     responseLatency.add(deltaMs);
65 |     localRequestCount += 1;
66 |     requestThroughput.add(1); // Still use k6 counter
67 | 
68 |     // Check the response
69 |     const passedCheck = check(res, {
70 |         'http_200': (r) => r.status === 200,
71 |         'valid_embedding': (r) => {
72 |             try {
73 |                 const data = JSON.parse(r.body);
74 |                 return data.object === 'list' && data.data && data.data.length > 0 && Array.isArray(data.data[0].embedding);
75 |             } catch (e) {
76 |                 return false;
77 |             }
78 |         }
79 |     });
80 | 
81 |     if (!passedCheck) {
82 |         fail('Request failed with status ' + res.status + ' or invalid embedding format');
83 |         return;
84 |     }
85 | 
86 |     // Process the response
87 |     try {
88 |         const data = JSON.parse(res.body);
89 |         if (data.object === 'list' && data.data && data.data[0] && data.data[0].embedding) {
90 |             embeddingSize.add(data.data[0].embedding.length);
91 |         }
92 |     } catch (e) {
93 |         fail('Failed to parse response body: ' + e);
94 |     }
95 | 
96 |     // Add custom metrics
97 |     totalRequests.add(localRequestCount);
98 | }
99 | 
100 | // Writing Summary to a JSON File
101 | export function handleSummary(data) {
102 |     // Extract required metrics
103 |     const totalRequests = data.metrics.iterations.values.count; // Total requests
104 |     const failedRequests = data.metrics.http_req_failed.values.count || 0;
105 |     const successfulRequests = totalRequests - failedRequests;
106 |     const avgLatency = data.metrics.response_latency.values.avg; // Average latency in ms
107 |     const p95Latency = data.metrics.response_latency.values['p(95)']; // 95th percentile latency in ms
108 |     const testDuration = data.state.testRunDurationMs / 1000; // Convert from ms to seconds
109 |     const throughput = data.metrics.iterations.values.rate; // Requests per second
110 |     const avgEmbeddingSize = data.metrics.embedding_size.values.avg || 0; // Average embedding size
111 | 
112 |     // Construct the summary object with units in keys
113 |     const summary = {
114 |         total_requests: totalRequests,
115 |         test_duration_sec: testDuration, // Seconds
116 |         successful_requests: successfulRequests,
117 |         avg_latency_ms: avgLatency, // Milliseconds
118 |         p95_latency_ms: p95Latency, // Milliseconds
119 |         throughput_req_per_sec: throughput, // Requests per second
120 |         avg_embedding_size: avgEmbeddingSize, // Average embedding size
121 |         hw_type: hwType,
122 |         batch_size: batchSize,
123 |         image: image,
124 |         engine: engine,
125 |         vendor: vendor,
126 |         vus: {{ pre_allocated_vus }}
127 |     };
128 | 
129 |     // Write the summary to a JSON file
130 |     return {
131 |         [filePath]: JSON.stringify(summary, null, 2), // Save JSON output
132 |         stdout: textSummary(data, { indent: ' ', enableColors: true }), // Show summary in console
133 |     };
134 | }
135 | 
--------------------------------------------------------------------------------
/templates/vision-embedding-analysis.js.j2:
--------------------------------------------------------------------------------
1 | import {check, fail} from 'k6';
2 | import http from 'k6/http';
3 | import {scenario} from 'k6/execution';
4 | import {Trend, Counter} from 'k6/metrics';
5 | import {textSummary} from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
6 | import { SharedArray } from 'k6/data';
7 | 
8 | const host = "{{ host }}";
9 | const apiKey = __ENV.HF_TOKEN;
10 | 
11 | // Load dataset once using SharedArray to avoid copies across VUs
12 | const dataset = new SharedArray("dataset", function () {
13 |     return open("{{ data_file }}")
14 |         .split('\n')
15 |         .filter(line => line.trim() !== "") // Remove empty lines
16 |         .map(line => {
17 |             try {
18 |                 return JSON.parse(line);
19 |             } catch (e) {
20 |                 console.log(`Skipping invalid JSON line: ${line}`);
21 |                 return null;
22 |             }
23 |         })
24 |         .filter(entry => entry !== null); // Remove null values from failed JSON parsing
25 | });
26 | 
27 | const filePath = `{{ results_file }}`;
28 | const hwType = "{{ hw_type }}";
29 | const batchSize = {{ batch_size }};
30 | const image = "{{ image }}";
31 | const engine = "{{ engine }}";
32 | const vendor = "{{ vendor }}";
33 | 
34 | // Metrics definition
35 | const responseLatency = new Trend('response_latency', true);
36 | const requestThroughput = new Counter('request_throughput');
37 | const numVectorsTrend = new Trend('num_vectors', true);
38 | const invalidEmbeddingsCounter = new Counter('invalid_embeddings');
39 | 
40 | export function generatePayload(example) {
41 |     return {
42 |         "input": [example.{{ text_column }}_b64], // I just kept the same naming, sorry :'(
43 |         "encoding_format": "float",
44 |         "modality": "image"
45 |     };
46 | }
47 | 
48 | export const options = {
49 |     scenarios: {
50 |         shared_load_test: {
51 |             executor: 'shared-iterations',
52 |             vus: {{ pre_allocated_vus }}, // Number of VUs
53 |             iterations: {{ total_requests }}, // Total number of requests
54 |             maxDuration: '{{ duration }}', // Time limit
55 |         },
56 |     },
57 | };
58 | 
59 | export default function run() {
60 |     const headers = {
61 |         Accept: "application/json",
62 |         Authorization: "Bearer " + apiKey,
63 |         "Content-Type": "application/json",
64 |     };
65 | 
66 |     const query = dataset[scenario.iterationInTest % dataset.length];
67 |     const payload = JSON.stringify(generatePayload(query));
68 |     const url = `${host}/embeddings`;
69 | 
70 |     const startTime = Date.now();
71 |     const res = http.post(url, payload, { headers });
72 |     const endTime = Date.now();
73 |     responseLatency.add(endTime - startTime);
74 |     requestThroughput.add(1);
75 | 
76 |     // Check the response
77 |     const passedCheck = check(res, {
78 |         'http_200': (r) => r.status === 200,
79 |         'valid_embedding': (r) => {
80 |             try {
81 |                 const data = JSON.parse(r.body);
82 |                 const isValid = data.object === 'list' && data.data && data.data.length > 0 &&
83 |                     Array.isArray(data.data[0].embedding) &&
84 |                     data.data[0].embedding.every(vec => Array.isArray(vec) && vec.length === 128);
85 |                 if (!isValid) invalidEmbeddingsCounter.add(1);
86 |                 return isValid;
87 |             } catch (e) {
88 |                 return false;
89 |             }
90 |         }
91 |     });
92 | 
93 |     if (!passedCheck) {
94 |         fail('Request failed with status ' + res.status + ' or invalid embedding format');
95 |         return;
96 |     }
97 | 
98 |     // Process the response
99 |     try {
100 |         const data = JSON.parse(res.body);
101 |         if (data.object === 'list' && Array.isArray(data.data) && data.data.length === 1) {
102 |             let numVectors = data.data[0].embedding.length;
103 |             numVectorsTrend.add(numVectors);
104 |         } else {
105 |             fail("Response format incorrect or missing required fields.");
106 |         }
107 |     } catch (e) {
108 |         fail("Failed to parse response body: " + e);
109 |     }
110 | }
111 | 
112 | // Write the summary to a JSON file
113 | export function handleSummary(data) {
114 |     const invalidEmbeddings = data.metrics.invalid_embeddings ? data.metrics.invalid_embeddings.values.count : 0;
115 | 
116 |     const summary = {
117 |         total_requests: data.metrics.iterations.values.count,
118 |         test_duration_sec: data.state.testRunDurationMs / 1000,
119 |         successful_requests: data.metrics.iterations.values.count - (data.metrics.http_req_failed.values.count || 0),
120 |         avg_latency_ms: data.metrics.response_latency.values.avg,
121 |         p95_latency_ms: data.metrics.response_latency.values['p(95)'],
122 |         throughput_req_per_sec: data.metrics.iterations.values.rate,
123 |         avg_num_vectors: data.metrics.num_vectors.values.avg || 0,
124 |         min_num_vectors: data.metrics.num_vectors.values.min || 0,
125 |         max_num_vectors: data.metrics.num_vectors.values.max || 0,
126 |         invalid_embeddings: invalidEmbeddings,
127 |         hw_type: hwType,
128 |         batch_size: batchSize,
129 |         image: image,
130 |         engine: engine,
131 |         vendor: vendor,
132 |         vus: {{ pre_allocated_vus }}
133 |     };
134 | 
135 |     return {
136 |         [filePath]: JSON.stringify(summary, null, 2), // Save JSON output
137 |         stdout: textSummary(data, { indent: ' ', enableColors: true }), // Show summary in console
138 |     };
139 | }
140 | 
--------------------------------------------------------------------------------
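
The `{{ text_column }}_b64` field the vision template reads is produced from the raw dataset with the helpers in `src/process_dataset.py`. A minimal sketch of that prep step — the file path and sample count are illustrative, and the column names follow the README's dataset choice (`image` → `image_b64`):

```python
from datasets import load_dataset

from src.process_dataset import pil_to_base64, sample_dataset, save_dataset

# Build the JSONL the vision template consumes: one JSON object per line,
# with each image encoded as a base64 data URI under "image_b64".
ds = load_dataset("openbmb/RLAIF-V-Dataset", split="train")
ds = sample_dataset(ds, n_samples=1_000)
ds = ds.map(
    lambda ex: {"image_b64": pil_to_base64(ex["image"], format="JPEG")},
    remove_columns=["image"],  # drop the PIL object so every row stays JSON-serializable
)
save_dataset(ds, "data/vision-embedding.jsonl")
```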