├── .gitignore
├── LICENSE
├── README.md
├── classification-analysis-gradio.ipynb
├── classification-analysis.ipynb
├── classification-optimization.ipynb
├── data
│   └── .gitkeep
├── embedding-analysis-gradio.ipynb
├── embedding-analysis.ipynb
├── embedding-optimization.ipynb
├── generated
│   └── .gitkeep
├── media
│   ├── classification-accuracy.png
│   ├── classification-contour.png
│   ├── classification-image-cost.png
│   ├── classification-latency.png
│   ├── classification-token-distribution.png
│   ├── embedding-contour.png
│   ├── embedding-latency.png
│   ├── embedding-token-distribution.png
│   ├── vision-embedding-contour.png
│   ├── vision-embedding-file-size.png
│   ├── vision-embedding-image-dimensions.png
│   └── vision-embedding-latency.png
├── requirements.txt
├── results
│   └── .gitkeep
├── src
│   ├── deployment.py
│   ├── k6.py
│   └── process_dataset.py
├── templates
│   ├── classification-analysis.js.j2
│   ├── embedding-analysis.js.j2
│   └── vision-embedding-analysis.js.j2
├── vision-embedding-analysis-gradio.ipynb
├── vision-embedding-analysis.ipynb
└── vision-embedding-optimization.ipynb

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea/
2 | *.ipynb_checkpoints*
3 | results/*
4 | generated/*
5 | data/*
6 | aggregated_results.csv
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2023 Derek Thomas
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
9 | [![Contributors][contributors-shield]][contributors-url]
10 | [![Stargazers][stars-shield]][stars-url]
11 | [![Issues][issues-shield]][issues-url]
12 | [![MIT License][license-shield]][license-url]
13 | 
14 | # Table of Contents
15 | 
16 | - [Introduction](#introduction)
17 | - [Installation](#installation)
18 | - [Getting Started](#getting-started)
19 | - [Project Structure](#project-structure)
20 | - [How does it work?](#how-does-it-work)
21 | - [Results](#results)
22 | - [References and Links](#references-and-links)
23 | 
24 | # Introduction
25 | 
26 | This repository supports a blog post that helps users estimate costs for large-scale classification, embedding, or
27 | vision embedding tasks. 
It provides benchmarking tools for different GPU types, batch sizes, and inference methods,
28 | using [michaelfeil/infinity](https://github.com/michaelfeil/infinity/)
29 | and [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated).
30 | 
31 | I considered a variety of factors:
32 | 
33 | - GPU type
34 | - Infinity image type
35 | - Varying batch sizes
36 | - Varying VU (virtual user) counts
37 | - Multiple model architectures
38 | 
39 | # Installation
40 | 
41 | I used ![Python](https://img.shields.io/badge/python-3.12-blue)
42 | 
43 | 1. `git clone https://github.com/datavistics/encoder-analysis.git`
44 | 2. `cd encoder-analysis`
45 | 3. `pip install -r requirements.txt`
46 | 4. `pip install jupyterlab`
47 | 5. [Install k6](https://grafana.com/docs/k6/latest/set-up/install-k6/#install-k6) based on your platform
48 | 
49 | ## Getting Started
50 | Make sure you have the ability to [deploy an Inference Endpoint](https://endpoints.huggingface.co/new).
51 | 
52 | 1. Run `jupyter lab`
53 | 2. Choose your task: [`classification`, `embedding`, `vision-embedding`]
54 | 3. Run the matching `*-optimization.ipynb` to find the best configuration
55 | 4. Run `*-analysis.ipynb` to visualize the results
56 | 5. Alternatively, run `*-analysis-gradio.ipynb` for more interactive results
57 | 
58 | # Project Structure
59 | 
60 | - There are notebooks in the top level for convenience. It's probably cleaner to put them in `./notebooks`, but it's
61 |   annoying to add that to the path, so I opted for user convenience rather than aesthetics
62 | - **\*-optimization.ipynb** - These were used for generating and conducting the experiments
63 | - **\*-analysis.ipynb** - These show the analysis in a clean notebook-centric way
64 | - **\*-analysis-gradio.ipynb** - These show the analysis in an interactive gradio-centric way
65 | - `src` - I abstracted a fair amount of code here, but tried to keep the important details in the notebooks
66 | - `templates` - the k6 Jinja2 templates used to generate each experiment
67 | - `data`, `generated`, and `results` store non-version-controlled project files
68 | 
69 | # How does it work?
70 | 
71 | Each of the **\*-optimization.ipynb** notebooks follows this structure:
72 | 
73 | ```mermaid
74 | flowchart TD;
75 |     subgraph Benchmarking Server
76 |         A[k6 Load Testing]
77 |         D[Instance Config]
78 |     end
79 | 
80 |     subgraph Inference Endpoint
81 |         C[Container Running Infinity]
82 |         E[Next Inference Endpoint]
83 |     end
84 | 
85 |     D -->|Defines Test Parameters| A
86 |     D -->|Deploys Inference Endpoint| E
87 |     A -->|Sends Test Data| C
88 |     C -->|Processes and Returns| A
89 | ```
90 | 
91 | 1. Define the benchmarking parameters (GPU, batch size, VUs, etc.)
92 | 2. Deploy the inference server (Infinity on Hugging Face Inference Endpoints)
93 | 3. Run k6 performance tests to evaluate speed, cost, and efficiency
94 | 4. Store and visualize results for optimization
95 | 
96 | # Results
97 | 
98 | Do check out these [notebooks](https://nbviewer.org/github/datavistics/encoder-analysis/tree/main/) in nbviewer, as I
99 | put a lot of effort into making sure they are **interactive**. Unfortunately, they look better in light mode because of
100 | the tables.
101 | But follow your heart.
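
To reproduce any of the numbers below end-to-end, the core loop inside each optimization notebook boils down to roughly the following. This is a minimal sketch built on the helpers in `src/` — `instance_config` is constructed in the notebooks, and every literal value here (endpoint name, file paths, request counts) is an illustrative placeholder rather than the notebooks' exact arguments:

```python
from pathlib import Path

from src.deployment import deploy_endpoint
from src.k6 import call_k6, optimal_vus

# Deploy (or re-use) an Infinity endpoint. instance_config is a dataclass whose
# fields are splatted into huggingface_hub.create_inference_endpoint.
endpoint = deploy_endpoint(instance_config, "encoder-bench-l4-bs64", wait=True)

# Everything call_k6 needs except the VU count, which optimal_vus sweeps.
args = dict(
    endpoint=endpoint,
    text_column="text",
    total_requests=10_000,
    template_file="classification-analysis.js.j2",
    output_file=Path("generated/classification-analysis.js").resolve(),
    dataset_path="data/classification.json",
    k6_bin="~/.local/bin/k6",
)

best_vus = optimal_vus(max_vus=512, args_dict=args)  # exponential then binary search
throughput = call_k6(vus=best_vus, **args)           # req/sec at that VU count
```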
102 | 
103 | - [classification-analysis-gradio.ipynb](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/classification-analysis-gradio.ipynb)
104 | - [embedding-analysis-gradio.ipynb](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/embedding-analysis-gradio.ipynb)
105 | - [vision-embedding-analysis-gradio.ipynb](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/vision-embedding-analysis-gradio.ipynb)
106 | 
107 | ## Classification
108 | 
109 | For [lxyuan/distilbert-base-multilingual-cased-sentiments-student](https://huggingface.co/lxyuan/distilbert-base-multilingual-cased-sentiments-student)
110 | on a dataset like [tyqiangz/multilingual-sentiments](https://huggingface.co/datasets/tyqiangz/multilingual-sentiments)
111 | (using the `text` column) we can do 1 Billion classifications for only `$253.82`.
112 | 
113 | | GPU           | Image         | Batch Size | VUs     | Min Cost    |
114 | |---------------|---------------|------------|---------|-------------|
115 | | **nvidia-l4** | **`default`** | **64**     | **448** | **$253.82** |
116 | 
117 | ![classification-results.png](media/classification-contour.png)
118 | [Interactive Version here](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/classification-analysis-gradio.ipynb)
119 | 
120 | ## Embedding
121 | 
122 | For [Alibaba-NLP/gte-modernbert-base](https://huggingface.co/Alibaba-NLP/gte-modernbert-base) on a dataset
123 | like [sentence-transformers/trivia-qa-triplet](https://huggingface.co/datasets/sentence-transformers/trivia-qa-triplet)
124 | (using the `positive` column) we can do 1 Billion embeddings for only `$409.44`.
125 | 
126 | | GPU       | Batch Size | VUs | Min Cost |
127 | |-----------|------------|-----|----------|
128 | | nvidia-l4 | 256        | 32  | $409.44  |
129 | 
130 | ![embedding-results.png](media/embedding-contour.png)
131 | [Interactive Version here](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/embedding-analysis-gradio.ipynb)
132 | 
133 | ## Vision Embedding
134 | 
135 | For [vidore/colqwen2-v1.0-merged](https://huggingface.co/vidore/colqwen2-v1.0-merged) on a dataset
136 | like [openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)
137 | (using the `image` column) we can do 1 Billion ColBERT style embeddings (late interaction) on images for `$44496.51`.
138 | 
139 | | GPU       | Batch Size | VUs | Min Cost  |
140 | |-----------|------------|-----|-----------|
141 | | nvidia-l4 | 4          | 4   | $44496.51 |
142 | 
143 | ![vision-embedding-results.png](media/vision-embedding-contour.png)
144 | [Interactive Version here](https://nbviewer.org/github/datavistics/encoder-analysis/blob/main/vision-embedding-analysis-gradio.ipynb)
145 | 
146 | # References and Links
147 | 
148 | - [Inference Endpoints](https://huggingface.co/inference-endpoints/dedicated)
149 | - [michaelfeil/infinity](https://github.com/michaelfeil/infinity/)
150 | - [Infinity Swagger](https://michaelfeil.eu/infinity/0.0.75/swagger_ui/)
151 | - [k6 Docs](https://grafana.com/docs/k6/latest/)
152 | 
153 | 
154 | 
155 | 
156 | [contributors-shield]: https://img.shields.io/github/contributors/datavistics/encoder-analysis.svg?style=for-the-badge
157 | 
158 | [contributors-url]: https://github.com/datavistics/encoder-analysis/graphs/contributors
159 | 
160 | [stars-shield]: https://img.shields.io/github/stars/datavistics/encoder-analysis.svg?style=for-the-badge
161 | 
162 | [stars-url]: https://github.com/datavistics/encoder-analysis/stargazers
163 | 
164 | [issues-shield]: https://img.shields.io/github/issues/datavistics/encoder-analysis.svg?style=for-the-badge
165 | 
166 | [issues-url]: https://github.com/datavistics/encoder-analysis/issues
167 | 
168 | [license-shield]: https://img.shields.io/github/license/datavistics/encoder-analysis.svg?style=for-the-badge
169 | 
170 | [license-url]: https://github.com/datavistics/encoder-analysis/blob/main/LICENSE
171 | 
--------------------------------------------------------------------------------
/data/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/data/.gitkeep
--------------------------------------------------------------------------------
/generated/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/generated/.gitkeep
--------------------------------------------------------------------------------
/media/classification-accuracy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/classification-accuracy.png
--------------------------------------------------------------------------------
/media/classification-contour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/classification-contour.png
--------------------------------------------------------------------------------
/media/classification-image-cost.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/classification-image-cost.png
--------------------------------------------------------------------------------
/media/classification-latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/classification-latency.png
--------------------------------------------------------------------------------
/media/classification-token-distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/classification-token-distribution.png
--------------------------------------------------------------------------------
/media/embedding-contour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/embedding-contour.png
--------------------------------------------------------------------------------
/media/embedding-latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/embedding-latency.png
--------------------------------------------------------------------------------
/media/embedding-token-distribution.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/embedding-token-distribution.png
--------------------------------------------------------------------------------
/media/vision-embedding-contour.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/vision-embedding-contour.png
--------------------------------------------------------------------------------
/media/vision-embedding-file-size.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/vision-embedding-file-size.png
--------------------------------------------------------------------------------
/media/vision-embedding-image-dimensions.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/vision-embedding-image-dimensions.png
--------------------------------------------------------------------------------
/media/vision-embedding-latency.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/media/vision-embedding-latency.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | plotly==5.24.1
2 | scipy==1.15.1
3 | huggingface-hub==0.27.1
4 | transformers==4.48.0
5 | pandas==2.2.3
6 | loguru==0.7.3
7 | Jinja2==3.1.5
8 | gradio==5.15.0
9 | datasets==3.2.0
10 | ipywidgets==8.1.5
11 | matplotlib==3.10.0
--------------------------------------------------------------------------------
/results/.gitkeep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/datavistics/encoder-analysis/430b433ba259f5cfc4dd957fb45f1e6c5379c1d7/results/.gitkeep
--------------------------------------------------------------------------------
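Note: the `instance_config` object handed to `deploy_endpoint` in `src/deployment.py` below is built inside the optimization notebooks, so its definition does not appear in this dump. A plausible shape is sketched here for orientation only — the field names mirror keyword arguments of `huggingface_hub.create_inference_endpoint` (which receives them via `asdict`), and every literal value is an assumption:

```python
from dataclasses import dataclass, field
from typing import Any, Dict


@dataclass
class InstanceConfig:
    # Splatted into create_inference_endpoint via asdict() in deploy_endpoint.
    repository: str = "lxyuan/distilbert-base-multilingual-cased-sentiments-student"
    vendor: str = "aws"
    region: str = "us-east-1"
    accelerator: str = "gpu"
    instance_size: str = "x1"
    instance_type: str = "nvidia-l4"
    custom_image: Dict[str, Any] = field(default_factory=lambda: {
        "health_route": "/health",
        "url": "michaelf34/infinity:0.0.75",
        # These env vars are read back off the endpoint by deployment.py and k6.py.
        "env": {"INFINITY_BATCH_SIZE": "64", "INFINITY_ENGINE": "torch"},
    })
```
--------------------------------------------------------------------------------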
/src/deployment.py:
--------------------------------------------------------------------------------
1 | import time
2 | from dataclasses import asdict
3 | from typing import Any, Dict
4 | 
5 | from huggingface_hub import create_inference_endpoint, get_inference_endpoint, whoami
6 | from loguru import logger
7 | 
8 | 
9 | def deploy_endpoint(
10 |         instance_config: Dict[str, Any],
11 |         endpoint_name: str,
12 |         wait: bool = False
13 | ):
14 |     """Creates and deploys an inference endpoint with the given configuration.
15 | 
16 |     Args:
17 |         instance_config (Dict[str, Any]): Configuration for the endpoint.
18 |         endpoint_name (str): Name of the endpoint.
19 |         wait (bool, optional): Whether to wait for deployment. Defaults to False.
20 | 
21 |     Returns:
22 |         Any: The endpoint object or None if creation fails.
23 |     """
24 | 
25 |     # Try to re-use an existing endpoint
26 |     namespace = whoami()['name']
27 |     try:
28 |         endpoint = get_inference_endpoint(endpoint_name, namespace=namespace)
29 |         hw_type = endpoint.__dict__['raw']['compute']['instanceType']
30 |         batch_size = endpoint.__dict__['raw']['model']['env']['INFINITY_BATCH_SIZE']
31 |         logger.success(f"Re-using Endpoint: hw={hw_type}\tbs={batch_size}\t")
32 |         return endpoint
33 |     except Exception as e:
34 |         logger.warning(f"Endpoint not found. Proceeding with creation: {e}")
35 | 
36 |     # If that doesn't work, try to create one
37 |     try:
38 |         logger.info("Creating inference endpoint...")
39 |         start_time = time.time()  # Record start time
40 |         endpoint = create_inference_endpoint(
41 |             endpoint_name,
42 |             namespace=namespace,
43 |             framework="pytorch",
44 |             task='text-classification',
45 |             min_replica=0,
46 |             max_replica=1,
47 |             scale_to_zero_timeout=300,
48 |             type="protected",
49 |             **asdict(instance_config)
50 |         )
51 |     except Exception as e:
52 |         logger.error(f"Failed to create inference endpoint: {e}")
53 |         return None
54 | 
55 |     if not wait:
56 |         logger.info("Endpoint creation started; not waiting for it to become ready.")
57 |         return endpoint
58 | 
59 |     logger.info("Waiting for endpoint to be ready...")
60 |     endpoint.wait()  # Block until the endpoint is ready
61 | 
62 |     # Calculate elapsed time
63 |     elapsed_time = time.time() - start_time
64 |     elapsed_minutes, elapsed_seconds = divmod(elapsed_time, 60)
65 | 
66 |     hw_type = endpoint.__dict__['raw']['compute']['instanceType']
67 |     batch_size = endpoint.__dict__['raw']['model']['env']['INFINITY_BATCH_SIZE']
68 | 
69 |     logger.success(
70 |         f"Endpoint created successfully: hw={hw_type}\tbs={batch_size}\t"
71 |         f"Time taken: {int(elapsed_minutes)}m {elapsed_seconds:.2f}s"
72 |     )
73 |     return endpoint
74 | 
--------------------------------------------------------------------------------
/src/k6.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import subprocess
4 | from pathlib import Path
5 | from typing import Any, Dict
6 | 
7 | from huggingface_hub import InferenceEndpoint, get_token
8 | from jinja2 import Environment, FileSystemLoader
9 | from loguru import logger
10 | 
11 | template_dir = "./templates"
12 | template_file = "classification-analysis.js.j2"
13 | 
14 | output_file = Path("./generated").resolve() / "classification-analysis.js"
15 | 
16 | # I hardcoded this because for most people this is all that will be tested. Do send me a PR if you need more.
17 | image_dict = {
18 |     'michaelf34/infinity:0.0.75-trt-onnx': 'trt-onnx',
19 |     'michaelf34/infinity:0.0.75': 'default',
20 | }
21 | 
22 | 
23 | def call_k6(
24 |         endpoint: InferenceEndpoint,
25 |         text_column: str,
26 |         vus: int,
27 |         total_requests: int,
28 |         template_file: str,
29 |         output_file: Path,
30 |         dataset_path: str,
31 |         k6_bin: str
32 | ) -> float:
33 |     """
34 |     Runs a k6 performance test on a given endpoint using a specified template.
35 | 
36 |     Args:
37 |         endpoint: Endpoint object containing model and compute metadata.
38 |         text_column (str): Name of the text column in the dataset.
39 |         vus (int): Number of virtual users for k6 load testing.
40 |         total_requests (int): Total number of requests to simulate.
41 |         template_file (str): Jinja2 template file used for generating test scripts.
42 |         output_file (Path): Path to the generated JavaScript test script.
43 |         dataset_path (str): Path to the dataset file.
44 |         k6_bin (str): Path to the k6 binary.
45 | 
46 |     Returns:
47 |         float: The measured throughput in requests per second.
48 |     """
49 |     # Determine the task type based on the template file name
50 |     if 'classification' in template_file:
51 |         task = 'classification'
52 |     elif 'vision-embedding' in template_file:
53 |         task = 'vision-embedding'
54 |     elif 'embedding' in template_file:
55 |         task = 'embedding'
56 |     else:
57 |         raise ValueError('Unknown task type in template file')
58 | 
59 |     # Load Jinja2 template for script generation
60 |     env = Environment(loader=FileSystemLoader(template_dir))
61 |     template = env.get_template(template_file)
62 | 
63 |     # Extract relevant metadata from the endpoint
64 |     image = endpoint.__dict__['raw']['model']['image']['custom']['url']
65 |     image_short = image_dict.get(image, 'other_image')
66 |     hw_type = endpoint.__dict__['raw']['compute']['instanceType']
67 |     vendor = endpoint.__dict__['raw']['provider']['vendor']
68 |     batch_size = endpoint.__dict__['raw']['model']['env']['INFINITY_BATCH_SIZE']
69 |     engine = endpoint.__dict__['raw']['model']['env']['INFINITY_ENGINE']
70 | 
71 |     results_file = Path(
72 |         "./results").resolve() / task / f'{hw_type}' / f'{vendor}_{hw_type}_{image_short}_{engine}_{batch_size}_{vus}.json'
73 | 
74 |     if results_file.exists():
75 |         logger.info(f"Results file {results_file} already exists. Loading existing data.")
76 |         with open(results_file) as f:
77 |             data = json.load(f)
78 |         return data.get("throughput_req_per_sec", 0)
79 | 
80 |     # Render the test script from the template
81 |     rendered_script = template.render(
82 |         text_column=text_column,
83 |         host=endpoint.url,
84 |         data_file=str(Path(dataset_path).resolve()),
85 |         results_file=str(results_file),
86 |         pre_allocated_vus=vus,
87 |         total_requests=total_requests,
88 |         hw_type=hw_type,
89 |         batch_size=batch_size,
90 |         vendor=vendor,
91 |         image=image,
92 |         engine=engine,
93 |         duration="1m"
94 |     )
95 | 
96 |     # Ensure necessary directories exist
97 |     os.makedirs(os.path.dirname(output_file), exist_ok=True)
98 |     os.makedirs(os.path.dirname(results_file), exist_ok=True)
99 | 
100 |     # Write the generated script to file
101 |     with open(str(output_file), "w") as f:
102 |         f.write(rendered_script)
103 | 
104 |     # Execute the k6 load test
105 |     logger.info(f"Running k6 test with {vus} VUs")
106 |     K6_BIN = os.path.expanduser(k6_bin)
107 |     process = subprocess.run(
108 |         [K6_BIN, "run", str(output_file)],
109 |         env={'HF_TOKEN': get_token(), **os.environ},
110 |         capture_output=True,
111 |         text=True
112 |     )
113 |     if process.returncode != 0:
114 |         logger.error(f"k6 exited with code {process.returncode}: {process.stderr}")
115 |     logger.info(f"Expecting results at {results_file}")
116 | 
117 |     # Load and return the throughput result
118 |     try:
119 |         with open(results_file) as f:
120 |             data = json.load(f)
121 |         return data.get("throughput_req_per_sec", 0)
122 |     except Exception as e:
123 |         logger.error(f"Failed to read results file: {e}")
124 |         return 0
125 | 
126 | 
127 | def optimal_vus(
128 |         max_vus: int,
129 |         args_dict: Dict[str, Any],
130 |         start_vus: int = 1
131 | ) -> int:
132 |     """
133 |     Finds the optimal number of virtual users (VUs) for maximum throughput.
134 | 
135 |     Args:
136 |         max_vus (int): Maximum number of virtual users to test.
137 |         args_dict (dict): Dictionary of arguments to pass to `call_k6`.
138 |         start_vus (int): Initial number of VUs for testing (default: 1).
139 | 
140 |     Returns:
141 |         int: Optimal number of VUs.
142 |     """
143 |     vus = start_vus
144 |     prev_throughput = 0
145 |     vus_history = []
146 | 
147 |     logger.info("Starting exponential search for optimal VUs")
148 |     while vus <= max_vus:
149 |         logger.info(f"Testing with {vus} VUs")
150 |         throughput = call_k6(vus=vus, **args_dict)
151 |         vus_history.append((vus, throughput))
152 |         logger.info(f"Throughput for {vus} VUs: {throughput:.2f} req/sec")
153 | 
154 |         if throughput < prev_throughput * 1.02:  # Stop if improvement is <2%
155 |             logger.info("Throughput improvement is less than 2%, stopping search.")
156 |             break
157 | 
158 |         prev_throughput = throughput
159 |         vus *= 2  # Double the VUs
160 | 
161 |     if vus > max_vus:
162 |         logger.info(f"Reached maximum VU limit: {max_vus}")
163 | 
164 |     # Guard: binary search needs at least two measurements
165 |     if len(vus_history) < 2:
166 |         return vus_history[0][0] if vus_history else start_vus
167 | 
168 |     # Binary search refinement between the last two tested VU counts
169 |     logger.info("Starting binary search refinement")
170 |     low, high = vus_history[-2][0], vus_history[-1][0]
171 |     while low < high:
172 |         mid = (low + high) // 2
173 |         logger.info(f"Testing with {mid} VUs")
174 |         throughput = call_k6(vus=mid, **args_dict)
175 | 
176 |         if throughput > prev_throughput:
177 |             logger.info(f"Throughput improved to {throughput:.2f} req/sec with {mid} VUs")
178 |             prev_throughput = throughput
179 |             low = mid + 1
180 |         else:
181 |             logger.info(f"Throughput did not improve with {mid} VUs")
182 |             high = mid - 1
183 | 
184 |     best_vus = low
185 |     logger.info(f"Optimal VUs determined: {best_vus}")
186 |     return best_vus
187 | 
--------------------------------------------------------------------------------
/src/process_dataset.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import json
3 | import os
4 | import random
5 | from io import BytesIO
6 | from pathlib import Path
7 | 
8 | import pandas as pd
9 | from PIL import Image
10 | from datasets import Dataset
11 | from loguru import logger
12 | 
13 | 
14 | def tokenize_and_filter(dataset: Dataset, tokenizer, text_column: str, min_tokens: int = None, max_tokens: int = None,
15 |                         num_proc: int = 8):
16 |     """
17 |     Tokenizes a dataset, adds a `num_tokens` column, and filters based on token length constraints.
18 | 
19 |     :param dataset: Dataset to tokenize and filter.
20 |     :param tokenizer: Tokenizer object with an `encode` method.
21 |     :param text_column: Column name containing text data.
22 |     :param min_tokens: Minimum number of tokens for filtering (optional).
23 |     :param max_tokens: Maximum number of tokens for filtering (optional).
24 |     :param num_proc: Number of processes for parallel execution.
25 |     :return: Filtered dataset with token counts.
26 |     """
27 |     logger.info("Tokenizing dataset and applying token count filter")
28 | 
29 |     dataset = dataset.map(
30 |         lambda example: {"num_tokens": len(tokenizer.encode(example[text_column]))},
31 |         num_proc=num_proc,
32 |     )
33 | 
34 |     if min_tokens is not None or max_tokens is not None:  # either bound alone is enough
35 |         lo, hi = (min_tokens or 0), (max_tokens if max_tokens is not None else float("inf"))
36 |         dataset = dataset.filter(lambda x: lo <= x["num_tokens"] <= hi, num_proc=num_proc)
37 |         logger.info(f"Filtered dataset to token range [{lo}, {hi}]")
38 | 
39 |     return dataset
40 | 
41 | 
42 | def sample_dataset(dataset, n_samples: int, seed: int = 42):
43 |     """
44 |     Samples a dataset randomly if it has more than `n_samples`.
45 | 
46 |     :param dataset: Dataset to sample from.
47 |     :param n_samples: Number of samples to retain.
48 |     :param seed: Random seed for reproducibility.
49 |     :return: Sampled dataset.
50 |     """
51 |     total_samples = len(dataset)
52 | 
53 |     if total_samples <= n_samples:
54 |         return dataset
55 | 
56 |     random.seed(seed)
57 |     random_indices = random.sample(range(total_samples), n_samples)
58 |     return_dataset = dataset.select(random_indices)
59 |     logger.success(f"Sampled dataset down to {len(return_dataset)} samples")
60 | 
61 |     return return_dataset
62 | 
63 | 
64 | def save_dataset(data, file_path: str):
65 |     """
66 |     Saves a dataset in JSON or JSONL format based on the file extension.
67 | 
68 |     :param data: Dataset to save.
69 |     :param file_path: Path where the dataset should be saved.
70 |     """
71 |     os.makedirs(os.path.dirname(file_path), exist_ok=True)
72 | 
73 |     # Convert to a list of dictionaries
74 |     data = data.to_list()
75 | 
76 |     if file_path.endswith(".jsonl"):
77 |         # Save as JSONL (one JSON object per line)
78 |         with open(file_path, "w", encoding="utf-8") as f:
79 |             for entry in data:
80 |                 f.write(json.dumps(entry, ensure_ascii=False) + "\n")
81 |         logger.info(f"Saved dataset to {file_path} in JSONL format")
82 |     elif file_path.endswith(".json"):
83 |         # Save as a JSON array
84 |         with open(file_path, "w", encoding="utf-8") as f:
85 |             json.dump(data, f, ensure_ascii=False, indent=4)
86 |         logger.info(f"Saved dataset to {file_path} in JSON format")
87 |     else:
88 |         logger.error("Unsupported file extension. Use '.json' or '.jsonl'.")
89 |         raise ValueError("Unsupported file extension. Use '.json' or '.jsonl'.")
90 | 
91 | 
92 | def load_json_files(folder_path: str) -> pd.DataFrame:
93 |     """
94 |     Loads JSON files from a folder into a Pandas DataFrame.
95 | 
96 |     :param folder_path: Path to the folder containing JSON files.
97 |     :return: DataFrame containing all loaded data.
98 |     """
99 |     all_data = []
100 |     folder = Path(folder_path)
101 | 
102 |     # Iterate over JSON files one directory level down (e.g. results/<task>/<hw_type>/*.json)
103 |     for file_path in folder.glob("*/*.json"):
104 |         try:
105 |             with file_path.open("r", encoding="utf-8") as f:
106 |                 data = json.load(f)
107 | 
108 |             # If data is a list of dicts, extend it
109 |             if isinstance(data, list):
110 |                 all_data.extend(data)
111 |             # If data is a single dict, append it as a row
112 |             elif isinstance(data, dict):
113 |                 all_data.append(data)
114 |             else:
115 |                 logger.warning(f"Skipping {file_path.name}: Unexpected format")
116 |         except json.JSONDecodeError:
117 |             logger.error(f"Skipping {file_path.name}: Invalid JSON")
118 | 
119 |     logger.info(f"Loaded {len(all_data)} entries from {folder_path}")
120 |     return pd.DataFrame(all_data)
121 | 
122 | 
123 | def pil_to_base64(image: Image.Image, format: str = "PNG", modality: str = "image") -> str:
124 |     """
125 |     Converts a PIL image to a base64-encoded data URI.
126 | 
127 |     :param image: PIL Image object
128 |     :param format: Image format (e.g., "PNG", "JPEG")
129 |     :param modality: MIME type category (default: "image")
130 |     :return: Base64-encoded data URI
131 |     """
132 |     buffered = BytesIO()
133 |     image.save(buffered, format=format)
134 |     base64_encoded = base64.b64encode(buffered.getvalue()).decode("utf-8")
135 |     mimetype = f"{modality}/{format.lower()}"
136 |     return f"data:{mimetype};base64,{base64_encoded}"
137 | 
--------------------------------------------------------------------------------
/templates/classification-analysis.js.j2:
--------------------------------------------------------------------------------
1 | import {check, fail} from 'k6';
2 | import http from 'k6/http';
3 | import {scenario} from 'k6/execution';
4 | import {Trend, Counter} from 'k6/metrics';
5 | import {textSummary} from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
6 | 
7 | const host = "{{ host }}";
8 | const apiKey = __ENV.HF_TOKEN;
9 | const dataset = JSON.parse(open("{{ data_file }}"));
10 | const filePath = `{{ results_file }}`;
11 | const hwType = "{{ hw_type }}";
12 | const batchSize = {{ batch_size }};
13 | const image = "{{ image }}";
14 | const engine = "{{ engine }}";
15 | const vendor = "{{ vendor }}";
16 | 
17 | // Metrics definition
18 | const responseLatency = new Trend('response_latency', true);
19 | const requestThroughput = new Counter('request_throughput');
20 | const accuracyCount = new Counter('accuracy_count');
21 | const totalRequests = new Trend('total_requests', true);
22 | 
23 | 
24 | export function generatePayload(example) {
25 |     return {
26 |         "input": [example.{{ text_column }}]
27 |     };
28 | }
29 | 
30 | export const options = {
31 |     scenarios: {
32 |         shared_load_test: {
33 |             executor: 'shared-iterations',
34 |             vus: {{ pre_allocated_vus }}, // Number of VUs
35 |             iterations: {{ total_requests }}, // Total number of requests
36 |             maxDuration: '{{ duration }}', // Time limit
37 |         },
38 |     },
39 | };
40 | 
41 | let localRequestCount = 0; // Track within each VU
42 | 
43 | export default function run() {
44 |     const headers = {
45 |         Accept: "application/json",
46 |         Authorization: "Bearer " + apiKey,
47 |         "Content-Type": "application/json",
48 |     };
49 |     const query = dataset[scenario.iterationInTest % dataset.length];
50 |     const payload = JSON.stringify(generatePayload(query));
51 |     const url = `${host}/classify`;
52 |     const params = {
53 |         method: 'POST',
54 |         body: payload,
55 |         headers,
56 |     };
57 | 
58 |     // Sending the request
59 |     const startTime = Date.now();
60 |     const res = http.post(url, payload, params);
61 |     const endTime = Date.now();
62 |     const deltaMs = endTime - startTime; // Duration in milliseconds
63 | 
64 |     // Add response latency
65 |     responseLatency.add(deltaMs);
66 |     localRequestCount += 1;
67 |     requestThroughput.add(1); // Still use k6 counter
68 | 
69 |     // Check the response
70 |     const passedCheck = check(res, {
71 |         'http_200': (r) => r.status === 200,
72 |     });
73 | 
74 |     if (!passedCheck) {
75 |         fail('Request failed with status ' + res.status);
76 |         return;
77 |     }
78 | 
79 |     // Process the response
80 |     let tokenCount = 0;
81 |     let isCorrectClassification = false;
82 |     try {
83 |         const data = JSON.parse(res.body);
84 |         if (data.object === 'classify' && data.data && data.data[0] && data.data[0].length > 0) {
85 |             const predictedLabel = data.data[0][0].label;
86 |             if (query.hasOwnProperty('label_text')) {
87 |                 if (predictedLabel === query.label_text) {
88 |                     accuracyCount.add(1);
89 |                     isCorrectClassification = true;
90 |                 }
91 |             }
92 | 
93 |             // Add token count to tokens throughput
94 |             if (data.usage && data.usage.total_tokens) {
95 |                 tokenCount = data.usage.total_tokens;
96 |             }
97 |         }
98 |     } catch (e) {
99 |         fail('Failed to parse response body: ' + e);
100 |     }
101 | 
102 |     // Record this VU's running request count (k6 Counter values can't be read back here)
103 |     totalRequests.add(localRequestCount);
104 | }
105 | 
106 | // Writing Summary to a JSON File
107 | export function handleSummary(data) {
108 |     // Extract required metrics
109 |     const totalRequests = data.metrics.iterations.values.count; // Total requests
110 |     const failedRequests = data.metrics.http_req_failed.values.count || 0;
111 |     const successfulRequests = totalRequests - failedRequests;
112 |     const avgLatency = data.metrics.response_latency.values.avg; // Average latency in ms
113 |     const p95Latency = data.metrics.response_latency.values['p(95)']; // 95th percentile latency in ms
114 |     const accuracyPercentage = (data.metrics.accuracy_count.values.count / totalRequests) * 100; // Accuracy %
115 |     const testDuration = data.state.testRunDurationMs / 1000; // Convert from ms to seconds
116 |     const throughput = data.metrics.iterations.values.rate; // Requests per second
117 | 
118 |     // Construct the summary object with units in keys
119 |     const summary = {
120 |         total_requests: totalRequests,
121 |         test_duration_sec: testDuration, // Seconds
122 |         successful_requests: successfulRequests,
123 |         avg_latency_ms: avgLatency, // Milliseconds
124 |         p95_latency_ms: p95Latency, // Milliseconds
125 |         accuracy_percentage: accuracyPercentage, // Percentage
126 |         throughput_req_per_sec: throughput, // Requests per second
127 |         hw_type: hwType,
128 |         batch_size: batchSize,
129 |         image: image,
130 |         engine: engine,
131 |         vendor: vendor,
132 |         vus: {{ pre_allocated_vus }}
133 |     };
134 | 
135 |     // Write the summary to a JSON file
136 |     return {
137 |         [filePath]: JSON.stringify(summary, null, 2), // Save JSON output
138 |         stdout: textSummary(data, { indent: ' ', enableColors: true }), // Show summary in console
139 |     };
140 | }
141 | 
--------------------------------------------------------------------------------
/templates/embedding-analysis.js.j2:
--------------------------------------------------------------------------------
1 | import {check, fail} from 'k6';
2 | import http from 'k6/http';
3 | import {scenario} from 'k6/execution';
4 | import {Trend, Counter} from 'k6/metrics';
5 | import {textSummary} from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
6 | 
7 | const host = "{{ host }}";
8 | const apiKey = __ENV.HF_TOKEN;
9 | const dataset = JSON.parse(open("{{ data_file }}"));
10 | const filePath = `{{ results_file }}`;
11 | const hwType = "{{ hw_type }}";
12 | const batchSize = {{ batch_size }};
13 | const image = "{{ image }}";
14 | const engine = "{{ engine }}";
15 | const vendor = "{{ vendor }}";
16 | 
17 | // Metrics definition
18 | const responseLatency = new Trend('response_latency', true);
19 | const requestThroughput = new Counter('request_throughput');
20 | const totalRequests = new Trend('total_requests', true);
21 | const embeddingSize = new Trend('embedding_size', true);
22 | 
23 | export function generatePayload(example) {
24 |     return {
25 |         "input": [example.{{ text_column }}]
26 |     };
27 | }
28 | 
29 | export const options = {
30 |     scenarios: {
31 |         shared_load_test: {
32 |             executor: 'shared-iterations',
33 |             vus: {{ pre_allocated_vus }}, // Number of VUs
34 |             iterations: {{ total_requests }}, // Total number of requests
35 |             maxDuration: '{{ duration }}', // Time limit
36 |         },
37 |     },
38 | };
39 | 
40 | let localRequestCount = 0; // Track within each VU
41 | 
42 | export default function run() {
43 |     const headers = {
44 |         Accept: "application/json",
45 |         Authorization: "Bearer " + apiKey,
46 |         "Content-Type": "application/json",
47 |     };
48 |     const query = dataset[scenario.iterationInTest % dataset.length];
49 |     const payload = JSON.stringify(generatePayload(query));
50 |     const url = `${host}/embeddings`;
51 |     const params = {
52 |         method: 'POST',
53 |         body: payload,
54 |         headers,
55 |     };
56 | 
57 |     // Sending the request
58 |     const startTime = Date.now();
59 |     const res = http.post(url, payload, params);
60 |     const endTime = Date.now();
61 |     const deltaMs = endTime - startTime; // Duration in milliseconds
62 | 
63 |     // Add response latency
64 |     responseLatency.add(deltaMs);
65 |     localRequestCount += 1;
66 |     requestThroughput.add(1); // Still use k6 counter
67 | 
68 |     // Check the response
69 |     const passedCheck = check(res, {
70 |         'http_200': (r) => r.status === 200,
71 |         'valid_embedding': (r) => {
72 |             try {
73 |                 const data = JSON.parse(r.body);
74 |                 return data.object === 'list' && data.data && data.data.length > 0 && Array.isArray(data.data[0].embedding);
75 |             } catch (e) {
76 |                 return false;
77 |             }
78 |         }
79 |     });
80 | 
81 |     if (!passedCheck) {
82 |         fail('Request failed with status ' + res.status + ' or invalid embedding format');
83 |         return;
84 |     }
85 | 
86 |     // Process the response
87 |     try {
88 |         const data = JSON.parse(res.body);
89 |         if (data.object === 'list' && data.data && data.data[0] && data.data[0].embedding) {
90 |             embeddingSize.add(data.data[0].embedding.length);
91 |         }
92 |     } catch (e) {
93 |         fail('Failed to parse response body: ' + e);
94 |     }
95 | 
96 |     // Add custom metrics
97 |     totalRequests.add(localRequestCount);
98 | }
99 | 
100 | // Writing Summary to a JSON File
101 | export function handleSummary(data) {
102 |     // Extract required metrics
103 |     const totalRequests = data.metrics.iterations.values.count; // Total requests
104 |     const failedRequests = data.metrics.http_req_failed.values.count || 0;
105 |     const successfulRequests = totalRequests - failedRequests;
106 |     const avgLatency = data.metrics.response_latency.values.avg; // Average latency in ms
107 |     const p95Latency = data.metrics.response_latency.values['p(95)']; // 95th percentile latency in ms
108 |     const testDuration = data.state.testRunDurationMs / 1000; // Convert from ms to seconds
109 |     const throughput = data.metrics.iterations.values.rate; // Requests per second
110 |     const avgEmbeddingSize = data.metrics.embedding_size.values.avg || 0; // Average embedding size
111 | 
112 |     // Construct the summary object with units in keys
113 |     const summary = {
114 |         total_requests: totalRequests,
115 |         test_duration_sec: testDuration, // Seconds
116 |         successful_requests: successfulRequests,
117 |         avg_latency_ms: avgLatency, // Milliseconds
118 |         p95_latency_ms: p95Latency, // Milliseconds
119 |         throughput_req_per_sec: throughput, // Requests per second
120 |         avg_embedding_size: avgEmbeddingSize, // Average embedding size
121 |         hw_type: hwType,
122 |         batch_size: batchSize,
123 |         image: image,
124 |         engine: engine,
125 |         vendor: vendor,
126 |         vus: {{ pre_allocated_vus }}
127 |     };
128 | 
129 |     // Write the summary to a JSON file
130 |     return {
131 |         [filePath]: JSON.stringify(summary, null, 2), // Save JSON output
132 |         stdout: textSummary(data, { indent: ' ', enableColors: true }), // Show summary in console
133 |     };
134 | }
135 | 
--------------------------------------------------------------------------------
/templates/vision-embedding-analysis.js.j2:
--------------------------------------------------------------------------------
1 | import {check, fail} from 'k6';
2 | import http from 'k6/http';
3 | import {scenario} from 'k6/execution';
4 | import {Trend, Counter} from 'k6/metrics';
5 | import {textSummary} from 'https://jslib.k6.io/k6-summary/0.0.2/index.js';
6 | import { SharedArray } from 'k6/data';
7 | 
8 | const host = "{{ host }}";
9 | const apiKey = __ENV.HF_TOKEN;
10 | 
11 | // Load dataset once using SharedArray to avoid copies across VUs
12 | const dataset = new SharedArray("dataset", function () {
13 |     return open("{{ data_file }}")
14 |         .split('\n')
15 |         .filter(line => line.trim() !== "") // Remove empty lines
16 |         .map(line => {
17 |             try {
18 |                 return JSON.parse(line);
19 |             } catch (e) {
20 |                 console.log(`Skipping invalid JSON line: ${line}`);
21 |                 return null;
22 |             }
23 |         })
24 |         .filter(entry => entry !== null); // Remove null values from failed JSON parsing
25 | });
26 | 
27 | const filePath = `{{ results_file }}`;
28 | const hwType = "{{ hw_type }}";
29 | const batchSize = {{ batch_size }};
30 | const image = "{{ image }}";
31 | const engine = "{{ engine }}";
32 | const vendor = "{{ vendor }}";
33 | 
34 | // Metrics definition
35 | const responseLatency = new Trend('response_latency', true);
36 | const requestThroughput = new Counter('request_throughput');
37 | const numVectorsTrend = new Trend('num_vectors', true);
38 | const invalidEmbeddingsCounter = new Counter('invalid_embeddings');
39 | 
40 | export function generatePayload(example) {
41 |     return {
42 |         "input": [example.{{ text_column }}_b64], // I just kept the same naming, sorry :'(
43 |         "encoding_format": "float",
44 |         "modality": "image"
45 |     };
46 | }
47 | 
48 | export const options = {
49 |     scenarios: {
50 |         shared_load_test: {
51 |             executor: 'shared-iterations',
52 |             vus: {{ pre_allocated_vus }}, // Number of VUs
53 |             iterations: {{ total_requests }}, // Total number of requests
54 |             maxDuration: '{{ duration }}', // Time limit
55 |         },
56 |     },
57 | };
58 | 
59 | export default function run() {
60 |     const headers = {
61 |         Accept: "application/json",
62 |         Authorization: "Bearer " + apiKey,
63 |         "Content-Type": "application/json",
64 |     };
65 | 
66 |     const query = dataset[scenario.iterationInTest % dataset.length];
67 |     const payload = JSON.stringify(generatePayload(query));
68 |     const url = `${host}/embeddings`;
69 | 
70 |     const startTime = Date.now();
71 |     const res = http.post(url, payload, { headers });
72 |     const endTime = Date.now();
73 |     responseLatency.add(endTime - startTime);
74 |     requestThroughput.add(1);
75 | 
76 |     // Check the response
77 |     const passedCheck = check(res, {
78 |         'http_200': (r) => r.status === 200,
79 |         'valid_embedding': (r) => {
80 |             try {
81 |                 const data = JSON.parse(r.body);
82 |                 const isValid = data.object === 'list' && data.data && data.data.length > 0 &&
83 |                     Array.isArray(data.data[0].embedding) &&
84 |                     data.data[0].embedding.every(vec => Array.isArray(vec) && vec.length === 128);
85 |                 if (!isValid) invalidEmbeddingsCounter.add(1);
86 |                 return isValid;
87 |             } catch (e) {
88 |                 return false;
89 |             }
90 |         }
91 |     });
92 | 
93 |     if (!passedCheck) {
94 |         fail('Request failed with status ' + res.status + ' or invalid embedding format');
95 |         return;
96 |     }
97 | 
98 |     // Process the response
99 |     try {
100 |         const data = JSON.parse(res.body);
101 |         if (data.object === 'list' && Array.isArray(data.data) && data.data.length === 1) {
102 |             let numVectors = data.data[0].embedding.length;
103 |             numVectorsTrend.add(numVectors);
104 |         } else {
105 |             fail("Response format incorrect or missing required fields.");
106 |         }
107 |     } catch (e) {
108 |         fail("Failed to parse response body: " + e);
109 |     }
110 | }
111 | 
112 | // Write the summary to a JSON file
113 | export function handleSummary(data) {
114 |     const invalidEmbeddings = data.metrics.invalid_embeddings ? data.metrics.invalid_embeddings.values.count : 0;
115 | 
116 |     const summary = {
117 |         total_requests: data.metrics.iterations.values.count,
118 |         test_duration_sec: data.state.testRunDurationMs / 1000,
119 |         successful_requests: data.metrics.iterations.values.count - (data.metrics.http_req_failed.values.count || 0),
120 |         avg_latency_ms: data.metrics.response_latency.values.avg,
121 |         p95_latency_ms: data.metrics.response_latency.values['p(95)'],
122 |         throughput_req_per_sec: data.metrics.iterations.values.rate,
123 |         avg_num_vectors: data.metrics.num_vectors.values.avg || 0,
124 |         min_num_vectors: data.metrics.num_vectors.values.min || 0,
125 |         max_num_vectors: data.metrics.num_vectors.values.max || 0,
126 |         invalid_embeddings: invalidEmbeddings,
127 |         hw_type: hwType,
128 |         batch_size: batchSize,
129 |         image: image,
130 |         engine: engine,
131 |         vendor: vendor,
132 |         vus: {{ pre_allocated_vus }}
133 |     };
134 | 
135 |     return {
136 |         [filePath]: JSON.stringify(summary, null, 2), // Save JSON output
137 |         stdout: textSummary(data, { indent: ' ', enableColors: true }), // Show summary in console
138 |     };
139 | }
140 | 
--------------------------------------------------------------------------------
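
The `{{ text_column }}_b64` field the vision template reads is produced from the raw dataset with the helpers in `src/process_dataset.py`. A minimal sketch of that prep step — the file path and sample count are illustrative, and the column names follow the README's dataset choice (`image` → `image_b64`):

```python
from datasets import load_dataset

from src.process_dataset import pil_to_base64, sample_dataset, save_dataset

# Build the JSONL the vision template consumes: one JSON object per line,
# with each image encoded as a base64 data URI under "image_b64".
ds = load_dataset("openbmb/RLAIF-V-Dataset", split="train")
ds = sample_dataset(ds, n_samples=1_000)
ds = ds.map(
    lambda ex: {"image_b64": pil_to_base64(ex["image"], format="JPEG")},
    remove_columns=["image"],  # drop the PIL object so every row stays JSON-serializable
)
save_dataset(ds, "data/vision-embedding.jsonl")
```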