├── requirements.txt
├── .gitignore
├── userdef.py
├── README.md
├── openllm_llama2_400_prompt.py
├── openllm_llama2_20_prompt.py
├── benchmark.py
└── common.py

/requirements.txt:
--------------------------------------------------------------------------------
aiohttp==3.9.5
transformers==4.41.2
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
__pycache__
*.pyc
databricks-dolly-15k.jsonl
--------------------------------------------------------------------------------
/userdef.py:
--------------------------------------------------------------------------------
import asyncio


class UserDef:
    """Base user definition; concrete benchmarks override make_request() and parse_response()."""

    BASE_URL = ""

    @classmethod
    def ping_url(cls):
        return f"{cls.BASE_URL}/healthz"

    @staticmethod
    async def rest():
        # short pause between consecutive requests from the same simulated user
        await asyncio.sleep(0.01)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# BentoCloud Benchmark Client

## Usage

### 1. Set up environment variables

Make sure you are logged in to Hugging Face (the benchmark downloads gated Llama tokenizers for token counting):

```bash
huggingface-cli login
```

Set the environment variables used for benchmarking:

```bash
export BASE_URL=
export SYSTEM_PROMPT=1  # 1 or 0
```

`benchmark.py` also honors two optional variables: `MAX_TOKENS` (generation cap per request, default 512) and `SYSTEM_PROMPT_FILE` (path to a custom system prompt, used when `SYSTEM_PROMPT=1`).

### 2. Run the benchmark

```bash
python benchmark.py --max_users 10 --session_time 300 --ping_correction
```

- `max_users` is the maximum number of concurrent users to spawn
- `session_time` is the duration of the benchmark session, in seconds
- `ping_correction` is a flag that determines whether ping latency should be deducted from the reported metrics
--------------------------------------------------------------------------------
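Before kicking off a long session, it can help to sanity-check `BASE_URL` with a single hand-rolled request against the same endpoints `benchmark.py` uses (`GET /healthz` for readiness, `POST /generate` with a `prompt`/`system_prompt`/`max_tokens` JSON body). The snippet below is a minimal sketch for that purpose and not a file shipped in this repo; it uses `requests`, which is installed transitively with `transformers`.

```python
# smoke_test.py -- hypothetical helper; mirrors the request shape of benchmark.py
import json
import os

import requests

base_url = os.environ.get("BASE_URL", "http://localhost:3000")

# readiness probe, same endpoint UserDef.ping_url() targets
assert requests.get(f"{base_url}/healthz", timeout=10).status_code == 200

# one generation request with the same body layout as UserDef.make_request()
resp = requests.post(
    f"{base_url}/generate",
    headers={"Content-Type": "application/json"},
    data=json.dumps({"prompt": "Hello", "system_prompt": "", "max_tokens": 32}),
    timeout=60,
)
print(resp.status_code, resp.text[:200])
```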
/openllm_llama2_400_prompt.py:
--------------------------------------------------------------------------------
import asyncio
from openllm_llama2_20_prompt import UserDef as BaseUserDef
from common import get_prompt_set, start_benchmark_session


class UserDef(BaseUserDef):
    @classmethod
    def make_request(cls):
        import openllm
        import json
        import random

        # longer prompts: 300-500 input tokens
        prompt = random.choice(get_prompt_set(300, 500))

        headers = {"accept": "application/json", "Content-Type": "application/json"}
        config = (
            openllm.AutoConfig.for_model("llama")
            .model_construct_env(max_new_tokens=200, top_p=0.21)
            .model_dump()
        )
        data = {"prompt": prompt, "llm_config": config, "adapter_name": None}
        url = f"{cls.BASE_URL}/v1/generate_stream"
        return url, headers, json.dumps(data)


if __name__ == "__main__":
    import argparse

    # start_benchmark_session expects parsed CLI args (see benchmark.py);
    # defaults of None keep the original no-flag invocation working
    parser = argparse.ArgumentParser(description="Benchmark")
    parser.add_argument("--max_users", type=int, default=None)
    parser.add_argument("--session_time", type=float, default=None)
    parser.add_argument("--ping_correction", action="store_true")
    asyncio.run(start_benchmark_session(parser.parse_args(), UserDef))
--------------------------------------------------------------------------------
/openllm_llama2_20_prompt.py:
--------------------------------------------------------------------------------
from common import start_benchmark_session, get_tokenizer, get_prompt_set
import asyncio


class UserDef:
    # BASE_URL = "http://llama27bchat-org-ss-org-1--aws-us-east-1.mt2.bentoml.ai"
    # BASE_URL = "http://llama2-7b-org-ss-org-1--aws-us-east-1.mt2.bentoml.ai"
    # BASE_URL = "http://llama2-13b-org-ss-org-1--aws-us-east-1.mt2.bentoml.ai"
    # BASE_URL = "http://184.105.5.107:3000"
    BASE_URL = "http://184.105.217.197:3000"  # Aaron's machine

    @classmethod
    def ping_url(cls):
        return f"{cls.BASE_URL}/healthz"

    @classmethod
    def make_request(cls):
        """
        return url, headers, body
        """
        import openllm
        import json
        import random

        # short prompts: 15-25 input tokens
        prompt = random.choice(get_prompt_set(15, 25))

        headers = {"accept": "application/json", "Content-Type": "application/json"}
        config = (
            openllm.AutoConfig.for_model("llama")
            .model_construct_env(max_new_tokens=20, top_p=0.21)
            .model_dump()
        )
        data = {"prompt": prompt, "llm_config": config, "adapter_name": None}
        url = f"{cls.BASE_URL}/v1/generate_stream"
        return url, headers, json.dumps(data)

    @classmethod
    def parse_response(cls, chunk):
        """
        take chunk and return list of tokens, used for token counting
        """
        text = chunk.decode("utf-8").strip()
        return get_tokenizer()(text)

    @staticmethod
    async def rest():
        await asyncio.sleep(0.01)


if __name__ == "__main__":
    import argparse

    # start_benchmark_session expects parsed CLI args (see benchmark.py);
    # defaults of None keep the original no-flag invocation working
    parser = argparse.ArgumentParser(description="Benchmark")
    parser.add_argument("--max_users", type=int, default=None)
    parser.add_argument("--session_time", type=float, default=None)
    parser.add_argument("--ping_correction", action="store_true")
    asyncio.run(start_benchmark_session(parser.parse_args(), UserDef))
--------------------------------------------------------------------------------
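The harness in `common.py` only needs four hooks from a user definition: `ping_url()`, a `make_request()` that returns `(url, headers, body)`, a `parse_response(chunk)` that returns a list of tokens for counting, and an async `rest()` pause between requests. Pointing the client at a different backend is therefore mostly a matter of writing another small class like the ones above. The sketch below only illustrates the shape; the `/v1/completions` route and payload fields are assumptions for illustration, not endpoints this repo ships.

```python
# custom_backend.py -- hypothetical sketch of the interface common.py expects
import asyncio
import json
import os
import random

from common import get_prompt_set, get_tokenizer


class UserDef:
    BASE_URL = os.environ.get("BASE_URL", "http://localhost:3000")

    @classmethod
    def ping_url(cls):
        return f"{cls.BASE_URL}/healthz"

    @classmethod
    def make_request(cls):
        prompt = random.choice(get_prompt_set(30, 150))
        headers = {"Content-Type": "application/json"}
        body = json.dumps({"prompt": prompt, "max_tokens": 64})  # assumed payload shape
        return f"{cls.BASE_URL}/v1/completions", headers, body   # assumed route

    @classmethod
    def parse_response(cls, chunk: bytes):
        # token count per streamed chunk, using the shared Llama tokenizer
        return get_tokenizer()(chunk.decode("utf-8").strip())

    @staticmethod
    async def rest():
        await asyncio.sleep(0.01)
```

A class like this plugs into `start_benchmark_session` exactly the way the scripts above do.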
/benchmark.py:
--------------------------------------------------------------------------------
import argparse
import functools
import os

from transformers import AutoTokenizer

from userdef import UserDef as BaseUserDef

try:
    max_tokens = int(os.environ.get("MAX_TOKENS"))
except (TypeError, ValueError):
    max_tokens = 512

print(f"max_tokens set to {max_tokens}")

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")

default_system_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

if os.environ.get("SYSTEM_PROMPT") == "1":
    system_prompt = default_system_prompt
    system_prompt_file = os.environ.get("SYSTEM_PROMPT_FILE")
    if system_prompt_file is not None:
        with open(system_prompt_file) as f:
            system_prompt = f.read().strip()
else:
    system_prompt = ""

base_url = os.environ.get("BASE_URL", "http://localhost:3000")


@functools.lru_cache(maxsize=8)
def get_prompt_set(min_input_length=0, max_input_length=500):
    """
    return a list of prompts with length between min_input_length and max_input_length
    """
    import json
    import requests
    import os

    # check if the dataset is cached
    if os.path.exists("databricks-dolly-15k.jsonl"):
        print("Loading cached dataset")
        with open("databricks-dolly-15k.jsonl", "r") as f:
            dataset = [json.loads(line) for line in f.readlines()]
    else:
        print("Downloading dataset")
        raw_dataset = requests.get(
            "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl"
        )
        content = raw_dataset.content
        open("databricks-dolly-15k.jsonl", "wb").write(content)
        dataset = [json.loads(line) for line in content.decode().split("\n") if line.strip()]
        print("Dataset downloaded")

    for d in dataset:
        d["question"] = d["context"] + d["instruction"]
        d["input_tokens"] = len(tokenizer(d["question"])["input_ids"])
        d["output_tokens"] = len(tokenizer(d["response"])["input_ids"])
    return [
        d["question"]
        for d in dataset
        if min_input_length <= d["input_tokens"] <= max_input_length
    ]


prompts = get_prompt_set(30, 150)


class UserDef(BaseUserDef):
    BASE_URL = base_url
    PROMPTS = prompts

    @classmethod
    def make_request(cls):
        import json
        import random

        prompt = random.choice(cls.PROMPTS)
        headers = {"Content-Type": "application/json"}
        url = f"{cls.BASE_URL}/generate"
        data = {
            "prompt": prompt,
            "system_prompt": system_prompt,  # this is important because there's a default system prompt
            "max_tokens": max_tokens,
        }
        return url, headers, json.dumps(data)

    @staticmethod
    def parse_response(chunk: bytes):
        text = chunk.decode("utf-8").strip()
        return tokenizer.encode(text, add_special_tokens=False)


if __name__ == "__main__":
    import asyncio
    from common import start_benchmark_session

    # arg parsing
    parser = argparse.ArgumentParser(description="Benchmark")
    parser.add_argument("--max_users", type=int, required=True)
    parser.add_argument("--session_time", type=float, default=None)
    parser.add_argument("--ping_correction", action="store_true")
    args = parser.parse_args()

    asyncio.run(start_benchmark_session(args, UserDef))
--------------------------------------------------------------------------------
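Because the client only ever touches `GET /healthz` and `POST /generate`, a throwaway local target is enough to dry-run the harness itself (spawning, bucketing, reporting) without a GPU backend. The stub below is a hypothetical stand-in, not part of the repo and not a substitute for a real OpenLLM/BentoML deployment; it reuses `aiohttp`, which is already in `requirements.txt`.

```python
# stub_server.py -- hypothetical local target for a dry run of the client
from aiohttp import web


async def healthz(request: web.Request) -> web.Response:
    return web.Response(text="ok")


async def generate(request: web.Request) -> web.Response:
    # benchmark.py sends {"prompt", "system_prompt", "max_tokens"}
    payload = await request.json()
    return web.Response(text=f"echo: {payload['prompt'][:32]}")


app = web.Application()
app.add_routes([web.get("/healthz", healthz), web.post("/generate", generate)])

if __name__ == "__main__":
    # then: export BASE_URL=http://localhost:3000
    web.run_app(app, port=3000)
```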
/common.py:
--------------------------------------------------------------------------------
from asyncio.tasks import Task
import argparse
import asyncio
import numpy as np
import aiohttp
import time
import collections
import contextlib
import math
import functools


class MetricsCollector:
    def __init__(self, user_def, session_time=None, ping_latency=0.0):
        self.start_time = math.floor(time.time())
        self.response_word_bucket = collections.defaultdict(int)
        self.response_head_latency_bucket = collections.defaultdict(list)
        self.response_latency_bucket = collections.defaultdict(list)
        self.on_going_requests = 0
        self.response_bucket = collections.defaultdict(int)
        self.total_requests = 0
        self.on_going_users = 0
        self.status_bucket = collections.defaultdict(int)
        self.user_def = user_def
        self.session_time = session_time
        self.ping_latency = ping_latency

    def collect_response_chunk(self, chunk: list):
        self.response_word_bucket[math.floor(time.time())] += len(chunk)

    def collect_response_status(self, status):
        self.status_bucket[status] += 1

    def collect_response_head_latency(self, latency):
        self.response_head_latency_bucket[math.floor(time.time())] += [
            latency - self.ping_latency
        ]

    @contextlib.contextmanager
    def collect_http_request(self):
        start_time = time.time()
        self.on_going_requests += 1
        yield
        self.on_going_requests -= 1
        self.response_bucket[math.floor(time.time())] += 1
        self.response_latency_bucket[math.floor(time.time())] += [
            time.time() - start_time - self.ping_latency
        ]

    @contextlib.contextmanager
    def collect_user(self):
        self.on_going_users += 1
        yield
        self.on_going_users -= 1

    async def report_loop(self, time_window=5):
        """
        Each bucket is in 1s. This function will report the avg metrics in the past time_window seconds.
        """
        while True:
            await asyncio.sleep(time_window)
            now = math.floor(time.time())
            print(f"Time: {now - self.start_time}")
            print(f"Active Users: {self.on_going_users}")
            print(
                f"Request/s: {sum(self.response_bucket[i] for i in range(now - time_window, now)) / time_window}"
            )
            print(f"Total Requests: {self.total_requests}")
            print(f"Active Requests: {self.on_going_requests}")
            latency_bucket = [
                j
                for i in range(now - time_window, now)
                for j in self.response_head_latency_bucket[i]
            ]
            if latency_bucket:
                print(f"Response Head Latency: {np.mean(latency_bucket)}")
            latency_bucket = [
                j
                for i in range(now - time_window, now)
                for j in self.response_latency_bucket[i]
            ]
            if latency_bucket:
                print(f"Response Latency: {np.mean(latency_bucket)}")
            print(
                f"Response Tokens/s: {sum(self.response_word_bucket[i] for i in range(now - time_window, now)) / time_window}"
            )
            print(f"Status: {self.status_bucket}")
            print()

            if self.session_time and now - self.start_time >= self.session_time:
                self.report_final()
                break

    def report_final(self):
        print("=================== Final Report ====================")
        print(f"Total Requests: {self.total_requests}")
        print(
            f"Average Request/s: {self.total_requests / (time.time() - self.start_time)}"
        )

        head_latency_size = sum(len(i) for i in self.response_head_latency_bucket.values())
        if head_latency_size:
            head_latencies = [j for i in self.response_head_latency_bucket.values() for j in i]

            print(
                f"Average Response Head Latency: {sum(head_latencies) / head_latency_size}"
            )
            print(
                f"Median Response Head Latency: {np.percentile(head_latencies, 50)}"
            )
            print(
                f"95% Response Head Latency: {np.percentile(head_latencies, 95)}"
            )
            print(
                f"99% Response Head Latency: {np.percentile(head_latencies, 99)}"
            )

        latency_size = sum(len(i) for i in self.response_latency_bucket.values())
        if latency_size:
            latencies = [j for i in self.response_latency_bucket.values() for j in i]
            print(
                f"Average Response Latency: {sum(latencies) / latency_size}"
            )
            print(
                f"Median Response Latency: {np.percentile(latencies, 50)}"
            )
            print(
                f"95% Response Latency: {np.percentile(latencies, 95)}"
            )
            print(
                f"99% Response Latency: {np.percentile(latencies, 99)}"
            )

        print(
            f"Average Response Tokens/s: {sum(self.response_word_bucket.values()) / (time.time() - self.start_time)}"
        )
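# Note on the collector above: every bucket is keyed by the wall-clock second
# (math.floor(time.time())), so report_loop() averages only the most recent
# `time_window` one-second buckets, while report_final() aggregates every bucket
# recorded since start_time. When --ping_correction is enabled, the measured
# ping latency (minus a 5 ms allowance, see start_benchmark_session) is
# subtracted from each head-latency and total-latency sample.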
def linear_regression(x, y):
    x = tuple((i, 1) for i in x)
    y = tuple(i for i in y)
    a, b = np.linalg.lstsq(x, y, rcond=None)[0]
    return a, b


class UserSpawner:
    def __init__(
        self,
        user_def,
        collector: MetricsCollector,
        target_user_count=None,
        target_time=None,
    ):
        self.target_user_count = 1 if target_user_count is None else target_user_count
        self.target_time = time.time() + 10 if target_time is None else target_time

        self.data_collector = collector
        self.user_def = user_def

        self.user_list: list[Task] = []

    async def sync(self):
        while True:
            if self.current_user_count == self.target_user_count:
                return
            await asyncio.sleep(0.1)

    @property
    def current_user_count(self):
        return len(self.user_list)

    async def user_loop(self):
        with self.data_collector.collect_user():
            cookie_jar = aiohttp.DummyCookieJar()
            try:
                async with aiohttp.ClientSession(cookie_jar=cookie_jar) as session:
                    while True:
                        url, headers, data = self.user_def.make_request()
                        self.data_collector.total_requests += 1
                        with self.data_collector.collect_http_request():
                            req_start = time.time()
                            async with session.post(
                                url,
                                headers=headers,
                                data=data,
                            ) as response:
                                self.data_collector.collect_response_status(
                                    response.status
                                )
                                try:
                                    if response.status != 200:
                                        continue

                                    first = True
                                    async for data, end_of_http_chunk in response.content.iter_chunks():
                                        result = self.user_def.parse_response(data)
                                        if first:
                                            first = False
                                            self.data_collector.collect_response_head_latency(
                                                time.time() - req_start
                                            )

                                        self.data_collector.collect_response_chunk(
                                            result
                                        )
                                        if not end_of_http_chunk:
                                            break
                                except Exception as e:
                                    self.data_collector.collect_response_status(str(e))
                                    raise e
                        await self.user_def.rest()
            except asyncio.CancelledError:
                pass

    def spawn_user(self):
        self.user_list.append(asyncio.create_task(self.user_loop()))

    async def cancel_all_users(self):
        # drain the list and cancel every remaining user task
        try:
            while True:
                user = self.user_list.pop()
                user.cancel()
        except IndexError:
            pass
        await asyncio.sleep(0)

    async def spawner_loop(self):
        while True:
            current_users = len(self.user_list)
            if current_users == self.target_user_count:
                await asyncio.sleep(0.1)
            elif current_users < self.target_user_count:
                self.spawn_user()
                sleep_time = max(
                    (self.target_time - time.time())
                    / (self.target_user_count - current_users),
                    0,
                )
                await asyncio.sleep(sleep_time)
            elif current_users > self.target_user_count:
                self.user_list.pop().cancel()
                sleep_time = max(
                    (time.time() - self.target_time)
                    / (current_users - self.target_user_count),
                    0,
                )
                await asyncio.sleep(sleep_time)

    async def aimd_loop(
        self,
        adjust_interval=5,
        sampling_interval=5,
        ss_delta=1,
    ):
        """
        Detect a suitable number of users to maximize the words/s.
        """
        while True:
            while True:
                # slow start
                now = math.floor(time.time())
                words_per_seconds = [
                    self.data_collector.response_word_bucket[i]
                    for i in range(now - sampling_interval, now)
                ]
                slope = linear_regression(
                    range(len(words_per_seconds)), words_per_seconds
                )[0]
                if slope >= -0.01:
                    # throughput is increasing
                    cwnd = self.current_user_count
                    target_cwnd = max(int(cwnd * (1 + ss_delta)), cwnd + 1)
                    self.target_user_count = target_cwnd
                    self.target_time = time.time() + adjust_interval
                    print(f"SS: {cwnd} -> {target_cwnd}")
                    await asyncio.sleep(adjust_interval)
                else:
                    # throughput is decreasing, stop slow start
                    cwnd = self.current_user_count
                    target_cwnd = math.ceil(cwnd * 0.5)
                    self.target_user_count = target_cwnd
                    self.target_time = time.time() + adjust_interval
                    print(f"SS Ended: {target_cwnd}")
                    break

            await self.sync()
            await asyncio.sleep(min(adjust_interval, sampling_interval, 10))
        return 0
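# Note on aimd_loop above: it probes for the saturation point TCP-style. While
# the regression slope of tokens/s over the sampling window stays above -0.01,
# the target user count grows multiplicatively (with ss_delta=1 it doubles:
# 1 -> 2 -> 4 -> 8 ...); once throughput starts falling, the target is cut to
# ceil(n * 0.5) and the loop waits for the spawner to converge before probing
# again. It is only started when --max_users is not supplied (see
# start_benchmark_session below).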
async def start_benchmark_session(args, user_def):
    # ping server
    response_times = []
    async with aiohttp.ClientSession() as session:
        async with session.get(user_def.ping_url()) as response:
            assert response.status == 200
            await asyncio.sleep(0.3)

        for _ in range(5):
            time_start = time.time()
            async with session.get(user_def.ping_url()) as response:
                assert response.status == 200
            response_times.append(time.time() - time_start)
            await asyncio.sleep(0.3)
    ping_latency = sum(response_times) / len(response_times)
    print(f"Ping latency: {ping_latency}. ping correction: {args.ping_correction}")

    # init
    collector = MetricsCollector(
        user_def, args.session_time, ping_latency - 0.005 if args.ping_correction else 0
    )
    user_spawner = UserSpawner(
        user_def, collector, args.max_users, target_time=time.time() + 20
    )
    asyncio.create_task(user_spawner.spawner_loop())
    asyncio.create_task(collector.report_loop())
    if args.max_users is None:
        asyncio.create_task(user_spawner.aimd_loop())

    if args.session_time is not None:
        await asyncio.sleep(args.session_time + 1)
    else:
        await asyncio.wait(user_spawner.user_list)

    await user_spawner.cancel_all_users()
    return 0
@functools.lru_cache(maxsize=1)
def get_tokenizer():
    from transformers import LlamaTokenizer

    tokenizer = LlamaTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

    def _tokenizer(text):
        # drop the leading BOS token id so token counts are not inflated
        return tokenizer(text)["input_ids"][1:]

    return _tokenizer


@functools.lru_cache(maxsize=8)
def get_prompt_set(min_input_length=0, max_input_length=500):
    """
    return a list of prompts with length between min_input_length and max_input_length
    """
    import json
    import requests
    import os

    # check if the dataset is cached
    if os.path.exists("databricks-dolly-15k.jsonl"):
        print("Loading cached dataset")
        with open("databricks-dolly-15k.jsonl", "r") as f:
            dataset = [json.loads(line) for line in f.readlines()]
    else:
        print("Downloading dataset")
        raw_dataset = requests.get(
            "https://huggingface.co/datasets/databricks/databricks-dolly-15k/resolve/main/databricks-dolly-15k.jsonl"
        )
        content = raw_dataset.content
        open("databricks-dolly-15k.jsonl", "wb").write(content)
        dataset = [json.loads(line) for line in content.decode().split("\n") if line.strip()]
        print("Dataset downloaded")

    tokenizer = get_tokenizer()
    for d in dataset:
        d["input_tokens"] = len(tokenizer(d["instruction"]))
        d["output_tokens"] = len(tokenizer(d["response"]))
    return [
        d["instruction"]
        for d in dataset
        if min_input_length <= d["input_tokens"] <= max_input_length
    ]
--------------------------------------------------------------------------------
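For a quick look at the workload itself, independent of any server, the dataset helpers in `common.py` can be used directly. A small sketch (the script name is hypothetical; it assumes the Hugging Face login from the README, since the Llama-2 tokenizer is gated):

```python
# inspect_prompts.py -- hypothetical helper, not part of the repo
from common import get_prompt_set, get_tokenizer

tokenize = get_tokenizer()         # downloads the Llama-2 tokenizer on first use
prompts = get_prompt_set(30, 150)  # same 30-150 input-token band benchmark.py uses

print(f"{len(prompts)} prompts with 30-150 input tokens")
sample = prompts[0]
print(sample[:120])
print(f"token count: {len(tokenize(sample))}")
```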