├── .gitignore ├── .vscode └── settings.json ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── pyproject.toml ├── sample_notebooks ├── huggingface_pytorch_ner │ └── huggingface_pytorch_ner_example.ipynb ├── tensorflow_mobilenet_ic │ ├── sample_images │ │ ├── n01440764_451.JPEG │ │ ├── n02102040_281.JPEG │ │ ├── n02979186_600.JPEG │ │ ├── n03000684_481.JPEG │ │ ├── n03028079_1002.JPEG │ │ └── n03445777_451.JPEG │ └── tf_ic_mobilenet.ipynb └── xgboost_classification │ └── xgboost_clf.ipynb ├── setup.py └── src └── sm_serverless_benchmarking ├── __init__.py ├── __main__.py ├── analysis.py ├── benchmark.py ├── cost_constants.py ├── endpoint.py ├── report.py ├── report_templates ├── __init__.py └── report_template.html ├── requirements.txt ├── sagemaker_runner.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | **/__pycache__ 2 | sm_serverless_benchmarking.egg-info 3 | dev_artifacts 4 | .vscode/ 5 | dist/ 6 | 7 | 8 | # Jupyter Notebook 9 | .ipynb_checkpoints -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.linting.flake8Enabled": true, 3 | "python.linting.enabled": false 4 | } -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 4 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 5 | opensource-codeofconduct@amazon.com with any additional questions or comments. -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | ## Reporting Bugs/Feature Requests 10 | 11 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 12 | 13 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 14 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 15 | 16 | * A reproducible test case or series of steps 17 | * The version of our code being used 18 | * Any modifications you've made relevant to the bug 19 | * Anything unusual about your environment or deployment 20 | 21 | ## Contributing via Pull Requests 22 | 23 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 24 | 25 | 1. You are working against the latest source on the *main* branch. 26 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 27 | 3. 
You open an issue to discuss any significant work - we would hate for your time to be wasted. 28 | 29 | To send us a pull request, please: 30 | 31 | 1. Fork the repository. 32 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 33 | 3. Ensure local tests pass. 34 | 4. Commit to your fork using clear commit messages. 35 | 5. Send us a pull request, answering any default questions in the pull request interface. 36 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 37 | 38 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 39 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 40 | 41 | ## Finding contributions to work on 42 | 43 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 44 | 45 | ## Code of Conduct 46 | 47 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 48 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 49 | opensource-codeofconduct@amazon.com with any additional questions or comments. 50 | 51 | ## Security issue notifications 52 | 53 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 54 | 55 | ## Licensing 56 | 57 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include src/sm_serverless_benchmarking/report_templates/*.html 2 | include src/sm_serverless_benchmarking/requirements.txt 3 | prune sample_notebooks -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SageMaker Serverless Inference Toolkit 2 | 3 | Tools to benchmark SageMaker serverless endpoint configurations and help find the optimal one 4 | 5 | ## Installation and Prerequisites 6 | To install the toolkit into your environment, first clone this repo. Then, inside the repo directory, run 7 | ``` 8 | pip install sm-serverless-benchmarking 9 | ``` 10 | To run the benchmark, your user profile or execution role needs the appropriate IAM permissions, including: 11 | #### **SageMaker** 12 | - CreateModel 13 | - CreateEndpointConfig / DeleteEndpointConfig 14 | - CreateEndpoint / DeleteEndpoint 15 | - CreateProcessingJob (if using SageMaker Runner) 16 | #### **SageMaker Runtime** 17 | - InvokeEndpoint 18 | #### **CloudWatch** 19 | - GetMetricStatistics 20 | 21 | ## Quick Start 22 | To run a benchmark locally, provide your SageMaker [Model](https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_CreateModel.html) name and a list of example invocation arguments. Each of these arguments will be passed directly to the SageMaker Runtime [InvokeEndpoint](https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/sagemaker-runtime.html#SageMakerRuntime.Client.invoke_endpoint) API. 23 | ``` 24 | from sm_serverless_benchmarking import benchmark 25 | from sm_serverless_benchmarking.utils import convert_invoke_args_to_jsonl 26 | 27 | model_name = "" 28 | 29 | example_invoke_args = [ 30 | {'Body': '1,2,3,4,5', "ContentType": "text/csv"}, 31 | {'Body': '6,7,8,9,10', "ContentType": "text/csv"} 32 | ] 33 | 34 | example_args_file = convert_invoke_args_to_jsonl(example_invoke_args, 35 | output_path=".") 36 | 37 | r = benchmark.run_serverless_benchmarks(model_name, example_args_file) 38 | ``` 39 | Alternatively, you can run the benchmarks as a SageMaker Processing job 40 | ``` 41 | from sm_serverless_benchmarking.sagemaker_runner import run_as_sagemaker_job 42 | 43 | run_as_sagemaker_job( 44 | role="", 45 | model_name="", 46 | invoke_args_examples_file="", 47 | ) 48 | ``` 49 | A utility function `sm_serverless_benchmarking.utils.convert_invoke_args_to_jsonl` is provided to convert a list of invocation argument examples into a JSONLines file. If you are working with data that cannot be serialized to JSON, such as binary data (images, audio, or video), use the `sm_serverless_benchmarking.utils.convert_invoke_args_to_pkl` function, which will serialize the examples to a pickle file instead. 50 | 51 | Refer to the [sample_notebooks](sample_notebooks) directory for complete examples. 52 | 53 | ## Types of Benchmarks 54 | By default, two types of benchmarks are executed: 55 | 56 | - **Stability Benchmark** For each memory configuration, with a [MaxConcurrency](https://docs.aws.amazon.com/sagemaker/latest/dg/serverless-endpoints-create.html#serverless-endpoints-create-config) of 1, invokes the endpoint a specified number of times sequentially.
The goal of this benchmark is to determine the most cost-effective and stable memory configuration. 57 | - **Concurrency Benchmark** Invokes the endpoint with a simulated number of concurrent clients under different MaxConcurrency configurations. 58 | 59 | ## Configuring the Benchmarks 60 | For either of the two approaches described above, you can specify a number of parameters to configure the benchmarking job (see the example after this list): 61 | 62 | cold_start_delay (int, optional): Number of seconds to sleep before starting the benchmark. Helps to induce a cold start on the initial invocation. Defaults to 0. 63 | memory_sizes (List[int], optional): List of memory configurations to benchmark. Defaults to [1024, 2048, 3072, 4096, 5120, 6144]. 64 | stability_benchmark_invocations (int, optional): Total number of invocations for the stability benchmark. Defaults to 1000. 65 | stability_benchmark_error_thresh (int, optional): The allowed number of endpoint invocation errors before the benchmark is terminated for a configuration. Defaults to 3. 66 | include_concurrency_benchmark (bool, optional): Set True to run the concurrency benchmark with the optimal configuration from the stability benchmark. Defaults to True. 67 | concurrency_benchmark_max_conc (List[int], optional): A list of MaxConcurrency settings to benchmark. Defaults to [2, 4, 8]. 68 | concurrency_benchmark_invocations (int, optional): Total number of invocations for the concurrency benchmark. Defaults to 1000. 69 | concurrency_num_clients_multiplier (List[int], optional): List of multipliers that determine the number of simulated clients as max_concurrency * multiplier. Defaults to [1, 1.5, 1.75, 2]. 70 | result_save_path (str, optional): The location to which the output artifacts will be saved. Defaults to ".".
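For example, these parameters can be passed as keyword arguments to `benchmark.run_serverless_benchmarks` (the same parameters can also be passed to `run_as_sagemaker_job`, as the sample notebooks demonstrate). The sketch below is illustrative only; the memory sizes, invocation counts, multipliers, and output path shown are placeholder values, not recommendations.
```
from sm_serverless_benchmarking import benchmark
from sm_serverless_benchmarking.utils import convert_invoke_args_to_jsonl

model_name = ""  # name of an existing SageMaker Model resource

example_invoke_args = [
    {"Body": "1,2,3,4,5", "ContentType": "text/csv"},
    {"Body": "6,7,8,9,10", "ContentType": "text/csv"},
]
example_args_file = convert_invoke_args_to_jsonl(example_invoke_args, output_path=".")

# illustrative configuration: a shorter run over a subset of memory sizes
report = benchmark.run_serverless_benchmarks(
    model_name,
    example_args_file,
    cold_start_delay=60,                    # sleep 60s so the first invocation hits a cold start
    memory_sizes=[2048, 4096, 6144],        # benchmark only these memory configurations
    stability_benchmark_invocations=500,
    stability_benchmark_error_thresh=3,
    include_concurrency_benchmark=True,
    concurrency_benchmark_max_conc=[4, 8],
    concurrency_benchmark_invocations=500,
    concurrency_num_clients_multiplier=[1, 1.5, 2],
    result_save_path="benchmark_results",
)
```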
71 | 72 | 73 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | 6 | [project] 7 | name = "sm-serverless-benchmarking" 8 | version = "0.2.3" 9 | description = "Benchmark sagemaker serverless endpoints for cost and performance" 10 | readme = "README.md" 11 | authors = [{ name = "Amazon Web Services"}] 12 | license = { file = "LICENSE" } 13 | classifiers = [ 14 | "License :: OSI Approved :: MIT License", 15 | "Programming Language :: Python", 16 | "Programming Language :: Python :: 3", 17 | ] 18 | keywords = ["sagemaker", "inference", "hosting"] 19 | dependencies = [ 20 | "boto3>=1.20.21,<2.0", 21 | "pandas", 22 | "seaborn", 23 | "Jinja2", 24 | "numpy", 25 | "matplotlib", 26 | 'tomli; python_version < "3.11"', 27 | ] 28 | requires-python = ">=3.7" 29 | 30 | [project.optional-dependencies] 31 | dev = ["black", "bumpver", "isort", "pip-tools"] 32 | sagemaker = ["sagemaker>2.0,<3.0"] 33 | 34 | -------------------------------------------------------------------------------- /sample_notebooks/huggingface_pytorch_ner/huggingface_pytorch_ner_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%pip install sagemaker xgboost==1.5.1 scikit-learn install sm-serverless-benchmarking -Uqq" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Deploy a pretrained Named Entity Recognition Model\n", 17 | "In this example, we'll deploy a pretrained Named Entity Recognition (NER) using SageMaker Jumpstart then benchmark the model using the SageMaker Serverless Inference Benchmarking toolkit" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import sagemaker\n", 27 | "from sagemaker import image_uris, model_uris, script_uris\n", 28 | "from sagemaker.model import Model\n", 29 | "import uuid\n", 30 | "\n", 31 | "role = (\n", 32 | " sagemaker.get_execution_role()\n", 33 | ") # manually provide role if using non role based identity\n", 34 | "sess = sagemaker.Session()\n", 35 | "region = sess.boto_region_name" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "model_id, model_version = (\n", 45 | " \"huggingface-ner-distilbert-base-cased-finetuned-conll03-english\",\n", 46 | " \"*\",\n", 47 | ")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "model_name = f\"js-huggingface-ner-distilbert-{str(uuid.uuid1())[:5]}\"\n", 57 | "\n", 58 | "inference_instance_type = \"ml.m5.xlarge\" # used to lookup cpu inference container. No instance will be deployed\n", 59 | "\n", 60 | "# Retrieve the inference docker container uri. 
This is the base HuggingFace container image for the default model above.\n", 61 | "deploy_image_uri = image_uris.retrieve(\n", 62 | " region=None,\n", 63 | " framework=None, # automatically inferred from model_id\n", 64 | " image_scope=\"inference\",\n", 65 | " model_id=model_id,\n", 66 | " model_version=model_version,\n", 67 | " instance_type=inference_instance_type,\n", 68 | ")\n", 69 | "\n", 70 | "# Retrieve the inference script uri. This includes all dependencies and scripts for model loading, inference handling etc.\n", 71 | "deploy_source_uri = script_uris.retrieve(\n", 72 | " model_id=model_id, model_version=model_version, script_scope=\"inference\"\n", 73 | ")\n", 74 | "\n", 75 | "\n", 76 | "# Retrieve the model uri. This includes the pre-trained model and parameters.\n", 77 | "model_uri = model_uris.retrieve(\n", 78 | " model_id=model_id, model_version=model_version, model_scope=\"inference\"\n", 79 | ")\n", 80 | "\n", 81 | "\n", 82 | "# Create the SageMaker model instance\n", 83 | "model = Model(\n", 84 | " image_uri=deploy_image_uri,\n", 85 | " source_dir=deploy_source_uri,\n", 86 | " model_data=model_uri,\n", 87 | " entry_point=\"inference.py\", # entry point file in source_dir and present in deploy_source_uri\n", 88 | " role=role,\n", 89 | " name=model_name,\n", 90 | ")" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "model.sagemaker_session = sess\n", 100 | "model.create(instance_type=inference_instance_type)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Validate Endpoint\n", 108 | "Before launching a full benchmarking job, it is a good idea to first deploy the model on a test endpoint to ensure everything is functioning as it should. Here we will deploy a temporary endpoint and test it with an example payload. Afterwards, the endpoint is deleted. 
" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# create a temporary endpoint\n", 118 | "from sm_serverless_benchmarking.endpoint import ServerlessEndpoint\n", 119 | "\n", 120 | "endpoint = ServerlessEndpoint(model_name=model.name, memory_size=6144)\n", 121 | "endpoint.create_endpoint()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "# invoke it with a sample payload and make sure a valid response is returned\n", 131 | "input_text = \"My name is Wolfgang and I live in Berlin\"\n", 132 | "response = endpoint.invoke_endpoint(\n", 133 | " {\"Body\": input_text, \"ContentType\": \"application/x-text\"}\n", 134 | ")\n", 135 | "print(response[\"Body\"].read().decode(\"utf8\"))" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "endpoint.clean_up() # delete the endpoint" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "## Launch Benchmarking SageMaker Job" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "from sm_serverless_benchmarking.utils import convert_invoke_args_to_jsonl\n", 161 | "from sm_serverless_benchmarking.sagemaker_runner import run_as_sagemaker_job\n", 162 | "\n", 163 | "example_invoke_args = [\n", 164 | " {\n", 165 | " \"Body\": \"My name is Wolfgang and I live in Berlin\",\n", 166 | " \"ContentType\": \"application/x-text\",\n", 167 | " },\n", 168 | " {\n", 169 | " \"Body\": \"Amazon.com, Inc. is an American multinational technology company which focuses on e-commerce, cloud computing, digital streaming, and artificial intelligence. It is headquartered in Seattle, WA\",\n", 170 | " \"ContentType\": \"application/x-text\",\n", 171 | " },\n", 172 | " {\n", 173 | " \"Body\": \"Simon is attending a machine learning workshop in New York next week\",\n", 174 | " \"ContentType\": \"application/x-text\",\n", 175 | " },\n", 176 | "]\n", 177 | "\n", 178 | "example_invoke_file = convert_invoke_args_to_jsonl(example_invoke_args)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "processor = run_as_sagemaker_job(\n", 188 | " role=role, model_name=model.name, invoke_args_examples_file=example_invoke_file\n", 189 | ")" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "print(\n", 199 | " f\"Once the job is finished, the outputs will be uploaded to {processor.latest_job.outputs[0].destination}\"\n", 200 | ")" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | "You can optionally run the command below to copy all of the benchmark output artifacts into the current directory. The primary report output will be under the `benchmarking_report/` directory" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "!aws s3 cp --recursive {processor.latest_job.outputs[0].destination} ." 
217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "## Run a Local Benchmarking Job [OPTIONAL]\n", 224 | "You can also run the same benchmark locally " 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "from sm_serverless_benchmarking.benchmark import run_serverless_benchmarks\n", 234 | "\n", 235 | "report = run_serverless_benchmarks(\n", 236 | " model_name=model.name, invoke_args_examples_file=example_invoke_file\n", 237 | ")" 238 | ] 239 | } 240 | ], 241 | "metadata": { 242 | "interpreter": { 243 | "hash": "37058495916f7ab2e7db9963171426deb73c0dc04073ed3a56b3427789bc2f48" 244 | }, 245 | "kernelspec": { 246 | "display_name": "Python 3.8.13 ('serverless-benchmarking')", 247 | "language": "python", 248 | "name": "python3" 249 | }, 250 | "language_info": { 251 | "codemirror_mode": { 252 | "name": "ipython", 253 | "version": 3 254 | }, 255 | "file_extension": ".py", 256 | "mimetype": "text/x-python", 257 | "name": "python", 258 | "nbconvert_exporter": "python", 259 | "pygments_lexer": "ipython3", 260 | "version": "3.8.13" 261 | }, 262 | "orig_nbformat": 4 263 | }, 264 | "nbformat": 4, 265 | "nbformat_minor": 2 266 | } 267 | -------------------------------------------------------------------------------- /sample_notebooks/tensorflow_mobilenet_ic/sample_images/n01440764_451.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sagemaker-serverless-inference-benchmarking/cb0028576fe60e4704526a46368cbd53851eebc7/sample_notebooks/tensorflow_mobilenet_ic/sample_images/n01440764_451.JPEG -------------------------------------------------------------------------------- /sample_notebooks/tensorflow_mobilenet_ic/sample_images/n02102040_281.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sagemaker-serverless-inference-benchmarking/cb0028576fe60e4704526a46368cbd53851eebc7/sample_notebooks/tensorflow_mobilenet_ic/sample_images/n02102040_281.JPEG -------------------------------------------------------------------------------- /sample_notebooks/tensorflow_mobilenet_ic/sample_images/n02979186_600.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sagemaker-serverless-inference-benchmarking/cb0028576fe60e4704526a46368cbd53851eebc7/sample_notebooks/tensorflow_mobilenet_ic/sample_images/n02979186_600.JPEG -------------------------------------------------------------------------------- /sample_notebooks/tensorflow_mobilenet_ic/sample_images/n03000684_481.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sagemaker-serverless-inference-benchmarking/cb0028576fe60e4704526a46368cbd53851eebc7/sample_notebooks/tensorflow_mobilenet_ic/sample_images/n03000684_481.JPEG -------------------------------------------------------------------------------- /sample_notebooks/tensorflow_mobilenet_ic/sample_images/n03028079_1002.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sagemaker-serverless-inference-benchmarking/cb0028576fe60e4704526a46368cbd53851eebc7/sample_notebooks/tensorflow_mobilenet_ic/sample_images/n03028079_1002.JPEG 
-------------------------------------------------------------------------------- /sample_notebooks/tensorflow_mobilenet_ic/sample_images/n03445777_451.JPEG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sagemaker-serverless-inference-benchmarking/cb0028576fe60e4704526a46368cbd53851eebc7/sample_notebooks/tensorflow_mobilenet_ic/sample_images/n03445777_451.JPEG -------------------------------------------------------------------------------- /sample_notebooks/tensorflow_mobilenet_ic/tf_ic_mobilenet.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%pip install sagemaker xgboost==1.5.1 scikit-learn install sm-serverless-benchmarking -Uqq" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Deploy a pretrained Image Classification Model\n", 17 | "In this example, we'll deploy a pretrained Image Classification model using SageMaker Jumpstart then benchmark the model using the SageMaker Serverless Inference Benchmarking toolkit" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import sagemaker\n", 27 | "from sagemaker import image_uris, model_uris, script_uris\n", 28 | "from sagemaker.model import Model\n", 29 | "import uuid\n", 30 | "\n", 31 | "role = (\n", 32 | " sagemaker.get_execution_role()\n", 33 | ") # manually provide role if using non role based identity\n", 34 | "sess = sagemaker.Session()\n", 35 | "region = sess.boto_region_name" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "model_id, model_version = (\n", 45 | " \"tensorflow-ic-imagenet-mobilenet-v2-100-224-classification-4\",\n", 46 | " \"*\",\n", 47 | ")" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "model_name = f\"js-tf-ic-mobilenet-{str(uuid.uuid1())[:5]}\"\n", 57 | "\n", 58 | "inference_instance_type = \"ml.m5.xlarge\" # used to lookup cpu inference container. No instance will be deployed\n", 59 | "\n", 60 | "# Retrieve the inference docker container uri. This is the base HuggingFace container image for the default model above.\n", 61 | "deploy_image_uri = image_uris.retrieve(\n", 62 | " region=None,\n", 63 | " framework=None, # automatically inferred from model_id\n", 64 | " image_scope=\"inference\",\n", 65 | " model_id=model_id,\n", 66 | " model_version=model_version,\n", 67 | " instance_type=inference_instance_type,\n", 68 | ")\n", 69 | "\n", 70 | "# Retrieve the inference script uri. This includes all dependencies and scripts for model loading, inference handling etc.\n", 71 | "deploy_source_uri = script_uris.retrieve(\n", 72 | " model_id=model_id, model_version=model_version, script_scope=\"inference\"\n", 73 | ")\n", 74 | "\n", 75 | "\n", 76 | "# Retrieve the model uri. 
This includes the pre-trained model and parameters.\n", 77 | "model_uri = model_uris.retrieve(\n", 78 | " model_id=model_id, model_version=model_version, model_scope=\"inference\"\n", 79 | ")\n", 80 | "\n", 81 | "\n", 82 | "# Create the SageMaker model instance\n", 83 | "model = Model(\n", 84 | " image_uri=deploy_image_uri,\n", 85 | " source_dir=deploy_source_uri,\n", 86 | " model_data=model_uri,\n", 87 | " entry_point=\"inference.py\", # entry point file in source_dir and present in deploy_source_uri\n", 88 | " role=role,\n", 89 | " name=model_name,\n", 90 | ")" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "metadata": {}, 97 | "outputs": [], 98 | "source": [ 99 | "model.sagemaker_session = sess\n", 100 | "model.create(instance_type=inference_instance_type)" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## Validate Endpoint\n", 108 | "Before launching a full benchmarking job, it is a good idea to first deploy the model on a test endpoint to ensure everything is functioning as it should. Here we will deploy a temporary endpoint and test it with an example payload. Afterwards, the endpoint is deleted. " 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "# create a temporary endpoint\n", 118 | "from sm_serverless_benchmarking.endpoint import ServerlessEndpoint\n", 119 | "\n", 120 | "endpoint = ServerlessEndpoint(model_name=model.name, memory_size=6144)\n", 121 | "endpoint.create_endpoint()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "from pathlib import Path\n", 131 | "\n", 132 | "sample_image_path = Path(\"sample_images\")\n", 133 | "image_paths = list(sample_image_path.glob(\"*.JPEG\"))" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "# invoke it with a sample payload and make sure a valid response is returned\n", 143 | "image_payload = image_paths[0].open(\"rb\").read()\n", 144 | "response = endpoint.invoke_endpoint(\n", 145 | " {\"Body\": image_payload, \"ContentType\": \"application/x-image\"}\n", 146 | ")\n", 147 | "print(\n", 148 | " len(response[\"Body\"].read().decode(\"utf8\"))\n", 149 | ") # response is a long list of probabilities so just printing the length" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [ 158 | "endpoint.clean_up() # delete the endpoint" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## Launch Benchmarking SageMaker Job" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": null, 171 | "metadata": {}, 172 | "outputs": [], 173 | "source": [ 174 | "from sm_serverless_benchmarking.utils import convert_invoke_args_to_pkl\n", 175 | "from sm_serverless_benchmarking.sagemaker_runner import run_as_sagemaker_job\n", 176 | "\n", 177 | "example_invoke_args = [\n", 178 | " {\"Body\": img.open(\"rb\").read(), \"ContentType\": \"application/x-image\"}\n", 179 | " for img in image_paths\n", 180 | "]\n", 181 | "\n", 182 | "example_invoke_file = convert_invoke_args_to_pkl(example_invoke_args)" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": null, 188 | "metadata": {}, 189 | 
"outputs": [], 190 | "source": [ 191 | "processor = run_as_sagemaker_job(\n", 192 | " role=role,\n", 193 | " model_name=model.name,\n", 194 | " invoke_args_examples_file=example_invoke_file,\n", 195 | " stability_benchmark_invocations=2500,\n", 196 | " concurrency_benchmark_invocations=2500,\n", 197 | ")" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": null, 203 | "metadata": {}, 204 | "outputs": [], 205 | "source": [ 206 | "print(\n", 207 | " f\"Once the job is finished, the outputs will be uploaded to {processor.latest_job.outputs[0].destination}\"\n", 208 | ")" 209 | ] 210 | }, 211 | { 212 | "cell_type": "markdown", 213 | "metadata": {}, 214 | "source": [ 215 | "You can optionally run the command below to copy all of the benchmark output artifacts into the current directory. The primary report output will be under the `benchmarking_report/` directory" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "!aws s3 cp --recursive {processor.latest_job.outputs[0].destination} ." 225 | ] 226 | }, 227 | { 228 | "cell_type": "markdown", 229 | "metadata": {}, 230 | "source": [ 231 | "## Run a Local Benchmarking Job [OPTIONAL]\n", 232 | "You can also run the same benchmark locally " 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": null, 238 | "metadata": { 239 | "vscode": { 240 | "languageId": "markdown" 241 | } 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "from sm_serverless_benchmarking.benchmark import run_serverless_benchmarks\n", 246 | "report = run_serverless_benchmarks(model_name=model.name, invoke_args_examples_file=example_invoke_file)" 247 | ] 248 | } 249 | ], 250 | "metadata": { 251 | "interpreter": { 252 | "hash": "37058495916f7ab2e7db9963171426deb73c0dc04073ed3a56b3427789bc2f48" 253 | }, 254 | "kernelspec": { 255 | "display_name": "Python 3.8.13 ('serverless-benchmarking')", 256 | "language": "python", 257 | "name": "python3" 258 | }, 259 | "language_info": { 260 | "codemirror_mode": { 261 | "name": "ipython", 262 | "version": 3 263 | }, 264 | "file_extension": ".py", 265 | "mimetype": "text/x-python", 266 | "name": "python", 267 | "nbconvert_exporter": "python", 268 | "pygments_lexer": "ipython3", 269 | "version": "3.8.13" 270 | }, 271 | "orig_nbformat": 4 272 | }, 273 | "nbformat": 4, 274 | "nbformat_minor": 2 275 | } 276 | -------------------------------------------------------------------------------- /sample_notebooks/xgboost_classification/xgboost_clf.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%pip install sagemaker xgboost==1.5.1 scikit-learn install sm-serverless-benchmarking -Uqq" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "## Train and deploy an XGBooost Model\n", 17 | "In this example, we'll train an XGBoost model on an synthetic dataset and then deploy it for benchmarking" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "import tarfile\n", 27 | "\n", 28 | "\n", 29 | "import sagemaker\n", 30 | "import xgboost as xgb\n", 31 | "import numpy as np\n", 32 | "from sagemaker.model import Model\n", 33 | "from sagemaker.serializers import CSVSerializer\n", 34 | "from sklearn.datasets import 
make_classification\n", 35 | "\n", 36 | "role = (\n", 37 | " sagemaker.get_execution_role()\n", 38 | ") # manually provide role if using non role based identity\n", 39 | "sess = sagemaker.Session()\n", 40 | "region = sess.boto_region_name\n", 41 | "bucket = sess.default_bucket()\n", 42 | "image_uri = sagemaker.image_uris.retrieve(\n", 43 | " framework=\"xgboost\", region=region, version=\"1.5-1\"\n", 44 | ")" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "# train a model on some synthetic data\n", 54 | "X, y = make_classification(n_samples=1000, n_features=20)\n", 55 | "data = xgb.DMatrix(data=X, label=y)\n", 56 | "bst = xgb.train(params={}, dtrain=data)\n", 57 | "bst.save_model(\"./xgboost-model\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "# package the model for deployment and upload to S3\n", 67 | "with tarfile.open(\"model.tar.gz\", \"w:gz\") as f:\n", 68 | " f.add(\"xgboost-model\")\n", 69 | "model_uri = sess.upload_data(\n", 70 | " \"model.tar.gz\", bucket=bucket, key_prefix=\"sm-sl-bench-xgboost\"\n", 71 | ")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "sm_model = Model(image_uri=image_uri, model_data=model_uri, role=role)\n", 81 | "sm_model.create(instance_type=\"ml.m5.xlarge\")" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Validate Endpoint\n", 89 | "Before launching a full benchmarking job, it is a good idea to first deploy the model on a test endpoint to ensure everything is functioning as it should. Here we will deploy a temporary endpoint and test it with an example payload. Afterwards, the endpoint is deleted. 
" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# create a temporary endpoint\n", 99 | "from sm_serverless_benchmarking.endpoint import ServerlessEndpoint\n", 100 | "\n", 101 | "endpoint = ServerlessEndpoint(model_name=sm_model.name, memory_size=6144)\n", 102 | "endpoint.create_endpoint()" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "# convert first 10 records into csv format\n", 112 | "ser = CSVSerializer()\n", 113 | "payload = ser.serialize(X[:10])" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "metadata": {}, 120 | "outputs": [], 121 | "source": [ 122 | "# invoke endpoint and print predictions\n", 123 | "response = endpoint.invoke_endpoint({\"Body\": payload, \"ContentType\": \"text/csv\"})\n", 124 | "print(response[\"Body\"].read().decode(\"utf-8\"))" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "endpoint.clean_up() # delete the endpoint" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## Launch Benchmarking SageMaker Job" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": null, 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "from sm_serverless_benchmarking.utils import convert_invoke_args_to_jsonl\n", 150 | "from sm_serverless_benchmarking.sagemaker_runner import run_as_sagemaker_job\n", 151 | "\n", 152 | "# we'll use 20 random inputs\n", 153 | "sample_inputs = X[np.random.choice(X.shape[0], size=20, replace=False)]\n", 154 | "\n", 155 | "example_invoke_args = [\n", 156 | " {\"Body\": ser.serialize(inp), \"ContentType\": \"text/csv\"} for inp in sample_inputs\n", 157 | "]\n", 158 | "\n", 159 | "example_invoke_file = convert_invoke_args_to_jsonl(example_invoke_args)" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": null, 165 | "metadata": {}, 166 | "outputs": [], 167 | "source": [ 168 | "processor = run_as_sagemaker_job(\n", 169 | " role=role,\n", 170 | " model_name=sm_model.name,\n", 171 | " invoke_args_examples_file=example_invoke_file,\n", 172 | " stability_benchmark_invocations=2500,\n", 173 | " concurrency_benchmark_invocations=2500,\n", 174 | ")" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "print(\n", 184 | " f\"Once the job is finished, the outputs will be uploaded to {processor.latest_job.outputs[0].destination}\"\n", 185 | ")" 186 | ] 187 | }, 188 | { 189 | "cell_type": "markdown", 190 | "metadata": {}, 191 | "source": [ 192 | "You can optionally run the command below to copy all of the benchmark output artifacts into the current directory. The primary report output will be under the `benchmarking_report/` directory" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "metadata": {}, 199 | "outputs": [], 200 | "source": [ 201 | "!aws s3 cp --recursive {processor.latest_job.outputs[0].destination} ." 
202 | ] 203 | }, 204 | { 205 | "cell_type": "markdown", 206 | "metadata": {}, 207 | "source": [ 208 | "## Run a Local Benchmarking Job [OPTIONAL]\n", 209 | "You can also run the same benchmark locally " 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": null, 215 | "metadata": {}, 216 | "outputs": [], 217 | "source": [ 218 | "from sm_serverless_benchmarking.benchmark import run_serverless_benchmarks\n", 219 | "\n", 220 | "report = run_serverless_benchmarks(\n", 221 | " model_name=sm_model.name, invoke_args_examples_file=example_invoke_file\n", 222 | ")" 223 | ] 224 | } 225 | ], 226 | "metadata": { 227 | "interpreter": { 228 | "hash": "37058495916f7ab2e7db9963171426deb73c0dc04073ed3a56b3427789bc2f48" 229 | }, 230 | "kernelspec": { 231 | "display_name": "Python 3.8.13 ('serverless-benchmarking')", 232 | "language": "python", 233 | "name": "python3" 234 | }, 235 | "language_info": { 236 | "codemirror_mode": { 237 | "name": "ipython", 238 | "version": 3 239 | }, 240 | "file_extension": ".py", 241 | "mimetype": "text/x-python", 242 | "name": "python", 243 | "nbconvert_exporter": "python", 244 | "pygments_lexer": "ipython3", 245 | "version": "3.8.13" 246 | }, 247 | "orig_nbformat": 4 248 | }, 249 | "nbformat": 4, 250 | "nbformat_minor": 2 251 | } 252 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup() -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.3" -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/__main__.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | 4 | if "SAGEMAKER_TRAINING_MODULE" in os.environ: 5 | print("Running in a SageMaker Environment") 6 | import shutil 7 | shutil.copytree(".", "sm_serverless_benchmarking") 8 | 9 | 10 | import argparse 11 | from pathlib import Path 12 | 13 | from sm_serverless_benchmarking.benchmark import run_serverless_benchmarks 14 | 15 | 16 | def parse_args(): 17 | parser = argparse.ArgumentParser(description="Run a suite of SageMaker Serverless Benchmarks on the specified model_name") 18 | parser.add_argument("model_name", type=str, help="Name of the SageMaker Model resource") 19 | parser.add_argument("invoke_args_examples_file", type=Path, help="Path to the jsonl file containing the example invocation arcguments") 20 | parser.add_argument("--cold_start_delay", type=int, help="Number of seconds to sleep before starting the benchmark. Helps to induce a cold start on initial invocation. Defaults to 0") 21 | parser.add_argument("--memory_sizes", type=int, nargs="+", choices=[1024, 2048, 3072, 4096, 5120, 6144], help="List of memory configurations to benchmark Defaults to [1024, 2048, 3072, 4096, 5120, 6144]") 22 | parser.add_argument("--stability_benchmark_invocations", type=int, help="Total number of invocations for the stability benchmark. Defaults to 1000") 23 | parser.add_argument("--stability_benchmark_error_thresh", type=int, help="The allowed number of endpoint invocation errors before the benchmark is terminated for that endpoint. 
Defaults to 3.") 24 | parser.add_argument("--no_include_concurrency_benchmark", action='store_true', help="Do not run the concurrency benchmark with the optimal configuration from the stability benchmark. Defaults to False") 25 | parser.add_argument("--concurrency_benchmark_max_conc", type=int, nargs="+", help="A list of max_concurency settings to benchmark. Defaults to [2, 4, 8]") 26 | parser.add_argument("--concurrency_benchmark_invocations", type=int, help="Total number of invocations for the concurency benchmark. Defaults to 1000") 27 | parser.add_argument("--concurrency_num_clients_multiplier", type=float, nargs="+", help="List of multipliers to specify the number of simulated clients which is determined by max_concurency * multiplier. Defaults to [1, 1.5, 1.75, 2]") 28 | parser.add_argument("--result_save_path", type=Path, help="The location to which the output artifacts will be saved. Defaults to .") 29 | args = parser.parse_args() 30 | arg_dict = vars(args) 31 | arg_dict = {k:v for k, v in arg_dict.items() if v is not None} 32 | 33 | 34 | if not arg_dict["no_include_concurrency_benchmark"]: 35 | arg_dict["include_concurrency_benchmark"] = True 36 | else: 37 | arg_dict["include_concurrency_benchmark"] = False 38 | 39 | arg_dict.pop("no_include_concurrency_benchmark") 40 | 41 | return arg_dict 42 | 43 | def main(): 44 | kwargs = parse_args() 45 | run_serverless_benchmarks(**kwargs) 46 | 47 | if __name__ == "__main__": 48 | main() 49 | 50 | -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/analysis.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from functools import reduce 3 | from pathlib import Path 4 | from typing import Dict, List, Tuple, Union 5 | 6 | import matplotlib 7 | import matplotlib.pyplot as plt 8 | import numpy as np 9 | import pandas as pd 10 | import seaborn as sns 11 | 12 | from sm_serverless_benchmarking.cost_constants import (INFERENCE_COST, 13 | INSTANCE_MAPPING, 14 | MONTHLY_INSTANCE_COST, 15 | PROCESSING_COST) 16 | 17 | 18 | def iqr(ps: pd.Series) -> float: 19 | p25 = ps.quantile(0.25) 20 | p75 = ps.quantile(0.75) 21 | iqr = p75 - p25 22 | 23 | return iqr 24 | 25 | 26 | def convert_units(ps: pd.Series) -> pd.Series: 27 | 28 | aggregates = ["Average", "Minimum", "Maximum", "p25", "p50", "p75"] 29 | if ps["Unit"] == "Microseconds": 30 | ps[aggregates] = ps[aggregates] / 1000 31 | ps["Unit"] = "Milliseconds" 32 | return ps 33 | else: 34 | return ps 35 | 36 | 37 | def compute_tps(df: pd.DataFrame) -> pd.DataFrame: 38 | 39 | invocation_end_time_counts = df["end_time"].astype(int).value_counts() 40 | 41 | tps_metrics = ( 42 | invocation_end_time_counts.describe().drop(["count", "std", "min"]).astype(int) 43 | ) 44 | tps_metrics.rename( 45 | { 46 | "25%": "tps_p25", 47 | "50%": "tps_p50", 48 | "75%": "tps_p75", 49 | "max": "tps_max", 50 | "mean": "tps_avg", 51 | }, 52 | inplace=True, 53 | ) 54 | 55 | return tps_metrics 56 | 57 | 58 | def summarize_stability_results( 59 | df_benchmark_results: pd.DataFrame, 60 | df_endpoint_metrics: pd.DataFrame, 61 | result_save_path: str = ".", 62 | ) -> Tuple[pd.DataFrame, pd.DataFrame, matplotlib.figure.Figure]: 63 | 64 | save_path = Path(result_save_path) / "stability_benchmark_summary_results" 65 | save_path.mkdir(exist_ok=True, parents=True) 66 | 67 | df_benchmark_results_success = df_benchmark_results.query( 68 | "(invocation_latency > 0) & (response_size > 0)" 69 | ) 70 | 71 | df_benchmark_summary = 
df_benchmark_results_success.groupby("memory_size").agg( 72 | { 73 | "invocation_latency": ["count", "min", "mean", "median", "max", iqr], 74 | # "throttle_exception":["sum"], 75 | # "insufficient_memory_error":["sum"], 76 | # "other_model_error":["sum"] 77 | } 78 | ) 79 | 80 | df_benchmark_summary.columns = [ 81 | f"{x[0]}_{x[1]}" for x in df_benchmark_summary.columns.to_flat_index() 82 | ] 83 | df_benchmark_summary.rename( 84 | columns={"invocation_latency_count": "successful_invocations"}, inplace=True 85 | ) 86 | 87 | df_endpoint_metrics = df_endpoint_metrics.apply(convert_units, axis=1) 88 | df_metric_summary = df_endpoint_metrics.pivot( 89 | index="memory_size", columns="metric_name", values="Average" 90 | ).dropna(thresh=2) 91 | 92 | df_benchmark_summary.to_csv( 93 | save_path / "invocation_benchmark_summary.csv", index=True 94 | ) 95 | df_metric_summary.to_csv(save_path / "endpoint_metrics_summary.csv", index=True) 96 | 97 | latency_thresholds = df_benchmark_summary.eval( 98 | """low = invocation_latency_mean - 5 *invocation_latency_iqr 99 | high = invocation_latency_mean + 5 *invocation_latency_iqr 100 | """ 101 | )[["low", "high"]] 102 | 103 | df_benchmark_results_no_outliers = df_benchmark_results_success.merge( 104 | latency_thresholds, on="memory_size" 105 | ).query("(invocation_latency >= 0) & (invocation_latency <= high)") 106 | fig, ax = plt.subplots(figsize=(10, 6)) 107 | sns.kdeplot( 108 | data=df_benchmark_results_no_outliers, 109 | x="invocation_latency", 110 | hue="memory_size", 111 | palette="tab10", 112 | ax=ax, 113 | ).set_title("Invocation Latency (ms)") 114 | 115 | fig.savefig(save_path / "stability_benchmark_distribution.png") 116 | 117 | return df_benchmark_results_success, df_benchmark_summary, df_metric_summary, fig 118 | 119 | 120 | def summarize_concurrency_results( 121 | df_benchmark_results: pd.DataFrame, 122 | df_endpoint_metrics: pd.DataFrame, 123 | result_save_path: str = ".", 124 | ) -> Tuple[pd.DataFrame, matplotlib.figure.Figure]: 125 | 126 | save_path = Path(result_save_path) / "concurrency_benchmark_summary_results" 127 | save_path.mkdir(exist_ok=True, parents=True) 128 | 129 | df_endpoint_metrics = df_endpoint_metrics.apply(convert_units, axis=1) 130 | df_metric_summary = pd.pivot_table( 131 | df_endpoint_metrics, 132 | index=["memory_size", "max_concurrency"], 133 | columns="metric_name", 134 | values="Average", 135 | ).dropna(thresh=2) 136 | 137 | df_outlier_thresh = df_benchmark_results.groupby( 138 | ["max_concurrency", "num_clients", "memory_size"], as_index=False 139 | ).agg({"invocation_latency": ["mean", iqr]}) 140 | df_outlier_thresh.columns = [ 141 | f"{x[0]}_{x[1]}".strip("_") for x in df_outlier_thresh.columns.to_flat_index() 142 | ] 143 | df_outlier_thresh = df_outlier_thresh.eval( 144 | """low = invocation_latency_mean - 5 *invocation_latency_iqr 145 | high = invocation_latency_mean + 5 *invocation_latency_iqr 146 | """ 147 | )[["max_concurrency", "num_clients", "low", "high"]] 148 | 149 | df_benchmark_results_thresh = df_benchmark_results.merge( 150 | df_outlier_thresh, on=["max_concurrency", "num_clients"] 151 | ) 152 | df_benchmark_success = df_benchmark_results_thresh.query( 153 | "(invocation_latency <= high) & (invocation_latency > 0)" 154 | ) 155 | 156 | df_invocation_error_metrics = df_benchmark_results.groupby( 157 | ["max_concurrency", "num_clients", "memory_size"], as_index=False 158 | ).agg( 159 | { 160 | "throttle_exception": "sum", 161 | "insufficient_memory_error": "sum", 162 | "other_model_error": "sum", 163 | 
"invocation_latency": "count", 164 | } 165 | ) 166 | df_invocation_error_metrics.rename( 167 | columns={"invocation_latency": "num_invocations"}, inplace=True 168 | ) 169 | 170 | df_invocation_latency_metrics = df_benchmark_success.groupby( 171 | ["max_concurrency", "num_clients", "memory_size"], as_index=False 172 | ).agg({"invocation_latency": ["median", "mean", "max", iqr]}) 173 | 174 | df_invocation_latency_metrics.columns = [ 175 | f"{x[0]}_{x[1]}".strip("_") 176 | for x in df_invocation_latency_metrics.columns.to_flat_index() 177 | ] 178 | 179 | df_invocation_metrics = df_invocation_error_metrics.merge( 180 | df_invocation_latency_metrics, 181 | on=["max_concurrency", "num_clients", "memory_size"], 182 | ) 183 | 184 | df_tps_metrics = df_benchmark_success.groupby( 185 | ["max_concurrency", "num_clients", "memory_size"], as_index=False 186 | ).apply(compute_tps) 187 | df_concurrency_metrics = df_invocation_metrics.merge( 188 | df_tps_metrics, on=["max_concurrency", "num_clients", "memory_size"] 189 | ) 190 | 191 | concurrency_settings = df_concurrency_metrics["max_concurrency"].unique().tolist() 192 | num_plots = len(concurrency_settings) 193 | 194 | fig, axs = plt.subplots(len(concurrency_settings), 1, figsize=(10, 6 * num_plots)) 195 | for max_conc, ax in zip(concurrency_settings, axs): 196 | sns.kdeplot( 197 | data=df_benchmark_success.query(f"max_concurrency=={max_conc}"), 198 | x="invocation_latency", 199 | hue="num_clients", 200 | palette="tab10", 201 | ax=ax, 202 | ).set_title(f"Invocation Latency (ms) with max_concurrency={max_conc}") 203 | 204 | df_concurrency_metrics.to_csv( 205 | save_path / "concurrency_benchmark_summary.csv", index=True 206 | ) 207 | df_metric_summary.to_csv(save_path / "endpoint_metrics_summary.csv", index=True) 208 | fig.savefig(save_path / "concurrency_benchmark_distribution.png") 209 | 210 | return df_concurrency_metrics, df_metric_summary, fig 211 | 212 | 213 | def plot_savings_latency(df_stability_metric_summary: pd.DataFrame): 214 | sns.set(style="white") 215 | paper_rc = {"lines.linewidth": 1, "lines.markersize": 10} 216 | sns.set_context("paper", rc=paper_rc) 217 | fig, ax = plt.subplots(1, 1, figsize=(10, 6)) 218 | sns.lineplot( 219 | x=df_stability_metric_summary.index, 220 | y=df_stability_metric_summary["cost_per_1M_invocations"], 221 | marker="s", 222 | ax=ax, 223 | color="#e76f51", 224 | ) 225 | ax2 = ax.twinx() 226 | sns.lineplot( 227 | x=df_stability_metric_summary.index, 228 | y=df_stability_metric_summary["average_latency"], 229 | marker="s", 230 | ax=ax2, 231 | color="#588157", 232 | ) 233 | ax.set_xticks(df_stability_metric_summary.index) 234 | fig.legend( 235 | labels=["Cost Per 1M Invocations", "Average Latency"], 236 | loc="upper center", 237 | ncol=2, 238 | fontsize=10, 239 | ) 240 | for memory, cost, latency in zip( 241 | df_stability_metric_summary.index, 242 | df_stability_metric_summary["cost_per_1M_invocations"], 243 | df_stability_metric_summary["average_latency"], 244 | ): 245 | ax.text( 246 | x=memory - 128, y=cost, s=f"${cost:.2f}", color="white" 247 | ).set_backgroundcolor("#e76f51") 248 | ax2.text( 249 | x=memory - 128, y=latency, s=f"{latency:.2f}", color="white" 250 | ).set_backgroundcolor("#588157") 251 | 252 | min_latency = df_stability_metric_summary["average_latency"].min() 253 | max_latency = df_stability_metric_summary["average_latency"].max() 254 | min_max_diff = max_latency - min_latency 255 | 256 | if min_max_diff < 10: 257 | yticks = np.linspace(min_latency - 1, max_latency + 1, 5) 258 | 259 | elif 
min_max_diff < 50: 260 | yticks = np.linspace(min_latency - 1, max_latency + 1, 10, dtype=np.int32) 261 | else: 262 | yticks = np.linspace(min_latency - 1, max_latency + 1, 20, dtype=np.int32) 263 | 264 | ax2.set_yticks(yticks) 265 | 266 | for x in [ax, ax2]: 267 | x.spines["top"].set_visible(False) 268 | x.spines["right"].set_visible(False) 269 | x.spines["bottom"].set_visible(False) 270 | x.spines["left"].set_visible(False) 271 | 272 | ax.set_xlabel("Memory Size", fontsize=12) 273 | ax.set_ylabel("Cost Per 1M Invocations", fontsize=12) 274 | ax2.set_ylabel("Average Latency (ms)", fontsize=12) 275 | 276 | return fig 277 | 278 | 279 | def compute_cost_savings( 280 | df_stability_metric_summary: pd.DataFrame, 281 | invoke_args_list: List[Dict[str, str]], 282 | average_response_size: int = 1000, 283 | result_save_path: str = ".", 284 | ) -> Tuple[pd.DataFrame, int, str]: 285 | 286 | save_path = Path(result_save_path) / "cost_analysis_summary_results" 287 | save_path.mkdir(exist_ok=True, parents=True) 288 | 289 | # df_stability_metric_summary.eval("average_latency = ModelLatency + OverheadLatency", inplace=True) Removed overhead latency for now due to variability when there is no cold start 290 | average_overhead_latency = df_stability_metric_summary["OverheadLatency"].mean() 291 | df_stability_metric_summary.eval( 292 | f"average_latency = ModelLatency + {average_overhead_latency}", inplace=True 293 | ) 294 | 295 | # try: 296 | # minimal_successful_config = df_metric_summary["memory_size"].min() 297 | # except: 298 | # minimal_successful_config = df_metric_summary.index.min() 299 | 300 | # average_latency = df_metric_summary.query(f"memory_size == {minimal_successful_config}").eval("ModelLatency + OverheadLatency").values[0] 301 | average_request_size = reduce( 302 | lambda x, y: x + sys.getsizeof(y["Body"]), invoke_args_list, 0 303 | ) / len(invoke_args_list) 304 | 305 | # endpoint_inference_cost = INFERENCE_COST[minimal_successful_config] 306 | endpoint_processing_cost = ( 307 | average_request_size + average_response_size 308 | ) * PROCESSING_COST 309 | 310 | df_stability_metric_summary["cost_per_invocation"] = ( 311 | df_stability_metric_summary.index.map(INFERENCE_COST) 312 | * df_stability_metric_summary["average_latency"] 313 | ) + endpoint_processing_cost 314 | df_stability_metric_summary["cost_per_1M_invocations"] = ( 315 | df_stability_metric_summary["cost_per_invocation"] * 1_000_000 316 | ) 317 | 318 | discount_factor = np.linspace(1, 0.5, len(INFERENCE_COST))[:df_stability_metric_summary.shape[0]] 319 | 320 | optimal_memory_config = int( 321 | ( 322 | df_stability_metric_summary.eval( 323 | "average_latency * cost_per_1M_invocations" 324 | ) 325 | * discount_factor 326 | ).idxmin() 327 | ) 328 | 329 | average_cost_per_invocation = df_stability_metric_summary.loc[ 330 | optimal_memory_config, "cost_per_invocation" 331 | ] 332 | 333 | cost_latency_fig = plot_savings_latency(df_stability_metric_summary) 334 | 335 | # average_cost_per_invocation = (endpoint_inference_cost * average_latency) + endpoint_processing_in_cost 336 | 337 | comparable_sagemaker_instance = INSTANCE_MAPPING[optimal_memory_config] 338 | instance_monthly_cost = MONTHLY_INSTANCE_COST[comparable_sagemaker_instance] 339 | break_even_invocations = instance_monthly_cost / average_cost_per_invocation 340 | 341 | if break_even_invocations < 200_000: 342 | stride = 10_000 343 | elif break_even_invocations < 1_000_000: 344 | stride = 50_000 345 | elif break_even_invocations < 2_000_000: 346 | stride = 100_000 347 | elif 
break_even_invocations < 5_000_000: 348 | stride = 200_000 349 | else: 350 | stride = 500_000 351 | 352 | monthly_invocations = np.arange( 353 | stride, break_even_invocations, stride, dtype=np.int32 354 | ) 355 | monthly_cost = monthly_invocations * average_cost_per_invocation 356 | monthly_percent_savings = np.round( 357 | 100 358 | * (instance_monthly_cost - (monthly_invocations * average_cost_per_invocation)) 359 | / instance_monthly_cost, 360 | 2, 361 | ) 362 | df_savings = pd.DataFrame( 363 | dict( 364 | monthly_invocations=monthly_invocations, 365 | serverless_monthly_cost=monthly_cost, 366 | instance_monthly_cost=instance_monthly_cost, 367 | monthly_percent_savings=monthly_percent_savings, 368 | ) 369 | ) 370 | 371 | df_stability_metric_summary.to_csv(save_path / "metrics_with_cost.csv", index=False) 372 | df_savings.to_csv(save_path / "cost_savings_summary.csv", index=False) 373 | cost_latency_fig.savefig(save_path / "cost_vs_performance.png") 374 | 375 | df_stability_metric_summary.drop( 376 | ["average_latency", "cost_per_invocation"], axis=1, inplace=True 377 | ) 378 | 379 | return ( 380 | df_savings, 381 | optimal_memory_config, 382 | comparable_sagemaker_instance, 383 | cost_latency_fig, 384 | ) 385 | -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/benchmark.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import math 3 | import random 4 | import sys 5 | import time 6 | from concurrent.futures import ThreadPoolExecutor, as_completed 7 | from pathlib import Path 8 | from typing import Dict, List, Union 9 | 10 | import botocore.exceptions 11 | import pandas as pd 12 | 13 | from sm_serverless_benchmarking.analysis import (compute_cost_savings, 14 | summarize_concurrency_results, 15 | summarize_stability_results) 16 | from sm_serverless_benchmarking.endpoint import ServerlessEndpoint 17 | from sm_serverless_benchmarking.report import generate_html_report 18 | from sm_serverless_benchmarking.utils import read_example_args_file 19 | 20 | 21 | def create_endpoint( 22 | model_name: str, memory_size: int = 1024, max_concurrency: int = 1 23 | ) -> ServerlessEndpoint: 24 | ep = ServerlessEndpoint( 25 | model_name=model_name, memory_size=memory_size, max_concurrency=max_concurrency 26 | ) 27 | ep.create_endpoint() 28 | return ep 29 | 30 | 31 | def timed_invocation(endpoint: ServerlessEndpoint, invoke_args: Dict[str, str]): 32 | time.sleep(random.random()) 33 | t1 = time.perf_counter() 34 | response_size = 0 35 | try: 36 | response = endpoint.invoke_endpoint(invoke_args) 37 | response_size = int( 38 | response["ResponseMetadata"]["HTTPHeaders"]["content-length"] 39 | ) 40 | # body = response["Body"].read() 41 | # response_size = sys.getsizeof( 42 | # body 43 | # ) # calculate response size to estimate per inference cost 44 | 45 | except botocore.exceptions.ClientError as error: 46 | if error.response["Error"]["Code"] == "ThrottlingException": 47 | return { 48 | "invocation_latency": -1, 49 | "throttle_exception": 1, 50 | "start_time": t1, 51 | "end_time": time.perf_counter(), 52 | } 53 | 54 | elif error.response["Error"]["Code"] == "ModelError": 55 | if "insufficient memory" in error.response["Error"]["Message"]: 56 | return { 57 | "invocation_latency": -1, 58 | "throttle_exception": 0, 59 | "insufficient_memory_error": 1, 60 | "other_model_error": 0, 61 | "start_time": t1, 62 | "end_time": time.perf_counter(), 63 | } 64 | else: 65 | print( 66 | f"error in 
endpoint {endpoint.endpoint_name} ", 67 | error.response["Error"]["Message"], 68 | ) 69 | return { 70 | "invocation_latency": -1, 71 | "throttle_exception": 0, 72 | "insufficient_memory_error": 0, 73 | "other_model_error": 1, 74 | "start_time": t1, 75 | "end_time": time.perf_counter(), 76 | } 77 | 78 | t2 = time.perf_counter() 79 | 80 | return { 81 | "invocation_latency": (t2 - t1) * 1000, 82 | "response_size": response_size, 83 | "throttle_exception": 0, 84 | "insufficient_memory_error": 0, 85 | "other_model_error": 0, 86 | "start_time": t1, 87 | "end_time": t2, 88 | } 89 | 90 | 91 | def create_endpoint_configs( 92 | memory_size: Union[int, List[int]], max_concurrency: Union[int, List[int]] 93 | ): 94 | 95 | if type(memory_size) == int: 96 | memory_size = [memory_size] 97 | 98 | if type(max_concurrency) == int: 99 | max_concurrency = [max_concurrency] 100 | 101 | endpoint_configs = [] 102 | for mem_size in memory_size: 103 | for max_conc in max_concurrency: 104 | endpoint_configs.append( 105 | {"memory_size": mem_size, "max_concurrency": max_conc} 106 | ) 107 | 108 | return endpoint_configs 109 | 110 | 111 | def setup_endpoints( 112 | model_name: str, 113 | memory_size: Union[int, List[int]], 114 | max_concurrency: Union[int, List[int]], 115 | sleep: int = 0, 116 | ): 117 | 118 | update_configs = create_endpoint_configs(memory_size, max_concurrency) 119 | 120 | endpoint_futures = [] 121 | 122 | with ThreadPoolExecutor(max_workers=len(update_configs)) as executor: 123 | 124 | for config_kwargs in update_configs: 125 | 126 | future = executor.submit(create_endpoint, model_name, **config_kwargs) 127 | endpoint_futures.append(future) 128 | time.sleep(2) 129 | 130 | endpoints = [future.result() for future in as_completed(endpoint_futures)] 131 | 132 | 133 | endpoints = [endpoint for endpoint in endpoints if endpoint._created] 134 | time.sleep(sleep) # sleep to increase chance of cold start 135 | 136 | return endpoints 137 | 138 | 139 | def stability_benchmark( 140 | endpoint: ServerlessEndpoint, invoke_args_list, num_invocations=1000, error_thresh=3 141 | ): 142 | 143 | results = [] 144 | errors = 0 145 | 146 | for _ in range(num_invocations): 147 | invoke_arg_idx = random.randint(0, len(invoke_args_list)-1) 148 | invoke_args = invoke_args_list[invoke_arg_idx] 149 | result = timed_invocation(endpoint, invoke_args) 150 | result["invoke_arg_index"] = invoke_arg_idx 151 | 152 | if result["invocation_latency"] == -1: 153 | errors += 1 154 | 155 | if errors >= error_thresh: 156 | print( 157 | f"Terminating benchmark for {endpoint.endpoint_name} due to excessive endpoint errors" 158 | ) 159 | 160 | return results 161 | 162 | results.append(result) 163 | 164 | return results 165 | 166 | 167 | def run_stability_benchmark( 168 | endpoints: List[ServerlessEndpoint], 169 | invoke_args_list: List[Dict[str, str]], 170 | num_invocations: int = 1000, 171 | error_thresh: int = 3, 172 | result_save_path: str = ".", 173 | ): 174 | 175 | endpoint_benchmark_futures = {} 176 | benchmark_results = [] 177 | all_endpoint_metrics = [] 178 | 179 | save_path = Path(result_save_path) / "stability_benchmark_raw_results" 180 | save_path.mkdir(exist_ok=True, parents=True) 181 | 182 | benchmark_start_time = dt.datetime.utcnow() 183 | 184 | with ThreadPoolExecutor(max_workers=len(endpoints)) as executor: 185 | 186 | for endpoint in endpoints: 187 | future = executor.submit( 188 | stability_benchmark, 189 | endpoint, 190 | invoke_args_list, 191 | num_invocations, 192 | error_thresh, 193 | ) 194 | 
endpoint_benchmark_futures[future] = endpoint 195 | 196 | for future in as_completed(endpoint_benchmark_futures): 197 | endpoint = endpoint_benchmark_futures[future] 198 | 199 | result = future.result() 200 | df_result = pd.DataFrame(result) 201 | df_result["memory_size"] = endpoint.memory_size 202 | benchmark_results.append(df_result) 203 | 204 | benchmark_end_time = dt.datetime.utcnow() 205 | lookback_window = ( 206 | benchmark_end_time - benchmark_start_time 207 | ) + dt.timedelta(hours=1) 208 | 209 | endpoint_metrics = endpoint.get_endpoint_metrics(lookback_window) 210 | df_endpoint_metrics = pd.DataFrame(endpoint_metrics) 211 | df_endpoint_metrics["memory_size"] = endpoint.memory_size 212 | all_endpoint_metrics.append(df_endpoint_metrics) 213 | 214 | df_benchmark_results = pd.concat(benchmark_results).reset_index(drop=True) 215 | df_endpoint_metrics = pd.concat(all_endpoint_metrics).reset_index(drop=True) 216 | 217 | df_benchmark_results.to_csv( 218 | save_path / "invocation_benchmark_results.csv", index=False 219 | ) 220 | df_endpoint_metrics.to_csv(save_path / "endpoint_metrics.csv", index=False) 221 | 222 | return df_benchmark_results, df_endpoint_metrics 223 | 224 | 225 | def concurrency_benchmark( 226 | endpoint: ServerlessEndpoint, invoke_args_list, num_invocations=50, num_clients=1 227 | ): 228 | 229 | futures = [] 230 | 231 | with ThreadPoolExecutor(max_workers=num_clients) as executor: 232 | 233 | for _ in range(num_invocations): 234 | invoke_args = random.choice(invoke_args_list) 235 | 236 | futures.append(executor.submit(timed_invocation, endpoint, invoke_args)) 237 | 238 | results = [future.result() for future in futures] 239 | 240 | return results 241 | 242 | 243 | def run_concurrency_benchmark( 244 | endpoints: List[ServerlessEndpoint], 245 | invoke_args_list: List[Dict[str, str]], 246 | num_invocations: int = 1000, 247 | num_clients_multipliers: List[float] = [1, 1.5, 1.75, 2], 248 | result_save_path: str = ".", 249 | ): 250 | 251 | endpoint_benchmark_futures = {} 252 | benchmark_results = [] 253 | all_endpoint_metrics = [] 254 | 255 | save_path = Path(result_save_path) / "concurrency_benchmark_raw_results" 256 | save_path.mkdir(exist_ok=True, parents=True) 257 | 258 | benchmark_start_time = dt.datetime.now() 259 | 260 | with ThreadPoolExecutor(max_workers=len(endpoints)) as executor: 261 | seen_conc_clients = set() 262 | 263 | for multiplier in num_clients_multipliers: 264 | futures = [] 265 | for endpoint in endpoints: 266 | num_clients = math.ceil(endpoint.max_concurrency * multiplier) 267 | max_conc_num_clients = (endpoint.max_concurrency, num_clients) 268 | 269 | if max_conc_num_clients in seen_conc_clients: 270 | continue 271 | seen_conc_clients.add(max_conc_num_clients) 272 | 273 | future = executor.submit( 274 | concurrency_benchmark, 275 | endpoint, 276 | invoke_args_list, 277 | num_invocations, 278 | num_clients, 279 | ) 280 | endpoint_benchmark_futures[future] = (endpoint, num_clients) 281 | futures.append(future) 282 | 283 | while any([future.running() for future in futures]): 284 | time.sleep(5) 285 | 286 | for future in as_completed(endpoint_benchmark_futures): 287 | endpoint = endpoint_benchmark_futures[future][0] 288 | 289 | result = future.result() 290 | df_result = pd.DataFrame(result) 291 | df_result["memory_size"] = endpoint.memory_size 292 | df_result["max_concurrency"] = endpoint.max_concurrency 293 | df_result["num_clients"] = endpoint_benchmark_futures[future][1] 294 | benchmark_results.append(df_result) 295 | 296 | benchmark_end_time = 
dt.datetime.now() 297 | lookback_window = ( 298 | benchmark_end_time - benchmark_start_time 299 | ) + dt.timedelta(hours=1) 300 | endpoint_metrics = endpoint.get_endpoint_metrics(lookback_window) 301 | df_endpoint_metrics = pd.DataFrame(endpoint_metrics) 302 | df_endpoint_metrics["memory_size"] = endpoint.memory_size 303 | df_endpoint_metrics["max_concurrency"] = endpoint.max_concurrency 304 | df_endpoint_metrics["num_clients"] = endpoint_benchmark_futures[future][1] 305 | all_endpoint_metrics.append(df_endpoint_metrics) 306 | 307 | df_benchmark_results = pd.concat(benchmark_results).reset_index(drop=True) 308 | df_endpoint_metrics = pd.concat(all_endpoint_metrics).reset_index(drop=True) 309 | 310 | df_benchmark_results.to_csv( 311 | save_path / "invocation_benchmark_results.csv", index=False 312 | ) 313 | df_endpoint_metrics.to_csv(save_path / "endpoint_metrics.csv", index=False) 314 | 315 | return df_benchmark_results, df_endpoint_metrics 316 | 317 | 318 | def tear_down_endpoints(endpoints: List[ServerlessEndpoint]): 319 | for endpoint in endpoints: 320 | try: 321 | endpoint.clean_up() 322 | except: 323 | pass 324 | 325 | 326 | 327 | def run_serverless_benchmarks( 328 | model_name: str, 329 | invoke_args_examples_file: Path, 330 | cold_start_delay: int = 0, 331 | memory_sizes: List[int] = [1024, 2048, 3072, 4096, 5120, 6144], 332 | stability_benchmark_invocations: int = 1000, 333 | stability_benchmark_error_thresh: int = 3, 334 | include_concurrency_benchmark: bool = True, 335 | concurrency_benchmark_max_conc: List[int] = [2, 4, 8], 336 | concurrency_benchmark_invocations: int = 1000, 337 | concurrency_num_clients_multiplier: List[float] = [1, 1.5, 1.75, 2], 338 | result_save_path: str = ".", 339 | )->str: 340 | """Runs a suite of SageMaker Serverless Benchmarks on the specified model_name. 341 | Will automatically deploy endpoints for the specified model_name and perform a tear down 342 | upon completion of the benchmark or an error. 343 | 344 | There are two types of benchmarks that are supported and both are executed by defaults 345 | - Stability Benchmark: Deploys an endpoint for each of the specified memory configurations and 346 | max concurrency of 1. Invokes the endpoint the specified number of times and determines the stable 347 | and most cost effective configuration 348 | 349 | - Concurrency Benchmark: Deploys endpoints with different max_concurrency configurations and performs 350 | a load test with a simulated number of concurrent clients 351 | 352 | Args: 353 | model_name (str): Name of the SageMaker Model resource 354 | invoke_args_examples_file (Path: Path to the jsonl file containing the example invocation arcguments 355 | cold_start_delay (int, optional): Number of seconds to sleep before starting the benchmark. Helps to induce a cold start on initial invocation. Defaults to 0. 356 | memory_sizes (List[int], optional): List of memory configurations to benchmark Defaults to [1024, 2048, 3072, 4096, 5120, 6144]. 357 | stability_benchmark_invocations (int, optional): Total number of invocations for the stability benchmark. Defaults to 1000. 358 | stability_benchmark_error_thresh (int, optional): The allowed number of endpoint invocation errors before the benchmark is terminated for a configuration. Defaults to 3. 359 | include_concurrency_benchmark (bool, optional): Set True to run the concurrency benchmark with the optimal configuration from the stability benchmark. Defaults to True. 
360 | concurrency_benchmark_max_conc (List[int], optional): A list of max_concurency settings to benchmark. Defaults to [2, 4, 8]. 361 | concurrency_benchmark_invocations (int, optional): Total number of invocations for the concurency benchmark. Defaults to 1000. 362 | concurrency_num_clients_multiplier (List[int], optional): List of multipliers to specify the number of simulated clients which is determined by max_concurency * multiplier. Defaults to [1, 1.5, 1.75, 2]. 363 | result_save_path (str, optional): The location to which the output artifacts will be saved. Defaults to ".". 364 | 365 | 366 | Returns: 367 | str: HTML for the generated benchmarking report 368 | """ 369 | 370 | function_args = locals() 371 | benchmark_config = pd.Series(function_args).to_frame() 372 | 373 | invoke_args_examples = read_example_args_file(invoke_args_examples_file) 374 | 375 | stability_endpoints = setup_endpoints( 376 | model_name, memory_size=memory_sizes, max_concurrency=1, sleep=cold_start_delay 377 | ) 378 | 379 | try: 380 | ( 381 | df_stability_benchmark_results, 382 | df_stability_endpoint_metrics, 383 | ) = run_stability_benchmark( 384 | stability_endpoints, 385 | invoke_args_list=invoke_args_examples, 386 | num_invocations=stability_benchmark_invocations, 387 | error_thresh=stability_benchmark_error_thresh, 388 | result_save_path=result_save_path, 389 | ) 390 | ( 391 | df_stability_results, 392 | df_stability_summary, 393 | df_stability_metric_summary, 394 | stability_latency_distribution_fig, 395 | ) = summarize_stability_results( 396 | df_stability_benchmark_results, 397 | df_stability_endpoint_metrics, 398 | result_save_path=result_save_path, 399 | ) 400 | 401 | avg_response_size = df_stability_results["response_size"].mean() 402 | ( 403 | df_cost_savings, 404 | minimal_successful_config, 405 | comparable_sagemaker_instance, 406 | cost_vs_performance, 407 | ) = compute_cost_savings( 408 | df_stability_metric_summary, 409 | invoke_args_examples, 410 | average_response_size=avg_response_size, 411 | result_save_path=result_save_path 412 | ) 413 | 414 | except Exception as e: 415 | print(f"Could not complete benchmark due to Exception: {e}") 416 | raise e 417 | 418 | finally: 419 | tear_down_endpoints(stability_endpoints) 420 | 421 | if include_concurrency_benchmark: 422 | concurrency_endpoints = setup_endpoints( 423 | model_name, 424 | memory_size=minimal_successful_config, 425 | max_concurrency=concurrency_benchmark_max_conc, 426 | ) 427 | try: 428 | ( 429 | df_conc_benchmark_results, 430 | df_conc_endpoint_metrics, 431 | ) = run_concurrency_benchmark( 432 | concurrency_endpoints, 433 | invoke_args_examples, 434 | num_invocations=concurrency_benchmark_invocations, 435 | num_clients_multipliers=concurrency_num_clients_multiplier, 436 | result_save_path=result_save_path, 437 | ) 438 | ( 439 | df_concurrency_metrics, 440 | df_concurrency_metric_summary, 441 | concurrency_latency_distribution_fig, 442 | ) = summarize_concurrency_results( 443 | df_conc_benchmark_results, 444 | df_conc_endpoint_metrics, 445 | result_save_path=result_save_path, 446 | ) 447 | 448 | except Exception as e: 449 | print(f"Could not complete benchmark due to Exception: {e}") 450 | 451 | finally: 452 | tear_down_endpoints(concurrency_endpoints) 453 | 454 | report = generate_html_report( 455 | benchmark_config=benchmark_config, 456 | df_stability_summary=df_stability_summary, 457 | df_stability_metric_summary=df_stability_metric_summary, 458 | stability_latency_distribution=stability_latency_distribution_fig, 459 | 
df_cost_savings=df_cost_savings, 460 | cost_vs_performance=cost_vs_performance, 461 | optimal_memory_config=minimal_successful_config, 462 | comparable_instance=comparable_sagemaker_instance, 463 | df_concurrency_metrics=df_concurrency_metrics, 464 | df_concurrency_metric_summary=df_concurrency_metric_summary, 465 | concurrency_latency_distribution=concurrency_latency_distribution_fig, 466 | result_save_path=result_save_path, 467 | ) 468 | return report 469 | 470 | else: 471 | report = generate_html_report( 472 | benchmark_config=benchmark_config, 473 | df_stability_summary=df_stability_summary, 474 | df_stability_metric_summary=df_stability_metric_summary, 475 | stability_latency_distribution=stability_latency_distribution_fig, 476 | df_cost_savings=df_cost_savings, 477 | cost_vs_performance=cost_vs_performance, 478 | optimal_memory_config=minimal_successful_config, 479 | comparable_instance=comparable_sagemaker_instance, 480 | result_save_path=result_save_path, 481 | ) 482 | return report -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/cost_constants.py: -------------------------------------------------------------------------------- 1 | 2 | INFERENCE_COST = {1024:0.0000200 / 1000, 3 | 2048:0.0000400 / 1000, 4 | 3072:0.0000600 / 1000, 5 | 4096:0.0000800 / 1000, 6 | 5120:0.0001000 / 1000, 7 | 6144:0.0001200 / 1000} 8 | 9 | PROCESSING_COST = 0.016 / (1024**3) 10 | 11 | INSTANCE_MAPPING = {1024: "ml.t2.medium", 12 | 2048: "ml.t2.medium", 13 | 3072: "ml.c5.large", 14 | 4096: "ml.c5.large", 15 | 5120: "ml.c5.large", 16 | 6144: "ml.c5.xlarge"} 17 | 18 | MONTHLY_INSTANCE_COST = {"ml.t2.medium": 0.056*24*30, 19 | "ml.c5.large": 0.102*24*30, 20 | "ml.c5.xlarge": 0.204*24*30} -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/endpoint.py: -------------------------------------------------------------------------------- 1 | import datetime as dt 2 | import logging 3 | import time 4 | import uuid 5 | from dataclasses import dataclass 6 | 7 | import boto3 8 | import botocore 9 | from botocore.config import Config 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | ALLOWED_MEM_VALUES = {1024, 2048, 3072, 4096, 5120, 6144} 14 | MIN_CONCURRENCY = 1 15 | MAX_CONCURRENCY = 200 16 | 17 | 18 | @dataclass(unsafe_hash=True) 19 | class ServerlessEndpoint: 20 | model_name: str 21 | memory_size: int = 1024 22 | max_concurrency: int = 1 23 | _created: bool = False 24 | _deployment_failed: bool = False 25 | 26 | def __post_init__(self): 27 | 28 | self._validate_inital_config() 29 | self._sm_client = boto3.client("sagemaker") 30 | 31 | # disable retries when throttled 32 | self._boto_config = Config(retries={"max_attempts": 1, "mode": "standard"}) 33 | self._smr_client = boto3.client("sagemaker-runtime", config=self._boto_config) 34 | self._cw_client = boto3.client("cloudwatch") 35 | self._endpoint_name = f"{self.model_name[:50]}-ep-{str(uuid.uuid1())[:5]}" 36 | 37 | def _validate_inital_config(self): 38 | if self.memory_size not in ALLOWED_MEM_VALUES: 39 | raise ValueError( 40 | f"{self.memory_size} is not a valid memory_size value. 
Valid values are {ALLOWED_MEM_VALUES}" 41 | ) 42 | 43 | if (self.max_concurrency < MIN_CONCURRENCY) | ( 44 | self.max_concurrency > MAX_CONCURRENCY 45 | ): 46 | raise ValueError( 47 | f"max_concurrency must fall within the {MIN_CONCURRENCY} to {MAX_CONCURRENCY} range" 48 | ) 49 | 50 | def _create_endpoint_config(self): 51 | self._endpoint_config_name = ( 52 | f"{self.model_name[:50]}-cfg-{str(uuid.uuid1())[:5]}" 53 | ) 54 | 55 | endpoint_config_response = self._sm_client.create_endpoint_config( 56 | EndpointConfigName=self._endpoint_config_name, 57 | ProductionVariants=[ 58 | { 59 | "VariantName": "variant1", 60 | "ModelName": self.model_name, 61 | "ServerlessConfig": { 62 | "MemorySizeInMB": self.memory_size, 63 | "MaxConcurrency": self.max_concurrency, 64 | }, 65 | } 66 | ], 67 | ) 68 | 69 | return self._endpoint_config_name 70 | 71 | def create_endpoint(self): 72 | 73 | ep_config = self._create_endpoint_config() 74 | create_endpoint_response = self._sm_client.create_endpoint( 75 | EndpointName=self._endpoint_name, EndpointConfigName=ep_config 76 | ) 77 | self.wait() 78 | 79 | if self._deployment_failed: 80 | logger.warn(f"Failed to deploy endpoint {self.endpoint_name} with memory_size {self.memory_size}. Failure reason: {self._failure_reason} Endpoint will not be used") 81 | else: 82 | self._created = True 83 | 84 | def _validate_deployment(self): 85 | assert ( 86 | self._created 87 | ), "Operation can not be performed because the endpoint has not been deployed" 88 | 89 | def describe_endpoint(self): 90 | 91 | resp = self._sm_client.describe_endpoint(EndpointName=self._endpoint_name) 92 | 93 | return resp 94 | 95 | @property 96 | def endpoint_name(self): 97 | return self._endpoint_name 98 | 99 | def update_endpoint(self, memory_size, max_concurrency): 100 | 101 | if (memory_size == self.memory_size) & ( 102 | max_concurrency == self.max_concurrency 103 | ): 104 | logger.info( 105 | "Updated configuration matches current configuration. 
No update required" 106 | ) 107 | return None 108 | 109 | self._validate_deployment() 110 | 111 | current_config_name = self._endpoint_config_name 112 | updated_endpoint_config_name = ( 113 | f"{self.model_name[:50]}-cfg-{str(uuid.uuid1())[:5]}" 114 | ) 115 | updated_config = [ 116 | { 117 | "VariantName": "variant1", 118 | "ModelName": self.model_name, 119 | "ServerlessConfig": { 120 | "MemorySizeInMB": memory_size, 121 | "MaxConcurrency": max_concurrency, 122 | }, 123 | } 124 | ] 125 | 126 | endpoint_config_response = self._sm_client.create_endpoint_config( 127 | EndpointConfigName=updated_endpoint_config_name, 128 | ProductionVariants=updated_config, 129 | ) 130 | 131 | self._sm_client.update_endpoint( 132 | EndpointName=self._endpoint_name, 133 | EndpointConfigName=updated_endpoint_config_name, 134 | ) 135 | 136 | self.wait() 137 | self._endpoint_config_name = updated_endpoint_config_name 138 | self._sm_client.delete_endpoint_config(EndpointConfigName=current_config_name) 139 | self.memory_size = memory_size 140 | self.max_concurrency = max_concurrency 141 | 142 | def get_endpoint_metric( 143 | self, namespace: str, metric_name: str, lookback_window: dt.timedelta 144 | ): 145 | 146 | response = self._cw_client.get_metric_statistics( 147 | Namespace=namespace, 148 | MetricName=metric_name, 149 | Dimensions=[ 150 | {"Name": "EndpointName", "Value": self.endpoint_name}, 151 | {"Name": "VariantName", "Value": "variant1"}, 152 | ], 153 | StartTime=dt.datetime.utcnow() - lookback_window, 154 | EndTime=dt.datetime.utcnow(), 155 | Period=3600 * 24, 156 | Statistics=["Average", "Minimum", "Maximum"], 157 | ExtendedStatistics=["p25", "p50", "p75"], 158 | ) 159 | metric_data_points = response["Datapoints"] 160 | 161 | if len(metric_data_points) == 0: 162 | logger.warn(f"Did not get any CloudWatch data for the {metric_name} metric for endpoint {self.endpoint_name}") 163 | return {"metric_name": metric_name} 164 | else: 165 | metric = metric_data_points[0] 166 | 167 | extended_statistics = metric.pop("ExtendedStatistics") 168 | metric.update(extended_statistics) 169 | metric["metric_name"] = metric_name 170 | 171 | return metric 172 | 173 | def get_endpoint_metrics(self, lookback_window: dt.timedelta): 174 | 175 | all_metrics = [] 176 | memory_util_metric = self.get_endpoint_metric( 177 | namespace="/aws/sagemaker/Endpoints", 178 | metric_name="MemoryUtilization", 179 | lookback_window=lookback_window, 180 | ) 181 | all_metrics.append(memory_util_metric) 182 | 183 | for metric_name in ["ModelSetupTime", "ModelLatency", "OverheadLatency"]: 184 | metric = self.get_endpoint_metric( 185 | namespace="AWS/SageMaker", 186 | metric_name=metric_name, 187 | lookback_window=lookback_window, 188 | ) 189 | all_metrics.append(metric) 190 | 191 | return all_metrics 192 | 193 | def clean_up(self): 194 | 195 | self._sm_client.delete_endpoint(EndpointName=self._endpoint_name) 196 | self._sm_client.delete_endpoint_config( 197 | EndpointConfigName=self._endpoint_config_name 198 | ) 199 | self._created = False 200 | 201 | def invoke_endpoint(self, invoke_args): 202 | 203 | self._validate_deployment() 204 | 205 | resp = self._smr_client.invoke_endpoint( 206 | EndpointName=self._endpoint_name, **invoke_args 207 | ) 208 | 209 | return resp 210 | 211 | def wait(self): 212 | 213 | time.sleep(2) # wait a few seconds for status to update 214 | waiter = self._sm_client.get_waiter("endpoint_in_service") 215 | print(f"Waiting for endpoint {self._endpoint_name} to start...") 216 | 217 | try: 218 | 
waiter.wait(EndpointName=self._endpoint_name) 219 | 220 | except botocore.exceptions.WaiterError as err: 221 | self._deployment_failed = True 222 | self._failure_reason = self.describe_endpoint().get("FailureReason") 223 | self.clean_up() 224 | return None 225 | 226 | resp = self.describe_endpoint() 227 | print(f"Endpoint {self.endpoint_name} Status: {resp['EndpointStatus']}") 228 | 229 | return resp 230 | 231 | 232 | 233 | 234 | 235 | -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/report.py: -------------------------------------------------------------------------------- 1 | import base64 2 | from importlib import resources 3 | from io import BytesIO 4 | from pathlib import Path 5 | 6 | import matplotlib 7 | import pandas as pd 8 | from jinja2 import Environment, FileSystemLoader 9 | 10 | 11 | def b64_png_encode(fig: matplotlib.figure.Figure): 12 | png = BytesIO() 13 | fig.savefig(png, bbox_inches="tight") 14 | png.flush() 15 | 16 | encoded_img = base64.b64encode(png.getvalue()) 17 | 18 | return encoded_img 19 | 20 | 21 | def generate_html_report( 22 | benchmark_config: pd.DataFrame, 23 | df_stability_summary: pd.DataFrame, 24 | df_stability_metric_summary: pd.DataFrame, 25 | stability_latency_distribution: matplotlib.figure.Figure, 26 | cost_vs_performance: matplotlib.figure.Figure, 27 | df_cost_savings: pd.DataFrame, 28 | optimal_memory_config: int, 29 | comparable_instance: str, 30 | df_concurrency_metrics: pd.DataFrame = pd.DataFrame(), 31 | df_concurrency_metric_summary: pd.DataFrame = pd.DataFrame(), 32 | concurrency_latency_distribution: matplotlib.figure.Figure = matplotlib.figure.Figure(), 33 | result_save_path: str = ".", 34 | ): 35 | 36 | report_path = Path(result_save_path) / "benchmarking_report" 37 | report_path.mkdir(exist_ok=True, parents=True) 38 | 39 | with resources.path( 40 | "sm_serverless_benchmarking.report_templates", "report_template.html" 41 | ) as p: 42 | templates_path = p.parent 43 | 44 | environment = Environment( 45 | loader=FileSystemLoader(templates_path) 46 | ) 47 | template = environment.get_template("report_template.html") 48 | 49 | stability_latency_distribution_encoded = b64_png_encode( 50 | stability_latency_distribution 51 | ) 52 | concurrency_latency_distribution_encoded = b64_png_encode( 53 | concurrency_latency_distribution 54 | ) 55 | cost_vs_performance_encoded = b64_png_encode(cost_vs_performance) 56 | 57 | context = { 58 | "benchmark_configuration": benchmark_config.to_html( 59 | index=True, 60 | float_format="%.2f", 61 | justify="left", 62 | header=False, 63 | na_rep="", 64 | escape=False, 65 | ), 66 | "stability_benchmark_summary": df_stability_summary.to_html( 67 | index=True, 68 | float_format="%.2f", 69 | na_rep="", 70 | justify="center", 71 | notebook=True, 72 | escape=False, 73 | ).replace("", ''), 74 | "stability_endpoint_metrics": df_stability_metric_summary.to_html( 75 | index=True, 76 | float_format="%.2f", 77 | justify="center", 78 | na_rep="", 79 | notebook=True, 80 | escape=False, 81 | ).replace("", ''), 82 | "stability_latency_distribution": stability_latency_distribution_encoded.decode( 83 | "utf8" 84 | ), 85 | "cost_vs_performance": cost_vs_performance_encoded.decode("utf8"), 86 | "cost_savings_table": df_cost_savings.to_html( 87 | index=False, 88 | escape=False, 89 | formatters={ 90 | "monthly_invocations": lambda x: f"{x:,}", 91 | "serverless_monthly_cost": lambda x: f"${x:.2f}", 92 | "instance_monthly_cost": lambda x: f"${x:.2f}", 93 | 
"monthly_percent_savings": lambda x: f"{x}%", 94 | }, 95 | ).replace("", ''), 96 | "optimal_memory_config": optimal_memory_config, 97 | "comparable_instance": comparable_instance, 98 | "concurrency_benchmark_summary": df_concurrency_metrics.to_html( 99 | index=False, 100 | escape=False, 101 | float_format="%.2f", 102 | na_rep="", 103 | justify="center", 104 | notebook=True, 105 | formatters={ 106 | "insufficient_memory_error": lambda x: f"{x:0.0f}", 107 | "other_model_error": lambda x: f"{x:0.0f}", 108 | }, 109 | ).replace("", ''), 110 | "concurrency_latency_distribution": concurrency_latency_distribution_encoded.decode( 111 | "utf8" 112 | ), 113 | "concurrency_cloudwatch_metrics": df_concurrency_metric_summary.to_html( 114 | index=True, 115 | float_format="%.2f", 116 | justify="center", 117 | na_rep="", 118 | notebook=True, 119 | escape=False, 120 | ).replace("", ''), 121 | } 122 | 123 | report = template.render(context=context) 124 | 125 | with (report_path / "benchmarking_report.html").open("w") as f: 126 | f.write(report) 127 | 128 | return report 129 | -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/report_templates/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sagemaker-serverless-inference-benchmarking/cb0028576fe60e4704526a46368cbd53851eebc7/src/sm_serverless_benchmarking/report_templates/__init__.py -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/report_templates/report_template.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | SageMaker Serverless Endpoint Benchmarking Results 6 | 7 | 8 | 24 | 25 | 26 |

Benchmark Configuration
27 | The table below provides a list of configuration options that were supplied to this benchmark
28 | {{ context.benchmark_configuration }}
29 | 
30 | Stability Benchmark Results
31 | 
32 | Invocation Latency and Error Metrics
33 | The table below provides a summary of invocation latency metrics as measured from the client side. The metrics include the minimum, mean, median, and max latencies in addition to the interquartile range (iqr), which shows the difference
34 | in latency between the 75th and 25th percentiles. Only the successful memory configurations are included.
35 | {{ context.stability_benchmark_summary }}
36 | Request Latency Distribution
37 | The distribution of latencies is summarized in the chart below. Longer latencies due to cold start are not included.
38 | [figure: request latency distribution chart]
39 | 
40 | Endpoint CloudWatch Metrics
41 | The average values of the metrics monitored by CloudWatch are captured below. The ModelSetupTime metric represents the time it takes to launch new compute resources for a serverless endpoint and indicates the impact of a cold start. This metric may not appear if the endpoints were launched in a warm state. You can induce a cold start by increasing the cold_start_delay parameter when configuring the benchmark. Alternatively, the CloudWatch metrics for the concurrency benchmark below are more likely to capture this metric due to the larger number of compute resources involved. Refer to the documentation for an explanation of each metric.
42 | {{ context.stability_endpoint_metrics }}
43 | 
44 | Cost Savings and Performance Analysis
45 | This section provides an analysis of the cost and performance of each memory configuration. Additionally, it provides an overview of the expected cost savings compared to a Real Time endpoint running on a comparable SageMaker hosting instance.
46 | The graph below visualizes the performance and cost trade-off of each memory configuration.
47 | [figure: cost vs. performance chart]
48 | The table below provides an estimate of the savings compared against a real-time hosting instance based on the number of monthly invocations.
49 | Optimal memory configuration: {{ context.optimal_memory_config }}
50 | Comparable SageMaker Hosting Instance: {{ context.comparable_instance }}
51 | {{ context.cost_savings_table }}
52 | 
53 | Concurrency Benchmark Results
54 | This benchmark tests the performance of the specified MaxConcurrency configurations. It helps determine the right setting to support the expected invocation volumes.
55 | Invocation Latency and Error Metrics
56 | Latency, error, and throughput (TPS) metrics are captured in the table below. This should help inform the minimum MaxConcurrency configuration that can support the expected traffic.
57 | 
58 | {{ context.concurrency_benchmark_summary }}
59 | Request Latency Distribution
60 | The charts below summarize the latency distributions under different load patterns (number of concurrent clients) and MaxConcurrency settings.
61 | [figure: latency distribution charts by MaxConcurrency and number of clients]
62 | Endpoint CloudWatch Metrics
63 | The average values of the metrics monitored by CloudWatch are captured below. The ModelSetupTime metric represents the time it takes to launch new compute resources for a serverless endpoint and indicates the impact of a cold start. Refer to the documentation for an explanation of each metric.

64 | {{ context.concurrency_cloudwatch_metrics }} 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3>=1.20.21, <2.0 2 | pandas>=1.2, <2.0 3 | seaborn<0.12 4 | Jinja2<4.0 5 | 6 | -------------------------------------------------------------------------------- /src/sm_serverless_benchmarking/sagemaker_runner.py: -------------------------------------------------------------------------------- 1 | import os 2 | from importlib import resources 3 | from pathlib import Path 4 | from typing import List, Union 5 | 6 | import sagemaker 7 | from sagemaker.processing import ProcessingInput, ProcessingOutput 8 | from sagemaker.xgboost import XGBoostProcessor 9 | 10 | 11 | def args_constructor( 12 | benchmark_args: dict, sm_job_input_dir: str, sm_job_output_dir: str 13 | ): 14 | 15 | input_file_name = os.path.basename(benchmark_args["invoke_args_examples_file"]) 16 | 17 | args = [] 18 | for k, v in benchmark_args.items(): 19 | if k in {"role", "s3_output_path", "wait"}: 20 | continue 21 | if k == "model_name": 22 | args.extend([v]) 23 | elif k == "invoke_args_examples_file": 24 | args.extend([f"{sm_job_input_dir}/{input_file_name}"]) 25 | elif k == "result_save_path": 26 | args.extend([f"--{k}", sm_job_output_dir]) 27 | elif k == "include_concurrency_benchmark": 28 | if v: 29 | continue 30 | else: 31 | args.extend(["--no_include_concurrency_benchmark"]) 32 | else: 33 | args.extend([f"--{k}"]) 34 | if type(v) == list: 35 | for param in v: 36 | args.extend([str(param)]) 37 | else: 38 | args.extend([str(v)]) 39 | 40 | return args 41 | 42 | 43 | def run_as_sagemaker_job( 44 | role: str, 45 | model_name: str, 46 | invoke_args_examples_file: Union[Path, str], 47 | s3_output_path: str = None, 48 | wait: bool = False, 49 | cold_start_delay: int = 0, 50 | memory_sizes: List[int] = [1024, 2048, 3072, 4096, 5120, 6144], 51 | stability_benchmark_invocations: int = 1000, 52 | stability_benchmark_error_thresh: int = 3, 53 | include_concurrency_benchmark: bool = True, 54 | concurrency_benchmark_max_conc: List[int] = [2, 4, 8], 55 | concurrency_benchmark_invocations: int = 1000, 56 | concurrency_num_clients_multiplier: List[float] = [1, 1.5, 1.75, 2], 57 | result_save_path: str = ".", 58 | ): 59 | 60 | benchmark_args = locals() 61 | 62 | with resources.path("sm_serverless_benchmarking", "__main__.py") as p: 63 | source_path = str(p.parent) 64 | 65 | sm_job_input_dir = "/opt/ml/processing/input/data" 66 | sm_job_output_dir = "/opt/ml/processing/output/" 67 | sm_session = sagemaker.Session() 68 | region = sm_session.boto_region_name 69 | 70 | job_args = args_constructor(benchmark_args, sm_job_input_dir, sm_job_output_dir) 71 | 72 | processor = XGBoostProcessor( 73 | role=role, 74 | framework_version="1.5-1", 75 | instance_type="ml.m5.large", 76 | instance_count=1, 77 | base_job_name="sagemaker-serverless-inf-bench", 78 | env={"AWS_DEFAULT_REGION": region} 79 | ) 80 | 81 | processor.run( 82 | code="__main__.py", 83 | source_dir=source_path, 84 | inputs=[ 85 | ProcessingInput( 86 | source=str(invoke_args_examples_file), 87 | destination=sm_job_input_dir, 88 | ) 89 | ], 90 | outputs=[ 91 | ProcessingOutput(output_name="benchmark_outputs", source=sm_job_output_dir) 92 | ], 93 | arguments=job_args, 94 | wait=wait 95 | ) 96 | 97 | return processor -------------------------------------------------------------------------------- 
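For orientation, a minimal invocation of the runner above might look like the following sketch. The execution role ARN, model name, and example file path are placeholders rather than values defined in this repository; substitute resources that exist in your own account.

from sm_serverless_benchmarking.sagemaker_runner import run_as_sagemaker_job

# Launch the benchmark as a SageMaker Processing job. The benchmarking module is packaged
# into the job, so the launching machine does not need to stay online when wait=False.
processor = run_as_sagemaker_job(
    role="arn:aws:iam::111122223333:role/MySageMakerExecutionRole",  # hypothetical execution role ARN
    model_name="my-registered-sagemaker-model",                      # hypothetical SageMaker Model name
    invoke_args_examples_file="invoke_args_examples.jsonl",          # e.g. created by utils.convert_invoke_args_to_jsonl
    cold_start_delay=600,  # optional: sleep before invoking so the first requests hit a cold endpoint
    wait=False,            # return immediately and monitor the processing job from the SageMaker console
)

The function returns the XGBoostProcessor that launched the job; the benchmarking report and raw results land in the processing job's output location once it completes.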
/src/sm_serverless_benchmarking/utils.py:
--------------------------------------------------------------------------------
1 | import json
2 | import pickle
3 | from pathlib import Path
4 | from typing import Dict, List, Union
5 | 
6 | 
7 | def convert_invoke_args_to_jsonl(
8 |     invoke_args_examples: List[Dict[str, str]], output_path: str = "."
9 | ):
10 |     """Converts a list of invocation argument examples to JSON lines format and saves the output to a file
11 | 
12 |     Args:
13 |         invoke_args_examples (List[Dict[str, str]]): A list of example arguments that will be passed to the InvokeEndpoint SageMaker Runtime API
14 |         output_path (str, optional): The directory to which the output jsonl will be written. Defaults to ".".
15 |     """
16 |     output_file = Path(output_path) / "invoke_args_examples.jsonl"
17 | 
18 |     with output_file.open("w+") as f:
19 |         for example in invoke_args_examples:
20 |             f.write(f"{json.dumps(example)}\n")
21 | 
22 |     return output_file
23 | 
24 | 
25 | def convert_invoke_args_to_pkl(
26 |     invoke_args_examples: List[Dict[str, str]], output_path: str = "."
27 | ):
28 |     """Converts a list of invocation argument examples to a pickle file. For use with binary data such as images, video, and audio
29 | 
30 |     Args:
31 |         invoke_args_examples (List[Dict[str, str]]): A list of example arguments that will be passed to the InvokeEndpoint SageMaker Runtime API
32 |         output_path (str, optional): The directory to which the output pkl will be written. Defaults to ".".
33 |     """
34 |     output_file = Path(output_path) / "invoke_args_examples.pkl"
35 | 
36 |     with output_file.open("wb+") as f:
37 |         pickle.dump(invoke_args_examples, f)
38 | 
39 |     return output_file
40 | 
41 | 
42 | 
43 | def read_example_args_file(example_args_file: Union[Path, str]):
44 |     """Reads back the example invocation arguments from a .jsonl or .pkl file produced by the helpers above"""
45 | 
46 |     example_args_file = Path(example_args_file)
47 | 
48 |     if example_args_file.suffix == ".pkl":
49 |         example_args = pickle.loads(example_args_file.open("rb").read())
50 | 
51 |     else:
52 |         example_args = []
53 |         with example_args_file.open("r") as f:
54 | 
55 |             for line in f:
56 |                 example_args.append(json.loads(line))
57 | 
58 |     return example_args
59 | 
--------------------------------------------------------------------------------
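Finally, to tie the utilities above back to the benchmark entry point, a purely local run (outside of a SageMaker Processing job) could look like the sketch below. The model name and request payloads are placeholders that must match whatever serving container backs your SageMaker Model, and the reduced memory list and invocation count are only meant to keep the illustration quick.

from sm_serverless_benchmarking import benchmark
from sm_serverless_benchmarking.utils import convert_invoke_args_to_jsonl

# Each example mirrors the keyword arguments of the SageMaker Runtime InvokeEndpoint API;
# EndpointName is filled in by the benchmark for every endpoint it creates.
invoke_args_examples = [
    {"ContentType": "text/csv", "Body": "5.1,3.5,1.4,0.2"},
    {"ContentType": "text/csv", "Body": "6.2,2.9,4.3,1.3"},
]

examples_file = convert_invoke_args_to_jsonl(invoke_args_examples, output_path=".")

report_html = benchmark.run_serverless_benchmarks(
    model_name="my-registered-sagemaker-model",  # hypothetical SageMaker Model name
    invoke_args_examples_file=examples_file,
    memory_sizes=[1024, 2048, 3072],             # a subset of the supported 1024-6144 MB configurations
    stability_benchmark_invocations=100,         # smaller than the 1000-invocation default
    include_concurrency_benchmark=False,         # skip the load test for a quicker first pass
    result_save_path="benchmark_results",
)

The returned string is the same HTML that is written to benchmark_results/benchmarking_report/benchmarking_report.html, so it can also be rendered inline in a notebook.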