├── pytorch ├── code │ ├── requirements.txt │ └── inference.py └── pytorch.ipynb ├── huggingface ├── code │ ├── requirements.txt │ └── inference.py └── huggingface.ipynb ├── images └── solution_overview.png ├── CODE_OF_CONDUCT.md ├── LICENSE ├── README.md └── CONTRIBUTING.md /pytorch/code/requirements.txt: -------------------------------------------------------------------------------- 1 | openai-whisper 2 | ffmpeg-python 3 | torchaudio 4 | nvgpu -------------------------------------------------------------------------------- /huggingface/code/requirements.txt: -------------------------------------------------------------------------------- 1 | openai-whisper 2 | ffmpeg-python 3 | torchaudio 4 | nvgpu -------------------------------------------------------------------------------- /images/solution_overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-sagemaker-host-and-inference-whisper-model/HEAD/images/solution_overview.png -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 17 | 18 | -------------------------------------------------------------------------------- /pytorch/code/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | import os 5 | import io 6 | import sys 7 | import time 8 | import json 9 | import logging 10 | import whisper 11 | import torch 12 | import boto3 13 | import tempfile 14 | import torchaudio 15 | from botocore.exceptions import NoCredentialsError 16 | 17 | 18 | DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') 19 | 20 | def model_fn(model_dir): 21 | """ 22 | Load and return the model 23 | """ 24 | model = whisper.load_model(os.path.join(model_dir, 'base.pt')) 25 | model = model.to(DEVICE) 26 | print(f'whisper model has been loaded to this device: {model.device.type}') 27 | return model 28 | 29 | def transform_fn(model, request_body, request_content_type, response_content_type="application/json"): 30 | """ 31 | Transform the input data and generate a transcription result 32 | """ 33 | logging.info("Check out the request_body type: %s", type(request_body)) 34 | start_time = time.time() 35 | 36 | file = io.BytesIO(request_body) 37 | tfile = tempfile.NamedTemporaryFile(delete=False) 38 | tfile.write(file.read()) 39 | 40 | logging.info("Start generating the transcription ...") 41 | result = model.transcribe(tfile.name) 42 | logging.info("Transcription generation completed.") 43 | 44 | end_time = time.time() 45 | elapsed_time = end_time - start_time 46 | logging.info("Elapsed time: %s seconds", elapsed_time) 47 | 48 | return json.dumps(result), response_content_type -------------------------------------------------------------------------------- /huggingface/code/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | 5 | import os 6 | import io 7 | import sys 8 | import time 9 | import json 10 | import logging 11 | 12 | import whisper 13 | import torch 14 | import boto3 15 | import ffmpeg 16 | import torchaudio 17 | import tempfile 18 | from transformers import pipeline 19 | 20 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 21 | chunk_length_s = int(os.environ.get('chunk_length_s')) 22 | 23 | def model_fn(model_dir): 24 | model = pipeline( 25 | "automatic-speech-recognition", 26 | model=model_dir, 27 | chunk_length_s=chunk_length_s, 28 | device=device, 29 | ) 30 | return model 31 | 32 | 33 | def transform_fn(model, request_body, request_content_type, response_content_type="application/json"): 34 | 35 | logging.info("Check out the request_body type: %s", type(request_body)) 36 | 37 | start_time = time.time() 38 | 39 | file = io.BytesIO(request_body) 40 | tfile = tempfile.NamedTemporaryFile(delete=False) 41 | tfile.write(file.read()) 42 | 43 | logging.info("Start to generate the transcription ...") 44 | result = model(tfile.name, batch_size=8)["text"] 45 | 46 | logging.info("Upload transcription results back to S3 bucket ...") 47 | 48 | # Calculate the elapsed time 49 | end_time = time.time() 50 | elapsed_time = end_time - start_time 51 | logging.info("The time for running this program is %s", elapsed_time) 52 | 53 | return json.dumps(result), response_content_type -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Host the Whisper Model on Amazon SageMaker: exploring inference options 2 | 3 | In this post, we embark on an exploration of SageMaker's capabilities, specifically focusing on hosting Whisper models. We'll dive deep into two methods for doing this: one utilizing the Whisper PyTorch model and the other using the Hugging Face implementation of the Whisper model. Additionally, we’ll conduct an in-depth examination of SageMaker's inference options, comparing them across parameters such as speed, cost, payload size, and scalability. This analysis empowers users to make informed decisions when integrating Whisper models into their specific use cases and systems. 4 | 5 | 6 | 7 | 8 | 1. In order to host the model on Amazon SageMaker, the first step is to save the model artifacts. These artifacts refer to the essential components of a machine learning model needed for various applications, including deployment and retraining. They can include model parameters, configuration files, pre-processing components, as well as metadata, such as version details, authorship, and any notes related to its performance. It's important to note that Whisper models for PyTorch and Hugging Face implementations consist of different model artifacts. 9 | 10 | 2. Next, we create custom inference scripts. Within these scripts, we define how the model should be loaded and specify the inference process. This is also where we can incorporate custom parameters as needed. Additionally, you can list the required Python packages in a *requirements.txt* file. During the model's deployment, these Python packages are automatically installed in the initialization phase. 11 | 12 | 3. Then we select either the PyTorch or Hugging Face deep learning containers (DLC) provided and maintained by AWS. These containers are pre-built Docker images with deep learning frameworks and other necessary Python packages. For more information, you can check this [link](https://docs.aws.amazon.com/sagemaker/latest/dg/docker-containers-prebuilt.html). 13 | 14 | 4. With the model artefacts, custom inference scripts and selected DLCs, we’ll create Amazon SageMaker models for PyTorch and Hugging Face respectively. 15 | 16 | 5. Finally, the models can be deployed on SageMaker and used with the following options: real-time inference endpoints, batch transform jobs, and asynchronous inference endpoints. We’ll dive into these options in more detail later in this post. 17 | 18 | 19 | This notebook is tested in SageMaker Studio. Below shows detailed setup. 20 | - SageMaker Studio: **ml.m5.large** instance with **Data Science 2.0** kernel. 21 | 22 | ## Tested Software Versions: 23 | 24 | * sagemaker : 2.184.0 25 | * transformers : 4.34.0 26 | * openai-whisper : 20230918 27 | * torchaudio : 2.1.0 28 | * accelerate : 0.23.0 29 | * datasets : 2.16.1 30 | * librosa : 0.10.1 31 | * soundfile : 0.12.1 32 | 33 | ## Security 34 | 35 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 36 | 37 | ## License 38 | 39 | This library is licensed under the MIT-0 License. See the LICENSE file. -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /pytorch/pytorch.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "db53f808-9fbd-408d-a6a0-d18200733876", 6 | "metadata": {}, 7 | "source": [ 8 | "## Using PyTorch DLC to Host the Whisper Model for Automatic Speech Recognition Tasks" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "94a75f20-21ff-4d05-a0e8-50a6ceaa49c2", 14 | "metadata": {}, 15 | "source": [ 16 | "## Common set up \n", 17 | "\n", 18 | "**❗If you run this notebook in SageMaker Studio, please select the Data Science 2.0 image and choose the ml.m5.large instance.**" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "id": "12501f18-d807-4337-b4e9-7e1c2d7590df", 25 | "metadata": { 26 | "tags": [] 27 | }, 28 | "outputs": [], 29 | "source": [ 30 | "# Install required packages\n", 31 | "%pip install openai-whisper==20230918 -q\n", 32 | "%pip install torchaudio==2.1.0 -q\n", 33 | "%pip install datasets==2.16.1 -q\n", 34 | "%pip install sagemaker==2.184.0 -q\n", 35 | "%pip install librosa -q\n", 36 | "%pip install soundfile -q" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "id": "4ab1118a-a0fa-4140-a4c8-7725f512f772", 42 | "metadata": {}, 43 | "source": [ 44 | "**❗Please restart the kernel before executing the cells below.**" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "id": "5fec1136-d7b0-402f-b959-d0302680c508", 51 | "metadata": { 52 | "tags": [] 53 | }, 54 | "outputs": [], 55 | "source": [ 56 | "# Import required packages\n", 57 | "import torch\n", 58 | "import whisper\n", 59 | "import torchaudio\n", 60 | "import sagemaker\n", 61 | "import time\n", 62 | "import json\n", 63 | "import boto3\n", 64 | "import soundfile as sf\n", 65 | "from datasets import load_dataset" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "id": "443ea851-3d8c-42d6-bcbe-334d7cde01ba", 72 | "metadata": { 73 | "tags": [] 74 | }, 75 | "outputs": [], 76 | "source": [ 77 | "# Basic configurations\n", 78 | "sess = sagemaker.session.Session()\n", 79 | "bucket = '[BUCKET NAME]'\n", 80 | "prefix = 'whisper_blog_post'\n", 81 | "role = sagemaker.get_execution_role()\n", 82 | "\n", 83 | "# below boto3 clients are for invoking asynchronous endpoint \n", 84 | "sm_runtime = boto3.client(\"sagemaker-runtime\")" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "98eb0f2b-1dd6-4624-ad4b-b65a19937d2d", 90 | "metadata": {}, 91 | "source": [ 92 | "### Create Whisper pytorch model artifacts and upload to S3 bucket" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "id": "0c6071c7-dfac-4114-ab17-c71bfaf5df66", 99 | "metadata": { 100 | "tags": [] 101 | }, 102 | "outputs": [], 103 | "source": [ 104 | "# Load the PyTorch model and save it in the local repo\n", 105 | "model = whisper.load_model(\"base\")\n", 106 | "torch.save(\n", 107 | " {\n", 108 | " 'model_state_dict': model.state_dict(),\n", 109 | " 'dims': model.dims.__dict__,\n", 110 | " },\n", 111 | " 'base.pt'\n", 112 | ")" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "id": "2be5edfa-3268-4cbf-9bd6-47f5951b1e02", 119 | "metadata": { 120 | "tags": [] 121 | }, 122 | "outputs": [], 123 | "source": [ 124 | "# Move the model to the 'model' directory and create a tarball\n", 125 | "!mkdir -p model\n", 126 | "!mv base.pt model\n", 127 | "!tar cvzf model.tar.gz -C model/ .\n", 128 | "\n", 129 | "# Upload the model to S3\n", 130 | "model_uri = sess.upload_data('model.tar.gz', bucket=bucket, key_prefix=f\"{prefix}/pytorch/model\")\n", 131 | "!rm model.tar.gz\n", 132 | "!rm -rf model\n", 133 | "model_uri" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "id": "8fc49067-13c1-4512-b453-aead1c46164c", 140 | "metadata": { 141 | "tags": [] 142 | }, 143 | "outputs": [], 144 | "source": [ 145 | "# Generate a unique model name and provide image uri\n", 146 | "\n", 147 | "id = int(time.time())\n", 148 | "model_name = f'whisper-pytorch-model-{id}'\n", 149 | "\n", 150 | "# !Please change the image URI for the region that you are using: e.g. us-east-1\n", 151 | "image = \"763104351884.dkr.ecr.[REGION].amazonaws.com/huggingface-pytorch-inference:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04\"" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "id": "f9682718-6ddf-4a75-a99e-7df8ba9a9c03", 158 | "metadata": { 159 | "tags": [] 160 | }, 161 | "outputs": [], 162 | "source": [ 163 | "# Create a PyTorchModel for deployment\n", 164 | "from sagemaker.pytorch.model import PyTorchModel\n", 165 | "\n", 166 | "whisper_pytorch_model = PyTorchModel(\n", 167 | " model_data=model_uri,\n", 168 | " image_uri=image,\n", 169 | " role=role,\n", 170 | " entry_point=\"inference.py\",\n", 171 | " source_dir='code',\n", 172 | " name=model_name,\n", 173 | " env = {\n", 174 | " 'MMS_MAX_REQUEST_SIZE': '2000000000',\n", 175 | " 'MMS_MAX_RESPONSE_SIZE': '2000000000',\n", 176 | " 'MMS_DEFAULT_RESPONSE_TIMEOUT': '900' \n", 177 | " } # we use huggingface container, so add MMS env variables\n", 178 | ")" 179 | ] 180 | }, 181 | { 182 | "cell_type": "markdown", 183 | "id": "dee32e99-edb5-43da-9bd4-015d4477933d", 184 | "metadata": {}, 185 | "source": [ 186 | "### Real-time inference " 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "id": "25574fd6-d888-47b3-beb4-ab9a441d648a", 193 | "metadata": { 194 | "tags": [] 195 | }, 196 | "outputs": [], 197 | "source": [ 198 | "from sagemaker.serializers import DataSerializer\n", 199 | "from sagemaker.deserializers import JSONDeserializer\n", 200 | "\n", 201 | "# Define serializers and deserializer\n", 202 | "audio_serializer = DataSerializer(content_type=\"audio/x-audio\")\n", 203 | "deserializer = JSONDeserializer()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": null, 209 | "id": "98a97d22-3990-4024-b859-1df12da7741d", 210 | "metadata": { 211 | "tags": [] 212 | }, 213 | "outputs": [], 214 | "source": [ 215 | "%%time\n", 216 | "# Deploy the model for real-time inference\n", 217 | "endpoint_name = f'whisper-pytorch-real-time-endpoint-{id}'\n", 218 | "\n", 219 | "real_time_predictor = whisper_pytorch_model.deploy(\n", 220 | " initial_instance_count=1, # number of instances\n", 221 | " instance_type=\"ml.g4dn.xlarge\", # instance type\n", 222 | " endpoint_name = endpoint_name,\n", 223 | " serializer=audio_serializer,\n", 224 | " deserializer = deserializer\n", 225 | ")\n", 226 | "# this step takes about 7 mins" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "id": "2eb6c2b0-2b90-4313-a0c9-b46dd282b95e", 233 | "metadata": { 234 | "tags": [] 235 | }, 236 | "outputs": [], 237 | "source": [ 238 | "# Download a test data sample from huggingface dataset\n", 239 | "dataset = load_dataset('MLCommons/peoples_speech', split='train', streaming = True)\n", 240 | "sample = next(iter(dataset))\n", 241 | "audio_data = sample['audio']['array']\n", 242 | "output_path = 'sample_audio.wav'\n", 243 | "sf.write(output_path, audio_data, sample['audio']['sampling_rate'])\n", 244 | "\n", 245 | "print(f\"Audio sample saved to '{output_path}'.\") " 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": null, 251 | "id": "8635750c-1d6e-48b6-ba28-3122924392da", 252 | "metadata": { 253 | "tags": [] 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "import json\n", 258 | "# Perform real-time inference\n", 259 | "audio_path = \"sample_audio.wav\" \n", 260 | "response = real_time_predictor.predict(data=audio_path)\n", 261 | "\n", 262 | "print(json.loads(response[0])['text'])" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "id": "6bfdd88c-8ded-408c-b1ba-aa9b273d1d2b", 269 | "metadata": { 270 | "tags": [] 271 | }, 272 | "outputs": [], 273 | "source": [ 274 | "# optional: Delete real-time inference endpoint, this is not required for below steps\n", 275 | "real_time_predictor.delete_endpoint()\n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "id": "83701383-1e21-42b1-9cd8-24e3e60c4b8b", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [] 285 | }, 286 | { 287 | "cell_type": "markdown", 288 | "id": "f9378e69-2aba-4060-bd6a-21561d169323", 289 | "metadata": {}, 290 | "source": [ 291 | "### Batch Transform Inference" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "id": "44690feb-f5c8-4c36-b79a-7af263f66edf", 298 | "metadata": { 299 | "tags": [] 300 | }, 301 | "outputs": [], 302 | "source": [ 303 | "# Create a transformer\n", 304 | "whisper_transformer = whisper_pytorch_model.transformer(\n", 305 | " instance_count = 1,\n", 306 | " instance_type = \"ml.g4dn.xlarge\", \n", 307 | " output_path=\"s3://{}/{}/batch-transform/\".format(bucket, prefix),\n", 308 | " max_payload = 100\n", 309 | ")" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": null, 315 | "id": "51def5a5-a7e0-4926-89c4-f3ccb724c0eb", 316 | "metadata": { 317 | "tags": [] 318 | }, 319 | "outputs": [], 320 | "source": [ 321 | "# Please provide the S3 path where you have one or more audio files that you want to process \n", 322 | "\n", 323 | "data = \"s3://xxx/audio-files/\"" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": null, 329 | "id": "002c8b5c-5dc0-407a-8f39-616416cd8aa8", 330 | "metadata": { 331 | "tags": [] 332 | }, 333 | "outputs": [], 334 | "source": [ 335 | "# Define request data and job name\n", 336 | "job_name = f\"whisper-pytorch-batch-transform-{id}\"\n", 337 | "\n", 338 | "# Start batch transform job\n", 339 | "whisper_transformer.transform(data = data, job_name= job_name, wait = False)" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "id": "0e4bd514-27bd-4194-9dc8-41de4d2641aa", 346 | "metadata": {}, 347 | "outputs": [], 348 | "source": [] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "id": "579c16fd-99e0-4178-8845-10a0fdaf02ac", 353 | "metadata": {}, 354 | "source": [ 355 | "### Asynchronous Inference " 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "id": "3470dd4a-f898-4344-9e65-bc95f0fa76b0", 362 | "metadata": { 363 | "tags": [] 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "%%time\n", 368 | "from sagemaker.async_inference import AsyncInferenceConfig\n", 369 | "\n", 370 | "# Create an AsyncInferenceConfig object\n", 371 | "async_config = AsyncInferenceConfig(\n", 372 | " output_path=f\"s3://{bucket}/{prefix}/output\", \n", 373 | " max_concurrent_invocations_per_instance = 4,\n", 374 | " # notification_config = {\n", 375 | " # \"SuccessTopic\": \"arn:aws:sns:us-east-2:123456789012:MyTopic\",\n", 376 | " # \"ErrorTopic\": \"arn:aws:sns:us-east-2:123456789012:MyTopic\",\n", 377 | " # }, # Notification configuration \n", 378 | ")\n", 379 | "\n", 380 | "# Deploy the model for async inference\n", 381 | "endpoint_name = f'whisper-pytorch-async-endpoint-{id}'\n", 382 | "async_predictor = whisper_pytorch_model.deploy(\n", 383 | " async_inference_config=async_config,\n", 384 | " initial_instance_count=1, # number of instances\n", 385 | " instance_type ='ml.g4dn.xlarge', # instance type\n", 386 | " endpoint_name = endpoint_name\n", 387 | ")" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": null, 393 | "id": "0f9bf42a-5cba-49fb-96d2-4ac40c96d5fb", 394 | "metadata": { 395 | "tags": [] 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "# Provide the S3 path for the audio file you want to processs\n", 400 | "\n", 401 | "input_path = \"s3://xxx/audio-files/xxx.mp3\"" 402 | ] 403 | }, 404 | { 405 | "cell_type": "code", 406 | "execution_count": null, 407 | "id": "dddbb3d2-0b8d-4431-9cdb-6f48acaad709", 408 | "metadata": { 409 | "tags": [] 410 | }, 411 | "outputs": [], 412 | "source": [ 413 | "# Perform async inference\n", 414 | "initial_args = {'ContentType':\"audio/x-audio\"}\n", 415 | "response = async_predictor.predict_async(initial_args = initial_args, input_path=input_path)\n", 416 | "response.output_path" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "id": "d804fecd-42d2-494e-8169-9c48f5fec233", 422 | "metadata": { 423 | "tags": [] 424 | }, 425 | "source": [ 426 | "### Optional: Test autoscaling configurations for Async inference " 427 | ] 428 | }, 429 | { 430 | "cell_type": "code", 431 | "execution_count": null, 432 | "id": "7c16a29a-67ac-4884-94ba-e2424debe2cc", 433 | "metadata": { 434 | "tags": [] 435 | }, 436 | "outputs": [], 437 | "source": [ 438 | "autoscale = boto3.client('application-autoscaling') \n", 439 | "resource_id='endpoint/' + endpoint_name + '/variant/' + 'AllTraffic'\n", 440 | "\n", 441 | "# Register scalable target\n", 442 | "register_response = autoscale.register_scalable_target(\n", 443 | " ServiceNamespace='sagemaker', \n", 444 | " ResourceId=resource_id,\n", 445 | " ScalableDimension='sagemaker:variant:DesiredInstanceCount',\n", 446 | " MinCapacity=0, \n", 447 | " MaxCapacity=3 # * check how many instances available in your account\n", 448 | ")\n", 449 | "\n", 450 | "# Define scaling policy\n", 451 | "scalingPolicy_response = autoscale.put_scaling_policy(\n", 452 | " PolicyName='Invocations-ScalingPolicy',\n", 453 | " ServiceNamespace='sagemaker', # The namespace of the AWS service that provides the resource. \n", 454 | " ResourceId=resource_id, \n", 455 | " ScalableDimension='sagemaker:variant:DesiredInstanceCount', # SageMaker supports only Instance Count\n", 456 | " PolicyType='TargetTrackingScaling', # 'StepScaling'|'TargetTrackingScaling'\n", 457 | " TargetTrackingScalingPolicyConfiguration={\n", 458 | " 'TargetValue': 3.0, # The target value for the metric. \n", 459 | " 'CustomizedMetricSpecification': {\n", 460 | " 'MetricName': 'ApproximateBacklogSizePerInstance',\n", 461 | " 'Namespace': 'AWS/SageMaker',\n", 462 | " 'Dimensions': [\n", 463 | " {'Name': 'EndpointName', 'Value': endpoint_name }\n", 464 | " ],\n", 465 | " 'Statistic': 'Average',\n", 466 | " },\n", 467 | " 'ScaleInCooldown': 60, # The cooldown period helps you prevent your Auto Scaling group from launching or terminating \n", 468 | " # additional instances before the effects of previous activities are visible. \n", 469 | " # You can configure the length of time based on your instance startup time or other application needs.\n", 470 | " # ScaleInCooldown - The amount of time, in seconds, after a scale in activity completes before another scale in activity can start. \n", 471 | " 'ScaleOutCooldown': 60 # ScaleOutCooldown - The amount of time, in seconds, after a scale out activity completes before another scale out activity can start.\n", 472 | " \n", 473 | " # 'DisableScaleIn': True|False - indicates whether scale in by the target tracking policy is disabled. \n", 474 | " # If the value is true , scale in is disabled and the target tracking policy won't remove capacity from the scalable resource.\n", 475 | " }\n", 476 | ")\n", 477 | "\n", 478 | "scalingPolicy_response" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": null, 484 | "id": "5b4b7350-ed96-4ae8-b81b-97c2fe8a9cca", 485 | "metadata": {}, 486 | "outputs": [], 487 | "source": [ 488 | "# Trigger 1000 asynchronous invocations with autoscaling from 1 to 3\n", 489 | "# then scale down to 0 on completion\n", 490 | "\n", 491 | "print(endpoint_name)\n", 492 | "for i in range(1,1000):\n", 493 | " response = sm_runtime.invoke_endpoint_async(\n", 494 | " EndpointName=endpoint_name, \n", 495 | " InputLocation=input_path)\n", 496 | " \n", 497 | "print(\"\\nAsync invocations for PyTorch serving with autoscaling\\n\")" 498 | ] 499 | }, 500 | { 501 | "cell_type": "markdown", 502 | "id": "67964e18-fab6-457a-9325-7ebea398d4a6", 503 | "metadata": {}, 504 | "source": [ 505 | "### Clean up" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": null, 511 | "id": "660fd945-ee90-4773-add4-2168bb35d022", 512 | "metadata": { 513 | "tags": [] 514 | }, 515 | "outputs": [], 516 | "source": [ 517 | "# Delete Asynchronous inference endpoint\n", 518 | "async_predictor.delete_endpoint()" 519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "id": "7fa31bad-1ccf-4cfe-ab0c-34ffb5eac997", 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [] 528 | } 529 | ], 530 | "metadata": { 531 | "availableInstances": [ 532 | { 533 | "_defaultOrder": 0, 534 | "_isFastLaunch": true, 535 | "category": "General purpose", 536 | "gpuNum": 0, 537 | "hideHardwareSpecs": false, 538 | "memoryGiB": 4, 539 | "name": "ml.t3.medium", 540 | "vcpuNum": 2 541 | }, 542 | { 543 | "_defaultOrder": 1, 544 | "_isFastLaunch": false, 545 | "category": "General purpose", 546 | "gpuNum": 0, 547 | "hideHardwareSpecs": false, 548 | "memoryGiB": 8, 549 | "name": "ml.t3.large", 550 | "vcpuNum": 2 551 | }, 552 | { 553 | "_defaultOrder": 2, 554 | "_isFastLaunch": false, 555 | "category": "General purpose", 556 | "gpuNum": 0, 557 | "hideHardwareSpecs": false, 558 | "memoryGiB": 16, 559 | "name": "ml.t3.xlarge", 560 | "vcpuNum": 4 561 | }, 562 | { 563 | "_defaultOrder": 3, 564 | "_isFastLaunch": false, 565 | "category": "General purpose", 566 | "gpuNum": 0, 567 | "hideHardwareSpecs": false, 568 | "memoryGiB": 32, 569 | "name": "ml.t3.2xlarge", 570 | "vcpuNum": 8 571 | }, 572 | { 573 | "_defaultOrder": 4, 574 | "_isFastLaunch": true, 575 | "category": "General purpose", 576 | "gpuNum": 0, 577 | "hideHardwareSpecs": false, 578 | "memoryGiB": 8, 579 | "name": "ml.m5.large", 580 | "vcpuNum": 2 581 | }, 582 | { 583 | "_defaultOrder": 5, 584 | "_isFastLaunch": false, 585 | "category": "General purpose", 586 | "gpuNum": 0, 587 | "hideHardwareSpecs": false, 588 | "memoryGiB": 16, 589 | "name": "ml.m5.xlarge", 590 | "vcpuNum": 4 591 | }, 592 | { 593 | "_defaultOrder": 6, 594 | "_isFastLaunch": false, 595 | "category": "General purpose", 596 | "gpuNum": 0, 597 | "hideHardwareSpecs": false, 598 | "memoryGiB": 32, 599 | "name": "ml.m5.2xlarge", 600 | "vcpuNum": 8 601 | }, 602 | { 603 | "_defaultOrder": 7, 604 | "_isFastLaunch": false, 605 | "category": "General purpose", 606 | "gpuNum": 0, 607 | "hideHardwareSpecs": false, 608 | "memoryGiB": 64, 609 | "name": "ml.m5.4xlarge", 610 | "vcpuNum": 16 611 | }, 612 | { 613 | "_defaultOrder": 8, 614 | "_isFastLaunch": false, 615 | "category": "General purpose", 616 | "gpuNum": 0, 617 | "hideHardwareSpecs": false, 618 | "memoryGiB": 128, 619 | "name": "ml.m5.8xlarge", 620 | "vcpuNum": 32 621 | }, 622 | { 623 | "_defaultOrder": 9, 624 | "_isFastLaunch": false, 625 | "category": "General purpose", 626 | "gpuNum": 0, 627 | "hideHardwareSpecs": false, 628 | "memoryGiB": 192, 629 | "name": "ml.m5.12xlarge", 630 | "vcpuNum": 48 631 | }, 632 | { 633 | "_defaultOrder": 10, 634 | "_isFastLaunch": false, 635 | "category": "General purpose", 636 | "gpuNum": 0, 637 | "hideHardwareSpecs": false, 638 | "memoryGiB": 256, 639 | "name": "ml.m5.16xlarge", 640 | "vcpuNum": 64 641 | }, 642 | { 643 | "_defaultOrder": 11, 644 | "_isFastLaunch": false, 645 | "category": "General purpose", 646 | "gpuNum": 0, 647 | "hideHardwareSpecs": false, 648 | "memoryGiB": 384, 649 | "name": "ml.m5.24xlarge", 650 | "vcpuNum": 96 651 | }, 652 | { 653 | "_defaultOrder": 12, 654 | "_isFastLaunch": false, 655 | "category": "General purpose", 656 | "gpuNum": 0, 657 | "hideHardwareSpecs": false, 658 | "memoryGiB": 8, 659 | "name": "ml.m5d.large", 660 | "vcpuNum": 2 661 | }, 662 | { 663 | "_defaultOrder": 13, 664 | "_isFastLaunch": false, 665 | "category": "General purpose", 666 | "gpuNum": 0, 667 | "hideHardwareSpecs": false, 668 | "memoryGiB": 16, 669 | "name": "ml.m5d.xlarge", 670 | "vcpuNum": 4 671 | }, 672 | { 673 | "_defaultOrder": 14, 674 | "_isFastLaunch": false, 675 | "category": "General purpose", 676 | "gpuNum": 0, 677 | "hideHardwareSpecs": false, 678 | "memoryGiB": 32, 679 | "name": "ml.m5d.2xlarge", 680 | "vcpuNum": 8 681 | }, 682 | { 683 | "_defaultOrder": 15, 684 | "_isFastLaunch": false, 685 | "category": "General purpose", 686 | "gpuNum": 0, 687 | "hideHardwareSpecs": false, 688 | "memoryGiB": 64, 689 | "name": "ml.m5d.4xlarge", 690 | "vcpuNum": 16 691 | }, 692 | { 693 | "_defaultOrder": 16, 694 | "_isFastLaunch": false, 695 | "category": "General purpose", 696 | "gpuNum": 0, 697 | "hideHardwareSpecs": false, 698 | "memoryGiB": 128, 699 | "name": "ml.m5d.8xlarge", 700 | "vcpuNum": 32 701 | }, 702 | { 703 | "_defaultOrder": 17, 704 | "_isFastLaunch": false, 705 | "category": "General purpose", 706 | "gpuNum": 0, 707 | "hideHardwareSpecs": false, 708 | "memoryGiB": 192, 709 | "name": "ml.m5d.12xlarge", 710 | "vcpuNum": 48 711 | }, 712 | { 713 | "_defaultOrder": 18, 714 | "_isFastLaunch": false, 715 | "category": "General purpose", 716 | "gpuNum": 0, 717 | "hideHardwareSpecs": false, 718 | "memoryGiB": 256, 719 | "name": "ml.m5d.16xlarge", 720 | "vcpuNum": 64 721 | }, 722 | { 723 | "_defaultOrder": 19, 724 | "_isFastLaunch": false, 725 | "category": "General purpose", 726 | "gpuNum": 0, 727 | "hideHardwareSpecs": false, 728 | "memoryGiB": 384, 729 | "name": "ml.m5d.24xlarge", 730 | "vcpuNum": 96 731 | }, 732 | { 733 | "_defaultOrder": 20, 734 | "_isFastLaunch": false, 735 | "category": "General purpose", 736 | "gpuNum": 0, 737 | "hideHardwareSpecs": true, 738 | "memoryGiB": 0, 739 | "name": "ml.geospatial.interactive", 740 | "supportedImageNames": [ 741 | "sagemaker-geospatial-v1-0" 742 | ], 743 | "vcpuNum": 0 744 | }, 745 | { 746 | "_defaultOrder": 21, 747 | "_isFastLaunch": true, 748 | "category": "Compute optimized", 749 | "gpuNum": 0, 750 | "hideHardwareSpecs": false, 751 | "memoryGiB": 4, 752 | "name": "ml.c5.large", 753 | "vcpuNum": 2 754 | }, 755 | { 756 | "_defaultOrder": 22, 757 | "_isFastLaunch": false, 758 | "category": "Compute optimized", 759 | "gpuNum": 0, 760 | "hideHardwareSpecs": false, 761 | "memoryGiB": 8, 762 | "name": "ml.c5.xlarge", 763 | "vcpuNum": 4 764 | }, 765 | { 766 | "_defaultOrder": 23, 767 | "_isFastLaunch": false, 768 | "category": "Compute optimized", 769 | "gpuNum": 0, 770 | "hideHardwareSpecs": false, 771 | "memoryGiB": 16, 772 | "name": "ml.c5.2xlarge", 773 | "vcpuNum": 8 774 | }, 775 | { 776 | "_defaultOrder": 24, 777 | "_isFastLaunch": false, 778 | "category": "Compute optimized", 779 | "gpuNum": 0, 780 | "hideHardwareSpecs": false, 781 | "memoryGiB": 32, 782 | "name": "ml.c5.4xlarge", 783 | "vcpuNum": 16 784 | }, 785 | { 786 | "_defaultOrder": 25, 787 | "_isFastLaunch": false, 788 | "category": "Compute optimized", 789 | "gpuNum": 0, 790 | "hideHardwareSpecs": false, 791 | "memoryGiB": 72, 792 | "name": "ml.c5.9xlarge", 793 | "vcpuNum": 36 794 | }, 795 | { 796 | "_defaultOrder": 26, 797 | "_isFastLaunch": false, 798 | "category": "Compute optimized", 799 | "gpuNum": 0, 800 | "hideHardwareSpecs": false, 801 | "memoryGiB": 96, 802 | "name": "ml.c5.12xlarge", 803 | "vcpuNum": 48 804 | }, 805 | { 806 | "_defaultOrder": 27, 807 | "_isFastLaunch": false, 808 | "category": "Compute optimized", 809 | "gpuNum": 0, 810 | "hideHardwareSpecs": false, 811 | "memoryGiB": 144, 812 | "name": "ml.c5.18xlarge", 813 | "vcpuNum": 72 814 | }, 815 | { 816 | "_defaultOrder": 28, 817 | "_isFastLaunch": false, 818 | "category": "Compute optimized", 819 | "gpuNum": 0, 820 | "hideHardwareSpecs": false, 821 | "memoryGiB": 192, 822 | "name": "ml.c5.24xlarge", 823 | "vcpuNum": 96 824 | }, 825 | { 826 | "_defaultOrder": 29, 827 | "_isFastLaunch": true, 828 | "category": "Accelerated computing", 829 | "gpuNum": 1, 830 | "hideHardwareSpecs": false, 831 | "memoryGiB": 16, 832 | "name": "ml.g4dn.xlarge", 833 | "vcpuNum": 4 834 | }, 835 | { 836 | "_defaultOrder": 30, 837 | "_isFastLaunch": false, 838 | "category": "Accelerated computing", 839 | "gpuNum": 1, 840 | "hideHardwareSpecs": false, 841 | "memoryGiB": 32, 842 | "name": "ml.g4dn.2xlarge", 843 | "vcpuNum": 8 844 | }, 845 | { 846 | "_defaultOrder": 31, 847 | "_isFastLaunch": false, 848 | "category": "Accelerated computing", 849 | "gpuNum": 1, 850 | "hideHardwareSpecs": false, 851 | "memoryGiB": 64, 852 | "name": "ml.g4dn.4xlarge", 853 | "vcpuNum": 16 854 | }, 855 | { 856 | "_defaultOrder": 32, 857 | "_isFastLaunch": false, 858 | "category": "Accelerated computing", 859 | "gpuNum": 1, 860 | "hideHardwareSpecs": false, 861 | "memoryGiB": 128, 862 | "name": "ml.g4dn.8xlarge", 863 | "vcpuNum": 32 864 | }, 865 | { 866 | "_defaultOrder": 33, 867 | "_isFastLaunch": false, 868 | "category": "Accelerated computing", 869 | "gpuNum": 4, 870 | "hideHardwareSpecs": false, 871 | "memoryGiB": 192, 872 | "name": "ml.g4dn.12xlarge", 873 | "vcpuNum": 48 874 | }, 875 | { 876 | "_defaultOrder": 34, 877 | "_isFastLaunch": false, 878 | "category": "Accelerated computing", 879 | "gpuNum": 1, 880 | "hideHardwareSpecs": false, 881 | "memoryGiB": 256, 882 | "name": "ml.g4dn.16xlarge", 883 | "vcpuNum": 64 884 | }, 885 | { 886 | "_defaultOrder": 35, 887 | "_isFastLaunch": false, 888 | "category": "Accelerated computing", 889 | "gpuNum": 1, 890 | "hideHardwareSpecs": false, 891 | "memoryGiB": 61, 892 | "name": "ml.p3.2xlarge", 893 | "vcpuNum": 8 894 | }, 895 | { 896 | "_defaultOrder": 36, 897 | "_isFastLaunch": false, 898 | "category": "Accelerated computing", 899 | "gpuNum": 4, 900 | "hideHardwareSpecs": false, 901 | "memoryGiB": 244, 902 | "name": "ml.p3.8xlarge", 903 | "vcpuNum": 32 904 | }, 905 | { 906 | "_defaultOrder": 37, 907 | "_isFastLaunch": false, 908 | "category": "Accelerated computing", 909 | "gpuNum": 8, 910 | "hideHardwareSpecs": false, 911 | "memoryGiB": 488, 912 | "name": "ml.p3.16xlarge", 913 | "vcpuNum": 64 914 | }, 915 | { 916 | "_defaultOrder": 38, 917 | "_isFastLaunch": false, 918 | "category": "Accelerated computing", 919 | "gpuNum": 8, 920 | "hideHardwareSpecs": false, 921 | "memoryGiB": 768, 922 | "name": "ml.p3dn.24xlarge", 923 | "vcpuNum": 96 924 | }, 925 | { 926 | "_defaultOrder": 39, 927 | "_isFastLaunch": false, 928 | "category": "Memory Optimized", 929 | "gpuNum": 0, 930 | "hideHardwareSpecs": false, 931 | "memoryGiB": 16, 932 | "name": "ml.r5.large", 933 | "vcpuNum": 2 934 | }, 935 | { 936 | "_defaultOrder": 40, 937 | "_isFastLaunch": false, 938 | "category": "Memory Optimized", 939 | "gpuNum": 0, 940 | "hideHardwareSpecs": false, 941 | "memoryGiB": 32, 942 | "name": "ml.r5.xlarge", 943 | "vcpuNum": 4 944 | }, 945 | { 946 | "_defaultOrder": 41, 947 | "_isFastLaunch": false, 948 | "category": "Memory Optimized", 949 | "gpuNum": 0, 950 | "hideHardwareSpecs": false, 951 | "memoryGiB": 64, 952 | "name": "ml.r5.2xlarge", 953 | "vcpuNum": 8 954 | }, 955 | { 956 | "_defaultOrder": 42, 957 | "_isFastLaunch": false, 958 | "category": "Memory Optimized", 959 | "gpuNum": 0, 960 | "hideHardwareSpecs": false, 961 | "memoryGiB": 128, 962 | "name": "ml.r5.4xlarge", 963 | "vcpuNum": 16 964 | }, 965 | { 966 | "_defaultOrder": 43, 967 | "_isFastLaunch": false, 968 | "category": "Memory Optimized", 969 | "gpuNum": 0, 970 | "hideHardwareSpecs": false, 971 | "memoryGiB": 256, 972 | "name": "ml.r5.8xlarge", 973 | "vcpuNum": 32 974 | }, 975 | { 976 | "_defaultOrder": 44, 977 | "_isFastLaunch": false, 978 | "category": "Memory Optimized", 979 | "gpuNum": 0, 980 | "hideHardwareSpecs": false, 981 | "memoryGiB": 384, 982 | "name": "ml.r5.12xlarge", 983 | "vcpuNum": 48 984 | }, 985 | { 986 | "_defaultOrder": 45, 987 | "_isFastLaunch": false, 988 | "category": "Memory Optimized", 989 | "gpuNum": 0, 990 | "hideHardwareSpecs": false, 991 | "memoryGiB": 512, 992 | "name": "ml.r5.16xlarge", 993 | "vcpuNum": 64 994 | }, 995 | { 996 | "_defaultOrder": 46, 997 | "_isFastLaunch": false, 998 | "category": "Memory Optimized", 999 | "gpuNum": 0, 1000 | "hideHardwareSpecs": false, 1001 | "memoryGiB": 768, 1002 | "name": "ml.r5.24xlarge", 1003 | "vcpuNum": 96 1004 | }, 1005 | { 1006 | "_defaultOrder": 47, 1007 | "_isFastLaunch": false, 1008 | "category": "Accelerated computing", 1009 | "gpuNum": 1, 1010 | "hideHardwareSpecs": false, 1011 | "memoryGiB": 16, 1012 | "name": "ml.g5.xlarge", 1013 | "vcpuNum": 4 1014 | }, 1015 | { 1016 | "_defaultOrder": 48, 1017 | "_isFastLaunch": false, 1018 | "category": "Accelerated computing", 1019 | "gpuNum": 1, 1020 | "hideHardwareSpecs": false, 1021 | "memoryGiB": 32, 1022 | "name": "ml.g5.2xlarge", 1023 | "vcpuNum": 8 1024 | }, 1025 | { 1026 | "_defaultOrder": 49, 1027 | "_isFastLaunch": false, 1028 | "category": "Accelerated computing", 1029 | "gpuNum": 1, 1030 | "hideHardwareSpecs": false, 1031 | "memoryGiB": 64, 1032 | "name": "ml.g5.4xlarge", 1033 | "vcpuNum": 16 1034 | }, 1035 | { 1036 | "_defaultOrder": 50, 1037 | "_isFastLaunch": false, 1038 | "category": "Accelerated computing", 1039 | "gpuNum": 1, 1040 | "hideHardwareSpecs": false, 1041 | "memoryGiB": 128, 1042 | "name": "ml.g5.8xlarge", 1043 | "vcpuNum": 32 1044 | }, 1045 | { 1046 | "_defaultOrder": 51, 1047 | "_isFastLaunch": false, 1048 | "category": "Accelerated computing", 1049 | "gpuNum": 1, 1050 | "hideHardwareSpecs": false, 1051 | "memoryGiB": 256, 1052 | "name": "ml.g5.16xlarge", 1053 | "vcpuNum": 64 1054 | }, 1055 | { 1056 | "_defaultOrder": 52, 1057 | "_isFastLaunch": false, 1058 | "category": "Accelerated computing", 1059 | "gpuNum": 4, 1060 | "hideHardwareSpecs": false, 1061 | "memoryGiB": 192, 1062 | "name": "ml.g5.12xlarge", 1063 | "vcpuNum": 48 1064 | }, 1065 | { 1066 | "_defaultOrder": 53, 1067 | "_isFastLaunch": false, 1068 | "category": "Accelerated computing", 1069 | "gpuNum": 4, 1070 | "hideHardwareSpecs": false, 1071 | "memoryGiB": 384, 1072 | "name": "ml.g5.24xlarge", 1073 | "vcpuNum": 96 1074 | }, 1075 | { 1076 | "_defaultOrder": 54, 1077 | "_isFastLaunch": false, 1078 | "category": "Accelerated computing", 1079 | "gpuNum": 8, 1080 | "hideHardwareSpecs": false, 1081 | "memoryGiB": 768, 1082 | "name": "ml.g5.48xlarge", 1083 | "vcpuNum": 192 1084 | }, 1085 | { 1086 | "_defaultOrder": 55, 1087 | "_isFastLaunch": false, 1088 | "category": "Accelerated computing", 1089 | "gpuNum": 8, 1090 | "hideHardwareSpecs": false, 1091 | "memoryGiB": 1152, 1092 | "name": "ml.p4d.24xlarge", 1093 | "vcpuNum": 96 1094 | }, 1095 | { 1096 | "_defaultOrder": 56, 1097 | "_isFastLaunch": false, 1098 | "category": "Accelerated computing", 1099 | "gpuNum": 8, 1100 | "hideHardwareSpecs": false, 1101 | "memoryGiB": 1152, 1102 | "name": "ml.p4de.24xlarge", 1103 | "vcpuNum": 96 1104 | }, 1105 | { 1106 | "_defaultOrder": 57, 1107 | "_isFastLaunch": false, 1108 | "category": "Accelerated computing", 1109 | "gpuNum": 0, 1110 | "hideHardwareSpecs": false, 1111 | "memoryGiB": 32, 1112 | "name": "ml.trn1.2xlarge", 1113 | "vcpuNum": 8 1114 | }, 1115 | { 1116 | "_defaultOrder": 58, 1117 | "_isFastLaunch": false, 1118 | "category": "Accelerated computing", 1119 | "gpuNum": 0, 1120 | "hideHardwareSpecs": false, 1121 | "memoryGiB": 512, 1122 | "name": "ml.trn1.32xlarge", 1123 | "vcpuNum": 128 1124 | }, 1125 | { 1126 | "_defaultOrder": 59, 1127 | "_isFastLaunch": false, 1128 | "category": "Accelerated computing", 1129 | "gpuNum": 0, 1130 | "hideHardwareSpecs": false, 1131 | "memoryGiB": 512, 1132 | "name": "ml.trn1n.32xlarge", 1133 | "vcpuNum": 128 1134 | } 1135 | ], 1136 | "instance_type": "ml.m5.large", 1137 | "kernelspec": { 1138 | "display_name": "Python 3 (Data Science 2.0)", 1139 | "language": "python", 1140 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/sagemaker-data-science-38" 1141 | }, 1142 | "language_info": { 1143 | "codemirror_mode": { 1144 | "name": "ipython", 1145 | "version": 3 1146 | }, 1147 | "file_extension": ".py", 1148 | "mimetype": "text/x-python", 1149 | "name": "python", 1150 | "nbconvert_exporter": "python", 1151 | "pygments_lexer": "ipython3", 1152 | "version": "3.8.13" 1153 | } 1154 | }, 1155 | "nbformat": 4, 1156 | "nbformat_minor": 5 1157 | } 1158 | -------------------------------------------------------------------------------- /huggingface/huggingface.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "db53f808-9fbd-408d-a6a0-d18200733876", 6 | "metadata": {}, 7 | "source": [ 8 | "## Using Huggingface DLC to Host the Whisper Model for Automatic Speech Recognition Tasks" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "94a75f20-21ff-4d05-a0e8-50a6ceaa49c2", 14 | "metadata": {}, 15 | "source": [ 16 | "## Common set up \n", 17 | "**❗If you run this notebook in SageMaker Studio, please select the Data Science 2.0 image and choose the ml.m5.large instance.**" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "id": "12501f18-d807-4337-b4e9-7e1c2d7590df", 24 | "metadata": { 25 | "tags": [] 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "# Install required packages\n", 30 | "%pip install openai-whisper==20230918 -q\n", 31 | "%pip install torchaudio==2.1.0 -q\n", 32 | "%pip install datasets==2.16.1 -q\n", 33 | "%pip install sagemaker==2.184.0 -q\n", 34 | "%pip install librosa -q\n", 35 | "%pip install soundfile -q" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": null, 41 | "id": "a6e6e597-e406-4851-bdcc-25c118e006f9", 42 | "metadata": { 43 | "tags": [] 44 | }, 45 | "outputs": [], 46 | "source": [ 47 | "!pip install transformers==4.28.1 -q\n", 48 | "!pip install accelerate>=0.20.3 -q" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": null, 54 | "id": "42d2eb07-63a4-4feb-80ec-2a7508c0927d", 55 | "metadata": { 56 | "tags": [] 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "# !conda install -y ffmpeg" 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "id": "da52b2e1-dc62-41fd-a8e7-00dd6e31285c", 66 | "metadata": {}, 67 | "source": [ 68 | "**❗Please restart the kernel before executing the cells below.**" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "id": "5fec1136-d7b0-402f-b959-d0302680c508", 75 | "metadata": { 76 | "tags": [] 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "# import required packages \n", 81 | "import torch\n", 82 | "import whisper\n", 83 | "import torchaudio\n", 84 | "import sagemaker\n", 85 | "import time\n", 86 | "import boto3" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "id": "443ea851-3d8c-42d6-bcbe-334d7cde01ba", 93 | "metadata": { 94 | "tags": [] 95 | }, 96 | "outputs": [], 97 | "source": [ 98 | "# basic configurations \n", 99 | "sess = sagemaker.session.Session()\n", 100 | "bucket = '[BUCKET NAME]'\n", 101 | "prefix = 'whisper_blog_post'\n", 102 | "role = sagemaker.get_execution_role()\n", 103 | "\n", 104 | "# below boto3 clients are for invoking asynchronous endpoint \n", 105 | "sm_runtime = boto3.client(\"sagemaker-runtime\")" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "id": "7530e5bc-8c1d-47e1-9554-b22635b2de18", 111 | "metadata": {}, 112 | "source": [ 113 | "### Create Whisper Hugging Face model artifacts and upload to S3 bucket" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "098bce2b-7151-43c3-8b8b-3c5000a9b60b", 120 | "metadata": { 121 | "tags": [] 122 | }, 123 | "outputs": [], 124 | "source": [ 125 | "!mkdir -p model" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "id": "85bce568-21b0-4d48-9849-1ac8cf18388b", 132 | "metadata": { 133 | "tags": [] 134 | }, 135 | "outputs": [], 136 | "source": [ 137 | "from transformers import WhisperProcessor, AutoModelForSpeechSeq2Seq\n", 138 | "\n", 139 | "# Load the pre-trained model\n", 140 | "model_name = \"openai/whisper-base\"\n", 141 | "model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)\n", 142 | "\n", 143 | "# Define a directory where you want to save the model\n", 144 | "save_directory = \"./model\"\n", 145 | "\n", 146 | "# Save the model to the specified directory\n", 147 | "model.save_pretrained(save_directory)\n", 148 | "\n", 149 | "from transformers import WhisperTokenizer\n", 150 | "\n", 151 | "tokenizer = WhisperTokenizer.from_pretrained(model_name)\n", 152 | "tokenizer.save_pretrained(save_directory)\n", 153 | "processor = WhisperProcessor.from_pretrained(model_name)\n", 154 | "processor.save_pretrained(save_directory)" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "id": "02164e7e-7244-4f22-a435-b383caf61b96", 161 | "metadata": { 162 | "tags": [] 163 | }, 164 | "outputs": [], 165 | "source": [ 166 | "!tar cvzf model.tar.gz -C model/ .\n", 167 | "\n", 168 | "model_uri = sess.upload_data('model.tar.gz', bucket = bucket, key_prefix=f\"{prefix}/huggingface/model\")\n", 169 | "!rm model.tar.gz\n", 170 | "!rm -rf model\n", 171 | "model_uri" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "id": "70fe78a4-f976-4096-adfc-c7beb4e2b2cb", 178 | "metadata": { 179 | "tags": [] 180 | }, 181 | "outputs": [], 182 | "source": [ 183 | "# Generate a unique model name and provide image uri\n", 184 | "\n", 185 | "id = int(time.time())\n", 186 | "model_name = f'whisper-hf-model-{id}'\n", 187 | "\n", 188 | "# !Please change the image URI for the region that you are using:e.g. us-east-1\n", 189 | "image = \"763104351884.dkr.ecr.[REGION].amazonaws.com/huggingface-pytorch-inference:2.0.0-transformers4.28.1-gpu-py310-cu118-ubuntu20.04\"" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "id": "4821404d-d25a-4503-a26f-04980e93575b", 196 | "metadata": { 197 | "tags": [] 198 | }, 199 | "outputs": [], 200 | "source": [ 201 | "# Create a HuggingFaceModel for deployment\n", 202 | "from sagemaker.huggingface.model import HuggingFaceModel\n", 203 | "\n", 204 | "whisper_hf_model = HuggingFaceModel(\n", 205 | " model_data=model_uri,\n", 206 | " role=role, \n", 207 | " image_uri = image,\n", 208 | " entry_point=\"inference.py\",\n", 209 | " source_dir='code',\n", 210 | " name=model_name,\n", 211 | " env = {\n", 212 | " \"chunk_length_s\":\"30\",\n", 213 | " 'MMS_MAX_REQUEST_SIZE': '2000000000',\n", 214 | " 'MMS_MAX_RESPONSE_SIZE': '2000000000',\n", 215 | " 'MMS_DEFAULT_RESPONSE_TIMEOUT': '900'\n", 216 | " }\n", 217 | ")" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "id": "a4305c4f-67af-4f2d-b545-9e112f9722c0", 223 | "metadata": {}, 224 | "source": [ 225 | "### Real-time inference " 226 | ] 227 | }, 228 | { 229 | "cell_type": "code", 230 | "execution_count": null, 231 | "id": "2c1075c2-8146-468c-bd6e-54496e4ee6eb", 232 | "metadata": { 233 | "tags": [] 234 | }, 235 | "outputs": [], 236 | "source": [ 237 | "from sagemaker.serializers import DataSerializer\n", 238 | "from sagemaker.deserializers import JSONDeserializer\n", 239 | "\n", 240 | "# Define serializers and deserializer\n", 241 | "audio_serializer = DataSerializer(content_type=\"audio/x-audio\")\n", 242 | "deserializer = JSONDeserializer()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": null, 248 | "id": "9b6375d7-8116-49a1-a50d-28c1b30133b0", 249 | "metadata": { 250 | "tags": [] 251 | }, 252 | "outputs": [], 253 | "source": [ 254 | "# Deploy the model for real-time inference\n", 255 | "endpoint_name = f'whisper-hf-real-time-endpoint-{id}'\n", 256 | "\n", 257 | "real_time_predictor = whisper_hf_model.deploy(\n", 258 | " initial_instance_count=1,\n", 259 | " instance_type=\"ml.g4dn.xlarge\",\n", 260 | " endpoint_name = endpoint_name,\n", 261 | " serializer=audio_serializer,\n", 262 | " deserializer = deserializer\n", 263 | " )" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "88f4e679-e83e-436f-ad00-8d02b038f582", 270 | "metadata": { 271 | "tags": [] 272 | }, 273 | "outputs": [], 274 | "source": [ 275 | "# Download a test data sample from huggingface dataset\n", 276 | "import soundfile as sf\n", 277 | "from datasets import load_dataset\n", 278 | "dataset = load_dataset('MLCommons/peoples_speech', split='train', streaming = True)\n", 279 | "sample = next(iter(dataset))\n", 280 | "audio_data = sample['audio']['array']\n", 281 | "output_path = 'sample_audio.wav'\n", 282 | "sf.write(output_path, audio_data, sample['audio']['sampling_rate'])\n", 283 | "\n", 284 | "print(f\"Audio sample saved to '{output_path}'.\")" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": null, 290 | "id": "3381911f-3f81-4cb1-a9e0-eedbaa41161d", 291 | "metadata": { 292 | "tags": [] 293 | }, 294 | "outputs": [], 295 | "source": [ 296 | "import json\n", 297 | "# Perform real-time inference\n", 298 | "audio_path = \"sample_audio.wav\" \n", 299 | "response = real_time_predictor.predict(data=audio_path)\n", 300 | "\n", 301 | "print(response[0])" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "id": "cc0a7c98-c5f2-4ec0-91a3-c0851870e969", 308 | "metadata": { 309 | "tags": [] 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "# optional: Delete real-time inference endpoint, this is not required for below steps\n", 314 | "real_time_predictor.delete_endpoint()" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": null, 320 | "id": "f37d757e-b3da-469e-8f31-79b4e284009b", 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "id": "bf6f6800-fcc3-425b-8d83-2c94e08cd34d", 328 | "metadata": {}, 329 | "source": [ 330 | "### Batch Transform Inference" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "id": "02fcbeae-bb46-44c5-8028-b58fdf2cfb61", 337 | "metadata": { 338 | "tags": [] 339 | }, 340 | "outputs": [], 341 | "source": [ 342 | "# Create a transformer\n", 343 | "whisper_transformer = whisper_hf_model.transformer(\n", 344 | " instance_count = 1,\n", 345 | " instance_type = \"ml.g4dn.xlarge\",\n", 346 | " output_path=\"s3://{}/{}/batch-transform/\".format(bucket, prefix),\n", 347 | " max_payload = 100\n", 348 | ")" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "id": "d8746208-69b6-40fd-8b76-147f8a59469c", 355 | "metadata": { 356 | "tags": [] 357 | }, 358 | "outputs": [], 359 | "source": [ 360 | "# Please provide the S3 path where you have one or more audio files that you want to process \n", 361 | "data = \"s3://xxx/audio-files/\"" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "id": "897627a2-00d9-4860-a0e4-381688e06ffe", 368 | "metadata": { 369 | "tags": [] 370 | }, 371 | "outputs": [], 372 | "source": [ 373 | "# Define request data and job name\n", 374 | "job_name = f\"whisper-hf-batch-transform-{id}\"\n", 375 | "\n", 376 | "# Start batch transform job\n", 377 | "whisper_transformer.transform(data = data, job_name= job_name, wait = False)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "markdown", 382 | "id": "82561d70-22d4-45d4-9b4d-740fb077e511", 383 | "metadata": {}, 384 | "source": [ 385 | "### Asynchronous Inference " 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": null, 391 | "id": "f2625d15-bbff-420e-ac9c-0ffeebb3023f", 392 | "metadata": { 393 | "tags": [] 394 | }, 395 | "outputs": [], 396 | "source": [ 397 | "%%time\n", 398 | "from sagemaker.async_inference import AsyncInferenceConfig\n", 399 | "\n", 400 | "# Create an AsyncInferenceConfig object\n", 401 | "async_config = AsyncInferenceConfig(\n", 402 | " output_path=f\"s3://{bucket}/{prefix}/output\", \n", 403 | " max_concurrent_invocations_per_instance = 4,\n", 404 | " # notification_config = {\n", 405 | " # \"SuccessTopic\": \"arn:aws:sns:us-east-2:123456789012:MyTopic\",\n", 406 | " # \"ErrorTopic\": \"arn:aws:sns:us-east-2:123456789012:MyTopic\",\n", 407 | " # }, # Notification configuration \n", 408 | ")\n", 409 | "\n", 410 | "# Deploy the model for async inference\n", 411 | "endpoint_name = f'whisper-hf-async-endpoint-{id}'\n", 412 | "async_predictor = whisper_hf_model.deploy(\n", 413 | " async_inference_config=async_config,\n", 414 | " initial_instance_count=1, # number of instances\n", 415 | " instance_type ='ml.g4dn.xlarge', # instance type\n", 416 | " endpoint_name = endpoint_name\n", 417 | ")" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "id": "d03d4add-f11d-4fcf-a33c-47a4eb99e4ec", 424 | "metadata": { 425 | "tags": [] 426 | }, 427 | "outputs": [], 428 | "source": [ 429 | "# Provide the S3 path for the audio file you want to processs\n", 430 | "input_path = \"s3://xxx/audio-files/xxx.mp3\"" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": null, 436 | "id": "db4b6114-6439-4a0c-9431-0fb74e58f617", 437 | "metadata": { 438 | "tags": [] 439 | }, 440 | "outputs": [], 441 | "source": [ 442 | "# Perform async inference\n", 443 | "initial_args = {'ContentType':\"audio/x-audio\"}\n", 444 | "response = async_predictor.predict_async(initial_args = initial_args, input_path=input_path)\n", 445 | "response.output_path" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "id": "91944dd9-1055-4cc1-b01f-1570cd222fa7", 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "id": "109a8a77-3036-456c-8fea-f5316648cdc1", 459 | "metadata": {}, 460 | "source": [ 461 | "### Optional: Test autoscaling configurations for Async inference " 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": null, 467 | "id": "b3b4139b-51e2-4eaf-8d0b-9f23327a4545", 468 | "metadata": { 469 | "tags": [] 470 | }, 471 | "outputs": [], 472 | "source": [ 473 | "autoscale = boto3.client('application-autoscaling') \n", 474 | "resource_id='endpoint/' + endpoint_name + '/variant/' + 'AllTraffic'\n", 475 | "\n", 476 | "# Register scalable target\n", 477 | "register_response = autoscale.register_scalable_target(\n", 478 | " ServiceNamespace='sagemaker', \n", 479 | " ResourceId=resource_id,\n", 480 | " ScalableDimension='sagemaker:variant:DesiredInstanceCount',\n", 481 | " MinCapacity=0, \n", 482 | " MaxCapacity=3 # * check how many instances available in your account\n", 483 | ")\n", 484 | "\n", 485 | "# Define scaling policy\n", 486 | "scalingPolicy_response = autoscale.put_scaling_policy(\n", 487 | " PolicyName='Invocations-ScalingPolicy',\n", 488 | " ServiceNamespace='sagemaker', # The namespace of the AWS service that provides the resource. \n", 489 | " ResourceId=resource_id, \n", 490 | " ScalableDimension='sagemaker:variant:DesiredInstanceCount', # SageMaker supports only Instance Count\n", 491 | " PolicyType='TargetTrackingScaling', # 'StepScaling'|'TargetTrackingScaling'\n", 492 | " TargetTrackingScalingPolicyConfiguration={\n", 493 | " 'TargetValue': 3.0, # The target value for the metric. \n", 494 | " 'CustomizedMetricSpecification': {\n", 495 | " 'MetricName': 'ApproximateBacklogSizePerInstance',\n", 496 | " 'Namespace': 'AWS/SageMaker',\n", 497 | " 'Dimensions': [\n", 498 | " {'Name': 'EndpointName', 'Value': endpoint_name }\n", 499 | " ],\n", 500 | " 'Statistic': 'Average',\n", 501 | " },\n", 502 | " 'ScaleInCooldown': 60, # The cooldown period helps you prevent your Auto Scaling group from launching or terminating \n", 503 | " # additional instances before the effects of previous activities are visible. \n", 504 | " # You can configure the length of time based on your instance startup time or other application needs.\n", 505 | " # ScaleInCooldown - The amount of time, in seconds, after a scale in activity completes before another scale in activity can start. \n", 506 | " 'ScaleOutCooldown': 60 # ScaleOutCooldown - The amount of time, in seconds, after a scale out activity completes before another scale out activity can start.\n", 507 | " \n", 508 | " # 'DisableScaleIn': True|False - indicates whether scale in by the target tracking policy is disabled. \n", 509 | " # If the value is true , scale in is disabled and the target tracking policy won't remove capacity from the scalable resource.\n", 510 | " }\n", 511 | ")\n", 512 | "\n", 513 | "scalingPolicy_response" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": null, 519 | "id": "633991e3-9d1d-4537-a84e-3e464d5e1720", 520 | "metadata": { 521 | "tags": [] 522 | }, 523 | "outputs": [], 524 | "source": [ 525 | "# Trigger 1000 asynchronous invocations with autoscaling from 1 to 3\n", 526 | "# then scale down to 0 on completion\n", 527 | "\n", 528 | "print(endpoint_name)\n", 529 | "for i in range(1,1000):\n", 530 | " response = sm_runtime.invoke_endpoint_async(\n", 531 | " EndpointName=endpoint_name, \n", 532 | " InputLocation=input_path)\n", 533 | " \n", 534 | "print(\"\\nAsync invocations for Hugging Face model serving with autoscaling\\n\")" 535 | ] 536 | }, 537 | { 538 | "cell_type": "markdown", 539 | "id": "1ae0feb3-e8e8-438f-95bb-99e422823262", 540 | "metadata": {}, 541 | "source": [ 542 | "### Clean up" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": null, 548 | "id": "37456ce7-2513-4b0e-9361-6aa18817fa6d", 549 | "metadata": {}, 550 | "outputs": [], 551 | "source": [ 552 | "# Delete Asynchronous inference endpoint\n", 553 | "async_predictor.delete_endpoint()" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": null, 559 | "id": "d3bc0813-a319-4a50-9520-c497937860df", 560 | "metadata": {}, 561 | "outputs": [], 562 | "source": [] 563 | } 564 | ], 565 | "metadata": { 566 | "availableInstances": [ 567 | { 568 | "_defaultOrder": 0, 569 | "_isFastLaunch": true, 570 | "category": "General purpose", 571 | "gpuNum": 0, 572 | "hideHardwareSpecs": false, 573 | "memoryGiB": 4, 574 | "name": "ml.t3.medium", 575 | "vcpuNum": 2 576 | }, 577 | { 578 | "_defaultOrder": 1, 579 | "_isFastLaunch": false, 580 | "category": "General purpose", 581 | "gpuNum": 0, 582 | "hideHardwareSpecs": false, 583 | "memoryGiB": 8, 584 | "name": "ml.t3.large", 585 | "vcpuNum": 2 586 | }, 587 | { 588 | "_defaultOrder": 2, 589 | "_isFastLaunch": false, 590 | "category": "General purpose", 591 | "gpuNum": 0, 592 | "hideHardwareSpecs": false, 593 | "memoryGiB": 16, 594 | "name": "ml.t3.xlarge", 595 | "vcpuNum": 4 596 | }, 597 | { 598 | "_defaultOrder": 3, 599 | "_isFastLaunch": false, 600 | "category": "General purpose", 601 | "gpuNum": 0, 602 | "hideHardwareSpecs": false, 603 | "memoryGiB": 32, 604 | "name": "ml.t3.2xlarge", 605 | "vcpuNum": 8 606 | }, 607 | { 608 | "_defaultOrder": 4, 609 | "_isFastLaunch": true, 610 | "category": "General purpose", 611 | "gpuNum": 0, 612 | "hideHardwareSpecs": false, 613 | "memoryGiB": 8, 614 | "name": "ml.m5.large", 615 | "vcpuNum": 2 616 | }, 617 | { 618 | "_defaultOrder": 5, 619 | "_isFastLaunch": false, 620 | "category": "General purpose", 621 | "gpuNum": 0, 622 | "hideHardwareSpecs": false, 623 | "memoryGiB": 16, 624 | "name": "ml.m5.xlarge", 625 | "vcpuNum": 4 626 | }, 627 | { 628 | "_defaultOrder": 6, 629 | "_isFastLaunch": false, 630 | "category": "General purpose", 631 | "gpuNum": 0, 632 | "hideHardwareSpecs": false, 633 | "memoryGiB": 32, 634 | "name": "ml.m5.2xlarge", 635 | "vcpuNum": 8 636 | }, 637 | { 638 | "_defaultOrder": 7, 639 | "_isFastLaunch": false, 640 | "category": "General purpose", 641 | "gpuNum": 0, 642 | "hideHardwareSpecs": false, 643 | "memoryGiB": 64, 644 | "name": "ml.m5.4xlarge", 645 | "vcpuNum": 16 646 | }, 647 | { 648 | "_defaultOrder": 8, 649 | "_isFastLaunch": false, 650 | "category": "General purpose", 651 | "gpuNum": 0, 652 | "hideHardwareSpecs": false, 653 | "memoryGiB": 128, 654 | "name": "ml.m5.8xlarge", 655 | "vcpuNum": 32 656 | }, 657 | { 658 | "_defaultOrder": 9, 659 | "_isFastLaunch": false, 660 | "category": "General purpose", 661 | "gpuNum": 0, 662 | "hideHardwareSpecs": false, 663 | "memoryGiB": 192, 664 | "name": "ml.m5.12xlarge", 665 | "vcpuNum": 48 666 | }, 667 | { 668 | "_defaultOrder": 10, 669 | "_isFastLaunch": false, 670 | "category": "General purpose", 671 | "gpuNum": 0, 672 | "hideHardwareSpecs": false, 673 | "memoryGiB": 256, 674 | "name": "ml.m5.16xlarge", 675 | "vcpuNum": 64 676 | }, 677 | { 678 | "_defaultOrder": 11, 679 | "_isFastLaunch": false, 680 | "category": "General purpose", 681 | "gpuNum": 0, 682 | "hideHardwareSpecs": false, 683 | "memoryGiB": 384, 684 | "name": "ml.m5.24xlarge", 685 | "vcpuNum": 96 686 | }, 687 | { 688 | "_defaultOrder": 12, 689 | "_isFastLaunch": false, 690 | "category": "General purpose", 691 | "gpuNum": 0, 692 | "hideHardwareSpecs": false, 693 | "memoryGiB": 8, 694 | "name": "ml.m5d.large", 695 | "vcpuNum": 2 696 | }, 697 | { 698 | "_defaultOrder": 13, 699 | "_isFastLaunch": false, 700 | "category": "General purpose", 701 | "gpuNum": 0, 702 | "hideHardwareSpecs": false, 703 | "memoryGiB": 16, 704 | "name": "ml.m5d.xlarge", 705 | "vcpuNum": 4 706 | }, 707 | { 708 | "_defaultOrder": 14, 709 | "_isFastLaunch": false, 710 | "category": "General purpose", 711 | "gpuNum": 0, 712 | "hideHardwareSpecs": false, 713 | "memoryGiB": 32, 714 | "name": "ml.m5d.2xlarge", 715 | "vcpuNum": 8 716 | }, 717 | { 718 | "_defaultOrder": 15, 719 | "_isFastLaunch": false, 720 | "category": "General purpose", 721 | "gpuNum": 0, 722 | "hideHardwareSpecs": false, 723 | "memoryGiB": 64, 724 | "name": "ml.m5d.4xlarge", 725 | "vcpuNum": 16 726 | }, 727 | { 728 | "_defaultOrder": 16, 729 | "_isFastLaunch": false, 730 | "category": "General purpose", 731 | "gpuNum": 0, 732 | "hideHardwareSpecs": false, 733 | "memoryGiB": 128, 734 | "name": "ml.m5d.8xlarge", 735 | "vcpuNum": 32 736 | }, 737 | { 738 | "_defaultOrder": 17, 739 | "_isFastLaunch": false, 740 | "category": "General purpose", 741 | "gpuNum": 0, 742 | "hideHardwareSpecs": false, 743 | "memoryGiB": 192, 744 | "name": "ml.m5d.12xlarge", 745 | "vcpuNum": 48 746 | }, 747 | { 748 | "_defaultOrder": 18, 749 | "_isFastLaunch": false, 750 | "category": "General purpose", 751 | "gpuNum": 0, 752 | "hideHardwareSpecs": false, 753 | "memoryGiB": 256, 754 | "name": "ml.m5d.16xlarge", 755 | "vcpuNum": 64 756 | }, 757 | { 758 | "_defaultOrder": 19, 759 | "_isFastLaunch": false, 760 | "category": "General purpose", 761 | "gpuNum": 0, 762 | "hideHardwareSpecs": false, 763 | "memoryGiB": 384, 764 | "name": "ml.m5d.24xlarge", 765 | "vcpuNum": 96 766 | }, 767 | { 768 | "_defaultOrder": 20, 769 | "_isFastLaunch": false, 770 | "category": "General purpose", 771 | "gpuNum": 0, 772 | "hideHardwareSpecs": true, 773 | "memoryGiB": 0, 774 | "name": "ml.geospatial.interactive", 775 | "supportedImageNames": [ 776 | "sagemaker-geospatial-v1-0" 777 | ], 778 | "vcpuNum": 0 779 | }, 780 | { 781 | "_defaultOrder": 21, 782 | "_isFastLaunch": true, 783 | "category": "Compute optimized", 784 | "gpuNum": 0, 785 | "hideHardwareSpecs": false, 786 | "memoryGiB": 4, 787 | "name": "ml.c5.large", 788 | "vcpuNum": 2 789 | }, 790 | { 791 | "_defaultOrder": 22, 792 | "_isFastLaunch": false, 793 | "category": "Compute optimized", 794 | "gpuNum": 0, 795 | "hideHardwareSpecs": false, 796 | "memoryGiB": 8, 797 | "name": "ml.c5.xlarge", 798 | "vcpuNum": 4 799 | }, 800 | { 801 | "_defaultOrder": 23, 802 | "_isFastLaunch": false, 803 | "category": "Compute optimized", 804 | "gpuNum": 0, 805 | "hideHardwareSpecs": false, 806 | "memoryGiB": 16, 807 | "name": "ml.c5.2xlarge", 808 | "vcpuNum": 8 809 | }, 810 | { 811 | "_defaultOrder": 24, 812 | "_isFastLaunch": false, 813 | "category": "Compute optimized", 814 | "gpuNum": 0, 815 | "hideHardwareSpecs": false, 816 | "memoryGiB": 32, 817 | "name": "ml.c5.4xlarge", 818 | "vcpuNum": 16 819 | }, 820 | { 821 | "_defaultOrder": 25, 822 | "_isFastLaunch": false, 823 | "category": "Compute optimized", 824 | "gpuNum": 0, 825 | "hideHardwareSpecs": false, 826 | "memoryGiB": 72, 827 | "name": "ml.c5.9xlarge", 828 | "vcpuNum": 36 829 | }, 830 | { 831 | "_defaultOrder": 26, 832 | "_isFastLaunch": false, 833 | "category": "Compute optimized", 834 | "gpuNum": 0, 835 | "hideHardwareSpecs": false, 836 | "memoryGiB": 96, 837 | "name": "ml.c5.12xlarge", 838 | "vcpuNum": 48 839 | }, 840 | { 841 | "_defaultOrder": 27, 842 | "_isFastLaunch": false, 843 | "category": "Compute optimized", 844 | "gpuNum": 0, 845 | "hideHardwareSpecs": false, 846 | "memoryGiB": 144, 847 | "name": "ml.c5.18xlarge", 848 | "vcpuNum": 72 849 | }, 850 | { 851 | "_defaultOrder": 28, 852 | "_isFastLaunch": false, 853 | "category": "Compute optimized", 854 | "gpuNum": 0, 855 | "hideHardwareSpecs": false, 856 | "memoryGiB": 192, 857 | "name": "ml.c5.24xlarge", 858 | "vcpuNum": 96 859 | }, 860 | { 861 | "_defaultOrder": 29, 862 | "_isFastLaunch": true, 863 | "category": "Accelerated computing", 864 | "gpuNum": 1, 865 | "hideHardwareSpecs": false, 866 | "memoryGiB": 16, 867 | "name": "ml.g4dn.xlarge", 868 | "vcpuNum": 4 869 | }, 870 | { 871 | "_defaultOrder": 30, 872 | "_isFastLaunch": false, 873 | "category": "Accelerated computing", 874 | "gpuNum": 1, 875 | "hideHardwareSpecs": false, 876 | "memoryGiB": 32, 877 | "name": "ml.g4dn.2xlarge", 878 | "vcpuNum": 8 879 | }, 880 | { 881 | "_defaultOrder": 31, 882 | "_isFastLaunch": false, 883 | "category": "Accelerated computing", 884 | "gpuNum": 1, 885 | "hideHardwareSpecs": false, 886 | "memoryGiB": 64, 887 | "name": "ml.g4dn.4xlarge", 888 | "vcpuNum": 16 889 | }, 890 | { 891 | "_defaultOrder": 32, 892 | "_isFastLaunch": false, 893 | "category": "Accelerated computing", 894 | "gpuNum": 1, 895 | "hideHardwareSpecs": false, 896 | "memoryGiB": 128, 897 | "name": "ml.g4dn.8xlarge", 898 | "vcpuNum": 32 899 | }, 900 | { 901 | "_defaultOrder": 33, 902 | "_isFastLaunch": false, 903 | "category": "Accelerated computing", 904 | "gpuNum": 4, 905 | "hideHardwareSpecs": false, 906 | "memoryGiB": 192, 907 | "name": "ml.g4dn.12xlarge", 908 | "vcpuNum": 48 909 | }, 910 | { 911 | "_defaultOrder": 34, 912 | "_isFastLaunch": false, 913 | "category": "Accelerated computing", 914 | "gpuNum": 1, 915 | "hideHardwareSpecs": false, 916 | "memoryGiB": 256, 917 | "name": "ml.g4dn.16xlarge", 918 | "vcpuNum": 64 919 | }, 920 | { 921 | "_defaultOrder": 35, 922 | "_isFastLaunch": false, 923 | "category": "Accelerated computing", 924 | "gpuNum": 1, 925 | "hideHardwareSpecs": false, 926 | "memoryGiB": 61, 927 | "name": "ml.p3.2xlarge", 928 | "vcpuNum": 8 929 | }, 930 | { 931 | "_defaultOrder": 36, 932 | "_isFastLaunch": false, 933 | "category": "Accelerated computing", 934 | "gpuNum": 4, 935 | "hideHardwareSpecs": false, 936 | "memoryGiB": 244, 937 | "name": "ml.p3.8xlarge", 938 | "vcpuNum": 32 939 | }, 940 | { 941 | "_defaultOrder": 37, 942 | "_isFastLaunch": false, 943 | "category": "Accelerated computing", 944 | "gpuNum": 8, 945 | "hideHardwareSpecs": false, 946 | "memoryGiB": 488, 947 | "name": "ml.p3.16xlarge", 948 | "vcpuNum": 64 949 | }, 950 | { 951 | "_defaultOrder": 38, 952 | "_isFastLaunch": false, 953 | "category": "Accelerated computing", 954 | "gpuNum": 8, 955 | "hideHardwareSpecs": false, 956 | "memoryGiB": 768, 957 | "name": "ml.p3dn.24xlarge", 958 | "vcpuNum": 96 959 | }, 960 | { 961 | "_defaultOrder": 39, 962 | "_isFastLaunch": false, 963 | "category": "Memory Optimized", 964 | "gpuNum": 0, 965 | "hideHardwareSpecs": false, 966 | "memoryGiB": 16, 967 | "name": "ml.r5.large", 968 | "vcpuNum": 2 969 | }, 970 | { 971 | "_defaultOrder": 40, 972 | "_isFastLaunch": false, 973 | "category": "Memory Optimized", 974 | "gpuNum": 0, 975 | "hideHardwareSpecs": false, 976 | "memoryGiB": 32, 977 | "name": "ml.r5.xlarge", 978 | "vcpuNum": 4 979 | }, 980 | { 981 | "_defaultOrder": 41, 982 | "_isFastLaunch": false, 983 | "category": "Memory Optimized", 984 | "gpuNum": 0, 985 | "hideHardwareSpecs": false, 986 | "memoryGiB": 64, 987 | "name": "ml.r5.2xlarge", 988 | "vcpuNum": 8 989 | }, 990 | { 991 | "_defaultOrder": 42, 992 | "_isFastLaunch": false, 993 | "category": "Memory Optimized", 994 | "gpuNum": 0, 995 | "hideHardwareSpecs": false, 996 | "memoryGiB": 128, 997 | "name": "ml.r5.4xlarge", 998 | "vcpuNum": 16 999 | }, 1000 | { 1001 | "_defaultOrder": 43, 1002 | "_isFastLaunch": false, 1003 | "category": "Memory Optimized", 1004 | "gpuNum": 0, 1005 | "hideHardwareSpecs": false, 1006 | "memoryGiB": 256, 1007 | "name": "ml.r5.8xlarge", 1008 | "vcpuNum": 32 1009 | }, 1010 | { 1011 | "_defaultOrder": 44, 1012 | "_isFastLaunch": false, 1013 | "category": "Memory Optimized", 1014 | "gpuNum": 0, 1015 | "hideHardwareSpecs": false, 1016 | "memoryGiB": 384, 1017 | "name": "ml.r5.12xlarge", 1018 | "vcpuNum": 48 1019 | }, 1020 | { 1021 | "_defaultOrder": 45, 1022 | "_isFastLaunch": false, 1023 | "category": "Memory Optimized", 1024 | "gpuNum": 0, 1025 | "hideHardwareSpecs": false, 1026 | "memoryGiB": 512, 1027 | "name": "ml.r5.16xlarge", 1028 | "vcpuNum": 64 1029 | }, 1030 | { 1031 | "_defaultOrder": 46, 1032 | "_isFastLaunch": false, 1033 | "category": "Memory Optimized", 1034 | "gpuNum": 0, 1035 | "hideHardwareSpecs": false, 1036 | "memoryGiB": 768, 1037 | "name": "ml.r5.24xlarge", 1038 | "vcpuNum": 96 1039 | }, 1040 | { 1041 | "_defaultOrder": 47, 1042 | "_isFastLaunch": false, 1043 | "category": "Accelerated computing", 1044 | "gpuNum": 1, 1045 | "hideHardwareSpecs": false, 1046 | "memoryGiB": 16, 1047 | "name": "ml.g5.xlarge", 1048 | "vcpuNum": 4 1049 | }, 1050 | { 1051 | "_defaultOrder": 48, 1052 | "_isFastLaunch": false, 1053 | "category": "Accelerated computing", 1054 | "gpuNum": 1, 1055 | "hideHardwareSpecs": false, 1056 | "memoryGiB": 32, 1057 | "name": "ml.g5.2xlarge", 1058 | "vcpuNum": 8 1059 | }, 1060 | { 1061 | "_defaultOrder": 49, 1062 | "_isFastLaunch": false, 1063 | "category": "Accelerated computing", 1064 | "gpuNum": 1, 1065 | "hideHardwareSpecs": false, 1066 | "memoryGiB": 64, 1067 | "name": "ml.g5.4xlarge", 1068 | "vcpuNum": 16 1069 | }, 1070 | { 1071 | "_defaultOrder": 50, 1072 | "_isFastLaunch": false, 1073 | "category": "Accelerated computing", 1074 | "gpuNum": 1, 1075 | "hideHardwareSpecs": false, 1076 | "memoryGiB": 128, 1077 | "name": "ml.g5.8xlarge", 1078 | "vcpuNum": 32 1079 | }, 1080 | { 1081 | "_defaultOrder": 51, 1082 | "_isFastLaunch": false, 1083 | "category": "Accelerated computing", 1084 | "gpuNum": 1, 1085 | "hideHardwareSpecs": false, 1086 | "memoryGiB": 256, 1087 | "name": "ml.g5.16xlarge", 1088 | "vcpuNum": 64 1089 | }, 1090 | { 1091 | "_defaultOrder": 52, 1092 | "_isFastLaunch": false, 1093 | "category": "Accelerated computing", 1094 | "gpuNum": 4, 1095 | "hideHardwareSpecs": false, 1096 | "memoryGiB": 192, 1097 | "name": "ml.g5.12xlarge", 1098 | "vcpuNum": 48 1099 | }, 1100 | { 1101 | "_defaultOrder": 53, 1102 | "_isFastLaunch": false, 1103 | "category": "Accelerated computing", 1104 | "gpuNum": 4, 1105 | "hideHardwareSpecs": false, 1106 | "memoryGiB": 384, 1107 | "name": "ml.g5.24xlarge", 1108 | "vcpuNum": 96 1109 | }, 1110 | { 1111 | "_defaultOrder": 54, 1112 | "_isFastLaunch": false, 1113 | "category": "Accelerated computing", 1114 | "gpuNum": 8, 1115 | "hideHardwareSpecs": false, 1116 | "memoryGiB": 768, 1117 | "name": "ml.g5.48xlarge", 1118 | "vcpuNum": 192 1119 | }, 1120 | { 1121 | "_defaultOrder": 55, 1122 | "_isFastLaunch": false, 1123 | "category": "Accelerated computing", 1124 | "gpuNum": 8, 1125 | "hideHardwareSpecs": false, 1126 | "memoryGiB": 1152, 1127 | "name": "ml.p4d.24xlarge", 1128 | "vcpuNum": 96 1129 | }, 1130 | { 1131 | "_defaultOrder": 56, 1132 | "_isFastLaunch": false, 1133 | "category": "Accelerated computing", 1134 | "gpuNum": 8, 1135 | "hideHardwareSpecs": false, 1136 | "memoryGiB": 1152, 1137 | "name": "ml.p4de.24xlarge", 1138 | "vcpuNum": 96 1139 | }, 1140 | { 1141 | "_defaultOrder": 57, 1142 | "_isFastLaunch": false, 1143 | "category": "Accelerated computing", 1144 | "gpuNum": 0, 1145 | "hideHardwareSpecs": false, 1146 | "memoryGiB": 32, 1147 | "name": "ml.trn1.2xlarge", 1148 | "vcpuNum": 8 1149 | }, 1150 | { 1151 | "_defaultOrder": 58, 1152 | "_isFastLaunch": false, 1153 | "category": "Accelerated computing", 1154 | "gpuNum": 0, 1155 | "hideHardwareSpecs": false, 1156 | "memoryGiB": 512, 1157 | "name": "ml.trn1.32xlarge", 1158 | "vcpuNum": 128 1159 | }, 1160 | { 1161 | "_defaultOrder": 59, 1162 | "_isFastLaunch": false, 1163 | "category": "Accelerated computing", 1164 | "gpuNum": 0, 1165 | "hideHardwareSpecs": false, 1166 | "memoryGiB": 512, 1167 | "name": "ml.trn1n.32xlarge", 1168 | "vcpuNum": 128 1169 | } 1170 | ], 1171 | "instance_type": "ml.m5.large", 1172 | "kernelspec": { 1173 | "display_name": "Python 3 (Data Science 2.0)", 1174 | "language": "python", 1175 | "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:us-east-1:081325390199:image/sagemaker-data-science-38" 1176 | }, 1177 | "language_info": { 1178 | "codemirror_mode": { 1179 | "name": "ipython", 1180 | "version": 3 1181 | }, 1182 | "file_extension": ".py", 1183 | "mimetype": "text/x-python", 1184 | "name": "python", 1185 | "nbconvert_exporter": "python", 1186 | "pygments_lexer": "ipython3", 1187 | "version": "3.8.13" 1188 | } 1189 | }, 1190 | "nbformat": 4, 1191 | "nbformat_minor": 5 1192 | } 1193 | --------------------------------------------------------------------------------