├── code ├── requirements.txt ├── model_def.py ├── inference.py └── cifar10-training-sagemaker.py ├── docker ├── requirements.txt ├── Dockerfile ├── model_def.py ├── inference.py ├── build_docker_push_to_ecr.ipynb └── cifar10-training-sagemaker.py ├── sm_kpf.png ├── 1000_dog.png ├── pipeline.png ├── create_eks_cluster.sh ├── README.md ├── .gitignore ├── generate_cifar10_tfrecords.py ├── kfp-sagemaker-script-mode.ipynb └── kfp-sagemaker-custom-container.ipynb /code/requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow 2 | numpy 3 | -------------------------------------------------------------------------------- /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | Pillow 2 | numpy 3 | -------------------------------------------------------------------------------- /sm_kpf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/kubeflow-pipelines-sagemaker-examples/HEAD/sm_kpf.png -------------------------------------------------------------------------------- /1000_dog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/kubeflow-pipelines-sagemaker-examples/HEAD/1000_dog.png -------------------------------------------------------------------------------- /pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shashankprasanna/kubeflow-pipelines-sagemaker-examples/HEAD/pipeline.png -------------------------------------------------------------------------------- /create_eks_cluster.sh: -------------------------------------------------------------------------------- 1 | eksctl create cluster \ 2 | --name kubeflow-sm \ 3 | --version 1.15 \ 4 | --region us-west-2 \ 5 | --nodegroup-name cpu-nodes \ 6 | --node-type c5.xlarge \ 7 | --nodes 2 \ 8 | --node-volume-size 50 \ 9 | --node-zones us-west-2a \ 10 | --timeout=40m \ 11 | --auto-kubeconfig 12 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.15.2-gpu-py36-cu100-ubuntu18.04 2 | 3 | RUN pip3 install sagemaker-training 4 | 5 | COPY cifar10-training-sagemaker.py /opt/ml/code/cifar10-training-sagemaker.py 6 | COPY model_def.py /opt/ml/code/model_def.py 7 | COPY inference.py /opt/ml/model/code/inference.py 8 | COPY requirements.txt /opt/ml/model/code/requirements.txt 9 | 10 | ENV SAGEMAKER_PROGRAM cifar10-training-sagemaker.py 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Examples for using Amazon SageMaker components in Kubeflow Pipelines 2 | 3 | 4 | 5 | ![Demo_pipeline](pipeline.png) 6 | 7 | ### Blog post: https://aws.amazon.com/blogs/machine-learning/introducing-amazon-sagemaker-components-for-kubeflow-pipelines/ 8 | 9 | ### Video with demo walkthrough: https://www.youtube.com/watch?v=LKmkiUdhV58 10 | 11 | #### First component: 12 | Runs an Amazon SageMaker hyperparameter tuning job to optimize the following hyperparameters: 13 | 14 | * learning-rate: [0.0001, 0.1] log scale 15 | * optimizer : [sgd, adam] 16 | * batch-size: [32, 128, 256] 17 | * model-type: 
[resnet, custom] 18 | 19 | **Input**: N/A
20 | **Output**: best hyperparameters 21 | 22 | #### Second component: 23 | During the hyperparameter search in the previous step, models are trained for only 10 epochs to identify well-performing hyperparameters. This step takes the best hyperparameters and increases the number of epochs to 80 so that they can deliver higher accuracy in the next step. 24 | 25 | **Input**: best hyperparameters
26 | **Output**: best hyperparameters with updated epochs (80) 27 | 28 | #### Third component: 29 | Runs an Amazon SageMaker training job using the best hyperparameters for the larger number of epochs. 30 | 31 | **Input**: best hyperparameters with updated epochs (80)
32 | **Output**: training job name 33 | 34 | #### Fourth component: 35 | Creates an Amazon SageMaker model from the training job's model artifact 36 | 37 | **Input**: training job name
38 | **Output**: model name 39 | 40 | #### Fifth component: 41 | Deploys the model to an Amazon SageMaker hosting endpoint 42 | 43 | **Input**: model name
44 | **Output**: N/A 45 | 46 | ## Mapping between Amazon SageMaker Kubeflow Pipeline component and Amazon SageMaker capabilities: 47 | 48 | ![mapping](sm_kpf.png) 49 | -------------------------------------------------------------------------------- /code/model_def.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Activation, Conv2D, Dense, Dropout, Flatten, MaxPooling2D, BatchNormalization 3 | from tensorflow.keras.models import Sequential 4 | from tensorflow.keras.optimizers import Adam, SGD, RMSprop 5 | 6 | NUM_CLASSES = 10 7 | 8 | def get_custom_model(input_shape, learning_rate, weight_decay, optimizer, momentum): 9 | 10 | model = Sequential() 11 | model.add(Conv2D(32, (3, 3), padding='same', input_shape=input_shape)) 12 | model.add(BatchNormalization()) 13 | model.add(Activation('relu')) 14 | model.add(Conv2D(32, (3, 3))) 15 | model.add(BatchNormalization()) 16 | model.add(Activation('relu')) 17 | model.add(MaxPooling2D(pool_size=(2, 2))) 18 | model.add(Dropout(0.2)) 19 | 20 | model.add(Conv2D(64, (3, 3), padding='same')) 21 | model.add(BatchNormalization()) 22 | model.add(Activation('relu')) 23 | model.add(Conv2D(64, (3, 3))) 24 | model.add(BatchNormalization()) 25 | model.add(Activation('relu')) 26 | model.add(MaxPooling2D(pool_size=(2, 2))) 27 | model.add(Dropout(0.3)) 28 | 29 | model.add(Conv2D(128, (3, 3), padding='same')) 30 | model.add(BatchNormalization()) 31 | model.add(Activation('relu')) 32 | model.add(Conv2D(128, (3, 3))) 33 | model.add(BatchNormalization()) 34 | model.add(Activation('relu')) 35 | model.add(MaxPooling2D(pool_size=(2, 2))) 36 | model.add(Dropout(0.4)) 37 | 38 | model.add(Flatten()) 39 | model.add(Dense(256)) 40 | model.add(Activation('relu')) 41 | model.add(Dropout(0.5)) 42 | model.add(Dense(NUM_CLASSES)) 43 | model.add(Activation('softmax')) 44 | 45 | if optimizer.lower() == 'sgd': 46 | opt = SGD(lr=learning_rate, decay=weight_decay, momentum=momentum) 47 | elif optimizer.lower() == 'rmsprop': 48 | opt = RMSprop(lr=learning_rate, decay=weight_decay) 49 | else: 50 | opt = Adam(lr=learning_rate, decay=weight_decay) 51 | 52 | return model 53 | 54 | -------------------------------------------------------------------------------- /docker/model_def.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow.keras.layers import Activation, Conv2D, Dense, Dropout, Flatten, MaxPooling2D, BatchNormalization 3 | from tensorflow.keras.models import Sequential 4 | from tensorflow.keras.optimizers import Adam, SGD, RMSprop 5 | 6 | NUM_CLASSES = 10 7 | 8 | def get_custom_model(input_shape, learning_rate, weight_decay, optimizer, momentum): 9 | 10 | model = Sequential() 11 | model.add(Conv2D(32, (3, 3), padding='same', input_shape=input_shape)) 12 | model.add(BatchNormalization()) 13 | model.add(Activation('relu')) 14 | model.add(Conv2D(32, (3, 3))) 15 | model.add(BatchNormalization()) 16 | model.add(Activation('relu')) 17 | model.add(MaxPooling2D(pool_size=(2, 2))) 18 | model.add(Dropout(0.2)) 19 | 20 | model.add(Conv2D(64, (3, 3), padding='same')) 21 | model.add(BatchNormalization()) 22 | model.add(Activation('relu')) 23 | model.add(Conv2D(64, (3, 3))) 24 | model.add(BatchNormalization()) 25 | model.add(Activation('relu')) 26 | model.add(MaxPooling2D(pool_size=(2, 2))) 27 | model.add(Dropout(0.3)) 28 | 29 | model.add(Conv2D(128, (3, 3), padding='same')) 30 | model.add(BatchNormalization()) 31 | 
model.add(Activation('relu')) 32 | model.add(Conv2D(128, (3, 3))) 33 | model.add(BatchNormalization()) 34 | model.add(Activation('relu')) 35 | model.add(MaxPooling2D(pool_size=(2, 2))) 36 | model.add(Dropout(0.4)) 37 | 38 | model.add(Flatten()) 39 | model.add(Dense(256)) 40 | model.add(Activation('relu')) 41 | model.add(Dropout(0.5)) 42 | model.add(Dense(NUM_CLASSES)) 43 | model.add(Activation('softmax')) 44 | 45 | if optimizer.lower() == 'sgd': 46 | opt = SGD(lr=learning_rate, decay=weight_decay, momentum=momentum) 47 | elif optimizer.lower() == 'rmsprop': 48 | opt = RMSprop(lr=learning_rate, decay=weight_decay) 49 | else: 50 | opt = Adam(lr=learning_rate, decay=weight_decay) 51 | 52 | return model 53 | 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /code/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. 
or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | 14 | import io 15 | import json 16 | import numpy as np 17 | from collections import namedtuple 18 | from PIL import Image 19 | 20 | Context = namedtuple('Context', 21 | 'model_name, model_version, method, rest_uri, grpc_uri, ' 22 | 'custom_attributes, request_content_type, accept_header') 23 | 24 | 25 | def input_handler(data, context): 26 | """ Pre-process request input before it is sent to TensorFlow Serving REST API 27 | 28 | Args: 29 | data (obj): the request data, in format of dict or string 30 | context (Context): an object containing request and configuration details 31 | 32 | Returns: 33 | (dict): a JSON-serializable dict that contains request body and headers 34 | """ 35 | 36 | if context.request_content_type == 'application/x-image': 37 | 38 | image_as_bytes = io.BytesIO(data.read()) 39 | image = Image.open(image_as_bytes) 40 | instance = np.expand_dims(image, axis=0) 41 | return json.dumps({"instances": instance.tolist()}) 42 | 43 | else: 44 | _return_error(415, 'Unsupported content type "{}"'.format(context.request_content_type or 'Unknown')) 45 | 46 | 47 | def output_handler(data, context): 48 | """Post-process TensorFlow Serving output before it is returned to the client. 49 | 50 | Args: 51 | data (obj): the TensorFlow serving response 52 | context (Context): an object containing request and configuration details 53 | 54 | Returns: 55 | (bytes, string): data to return to client, response content type 56 | """ 57 | if data.status_code != 200: 58 | raise Exception(data.content.decode('utf-8')) 59 | response_content_type = context.accept_header 60 | prediction = data.content 61 | return prediction, response_content_type 62 | 63 | 64 | def _return_error(code, message): 65 | raise ValueError('Error: {}, {}'.format(str(code), message)) 66 | -------------------------------------------------------------------------------- /docker/inference.py: -------------------------------------------------------------------------------- 1 | # Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
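# Inference handlers for the SageMaker TensorFlow Serving container:
# input_handler turns an 'application/x-image' request into a JSON {"instances": [...]} payload,
# and output_handler returns the TensorFlow Serving response to the client unchanged.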
13 | 14 | import io 15 | import json 16 | import numpy as np 17 | from collections import namedtuple 18 | from PIL import Image 19 | 20 | Context = namedtuple('Context', 21 | 'model_name, model_version, method, rest_uri, grpc_uri, ' 22 | 'custom_attributes, request_content_type, accept_header') 23 | 24 | 25 | def input_handler(data, context): 26 | """ Pre-process request input before it is sent to TensorFlow Serving REST API 27 | 28 | Args: 29 | data (obj): the request data, in format of dict or string 30 | context (Context): an object containing request and configuration details 31 | 32 | Returns: 33 | (dict): a JSON-serializable dict that contains request body and headers 34 | """ 35 | 36 | if context.request_content_type == 'application/x-image': 37 | 38 | image_as_bytes = io.BytesIO(data.read()) 39 | image = Image.open(image_as_bytes) 40 | instance = np.expand_dims(image, axis=0) 41 | return json.dumps({"instances": instance.tolist()}) 42 | 43 | else: 44 | _return_error(415, 'Unsupported content type "{}"'.format(context.request_content_type or 'Unknown')) 45 | 46 | 47 | def output_handler(data, context): 48 | """Post-process TensorFlow Serving output before it is returned to the client. 49 | 50 | Args: 51 | data (obj): the TensorFlow serving response 52 | context (Context): an object containing request and configuration details 53 | 54 | Returns: 55 | (bytes, string): data to return to client, response content type 56 | """ 57 | if data.status_code != 200: 58 | raise Exception(data.content.decode('utf-8')) 59 | response_content_type = context.accept_header 60 | prediction = data.content 61 | return prediction, response_content_type 62 | 63 | 64 | def _return_error(code, message): 65 | raise ValueError('Error: {}, {}'.format(str(code), message)) 66 | -------------------------------------------------------------------------------- /docker/build_docker_push_to_ecr.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import boto3\n", 10 | "sess = boto3.Session()" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "**Select a name for the ECR repository**" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "ecr_repo_name = 'sagemaker-kubernetes'\n", 27 | "\n", 28 | "region = boto3.Session().region_name\n", 29 | "account = boto3.client('sts').get_caller_identity().get('Account')\n", 30 | "image = '{}.dkr.ecr.{}.amazonaws.com/{}:latest'.format(account, region, ecr_repo_name)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "markdown", 35 | "metadata": {}, 36 | "source": [ 37 | "**Build a container image locally with training scripts**" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "!$(aws ecr get-login --no-include-email --region us-west-2 --registry-ids 763104351884)\n", 47 | "!docker build -t {ecr_repo_name}:latest ." 
48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "**Create an ECR repository if one doesn't exist**" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "!aws ecr create-repository --repository-name {ecr_repo_name}" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "**Push image to ECR**" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "!$(aws ecr get-login --no-include-email --region {region})\n", 80 | "!docker tag {ecr_repo_name}:latest {image}\n", 81 | "!docker push {image}" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Environment (conda_python3)", 95 | "language": "python", 96 | "name": "conda_python3" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": { 100 | "name": "ipython", 101 | "version": 3 102 | }, 103 | "file_extension": ".py", 104 | "mimetype": "text/x-python", 105 | "name": "python", 106 | "nbconvert_exporter": "python", 107 | "pygments_lexer": "ipython3", 108 | "version": "3.6.5" 109 | } 110 | }, 111 | "nbformat": 4, 112 | "nbformat_minor": 4 113 | } 114 | -------------------------------------------------------------------------------- /generate_cifar10_tfrecords.py: -------------------------------------------------------------------------------- 1 | # Copyright 2017 The TensorFlow Authors. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # ============================================================================== 15 | """Read CIFAR-10 data from pickled numpy arrays and writes TFRecords. 16 | 17 | Generates tf.train.Example protos and writes them to TFRecord files from the 18 | python version of the CIFAR-10 dataset downloaded from 19 | https://www.cs.toronto.edu/~kriz/cifar.html. 20 | """ 21 | 22 | from __future__ import absolute_import 23 | from __future__ import division 24 | from __future__ import print_function 25 | 26 | import argparse 27 | import os 28 | import sys 29 | 30 | import tarfile 31 | from six.moves import cPickle as pickle 32 | from six.moves import xrange # pylint: disable=redefined-builtin 33 | import tensorflow as tf 34 | 35 | tf.logging.set_verbosity(tf.logging.ERROR) 36 | if type(tf.contrib) != type(tf): tf.contrib._warning = None 37 | 38 | CIFAR_FILENAME = 'cifar-10-python.tar.gz' 39 | CIFAR_DOWNLOAD_URL = 'https://www.cs.toronto.edu/~kriz/' + CIFAR_FILENAME 40 | CIFAR_LOCAL_FOLDER = 'cifar-10-batches-py' 41 | 42 | 43 | def download_and_extract(data_dir): 44 | # download CIFAR-10 if not already downloaded. 
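  # maybe_download (a tf.contrib API, so TensorFlow 1.x is required) skips the download
  # when the archive is already present in data_dir; the tarball is then extracted in place.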
45 | tf.contrib.learn.datasets.base.maybe_download(CIFAR_FILENAME, data_dir, 46 | CIFAR_DOWNLOAD_URL) 47 | tarfile.open(os.path.join(data_dir, CIFAR_FILENAME), 48 | 'r:gz').extractall(data_dir) 49 | 50 | 51 | def _int64_feature(value): 52 | return tf.train.Feature(int64_list=tf.train.Int64List(value=[value])) 53 | 54 | 55 | def _bytes_feature(value): 56 | return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) 57 | 58 | 59 | def _get_file_names(): 60 | """Returns the file names expected to exist in the input_dir.""" 61 | file_names = {} 62 | file_names['train'] = ['data_batch_%d' % i for i in xrange(1, 5)] 63 | file_names['validation'] = ['data_batch_5'] 64 | file_names['eval'] = ['test_batch'] 65 | return file_names 66 | 67 | 68 | def read_pickle_from_file(filename): 69 | with tf.gfile.Open(filename, 'rb') as f: 70 | if sys.version_info >= (3, 0): 71 | data_dict = pickle.load(f, encoding='bytes') 72 | else: 73 | data_dict = pickle.load(f) 74 | return data_dict 75 | 76 | 77 | def convert_to_tfrecord(input_files, output_file): 78 | """Converts a file to TFRecords.""" 79 | print('Generating %s' % output_file) 80 | with tf.python_io.TFRecordWriter(output_file) as record_writer: 81 | for input_file in input_files: 82 | data_dict = read_pickle_from_file(input_file) 83 | data = data_dict[b'data'] 84 | labels = data_dict[b'labels'] 85 | 86 | num_entries_in_batch = len(labels) 87 | for i in range(num_entries_in_batch): 88 | example = tf.train.Example(features=tf.train.Features( 89 | feature={ 90 | 'image': _bytes_feature(data[i].tobytes()), 91 | 'label': _int64_feature(labels[i]) 92 | })) 93 | record_writer.write(example.SerializeToString()) 94 | 95 | 96 | def main(data_dir): 97 | print('Download from {} and extract.'.format(CIFAR_DOWNLOAD_URL)) 98 | download_and_extract(data_dir) 99 | file_names = _get_file_names() 100 | input_dir = os.path.join(data_dir, CIFAR_LOCAL_FOLDER) 101 | for mode, files in file_names.items(): 102 | input_files = [os.path.join(input_dir, f) for f in files] 103 | output_file = os.path.join(data_dir+'/'+mode, mode + '.tfrecords') 104 | if not os.path.exists(data_dir+'/'+mode): 105 | os.makedirs(data_dir+'/'+mode) 106 | try: 107 | os.remove(output_file) 108 | except OSError: 109 | pass 110 | # Convert to tf.train.Example and write the to TFRecords. 
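    # convert_to_tfrecord (defined above) writes one tf.train.Example per image,
    # storing the raw pixel bytes under 'image' and the integer class label under 'label'.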
111 | convert_to_tfrecord(input_files, output_file) 112 | print('Done!') 113 | import shutil 114 | shutil.rmtree(data_dir+'/cifar-10-batches-py') 115 | os.remove(data_dir+'/cifar-10-python.tar.gz') 116 | 117 | 118 | if __name__ == '__main__': 119 | parser = argparse.ArgumentParser() 120 | parser.add_argument( 121 | '--data-dir', 122 | type=str, 123 | default='', 124 | help='Directory to download and extract CIFAR-10 to.') 125 | 126 | args = parser.parse_args() 127 | main(args.data_dir) 128 | -------------------------------------------------------------------------------- /code/cifar10-training-sagemaker.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import keras 3 | from tensorflow.keras.callbacks import ModelCheckpoint 4 | from tensorflow.keras.layers import Input, Dense, Flatten 5 | from tensorflow.keras.models import Model, load_model 6 | from tensorflow.keras.optimizers import Adam, SGD 7 | from model_def import get_custom_model 8 | import time 9 | import argparse 10 | import os 11 | import re 12 | 13 | HEIGHT = 32 14 | WIDTH = 32 15 | DEPTH = 3 16 | NUM_CLASSES = 10 17 | 18 | # Copy inference pre/post-processing script so that it'll be included in the model package 19 | os.system('mkdir /opt/ml/model/code') 20 | os.system('cp inference.py /opt/ml/model/code') 21 | os.system('cp requirements.txt /opt/ml/model/code') 22 | 23 | def single_example_parser(serialized_example): 24 | """Parses a single tf.Example into image and label tensors.""" 25 | # Dimensions of the images in the CIFAR-10 dataset. 26 | # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the 27 | # input format. 28 | features = tf.io.parse_single_example( 29 | serialized_example, 30 | features={ 31 | 'image': tf.io.FixedLenFeature([], tf.string), 32 | 'label': tf.io.FixedLenFeature([], tf.int64), 33 | }) 34 | image = tf.decode_raw(features['image'], tf.uint8) 35 | image.set_shape([DEPTH * HEIGHT * WIDTH]) 36 | 37 | # Reshape from [depth * height * width] to [depth, height, width]. 38 | image = tf.cast( 39 | tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), 40 | tf.float32) 41 | label = tf.cast(features['label'], tf.int32) 42 | 43 | image = train_preprocess_fn(image) 44 | label = tf.one_hot(label, NUM_CLASSES) 45 | 46 | return image, label 47 | 48 | def train_preprocess_fn(image): 49 | 50 | # Resize the image to add four extra pixels on each side. 51 | image = tf.image.resize_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8) 52 | 53 | # Randomly crop a [HEIGHT, WIDTH] section of the image. 54 | image = tf.image.random_crop(image, [HEIGHT, WIDTH, DEPTH]) 55 | 56 | # Randomly flip the image horizontally. 57 | image = tf.image.random_flip_left_right(image) 58 | return image 59 | 60 | def get_dataset(filenames, batch_size): 61 | """Read the images and labels from 'filenames'.""" 62 | # Repeat infinitely. 63 | dataset = tf.data.TFRecordDataset(filenames).repeat().shuffle(10000) 64 | 65 | # Parse records. 66 | dataset = dataset.map(single_example_parser, num_parallel_calls=tf.data.experimental.AUTOTUNE) 67 | 68 | # Batch it up. 
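    # drop_remainder=True keeps every batch the same size; since the dataset repeats
    # indefinitely, steps_per_epoch/validation_steps in model.fit() bound each epoch.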
69 | dataset = dataset.batch(batch_size, drop_remainder=True) 70 | return dataset 71 | 72 | def get_model(model_type, input_shape, learning_rate, weight_decay, optimizer, momentum): 73 | input_tensor = Input(shape=input_shape) 74 | if model_type == 'resnet': 75 | base_model = keras.applications.resnet50.ResNet50(include_top=False, 76 | weights='imagenet', 77 | input_tensor=input_tensor, 78 | input_shape=input_shape, 79 | classes=None) 80 | x = Flatten()(base_model.output) 81 | predictions = Dense(NUM_CLASSES, activation='softmax')(x) 82 | model = Model(inputs=base_model.input, outputs=predictions) 83 | 84 | elif model_type == 'vgg': 85 | base_model = keras.applications.vgg19.VGG19(include_top=False, 86 | weights=None, 87 | input_tensor=input_tensor, 88 | input_shape=input_shape, 89 | classes=None) 90 | x = Flatten()(base_model.output) 91 | predictions = Dense(NUM_CLASSES, activation='softmax')(x) 92 | model = Model(inputs=base_model.input, outputs=predictions) 93 | 94 | else: 95 | model = get_custom_model(input_shape, learning_rate, weight_decay, optimizer, momentum) 96 | 97 | return model 98 | 99 | def main(args): 100 | # Hyper-parameters 101 | epochs = args.epochs 102 | lr = args.learning_rate 103 | batch_size = args.batch_size 104 | momentum = args.momentum 105 | weight_decay = args.weight_decay 106 | optimizer = args.optimizer 107 | model_type = args.model_type 108 | 109 | # SageMaker options 110 | training_dir = args.training 111 | validation_dir = args.validation 112 | eval_dir = args.eval 113 | 114 | train_dataset = get_dataset(training_dir+'/train.tfrecords', batch_size) 115 | val_dataset = get_dataset(validation_dir+'/validation.tfrecords', batch_size) 116 | eval_dataset = get_dataset(eval_dir+'/eval.tfrecords', batch_size) 117 | 118 | input_shape = (HEIGHT, WIDTH, DEPTH) 119 | 120 | # Load model 121 | model = get_model(model_type, input_shape, lr, weight_decay, optimizer, momentum) 122 | 123 | # Optimizer 124 | if optimizer.lower() == 'sgd': 125 | opt = SGD(lr=lr, decay=weight_decay, momentum=momentum) 126 | else: 127 | opt = Adam(lr=lr, decay=weight_decay) 128 | 129 | # Compile model 130 | model.compile(optimizer=opt, 131 | loss='categorical_crossentropy', 132 | metrics=['accuracy']) 133 | 134 | # Train model 135 | history = model.fit(train_dataset, steps_per_epoch=40000 // batch_size, 136 | validation_data=val_dataset, 137 | validation_steps=10000 // batch_size, 138 | epochs=epochs) 139 | 140 | # Evaluate model performance 141 | score = model.evaluate(eval_dataset, steps=10000 // batch_size, verbose=1) 142 | print('Test loss :', score[0]) 143 | print('Test accuracy:', score[1]) 144 | 145 | # Save model to model directory 146 | model.save(f'{os.environ["SM_MODEL_DIR"]}/{time.strftime("%m%d%H%M%S", time.gmtime())}', save_format='tf') 147 | 148 | if __name__ == "__main__": 149 | 150 | parser = argparse.ArgumentParser() 151 | # Hyper-parameters 152 | parser.add_argument('--epochs', type=int, default=10) 153 | parser.add_argument('--learning-rate', type=float, default=0.01) 154 | parser.add_argument('--batch-size', type=int, default=128) 155 | parser.add_argument('--weight-decay', type=float, default=2e-4) 156 | parser.add_argument('--momentum', type=float, default='0.9') 157 | parser.add_argument('--optimizer', type=str, default='sgd') 158 | parser.add_argument('--model-type', type=str, default='resnet') 159 | 160 | # SageMaker parameters 161 | parser.add_argument('--model_dir', type=str, default=os.environ['SM_MODEL_DIR']) 162 | parser.add_argument('--training', type=str, 
default=os.environ['SM_CHANNEL_TRAIN']) 163 | parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION']) 164 | parser.add_argument('--eval', type=str, default=os.environ['SM_CHANNEL_EVAL']) 165 | 166 | args = parser.parse_args() 167 | main(args) 168 | -------------------------------------------------------------------------------- /docker/cifar10-training-sagemaker.py: -------------------------------------------------------------------------------- 1 | import tensorflow as tf 2 | from tensorflow import keras 3 | from tensorflow.keras.callbacks import ModelCheckpoint 4 | from tensorflow.keras.layers import Input, Dense, Flatten 5 | from tensorflow.keras.models import Model, load_model 6 | from tensorflow.keras.optimizers import Adam, SGD 7 | from model_def import get_custom_model 8 | import time 9 | import argparse 10 | import os 11 | import re 12 | 13 | HEIGHT = 32 14 | WIDTH = 32 15 | DEPTH = 3 16 | NUM_CLASSES = 10 17 | 18 | # Copy inference pre/post-processing script so that it'll be included in the model package 19 | os.system('mkdir /opt/ml/model/code') 20 | os.system('cp inference.py /opt/ml/model/code') 21 | os.system('cp requirements.txt /opt/ml/model/code') 22 | 23 | def single_example_parser(serialized_example): 24 | """Parses a single tf.Example into image and label tensors.""" 25 | # Dimensions of the images in the CIFAR-10 dataset. 26 | # See http://www.cs.toronto.edu/~kriz/cifar.html for a description of the 27 | # input format. 28 | features = tf.io.parse_single_example( 29 | serialized_example, 30 | features={ 31 | 'image': tf.io.FixedLenFeature([], tf.string), 32 | 'label': tf.io.FixedLenFeature([], tf.int64), 33 | }) 34 | image = tf.decode_raw(features['image'], tf.uint8) 35 | image.set_shape([DEPTH * HEIGHT * WIDTH]) 36 | 37 | # Reshape from [depth * height * width] to [depth, height, width]. 38 | image = tf.cast( 39 | tf.transpose(tf.reshape(image, [DEPTH, HEIGHT, WIDTH]), [1, 2, 0]), 40 | tf.float32) 41 | label = tf.cast(features['label'], tf.int32) 42 | 43 | image = train_preprocess_fn(image) 44 | label = tf.one_hot(label, NUM_CLASSES) 45 | 46 | return image, label 47 | 48 | def train_preprocess_fn(image): 49 | 50 | # Resize the image to add four extra pixels on each side. 51 | image = tf.image.resize_with_crop_or_pad(image, HEIGHT + 8, WIDTH + 8) 52 | 53 | # Randomly crop a [HEIGHT, WIDTH] section of the image. 54 | image = tf.image.random_crop(image, [HEIGHT, WIDTH, DEPTH]) 55 | 56 | # Randomly flip the image horizontally. 57 | image = tf.image.random_flip_left_right(image) 58 | return image 59 | 60 | def get_dataset(filenames, batch_size): 61 | """Read the images and labels from 'filenames'.""" 62 | # Repeat infinitely. 63 | dataset = tf.data.TFRecordDataset(filenames).repeat().shuffle(10000) 64 | 65 | # Parse records. 66 | dataset = dataset.map(single_example_parser, num_parallel_calls=tf.data.experimental.AUTOTUNE) 67 | 68 | # Batch it up. 
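    # drop_remainder=True keeps every batch the same size; since the dataset repeats
    # indefinitely, steps_per_epoch/validation_steps in model.fit() bound each epoch.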
69 | dataset = dataset.batch(batch_size, drop_remainder=True) 70 | return dataset 71 | 72 | def get_model(model_type, input_shape, learning_rate, weight_decay, optimizer, momentum): 73 | input_tensor = Input(shape=input_shape) 74 | if model_type == 'resnet': 75 | base_model = keras.applications.resnet50.ResNet50(include_top=False, 76 | weights='imagenet', 77 | input_tensor=input_tensor, 78 | input_shape=input_shape, 79 | classes=None) 80 | x = Flatten()(base_model.output) 81 | predictions = Dense(NUM_CLASSES, activation='softmax')(x) 82 | model = Model(inputs=base_model.input, outputs=predictions) 83 | 84 | elif model_type == 'vgg': 85 | base_model = keras.applications.vgg19.VGG19(include_top=False, 86 | weights=None, 87 | input_tensor=input_tensor, 88 | input_shape=input_shape, 89 | classes=None) 90 | x = Flatten()(base_model.output) 91 | predictions = Dense(NUM_CLASSES, activation='softmax')(x) 92 | model = Model(inputs=base_model.input, outputs=predictions) 93 | 94 | else: 95 | model = get_custom_model(input_shape, learning_rate, weight_decay, optimizer, momentum) 96 | 97 | return model 98 | 99 | def main(args): 100 | # Hyper-parameters 101 | epochs = args.epochs 102 | lr = args.learning_rate 103 | batch_size = args.batch_size 104 | momentum = args.momentum 105 | weight_decay = args.weight_decay 106 | optimizer = args.optimizer 107 | model_type = args.model_type 108 | 109 | # SageMaker options 110 | training_dir = args.training 111 | validation_dir = args.validation 112 | eval_dir = args.eval 113 | 114 | train_dataset = get_dataset(training_dir+'/train.tfrecords', batch_size) 115 | val_dataset = get_dataset(validation_dir+'/validation.tfrecords', batch_size) 116 | eval_dataset = get_dataset(eval_dir+'/eval.tfrecords', batch_size) 117 | 118 | input_shape = (HEIGHT, WIDTH, DEPTH) 119 | 120 | # Load model 121 | model = get_model(model_type, input_shape, lr, weight_decay, optimizer, momentum) 122 | 123 | # Optimizer 124 | if optimizer.lower() == 'sgd': 125 | opt = SGD(lr=lr, decay=weight_decay, momentum=momentum) 126 | else: 127 | opt = Adam(lr=lr, decay=weight_decay) 128 | 129 | # Compile model 130 | model.compile(optimizer=opt, 131 | loss='categorical_crossentropy', 132 | metrics=['accuracy']) 133 | 134 | # Train model 135 | history = model.fit(train_dataset, steps_per_epoch=40000 // batch_size, 136 | validation_data=val_dataset, 137 | validation_steps=10000 // batch_size, 138 | epochs=epochs) 139 | 140 | # Evaluate model performance 141 | score = model.evaluate(eval_dataset, steps=10000 // batch_size, verbose=1) 142 | print('Test loss :', score[0]) 143 | print('Test accuracy:', score[1]) 144 | 145 | # Save model to model directory 146 | model.save(f'{os.environ["SM_MODEL_DIR"]}/{time.strftime("%m%d%H%M%S", time.gmtime())}', save_format='tf') 147 | 148 | if __name__ == "__main__": 149 | 150 | parser = argparse.ArgumentParser() 151 | # Hyper-parameters 152 | parser.add_argument('--epochs', type=int, default=10) 153 | parser.add_argument('--learning-rate', type=float, default=0.01) 154 | parser.add_argument('--batch-size', type=int, default=128) 155 | parser.add_argument('--weight-decay', type=float, default=2e-4) 156 | parser.add_argument('--momentum', type=float, default='0.9') 157 | parser.add_argument('--optimizer', type=str, default='sgd') 158 | parser.add_argument('--model-type', type=str, default='resnet') 159 | 160 | # SageMaker parameters 161 | parser.add_argument('--model_dir', type=str, default=os.environ['SM_MODEL_DIR']) 162 | parser.add_argument('--training', type=str, 
default=os.environ['SM_CHANNEL_TRAIN']) 163 | parser.add_argument('--validation', type=str, default=os.environ['SM_CHANNEL_VALIDATION']) 164 | parser.add_argument('--eval', type=str, default=os.environ['SM_CHANNEL_EVAL']) 165 | 166 | args = parser.parse_args() 167 | main(args) 168 | -------------------------------------------------------------------------------- /kfp-sagemaker-script-mode.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!pip install kfp --upgrade\n", 10 | "!which dsl-compile" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Amazon SageMaker Components for Kubeflow Pipelines - script mode\n", 18 | "In this example we'll build a Kubeflow pipeline where every component call a different Amazon SageMaker feature.\n", 19 | "Our simple pipeline will perform:\n", 20 | "\n", 21 | "1. Hyperparameter optimization \n", 22 | "1. Select best hyperparameters and increase epochs\n", 23 | "1. Training model on the best hyperparameters \n", 24 | "1. Create an Amazon SageMaker model\n", 25 | "1. Deploy model" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import kfp\n", 35 | "from kfp import components\n", 36 | "from kfp.components import func_to_container_op\n", 37 | "from kfp import dsl\n", 38 | "import time, os, json" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "sagemaker_hpo_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/hyperparameter_tuning/component.yaml')\n", 55 | "sagemaker_train_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/train/component.yaml')\n", 56 | "sagemaker_model_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/model/component.yaml')\n", 57 | "sagemaker_deploy_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/deploy/component.yaml')" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "import sagemaker\n", 67 | "import boto3\n", 68 | "\n", 69 | "sess = boto3.Session()\n", 70 | "sm = sess.client('sagemaker')\n", 71 | "role = sagemaker.get_execution_role()\n", 72 | "sagemaker_session = sagemaker.Session(boto_session=sess)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "markdown", 77 | "metadata": {}, 78 | "source": [ 79 | "#### Prepare training datasets and upload to Amazon S3" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": {}, 86 | "outputs": [], 87 | "source": [ 88 | "bucket_name = sagemaker_session.default_bucket()\n", 89 | "job_folder = 'jobs'\n", 90 | "dataset_folder = 'datasets'\n", 91 | "local_dataset = 'cifar10'\n", 
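    "# TensorFlow is required to download and convert the dataset to TFRecord format\n",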
92 | "\n", 93 | "!python generate_cifar10_tfrecords.py --data-dir {local_dataset}\n", 94 | "datasets = sagemaker_session.upload_data(path='cifar10', key_prefix='datasets/cifar10-dataset')\n", 95 | "\n", 96 | "# If dataset is already in S3 use the dataset's path:\n", 97 | "# datasets = 's3://{bucket_name}/{dataset_folder}/cifar10-dataset'" 98 | ] 99 | }, 100 | { 101 | "cell_type": "markdown", 102 | "metadata": {}, 103 | "source": [ 104 | "#### Upload training scripts to Amazon S3" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "!tar cvfz sourcedir.tar.gz --exclude=\".ipynb*\" -C code .\n", 114 | "source_s3 = sagemaker_session.upload_data(path='sourcedir.tar.gz', key_prefix='training-scripts')\n", 115 | "print('\\nUploaded to S3 location:')\n", 116 | "print(source_s3)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "#### Create a custom pipeline op\n", 124 | "Takes the results from a hyperparameter tuning job and increases the number of epochs for the next training job" 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "def update_best_model_hyperparams(hpo_results, best_model_epoch = \"80\") -> str:\n", 134 | " import json\n", 135 | " r = json.loads(str(hpo_results))\n", 136 | " return json.dumps(dict(r,epochs=best_model_epoch))\n", 137 | "\n", 138 | "get_best_hyp_op = func_to_container_op(update_best_model_hyperparams)" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "#### Create a pipeline" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "@dsl.pipeline(\n", 155 | " name='cifar10 hpo train deploy pipeline',\n", 156 | " description='cifar10 hpo train deploy pipeline using sagemaker'\n", 157 | ")\n", 158 | "def cifar10_hpo_train_deploy(region='us-west-2',\n", 159 | " training_input_mode='File',\n", 160 | " train_image='763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-training:1.15.2-gpu-py36-cu100-ubuntu18.04',\n", 161 | " serving_image='763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-inference:1.15.2-cpu',\n", 162 | " volume_size='50',\n", 163 | " max_run_time='86400',\n", 164 | " instance_type='ml.p3.2xlarge',\n", 165 | " network_isolation='False',\n", 166 | " traffic_encryption='False',\n", 167 | " spot_instance='False',\n", 168 | " channels='[ \\\n", 169 | " { \\\n", 170 | " \"ChannelName\": \"train\", \\\n", 171 | " \"DataSource\": { \\\n", 172 | " \"S3DataSource\": { \\\n", 173 | " \"S3DataType\": \"S3Prefix\", \\\n", 174 | " \"S3Uri\": \"'+datasets+'/train\", \\\n", 175 | " \"S3DataDistributionType\": \"FullyReplicated\" \\\n", 176 | " } \\\n", 177 | " }, \\\n", 178 | " \"CompressionType\": \"None\", \\\n", 179 | " \"RecordWrapperType\": \"None\" \\\n", 180 | " }, \\\n", 181 | " { \\\n", 182 | " \"ChannelName\": \"validation\", \\\n", 183 | " \"DataSource\": { \\\n", 184 | " \"S3DataSource\": { \\\n", 185 | " \"S3DataType\": \"S3Prefix\", \\\n", 186 | " \"S3Uri\": \"'+datasets+'/validation\", \\\n", 187 | " \"S3DataDistributionType\": \"FullyReplicated\" \\\n", 188 | " } \\\n", 189 | " }, \\\n", 190 | " \"CompressionType\": \"None\", \\\n", 191 | " \"RecordWrapperType\": \"None\" \\\n", 192 | " }, \\\n", 193 | " { \\\n", 194 | " \"ChannelName\": \"eval\", \\\n", 
195 | " \"DataSource\": { \\\n", 196 | " \"S3DataSource\": { \\\n", 197 | " \"S3DataType\": \"S3Prefix\", \\\n", 198 | " \"S3Uri\": \"'+datasets+'/eval\", \\\n", 199 | " \"S3DataDistributionType\": \"FullyReplicated\" \\\n", 200 | " } \\\n", 201 | " }, \\\n", 202 | " \"CompressionType\": \"None\", \\\n", 203 | " \"RecordWrapperType\": \"None\" \\\n", 204 | " } \\\n", 205 | " ]'\n", 206 | " ):\n", 207 | " # Component 1\n", 208 | " hpo = sagemaker_hpo_op(\n", 209 | " region=region,\n", 210 | " image=train_image,\n", 211 | " training_input_mode=training_input_mode,\n", 212 | " strategy='Bayesian',\n", 213 | " metric_name='val_acc',\n", 214 | " metric_definitions='{\"val_acc\": \"val_acc: ([0-9\\\\\\\\.]+)\"}',\n", 215 | " metric_type='Maximize',\n", 216 | " static_parameters='{ \\\n", 217 | " \"epochs\": \"10\", \\\n", 218 | " \"momentum\": \"0.9\", \\\n", 219 | " \"weight-decay\": \"0.0002\", \\\n", 220 | " \"model_dir\":\"s3://'+bucket_name+'/jobs\", \\\n", 221 | " \"sagemaker_program\": \"cifar10-training-sagemaker.py\", \\\n", 222 | " \"sagemaker_region\": \"us-west-2\", \\\n", 223 | " \"sagemaker_submit_directory\": \"'+source_s3+'\" \\\n", 224 | " }',\n", 225 | " continuous_parameters='[ \\\n", 226 | " {\"Name\": \"learning-rate\", \"MinValue\": \"0.0001\", \"MaxValue\": \"0.1\", \"ScalingType\": \"Logarithmic\"} \\\n", 227 | " ]',\n", 228 | " categorical_parameters='[ \\\n", 229 | " {\"Name\": \"optimizer\", \"Values\": [\"sgd\", \"adam\"]}, \\\n", 230 | " {\"Name\": \"batch-size\", \"Values\": [\"32\", \"128\", \"256\"]}, \\\n", 231 | " {\"Name\": \"model-type\", \"Values\": [\"resnet\", \"custom\"]} \\\n", 232 | " ]',\n", 233 | " channels=channels,\n", 234 | " output_location=f's3://{bucket_name}/jobs',\n", 235 | " instance_type=instance_type,\n", 236 | " instance_count='1',\n", 237 | " volume_size=volume_size,\n", 238 | " max_num_jobs='16',\n", 239 | " max_parallel_jobs='4',\n", 240 | " max_run_time=max_run_time,\n", 241 | " network_isolation=network_isolation,\n", 242 | " traffic_encryption=traffic_encryption,\n", 243 | " spot_instance=spot_instance,\n", 244 | " role=role\n", 245 | " )\n", 246 | " \n", 247 | " # Component 2\n", 248 | " training_hyp = get_best_hyp_op(hpo.outputs['best_hyperparameters'])\n", 249 | " \n", 250 | " # Component 3\n", 251 | " training = sagemaker_train_op(\n", 252 | " region=region,\n", 253 | " image=train_image,\n", 254 | " training_input_mode=training_input_mode,\n", 255 | " hyperparameters=training_hyp.output,\n", 256 | " channels=channels,\n", 257 | " instance_type=instance_type,\n", 258 | " instance_count='1',\n", 259 | " volume_size=volume_size,\n", 260 | " max_run_time=max_run_time,\n", 261 | " model_artifact_path=f's3://{bucket_name}/jobs',\n", 262 | " network_isolation=network_isolation,\n", 263 | " traffic_encryption=traffic_encryption,\n", 264 | " spot_instance=spot_instance,\n", 265 | " role=role,\n", 266 | " )\n", 267 | "\n", 268 | " # Component 4\n", 269 | " create_model = sagemaker_model_op(\n", 270 | " region=region,\n", 271 | " model_name=training.outputs['job_name'],\n", 272 | " image=serving_image,\n", 273 | " model_artifact_url=training.outputs['model_artifact_url'],\n", 274 | " network_isolation=network_isolation,\n", 275 | " role=role\n", 276 | " )\n", 277 | "\n", 278 | " # Component 5\n", 279 | " prediction = sagemaker_deploy_op(\n", 280 | " region=region,\n", 281 | " model_name_1=create_model.output,\n", 282 | " instance_type_1='ml.m5.large'\n", 283 | " )" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 
null, 289 | "metadata": {}, 290 | "outputs": [], 291 | "source": [ 292 | "kfp.compiler.Compiler().compile(cifar10_hpo_train_deploy,'sm-hpo-train-deploy-pipeline.zip')" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": null, 298 | "metadata": {}, 299 | "outputs": [], 300 | "source": [ 301 | "client = kfp.Client()\n", 302 | "aws_experiment = client.create_experiment(name='sm-kfp-experiment')\n", 303 | "\n", 304 | "exp_name = f'cifar10-hpo-train-deploy-kfp-{time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())}'\n", 305 | "my_run = client.run_pipeline(aws_experiment.id, exp_name, 'sm-hpo-train-deploy-pipeline.zip')" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "import json, boto3, numpy as np\n", 315 | "client = boto3.client('runtime.sagemaker')\n", 316 | "\n", 317 | "file_name = '1000_dog.png'\n", 318 | "with open(file_name, 'rb') as f:\n", 319 | " payload = f.read()\n", 320 | "\n", 321 | "response = client.invoke_endpoint(EndpointName='Endpoint-20200522021801-DR5P', \n", 322 | " ContentType='application/x-image', \n", 323 | " Body=payload)\n", 324 | "pred = json.loads(response['Body'].read())['predictions']\n", 325 | "labels = ['airplane','automobile','bird','cat','deer','dog','frog','horse','ship','truck']\n", 326 | "for l,p in zip(labels, pred[0]):\n", 327 | " print(l,\"{:.4f}\".format(p*100))" 328 | ] 329 | } 330 | ], 331 | "metadata": { 332 | "kernelspec": { 333 | "display_name": "Environment (conda_kfp)", 334 | "language": "python", 335 | "name": "conda_kfp" 336 | }, 337 | "language_info": { 338 | "codemirror_mode": { 339 | "name": "ipython", 340 | "version": 3 341 | }, 342 | "file_extension": ".py", 343 | "mimetype": "text/x-python", 344 | "name": "python", 345 | "nbconvert_exporter": "python", 346 | "pygments_lexer": "ipython3", 347 | "version": "3.6.5" 348 | } 349 | }, 350 | "nbformat": 4, 351 | "nbformat_minor": 2 352 | } 353 | -------------------------------------------------------------------------------- /kfp-sagemaker-custom-container.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "!pip install kfp --upgrade\n", 10 | "!which dsl-compile" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "## Amazon SageMaker Components for Kubeflow Pipelines Example - custom container\n", 18 | "In this example we'll build a Kubeflow pipeline where every component call a different Amazon SageMaker feature.\n", 19 | "Our simple pipeline will perform:\n", 20 | "\n", 21 | "1. Hyperparameter optimization \n", 22 | "1. Select best hyperparameters and increase epochs\n", 23 | "1. Training model on the best hyperparameters \n", 24 | "1. Create an Amazon SageMaker model\n", 25 | "1. Deploy model" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": {}, 32 | "outputs": [], 33 | "source": [ 34 | "import kfp\n", 35 | "from kfp import components\n", 36 | "from kfp.components import func_to_container_op\n", 37 | "from kfp import dsl\n", 38 | "import time, os, json" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "Amazon SageMaker Component URLs are available here:
\n", 46 | "https://github.com/kubeflow/pipelines/tree/master/components/aws/sagemaker" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": null, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "sagemaker_hpo_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/hyperparameter_tuning/component.yaml')\n", 56 | "sagemaker_train_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/train/component.yaml')\n", 57 | "sagemaker_model_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/model/component.yaml')\n", 58 | "sagemaker_deploy_op = components.load_component_from_url('https://raw.githubusercontent.com/kubeflow/pipelines/cb36f87b727df0578f4c1e3fe9c24a30bb59e5a2/components/aws/sagemaker/deploy/component.yaml')" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "import sagemaker\n", 68 | "import boto3\n", 69 | "\n", 70 | "sess = boto3.Session()\n", 71 | "account = boto3.client('sts').get_caller_identity().get('Account')\n", 72 | "sm = sess.client('sagemaker')\n", 73 | "role = sagemaker.get_execution_role()\n", 74 | "sagemaker_session = sagemaker.Session(boto_session=sess)" 75 | ] 76 | }, 77 | { 78 | "cell_type": "markdown", 79 | "metadata": {}, 80 | "source": [ 81 | "#### Prepare training datasets and upload to Amazon S3" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "bucket_name = sagemaker_session.default_bucket()\n", 91 | "job_folder = 'jobs'\n", 92 | "dataset_folder = 'datasets'\n", 93 | "local_dataset = 'cifar10'\n", 94 | "\n", 95 | "# TensorFlow is required to download and convert the dataset to TFRecord format\n", 96 | "!python generate_cifar10_tfrecords.py --data-dir {local_dataset}\n", 97 | "datasets = sagemaker_session.upload_data(path='cifar10', key_prefix='datasets/cifar10-dataset')\n", 98 | "\n", 99 | "# If dataset is already in S3 use the dataset's path:\n", 100 | "# datasets = 's3://{bucket_name}/{dataset_folder}/cifar10-dataset'" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "#### Build your Docker container and push it to Amazon ECR" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "------------------------------------------------------------\n", 115 | "**STOP:** Open **`/docker/build_docker_push_to_ecr.ipynb`** and follow steps to build and push container to Amazon ECR before proceeding\n", 116 | "\n", 117 | "------------------------------------------------------------" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": {}, 123 | "source": [ 124 | "#### Create a custom pipeline op\n", 125 | "Takes the results from a hyperparameter tuning job and increases the number of epochs for the next training job" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "def update_best_model_hyperparams(hpo_results, best_model_epoch = \"80\") -> str:\n", 135 | " import json\n", 136 | " r = json.loads(str(hpo_results))\n", 137 | " return 
json.dumps(dict(r,epochs=best_model_epoch))\n", 138 | "\n", 139 | "get_best_hyp_op = func_to_container_op(update_best_model_hyperparams)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "markdown", 144 | "metadata": {}, 145 | "source": [ 146 | "#### Create a pipeline" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "@dsl.pipeline(\n", 156 | " name='cifar10 hpo train deploy pipeline',\n", 157 | " description='cifar10 hpo train deploy pipeline using sagemaker'\n", 158 | ")\n", 159 | "def cifar10_hpo_train_deploy(region='us-west-2',\n", 160 | " training_input_mode='File',\n", 161 | " train_image=f'{account}.dkr.ecr.us-west-2.amazonaws.com/sagemaker-kubernetes:latest',\n", 162 | " serving_image='763104351884.dkr.ecr.us-west-2.amazonaws.com/tensorflow-inference:1.15.2-cpu',\n", 163 | " volume_size='50',\n", 164 | " max_run_time='86400',\n", 165 | " instance_type='ml.p3.2xlarge',\n", 166 | " network_isolation='False',\n", 167 | " traffic_encryption='False',\n", 168 | " spot_instance='False',\n", 169 | " channels='[ \\\n", 170 | " { \\\n", 171 | " \"ChannelName\": \"train\", \\\n", 172 | " \"DataSource\": { \\\n", 173 | " \"S3DataSource\": { \\\n", 174 | " \"S3DataType\": \"S3Prefix\", \\\n", 175 | " \"S3Uri\": \"s3://'+bucket_name+'/datasets/cifar10-dataset/train\", \\\n", 176 | " \"S3DataDistributionType\": \"FullyReplicated\" \\\n", 177 | " } \\\n", 178 | " }, \\\n", 179 | " \"CompressionType\": \"None\", \\\n", 180 | " \"RecordWrapperType\": \"None\" \\\n", 181 | " }, \\\n", 182 | " { \\\n", 183 | " \"ChannelName\": \"validation\", \\\n", 184 | " \"DataSource\": { \\\n", 185 | " \"S3DataSource\": { \\\n", 186 | " \"S3DataType\": \"S3Prefix\", \\\n", 187 | " \"S3Uri\": \"s3://'+bucket_name+'/datasets/cifar10-dataset/validation\", \\\n", 188 | " \"S3DataDistributionType\": \"FullyReplicated\" \\\n", 189 | " } \\\n", 190 | " }, \\\n", 191 | " \"CompressionType\": \"None\", \\\n", 192 | " \"RecordWrapperType\": \"None\" \\\n", 193 | " }, \\\n", 194 | " { \\\n", 195 | " \"ChannelName\": \"eval\", \\\n", 196 | " \"DataSource\": { \\\n", 197 | " \"S3DataSource\": { \\\n", 198 | " \"S3DataType\": \"S3Prefix\", \\\n", 199 | " \"S3Uri\": \"s3://'+bucket_name+'/datasets/cifar10-dataset/eval\", \\\n", 200 | " \"S3DataDistributionType\": \"FullyReplicated\" \\\n", 201 | " } \\\n", 202 | " }, \\\n", 203 | " \"CompressionType\": \"None\", \\\n", 204 | " \"RecordWrapperType\": \"None\" \\\n", 205 | " } \\\n", 206 | " ]'\n", 207 | " ):\n", 208 | " # Component 1\n", 209 | " hpo = sagemaker_hpo_op(\n", 210 | " region=region,\n", 211 | " image=train_image,\n", 212 | " training_input_mode=training_input_mode,\n", 213 | " strategy='Bayesian',\n", 214 | " metric_name='val_acc',\n", 215 | " metric_definitions='{\"val_acc\": \"val_acc: ([0-9\\\\\\\\.]+)\"}',\n", 216 | " metric_type='Maximize',\n", 217 | " static_parameters='{ \\\n", 218 | " \"epochs\": \"1\", \\\n", 219 | " \"momentum\": \"0.9\", \\\n", 220 | " \"weight-decay\": \"0.0002\", \\\n", 221 | " \"model_dir\":\"s3://'+bucket_name+'/jobs\", \\\n", 222 | " \"sagemaker_region\": \"us-west-2\" \\\n", 223 | " }',\n", 224 | " continuous_parameters='[ \\\n", 225 | " {\"Name\": \"learning-rate\", \"MinValue\": \"0.0001\", \"MaxValue\": \"0.1\", \"ScalingType\": \"Logarithmic\"} \\\n", 226 | " ]',\n", 227 | " categorical_parameters='[ \\\n", 228 | " {\"Name\": \"optimizer\", \"Values\": [\"sgd\", \"adam\"]}, \\\n", 229 | " {\"Name\": \"batch-size\", \"Values\": [\"32\", 
\"128\", \"256\"]}, \\\n", 230 | " {\"Name\": \"model-type\", \"Values\": [\"resnet\", \"custom\"]} \\\n", 231 | " ]',\n", 232 | " channels=channels,\n", 233 | " output_location=f's3://{bucket_name}/jobs',\n", 234 | " instance_type=instance_type,\n", 235 | " instance_count='1',\n", 236 | " volume_size=volume_size,\n", 237 | " max_num_jobs='1',\n", 238 | " max_parallel_jobs='1',\n", 239 | " max_run_time=max_run_time,\n", 240 | " network_isolation=network_isolation,\n", 241 | " traffic_encryption=traffic_encryption,\n", 242 | " spot_instance=spot_instance,\n", 243 | " role=role\n", 244 | " )\n", 245 | " \n", 246 | " # Component 2\n", 247 | " training_hyp = get_best_hyp_op(hpo.outputs['best_hyperparameters'])\n", 248 | " \n", 249 | " # Component 3\n", 250 | " training = sagemaker_train_op(\n", 251 | " region=region,\n", 252 | " image=train_image,\n", 253 | " training_input_mode=training_input_mode,\n", 254 | " hyperparameters=training_hyp.output,\n", 255 | " channels=channels,\n", 256 | " instance_type=instance_type,\n", 257 | " instance_count='1',\n", 258 | " volume_size=volume_size,\n", 259 | " max_run_time=max_run_time,\n", 260 | " model_artifact_path=f's3://{bucket_name}/jobs',\n", 261 | " network_isolation=network_isolation,\n", 262 | " traffic_encryption=traffic_encryption,\n", 263 | " spot_instance=spot_instance,\n", 264 | " role=role,\n", 265 | " )\n", 266 | "\n", 267 | " # Component 4\n", 268 | " create_model = sagemaker_model_op(\n", 269 | " region=region,\n", 270 | " model_name=training.outputs['job_name'],\n", 271 | " image=serving_image,\n", 272 | " model_artifact_url=training.outputs['model_artifact_url'],\n", 273 | " network_isolation=network_isolation,\n", 274 | " role=role\n", 275 | " )\n", 276 | "\n", 277 | " # Component 5\n", 278 | " prediction = sagemaker_deploy_op(\n", 279 | " region=region,\n", 280 | " model_name_1=create_model.output,\n", 281 | " instance_type_1='ml.m5.large'\n", 282 | " )" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": null, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "kfp.compiler.Compiler().compile(cifar10_hpo_train_deploy,'sm-hpo-train-deploy-pipeline.zip')" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": null, 297 | "metadata": {}, 298 | "outputs": [], 299 | "source": [ 300 | "client = kfp.Client()\n", 301 | "aws_experiment = client.create_experiment(name='aws')\n", 302 | "\n", 303 | "exp_name = f'cifar10-hpo-train-deploy-kfp-{time.strftime(\"%Y-%m-%d-%H-%M-%S\", time.gmtime())}'\n", 304 | "my_run = client.run_pipeline(aws_experiment.id, exp_name, 'sm-hpo-train-deploy-pipeline.zip')" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": null, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "import json, boto3\n", 314 | "client = boto3.client('runtime.sagemaker')\n", 315 | "\n", 316 | "file_name = '1000_dog.png'\n", 317 | "with open(file_name, 'rb') as f:\n", 318 | " payload = f.read()\n", 319 | "\n", 320 | "response = client.invoke_endpoint(EndpointName='Endpoint-20200502070427-8KDX', \n", 321 | " ContentType='application/x-image', \n", 322 | " Body=payload)\n", 323 | "print(response['Body'].read())\n", 324 | "labels = ['airplane','automobile','bird','cat','deer','dog','frog','horse','ship','truck']" 325 | ] 326 | } 327 | ], 328 | "metadata": { 329 | "kernelspec": { 330 | "display_name": "Environment (conda_kfp)", 331 | "language": "python", 332 | "name": "conda_kfp" 333 | }, 334 | "language_info": { 335 | "codemirror_mode": { 
336 | "name": "ipython", 337 | "version": 3 338 | }, 339 | "file_extension": ".py", 340 | "mimetype": "text/x-python", 341 | "name": "python", 342 | "nbconvert_exporter": "python", 343 | "pygments_lexer": "ipython3", 344 | "version": "3.6.5" 345 | } 346 | }, 347 | "nbformat": 4, 348 | "nbformat_minor": 2 349 | } 350 | --------------------------------------------------------------------------------