├── sagemaker-pipeline
│   ├── pipelines
│   │   ├── __init__.py
│   │   ├── diabetes
│   │   │   ├── __init__.py
│   │   │   ├── xgb_evaluate.py
│   │   │   ├── dtree_evaluate.py
│   │   │   ├── preprocess.py
│   │   │   └── pipeline.py
│   │   ├── __version__.py
│   │   ├── _utils.py
│   │   ├── get_pipeline_definition.py
│   │   └── run_pipeline.py
│   ├── setup.cfg
│   ├── codebuild-buildspec.yml
│   ├── setup.py
│   └── diabetes.flow
├── container
│   ├── local_test
│   │   ├── test_dir
│   │   │   └── input
│   │   │       └── config
│   │   │           ├── resourceConfig.json
│   │   │           └── hyperparameters.json
│   │   ├── serve_local.sh
│   │   ├── predict.sh
│   │   └── train_local.sh
│   ├── decision_trees
│   │   ├── wsgi.py
│   │   ├── nginx.conf
│   │   ├── serve
│   │   ├── predictor.py
│   │   └── train
│   ├── app-image-config-input.json
│   ├── build_and_push.sh
│   └── Dockerfile
├── diabetes-project-iam.json
├── README.md
├── diabetes-project-with-mlops.ipynb
└── diabetes-project.ipynb
-------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/diabetes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /container/local_test/test_dir/input/config/resourceConfig.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /container/local_test/test_dir/input/config/hyperparameters.json: -------------------------------------------------------------------------------- 1 | {"max_depth": "2", "max_leaf_nodes": "2"} 2 | -------------------------------------------------------------------------------- /container/local_test/serve_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | image=$1 4 | 5 | docker run -v $(pwd)/test_dir:/opt/ml -p 8080:8080 --rm ${image} serve 6 | -------------------------------------------------------------------------------- /container/local_test/predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | payload=$1 4 | content=${2:-text/csv} 5 | 6 | curl --data-binary @${payload} -H "Content-Type: ${content}" -v http://localhost:8080/invocations 7 | -------------------------------------------------------------------------------- /container/decision_trees/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file.
6 | 7 | app = myapp.app 8 | -------------------------------------------------------------------------------- /container/local_test/train_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | image=$1 4 | 5 | mkdir -p test_dir/model 6 | mkdir -p test_dir/output 7 | 8 | rm test_dir/model/* 9 | rm test_dir/output/* 10 | 11 | docker run -v $(pwd)/test_dir:/opt/ml --rm ${image} train 12 | -------------------------------------------------------------------------------- /sagemaker-pipeline/setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | addopts = 3 | -vv 4 | testpaths = tests 5 | 6 | [aliases] 7 | test=pytest 8 | 9 | [metadata] 10 | description-file = README.md 11 | license_file = LICENSE 12 | 13 | [wheel] 14 | universal = 1 15 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/__version__.py: -------------------------------------------------------------------------------- 1 | """Metadata for the pipelines package.""" 2 | 3 | __title__ = "pipelines" 4 | __description__ = "pipelines - template package" 5 | __version__ = "0.0.1" 6 | __author__ = "" 7 | __author_email__ = "" 8 | __license__ = "Apache 2.0" 9 | __url__ = "" 10 | -------------------------------------------------------------------------------- /container/app-image-config-input.json: -------------------------------------------------------------------------------- 1 | { 2 | "AppImageConfigName": "diabetes-dtree-config", 3 | "KernelGatewayImageConfig": { 4 | "KernelSpecs": [ 5 | { 6 | "Name": "python3", 7 | "DisplayName": "Python 3 (ipykernel)" 8 | } 9 | ], 10 | "FileSystemConfig": { 11 | "MountPath": "/home/sagemaker-user", 12 | "DefaultUid": 1000, 13 | "DefaultGid": 100 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /container/decision_trees/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | proxy_read_timeout 1200s; 27 | 28 | location ~ ^/(ping|invocations) { 29 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 30 | proxy_set_header Host $http_host; 31 | proxy_redirect off; 32 | proxy_pass http://gunicorn; 33 | } 34 | 35 | location / { 36 | return 404 "{}"; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /sagemaker-pipeline/codebuild-buildspec.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | install: 5 | runtime-versions: 6 | python: 3.8 7 | commands: 8 | - pip install --upgrade --force-reinstall . 
"awscli>1.20.30" 9 | 10 | build: 11 | commands: 12 | - export PYTHONUNBUFFERED=TRUE 13 | - export SAGEMAKER_PROJECT_NAME_ID="${SAGEMAKER_PROJECT_NAME}-${SAGEMAKER_PROJECT_ID}" 14 | - | 15 | run-pipeline --module-name pipelines.diabetes.pipeline \ 16 | --role-arn $SAGEMAKER_PIPELINE_ROLE_ARN \ 17 | --tags "[{\"Key\":\"sagemaker:project-name\", \"Value\":\"${SAGEMAKER_PROJECT_NAME}\"}, {\"Key\":\"sagemaker:project-id\", \"Value\":\"${SAGEMAKER_PROJECT_ID}\"}]" \ 18 | --kwargs "{\"region\":\"${AWS_REGION}\",\"sagemaker_project_arn\":\"${SAGEMAKER_PROJECT_ARN}\",\"role\":\"${SAGEMAKER_PIPELINE_ROLE_ARN}\",\"default_bucket\":\"${ARTIFACT_BUCKET}\",\"pipeline_name\":\"${SAGEMAKER_PROJECT_NAME_ID}\",\"model_package_group_name\":\"${SAGEMAKER_PROJECT_NAME_ID}\",\"base_job_prefix\":\"${SAGEMAKER_PROJECT_NAME_ID}\"}" 19 | - echo "Create/Update of the SageMaker Pipeline and execution completed." 20 | 21 | -------------------------------------------------------------------------------- /container/build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | image=$1 9 | 10 | if [ "$image" == "" ] 11 | then 12 | echo "Usage: $0 " 13 | exit 1 14 | fi 15 | 16 | chmod +x decision_trees/train 17 | chmod +x decision_trees/serve 18 | 19 | # Get the account number associated with the current IAM credentials 20 | account=$(aws sts get-caller-identity --query Account --output text) 21 | 22 | if [ $? -ne 0 ] 23 | then 24 | exit 255 25 | fi 26 | 27 | 28 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 29 | region=$(aws configure get region) 30 | region=${region:-us-west-2} 31 | 32 | 33 | fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:latest" 34 | 35 | # If the repository doesn't exist in ECR, create it. 36 | 37 | aws ecr describe-repositories --repository-names "${image}" > /dev/null 2>&1 38 | 39 | if [ $? -ne 0 ] 40 | then 41 | aws ecr create-repository --repository-name "${image}" > /dev/null 42 | fi 43 | 44 | # Get the login command from ECR and execute it directly 45 | aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${account}".dkr.ecr."${region}".amazonaws.com 46 | 47 | # Build the docker image locally with the image name and then push it to ECR 48 | # with the full name. 49 | 50 | docker build -t ${image} . 
51 | docker tag ${image} ${fullname} 52 | 53 | docker push ${fullname} 54 | -------------------------------------------------------------------------------- /sagemaker-pipeline/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import setuptools 3 | 4 | 5 | about = {} 6 | here = os.path.abspath(os.path.dirname(__file__)) 7 | with open(os.path.join(here, "pipelines", "__version__.py")) as f: 8 | exec(f.read(), about) 9 | 10 | 11 | with open("README.md", "r") as f: 12 | readme = f.read() 13 | 14 | 15 | required_packages = ["sagemaker", "awswrangler", "sagemaker-experiments"] 16 | extras = { 17 | "test": [ 18 | "black", 19 | "coverage", 20 | "flake8", 21 | "mock", 22 | "pydocstyle", 23 | "pytest", 24 | "pytest-cov", 25 | "sagemaker", 26 | "tox", 27 | ] 28 | } 29 | setuptools.setup( 30 | name=about["__title__"], 31 | description=about["__description__"], 32 | version=about["__version__"], 33 | author=about["__author__"], 34 | author_email=about["__author_email__"], 35 | long_description=readme, 36 | long_description_content_type="text/markdown", 37 | url=about["__url__"], 38 | license=about["__license__"], 39 | packages=setuptools.find_packages(), 40 | include_package_data=True, 41 | python_requires=">=3.6", 42 | install_requires=required_packages, 43 | extras_require=extras, 44 | entry_points={ 45 | "console_scripts": [ 46 | "get-pipeline-definition=pipelines.get_pipeline_definition:main", 47 | "run-pipeline=pipelines.run_pipeline:main", 48 | ] 49 | }, 50 | classifiers=[ 51 | "Development Status :: 3 - Alpha", 52 | "Intended Audience :: Developers", 53 | "Natural Language :: English", 54 | "Programming Language :: Python", 55 | "Programming Language :: Python :: 3", 56 | "Programming Language :: Python :: 3.6", 57 | "Programming Language :: Python :: 3.7", 58 | "Programming Language :: Python :: 3.8", 59 | ], 60 | ) 61 | -------------------------------------------------------------------------------- /container/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build an image that can do training and inference in SageMaker 2 | # This is a Python 3 image that uses the nginx, gunicorn, flask stack 3 | # for serving inferences in a stable way. 4 | 5 | FROM ubuntu:18.04 6 | 7 | ARG NB_USER="sagemaker-user" 8 | ARG NB_UID="1000" 9 | ARG NB_GID="100" 10 | 11 | RUN apt-get -y update && apt-get install -y --no-install-recommends \ 12 | wget \ 13 | python3-pip \ 14 | python3-setuptools \ 15 | nginx \ 16 | ca-certificates \ 17 | sudo \ 18 | && rm -rf /var/lib/apt/lists/* 19 | 20 | # Setup the "sagemaker-user" user with root privileges. 21 | RUN \ 22 | useradd -m -s /bin/bash -N -u $NB_UID $NB_USER && \ 23 | chmod g+w /etc/passwd && \ 24 | echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers 25 | 26 | USER $NB_UID 27 | 28 | RUN \ 29 | sudo ln -s /usr/bin/python3 /usr/bin/python && \ 30 | sudo ln -s /usr/bin/pip3 /usr/bin/pip 31 | 32 | # Here we get all python packages. 33 | # There's substantial overlap between scipy and numpy that we eliminate by 34 | # linking them together. Likewise, pip leaves the install caches populated which uses 35 | # a significant amount of space. These optimizations save a fair amount of space in the 36 | # image, which reduces start up time. 37 | RUN sudo pip --no-cache-dir install numpy==1.16.2 scipy==1.2.1 scikit-learn==0.20.2 pandas flask gunicorn 38 | 39 | # Set some environment variables.
PYTHONUNBUFFERED keeps Python from buffering our standard 40 | # output stream, which means that logs can be delivered to the user quickly. PYTHONDONTWRITEBYTECODE 41 | # keeps Python from writing the .pyc files which are unnecessary in this case. We also update 42 | # PATH so that the train and serve programs are found when the container is invoked. 43 | 44 | ENV PYTHONUNBUFFERED=TRUE 45 | ENV PYTHONDONTWRITEBYTECODE=TRUE 46 | ENV PATH="/opt/program:${PATH}" 47 | 48 | # Set up the program in the image 49 | COPY decision_trees /opt/program 50 | WORKDIR /opt/program 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | """Provides utilities for SageMaker Pipeline CLI.""" 14 | from __future__ import absolute_import 15 | 16 | import ast 17 | 18 | 19 | def get_pipeline_driver(module_name, passed_args=None): 20 | """Gets the driver for generating your pipeline definition. 21 | 22 | Pipeline modules must define a get_pipeline() module-level method. 23 | 24 | Args: 25 | module_name: The module name of your pipeline. 26 | passed_args: Optional passed arguments that your pipeline may be templated by. 27 | 28 | Returns: 29 | The SageMaker Workflow pipeline. 30 | """ 31 | _imports = __import__(module_name, fromlist=["get_pipeline"]) 32 | kwargs = convert_struct(passed_args) 33 | return _imports.get_pipeline(**kwargs) 34 | 35 | 36 | def convert_struct(str_struct=None): 37 | return ast.literal_eval(str_struct) if str_struct else {} 38 | 39 | 40 | def get_pipeline_custom_tags(module_name, args, tags): 41 | """Gets the custom tags for pipeline 42 | 43 | Returns: 44 | Custom tags to be added to the pipeline 45 | """ 46 | try: 47 | _imports = __import__(module_name, fromlist=["get_pipeline_custom_tags"]) 48 | kwargs = convert_struct(args) 49 | return _imports.get_pipeline_custom_tags( 50 | tags, kwargs["region"], kwargs["sagemaker_project_arn"] 51 | ) 52 | except Exception as e: 53 | print(f"Error getting project tags: {e}") 54 | return tags 55 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/get_pipeline_definition.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. 
See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | """A CLI to get pipeline definitions from pipeline modules.""" 14 | from __future__ import absolute_import 15 | 16 | import argparse 17 | import sys 18 | 19 | from pipelines._utils import get_pipeline_driver 20 | 21 | 22 | def main(): # pragma: no cover 23 | """The main harness that gets the pipeline definition JSON. 24 | 25 | Prints the json to stdout or saves to file. 26 | """ 27 | parser = argparse.ArgumentParser("Gets the pipeline definition for the pipeline script.") 28 | 29 | parser.add_argument( 30 | "-n", 31 | "--module-name", 32 | dest="module_name", 33 | type=str, 34 | help="The module name of the pipeline to import.", 35 | ) 36 | parser.add_argument( 37 | "-f", 38 | "--file-name", 39 | dest="file_name", 40 | type=str, 41 | default=None, 42 | help="The file to output the pipeline definition json to.", 43 | ) 44 | parser.add_argument( 45 | "-kwargs", 46 | "--kwargs", 47 | dest="kwargs", 48 | default=None, 49 | help="Dict string of keyword arguments for the pipeline generation (if supported)", 50 | ) 51 | args = parser.parse_args() 52 | 53 | if args.module_name is None: 54 | parser.print_help() 55 | sys.exit(2) 56 | 57 | try: 58 | pipeline = get_pipeline_driver(args.module_name, args.kwargs) 59 | content = pipeline.definition() 60 | if args.file_name: 61 | with open(args.file_name, "w") as f: 62 | f.write(content) 63 | else: 64 | print(content) 65 | except Exception as e: # pylint: disable=W0703 66 | print(f"Exception: {e}") 67 | sys.exit(1) 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /container/decision_trees/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 
6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | import multiprocessing 17 | import os 18 | import signal 19 | import subprocess 20 | import sys 21 | 22 | cpu_count = multiprocessing.cpu_count() 23 | 24 | model_server_timeout = os.environ.get('MODEL_SERVER_TIMEOUT', 60) 25 | model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', cpu_count)) 26 | 27 | def sigterm_handler(nginx_pid, gunicorn_pid): 28 | try: 29 | os.kill(nginx_pid, signal.SIGQUIT) 30 | except OSError: 31 | pass 32 | try: 33 | os.kill(gunicorn_pid, signal.SIGTERM) 34 | except OSError: 35 | pass 36 | 37 | sys.exit(0) 38 | 39 | def start_server(): 40 | print('Starting the inference server with {} workers.'.format(model_server_workers)) 41 | 42 | 43 | # link the log streams to stdout/err so they will be logged to the container logs 44 | subprocess.check_call(['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log']) 45 | subprocess.check_call(['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log']) 46 | 47 | nginx = subprocess.Popen(['nginx', '-c', '/opt/program/nginx.conf']) 48 | gunicorn = subprocess.Popen(['gunicorn', 49 | '--timeout', str(model_server_timeout), 50 | '-k', 'sync', 51 | '-b', 'unix:/tmp/gunicorn.sock', 52 | '-w', str(model_server_workers), 53 | 'wsgi:app']) 54 | 55 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 56 | 57 | # If either subprocess exits, so do we. 58 | pids = set([nginx.pid, gunicorn.pid]) 59 | while True: 60 | pid, _ = os.wait() 61 | if pid in pids: 62 | break 63 | 64 | sigterm_handler(nginx.pid, gunicorn.pid) 65 | print('Inference server exiting') 66 | 67 | # The main routine just invokes the start function. 68 | 69 | if __name__ == '__main__': 70 | start_server() 71 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/diabetes/xgb_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | """Evaluation script for measuring model accuracy.""" 14 | 15 | import json 16 | import logging 17 | import os 18 | import pickle 19 | import tarfile 20 | 21 | import pandas as pd 22 | import numpy 23 | import xgboost 24 | import boto3 25 | 26 | 27 | logger = logging.getLogger() 28 | logger.setLevel(logging.INFO) 29 | logger.addHandler(logging.StreamHandler()) 30 | 31 | # May need to import additional metrics depending on what you are measuring. 
32 | # See https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html 33 | from sklearn.metrics import ( 34 | accuracy_score, 35 | classification_report, 36 | roc_auc_score, 37 | mean_squared_error, 38 | mean_absolute_error, 39 | r2_score, 40 | ) 41 | 42 | if __name__ == "__main__": 43 | 44 | model_path = "/opt/ml/processing/model/model.tar.gz" 45 | with tarfile.open(model_path) as tar: 46 | tar.extractall(path=".") 47 | 48 | logger.debug("Loading XGB model.") 49 | model = pickle.load(open("xgboost-model", "rb")) 50 | 51 | test_path = "/opt/ml/processing/test/test.csv" 52 | 53 | logger.info("Loading test input data") 54 | 55 | df = pd.read_csv(test_path, header=None) 56 | 57 | logger.debug("Reading test data.") 58 | y_test = df.iloc[:, 0].to_numpy() 59 | 60 | df.drop(df.columns[0], axis=1, inplace=True) 61 | X_test = xgboost.DMatrix(df.values) 62 | 63 | logger.info("Performing predictions against test data.") 64 | predictions = model.predict(X_test) 65 | 66 | logger.info("Creating classification evaluation report") 67 | 68 | acc = accuracy_score(y_test, predictions.round()) 69 | roc = roc_auc_score(y_test, predictions.round()) 70 | 71 | report_dict = { 72 | "classification_metrics": { 73 | "acc": {"value": acc}, 74 | "roc": {"value": roc}, 75 | }, 76 | } 77 | 78 | logger.info("Classification report:\n{}".format(report_dict)) 79 | 80 | evaluation_output_path = os.path.join("/opt/ml/processing/evaluation", "xgb_evaluation.json") 81 | logger.info("Saving classification report to {}".format(evaluation_output_path)) 82 | 83 | with open(evaluation_output_path, "w") as f: 84 | f.write(json.dumps(report_dict)) 85 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/diabetes/dtree_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | """Evaluation script for measuring model accuracy.""" 14 | 15 | import json 16 | import logging 17 | import os 18 | import pickle 19 | import tarfile 20 | 21 | import pandas as pd 22 | import numpy 23 | 24 | logger = logging.getLogger() 25 | logger.setLevel(logging.INFO) 26 | logger.addHandler(logging.StreamHandler()) 27 | 28 | # May need to import additional metrics depending on what you are measuring.
29 | # See https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html 30 | from sklearn.metrics import ( 31 | accuracy_score, 32 | classification_report, 33 | roc_auc_score, 34 | mean_squared_error, 35 | mean_absolute_error, 36 | r2_score, 37 | ) 38 | 39 | if __name__ == "__main__": 40 | 41 | prefix = "/opt/ml/processing/" 42 | tar_model_path = os.path.join(prefix, 'model/model.tar.gz') 43 | model_path = os.path.join(prefix, 'model/decision-tree-model.pkl') 44 | 45 | os.system('sudo chown -R 1000:100 ' + prefix) 46 | with tarfile.open(tar_model_path) as tar: 47 | tar.extractall(path="/opt/ml/processing/model/") 48 | 49 | logger.debug("Loading DTree model.") 50 | 51 | model = pickle.load(open(model_path, "rb")) 52 | 53 | test_path = "/opt/ml/processing/test/test.csv" 54 | 55 | logger.info("Loading test input data") 56 | 57 | df = pd.read_csv(test_path, header=None) 58 | 59 | logger.debug("Reading test data.") 60 | y_test = df.iloc[:, 0].to_numpy() 61 | df.drop(df.columns[0], axis=1, inplace=True) 62 | X_test = numpy.array(df.values) 63 | 64 | logger.info("Performing predictions against test data.") 65 | predictions = model.predict(X_test) 66 | 67 | logger.info("Creating classification evaluation report") 68 | 69 | acc = accuracy_score(y_test, predictions) 70 | roc = roc_auc_score(y_test, predictions) 71 | 72 | report_dict = { 73 | "classification_metrics": { 74 | "acc": {"value": acc}, 75 | "roc": {"value": roc}, 76 | }, 77 | } 78 | 79 | logger.info("Classification report:\n{}".format(report_dict)) 80 | 81 | evaluation_output_path = os.path.join("/opt/ml/processing/evaluation", "dtree_evaluation.json") 82 | logger.info("Saving classification report to {}".format(evaluation_output_path)) 83 | 84 | with open(evaluation_output_path, "w") as f: 85 | f.write(json.dumps(report_dict)) 86 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/diabetes/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License.
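# Usage note (derived from the code below): this script expects an --input-data
# argument of the form s3://<bucket>/<prefix>; it downloads every CSV object
# under that prefix, concatenates them, and writes a ~70/10/20
# train/validation/test split (np.split with cut points at 70% and 80% of the
# shuffled rows).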
13 | """Feature engineers the diabetes readmission dataset.""" 14 | import argparse 15 | import logging 16 | import pathlib 17 | 18 | import boto3 19 | import numpy as np 20 | import pandas as pd 21 | 22 | import os 23 | import glob 24 | 25 | 26 | logger = logging.getLogger() 27 | logger.setLevel(logging.INFO) 28 | logger.addHandler(logging.StreamHandler()) 29 | 30 | if __name__ == "__main__": 31 | logger.info("Starting preprocessing") 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument("--input-data", type=str, required=True) 34 | args = parser.parse_args() 35 | 36 | logger.info("Setting memory workaround") 37 | os.system("echo 1 > /proc/sys/vm/overcommit_memory") 38 | 39 | base_dir = "/opt/ml/processing" 40 | pathlib.Path(f"{base_dir}/data").mkdir(parents=True, exist_ok=True) 41 | input_data = args.input_data 42 | bucket = input_data.split("/")[2] 43 | s3_output_prefix = "/".join(input_data.split("/")[3:]) 44 | 45 | s3_resource = boto3.resource("s3") 46 | temp_s3_bucket = s3_resource.Bucket(bucket) 47 | prefix_objs = temp_s3_bucket.objects.filter(Prefix=s3_output_prefix) 48 | for obj in prefix_objs: 49 | key = obj.key 50 | logger.info("Downloading data from bucket: %s, key: %s", bucket, key) 51 | s3fn = key.split("/") 52 | s3fn = s3fn[len(s3fn) - 1] 53 | fn = f"{base_dir}/data/{s3fn}" 54 | s3_resource.Bucket(bucket).download_file(key, fn) 55 | 56 | logger.info("Reading downloaded data") 57 | all_files = glob.iglob(os.path.join(f"{base_dir}/data", "*.csv")) 58 | df_from_each_file = (pd.read_csv(f) for f in all_files) 59 | model_data = pd.concat(df_from_each_file, ignore_index=True) 60 | 61 | logger.info(model_data.info()) 62 | 63 | # Split the data 64 | train_data, validation_data, test_data = np.split( 65 | model_data.sample(frac=1, random_state=1729), 66 | [int(0.7 * len(model_data)), int(0.8 * len(model_data))], 67 | ) 68 | 69 | test_data = test_data[train_data.columns] 70 | validation_data = validation_data[train_data.columns] 71 | 72 | pd.DataFrame(train_data).to_csv(f"{base_dir}/train/train.csv", header=False, index=False) 73 | pd.DataFrame(validation_data).to_csv( 74 | f"{base_dir}/validation/validation.csv", header=False, index=False 75 | ) 76 | pd.DataFrame(test_data).to_csv(f"{base_dir}/test/test.csv", header=False, index=False) 77 | -------------------------------------------------------------------------------- /container/decision_trees/predictor.py: -------------------------------------------------------------------------------- 1 | # This is the file that implements a flask server to do inferences. It's the file that you will modify to 2 | # implement the scoring for your own algorithm. 3 | 4 | from __future__ import print_function 5 | 6 | import io 7 | import json 8 | import os 9 | import pickle 10 | import signal 11 | import sys 12 | import traceback 13 | 14 | import flask 15 | import pandas as pd 16 | 17 | prefix = "/opt/ml/" 18 | model_path = os.path.join(prefix, "model") 19 | 20 | # A singleton for holding the model. This simply loads the model and holds it. 21 | # It has a predict function that does a prediction based on the model and the input data.
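# For local testing, the scripts under container/local_test exercise this
# server end to end, e.g. (a sketch; <image-name> is whatever image you built):
#   ./serve_local.sh <image-name>       # serve the model on localhost:8080
#   ./predict.sh payload.csv text/csv   # POST a CSV payload to /invocations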
22 | 23 | 24 | class ScoringService(object): 25 | model = None # Where we keep the model when it's loaded 26 | 27 | @classmethod 28 | def get_model(cls): 29 | """Get the model object for this instance, loading it if it's not already loaded.""" 30 | if cls.model == None: 31 | with open(os.path.join(model_path, "decision-tree-model.pkl"), "rb") as inp: 32 | cls.model = pickle.load(inp) 33 | return cls.model 34 | 35 | @classmethod 36 | def predict(cls, input): 37 | """For the input, do the predictions and return them. 38 | 39 | Args: 40 | input (a pandas dataframe): The data on which to do the predictions. There will be 41 | one prediction per row in the dataframe""" 42 | clf = cls.get_model() 43 | return clf.predict(input) 44 | 45 | 46 | # The flask app for serving predictions 47 | app = flask.Flask(__name__) 48 | 49 | 50 | @app.route("/ping", methods=["GET"]) 51 | def ping(): 52 | """Determine if the container is working and healthy. In this sample container, we declare 53 | it healthy if we can load the model successfully.""" 54 | health = ScoringService.get_model() is not None # You can insert a health check here 55 | 56 | status = 200 if health else 404 57 | return flask.Response(response="\n", status=status, mimetype="application/json") 58 | 59 | 60 | @app.route("/invocations", methods=["POST"]) 61 | def transformation(): 62 | """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert 63 | it to a pandas data frame for internal use and then convert the predictions back to CSV (which really 64 | just means one prediction per line, since there's a single column. 65 | """ 66 | data = None 67 | 68 | # Convert from CSV to pandas 69 | if flask.request.content_type == "text/csv": 70 | data = flask.request.data.decode("utf-8") 71 | s = io.StringIO(data) 72 | data = pd.read_csv(s, header=None) 73 | else: 74 | return flask.Response( 75 | response="This predictor only supports CSV data", status=415, mimetype="text/plain" 76 | ) 77 | 78 | print("Invoked with {} records".format(data.shape[0])) 79 | 80 | # Do the prediction 81 | predictions = ScoringService.predict(data) 82 | 83 | # Convert from numpy back to CSV 84 | out = io.StringIO() 85 | pd.DataFrame({"results": predictions}).to_csv(out, header=False, index=False) 86 | result = out.getvalue() 87 | 88 | return flask.Response(response=result, status=200, mimetype="text/csv") 89 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/run_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
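# Example invocation (a sketch; the values are illustrative -- see
# sagemaker-pipeline/codebuild-buildspec.yml for the real call):
#   run-pipeline --module-name pipelines.diabetes.pipeline \
#       --role-arn "${SAGEMAKER_PIPELINE_ROLE_ARN}" \
#       --tags '[{"Key": "sagemaker:project-name", "Value": "diabetes"}]' \
#       --kwargs '{"region": "us-east-1"}'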
13 | """A CLI to create or update and run pipelines.""" 14 | from __future__ import absolute_import 15 | 16 | import argparse 17 | import json 18 | import sys 19 | 20 | from pipelines._utils import get_pipeline_driver, convert_struct, get_pipeline_custom_tags 21 | 22 | 23 | def main(): # pragma: no cover 24 | """The main harness that creates or updates and runs the pipeline. 25 | 26 | Creates or updates the pipeline and runs it. 27 | """ 28 | parser = argparse.ArgumentParser( 29 | "Creates or updates and runs the pipeline for the pipeline script." 30 | ) 31 | 32 | parser.add_argument( 33 | "-n", 34 | "--module-name", 35 | dest="module_name", 36 | type=str, 37 | help="The module name of the pipeline to import.", 38 | ) 39 | parser.add_argument( 40 | "-kwargs", 41 | "--kwargs", 42 | dest="kwargs", 43 | default=None, 44 | help="Dict string of keyword arguments for the pipeline generation (if supported)", 45 | ) 46 | parser.add_argument( 47 | "-role-arn", 48 | "--role-arn", 49 | dest="role_arn", 50 | type=str, 51 | help="The role arn for the pipeline service execution role.", 52 | ) 53 | parser.add_argument( 54 | "-description", 55 | "--description", 56 | dest="description", 57 | type=str, 58 | default=None, 59 | help="The description of the pipeline.", 60 | ) 61 | parser.add_argument( 62 | "-tags", 63 | "--tags", 64 | dest="tags", 65 | default=None, 66 | help="""List of dict strings of '[{"Key": "string", "Value": "string"}, ..]'""", 67 | ) 68 | args = parser.parse_args() 69 | 70 | if args.module_name is None or args.role_arn is None: 71 | parser.print_help() 72 | sys.exit(2) 73 | tags = convert_struct(args.tags) 74 | 75 | try: 76 | pipeline = get_pipeline_driver(args.module_name, args.kwargs) 77 | print("###### Creating/updating a SageMaker Pipeline with the following definition:") 78 | parsed = json.loads(pipeline.definition()) 79 | print(json.dumps(parsed, indent=2, sort_keys=True)) 80 | 81 | all_tags = get_pipeline_custom_tags(args.module_name, args.kwargs, tags) 82 | 83 | upsert_response = pipeline.upsert( 84 | role_arn=args.role_arn, description=args.description, tags=all_tags 85 | ) 86 | print("\n###### Created/Updated SageMaker Pipeline: Response received:") 87 | print(upsert_response) 88 | 89 | execution = pipeline.start() 90 | print(f"\n###### Execution started with PipelineExecutionArn: {execution.arn}") 91 | 92 | print("Waiting for the execution to finish...") 93 | execution.wait() 94 | print("\n#####Execution completed. Execution step details:") 95 | 96 | print(execution.list_steps()) 97 | # Todo print the status? 
98 | except Exception as e: # pylint: disable=W0703 99 | print(f"Exception: {e}") 100 | sys.exit(1) 101 | 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /diabetes-project-iam.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "iam:CreateRole", 8 | "iam:AttachRolePolicy", 9 | "iam:CreatePolicy", 10 | "iam:PassRole" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Action": [ 16 | "cloudwatch:PutMetricData" 17 | ], 18 | "Resource": "*", 19 | "Effect": "Allow" 20 | }, 21 | { 22 | "Action": [ 23 | "codecommit:*" 24 | ], 25 | "Resource": "arn:aws:codecommit:*:*:sagemaker-*", 26 | "Effect": "Allow" 27 | }, 28 | { 29 | "Action": [ 30 | "codepipeline:StartPipelineExecution" 31 | ], 32 | "Resource": "arn:aws:codepipeline:*:*:sagemaker-*", 33 | "Effect": "Allow" 34 | }, 35 | { 36 | "Action": [ 37 | "codebuild:StartBuild" 38 | ], 39 | "Resource": "arn:aws:codebuild:*:*:sagemaker-*", 40 | "Effect": "Allow" 41 | }, 42 | { 43 | "Action": [ 44 | "ecr:BatchCheckLayerAvailability", 45 | "ecr:BatchGetImage", 46 | "ecr:Describe*", 47 | "ecr:GetAuthorizationToken", 48 | "ecr:GetDownloadUrlForLayer" 49 | ], 50 | "Resource": "*", 51 | "Effect": "Allow" 52 | }, 53 | { 54 | "Effect": "Allow", 55 | "Action": [ 56 | "ecr:BatchDeleteImage", 57 | "ecr:CompleteLayerUpload", 58 | "ecr:CreateRepository", 59 | "ecr:DeleteRepository", 60 | "ecr:InitiateLayerUpload", 61 | "ecr:PutImage", 62 | "ecr:UploadLayerPart" 63 | ], 64 | "Resource": [ 65 | "arn:aws:ecr:*:*:repository/*" 66 | ] 67 | }, 68 | { 69 | "Action": [ 70 | "logs:CreateLogDelivery", 71 | "logs:CreateLogGroup", 72 | "logs:CreateLogStream", 73 | "logs:DeleteLogDelivery", 74 | "logs:Describe*", 75 | "logs:GetLogDelivery", 76 | "logs:GetLogEvents", 77 | "logs:ListLogDeliveries", 78 | "logs:PutLogEvents", 79 | "logs:PutResourcePolicy", 80 | "logs:UpdateLogDelivery" 81 | ], 82 | "Resource": "*", 83 | "Effect": "Allow" 84 | }, 85 | { 86 | "Effect": "Allow", 87 | "Action": [ 88 | "s3:CreateBucket", 89 | "s3:DeleteBucket", 90 | "s3:GetBucketAcl", 91 | "s3:GetBucketCors", 92 | "s3:GetBucketLocation", 93 | "s3:ListAllMyBuckets", 94 | "s3:ListBucket", 95 | "s3:ListBucketMultipartUploads", 96 | "s3:PutBucketCors", 97 | "s3:PutObjectAcl" 98 | ], 99 | "Resource": [ 100 | "arn:aws:s3:::sagemaker-*" 101 | ] 102 | }, 103 | { 104 | "Effect": "Allow", 105 | "Action": [ 106 | "s3:AbortMultipartUpload", 107 | "s3:DeleteObject", 108 | "s3:GetObject", 109 | "s3:GetObjectVersion", 110 | "s3:PutObject", 111 | "s3:PutEncryptionConfiguration" 112 | ], 113 | "Resource": [ 114 | "arn:aws:s3:::sagemaker-*" 115 | ] 116 | }, 117 | { 118 | "Effect": "Allow", 119 | "Action": [ 120 | "sagemaker:*" 121 | ], 122 | "NotResource": [ 123 | "arn:aws:sagemaker:*:*:domain/*", 124 | "arn:aws:sagemaker:*:*:user-profile/*", 125 | "arn:aws:sagemaker:*:*:app/*", 126 | "arn:aws:sagemaker:*:*:flow-definition/*" 127 | ] 128 | } 129 | ] 130 | } 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Predict diabetic patient readmission using multi-model training on SageMaker Pipelines 2 | 3 | This project has two (2) components: (1) `container` - a custom Docker image with a Decision Tree algorithm using scikit-learn, with hyperparameter tuning support, and
(2) `sagemaker-pipeline` - a SageMaker pipeline that supports two (2) algorithms: XGBoost on a SageMaker-provided container and Decision Tree on the custom container built from the first component. The pipeline imports the data from an S3 bucket using SageMaker Data Wrangler for ML training. The pipeline also supports SageMaker HyperParameter Tuning. The best performing model in terms of ROC AUC is then registered to the model registry, ready for inference deployment. 4 | 5 | ## Start here 6 | 7 | In this example, we are solving a binary classification problem to determine whether a diabetic patient is likely to be readmitted to the hospital. This example uses the [Diabetes 130-US hospitals for years 1999-2008 Data Set](https://archive.ics.uci.edu/ml/datasets/diabetes+130-us+hospitals+for+years+1999-2008). The dataset is uploaded to an S3 bucket and the pipeline imports the data from this bucket. Data Wrangler transforms the data (e.g., one-hot encoding) as the initial step in the pipeline. The pipeline then proceeds with preprocessing, training using Decision Tree and XGBoost algorithms with hyperparameter tuning, evaluation, and registration of the winning model to the registry. This pipeline is a modified version of the pipeline provided by the [Amazon SageMaker Examples multi-model pipeline](https://github.com/aws/amazon-sagemaker-examples/tree/main/sagemaker-pipeline-multi-model). 8 | 9 | Prior to running the pipeline, you have to push the Decision Tree custom container to your own Amazon Elastic Container Registry (ECR). This container is a modified version of [Scikit BYO](https://github.com/aws/amazon-sagemaker-examples/tree/main/advanced_functionality/scikit_bring_your_own/container). 10 | 11 | You can use the `diabetes-project-with-mlops.ipynb` notebook to experiment from SageMaker Studio before you are ready to check in your code. Alternatively, you can run the pipeline outside of SageMaker Projects using `diabetes-project.ipynb`. 12 | 13 | ## DataSet 14 | 15 | The dataset represents 10 years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. It includes over 50 features representing patient and hospital outcomes. Information was extracted from the database for encounters that satisfied specific inclusion criteria. More dataset information can be found in the [Diabetes 130-US hospitals for years 1999-2008 Data Set](https://archive.ics.uci.edu/ml/datasets/diabetes+130-us+hospitals+for+years+1999-2008). 16 | 17 | ## Assumptions and Prerequisites 18 | 19 | - S3 bucket `sagemaker-diabetes-<AWS account ID>` is created and raw data has been uploaded to `s3://sagemaker-diabetes-<AWS account ID>/`. 20 | - A SageMaker project is already created. The recommendation is to create a SageMaker project using the [SageMaker-provided MLOps template for model building, training, and deployment](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-projects-templates-sm.html#sagemaker-projects-templates-code-commit). 21 | - The necessary IAM service roles are already created. 22 | 23 | ## Security 24 | 25 | This sample code is not designed for production deployment out-of-the-box, so further security enhancements may need to be added according to your own requirements before pushing to production.
Security recommendations include, but are not limited to, the following: 26 | - Use a private ECR repository 27 | - Use more narrowly scoped IAM permissions for service roles 28 | - Use interface / gateway VPC endpoints to prevent communication traffic from traversing the public network 29 | - Use an S3 VPC endpoint policy which controls access to specified Amazon S3 buckets only 30 | 31 | The notebooks create an IAM role `AmazonSageMakerServiceCatalogProductsUseRole-diabetes` with `AmazonSageMakerFullAccess` attached. [This is required as we are creating a custom SageMaker image](https://docs.aws.amazon.com/sagemaker/latest/dg/studio-byoi-create.html). 32 | 33 | 34 | [diabetes-project-with-mlops.ipynb](diabetes-project-with-mlops.ipynb) and [diabetes-project.ipynb](diabetes-project.ipynb) have been tested in a SageMaker notebook instance that is using a kernel with Python 3.7 installed. This SageMaker notebook is attached to an [IAM role with an in-line policy](diabetes-project-iam.json). 35 | 36 | ## License 37 | 38 | This library is licensed under the MIT-0 License. See the LICENSE file. 39 | -------------------------------------------------------------------------------- /container/decision_trees/train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # A sample training component that trains a simple scikit-learn decision tree model. 4 | # This implementation works in File mode and makes no assumptions about the input file names. 5 | # Input is specified as CSV with a data point in each row and the labels in the first column. 6 | 7 | from __future__ import print_function 8 | 9 | import json 10 | import os 11 | import pickle 12 | import sys 13 | import traceback 14 | import logging 15 | 16 | import pandas as pd 17 | from sklearn import tree 18 | 19 | from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score 20 | 21 | logger = logging.getLogger() 22 | logger.setLevel(logging.INFO) 23 | logger.addHandler(logging.StreamHandler()) 24 | # These are the paths to where SageMaker mounts interesting things in your container. 25 | 26 | prefix = '/opt/ml/' 27 | 28 | input_path = os.path.join(prefix, 'input/data') 29 | output_path = os.path.join(prefix, 'output') 30 | model_path = os.path.join(prefix, 'model') 31 | param_path = os.path.join(prefix, 'input/config/hyperparameters.json') 32 | 33 | # This algorithm uses two channels of input data, 'training' and 'validation'. Since we run in 34 | # File mode, the input files are copied to the directories specified here. 35 | channel_name_training='training' 36 | training_path = os.path.join(input_path, channel_name_training) 37 | 38 | channel_name_validation='validation' 39 | validation_path = os.path.join(input_path, channel_name_validation) 40 | 41 | # The function to execute the training.
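# Note: hyperparameters arrive as strings in /opt/ml/input/config/hyperparameters.json
# (e.g. {"max_depth": "2", "max_leaf_nodes": "2"}, as in container/local_test/test_dir),
# so train() casts them to int before passing them to scikit-learn.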
42 | def train(): 43 | print('Starting the training.') 44 | try: 45 | # Read in any hyperparameters that the user passed with the training job 46 | with open(param_path, 'r') as tc: 47 | trainingParams = json.load(tc) 48 | 49 | # Take the set of files and read them all into a single pandas dataframe 50 | input_files = [ os.path.join(training_path, file) for file in os.listdir(training_path) ] 51 | if len(input_files) == 0: 52 | raise ValueError(('There are no files in {}.\n' + 53 | 'This usually indicates that the channel ({}) was incorrectly specified,\n' + 54 | 'the data specification in S3 was incorrectly specified or the role specified\n' + 55 | 'does not have permission to access the data.').format(training_path, channel_name_training)) 56 | raw_data = [ pd.read_csv(file, header=None) for file in input_files ] 57 | train_data = pd.concat(raw_data) 58 | 59 | # labels are in the first column 60 | train_y = train_data.iloc[:,0] 61 | train_X = train_data.iloc[:,1:] 62 | 63 | # Here we only support a single hyperparameter. Note that hyperparameters are always passed in as 64 | # strings, so we need to do any necessary conversions. 65 | max_leaf_nodes = trainingParams.get('max_leaf_nodes', None) 66 | if max_leaf_nodes is not None: 67 | max_leaf_nodes = int(max_leaf_nodes) 68 | max_depth = trainingParams.get('max_depth', None) 69 | if max_depth is not None: 70 | max_depth = int(max_depth) 71 | 72 | clf = tree.DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, max_depth=max_depth) 73 | clf = clf.fit(train_X, train_y) 74 | 75 | 76 | # save the model 77 | os.system('sudo chown -R 1000:100 ' + model_path) 78 | with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'wb') as out: 79 | pickle.dump(clf, out) 80 | 81 | print('Training complete.') 82 | 83 | return clf 84 | except Exception as e: 85 | # Write out an error file. This will be returned as the failureReason in the 86 | # DescribeTrainingJob result. 87 | trc = traceback.format_exc() 88 | os.system('sudo chown -R 1000:100 ' + output_path) 89 | with open(os.path.join(output_path, 'failure'), 'w') as s: 90 | s.write('Exception during training: ' + str(e) + '\n' + trc) 91 | 92 | # Printing this causes the exception to be in the training job logs, as well. 93 | print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr) 94 | # A non-zero exit code causes the training job to be marked as Failed. 95 | sys.exit(255) 96 | 97 | # The function to execute the validation. 
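# The "auc:<value>" line logged below is what a SageMaker hyperparameter tuning
# job can scrape as its objective metric, via a metric definition regex such as
# "auc:([0-9\\.]+)" (an assumption -- the actual regex lives in the pipeline's
# tuner configuration, not in this file).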
98 | def validation(clf): 99 | print('Starting the validation.') 100 | try: 101 | 102 | # Take the set of files and read them all into a single pandas dataframe 103 | input_files = [ os.path.join(validation_path, file) for file in os.listdir(validation_path) ] 104 | if len(input_files) == 0: 105 | raise ValueError(('There are no files in {}.\n' + 106 | 'This usually indicates that the channel ({}) was incorrectly specified,\n' + 107 | 'the data specification in S3 was incorrectly specified or the role specified\n' + 108 | 'does not have permission to access the data.').format(validation_path, channel_name_validation)) 109 | raw_data = [ pd.read_csv(file, header=None) for file in input_files ] 110 | validation_data = pd.concat(raw_data) 111 | 112 | # labels are in the first column 113 | train_y = validation_data.iloc[:,0] 114 | train_X = validation_data.iloc[:,1:] 115 | 116 | predictions = clf.predict(train_X) 117 | auc = roc_auc_score(train_y, predictions) 118 | logger.info(('auc:{}').format(auc)) 119 | 120 | except Exception as e: 121 | # Write out an error file. This will be returned as the failureReason in the 122 | # DescribeTrainingJob result. 123 | trc = traceback.format_exc() 124 | os.system('sudo chown -R 1000:100 ' + output_path) 125 | with open(os.path.join(output_path, 'failure'), 'w') as s: 126 | s.write('Exception during training: ' + str(e) + '\n' + trc) 127 | # Printing this causes the exception to be in the training job logs, as well. 128 | print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr) 129 | # A non-zero exit code causes the training job to be marked as Failed. 130 | sys.exit(255) 131 | 132 | 133 | if __name__ == '__main__': 134 | clf = train() 135 | validation(clf) 136 | 137 | # A zero exit code causes the job to be marked as Succeeded. 138 | sys.exit(0) 139 | -------------------------------------------------------------------------------- /diabetes-project-with-mlops.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "af6b42fd", 6 | "metadata": {}, 7 | "source": [ 8 | "# Multi-model SageMaker Pipeline with Hyperparameter Tuning and Experiments" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "a1e0b8cc", 14 | "metadata": {}, 15 | "source": [ 16 | "Before proceeding, please see the context of this notebook in [README.md](README.md). This notebook has been tested in a SageMaker notebook that is using a kernel with Python 3.7 installed, e.g. conda_mxnet_latest_p37, conda_python3. Make sure you have created a SageMaker project outside of this notebook with the name `diabetes`. The recommendation is to create a SageMaker project using the [SageMaker-provided MLOps template for model building, training, and deployment](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-projects-templates-sm.html#sagemaker-projects-templates-code-commit). Note that this notebook will not create the SageMaker project for you. \n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "f81c95b4", 22 | "metadata": {}, 23 | "source": [ 24 | "## Prepare the raw data" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "2fdd2357", 30 | "metadata": {}, 31 | "source": [ 32 | "We create an S3 bucket with encryption enabled for additional security. \n", 33 | "\n", 34 | "#### If you are running this Notebook in the us-east-1 region, don't use the 'CreateBucketConfiguration' parameter with create_bucket(). us-east-1 is the default location."
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "d9393765", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import boto3\n", 45 | "\n", 46 | "AWS_ACCOUNT = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n", 47 | "AWS_REGION = boto3.Session().region_name\n", 48 | "BUCKET_NAME = \"sagemaker-diabetes-{AWS_ACCOUNT}\".format(AWS_ACCOUNT=AWS_ACCOUNT)\n", 49 | "\n", 50 | "s3_client = boto3.client(\"s3\")\n", 51 | "location = {\"LocationConstraint\": AWS_REGION}\n", 52 | "\n", 53 | "# default location is us-east-1, so CreateBucketConfiguration is not needed\n", 54 | "s3_client.create_bucket(Bucket=BUCKET_NAME)\n", 55 | "\n", 56 | "# use this create_bucket statement for any AWS region other than us-east-1\n", 57 | "#s3_client.create_bucket(Bucket=BUCKET_NAME, CreateBucketConfiguration=location) \n", 58 | "\n", 59 | "s3_client.put_bucket_encryption(\n", 60 | " Bucket=BUCKET_NAME,\n", 61 | " ServerSideEncryptionConfiguration={\n", 62 | " \"Rules\": [\n", 63 | " {\n", 64 | " \"ApplyServerSideEncryptionByDefault\": {\"SSEAlgorithm\": \"AES256\"},\n", 65 | " },\n", 66 | " ]\n", 67 | " },\n", 68 | ")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "2404183d", 74 | "metadata": {}, 75 | "source": [ 76 | "## Dataset collection\n", 77 | "\n", 78 | "Download UCI dataset and copy to S3 bucket" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "feb6ba21", 85 | "metadata": { 86 | "scrolled": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "%%sh\n", 91 | "\n", 92 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 93 | "BUCKET_NAME=\"sagemaker-diabetes-${AWS_ACCOUNT}\"\n", 94 | "\n", 95 | "wget https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip\n", 96 | "unzip dataset_diabetes.zip\n", 97 | "aws s3 cp dataset_diabetes/diabetic_data.csv s3://${BUCKET_NAME}/\n", 98 | " " 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "75078e91", 104 | "metadata": {}, 105 | "source": [ 106 | "Update diabetes.flow to use your AWS account ID. 
" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "bb6fa1e3", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "%%sh\n", 117 | "\n", 118 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 119 | "sed -i \"s/AWS_ACCOUNT/${AWS_ACCOUNT}/g\" sagemaker-pipeline/diabetes.flow" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "3a7c5961", 125 | "metadata": {}, 126 | "source": [ 127 | "Next, Create IAM Role for ML workflow steps" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "4bb6d6dd", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "import json\n", 138 | "\n", 139 | "iam_client = boto3.client(\"iam\")\n", 140 | "\n", 141 | "sagemaker_assume_role_policy_document = json.dumps(\n", 142 | " {\n", 143 | " \"Version\": \"2012-10-17\",\n", 144 | " \"Statement\": [\n", 145 | " {\n", 146 | " \"Effect\": \"Allow\",\n", 147 | " \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"},\n", 148 | " \"Action\": \"sts:AssumeRole\",\n", 149 | " }\n", 150 | " ],\n", 151 | " }\n", 152 | ")\n", 153 | "\n", 154 | "response_role = iam_client.create_role(\n", 155 | " RoleName=\"AmazonSageMakerServiceCatalogProductsUseRole-diabetes\",\n", 156 | " AssumeRolePolicyDocument=sagemaker_assume_role_policy_document,\n", 157 | ")\n", 158 | "\n", 159 | "\n", 160 | "iam_client.attach_role_policy(\n", 161 | " RoleName=response_role[\"Role\"][\"RoleName\"],\n", 162 | " PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFullAccess'\n", 163 | ")\n" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "e127e0d0", 169 | "metadata": {}, 170 | "source": [ 171 | "## Prepare the Decision Tree custom Docker image" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "fc5881fa", 177 | "metadata": {}, 178 | "source": [ 179 | "We make a Docker image containing a custom algorithm using [Scikit-learn Decision Tree Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor). Note that the Docker image has been modified to support hyperparameter tuning and validation data. \n", 180 | "\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "44e33823", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "! sudo yum install docker -y" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "id": "fa53c46b", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "%%sh\n", 201 | "\n", 202 | "# The name of our algorithm\n", 203 | "ALGORITHM_NAME=\"diabetes-decision-trees\"\n", 204 | "\n", 205 | "cd container\n", 206 | "\n", 207 | "chmod +x decision_trees/train\n", 208 | "chmod +x decision_trees/serve\n", 209 | "\n", 210 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 211 | "AWS_REGION=$(aws configure get region)\n", 212 | "\n", 213 | "IMAGE_FULLNAME=\"${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ALGORITHM_NAME}:latest\"\n", 214 | "\n", 215 | "# If the repository doesn't exist in ECR, create it.\n", 216 | "aws ecr describe-repositories --repository-names \"${ALGORITHM_NAME}\" > /dev/null 2>&1\n", 217 | "\n", 218 | "if [ $? 
-ne 0 ]\n", 219 | "then\n", 220 | " aws ecr create-repository --repository-name \"${ALGORITHM_NAME}\" > /dev/null\n", 221 | "fi\n", 222 | "\n", 223 | "# Get the login command from ECR and execute it directly\n", 224 | "aws ecr get-login-password --region ${AWS_REGION}|docker login --username AWS --password-stdin ${IMAGE_FULLNAME}\n", 225 | "\n", 226 | "# Build the docker image locally with the image name and then push it to ECR with the full name.\n", 227 | "# Ensure your notebook IAM role has the required permission for pushing the image to ECR\n", 228 | "\n", 229 | "docker build -t ${ALGORITHM_NAME} .\n", 230 | "docker tag ${ALGORITHM_NAME} ${IMAGE_FULLNAME}\n", 231 | "docker push ${IMAGE_FULLNAME}\n" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "id": "3390ff94", 237 | "metadata": {}, 238 | "source": [ 239 | "Once the Docker image is pushed to the ECR repository, we make the image accessible from SageMaker. " 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "6ff0f84e", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "%%sh\n", 250 | "\n", 251 | "# The name of our algorithm\n", 252 | "SM_IMAGE_NAME=diabetes-dtree\n", 253 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 254 | "\n", 255 | "# This assumes the role name is AmazonSageMakerServiceCatalogProductsUseRole-diabetes\n", 256 | "ROLE_ARN=\"arn:aws:iam::${AWS_ACCOUNT}:role/AmazonSageMakerServiceCatalogProductsUseRole-diabetes\"\n", 257 | "\n", 258 | "aws sagemaker create-image \\\n", 259 | " --image-name ${SM_IMAGE_NAME} \\\n", 260 | " --role-arn ${ROLE_ARN}\n", 261 | "\n", 262 | "aws sagemaker create-app-image-config \\\n", 263 | " --cli-input-json file://container/app-image-config-input.json\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "7e6cf39b", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "%%sh\n", 274 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 275 | "ALGORITHM_NAME=diabetes-decision-trees\n", 276 | "AWS_REGION=$(aws configure get region)\n", 277 | "SM_IMAGE_NAME=diabetes-dtree\n", 278 | "SM_BASE_IMAGE=\"${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ALGORITHM_NAME}:latest\"\n", 279 | "\n", 280 | "aws sagemaker create-image-version \\\n", 281 | " --image-name ${SM_IMAGE_NAME} \\\n", 282 | " --base-image ${SM_BASE_IMAGE}" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "id": "8ecc8a74", 288 | "metadata": {}, 289 | "source": [ 290 | "## Trigger the SageMaker Pipelines pipeline\n", 291 | "\n", 292 | "Here we perform the following steps:\n", 293 | "\n", 294 | "1) Clone the SageMaker Projects model-build repo from CodeCommit\n", 295 | "\n", 296 | "2) Copy the local project sagemaker-pipeline to the SageMaker Project repo\n", 297 | "\n", 298 | "3) Commit these changes to CodeCommit\n", 299 | "\n", 300 | "The above 3 steps will trigger the SageMaker Projects model-build pipeline."
301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "id": "b9a3c5ae", 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "%%sh\n", 311 | "SAGEMAKER_PROJECT_NAME=diabetes\n", 312 | "AWS_REGION=$(aws configure get region)\n", 313 | "SAGEMAKER_PROJECT_ID=$(aws sagemaker describe-project --project-name ${SAGEMAKER_PROJECT_NAME} --query 'ProjectId' | tr -d '\"')\n", 314 | "SAGEMAKER_PROJECT_REPO=\"sagemaker-${SAGEMAKER_PROJECT_NAME}-${SAGEMAKER_PROJECT_ID}-modelbuild\"\n", 315 | "\n", 316 | "rm -rf ~/${SAGEMAKER_PROJECT_REPO}/\n", 317 | "git clone https://git-codecommit.${AWS_REGION}.amazonaws.com/v1/repos/${SAGEMAKER_PROJECT_REPO} ~/${SAGEMAKER_PROJECT_REPO}\n", 318 | "rsync -a sagemaker-pipeline/ ~/${SAGEMAKER_PROJECT_REPO}/ && rm -rf ~/${SAGEMAKER_PROJECT_REPO}/pipelines/abalone/ ~/${SAGEMAKER_PROJECT_REPO}/build/\n", 319 | "cd ~/${SAGEMAKER_PROJECT_REPO}/ && git config --global user.name \"name\" && git config --global user.email name@email.com && git config advice.addIgnoredFile false && git add --all && git commit -am \"initial commit\" && git push origin main \n", 320 | "\n" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "id": "e2a43a57", 326 | "metadata": {}, 327 | "source": [ 328 | "The commit should trigger a pipeline run. Proceed to monitor your pipeline run until completion in SageMaker Studio. " 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "id": "4d35bcef", 334 | "metadata": {}, 335 | "source": [ 336 | "If you inspect the pipeline, you will see that the XGBoost model performs better than Decision Tree. Therefore, the XGBoost model is registered in the model registry." 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "id": "1718d5a3", 342 | "metadata": {}, 343 | "source": [ 344 | "## Approve top performing model in Model registry" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "id": "593b322b", 350 | "metadata": {}, 351 | "source": [ 352 | "After the SageMaker Pipeline is complete, new trained Model will be registered in Model Registry.\n", 353 | "\n", 354 | "1) Make sure to update your desired `MODEL_VERSION`. We assume we approve the model version 1. \n", 355 | "\n", 356 | "2) As EventBridge monitors Model Registry status changes, Model status change will trigger SageMaker Projects model-deploy pipeline." 
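If you are unsure which version number to use, you can list what the pipeline registered before approving anything. A short sketch, assuming the model package group follows the `<project-name>-<project-id>` naming that the approval cell below also relies on:

```python
import boto3

sm_client = boto3.client("sagemaker")

project_name = "diabetes"
project_id = sm_client.describe_project(ProjectName=project_name)["ProjectId"]

# Assumption: packages are registered under the group "<project-name>-<project-id>"
packages = sm_client.list_model_packages(
    ModelPackageGroupName=f"{project_name}-{project_id}"
)["ModelPackageSummaryList"]

for pkg in packages:
    print(pkg["ModelPackageVersion"], pkg["ModelApprovalStatus"])
```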
357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "id": "f2eec1e2", 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "from sagemaker import get_execution_role, session\n", 367 | "import boto3\n", 368 | "\n", 369 | "role = get_execution_role()\n", 370 | "sm_client = boto3.client(\"sagemaker\")\n", 371 | "\n", 372 | "MODEL_VERSION = \"1\"\n", 373 | "SAGEMAKER_PROJECT_NAME = \"diabetes\"\n", 374 | "SAGEMAKER_PROJECT_ID = sm_client.describe_project(ProjectName=SAGEMAKER_PROJECT_NAME)[\"ProjectId\"]\n", 375 | "AWS_REGION = boto3.Session().region_name\n", 376 | "MODEL_PACKAGE_ARN = \"arn:aws:sagemaker:{AWS_REGION}:{AWS_ACCOUNT}:model-package/{SAGEMAKER_PROJECT_NAME}-{SAGEMAKER_PROJECT_ID}/{MODEL_VERSION}\".format(\n", 377 | " AWS_REGION=AWS_REGION,\n", 378 | " AWS_ACCOUNT=AWS_ACCOUNT,\n", 379 | " SAGEMAKER_PROJECT_NAME=SAGEMAKER_PROJECT_NAME,\n", 380 | " SAGEMAKER_PROJECT_ID=SAGEMAKER_PROJECT_ID,\n", 381 | " MODEL_VERSION=MODEL_VERSION,\n", 382 | ")\n", 383 | "\n", 384 | "\n", 385 | "model_package_update_response = sm_client.update_model_package(\n", 386 | " ModelPackageArn=MODEL_PACKAGE_ARN, ModelApprovalStatus=\"Approved\"\n", 387 | ")" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "id": "4d06cf5e", 393 | "metadata": {}, 394 | "source": [ 395 | "## Run predictions on model" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "id": "89d32adf", 401 | "metadata": {}, 402 | "source": [ 403 | "Wait until SageMaker Projects model-deploy pipeline has deployed the staging inference endpoint. Use the following data for inference:\n", 404 | "\n", 405 | "Example 1\n", 406 | "------------\n", 407 | "`5.0,64.0,0.0,18.0,0.0,0.0,7.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0\n", 408 | "`\n", 409 | "\n", 410 | "In summary, this is a diabetic patient that is Caucasian Female age 60-70, who has spent 5 days in the hospital under emergency care in the current encounter. Prior to this encounter, patient has spent 0 days in outpatient care, 0 days in emergency care, 7 days in inpatient care. 64 laboratory procedures have been performed on the patient. 
The patient is not using metformin, repaglinide, pioglitazone, or rosiglitazone, and the insulin prescription is steady.\n", 411 | "\n" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "id": "124746c4", 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "import json\n", 422 | "import boto3\n", 423 | "\n", 424 | "sm_runtime = boto3.client(\"runtime.sagemaker\")\n", 425 | "endpoint_name = \"diabetes-staging\"\n", 426 | "line = \"5.0,64.0,0.0,18.0,0.0,0.0,7.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0\"\n", 427 | "response = sm_runtime.invoke_endpoint(EndpointName=endpoint_name, ContentType=\"text/csv\", Body=line)\n", 428 | "result = json.loads(response[\"Body\"].read().decode())\n", 429 | "print(\"Predicted class: {}\".format(round(result)))" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "id": "1fef914f", 435 | "metadata": {}, 436 | "source": [ 437 | "Now you try:\n", 438 | "\n", 439 | "Example 2\n", 440 | "------------\n", 441 | "\n", 442 | "`3.0,19.0,3.0,19.0,0.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0\n", 443 | "`\n", 444 | "\n", 445 | "In summary, this is a diabetic patient that is Caucasian Female age 70-80, who has spent 3 days in the hospital under elective care in the current encounter. Prior to this encounter, patient has spent 0 days in outpatient care, 0 days in emergency care, 0 days in inpatient care. 19 laboratory procedures have been performed on the patient. The patient is not using metformin, repaglinide, pioglitazone, rosiglitazone, or insulin. " 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "id": "4afcc66c", 451 | "metadata": {}, 452 | "source": [ 453 | "## Cleanup" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "id": "710b9232", 459 | "metadata": {}, 460 | "source": [ 461 | "To avoid incurring future charges, clean up the created resources such as the S3 bucket, the ECR repository, and SageMaker Studio. Before deleting SageMaker Studio, make sure to delete the SageMaker model and endpoint resources and the entire SageMaker project diabetes, as well as its peripheral resources (CodePipeline pipelines and CodeCommit repositories).\n", 462 | "Finally, delete the Jupyter instance containing the notebook. 
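One possible cleanup sequence from the CLI, as a sketch that assumes the resource names used in this walkthrough (adjust to what actually exists in your account; endpoints created by the model-deploy pipeline live in CloudFormation stacks and may be better removed there):

```sh
AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)

aws sagemaker delete-endpoint --endpoint-name diabetes-staging
aws sagemaker delete-project --project-name diabetes
aws ecr delete-repository --repository-name diabetes-decision-trees --force

# Removes the bucket together with its contents
aws s3 rb "s3://sagemaker-diabetes-${AWS_ACCOUNT}" --force
```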
" 463 | ] 464 | } 465 | ], 466 | "metadata": { 467 | "instance_type": "ml.t3.medium", 468 | "kernelspec": { 469 | "display_name": "conda_python3", 470 | "language": "python", 471 | "name": "conda_python3" 472 | }, 473 | "language_info": { 474 | "codemirror_mode": { 475 | "name": "ipython", 476 | "version": 3 477 | }, 478 | "file_extension": ".py", 479 | "mimetype": "text/x-python", 480 | "name": "python", 481 | "nbconvert_exporter": "python", 482 | "pygments_lexer": "ipython3", 483 | "version": "3.8.12" 484 | } 485 | }, 486 | "nbformat": 4, 487 | "nbformat_minor": 5 488 | } 489 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/diabetes/pipeline.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed under the Apache License, Version 2.0 (the "License"). You 3 | # may not use this file except in compliance with the License. A copy of 4 | # the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "license" file accompanying this file. This file is 9 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 10 | # ANY KIND, either express or implied. See the License for the specific 11 | # language governing permissions and limitations under the License. 12 | """Example workflow pipeline script for RESVM pipeline. 13 | . -RegisterModel 14 | . 15 | Process-> Train -> Evaluate -> Condition . 16 | . 17 | . -(stop) 18 | Implements a get_pipeline(**kwargs) method. 19 | """ 20 | 21 | import os 22 | 23 | import boto3 24 | import sagemaker 25 | import sagemaker.session 26 | from sagemaker.estimator import Estimator 27 | from sagemaker.inputs import TrainingInput 28 | from sagemaker.model_metrics import MetricsSource, ModelMetrics 29 | from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor 30 | from sagemaker.sklearn.processing import SKLearnProcessor 31 | from sagemaker.workflow.condition_step import ConditionStep, JsonGet 32 | from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo 33 | from sagemaker.workflow.parameters import ParameterInteger, ParameterString 34 | from sagemaker.workflow.pipeline import Pipeline 35 | from sagemaker.workflow.properties import PropertyFile 36 | from sagemaker.workflow.step_collections import RegisterModel 37 | from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CacheConfig, TuningStep 38 | 39 | ### 40 | from sagemaker.processing import ProcessingInput, ProcessingOutput 41 | from sagemaker.dataset_definition.inputs import ( 42 | AthenaDatasetDefinition, 43 | DatasetDefinition, 44 | RedshiftDatasetDefinition, 45 | ) 46 | 47 | 48 | import time 49 | import uuid 50 | import sagemaker 51 | 52 | import os 53 | import json 54 | import boto3 55 | 56 | from sagemaker.processing import Processor 57 | from sagemaker.network import NetworkConfig 58 | 59 | from sagemaker.workflow.steps import ProcessingStep 60 | 61 | from smexperiments.experiment import Experiment 62 | from smexperiments.trial import Trial 63 | from smexperiments.trial_component import TrialComponent 64 | from smexperiments.tracker import Tracker 65 | from sagemaker.workflow.pipeline_experiment_config import PipelineExperimentConfig 66 | 67 | from sagemaker.tuner import ( 68 | ContinuousParameter, 69 | IntegerParameter, 70 | CategoricalParameter, 71 | HyperparameterTuner, 72 | WarmStartConfig, 73 | WarmStartTypes, 74 | ) 75 | 76 | 77 | BASE_DIR = os.path.dirname(os.path.realpath(__file__)) 78 
80 | def get_session(region, default_bucket): 81 | """Gets the sagemaker session based on the region. 82 | Args: 83 | region: the aws region to start the session 84 | default_bucket: the bucket to use for storing the artifacts 85 | Returns: 86 | `sagemaker.session.Session` instance 87 | """ 88 | 89 | boto_session = boto3.Session(region_name=region) 90 | 91 | sagemaker_client = boto_session.client("sagemaker") 92 | runtime_client = boto_session.client("sagemaker-runtime") 93 | return sagemaker.session.Session( 94 | boto_session=boto_session, 95 | sagemaker_client=sagemaker_client, 96 | sagemaker_runtime_client=runtime_client, 97 | default_bucket=default_bucket, 98 | ) 99 | 100 | 101 | def get_pipeline( 102 | region, 103 | sagemaker_project_arn=None, 104 | role=None, 105 | default_bucket=None, 106 | model_package_group_name="", # Choose any name 107 | pipeline_name="", # You can find your pipeline name in the Studio UI (project -> Pipelines -> name) 108 | base_job_prefix="", # Choose any name 109 | ): 110 | """Gets a SageMaker ML Pipeline instance working with the diabetes data. 111 | Args: 112 | region: AWS region to create and run the pipeline. 113 | role: IAM role to create and run steps and pipeline. 114 | default_bucket: the bucket to use for storing the artifacts 115 | Returns: 116 | an instance of a pipeline 117 | """ 118 | sagemaker_session = get_session(region, default_bucket) 119 | if role is None: 120 | role = sagemaker.session.get_execution_role(sagemaker_session) 121 | 122 | # Parameters for pipeline execution 123 | processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) 124 | processing_instance_type = ParameterString( 125 | name="ProcessingInstanceType", default_value="ml.m5.2xlarge" 126 | ) 127 | training_instance_type = ParameterString( 128 | name="TrainingInstanceType", default_value="ml.m5.xlarge" 129 | ) 130 | model_approval_status = ParameterString( 131 | name="ModelApprovalStatus", 132 | default_value="PendingManualApproval", # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval. 133 | ) 134 | input_data = ParameterString( 135 | name="InputDataUrl", 136 | default_value="", # Change this to point to the s3 location of your raw input data. 137 | ) 138 | 139 | # Sagemaker session 140 | sess = sagemaker_session 141 | 142 | # You can configure this with your own bucket name, e.g. 143 | # bucket = "my-bucket" 144 | bucket = sess.default_bucket() 145 | 146 | print(f"Data Wrangler export storage bucket: {bucket}") 147 | 148 | # unique flow export ID 149 | flow_export_id = f"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}" 150 | flow_export_name = f"flow-{flow_export_id}" 151 | 152 | # Output name is auto-generated from the select node's ID + output name from the flow file.
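    # A sketch of how this value can be derived from the flow file itself,
    # assuming the export (select) node is the last entry in diabetes.flow:
    #
    #     with open("diabetes.flow") as f:
    #         last_node = json.load(f)["nodes"][-1]
    #     output_name = f"{last_node['node_id']}.{last_node['outputs'][0]['name']}"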
153 | output_name = "d593101e-278b-4330-9779-b6e02fbeb99e.default" 154 | 155 | s3_output_prefix = f"export-{flow_export_name}/output" 156 | s3_output_path = f"s3://{bucket}/{s3_output_prefix}" 157 | print(f"Flow S3 export result path: {s3_output_path}") 158 | 159 | processing_job_output = ProcessingOutput( 160 | output_name=output_name, 161 | source="/opt/ml/processing/output", 162 | destination=s3_output_path, 163 | s3_upload_mode="EndOfJob", 164 | ) 165 | 166 | # name of the flow file which should exist in the current notebook working directory 167 | flow_file_name = "diabetes.flow" 168 | 169 | # Load .flow file from current notebook working directory 170 | #!echo "Loading flow file from current notebook working directory: $PWD" 171 | 172 | with open(flow_file_name) as f: 173 | flow = json.load(f) 174 | 175 | # Upload flow to S3 176 | s3_client = boto3.client("s3") 177 | s3_client.upload_file( 178 | flow_file_name, 179 | bucket, 180 | f"data_wrangler_flows/{flow_export_name}.flow", 181 | ExtraArgs={"ServerSideEncryption": "aws:kms"}, 182 | ) 183 | 184 | flow_s3_uri = f"s3://{bucket}/data_wrangler_flows/{flow_export_name}.flow" 185 | 186 | print(f"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}") 187 | 188 | flow_input = ProcessingInput( 189 | source=flow_s3_uri, 190 | destination="/opt/ml/processing/flow", 191 | input_name="flow", 192 | s3_data_type="S3Prefix", 193 | s3_input_mode="File", 194 | s3_data_distribution_type="FullyReplicated", 195 | ) 196 | 197 | # IAM role for executing the processing job. 198 | iam_role = role 199 | 200 | # Unique processing job name. Give a unique name every time you re-execute processing jobs 201 | processing_job_name = f"data-wrangler-flow-processing-{flow_export_id}" 202 | 203 | # Data Wrangler Container URL. 204 | container_uri = sagemaker.image_uris.retrieve( 205 | framework="data-wrangler", # the Data Wrangler processing container image 206 | region=region, 207 | ) 208 | 209 | # Processing Job Instance count and instance type. 210 | instance_count = 2 211 | instance_type = "ml.m5.4xlarge" 212 | 213 | # Size in GB of the EBS volume to use for storing data during processing 214 | volume_size_in_gb = 30 215 | 216 | # Content type for each output. Data Wrangler supports CSV (default) and Parquet.
217 | output_content_type = "CSV" 218 | 219 | # Network Isolation mode; default is off 220 | enable_network_isolation = False 221 | 222 | # List of tags to be passed to the processing job 223 | user_tags = [] 224 | 225 | # Output configuration used as processing job container arguments 226 | output_config = {output_name: {"content_type": output_content_type}} 227 | 228 | # KMS key for per object encryption; default is None 229 | kms_key = None 230 | 231 | processor = Processor( 232 | role=iam_role, 233 | image_uri=container_uri, 234 | instance_count=instance_count, 235 | instance_type=instance_type, 236 | volume_size_in_gb=volume_size_in_gb, 237 | network_config=NetworkConfig(enable_network_isolation=enable_network_isolation), 238 | sagemaker_session=sess, 239 | output_kms_key=kms_key, 240 | tags=user_tags, 241 | ) 242 | 243 | data_wrangler_step = ProcessingStep( 244 | name="DataWranglerProcess", 245 | processor=processor, 246 | inputs=[flow_input], 247 | outputs=[processing_job_output], 248 | job_arguments=[f"--output-config '{json.dumps(output_config)}'"], 249 | ) 250 | 251 | # Processing step for feature engineering 252 | # this processor does not have awswrangler installed 253 | sklearn_processor = SKLearnProcessor( 254 | framework_version="0.23-1", 255 | instance_type=processing_instance_type, 256 | instance_count=processing_instance_count, 257 | base_job_name=f"{base_job_prefix}/sklearn-diabetes-preprocess", # choose any name 258 | sagemaker_session=sagemaker_session, 259 | role=role, 260 | ) 261 | 262 | step_process = ProcessingStep( 263 | name="Preprocess", # choose any name 264 | processor=sklearn_processor, 265 | inputs=[ 266 | ProcessingInput( 267 | source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[ 268 | output_name 269 | ].S3Output.S3Uri, 270 | destination="/opt/ml/processing/data/raw-data-dir", 271 | ) 272 | ], 273 | outputs=[ 274 | ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), 275 | ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), 276 | ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), 277 | ], 278 | code=os.path.join(BASE_DIR, "preprocess.py"), 279 | job_arguments=[ 280 | "--input-data", 281 | data_wrangler_step.properties.ProcessingOutputConfig.Outputs[ 282 | output_name 283 | ].S3Output.S3Uri, 284 | ], 285 | ) 286 | 287 | # Training step for generating model artifacts 288 | model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/diabetesTrain" 289 | model_bucket_key = f"{sagemaker_session.default_bucket()}/{base_job_prefix}/diabetesTrain" 290 | cache_config = CacheConfig(enable_caching=True, expire_after="30d") 291 | 292 | xgb_image_uri = sagemaker.image_uris.retrieve( 293 | framework="xgboost", # we are using the Sagemaker built in xgboost algorithm 294 | region=region, 295 | version="1.0-1", 296 | py_version="py3", 297 | instance_type=training_instance_type, 298 | ) 299 | xgb_train = Estimator( 300 | image_uri=xgb_image_uri, 301 | instance_type=training_instance_type, 302 | instance_count=1, 303 | output_path=model_path, 304 | base_job_name=f"{base_job_prefix}/diabetes-xgb-train", 305 | sagemaker_session=sagemaker_session, 306 | role=role, 307 | ) 308 | xgb_train.set_hyperparameters( 309 | num_round=50, 310 | objective="binary:logistic", 311 | ) 312 | 313 | xgb_train.set_hyperparameters(grow_policy="lossguide") 314 | 315 | xgb_objective_metric_name = "validation:auc" 316 | xgb_hyperparameter_ranges = { 317 | "max_depth": IntegerParameter(5, 10, 
scaling_type="Auto"), 318 | "min_child_weight": IntegerParameter(5, 10, scaling_type="Auto"), 319 | "eta": ContinuousParameter(0.1, 0.9, scaling_type="Auto"), 320 | "gamma": IntegerParameter(4, 9, scaling_type="Auto"), 321 | "subsample": ContinuousParameter(0.7, 0.9, scaling_type="Auto"), 322 | } 323 | 324 | xgb_tuner_log = HyperparameterTuner( 325 | xgb_train, 326 | xgb_objective_metric_name, 327 | xgb_hyperparameter_ranges, 328 | max_jobs=5, 329 | max_parallel_jobs=5, 330 | strategy="Random", 331 | objective_type="Maximize", 332 | ) 333 | 334 | xgb_step_tuning = TuningStep( 335 | name="XGBHPTune", 336 | tuner=xgb_tuner_log, 337 | inputs={ 338 | "train": TrainingInput( 339 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ 340 | "train" 341 | ].S3Output.S3Uri, 342 | content_type="text/csv", 343 | ), 344 | "validation": TrainingInput( 345 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ 346 | "validation" 347 | ].S3Output.S3Uri, 348 | content_type="text/csv", 349 | ), 350 | }, 351 | cache_config=cache_config, 352 | ) 353 | 354 | dtree_image_uri = sagemaker_session.sagemaker_client.describe_image_version( 355 | ImageName="diabetes-dtree" 356 | )["ContainerImage"] 357 | 358 | dtree_train = Estimator( 359 | image_uri=dtree_image_uri, 360 | role=role, 361 | instance_count=1, 362 | instance_type=training_instance_type, 363 | base_job_name=f"{base_job_prefix}/diabetes-dtree-train", 364 | output_path=model_path, 365 | sagemaker_session=sagemaker_session, 366 | ) 367 | 368 | dtree_objective_metric_name = "validation:auc" 369 | dtree_metric_definitions = [{"Name": "validation:auc", "Regex": "auc:(\S+)"}] 370 | 371 | dtree_hyperparameter_ranges = { 372 | "max_depth": IntegerParameter(5, 10, scaling_type="Linear"), 373 | "max_leaf_nodes": IntegerParameter(2, 10, scaling_type="Linear"), 374 | } 375 | 376 | dtree_tuner_log = HyperparameterTuner( 377 | dtree_train, 378 | dtree_objective_metric_name, 379 | dtree_hyperparameter_ranges, 380 | dtree_metric_definitions, 381 | max_jobs=5, 382 | max_parallel_jobs=5, 383 | strategy="Random", 384 | objective_type="Maximize", 385 | ) 386 | 387 | dtree_step_tuning = TuningStep( 388 | name="DTreeHPTune", 389 | tuner=dtree_tuner_log, 390 | inputs={ 391 | "training": TrainingInput( 392 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ 393 | "train" 394 | ].S3Output.S3Uri, 395 | content_type="text/csv", 396 | ), 397 | "validation": TrainingInput( 398 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ 399 | "validation" 400 | ].S3Output.S3Uri, 401 | content_type="text/csv", 402 | ), 403 | }, 404 | cache_config=cache_config, 405 | ) 406 | 407 | dtree_script_eval = ScriptProcessor( 408 | image_uri=dtree_image_uri, 409 | command=["python3"], 410 | instance_type=processing_instance_type, 411 | instance_count=1, 412 | base_job_name=f"{base_job_prefix}/script-dtree-eval", 413 | sagemaker_session=sagemaker_session, 414 | role=role, 415 | ) 416 | 417 | dtree_evaluation_report = PropertyFile( 418 | name="EvaluationReportDTree", 419 | output_name="dtree_evaluation", 420 | path="dtree_evaluation.json", 421 | ) 422 | 423 | dtree_step_eval = ProcessingStep( 424 | name="DTreeEval", 425 | processor=dtree_script_eval, 426 | inputs=[ 427 | ProcessingInput( 428 | source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), 429 | destination="/opt/ml/processing/model", 430 | ), 431 | ProcessingInput( 432 | source=step_process.properties.ProcessingOutputConfig.Outputs[ 433 | "test" 434 | 
].S3Output.S3Uri, 435 | destination="/opt/ml/processing/test", 436 | ), 437 | ], 438 | outputs=[ 439 | ProcessingOutput( 440 | output_name="dtree_evaluation", source="/opt/ml/processing/evaluation" 441 | ), 442 | ], 443 | code=os.path.join(BASE_DIR, "dtree_evaluate.py"), 444 | property_files=[dtree_evaluation_report], 445 | ) 446 | 447 | xgb_script_eval = ScriptProcessor( 448 | image_uri=xgb_image_uri, 449 | command=["python3"], 450 | instance_type=processing_instance_type, 451 | instance_count=1, 452 | base_job_name=f"{base_job_prefix}/script-xgb-eval", 453 | sagemaker_session=sagemaker_session, 454 | role=role, 455 | ) 456 | 457 | xgb_evaluation_report = PropertyFile( 458 | name="EvaluationReportXGBoost", 459 | output_name="xgb_evaluation", 460 | path="xgb_evaluation.json", 461 | ) 462 | 463 | xgb_step_eval = ProcessingStep( 464 | name="XGBEval", 465 | processor=xgb_script_eval, 466 | inputs=[ 467 | ProcessingInput( 468 | source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), 469 | destination="/opt/ml/processing/model", 470 | ), 471 | ProcessingInput( 472 | source=step_process.properties.ProcessingOutputConfig.Outputs[ 473 | "test" 474 | ].S3Output.S3Uri, 475 | destination="/opt/ml/processing/test", 476 | ), 477 | ], 478 | outputs=[ 479 | ProcessingOutput(output_name="xgb_evaluation", source="/opt/ml/processing/evaluation"), 480 | ], 481 | code=os.path.join(BASE_DIR, "xgb_evaluate.py"), 482 | property_files=[xgb_evaluation_report], 483 | ) 484 | 485 | xgb_model_metrics = ModelMetrics( 486 | model_statistics=MetricsSource( 487 | s3_uri="{}/xgb_evaluation.json".format( 488 | xgb_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] 489 | ), 490 | content_type="application/json", 491 | ) 492 | ) 493 | 494 | dtree_model_metrics = ModelMetrics( 495 | model_statistics=MetricsSource( 496 | s3_uri="{}/dtree_evaluation.json".format( 497 | dtree_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"][ 498 | "S3Uri" 499 | ] 500 | ), 501 | content_type="application/json", 502 | ) 503 | ) 504 | 505 | xgb_eval_metrics = JsonGet( 506 | step=xgb_step_eval, 507 | property_file=xgb_evaluation_report, 508 | json_path="regression_metrics.roc.value", # This should follow the structure of your report_dict defined in the evaluate.py file. 509 | ) 510 | 511 | dtree_eval_metrics = JsonGet( 512 | step=dtree_step_eval, 513 | property_file=dtree_evaluation_report, 514 | json_path="regression_metrics.roc.value", # This should follow the structure of your report_dict defined in the evaluate.py file. 
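        # For reference, both evaluation scripts are expected to write a report
        # whose JSON shape matches the json_path above; a sketch with a made-up
        # metric value:
        #
        #     {"regression_metrics": {"roc": {"value": 0.87}}}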
515 | ) 516 | 517 | # Register model step that will be conditionally executed 518 | dtree_step_register = RegisterModel( 519 | name="DTreeReg", 520 | estimator=dtree_train, 521 | model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), 522 | content_types=["text/csv"], 523 | response_types=["text/csv"], 524 | inference_instances=["ml.t2.medium", "ml.m5.large"], 525 | transform_instances=["ml.m5.large"], 526 | model_package_group_name=model_package_group_name, 527 | approval_status=model_approval_status, 528 | model_metrics=dtree_model_metrics, 529 | ) 530 | 531 | # Register model step that will be conditionally executed 532 | xgb_step_register = RegisterModel( 533 | name="XGBReg", 534 | estimator=xgb_train, 535 | model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), 536 | content_types=["text/csv"], 537 | response_types=["text/csv"], 538 | inference_instances=["ml.t2.medium", "ml.m5.large"], 539 | transform_instances=["ml.m5.large"], 540 | model_package_group_name=model_package_group_name, 541 | approval_status=model_approval_status, 542 | model_metrics=xgb_model_metrics, 543 | ) 544 | 545 | # Condition step for evaluating model quality and branching execution 546 | cond_lte = ConditionGreaterThanOrEqualTo( # You can change the condition here 547 | left=JsonGet( 548 | step=dtree_step_eval, 549 | property_file=dtree_evaluation_report, 550 | json_path="regression_metrics.roc.value", # This should follow the structure of your report_dict defined in the evaluate.py file. 551 | ), 552 | right=JsonGet( 553 | step=xgb_step_eval, 554 | property_file=xgb_evaluation_report, 555 | json_path="regression_metrics.roc.value" 556 | ), # You can change the threshold here 557 | ) 558 | 559 | step_cond = ConditionStep( 560 | name="AccuracyCond", 561 | conditions=[cond_lte], 562 | if_steps=[dtree_step_register], 563 | else_steps=[xgb_step_register], 564 | ) 565 | create_date = time.strftime("%Y-%m-%d-%H-%M-%S") 566 | 567 | # Pipeline instance 568 | pipeline = Pipeline( 569 | name=pipeline_name, 570 | parameters=[ 571 | processing_instance_type, 572 | processing_instance_count, 573 | training_instance_type, 574 | model_approval_status, 575 | input_data 576 | ], 577 | pipeline_experiment_config=PipelineExperimentConfig( 578 | pipeline_name + "-" + create_date, "diabetes-{}".format(create_date) 579 | ), 580 | steps=[ 581 | data_wrangler_step, 582 | step_process, 583 | dtree_step_tuning, 584 | xgb_step_tuning, 585 | dtree_step_eval, 586 | xgb_step_eval, 587 | step_cond, 588 | ], 589 | sagemaker_session=sagemaker_session, 590 | ) 591 | return pipeline 592 | -------------------------------------------------------------------------------- /sagemaker-pipeline/diabetes.flow: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "version": 1, 4 | "disable_limits": false, 5 | "instance_type": "ml.m5.4xlarge" 6 | }, 7 | "nodes": [ 8 | { 9 | "node_id": "0b3a3943-97b0-49e3-9894-830443f522ca", 10 | "type": "SOURCE", 11 | "operator": "sagemaker.s3_source_0.1", 12 | "parameters": { 13 | "dataset_definition": { 14 | "__typename": "S3CreateDatasetDefinitionOutput", 15 | "datasetSourceType": "S3", 16 | "name": "diabetic_data.csv", 17 | "description": null, 18 | "s3ExecutionContext": { 19 | "__typename": "S3ExecutionContext", 20 | "s3Uri": "s3://sagemaker-diabetes-AWS_ACCOUNT/diabetic_data.csv", 21 | "s3ContentType": "csv", 22 | "s3HasHeader": true, 23 | "s3FieldDelimiter": ",", 24 | "s3DirIncludesNested": false, 
25 | "s3AddsFilenameColumn": false 26 | } 27 | } 28 | }, 29 | "inputs": [], 30 | "outputs": [ 31 | { 32 | "name": "default" 33 | } 34 | ] 35 | }, 36 | { 37 | "node_id": "b3d30123-1423-4472-a251-b9ff24d9d381", 38 | "type": "TRANSFORM", 39 | "operator": "sagemaker.spark.infer_and_cast_type_0.1", 40 | "parameters": {}, 41 | "trained_parameters": { 42 | "schema": { 43 | "encounter_id": "long", 44 | "patient_nbr": "long", 45 | "race": "string", 46 | "gender": "string", 47 | "age": "string", 48 | "weight": "string", 49 | "admission_type_id": "long", 50 | "discharge_disposition_id": "long", 51 | "admission_source_id": "long", 52 | "time_in_hospital": "long", 53 | "payer_code": "string", 54 | "medical_specialty": "string", 55 | "num_lab_procedures": "long", 56 | "num_procedures": "long", 57 | "num_medications": "long", 58 | "number_outpatient": "long", 59 | "number_emergency": "long", 60 | "number_inpatient": "long", 61 | "diag_1": "long", 62 | "diag_2": "long", 63 | "diag_3": "long", 64 | "number_diagnoses": "long", 65 | "max_glu_serum": "string", 66 | "A1Cresult": "string", 67 | "metformin": "string", 68 | "repaglinide": "string", 69 | "nateglinide": "string", 70 | "chlorpropamide": "string", 71 | "glimepiride": "string", 72 | "acetohexamide": "string", 73 | "glipizide": "string", 74 | "glyburide": "string", 75 | "tolbutamide": "string", 76 | "pioglitazone": "string", 77 | "rosiglitazone": "string", 78 | "acarbose": "string", 79 | "miglitol": "string", 80 | "troglitazone": "string", 81 | "tolazamide": "string", 82 | "examide": "string", 83 | "citoglipton": "string", 84 | "insulin": "string", 85 | "glyburide-metformin": "string", 86 | "glipizide-metformin": "string", 87 | "glimepiride-pioglitazone": "string", 88 | "metformin-rosiglitazone": "string", 89 | "metformin-pioglitazone": "string", 90 | "change": "string", 91 | "diabetesMed": "string", 92 | "readmitted": "string" 93 | } 94 | }, 95 | "inputs": [ 96 | { 97 | "name": "default", 98 | "node_id": "0b3a3943-97b0-49e3-9894-830443f522ca", 99 | "output_name": "default" 100 | } 101 | ], 102 | "outputs": [ 103 | { 104 | "name": "default" 105 | } 106 | ] 107 | }, 108 | { 109 | "node_id": "a0fcc4ad-932f-4c61-b04f-85165ec49f54", 110 | "type": "TRANSFORM", 111 | "operator": "sagemaker.spark.manage_columns_0.1", 112 | "parameters": { 113 | "operator": "Move column", 114 | "move_column_parameters": { 115 | "move_type": "Move to start", 116 | "move_to_start_parameters": { 117 | "column_to_move": "readmitted" 118 | } 119 | }, 120 | "drop_column_parameters": {} 121 | }, 122 | "inputs": [ 123 | { 124 | "name": "df", 125 | "node_id": "b3d30123-1423-4472-a251-b9ff24d9d381", 126 | "output_name": "default" 127 | } 128 | ], 129 | "outputs": [ 130 | { 131 | "name": "default" 132 | } 133 | ] 134 | }, 135 | { 136 | "node_id": "9c491942-6270-410a-8734-dafaa3bee672", 137 | "type": "TRANSFORM", 138 | "operator": "sagemaker.spark.custom_code_0.1", 139 | "parameters": { 140 | "operator": "Python (User-Defined Function)", 141 | "udf_parameters": { 142 | "return_type": "float", 143 | "udf_mode": "Pandas", 144 | "input_col": "readmitted", 145 | "output_col": "readmitted", 146 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n #series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1) else ('circulatory' if int(x) in range(390, 460) or int(x) == 785) else ('test'))\n series = series.apply(lambda x : 0.0 if (str(x) == 'NO') else (1.0)) 
\n return series\n " 147 | }, 148 | "pyspark_parameters": {}, 149 | "name": "readmitted" 150 | }, 151 | "inputs": [ 152 | { 153 | "name": "df", 154 | "node_id": "a0fcc4ad-932f-4c61-b04f-85165ec49f54", 155 | "output_name": "default" 156 | } 157 | ], 158 | "outputs": [ 159 | { 160 | "name": "default" 161 | } 162 | ] 163 | }, 164 | { 165 | "node_id": "e589c34a-c6ec-4d8c-9549-87550098951a", 166 | "type": "TRANSFORM", 167 | "operator": "sagemaker.spark.manage_columns_0.1", 168 | "parameters": { 169 | "operator": "Drop column", 170 | "drop_column_parameters": { 171 | "column_to_drop": [ 172 | "payer_code", 173 | "encounter_id", 174 | "patient_nbr", 175 | "weight", 176 | "medical_specialty", 177 | "acarbose", 178 | "metformin-pioglitazone", 179 | "acetohexamide", 180 | "metformin-rosiglitazone", 181 | "glimepiride", 182 | "glimepiride-pioglitazone", 183 | "glipizide", 184 | "glyburide-metformin", 185 | "examide", 186 | "troglitazone", 187 | "miglitol", 188 | "citoglipton", 189 | "glipizide-metformin", 190 | "chlorpropamide", 191 | "tolbutamide", 192 | "glyburide", 193 | "tolazamide", 194 | "nateglinide" 195 | ] 196 | } 197 | }, 198 | "inputs": [ 199 | { 200 | "name": "df", 201 | "node_id": "9c491942-6270-410a-8734-dafaa3bee672", 202 | "output_name": "default" 203 | } 204 | ], 205 | "outputs": [ 206 | { 207 | "name": "default" 208 | } 209 | ] 210 | }, 211 | { 212 | "node_id": "4ea28cf4-b062-494c-a49f-6bb840d9128b", 213 | "type": "TRANSFORM", 214 | "operator": "sagemaker.spark.handle_missing_0.1", 215 | "parameters": { 216 | "operator": "Fill missing", 217 | "fill_missing_parameters": { 218 | "input_column": [ 219 | "diag_1", 220 | "diag_2", 221 | "diag_3" 222 | ], 223 | "fill_value": "0" 224 | }, 225 | "impute_parameters": { 226 | "column_type": "Numeric", 227 | "numeric_parameters": { 228 | "strategy": "Approximate Median" 229 | } 230 | } 231 | }, 232 | "inputs": [ 233 | { 234 | "name": "df", 235 | "node_id": "e589c34a-c6ec-4d8c-9549-87550098951a", 236 | "output_name": "default" 237 | } 238 | ], 239 | "outputs": [ 240 | { 241 | "name": "default" 242 | } 243 | ] 244 | }, 245 | { 246 | "node_id": "13fa0709-b2a5-4e92-9f72-eb247015018d", 247 | "type": "TRANSFORM", 248 | "operator": "sagemaker.spark.search_and_edit_0.1", 249 | "parameters": { 250 | "operator": "Find and replace substring", 251 | "find_and_replace_substring_parameters": { 252 | "input_column": [ 253 | "race" 254 | ], 255 | "pattern": "\\?", 256 | "replacement": "Unknown" 257 | } 258 | }, 259 | "inputs": [ 260 | { 261 | "name": "df", 262 | "node_id": "4ea28cf4-b062-494c-a49f-6bb840d9128b", 263 | "output_name": "default" 264 | } 265 | ], 266 | "outputs": [ 267 | { 268 | "name": "default" 269 | } 270 | ] 271 | }, 272 | { 273 | "node_id": "e440e602-6db8-478d-a99f-82cba34c3cf3", 274 | "type": "TRANSFORM", 275 | "operator": "sagemaker.spark.custom_code_0.1", 276 | "parameters": { 277 | "operator": "Python (User-Defined Function)", 278 | "udf_parameters": { 279 | "return_type": "string", 280 | "udf_mode": "Pandas", 281 | "input_col": "diag_1", 282 | "output_col": "diag_1", 283 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n #series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1) else ('circulatory' if int(x) in range(390, 460) or int(x) == 785) else ('test'))\n series = series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1 or int(x) == 0 ) \n else ('circulatory' if 
int(x) in range(390, 460) or int(x) == 785\n else ('respiratory' if int(float(x)) in range(460, 520) or int(float(x)) == 786\n else ('digestive' if int(float(x)) in range(520, 580) or int(float(x)) == 787\n else ('diabetes' if int(float(x)) == 250\n else ('injury' if int(float(x)) in range(800, 1000)\n else ('musculoskeletal' if int(float(x)) in range(710, 740)\n else ('genitourinary' if int(float(x)) in range(580, 630) or int(float(x)) == 788\n else ('neoplasms' if int(float(x)) in range(140, 240)\n else ('pregnecy' if int(float(x)) in range(630, 680)\n else 'other'))))))))))\n return series\n \n \"\"\" The following function is applied over batches of the input. The Series that it outputs must be the same length as the input Series.\n\n Example:\n\n def lowercase(series: pd.Series) -> pd.Series:\n return series.str.lower()\n \"\"\"" 284 | }, 285 | "pyspark_parameters": {}, 286 | "name": "diag-1" 287 | }, 288 | "inputs": [ 289 | { 290 | "name": "df", 291 | "node_id": "13fa0709-b2a5-4e92-9f72-eb247015018d", 292 | "output_name": "default" 293 | } 294 | ], 295 | "outputs": [ 296 | { 297 | "name": "default" 298 | } 299 | ] 300 | }, 301 | { 302 | "node_id": "6b5b607a-03b9-4133-8ac8-12b2540e28ab", 303 | "type": "TRANSFORM", 304 | "operator": "sagemaker.spark.custom_code_0.1", 305 | "parameters": { 306 | "operator": "Python (User-Defined Function)", 307 | "udf_parameters": { 308 | "return_type": "string", 309 | "udf_mode": "Pandas", 310 | "input_col": "diag_2", 311 | "output_col": "diag_2", 312 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n #series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1) else ('circulatory' if int(x) in range(390, 460) or int(x) == 785) else ('test'))\n series = series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1 or int(x) == 0 ) \n else ('circulatory' if int(x) in range(390, 460) or int(x) == 785\n else ('respiratory' if int(float(x)) in range(460, 520) or int(float(x)) == 786\n else ('digestive' if int(float(x)) in range(520, 580) or int(float(x)) == 787\n else ('diabetes' if int(float(x)) == 250\n else ('injury' if int(float(x)) in range(800, 1000)\n else ('musculoskeletal' if int(float(x)) in range(710, 740)\n else ('genitourinary' if int(float(x)) in range(580, 630) or int(float(x)) == 788\n else ('neoplasms' if int(float(x)) in range(140, 240)\n else ('pregnecy' if int(float(x)) in range(630, 680)\n else 'other'))))))))))\n return series\n" 313 | }, 314 | "pyspark_parameters": {}, 315 | "name": "diag-2" 316 | }, 317 | "inputs": [ 318 | { 319 | "name": "df", 320 | "node_id": "e440e602-6db8-478d-a99f-82cba34c3cf3", 321 | "output_name": "default" 322 | } 323 | ], 324 | "outputs": [ 325 | { 326 | "name": "default" 327 | } 328 | ] 329 | }, 330 | { 331 | "node_id": "5e825fe5-8545-4746-b27a-ea05971880cd", 332 | "type": "TRANSFORM", 333 | "operator": "sagemaker.spark.custom_code_0.1", 334 | "parameters": { 335 | "operator": "Python (User-Defined Function)", 336 | "udf_parameters": { 337 | "return_type": "string", 338 | "udf_mode": "Pandas", 339 | "input_col": "diag_3", 340 | "output_col": "diag_3", 341 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n #series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1) else ('circulatory' if int(x) in range(390, 
460) or int(x) == 785) else ('test'))\n series = series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1 or int(x) == 0 ) \n else ('circulatory' if int(x) in range(390, 460) or int(x) == 785\n else ('respiratory' if int(float(x)) in range(460, 520) or int(float(x)) == 786\n else ('digestive' if int(float(x)) in range(520, 580) or int(float(x)) == 787\n else ('diabetes' if int(float(x)) == 250\n else ('injury' if int(float(x)) in range(800, 1000)\n else ('musculoskeletal' if int(float(x)) in range(710, 740)\n else ('genitourinary' if int(float(x)) in range(580, 630) or int(float(x)) == 788\n else ('neoplasms' if int(float(x)) in range(140, 240)\n else ('pregnecy' if int(float(x)) in range(630, 680)\n else 'other'))))))))))\n return series\n" 342 | }, 343 | "pyspark_parameters": {}, 344 | "name": "diag-3" 345 | }, 346 | "inputs": [ 347 | { 348 | "name": "df", 349 | "node_id": "6b5b607a-03b9-4133-8ac8-12b2540e28ab", 350 | "output_name": "default" 351 | } 352 | ], 353 | "outputs": [ 354 | { 355 | "name": "default" 356 | } 357 | ] 358 | }, 359 | { 360 | "node_id": "c413d501-02e3-4817-b10a-0ac6faf7d41a", 361 | "type": "TRANSFORM", 362 | "operator": "sagemaker.spark.custom_code_0.1", 363 | "parameters": { 364 | "operator": "Python (User-Defined Function)", 365 | "udf_parameters": { 366 | "return_type": "string", 367 | "udf_mode": "Pandas", 368 | "input_col": "admission_type_id", 369 | "output_col": "admission_type_id", 370 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n series = series.apply(lambda x : 'Unknown' if (int(x) in [5,6,8] ) else ('Emergency' if int(x) == 1 else ('Urgent' if int(x) == 2 else ('Elective' if int(x) == 3 else ('Newborn' if int(x) == 4 else ('TraumaCenter'))))))\n return series\n" 371 | }, 372 | "pyspark_parameters": {}, 373 | "name": "admission-type-id" 374 | }, 375 | "inputs": [ 376 | { 377 | "name": "df", 378 | "node_id": "5e825fe5-8545-4746-b27a-ea05971880cd", 379 | "output_name": "default" 380 | } 381 | ], 382 | "outputs": [ 383 | { 384 | "name": "default" 385 | } 386 | ] 387 | }, 388 | { 389 | "node_id": "5c7bf83d-6a99-4b0e-b3e2-a6a0d3d30f05", 390 | "type": "TRANSFORM", 391 | "operator": "sagemaker.spark.custom_code_0.1", 392 | "parameters": { 393 | "operator": "Python (User-Defined Function)", 394 | "udf_parameters": { 395 | "return_type": "string", 396 | "udf_mode": "Pandas", 397 | "input_col": "discharge_disposition_id", 398 | "output_col": "discharge_disposition_id", 399 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n series = series.apply(lambda x : 'Discharged' if (int(x) in [1,2,3,4,5,6,8,10,15,16,17,22,23,24,27,28,29,30]) else ('LeftAMA' if int(x) == 7 else ('InPatient' if int(x) == 9 else ('OutPatient' if int(x) == 12 else ('Expired' if int(x) in [11,19,20,21] else ('Hospice' if int(x) in [13,14] else ('Unknown')))))))\n\n return series" 400 | }, 401 | "pyspark_parameters": {}, 402 | "name": "discharge-disposition-id" 403 | }, 404 | "inputs": [ 405 | { 406 | "name": "df", 407 | "node_id": "c413d501-02e3-4817-b10a-0ac6faf7d41a", 408 | "output_name": "default" 409 | } 410 | ], 411 | "outputs": [ 412 | { 413 | "name": "default" 414 | } 415 | ] 416 | }, 417 | { 418 | "node_id": "b6d7bd2b-611c-4969-80e5-a2c9dfea4d78", 419 | "type": "TRANSFORM", 420 | "operator": "sagemaker.spark.custom_code_0.1", 421 | 
"parameters": { 422 | "operator": "Python (User-Defined Function)", 423 | "udf_parameters": { 424 | "return_type": "string", 425 | "udf_mode": "Pandas", 426 | "input_col": "admission_source_id", 427 | "output_col": "admission_source_id", 428 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n series = series.apply(lambda x : 'Referral' if (int(x) in [1,2,3]) else ('Transfer' if int(x) in [4,5,6,10,18,19,22,25,26] else ('Emergency' if int(x) == 7 else ('Court' if int(x) == 8 else ('Unknown' if int(x) in [9,15,17,20,21] else ('NormalDelivery' if int(x) == 11 else ('AbnormalDelivery' if int(x) in [12,13,14] else ('BornInside' if int(x) == 23 else ('BornOutside')))))))))\n return series" 429 | }, 430 | "pyspark_parameters": {}, 431 | "name": "admission-source-id" 432 | }, 433 | "inputs": [ 434 | { 435 | "name": "df", 436 | "node_id": "5c7bf83d-6a99-4b0e-b3e2-a6a0d3d30f05", 437 | "output_name": "default" 438 | } 439 | ], 440 | "outputs": [ 441 | { 442 | "name": "default" 443 | } 444 | ] 445 | }, 446 | { 447 | "node_id": "48aab63d-84d2-4eb4-aed7-23210ebc3ab9", 448 | "type": "TRANSFORM", 449 | "operator": "sagemaker.spark.manage_rows_0.1", 450 | "parameters": { 451 | "operator": "Drop duplicates", 452 | "drop_duplicates_parameters": {}, 453 | "sort_parameters": { 454 | "order": "Ascending" 455 | } 456 | }, 457 | "inputs": [ 458 | { 459 | "name": "df", 460 | "node_id": "b6d7bd2b-611c-4969-80e5-a2c9dfea4d78", 461 | "output_name": "default" 462 | } 463 | ], 464 | "outputs": [ 465 | { 466 | "name": "default" 467 | } 468 | ] 469 | }, 470 | { 471 | "node_id": "b31d277f-ecf4-48bc-bddd-fc19e8b30254", 472 | "type": "TRANSFORM", 473 | "operator": "sagemaker.spark.balance_data_0.1", 474 | "parameters": { 475 | "operator": "SMOTE", 476 | "ratio": 1, 477 | "smote_params": { 478 | "num_neighbors": 10 479 | }, 480 | "target_column": "readmitted" 481 | }, 482 | "inputs": [ 483 | { 484 | "name": "df", 485 | "node_id": "48aab63d-84d2-4eb4-aed7-23210ebc3ab9", 486 | "output_name": "default" 487 | } 488 | ], 489 | "outputs": [ 490 | { 491 | "name": "default" 492 | } 493 | ] 494 | }, 495 | { 496 | "node_id": "d593101e-278b-4330-9779-b6e02fbeb99e", 497 | "type": "TRANSFORM", 498 | "operator": "sagemaker.spark.encode_categorical_0.1", 499 | "parameters": { 500 | "operator": "One-hot encode", 501 | "one_hot_encode_parameters": { 502 | "invalid_handling_strategy": "Keep", 503 | "drop_last": false, 504 | "output_style": "Columns", 505 | "input_column": [ 506 | "race", 507 | "gender", 508 | "age", 509 | "diag_1", 510 | "diag_2", 511 | "diag_3", 512 | "max_glu_serum", 513 | "A1Cresult", 514 | "metformin", 515 | "repaglinide", 516 | "pioglitazone", 517 | "rosiglitazone", 518 | "insulin", 519 | "change", 520 | "diabetesMed", 521 | "admission_type_id", 522 | "discharge_disposition_id", 523 | "admission_source_id" 524 | ] 525 | }, 526 | "ordinal_encode_parameters": { 527 | "invalid_handling_strategy": "Replace with NaN" 528 | } 529 | }, 530 | "inputs": [ 531 | { 532 | "name": "df", 533 | "node_id": "b31d277f-ecf4-48bc-bddd-fc19e8b30254", 534 | "output_name": "default" 535 | } 536 | ], 537 | "outputs": [ 538 | { 539 | "name": "default" 540 | } 541 | ] 542 | } 543 | ] 544 | } 545 | -------------------------------------------------------------------------------- /diabetes-project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 
5 | "id": "671a914c", 6 | "metadata": {}, 7 | "source": [ 8 | "# Multi-model SageMaker Pipeline with Hyperparamater Tuning and Experiments" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "d697da1a", 14 | "metadata": {}, 15 | "source": [ 16 | "This notebook has been tested in a SageMaker notebook that is using a kernel with Python 3.7 installed, e.g. conda_mxnet_latest_p37, conda_python3." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "bd2ffcca", 22 | "metadata": {}, 23 | "source": [ 24 | "## Prepare the dataset collection" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "c125794f", 30 | "metadata": {}, 31 | "source": [ 32 | "We create an S3 bucket and with encryption enabled for additional security. \n", 33 | "\n", 34 | "#### If you are running this Notebook in us-east-1 region, don't use 'CreateBucketConfiguration' parameter with create_bucket(). us-east-1 is the default location." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "6f7edb84", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import boto3\n", 45 | "\n", 46 | "AWS_ACCOUNT = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n", 47 | "AWS_REGION = boto3.Session().region_name" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "ef0780a3", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "import boto3\n", 58 | "\n", 59 | "AWS_ACCOUNT = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n", 60 | "AWS_REGION = boto3.Session().region_name\n", 61 | "PREFIX = \"sagemaker-diabetes\"\n", 62 | "BUCKET_NAME = \"{PREFIX}-{AWS_ACCOUNT}\".format(PREFIX=PREFIX,AWS_ACCOUNT=AWS_ACCOUNT)\n", 63 | "\n", 64 | "s3_client = boto3.client(\"s3\")\n", 65 | "location = {\"LocationConstraint\": AWS_REGION}\n", 66 | "\n", 67 | "# default location is us-east-1, so CreateBucketConfiguration is not needed\n", 68 | "s3_client.create_bucket(Bucket=BUCKET_NAME)\n", 69 | "\n", 70 | "# use this create_bucket statement for any AWS region other than us-east-1\n", 71 | "#s3_client.create_bucket(Bucket=BUCKET_NAME, CreateBucketConfiguration=location) \n", 72 | "\n", 73 | "s3_client.put_bucket_encryption(\n", 74 | " Bucket=BUCKET_NAME,\n", 75 | " ServerSideEncryptionConfiguration={\n", 76 | " \"Rules\": [\n", 77 | " {\n", 78 | " \"ApplyServerSideEncryptionByDefault\": {\"SSEAlgorithm\": \"AES256\"},\n", 79 | " },\n", 80 | " ]\n", 81 | " },\n", 82 | ")" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "a28a1f0e", 88 | "metadata": {}, 89 | "source": [ 90 | "Download UCI dataset and copy to S3 bucket. " 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "57356c93", 97 | "metadata": { 98 | "scrolled": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "%%sh\n", 103 | "\n", 104 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 105 | "BUCKET_NAME=\"sagemaker-diabetes-${AWS_ACCOUNT}\"\n", 106 | "\n", 107 | "wget https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip\n", 108 | "unzip dataset_diabetes.zip\n", 109 | "aws s3 cp dataset_diabetes/diabetic_data.csv s3://${BUCKET_NAME}/\n", 110 | " " 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "61768499", 116 | "metadata": {}, 117 | "source": [ 118 | "Update diabetes.flow to use your AWS account ID. 
" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "ee47dce2", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "%%sh\n", 129 | "\n", 130 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 131 | "sed -i \"s/AWS_ACCOUNT/${AWS_ACCOUNT}/g\" sagemaker-pipeline/diabetes.flow" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "50020dd6", 137 | "metadata": {}, 138 | "source": [ 139 | "Next, Create IAM Role for ML workflow steps" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "c1ff2f77", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "import json\n", 150 | "\n", 151 | "iam_client = boto3.client(\"iam\")\n", 152 | "\n", 153 | "sagemaker_assume_role_policy_document = json.dumps(\n", 154 | " {\n", 155 | " \"Version\": \"2012-10-17\",\n", 156 | " \"Statement\": [\n", 157 | " {\n", 158 | " \"Effect\": \"Allow\",\n", 159 | " \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"},\n", 160 | " \"Action\": \"sts:AssumeRole\",\n", 161 | " }\n", 162 | " ],\n", 163 | " }\n", 164 | ")\n", 165 | "\n", 166 | "response_role = iam_client.create_role(\n", 167 | " RoleName=\"AmazonSageMakerServiceCatalogProductsUseRole-diabetes\",\n", 168 | " AssumeRolePolicyDocument=sagemaker_assume_role_policy_document,\n", 169 | ")\n", 170 | "\n", 171 | "\n", 172 | "iam_client.attach_role_policy(\n", 173 | " RoleName=response_role[\"Role\"][\"RoleName\"],\n", 174 | " PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFullAccess'\n", 175 | ")\n" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "6f41e95e", 181 | "metadata": {}, 182 | "source": [ 183 | "## Prepare the Decision Tree custom Docker image" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "6c642455", 189 | "metadata": {}, 190 | "source": [ 191 | "We make a Docker image containing a custom algorithm using [Scikit-learn Decision Tree Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html). Note that the Docker image has been modified to support hyperparameter tuning and validation data. \n", 192 | "\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "id": "70f0000a", 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "! sudo yum install docker -y" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "id": "3aec9c87", 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "%%sh\n", 213 | "\n", 214 | "# The name of our algorithm\n", 215 | "ALGORITHM_NAME=\"diabetes-decision-trees\"\n", 216 | "\n", 217 | "cd container\n", 218 | "\n", 219 | "chmod +x decision_trees/train\n", 220 | "chmod +x decision_trees/serve\n", 221 | "\n", 222 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 223 | "AWS_REGION=$(aws configure get region)\n", 224 | "\n", 225 | "IMAGE_FULLNAME=\"${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ALGORITHM_NAME}:latest\"\n", 226 | "\n", 227 | "# If the repository doesn't exist in ECR, create it.\n", 228 | "aws ecr describe-repositories --repository-names \"${ALGORITHM_NAME}\" > /dev/null 2>&1\n", 229 | "\n", 230 | "if [ $? 
-ne 0 ]\n", 231 | "then\n", 232 | " aws ecr create-repository --repository-name \"${ALGORITHM_NAME}\" > /dev/null\n", 233 | "fi\n", 234 | "\n", 235 | "# Get the login command from ECR and execute it directly\n", 236 | "aws ecr get-login-password --region ${AWS_REGION}|docker login --username AWS --password-stdin ${IMAGE_FULLNAME}\n", 237 | "\n", 238 | "# Build the docker image locally with the image name and then push it to ECR with the full name.\n", 239 | "# Ensure your notebook IAM role has required permission for pushing image to ECR\n", 240 | "\n", 241 | "docker build -t ${ALGORITHM_NAME} .\n", 242 | "docker tag ${ALGORITHM_NAME} ${IMAGE_FULLNAME}\n", 243 | "docker push ${IMAGE_FULLNAME}\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "id": "11b15c71", 249 | "metadata": {}, 250 | "source": [ 251 | "Once Docker image is pushed to ECR repository, we make the image accessible from SageMaker. " 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "f3e03c17", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "%%sh\n", 262 | "\n", 263 | "# The name of our algorithm\n", 264 | "SM_IMAGE_NAME=diabetes-dtree\n", 265 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 266 | "\n", 267 | "# This assumes the role name is AmazonSageMakerServiceCatalogProductsUseRole-diabetes\n", 268 | "ROLE_ARN=\"arn:aws:iam::${AWS_ACCOUNT}:role/AmazonSageMakerServiceCatalogProductsUseRole-diabetes\"\n", 269 | "\n", 270 | "aws sagemaker create-image \\\n", 271 | " --image-name ${SM_IMAGE_NAME} \\\n", 272 | " --role-arn ${ROLE_ARN}\n", 273 | "\n", 274 | "aws sagemaker create-app-image-config \\\n", 275 | " --cli-input-json file://container/app-image-config-input.json\n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "id": "ad3a940d", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "%%sh\n", 286 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 287 | "ALGORITHM_NAME=diabetes-decision-trees\n", 288 | "AWS_REGION=$(aws configure get region)\n", 289 | "SM_IMAGE_NAME=diabetes-dtree\n", 290 | "SM_BASE_IMAGE=\"${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ALGORITHM_NAME}:latest\"\n", 291 | "\n", 292 | "aws sagemaker create-image-version \\\n", 293 | " --image-name ${SM_IMAGE_NAME} \\\n", 294 | " --base-image ${SM_BASE_IMAGE}" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "id": "9eb7dd2b", 300 | "metadata": {}, 301 | "source": [ 302 | "## Define and start the SageMaker pipeline" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "id": "4e0f7a38", 308 | "metadata": {}, 309 | "source": [ 310 | "Install the necessary Python library `awswrangler` for the SageMaker pipeline. " 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "id": "21ed4a32", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "! pip3 install awswrangler" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "id": "10abe34d", 326 | "metadata": {}, 327 | "source": [ 328 | "Import the necessary Python modules for the SageMaker pipeline. 
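One note on SDK versions: in newer releases of the SageMaker Python SDK, `JsonGet` lives in `sagemaker.workflow.functions` rather than `sagemaker.workflow.condition_step`; the commented-out import in the next cell reflects that alternative, so switch to it if the `condition_step` import fails in your environment.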
" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "id": "faaac6eb", 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "import os\n", 339 | "import time\n", 340 | "import uuid\n", 341 | "import json\n", 342 | "import boto3\n", 343 | "import sagemaker\n", 344 | "import sagemaker.session\n", 345 | "from sagemaker.estimator import Estimator\n", 346 | "from sagemaker.inputs import TrainingInput\n", 347 | "from sagemaker.model_metrics import MetricsSource, ModelMetrics\n", 348 | "from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor\n", 349 | "from sagemaker.sklearn.processing import SKLearnProcessor\n", 350 | "from sagemaker.workflow.condition_step import ConditionStep, JsonGet\n", 351 | "#from sagemaker.workflow.functions import JsonGet\n", 352 | "from sagemaker.workflow.pipeline_context import PipelineSession\n", 353 | "from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo\n", 354 | "from sagemaker.workflow.parameters import ParameterInteger, ParameterString\n", 355 | "from sagemaker.workflow.pipeline import Pipeline\n", 356 | "from sagemaker.workflow.properties import PropertyFile\n", 357 | "from sagemaker.workflow.step_collections import RegisterModel\n", 358 | "from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CacheConfig, TuningStep\n", 359 | "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", 360 | "from sagemaker.processing import Processor\n", 361 | "from sagemaker.network import NetworkConfig\n", 362 | "from sagemaker.tuner import (\n", 363 | " ContinuousParameter,\n", 364 | " IntegerParameter,\n", 365 | " CategoricalParameter,\n", 366 | " HyperparameterTuner,\n", 367 | " WarmStartConfig,\n", 368 | " WarmStartTypes,\n", 369 | ")" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "id": "a0667f2e", 375 | "metadata": {}, 376 | "source": [ 377 | "Create boto3 session and define pipeline step instance count and other configuration. " 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "id": "6a425a1e", 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "region = AWS_REGION\n", 388 | "default_bucket = BUCKET_NAME\n", 389 | "\n", 390 | "pipeline_session = PipelineSession()\n", 391 | "role = sagemaker.session.get_execution_role(pipeline_session)\n", 392 | "\n", 393 | "datawrangler_instance_count = 2\n", 394 | "datawrangler_instance_type = \"ml.m5.4xlarge\" \n", 395 | "processing_instance_count = 1\n", 396 | "processing_instance_type = \"ml.m5.2xlarge\"\n", 397 | "training_instance_count = 1\n", 398 | "training_instance_type = \"ml.m5.2xlarge\"\n", 399 | "model_approval_status = ParameterString(\n", 400 | " name=\"ModelApprovalStatus\",\n", 401 | " default_value=\"PendingManualApproval\", # ModelApprovalStatus can be set to a default of \"Approved\" if you don't want manual approval.\n", 402 | ")\n", 403 | "input_data = ParameterString(\n", 404 | " name=\"InputDataUrl\",\n", 405 | " default_value=f\"\", # Change this to point to the s3 location of your raw input data.\n", 406 | ")\n" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "id": "db038a2b", 412 | "metadata": {}, 413 | "source": [ 414 | "Define and create the `DataWranglerProcess` step in the SageMaker pipeline. This step uses an existing Data Wrangler flow file `diabetes.flow` that has the following transformations:\n", 415 | "* Move column readmitted to the beginning. 
This column is to be predicted in the classification problem. \n", 416 | "* Convert readmitted column value to 0 if it is NO and 1 if it is <30 or >30.\n", 417 | "* Drop the columns that have minimal to zero prediction power based on Data Wrangler Data Quality and Insights Report, e.g. payer_code and encounter_id. \n", 418 | "* Group values into finite categories using Python custom transform in the following columns: diag_1, diag_2, diag_3, admission_type_id, admission_source_id, and discharge_disposition_id. \n", 419 | "* Fill missing values in columns diag_1, diag_2, diag_3 and replace strings in column race.\n", 420 | "* Drop duplicates, balance data using SMOTE, and one-hot encode the following columns: race, gender, age, diag_1, diag_2, diag_3, max_glu_serum, A1Cresult, metformin, repaglinide, pioglitazone, rosiglitazone, insulin, change, diabetesMed, admission_type_id, discharge_disposition_id, admission_source_id. \n", 421 | "\n", 422 | "To use your own transformations, replace `output_name` and `flow_file_name`." 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "fb83bcb5", 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "print(f\"Data Wrangler export storage bucket: {default_bucket}\")\n", 433 | "\n", 434 | "# unique flow export ID\n", 435 | "flow_export_id = f\"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}\"\n", 436 | "flow_export_name = f\"flow-{flow_export_id}\"\n", 437 | "\n", 438 | "output_name = \"d593101e-278b-4330-9779-b6e02fbeb99e.default\"\n", 439 | "\n", 440 | "s3_output_prefix = f\"export-{flow_export_name}/output\"\n", 441 | "s3_output_path = f\"s3://{default_bucket}/{s3_output_prefix}\"\n", 442 | "print(f\"Flow S3 export result path: {s3_output_path}\")\n", 443 | "\n", 444 | "processing_job_output = ProcessingOutput(\n", 445 | " output_name=output_name,\n", 446 | " source=\"/opt/ml/processing/output\",\n", 447 | " destination=s3_output_path,\n", 448 | " s3_upload_mode=\"EndOfJob\",\n", 449 | ")\n", 450 | "\n", 451 | "# name of the flow file which should exist in the current notebook working directory\n", 452 | "flow_file_name = \"sagemaker-pipeline/diabetes.flow\"\n", 453 | "\n", 454 | "# Load .flow file from current notebook working directory\n", 455 | "#!echo \"Loading flow file from current notebook working directory: $PWD\"\n", 456 | "\n", 457 | "with open(flow_file_name) as f:\n", 458 | " flow = json.load(f)\n", 459 | "\n", 460 | "# Upload flow to S3\n", 461 | "s3_client = boto3.client(\"s3\")\n", 462 | "s3_client.upload_file(\n", 463 | " flow_file_name,\n", 464 | " default_bucket,\n", 465 | " f\"data_wrangler_flows/{flow_export_name}.flow\",\n", 466 | " ExtraArgs={\"ServerSideEncryption\": \"aws:kms\"},\n", 467 | ")\n", 468 | "\n", 469 | "flow_s3_uri = f\"s3://{default_bucket}/data_wrangler_flows/{flow_export_name}.flow\"\n", 470 | "\n", 471 | "print(f\"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}\")\n", 472 | "\n", 473 | "flow_input = ProcessingInput(\n", 474 | " source=flow_s3_uri,\n", 475 | " destination=\"/opt/ml/processing/flow\",\n", 476 | " input_name=\"flow\",\n", 477 | " s3_data_type=\"S3Prefix\",\n", 478 | " s3_input_mode=\"File\",\n", 479 | " s3_data_distribution_type=\"FullyReplicated\",\n", 480 | ")\n", 481 | "\n", 482 | "# IAM role for executing the processing job.\n", 483 | "iam_role = role\n", 484 | "\n", 485 | "# Unique processing job name. 
Give a unique name every time you re-execute processing jobs\n", 486 | "processing_job_name = f\"data-wrangler-flow-processing-{flow_export_id}\"\n", 487 | "\n", 488 | "# Size in GB of the EBS volume to use for storing data during processing\n", 489 | "volume_size_in_gb = 30\n", 490 | "\n", 491 | "# Content type for each output. Data Wrangler supports CSV as default and Parquet.\n", 492 | "output_content_type = \"CSV\"\n", 493 | "\n", 494 | "# Network Isolation mode; default is off\n", 495 | "enable_network_isolation = False\n", 496 | "\n", 497 | "# List of tags to be passed to the processing job\n", 498 | "user_tags = []\n", 499 | "\n", 500 | "# Output configuration used as processing job container arguments\n", 501 | "output_config = {output_name: {\"content_type\": output_content_type}}\n", 502 | "\n", 503 | "# KMS key for per object encryption; default is None\n", 504 | "kms_key = None\n", 505 | "\n", 506 | "# Data Wrangler Container URL.\n", 507 | "container_uri = sagemaker.image_uris.retrieve(\n", 508 | " framework=\"data-wrangler\", \n", 509 | " region=region, \n", 510 | " version=\"1.x\",\n", 511 | ")\n", 512 | "\n", 513 | "processor = Processor(\n", 514 | " role=iam_role,\n", 515 | " image_uri=container_uri,\n", 516 | " instance_count=datawrangler_instance_count,\n", 517 | " instance_type=datawrangler_instance_type,\n", 518 | " volume_size_in_gb=volume_size_in_gb,\n", 519 | " network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),\n", 520 | " sagemaker_session=pipeline_session,\n", 521 | " output_kms_key=kms_key,\n", 522 | " tags=user_tags,\n", 523 | ")\n", 524 | "\n", 525 | "data_wrangler_step = ProcessingStep(\n", 526 | " name=\"DataWranglerProcess\",\n", 527 | " processor=processor,\n", 528 | " inputs=[flow_input],\n", 529 | " outputs=[processing_job_output],\n", 530 | " job_arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n", 531 | ")\n" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "id": "db4599ea", 537 | "metadata": {}, 538 | "source": [ 539 | "Define and create the `Preprocess` step in the SageMaker pipeline. This step reads the transformed data from the DataWranglerProcess, randomizes, and splits the data into train (70%), validation (10%), and test data (20%). \n", 540 | "\n", 541 | "You can also put here other necessary transformations and pre-processing changes that are done outside of Data Wrangler. 
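\n",
"\n",
"For illustration, the 70/10/20 split inside `preprocess.py` might look like the following sketch (the actual script may differ; `transformed.csv` is a placeholder for the Data Wrangler output):\n",
"\n",
"```python\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# Illustrative only: shuffle, then cut at 70% and 80% to get a 70/10/20 split.\n",
"df = pd.read_csv(\"transformed.csv\")\n",
"train, validation, test = np.split(\n",
"    df.sample(frac=1.0, random_state=42),\n",
"    [int(0.7 * len(df)), int(0.8 * len(df))],\n",
")\n",
"```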
" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "id": "bfb50e53", 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "base_job_prefix = \"sagemaker-diabetes\"\n", 552 | "\n", 553 | "sklearn_processor = SKLearnProcessor(\n", 554 | " framework_version=\"0.23-1\",\n", 555 | " instance_type=processing_instance_type,\n", 556 | " instance_count=processing_instance_count,\n", 557 | " base_job_name=f\"{base_job_prefix}/sklearn-diabetes-preprocess\", # choose any name\n", 558 | " sagemaker_session=pipeline_session,\n", 559 | " role=role\n", 560 | ")\n", 561 | "\n", 562 | "step_process = ProcessingStep(\n", 563 | " name=\"Preprocess\", # choose any name\n", 564 | " processor=sklearn_processor,\n", 565 | " inputs=[\n", 566 | " ProcessingInput(\n", 567 | " source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[\n", 568 | " output_name\n", 569 | " ].S3Output.S3Uri,\n", 570 | " destination=\"/opt/ml/processing/data/raw-data-dir\",\n", 571 | " )\n", 572 | " ],\n", 573 | " outputs=[\n", 574 | " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n", 575 | " ProcessingOutput(output_name=\"validation\", source=\"/opt/ml/processing/validation\"),\n", 576 | " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\"),\n", 577 | " ],\n", 578 | " code=\"sagemaker-pipeline/pipelines/diabetes/preprocess.py\",\n", 579 | " job_arguments=[\n", 580 | " \"--input-data\",\n", 581 | " data_wrangler_step.properties.ProcessingOutputConfig.Outputs[\n", 582 | " output_name\n", 583 | " ].S3Output.S3Uri,\n", 584 | " ],\n", 585 | ")\n" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "id": "ade846a1", 591 | "metadata": {}, 592 | "source": [ 593 | "Define and create the `XGBHPTune` step in the SageMaker pipeline. This is a hyperparameter tuning job using SageMaker XGBoost algorithm." 
594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "id": "f1815db4", 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "# Training step for generating model artifacts\n", 604 | "model_path = f\"s3://{pipeline_session.default_bucket()}/{base_job_prefix}/diabetesTrain\"\n", 605 | "model_bucket_key = f\"{pipeline_session.default_bucket()}/{base_job_prefix}/diabetesTrain\"\n", 606 | "cache_config = CacheConfig(enable_caching=True, expire_after=\"30d\")\n", 607 | "\n", 608 | "xgb_image_uri = sagemaker.image_uris.retrieve(\n", 609 | " framework=\"xgboost\", # we are using the Sagemaker built in xgboost algorithm\n", 610 | " region=region,\n", 611 | " version=\"1.0-1\",\n", 612 | " py_version=\"py3\",\n", 613 | " instance_type=training_instance_type,\n", 614 | " image_scope=\"training\"\n", 615 | ")\n", 616 | "xgb_train = Estimator(\n", 617 | " image_uri=xgb_image_uri,\n", 618 | " instance_type=training_instance_type,\n", 619 | " instance_count=training_instance_count,\n", 620 | " output_path=model_path,\n", 621 | " base_job_name=f\"{base_job_prefix}/diabetes-xgb-train\",\n", 622 | " sagemaker_session=pipeline_session,\n", 623 | " role=role,\n", 624 | ")\n", 625 | "xgb_train.set_hyperparameters(\n", 626 | " num_round=50,\n", 627 | " objective=\"binary:logistic\", # we are using binary:logistic as the objective function for classification \n", 628 | ")\n", 629 | "\n", 630 | "xgb_train.set_hyperparameters(grow_policy=\"lossguide\")\n", 631 | "\n", 632 | "xgb_objective_metric_name = \"validation:auc\" # we are using AUC as a performance metric \n", 633 | "xgb_hyperparameter_ranges = {\n", 634 | " \"max_depth\": IntegerParameter(5, 10, scaling_type=\"Auto\"),\n", 635 | " \"min_child_weight\": IntegerParameter(5, 10, scaling_type=\"Auto\"),\n", 636 | " \"eta\": ContinuousParameter(0.1, 0.9, scaling_type=\"Auto\"),\n", 637 | " \"gamma\": IntegerParameter(4, 9, scaling_type=\"Auto\"),\n", 638 | " \"subsample\": ContinuousParameter(0.7, 0.9, scaling_type=\"Auto\"),\n", 639 | "}\n", 640 | "\n", 641 | "xgb_tuner_log = HyperparameterTuner(\n", 642 | " xgb_train,\n", 643 | " xgb_objective_metric_name,\n", 644 | " xgb_hyperparameter_ranges,\n", 645 | " max_jobs=5,\n", 646 | " max_parallel_jobs=5,\n", 647 | " strategy=\"Random\",\n", 648 | " objective_type=\"Maximize\",\n", 649 | ")\n", 650 | "\n", 651 | "xgb_step_tuning = TuningStep(\n", 652 | " name=\"XGBHPTune\",\n", 653 | " tuner=xgb_tuner_log,\n", 654 | " inputs={\n", 655 | " \"train\": TrainingInput(\n", 656 | " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", 657 | " \"train\"\n", 658 | " ].S3Output.S3Uri,\n", 659 | " content_type=\"text/csv\",\n", 660 | " ),\n", 661 | " \"validation\": TrainingInput(\n", 662 | " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", 663 | " \"validation\"\n", 664 | " ].S3Output.S3Uri,\n", 665 | " content_type=\"text/csv\",\n", 666 | " ),\n", 667 | " },\n", 668 | " cache_config=cache_config,\n", 669 | ")\n" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "id": "d480e28f", 675 | "metadata": {}, 676 | "source": [ 677 | "Define and create the `DTreeHPTune` step in the SageMaker pipeline. This is a hyperparameter tuning job using Scikit-learn Decision Tree algorithm. Note that this is in a custom Docker image pushed to the repository in section ` Prepare the Decision Tree custom Docker image`. 
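\n",
"\n",
"Because this is a custom container, SageMaker scrapes the objective metric from the training logs: the metric definition below matches the regex `auc:(\\S+)`, so the container's train script is expected to print a line in that form. An illustrative sketch (the real logic lives in `container/decision_trees/train`):\n",
"\n",
"```python\n",
"# After scoring the model on the validation channel:\n",
"validation_auc = 0.81  # placeholder for the computed AUC-ROC\n",
"print(f\"auc:{validation_auc}\")  # parsed by the tuner's metric regex\n",
"```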
" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "id": "a0abbb71", 684 | "metadata": {}, 685 | "outputs": [], 686 | "source": [ 687 | "dtree_image_uri = pipeline_session.sagemaker_client.describe_image_version(ImageName=\"diabetes-dtree\")[\"ContainerImage\"]\n", 688 | "\n", 689 | "dtree_train = Estimator(\n", 690 | " image_uri=dtree_image_uri,\n", 691 | " role=role,\n", 692 | " instance_count=1,\n", 693 | " instance_type=training_instance_type,\n", 694 | " base_job_name=f\"{base_job_prefix}/diabetes-dtree-train\",\n", 695 | " output_path=model_path,\n", 696 | " sagemaker_session=pipeline_session,\n", 697 | ")\n", 698 | "\n", 699 | "dtree_objective_metric_name = \"validation:auc\"\n", 700 | "dtree_metric_definitions = [{\"Name\": \"validation:auc\", \"Regex\": \"auc:(\\S+)\"}]\n", 701 | "\n", 702 | "dtree_hyperparameter_ranges = {\n", 703 | " \"max_depth\": IntegerParameter(5, 10, scaling_type=\"Linear\"),\n", 704 | " \"max_leaf_nodes\": IntegerParameter(2, 10, scaling_type=\"Linear\"),\n", 705 | "}\n", 706 | "\n", 707 | "dtree_tuner_log = HyperparameterTuner(\n", 708 | " dtree_train,\n", 709 | " dtree_objective_metric_name,\n", 710 | " dtree_hyperparameter_ranges,\n", 711 | " dtree_metric_definitions,\n", 712 | " max_jobs=5,\n", 713 | " max_parallel_jobs=5,\n", 714 | " strategy=\"Random\",\n", 715 | " objective_type=\"Maximize\",\n", 716 | ")\n", 717 | "\n", 718 | "dtree_step_tuning = TuningStep(\n", 719 | " name=\"DTreeHPTune\",\n", 720 | " tuner=dtree_tuner_log,\n", 721 | " inputs={\n", 722 | " \"training\": TrainingInput(\n", 723 | " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", 724 | " \"train\"\n", 725 | " ].S3Output.S3Uri,\n", 726 | " content_type=\"text/csv\",\n", 727 | " ),\n", 728 | " \"validation\": TrainingInput(\n", 729 | " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", 730 | " \"validation\"\n", 731 | " ].S3Output.S3Uri,\n", 732 | " content_type=\"text/csv\",\n", 733 | " ),\n", 734 | " },\n", 735 | " cache_config=cache_config,\n", 736 | ")\n" 737 | ] 738 | }, 739 | { 740 | "cell_type": "markdown", 741 | "id": "581bd6de", 742 | "metadata": {}, 743 | "source": [ 744 | "Define and create the `DtreeEval` step in the SageMaker pipeline. This uses `dtree_evaluate.py` to evaluate the performance of the generated model from `DTreeHPTune` step using test data. 
" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "id": "3d08243a", 751 | "metadata": {}, 752 | "outputs": [], 753 | "source": [ 754 | "dtree_script_eval = ScriptProcessor(\n", 755 | " image_uri=dtree_image_uri,\n", 756 | " command=[\"python3\"],\n", 757 | " instance_type=processing_instance_type,\n", 758 | " instance_count=1,\n", 759 | " base_job_name=f\"{base_job_prefix}/script-dtree-eval\",\n", 760 | " sagemaker_session=pipeline_session,\n", 761 | " role=role,\n", 762 | ")\n", 763 | "\n", 764 | "dtree_evaluation_report = PropertyFile(\n", 765 | " name=\"EvaluationReportDTree\",\n", 766 | " output_name=\"dtree_evaluation\",\n", 767 | " path=\"dtree_evaluation.json\",\n", 768 | ")\n", 769 | "\n", 770 | "dtree_step_eval = ProcessingStep(\n", 771 | " name=\"DTreeEval\",\n", 772 | " processor=dtree_script_eval,\n", 773 | " inputs=[\n", 774 | " ProcessingInput(\n", 775 | " source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),\n", 776 | " destination=\"/opt/ml/processing/model\",\n", 777 | " ),\n", 778 | " ProcessingInput(\n", 779 | " source=step_process.properties.ProcessingOutputConfig.Outputs[\n", 780 | " \"test\"\n", 781 | " ].S3Output.S3Uri,\n", 782 | " destination=\"/opt/ml/processing/test\",\n", 783 | " ),\n", 784 | " ],\n", 785 | " outputs=[\n", 786 | " ProcessingOutput(\n", 787 | " output_name=\"dtree_evaluation\", source=\"/opt/ml/processing/evaluation\"\n", 788 | " ),\n", 789 | " ],\n", 790 | " code=\"sagemaker-pipeline/pipelines/diabetes/dtree_evaluate.py\",\n", 791 | " property_files=[dtree_evaluation_report],\n", 792 | ")\n" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "id": "85b7e5e0", 798 | "metadata": {}, 799 | "source": [ 800 | "Define and create the `XGBEval` step in the SageMaker pipeline. This uses `xgb_evaluate.py` to evaluate the performance of the generated model from `XGBHPTune` step using test data. 
" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "id": "6271fd78", 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [ 810 | "xgb_script_eval = ScriptProcessor(\n", 811 | " image_uri=xgb_image_uri,\n", 812 | " command=[\"python3\"],\n", 813 | " instance_type=processing_instance_type,\n", 814 | " instance_count=1,\n", 815 | " base_job_name=f\"{base_job_prefix}/script-xgb-eval\",\n", 816 | " sagemaker_session=pipeline_session,\n", 817 | " role=role,\n", 818 | ")\n", 819 | "\n", 820 | "xgb_evaluation_report = PropertyFile(\n", 821 | " name=\"EvaluationReportXGBoost\",\n", 822 | " output_name=\"xgb_evaluation\",\n", 823 | " path=\"xgb_evaluation.json\",\n", 824 | ")\n", 825 | "\n", 826 | "xgb_step_eval = ProcessingStep(\n", 827 | " name=\"XGBEval\",\n", 828 | " processor=xgb_script_eval,\n", 829 | " inputs=[\n", 830 | " ProcessingInput(\n", 831 | " source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),\n", 832 | " destination=\"/opt/ml/processing/model\",\n", 833 | " ),\n", 834 | " ProcessingInput(\n", 835 | " source=step_process.properties.ProcessingOutputConfig.Outputs[\n", 836 | " \"test\"\n", 837 | " ].S3Output.S3Uri,\n", 838 | " destination=\"/opt/ml/processing/test\",\n", 839 | " ),\n", 840 | " ],\n", 841 | " outputs=[\n", 842 | " ProcessingOutput(output_name=\"xgb_evaluation\", source=\"/opt/ml/processing/evaluation\"),\n", 843 | " ],\n", 844 | " code=\"sagemaker-pipeline/pipelines/diabetes/xgb_evaluate.py\",\n", 845 | " property_files=[xgb_evaluation_report],\n", 846 | ")\n" 847 | ] 848 | }, 849 | { 850 | "cell_type": "markdown", 851 | "id": "694e6b79", 852 | "metadata": {}, 853 | "source": [ 854 | "Retrieve the resulting AUC-ROC score from steps `DTreeEval` and `XGBEval` in the SageMaker pipeline." 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "id": "6f7decd9", 861 | "metadata": {}, 862 | "outputs": [], 863 | "source": [ 864 | "xgb_model_metrics = ModelMetrics(\n", 865 | " model_statistics=MetricsSource(\n", 866 | " s3_uri=\"{}/xgb_evaluation.json\".format(\n", 867 | " xgb_step_eval.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\"S3Uri\"]\n", 868 | " ),\n", 869 | " content_type=\"application/json\",\n", 870 | " )\n", 871 | ")\n", 872 | "\n", 873 | "dtree_model_metrics = ModelMetrics(\n", 874 | " model_statistics=MetricsSource(\n", 875 | " s3_uri=\"{}/dtree_evaluation.json\".format(\n", 876 | " dtree_step_eval.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\n", 877 | " \"S3Uri\"\n", 878 | " ]\n", 879 | " ),\n", 880 | " content_type=\"application/json\",\n", 881 | " )\n", 882 | ")\n", 883 | "\n", 884 | "xgb_eval_metrics = JsonGet(\n", 885 | " #step_name=xgb_step_eval,\n", 886 | " step=xgb_step_eval,\n", 887 | " property_file=xgb_evaluation_report,\n", 888 | " json_path=\"classification_metrics.roc.value\", # This should follow the structure of your report_dict defined in the evaluate.py file.\n", 889 | ")\n", 890 | "\n", 891 | "dtree_eval_metrics = JsonGet(\n", 892 | " #step_name=dtree_step_eval,\n", 893 | " step=dtree_step_eval,\n", 894 | " property_file=dtree_evaluation_report,\n", 895 | " json_path=\"classification_metrics.roc.value\", # This should follow the structure of your report_dict defined in the evaluate.py file.\n", 896 | ")\n" 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "id": "d187a447", 902 | "metadata": {}, 903 | "source": [ 904 | "Define and create ` AccuracyCond`. 
Then create the `DTreeReg-RegisterModel` and `XGBReg-RegisterModel` steps. If the AUC-ROC score of the Scikit-learn Decision Tree model is greater than or equal to that of the SageMaker XGBoost model, the Decision Tree model is registered in the model registry; otherwise, the XGBoost model is registered. " 905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": null, 910 | "id": "0e4cd8c1", 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "model_package_group_name = \"sagemaker-diabetes\"\n", 915 | "# Register model step that will be conditionally executed\n", 916 | "dtree_step_register = RegisterModel(\n", 917 | " name=\"DTreeReg\",\n", 918 | " estimator=dtree_train,\n", 919 | " model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),\n", 920 | " content_types=[\"text/csv\"],\n", 921 | " response_types=[\"text/csv\"],\n", 922 | " inference_instances=[\"ml.t2.medium\", \"ml.m5.large\"],\n", 923 | " transform_instances=[\"ml.m5.large\"],\n", 924 | " model_package_group_name=model_package_group_name,\n", 925 | " approval_status=model_approval_status,\n", 926 | " model_metrics=dtree_model_metrics,\n", 927 | ")\n", 928 | "\n", 929 | "# Register model step that will be conditionally executed\n", 930 | "xgb_step_register = RegisterModel(\n", 931 | " name=\"XGBReg\",\n", 932 | " estimator=xgb_train,\n", 933 | " model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),\n", 934 | " content_types=[\"text/csv\"],\n", 935 | " response_types=[\"text/csv\"],\n", 936 | " inference_instances=[\"ml.t2.medium\", \"ml.m5.large\"],\n", 937 | " transform_instances=[\"ml.m5.large\"],\n", 938 | " model_package_group_name=model_package_group_name,\n", 939 | " approval_status=model_approval_status,\n", 940 | " model_metrics=xgb_model_metrics,\n", 941 | ")\n", 942 | "\n", 943 | "# Condition step for evaluating model quality and branching execution\n", 944 | "cond_gte = ConditionGreaterThanOrEqualTo( # You can change the condition here\n", 945 | " left=JsonGet(\n", 946 | " #step_name=dtree_step_eval,\n", 947 | " step=dtree_step_eval,\n", 948 | " property_file=dtree_evaluation_report,\n", 949 | " json_path=\"classification_metrics.roc.value\", # This should follow the structure of your report_dict defined in the evaluate.py file.\n", 950 | " ),\n", 951 | " right=JsonGet(\n", 952 | " #step_name=xgb_step_eval,\n", 953 | " step=xgb_step_eval,\n", 954 | " property_file=xgb_evaluation_report,\n", 955 | " json_path=\"classification_metrics.roc.value\"\n", 956 | " ), \n", 957 | ")\n", 958 | "\n", 959 | "step_cond = ConditionStep(\n", 960 | " name=\"AccuracyCond\",\n", 961 | " conditions=[cond_gte],\n", 962 | " if_steps=[dtree_step_register],\n", 963 | " else_steps=[xgb_step_register],\n", 964 | ")\n", 965 | "\n" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "id": "b9d1ace6", 971 | "metadata": {}, 972 | "source": [ 973 | "Define and start the SageMaker pipeline. You should be able to see the running SageMaker pipeline in SageMaker Studio. 
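\n",
"\n",
"You can also monitor the execution from the notebook after starting it in the cell below; a minimal sketch:\n",
"\n",
"```python\n",
"# Blocks until the execution finishes (long pipelines may need a larger\n",
"# delay/max_attempts), then lists the status of each step.\n",
"execution.wait()\n",
"print(execution.list_steps())\n",
"```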
" 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": null, 979 | "id": "5cbfb086", 980 | "metadata": {}, 981 | "outputs": [], 982 | "source": [ 983 | "pipeline_name = \"sagemaker-diabetes\"\n", 984 | "\n", 985 | "pipeline = Pipeline(\n", 986 | " name=pipeline_name,\n", 987 | " parameters=[\n", 988 | " datawrangler_instance_type,\n", 989 | " datawrangler_instance_count, \n", 990 | " processing_instance_type,\n", 991 | " processing_instance_count,\n", 992 | " training_instance_type,\n", 993 | " training_instance_count,\n", 994 | " model_approval_status,\n", 995 | " input_data\n", 996 | " ],\n", 997 | " steps=[\n", 998 | " data_wrangler_step,\n", 999 | " step_process,\n", 1000 | " dtree_step_tuning,\n", 1001 | " xgb_step_tuning,\n", 1002 | " dtree_step_eval,\n", 1003 | " xgb_step_eval,\n", 1004 | " step_cond,\n", 1005 | " ],\n", 1006 | " sagemaker_session=pipeline_session,\n", 1007 | ")\n", 1008 | "\n", 1009 | "\n", 1010 | "pipeline.upsert(role_arn=role)\n", 1011 | "execution = pipeline.start()" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "markdown", 1016 | "id": "858e43e8", 1017 | "metadata": {}, 1018 | "source": [ 1019 | "## Approve top performing model in SageMaker model registry" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "markdown", 1024 | "id": "1ff84ce4", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "After the SageMaker Pipeline is complete, new trained Model will be registered in Model Registry.\n", 1028 | "\n", 1029 | "1) Make sure to update your desired `MODEL_VERSION`. We assume we approve the model version 1. \n", 1030 | "\n", 1031 | "2) As EventBridge monitors Model Registry status changes, Model status change will trigger SageMaker Projects model-deploy pipeline." 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": null, 1037 | "id": "698e3477", 1038 | "metadata": {}, 1039 | "outputs": [], 1040 | "source": [ 1041 | "from sagemaker import get_execution_role, session\n", 1042 | "import boto3\n", 1043 | "\n", 1044 | "role = get_execution_role()\n", 1045 | "sm_client = boto3.client(\"sagemaker\")\n", 1046 | "\n", 1047 | "MODEL_VERSION = \"2\"\n", 1048 | "AWS_REGION = boto3.Session().region_name\n", 1049 | "MODEL_PACKAGE_ARN = \"arn:aws:sagemaker:{AWS_REGION}:{AWS_ACCOUNT}:model-package/sagemaker-diabetes/{MODEL_VERSION}\".format(\n", 1050 | " AWS_REGION=AWS_REGION,\n", 1051 | " AWS_ACCOUNT=AWS_ACCOUNT, \n", 1052 | " MODEL_VERSION=MODEL_VERSION\n", 1053 | ")\n", 1054 | "\n", 1055 | "\n", 1056 | "model_package_update_response = sm_client.update_model_package(\n", 1057 | " ModelPackageArn=MODEL_PACKAGE_ARN, ModelApprovalStatus=\"Approved\"\n", 1058 | ")" 1059 | ] 1060 | }, 1061 | { 1062 | "cell_type": "markdown", 1063 | "id": "6ce11a41", 1064 | "metadata": {}, 1065 | "source": [ 1066 | "## Deploy the SageMaker inference endpoint" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "id": "e4c895fc", 1072 | "metadata": {}, 1073 | "source": [ 1074 | "Import model into hosting. Register the model with hosting. This allows the flexibility of importing models trained elsewhere." 
1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": null, 1080 | "id": "73d09101", 1081 | "metadata": {}, 1082 | "outputs": [], 1083 | "source": [ 1084 | "from time import gmtime, strftime\n", 1085 | "\n", 1086 | "model_name = \"diabetes-modelregistry-model-\" + strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n", 1087 | "print(\"Model name : {}\".format(model_name))\n", 1088 | "container_list = [{\"ModelPackageName\": MODEL_PACKAGE_ARN}]\n", 1089 | "\n", 1090 | "create_model_response = sm_client.create_model(\n", 1091 | " ModelName=model_name, ExecutionRoleArn=role, Containers=container_list\n", 1092 | ")\n", 1093 | "print(\"Model arn : {}\".format(create_model_response[\"ModelArn\"]))" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "markdown", 1098 | "id": "4a7623ed", 1099 | "metadata": {}, 1100 | "source": [ 1101 | "Create the endpoint configuration. SageMaker supports configuring REST endpoints that host multiple models, e.g. for A/B testing purposes. To support this, you create an endpoint configuration that describes the distribution of traffic across the models, whether split, shadowed, or sampled in some way. In addition, the endpoint configuration describes the instance type required for model deployment." 1102 | ] 1103 | }, 1104 | { 1105 | "cell_type": "code", 1106 | "execution_count": null, 1107 | "id": "40838588", 1108 | "metadata": {}, 1109 | "outputs": [], 1110 | "source": [ 1111 | "endpoint_config_name = \"diabetes-modelregistry-EndpointConfig-\" + strftime(\n", 1112 | " \"%Y-%m-%d-%H-%M-%S\", gmtime()\n", 1113 | ")\n", 1114 | "print(endpoint_config_name)\n", 1115 | "create_endpoint_config_response = sm_client.create_endpoint_config(\n", 1116 | " EndpointConfigName=endpoint_config_name,\n", 1117 | " ProductionVariants=[\n", 1118 | " {\n", 1119 | " \"InstanceType\": \"ml.m5.large\",\n", 1120 | " \"InitialVariantWeight\": 1,\n", 1121 | " \"InitialInstanceCount\": 1,\n", 1122 | " \"ModelName\": model_name,\n", 1123 | " \"VariantName\": \"AllTraffic\",\n", 1124 | " }\n", 1125 | " ],\n", 1126 | ")" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "markdown", 1131 | "id": "c0e6759f", 1132 | "metadata": {}, 1133 | "source": [ 1134 | "Create the endpoint. Lastly, create the endpoint that serves the model by specifying the name and the configuration defined above. The end result is an endpoint that can be validated and incorporated into your applications. This takes 9-11 minutes to complete."
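,
"\n",
"\n",
"The next cell polls the endpoint status in a loop; alternatively, you can use the built-in boto3 waiter, as in this minimal sketch (assuming the endpoint name `diabetes-staging` created below):\n",
"\n",
"```python\n",
"import boto3\n",
"\n",
"sm_client = boto3.client(\"sagemaker\")\n",
"\n",
"# Blocks until the endpoint reaches InService, or raises if creation fails.\n",
"waiter = sm_client.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=\"diabetes-staging\")\n",
"```"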
1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": null, 1140 | "id": "ef5578c8", 1141 | "metadata": {}, 1142 | "outputs": [], 1143 | "source": [ 1144 | "endpoint_name = \"diabetes-staging\"\n", 1145 | "print(\"EndpointName={}\".format(endpoint_name))\n", 1146 | "\n", 1147 | "create_endpoint_response = sm_client.create_endpoint(\n", 1148 | " EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name\n", 1149 | ")\n", 1150 | "\n", 1151 | "\n", 1152 | "while True:\n", 1153 | " endpoint = sm_client.describe_endpoint(EndpointName=endpoint_name)\n", 1154 | " if endpoint[\"EndpointStatus\"] == \"InService\":\n", 1155 | " break\n", 1156 | " print(\"Waiting for the endpoint to be completed..\")\n", 1157 | " time.sleep(60)\n", 1158 | "\n", 1159 | "print(\"Endpoint arn : {}\".format(create_endpoint_response[\"EndpointArn\"]))" 1160 | ] 1161 | }, 1162 | { 1163 | "cell_type": "markdown", 1164 | "id": "1cd1356a", 1165 | "metadata": {}, 1166 | "source": [ 1167 | "## Run predictions on model" 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "markdown", 1172 | "id": "087b25ad", 1173 | "metadata": {}, 1174 | "source": [ 1175 | "Wait until SageMaker Projects model-deploy pipeline has deployed the staging inference endpoint. Use the following data for inference:\n", 1176 | "\n", 1177 | "Example 1\n", 1178 | "------------\n", 1179 | "`5.0,64.0,0.0,18.0,0.0,0.0,7.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0\n", 1180 | "`\n", 1181 | "\n", 1182 | "In summary, this is a diabetic patient that is Caucasian Female age 60-70, who has spent 5 days in the hospital under emergency care in the current encounter. Prior to this encounter, patient has spent 0 days in outpatient care, 0 days in emergency care, 7 days in inpatient care. 64 laboratory procedures have been performed on the patient. 
Patient is not using metformin, repaglinide, pioglitazone, rosiglitazone, and insulin prescription is steady.\n", 1183 | "\n" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "code", 1188 | "execution_count": null, 1189 | "id": "fa19de14", 1190 | "metadata": {}, 1191 | "outputs": [], 1192 | "source": [ 1193 | "import json\n", 1194 | "import boto3\n", 1195 | "\n", 1196 | "sm_runtime = boto3.client(\"runtime.sagemaker\")\n", 1197 | "endpoint_name =\"diabetes-staging\"\n", 1198 | "line = \"5.0,64.0,0.0,18.0,0.0,0.0,7.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0\"\n", 1199 | "response = sm_runtime.invoke_endpoint(EndpointName=endpoint_name, ContentType=\"text/csv\", Body=line)\n", 1200 | "result = json.loads(response[\"Body\"].read().decode())\n", 1201 | "print(\"Predicted class : {}\".format(round(result)))" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "markdown", 1206 | "id": "3b16d0f3", 1207 | "metadata": {}, 1208 | "source": [ 1209 | "Now you try:\n", 1210 | "\n", 1211 | "Example 2\n", 1212 | "------------\n", 1213 | "\n", 1214 | "`3.0,19.0,3.0,19.0,0.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0\n", 1215 | "`\n", 1216 | "\n", 1217 | "In summary, this is a diabetic patient that is Caucasian Female age 70-80, who has spent 3 days in the hospital under elective care in the current encounter. Prior to this encounter, patient has spent 0 days in outpatient care, 0 days in emergency care, 0 days in inpatient care. 19 laboratory procedures have been performed on the patient. Patient is not using metformin, repaglinide, pioglitazone, rosiglitazone. Patient is not using insulin. " 1218 | ] 1219 | }, 1220 | { 1221 | "cell_type": "markdown", 1222 | "id": "8fcb475b", 1223 | "metadata": {}, 1224 | "source": [ 1225 | "## Cleanup" 1226 | ] 1227 | }, 1228 | { 1229 | "cell_type": "markdown", 1230 | "id": "9787688e", 1231 | "metadata": {}, 1232 | "source": [ 1233 | "To avoid incurring future charges, clean up created resources such as the S3 bucket, ECR repository, and SageMaker Studio. Prior to deleting the SageMaker Studio, make sure to delete the SageMaker model and endpoint resources. \n", 1234 | "Finally, delete the Jupyter instance containing the notebook. 
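\n",
"\n",
"A minimal deletion sketch for the inference resources, assuming the `endpoint_name`, `endpoint_config_name`, and `model_name` values created above are still in scope:\n",
"\n",
"```python\n",
"import boto3\n",
"\n",
"sm_client = boto3.client(\"sagemaker\")\n",
"\n",
"# Delete the endpoint first, then its configuration and the model.\n",
"sm_client.delete_endpoint(EndpointName=endpoint_name)\n",
"sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)\n",
"sm_client.delete_model(ModelName=model_name)\n",
"```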
" 1235 | ] 1236 | } 1237 | ], 1238 | "metadata": { 1239 | "instance_type": "ml.t3.medium", 1240 | "kernelspec": { 1241 | "display_name": "conda_python3", 1242 | "language": "python", 1243 | "name": "conda_python3" 1244 | }, 1245 | "language_info": { 1246 | "codemirror_mode": { 1247 | "name": "ipython", 1248 | "version": 3 1249 | }, 1250 | "file_extension": ".py", 1251 | "mimetype": "text/x-python", 1252 | "name": "python", 1253 | "nbconvert_exporter": "python", 1254 | "pygments_lexer": "ipython3", 1255 | "version": "3.8.12" 1256 | } 1257 | }, 1258 | "nbformat": 4, 1259 | "nbformat_minor": 5 1260 | } 1261 | --------------------------------------------------------------------------------