├── sagemaker-pipeline
│   ├── pipelines
│   │   ├── __init__.py
│   │   ├── diabetes
│   │   │   ├── __init__.py
│   │   │   ├── xgb_evaluate.py
│   │   │   ├── dtree_evaluate.py
│   │   │   ├── preprocess.py
│   │   │   └── pipeline.py
│   │   ├── __version__.py
│   │   ├── _utils.py
│   │   ├── get_pipeline_definition.py
│   │   └── run_pipeline.py
│   ├── setup.cfg
│   ├── codebuild-buildspec.yml
│   ├── setup.py
│   └── diabetes.flow
├── container
│   ├── local_test
│   │   ├── test_dir
│   │   │   └── input
│   │   │       └── config
│   │   │           ├── resourceConfig.json
│   │   │           └── hyperparameters.json
│   │   ├── serve_local.sh
│   │   ├── predict.sh
│   │   └── train_local.sh
│   ├── decision_trees
│   │   ├── wsgi.py
│   │   ├── nginx.conf
│   │   ├── serve
│   │   ├── predictor.py
│   │   └── train
│   ├── app-image-config-input.json
│   ├── build_and_push.sh
│   └── Dockerfile
├── diabetes-project-iam.json
├── README.md
├── diabetes-project-with-mlops.ipynb
└── diabetes-project.ipynb
-------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/diabetes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /container/local_test/test_dir/input/config/resourceConfig.json: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /container/local_test/test_dir/input/config/hyperparameters.json: -------------------------------------------------------------------------------- 1 | {"max_depth": "2", "max_leaf_nodes": "2"} 2 | -------------------------------------------------------------------------------- /container/local_test/serve_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | image=$1 4 | 5 | docker run -v $(pwd)/test_dir:/opt/ml -p 8080:8080 --rm ${image} serve 6 | -------------------------------------------------------------------------------- /container/local_test/predict.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | payload=$1 4 | content=${2:-text/csv} 5 | 6 | curl --data-binary @${payload} -H "Content-Type: ${content}" -v http://localhost:8080/invocations 7 | -------------------------------------------------------------------------------- /container/decision_trees/wsgi.py: -------------------------------------------------------------------------------- 1 | import predictor as myapp 2 | 3 | # This is just a simple wrapper for gunicorn to find your app. 4 | # If you want to change the algorithm file, simply change "predictor" above to the 5 | # new file.
6 | 7 | app = myapp.app 8 | -------------------------------------------------------------------------------- /container/local_test/train_local.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | image=$1 4 | 5 | mkdir -p test_dir/model 6 | mkdir -p test_dir/output 7 | 8 | rm test_dir/model/* 9 | rm test_dir/output/* 10 | 11 | docker run -v $(pwd)/test_dir:/opt/ml --rm ${image} train 12 | -------------------------------------------------------------------------------- /sagemaker-pipeline/setup.cfg: -------------------------------------------------------------------------------- 1 | [tool:pytest] 2 | addopts = 3 | -vv 4 | testpaths = tests 5 | 6 | [aliases] 7 | test=pytest 8 | 9 | [metadata] 10 | description-file = README.md 11 | license_file = LICENSE 12 | 13 | [wheel] 14 | universal = 1 15 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/__version__.py: -------------------------------------------------------------------------------- 1 | """Metadata for the pipelines package.""" 2 | 3 | __title__ = "pipelines" 4 | __description__ = "pipelines - template package" 5 | __version__ = "0.0.1" 6 | __author__ = "" 7 | __author_email__ = "" 8 | __license__ = "Apache 2.0" 9 | __url__ = "" 10 | -------------------------------------------------------------------------------- /container/app-image-config-input.json: -------------------------------------------------------------------------------- 1 | { 2 | "AppImageConfigName": "diabetes-dtree-config", 3 | "KernelGatewayImageConfig": { 4 | "KernelSpecs": [ 5 | { 6 | "Name": "python3", 7 | "DisplayName": "Python 3 (ipykernel)" 8 | } 9 | ], 10 | "FileSystemConfig": { 11 | "MountPath": "/home/sagemaker-user", 12 | "DefaultUid": 1000, 13 | "DefaultGid": 100 14 | } 15 | } 16 | } -------------------------------------------------------------------------------- /container/decision_trees/nginx.conf: -------------------------------------------------------------------------------- 1 | worker_processes 1; 2 | daemon off; # Prevent forking 3 | 4 | 5 | pid /tmp/nginx.pid; 6 | error_log /var/log/nginx/error.log; 7 | 8 | events { 9 | # defaults 10 | } 11 | 12 | http { 13 | include /etc/nginx/mime.types; 14 | default_type application/octet-stream; 15 | access_log /var/log/nginx/access.log combined; 16 | 17 | upstream gunicorn { 18 | server unix:/tmp/gunicorn.sock; 19 | } 20 | 21 | server { 22 | listen 8080 deferred; 23 | client_max_body_size 5m; 24 | 25 | keepalive_timeout 5; 26 | proxy_read_timeout 1200s; 27 | 28 | location ~ ^/(ping|invocations) { 29 | proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; 30 | proxy_set_header Host $http_host; 31 | proxy_redirect off; 32 | proxy_pass http://gunicorn; 33 | } 34 | 35 | location / { 36 | return 404 "{}"; 37 | } 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /sagemaker-pipeline/codebuild-buildspec.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | phases: 4 | install: 5 | runtime-versions: 6 | python: 3.8 7 | commands: 8 | - pip install --upgrade --force-reinstall . 
"awscli>1.20.30" 9 | 10 | build: 11 | commands: 12 | - export PYTHONUNBUFFERED=TRUE 13 | - export SAGEMAKER_PROJECT_NAME_ID="${SAGEMAKER_PROJECT_NAME}-${SAGEMAKER_PROJECT_ID}" 14 | - | 15 | run-pipeline --module-name pipelines.diabetes.pipeline \ 16 | --role-arn $SAGEMAKER_PIPELINE_ROLE_ARN \ 17 | --tags "[{\"Key\":\"sagemaker:project-name\", \"Value\":\"${SAGEMAKER_PROJECT_NAME}\"}, {\"Key\":\"sagemaker:project-id\", \"Value\":\"${SAGEMAKER_PROJECT_ID}\"}]" \ 18 | --kwargs "{\"region\":\"${AWS_REGION}\",\"sagemaker_project_arn\":\"${SAGEMAKER_PROJECT_ARN}\",\"role\":\"${SAGEMAKER_PIPELINE_ROLE_ARN}\",\"default_bucket\":\"${ARTIFACT_BUCKET}\",\"pipeline_name\":\"${SAGEMAKER_PROJECT_NAME_ID}\",\"model_package_group_name\":\"${SAGEMAKER_PROJECT_NAME_ID}\",\"base_job_prefix\":\"${SAGEMAKER_PROJECT_NAME_ID}\"}" 19 | - echo "Create/Update of the SageMaker Pipeline and execution completed." 20 | 21 | -------------------------------------------------------------------------------- /container/build_and_push.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # This script shows how to build the Docker image and push it to ECR to be ready for use 4 | # by SageMaker. 5 | 6 | # The argument to this script is the image name. This will be used as the image on the local 7 | # machine and combined with the account and region to form the repository name for ECR. 8 | image=$1 9 | 10 | if [ "$image" == "" ] 11 | then 12 | echo "Usage: $0 " 13 | exit 1 14 | fi 15 | 16 | chmod +x decision_trees/train 17 | chmod +x decision_trees/serve 18 | 19 | # Get the account number associated with the current IAM credentials 20 | account=$(aws sts get-caller-identity --query Account --output text) 21 | 22 | if [ $? -ne 0 ] 23 | then 24 | exit 255 25 | fi 26 | 27 | 28 | # Get the region defined in the current configuration (default to us-west-2 if none defined) 29 | region=$(aws configure get region) 30 | region=${region:-us-west-2} 31 | 32 | 33 | fullname="${account}.dkr.ecr.${region}.amazonaws.com/${image}:latest" 34 | 35 | # If the repository doesn't exist in ECR, create it. 36 | 37 | aws ecr describe-repositories --repository-names "${image}" > /dev/null 2>&1 38 | 39 | if [ $? -ne 0 ] 40 | then 41 | aws ecr create-repository --repository-name "${image}" > /dev/null 42 | fi 43 | 44 | # Get the login command from ECR and execute it directly 45 | aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${account}".dkr.ecr."${region}".amazonaws.com 46 | 47 | # Build the docker image locally with the image name and then push it to ECR 48 | # with the full name. 49 | 50 | docker build -t ${image} . 
51 | docker tag ${image} ${fullname} 52 | 53 | docker push ${fullname} 54 | -------------------------------------------------------------------------------- /sagemaker-pipeline/setup.py: -------------------------------------------------------------------------------- 1 | import os 2 | import setuptools 3 | 4 | 5 | about = {} 6 | here = os.path.abspath(os.path.dirname(__file__)) 7 | with open(os.path.join(here, "pipelines", "__version__.py")) as f: 8 | exec(f.read(), about) 9 | 10 | 11 | with open("README.md", "r") as f: 12 | readme = f.read() 13 | 14 | 15 | required_packages = ["sagemaker", "awswrangler", "sagemaker-experiments"] 16 | extras = { 17 | "test": [ 18 | "black", 19 | "coverage", 20 | "flake8", 21 | "mock", 22 | "pydocstyle", 23 | "pytest", 24 | "pytest-cov", 25 | "sagemaker", 26 | "tox", 27 | ] 28 | } 29 | setuptools.setup( 30 | name=about["__title__"], 31 | description=about["__description__"], 32 | version=about["__version__"], 33 | author=about["__author__"], 34 | author_email=about["__author_email__"], 35 | long_description=readme, 36 | long_description_content_type="text/markdown", 37 | url=about["__url__"], 38 | license=about["__license__"], 39 | packages=setuptools.find_packages(), 40 | include_package_data=True, 41 | python_requires=">=3.6", 42 | install_requires=required_packages, 43 | extras_require=extras, 44 | entry_points={ 45 | "console_scripts": [ 46 | "get-pipeline-definition=pipelines.get_pipeline_definition:main", 47 | "run-pipeline=pipelines.run_pipeline:main", 48 | ] 49 | }, 50 | classifiers=[ 51 | "Development Status :: 3 - Alpha", 52 | "Intended Audience :: Developers", 53 | "Natural Language :: English", 54 | "Programming Language :: Python", 55 | "Programming Language :: Python :: 3", 56 | "Programming Language :: Python :: 3.6", 57 | "Programming Language :: Python :: 3.7", 58 | "Programming Language :: Python :: 3.8", 59 | ], 60 | ) 61 | -------------------------------------------------------------------------------- /container/Dockerfile: -------------------------------------------------------------------------------- 1 | # Build an image that can do training and inference in SageMaker 2 | # This is a Python 3 image that uses the nginx, gunicorn, flask stack 3 | # for serving inferences in a stable way. 4 | 5 | FROM ubuntu:18.04 6 | 7 | ARG NB_USER="sagemaker-user" 8 | ARG NB_UID="1000" 9 | ARG NB_GID="100" 10 | 11 | RUN apt-get -y update && apt-get install -y --no-install-recommends \ 12 | wget \ 13 | python3-pip \ 14 | python3-setuptools \ 15 | nginx \ 16 | ca-certificates \ 17 | sudo \ 18 | && rm -rf /var/lib/apt/lists/* 19 | 20 | # Setup the "sagemaker-user" user with root privileges. 21 | RUN \ 22 | useradd -m -s /bin/bash -N -u $NB_UID $NB_USER && \ 23 | chmod g+w /etc/passwd && \ 24 | echo "${NB_USER} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers 25 | 26 | USER $NB_UID 27 | 28 | RUN \ 29 | sudo ln -s /usr/bin/python3 /usr/bin/python && \ 30 | sudo ln -s /usr/bin/pip3 /usr/bin/pip 31 | 32 | # Here we get all python packages. 33 | # There's substantial overlap between scipy and numpy that we eliminate by 34 | # linking them together. Likewise, pip leaves the install caches populated which uses 35 | # a significant amount of space. These optimizations save a fair amount of space in the 36 | # image, which reduces start up time. 37 | RUN sudo pip --no-cache-dir install numpy==1.16.2 scipy==1.2.1 scikit-learn==0.20.2 pandas flask gunicorn 38 | 39 | # Set some environment variables.
PYTHONUNBUFFERED keeps Python from buffering our standard 40 | # output stream, which means that logs can be delivered to the user quickly. PYTHONDONTWRITEBYTECODE 41 | # keeps Python from writing the .pyc files which are unnecessary in this case. We also update 42 | # PATH so that the train and serve programs are found when the container is invoked. 43 | 44 | ENV PYTHONUNBUFFERED=TRUE 45 | ENV PYTHONDONTWRITEBYTECODE=TRUE 46 | ENV PATH="/opt/program:${PATH}" 47 | 48 | # Set up the program in the image 49 | COPY decision_trees /opt/program 50 | WORKDIR /opt/program 51 | 52 | 53 | 54 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | """Provides utilities for SageMaker Pipeline CLI.""" 14 | from __future__ import absolute_import 15 | 16 | import ast 17 | 18 | 19 | def get_pipeline_driver(module_name, passed_args=None): 20 | """Gets the driver for generating your pipeline definition. 21 | 22 | Pipeline modules must define a get_pipeline() module-level method. 23 | 24 | Args: 25 | module_name: The module name of your pipeline. 26 | passed_args: Optional passed arguments that your pipeline may be templated by. 27 | 28 | Returns: 29 | The SageMaker Workflow pipeline. 30 | """ 31 | _imports = __import__(module_name, fromlist=["get_pipeline"]) 32 | kwargs = convert_struct(passed_args) 33 | return _imports.get_pipeline(**kwargs) 34 | 35 | 36 | def convert_struct(str_struct=None): 37 | return ast.literal_eval(str_struct) if str_struct else {} 38 | 39 | 40 | def get_pipeline_custom_tags(module_name, args, tags): 41 | """Gets the custom tags for pipeline 42 | 43 | Returns: 44 | Custom tags to be added to the pipeline 45 | """ 46 | try: 47 | _imports = __import__(module_name, fromlist=["get_pipeline_custom_tags"]) 48 | kwargs = convert_struct(args) 49 | return _imports.get_pipeline_custom_tags( 50 | tags, kwargs["region"], kwargs["sagemaker_project_arn"] 51 | ) 52 | except Exception as e: 53 | print(f"Error getting project tags: {e}") 54 | return tags 55 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/get_pipeline_definition.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. 
See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | """A CLI to get pipeline definitions from pipeline modules.""" 14 | from __future__ import absolute_import 15 | 16 | import argparse 17 | import sys 18 | 19 | from pipelines._utils import get_pipeline_driver 20 | 21 | 22 | def main(): # pragma: no cover 23 | """The main harness that gets the pipeline definition JSON. 24 | 25 | Prints the json to stdout or saves to file. 26 | """ 27 | parser = argparse.ArgumentParser("Gets the pipeline definition for the pipeline script.") 28 | 29 | parser.add_argument( 30 | "-n", 31 | "--module-name", 32 | dest="module_name", 33 | type=str, 34 | help="The module name of the pipeline to import.", 35 | ) 36 | parser.add_argument( 37 | "-f", 38 | "--file-name", 39 | dest="file_name", 40 | type=str, 41 | default=None, 42 | help="The file to output the pipeline definition json to.", 43 | ) 44 | parser.add_argument( 45 | "-kwargs", 46 | "--kwargs", 47 | dest="kwargs", 48 | default=None, 49 | help="Dict string of keyword arguments for the pipeline generation (if supported)", 50 | ) 51 | args = parser.parse_args() 52 | 53 | if args.module_name is None: 54 | parser.print_help() 55 | sys.exit(2) 56 | 57 | try: 58 | pipeline = get_pipeline_driver(args.module_name, args.kwargs) 59 | content = pipeline.definition() 60 | if args.file_name: 61 | with open(args.file_name, "w") as f: 62 | f.write(content) 63 | else: 64 | print(content) 65 | except Exception as e: # pylint: disable=W0703 66 | print(f"Exception: {e}") 67 | sys.exit(1) 68 | 69 | 70 | if __name__ == "__main__": 71 | main() 72 | -------------------------------------------------------------------------------- /container/decision_trees/serve: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # This file implements the scoring service shell. You don't necessarily need to modify it for various 4 | # algorithms. It starts nginx and gunicorn with the correct configurations and then simply waits until 5 | # gunicorn exits. 
6 | # 7 | # The flask server is specified to be the app object in wsgi.py 8 | # 9 | # We set the following parameters: 10 | # 11 | # Parameter Environment Variable Default Value 12 | # --------- -------------------- ------------- 13 | # number of workers MODEL_SERVER_WORKERS the number of CPU cores 14 | # timeout MODEL_SERVER_TIMEOUT 60 seconds 15 | 16 | import multiprocessing 17 | import os 18 | import signal 19 | import subprocess 20 | import sys 21 | 22 | cpu_count = multiprocessing.cpu_count() 23 | 24 | model_server_timeout = os.environ.get('MODEL_SERVER_TIMEOUT', 60) 25 | model_server_workers = int(os.environ.get('MODEL_SERVER_WORKERS', cpu_count)) 26 | 27 | def sigterm_handler(nginx_pid, gunicorn_pid): 28 | try: 29 | os.kill(nginx_pid, signal.SIGQUIT) 30 | except OSError: 31 | pass 32 | try: 33 | os.kill(gunicorn_pid, signal.SIGTERM) 34 | except OSError: 35 | pass 36 | 37 | sys.exit(0) 38 | 39 | def start_server(): 40 | print('Starting the inference server with {} workers.'.format(model_server_workers)) 41 | 42 | 43 | # link the log streams to stdout/err so they will be logged to the container logs 44 | subprocess.check_call(['ln', '-sf', '/dev/stdout', '/var/log/nginx/access.log']) 45 | subprocess.check_call(['ln', '-sf', '/dev/stderr', '/var/log/nginx/error.log']) 46 | 47 | nginx = subprocess.Popen(['nginx', '-c', '/opt/program/nginx.conf']) 48 | gunicorn = subprocess.Popen(['gunicorn', 49 | '--timeout', str(model_server_timeout), 50 | '-k', 'sync', 51 | '-b', 'unix:/tmp/gunicorn.sock', 52 | '-w', str(model_server_workers), 53 | 'wsgi:app']) 54 | 55 | signal.signal(signal.SIGTERM, lambda a, b: sigterm_handler(nginx.pid, gunicorn.pid)) 56 | 57 | # If either subprocess exits, so do we. 58 | pids = set([nginx.pid, gunicorn.pid]) 59 | while True: 60 | pid, _ = os.wait() 61 | if pid in pids: 62 | break 63 | 64 | sigterm_handler(nginx.pid, gunicorn.pid) 65 | print('Inference server exiting') 66 | 67 | # The main routine just invokes the start function. 68 | 69 | if __name__ == '__main__': 70 | start_server() 71 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/diabetes/xgb_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | """Evaluation script for measuring model accuracy.""" 14 | 15 | import json 16 | import logging 17 | import os 18 | import pickle 19 | import tarfile 20 | 21 | import pandas as pd 22 | import numpy 23 | import xgboost 24 | import boto3 25 | 26 | 27 | logger = logging.getLogger() 28 | logger.setLevel(logging.INFO) 29 | logger.addHandler(logging.StreamHandler()) 30 | 31 | # May need to import additional metrics depending on what you are measuring. 
32 | # See https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html 33 | from sklearn.metrics import ( 34 | accuracy_score, 35 | classification_report, 36 | roc_auc_score, 37 | mean_squared_error, 38 | mean_absolute_error, 39 | r2_score, 40 | ) 41 | 42 | if __name__ == "__main__": 43 | 44 | model_path = "/opt/ml/processing/model/model.tar.gz" 45 | with tarfile.open(model_path) as tar: 46 | tar.extractall(path=".") 47 | 48 | logger.debug("Loading XGB model.") 49 | model = pickle.load(open("xgboost-model", "rb")) 50 | 51 | test_path = "/opt/ml/processing/test/test.csv" 52 | 53 | logger.info("Loading test input data") 54 | 55 | df = pd.read_csv(test_path, header=None) 56 | 57 | logger.debug("Reading test data.") 58 | y_test = df.iloc[:, 0].to_numpy() 59 | 60 | df.drop(df.columns[0], axis=1, inplace=True) 61 | X_test = xgboost.DMatrix(df.values) 62 | 63 | logger.info("Performing predictions against test data.") 64 | predictions = model.predict(X_test) 65 | 66 | logger.info("Creating classification evaluation report") 67 | 68 | acc = accuracy_score(y_test, predictions.round()) 69 | roc = roc_auc_score(y_test, predictions.round()) 70 | 71 | report_dict = { 72 | "classification_metrics": { 73 | "acc": {"value": acc}, 74 | "roc": {"value": roc}, 75 | }, 76 | } 77 | 78 | logger.info("Classification report:\n{}".format(report_dict)) 79 | 80 | evaluation_output_path = os.path.join("/opt/ml/processing/evaluation", "xgb_evaluation.json") 81 | logger.info("Saving classification report to {}".format(evaluation_output_path)) 82 | 83 | with open(evaluation_output_path, "w") as f: 84 | f.write(json.dumps(report_dict)) 85 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/diabetes/dtree_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 13 | """Evaluation script for measuring model accuracy.""" 14 | 15 | import json 16 | import logging 17 | import os 18 | import pickle 19 | import tarfile 20 | 21 | import pandas as pd 22 | import numpy 23 | 24 | logger = logging.getLogger() 25 | logger.setLevel(logging.INFO) 26 | logger.addHandler(logging.StreamHandler()) 27 | 28 | # May need to import additional metrics depending on what you are measuring.
29 | # See https://docs.aws.amazon.com/sagemaker/latest/dg/model-monitor-model-quality-metrics.html 30 | from sklearn.metrics import ( 31 | accuracy_score, 32 | classification_report, 33 | roc_auc_score, 34 | mean_squared_error, 35 | mean_absolute_error, 36 | r2_score, 37 | ) 38 | 39 | if __name__ == "__main__": 40 | 41 | prefix = "/opt/ml/processing/" 42 | tar_model_path = os.path.join(prefix, 'model/model.tar.gz') 43 | model_path = os.path.join(prefix, 'model/decision-tree-model.pkl') 44 | 45 | os.system('sudo chown -R 1000:100 ' + prefix) 46 | with tarfile.open(tar_model_path) as tar: 47 | tar.extractall(path="/opt/ml/processing/model/") 48 | 49 | logger.debug("Loading DTree model.") 50 | 51 | model = pickle.load(open(model_path, "rb")) 52 | 53 | test_path = "/opt/ml/processing/test/test.csv" 54 | 55 | logger.info("Loading test input data") 56 | 57 | df = pd.read_csv(test_path, header=None) 58 | 59 | logger.debug("Reading test data.") 60 | y_test = df.iloc[:, 0].to_numpy() 61 | df.drop(df.columns[0], axis=1, inplace=True) 62 | X_test = numpy.array(df.values) 63 | 64 | logger.info("Performing predictions against test data.") 65 | predictions = model.predict(X_test) 66 | 67 | logger.info("Creating classification evaluation report") 68 | 69 | acc = accuracy_score(y_test, predictions) 70 | roc = roc_auc_score(y_test, predictions) 71 | 72 | report_dict = { 73 | "classification_metrics": { 74 | "acc": {"value": acc}, 75 | "roc": {"value": roc}, 76 | }, 77 | } 78 | 79 | logger.info("Classification report:\n{}".format(report_dict)) 80 | 81 | evaluation_output_path = os.path.join("/opt/ml/processing/evaluation", "dtree_evaluation.json") 82 | logger.info("Saving classification report to {}".format(evaluation_output_path)) 83 | 84 | with open(evaluation_output_path, "w") as f: 85 | f.write(json.dumps(report_dict)) 86 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/diabetes/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License.
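# Usage note (derived from the code below): this script expects an --input-data
# argument of the form s3://<bucket>/<prefix>; it downloads every CSV object
# under that prefix, concatenates them, and writes a ~70/10/20
# train/validation/test split (np.split with cut points at 70% and 80% of the
# shuffled rows).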
13 | """Feature engineers the diabetes readmission dataset.""" 14 | import argparse 15 | import logging 16 | import pathlib 17 | 18 | import boto3 19 | import numpy as np 20 | import pandas as pd 21 | 22 | import os 23 | import glob 24 | 25 | 26 | logger = logging.getLogger() 27 | logger.setLevel(logging.INFO) 28 | logger.addHandler(logging.StreamHandler()) 29 | 30 | if __name__ == "__main__": 31 | logger.info("Starting preprocessing") 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument("--input-data", type=str, required=True) 34 | args = parser.parse_args() 35 | 36 | logger.info("Setting memory workaround") 37 | os.system("echo 1 > /proc/sys/vm/overcommit_memory") 38 | 39 | base_dir = "/opt/ml/processing" 40 | pathlib.Path(f"{base_dir}/data").mkdir(parents=True, exist_ok=True) 41 | input_data = args.input_data 42 | bucket = input_data.split("/")[2] 43 | s3_output_prefix = "/".join(input_data.split("/")[3:]) 44 | 45 | s3_resource = boto3.resource("s3") 46 | temp_s3_bucket = s3_resource.Bucket(bucket) 47 | prefix_objs = temp_s3_bucket.objects.filter(Prefix=s3_output_prefix) 48 | for obj in prefix_objs: 49 | key = obj.key 50 | logger.info("Downloading data from bucket: %s, key: %s", bucket, key) 51 | s3fn = key.split("/") 52 | s3fn = s3fn[len(s3fn) - 1] 53 | fn = f"{base_dir}/data/{s3fn}" 54 | s3_resource.Bucket(bucket).download_file(key, fn) 55 | 56 | logger.info("Reading downloaded data") 57 | all_files = glob.iglob(os.path.join(f"{base_dir}/data", "*.csv")) 58 | df_from_each_file = (pd.read_csv(f) for f in all_files) 59 | model_data = pd.concat(df_from_each_file, ignore_index=True) 60 | 61 | logger.info(model_data.info()) 62 | 63 | # Split the data 64 | train_data, validation_data, test_data = np.split( 65 | model_data.sample(frac=1, random_state=1729), 66 | [int(0.7 * len(model_data)), int(0.8 * len(model_data))], 67 | ) 68 | 69 | test_data = test_data[train_data.columns] 70 | validation_data = validation_data[train_data.columns] 71 | 72 | pd.DataFrame(train_data).to_csv(f"{base_dir}/train/train.csv", header=False, index=False) 73 | pd.DataFrame(validation_data).to_csv( 74 | f"{base_dir}/validation/validation.csv", header=False, index=False 75 | ) 76 | pd.DataFrame(test_data).to_csv(f"{base_dir}/test/test.csv", header=False, index=False) 77 | -------------------------------------------------------------------------------- /container/decision_trees/predictor.py: -------------------------------------------------------------------------------- 1 | # This is the file that implements a flask server to do inferences. It's the file that you will modify to 2 | # implement the scoring for your own algorithm. 3 | 4 | from __future__ import print_function 5 | 6 | import io 7 | import json 8 | import os 9 | import pickle 10 | import signal 11 | import sys 12 | import traceback 13 | 14 | import flask 15 | import pandas as pd 16 | 17 | prefix = "/opt/ml/" 18 | model_path = os.path.join(prefix, "model") 19 | 20 | # A singleton for holding the model. This simply loads the model and holds it. 21 | # It has a predict function that does a prediction based on the model and the input data.
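# For local testing, the scripts under container/local_test exercise this
# server end to end, e.g. (a sketch; <image-name> is whatever image you built):
#   ./serve_local.sh <image-name>       # serve the model on localhost:8080
#   ./predict.sh payload.csv text/csv   # POST a CSV payload to /invocations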
22 | 23 | 24 | class ScoringService(object): 25 | model = None # Where we keep the model when it's loaded 26 | 27 | @classmethod 28 | def get_model(cls): 29 | """Get the model object for this instance, loading it if it's not already loaded.""" 30 | if cls.model == None: 31 | with open(os.path.join(model_path, "decision-tree-model.pkl"), "rb") as inp: 32 | cls.model = pickle.load(inp) 33 | return cls.model 34 | 35 | @classmethod 36 | def predict(cls, input): 37 | """For the input, do the predictions and return them. 38 | 39 | Args: 40 | input (a pandas dataframe): The data on which to do the predictions. There will be 41 | one prediction per row in the dataframe""" 42 | clf = cls.get_model() 43 | return clf.predict(input) 44 | 45 | 46 | # The flask app for serving predictions 47 | app = flask.Flask(__name__) 48 | 49 | 50 | @app.route("/ping", methods=["GET"]) 51 | def ping(): 52 | """Determine if the container is working and healthy. In this sample container, we declare 53 | it healthy if we can load the model successfully.""" 54 | health = ScoringService.get_model() is not None # You can insert a health check here 55 | 56 | status = 200 if health else 404 57 | return flask.Response(response="\n", status=status, mimetype="application/json") 58 | 59 | 60 | @app.route("/invocations", methods=["POST"]) 61 | def transformation(): 62 | """Do an inference on a single batch of data. In this sample server, we take data as CSV, convert 63 | it to a pandas data frame for internal use and then convert the predictions back to CSV (which really 64 | just means one prediction per line, since there's a single column. 65 | """ 66 | data = None 67 | 68 | # Convert from CSV to pandas 69 | if flask.request.content_type == "text/csv": 70 | data = flask.request.data.decode("utf-8") 71 | s = io.StringIO(data) 72 | data = pd.read_csv(s, header=None) 73 | else: 74 | return flask.Response( 75 | response="This predictor only supports CSV data", status=415, mimetype="text/plain" 76 | ) 77 | 78 | print("Invoked with {} records".format(data.shape[0])) 79 | 80 | # Do the prediction 81 | predictions = ScoringService.predict(data) 82 | 83 | # Convert from numpy back to CSV 84 | out = io.StringIO() 85 | pd.DataFrame({"results": predictions}).to_csv(out, header=False, index=False) 86 | result = out.getvalue() 87 | 88 | return flask.Response(response=result, status=200, mimetype="text/csv") 89 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/run_pipeline.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"). You 4 | # may not use this file except in compliance with the License. A copy of 5 | # the License is located at 6 | # 7 | # http://aws.amazon.com/apache2.0/ 8 | # 9 | # or in the "license" file accompanying this file. This file is 10 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 11 | # ANY KIND, either express or implied. See the License for the specific 12 | # language governing permissions and limitations under the License. 
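# Example invocation (a sketch; the values are illustrative -- see
# sagemaker-pipeline/codebuild-buildspec.yml for the real call):
#   run-pipeline --module-name pipelines.diabetes.pipeline \
#       --role-arn "${SAGEMAKER_PIPELINE_ROLE_ARN}" \
#       --tags '[{"Key": "sagemaker:project-name", "Value": "diabetes"}]' \
#       --kwargs '{"region": "us-east-1"}'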
13 | """A CLI to create or update and run pipelines.""" 14 | from __future__ import absolute_import 15 | 16 | import argparse 17 | import json 18 | import sys 19 | 20 | from pipelines._utils import get_pipeline_driver, convert_struct, get_pipeline_custom_tags 21 | 22 | 23 | def main(): # pragma: no cover 24 | """The main harness that creates or updates and runs the pipeline. 25 | 26 | Creates or updates the pipeline and runs it. 27 | """ 28 | parser = argparse.ArgumentParser( 29 | "Creates or updates and runs the pipeline for the pipeline script." 30 | ) 31 | 32 | parser.add_argument( 33 | "-n", 34 | "--module-name", 35 | dest="module_name", 36 | type=str, 37 | help="The module name of the pipeline to import.", 38 | ) 39 | parser.add_argument( 40 | "-kwargs", 41 | "--kwargs", 42 | dest="kwargs", 43 | default=None, 44 | help="Dict string of keyword arguments for the pipeline generation (if supported)", 45 | ) 46 | parser.add_argument( 47 | "-role-arn", 48 | "--role-arn", 49 | dest="role_arn", 50 | type=str, 51 | help="The role arn for the pipeline service execution role.", 52 | ) 53 | parser.add_argument( 54 | "-description", 55 | "--description", 56 | dest="description", 57 | type=str, 58 | default=None, 59 | help="The description of the pipeline.", 60 | ) 61 | parser.add_argument( 62 | "-tags", 63 | "--tags", 64 | dest="tags", 65 | default=None, 66 | help="""List of dict strings of '[{"Key": "string", "Value": "string"}, ..]'""", 67 | ) 68 | args = parser.parse_args() 69 | 70 | if args.module_name is None or args.role_arn is None: 71 | parser.print_help() 72 | sys.exit(2) 73 | tags = convert_struct(args.tags) 74 | 75 | try: 76 | pipeline = get_pipeline_driver(args.module_name, args.kwargs) 77 | print("###### Creating/updating a SageMaker Pipeline with the following definition:") 78 | parsed = json.loads(pipeline.definition()) 79 | print(json.dumps(parsed, indent=2, sort_keys=True)) 80 | 81 | all_tags = get_pipeline_custom_tags(args.module_name, args.kwargs, tags) 82 | 83 | upsert_response = pipeline.upsert( 84 | role_arn=args.role_arn, description=args.description, tags=all_tags 85 | ) 86 | print("\n###### Created/Updated SageMaker Pipeline: Response received:") 87 | print(upsert_response) 88 | 89 | execution = pipeline.start() 90 | print(f"\n###### Execution started with PipelineExecutionArn: {execution.arn}") 91 | 92 | print("Waiting for the execution to finish...") 93 | execution.wait() 94 | print("\n#####Execution completed. Execution step details:") 95 | 96 | print(execution.list_steps()) 97 | # Todo print the status? 
98 | except Exception as e: # pylint: disable=W0703 99 | print(f"Exception: {e}") 100 | sys.exit(1) 101 | 102 | 103 | if __name__ == "__main__": 104 | main() 105 | -------------------------------------------------------------------------------- /diabetes-project-iam.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Effect": "Allow", 6 | "Action": [ 7 | "iam:CreateRole", 8 | "iam:AttachRolePolicy", 9 | "iam:CreatePolicy", 10 | "iam:PassRole" 11 | ], 12 | "Resource": "*" 13 | }, 14 | { 15 | "Action": [ 16 | "cloudwatch:PutMetricData" 17 | ], 18 | "Resource": "*", 19 | "Effect": "Allow" 20 | }, 21 | { 22 | "Action": [ 23 | "codecommit:*" 24 | ], 25 | "Resource": "arn:aws:codecommit:*:*:sagemaker-*", 26 | "Effect": "Allow" 27 | }, 28 | { 29 | "Action": [ 30 | "codepipeline:StartPipelineExecution" 31 | ], 32 | "Resource": "arn:aws:codepipeline:*:*:sagemaker-*", 33 | "Effect": "Allow" 34 | }, 35 | { 36 | "Action": [ 37 | "codebuild:StartBuild" 38 | ], 39 | "Resource": "arn:aws:codebuild:*:*:sagemaker-*", 40 | "Effect": "Allow" 41 | }, 42 | { 43 | "Action": [ 44 | "ecr:BatchCheckLayerAvailability", 45 | "ecr:BatchGetImage", 46 | "ecr:Describe*", 47 | "ecr:GetAuthorizationToken", 48 | "ecr:GetDownloadUrlForLayer" 49 | ], 50 | "Resource": "*", 51 | "Effect": "Allow" 52 | }, 53 | { 54 | "Effect": "Allow", 55 | "Action": [ 56 | "ecr:BatchDeleteImage", 57 | "ecr:CompleteLayerUpload", 58 | "ecr:CreateRepository", 59 | "ecr:DeleteRepository", 60 | "ecr:InitiateLayerUpload", 61 | "ecr:PutImage", 62 | "ecr:UploadLayerPart" 63 | ], 64 | "Resource": [ 65 | "arn:aws:ecr:*:*:repository/*" 66 | ] 67 | }, 68 | { 69 | "Action": [ 70 | "logs:CreateLogDelivery", 71 | "logs:CreateLogGroup", 72 | "logs:CreateLogStream", 73 | "logs:DeleteLogDelivery", 74 | "logs:Describe*", 75 | "logs:GetLogDelivery", 76 | "logs:GetLogEvents", 77 | "logs:ListLogDeliveries", 78 | "logs:PutLogEvents", 79 | "logs:PutResourcePolicy", 80 | "logs:UpdateLogDelivery" 81 | ], 82 | "Resource": "*", 83 | "Effect": "Allow" 84 | }, 85 | { 86 | "Effect": "Allow", 87 | "Action": [ 88 | "s3:CreateBucket", 89 | "s3:DeleteBucket", 90 | "s3:GetBucketAcl", 91 | "s3:GetBucketCors", 92 | "s3:GetBucketLocation", 93 | "s3:ListAllMyBuckets", 94 | "s3:ListBucket", 95 | "s3:ListBucketMultipartUploads", 96 | "s3:PutBucketCors", 97 | "s3:PutObjectAcl" 98 | ], 99 | "Resource": [ 100 | "arn:aws:s3:::sagemaker-*" 101 | ] 102 | }, 103 | { 104 | "Effect": "Allow", 105 | "Action": [ 106 | "s3:AbortMultipartUpload", 107 | "s3:DeleteObject", 108 | "s3:GetObject", 109 | "s3:GetObjectVersion", 110 | "s3:PutObject", 111 | "s3:PutEncryptionConfiguration" 112 | ], 113 | "Resource": [ 114 | "arn:aws:s3:::sagemaker-*" 115 | ] 116 | }, 117 | { 118 | "Effect": "Allow", 119 | "Action": [ 120 | "sagemaker:*" 121 | ], 122 | "NotResource": [ 123 | "arn:aws:sagemaker:*:*:domain/*", 124 | "arn:aws:sagemaker:*:*:user-profile/*", 125 | "arn:aws:sagemaker:*:*:app/*", 126 | "arn:aws:sagemaker:*:*:flow-definition/*" 127 | ] 128 | } 129 | ] 130 | } 131 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Predict diabetic patient readmission using multi-model training on SageMaker Pipelines 2 | 3 | This project has two (2) components: (1) `container` - a custom Docker image with a Decision Tree algorithm using scikit-learn, with hyperparameter tuning support, and
(2) `sagemaker-pipeline` - a SageMaker pipeline that supports two (2) algorithms: XGBoost on a SageMaker-provided container and Decision Tree on the custom container built from the first component. The pipeline imports the data from an S3 bucket using SageMaker Data Wrangler for ML training. The pipeline also supports SageMaker HyperParameter Tuning. The best performing model in terms of ROC AUC is then registered to the model registry, ready for inference deployment. 4 | 5 | ## Start here 6 | 7 | In this example, we are solving a binary classification problem to determine whether a diabetic patient is likely to be readmitted to the hospital. This example uses the [Diabetes 130-US hospitals for years 1999-2008 Data Set](https://archive.ics.uci.edu/ml/datasets/diabetes+130-us+hospitals+for+years+1999-2008). The dataset is uploaded to an S3 bucket and the pipeline imports the data from this bucket. Data Wrangler transforms the data (e.g., one-hot encoding) as the initial step in the pipeline. The pipeline then proceeds with preprocessing, training using Decision Tree and XGBoost algorithms with hyperparameter tuning, evaluation, and registration of the winning model to the registry. This pipeline is a modified version of the pipeline provided by the [Amazon SageMaker Examples multi-model pipeline](https://github.com/aws/amazon-sagemaker-examples/tree/main/sagemaker-pipeline-multi-model). 8 | 9 | Prior to running the pipeline, you have to push the Decision Tree custom container to your own Amazon Elastic Container Registry (ECR). This container is a modified version of [Scikit BYO](https://github.com/aws/amazon-sagemaker-examples/tree/main/advanced_functionality/scikit_bring_your_own/container). 10 | 11 | You can use the `diabetes-project-with-mlops.ipynb` notebook to experiment from SageMaker Studio before you are ready to check in your code. Alternatively, you can run the pipeline outside of SageMaker Projects using `diabetes-project.ipynb`. 12 | 13 | ## DataSet 14 | 15 | The dataset represents 10 years (1999-2008) of clinical care at 130 US hospitals and integrated delivery networks. It includes over 50 features representing patient and hospital outcomes. Information was extracted from the database for encounters that satisfied specific inclusion criteria. More dataset information can be found in the [Diabetes 130-US hospitals for years 1999-2008 Data Set](https://archive.ics.uci.edu/ml/datasets/diabetes+130-us+hospitals+for+years+1999-2008). 16 | 17 | ## Assumptions and Prerequisites 18 | 19 | - S3 bucket `sagemaker-diabetes-<AWS account ID>` is created and raw data has been uploaded to `s3://sagemaker-diabetes-<AWS account ID>/`. 20 | - A SageMaker project is already created. The recommendation is to create a SageMaker project using the [SageMaker-provided MLOps template for model building, training, and deployment](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-projects-templates-sm.html#sagemaker-projects-templates-code-commit). 21 | - The necessary IAM service roles are already created. 22 | 23 | ## Security 24 | 25 | This sample code is not designed for production deployment out-of-the-box, so further security enhancements may need to be added according to your own requirements before pushing to production.
Security recommendations include, but are not limited to, the following: 26 | - Use a private ECR repository 27 | - Use more narrowly scoped IAM permissions for service roles 28 | - Use interface / gateway VPC endpoints to prevent communication traffic from traversing the public network 29 | - Use an S3 VPC endpoint policy which controls access to specified Amazon S3 buckets only 30 | 31 | The notebooks create an IAM role `AmazonSageMakerServiceCatalogProductsUseRole-diabetes` with `AmazonSageMakerFullAccess` attached. [This is required as we are creating a custom SageMaker image](https://docs.aws.amazon.com/sagemaker/latest/dg/studio-byoi-create.html). 32 | 33 | 34 | [diabetes-project-with-mlops.ipynb](diabetes-project-with-mlops.ipynb) and [diabetes-project.ipynb](diabetes-project.ipynb) have been tested in a SageMaker notebook instance that is using a kernel with Python 3.7 installed. This SageMaker notebook is attached to an [IAM role with an in-line policy](diabetes-project-iam.json). 35 | 36 | ## License 37 | 38 | This library is licensed under the MIT-0 License. See the LICENSE file. 39 | -------------------------------------------------------------------------------- /container/decision_trees/train: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # A sample training component that trains a simple scikit-learn decision tree model. 4 | # This implementation works in File mode and makes no assumptions about the input file names. 5 | # Input is specified as CSV with a data point in each row and the labels in the first column. 6 | 7 | from __future__ import print_function 8 | 9 | import json 10 | import os 11 | import pickle 12 | import sys 13 | import traceback 14 | import logging 15 | 16 | import pandas as pd 17 | from sklearn import tree 18 | 19 | from sklearn.metrics import mean_squared_error, r2_score, roc_auc_score 20 | 21 | logger = logging.getLogger() 22 | logger.setLevel(logging.INFO) 23 | logger.addHandler(logging.StreamHandler()) 24 | # These are the paths to where SageMaker mounts interesting things in your container. 25 | 26 | prefix = '/opt/ml/' 27 | 28 | input_path = os.path.join(prefix, 'input/data') 29 | output_path = os.path.join(prefix, 'output') 30 | model_path = os.path.join(prefix, 'model') 31 | param_path = os.path.join(prefix, 'input/config/hyperparameters.json') 32 | 33 | # This algorithm uses two channels of input data, 'training' and 'validation'. Since we run in 34 | # File mode, the input files are copied to the directories specified here. 35 | channel_name_training='training' 36 | training_path = os.path.join(input_path, channel_name_training) 37 | 38 | channel_name_validation='validation' 39 | validation_path = os.path.join(input_path, channel_name_validation) 40 | 41 | # The function to execute the training.
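# Note: hyperparameters arrive as strings in /opt/ml/input/config/hyperparameters.json
# (e.g. {"max_depth": "2", "max_leaf_nodes": "2"}, as in container/local_test/test_dir),
# so train() casts them to int before passing them to scikit-learn.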
42 | def train(): 43 | print('Starting the training.') 44 | try: 45 | # Read in any hyperparameters that the user passed with the training job 46 | with open(param_path, 'r') as tc: 47 | trainingParams = json.load(tc) 48 | 49 | # Take the set of files and read them all into a single pandas dataframe 50 | input_files = [ os.path.join(training_path, file) for file in os.listdir(training_path) ] 51 | if len(input_files) == 0: 52 | raise ValueError(('There are no files in {}.\n' + 53 | 'This usually indicates that the channel ({}) was incorrectly specified,\n' + 54 | 'the data specification in S3 was incorrectly specified or the role specified\n' + 55 | 'does not have permission to access the data.').format(training_path, channel_name_training)) 56 | raw_data = [ pd.read_csv(file, header=None) for file in input_files ] 57 | train_data = pd.concat(raw_data) 58 | 59 | # labels are in the first column 60 | train_y = train_data.iloc[:,0] 61 | train_X = train_data.iloc[:,1:] 62 | 63 | # Here we only support a single hyperparameter. Note that hyperparameters are always passed in as 64 | # strings, so we need to do any necessary conversions. 65 | max_leaf_nodes = trainingParams.get('max_leaf_nodes', None) 66 | if max_leaf_nodes is not None: 67 | max_leaf_nodes = int(max_leaf_nodes) 68 | max_depth = trainingParams.get('max_depth', None) 69 | if max_depth is not None: 70 | max_depth = int(max_depth) 71 | 72 | clf = tree.DecisionTreeClassifier(max_leaf_nodes=max_leaf_nodes, max_depth=max_depth) 73 | clf = clf.fit(train_X, train_y) 74 | 75 | 76 | # save the model 77 | os.system('sudo chown -R 1000:100 ' + model_path) 78 | with open(os.path.join(model_path, 'decision-tree-model.pkl'), 'wb') as out: 79 | pickle.dump(clf, out) 80 | 81 | print('Training complete.') 82 | 83 | return clf 84 | except Exception as e: 85 | # Write out an error file. This will be returned as the failureReason in the 86 | # DescribeTrainingJob result. 87 | trc = traceback.format_exc() 88 | os.system('sudo chown -R 1000:100 ' + output_path) 89 | with open(os.path.join(output_path, 'failure'), 'w') as s: 90 | s.write('Exception during training: ' + str(e) + '\n' + trc) 91 | 92 | # Printing this causes the exception to be in the training job logs, as well. 93 | print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr) 94 | # A non-zero exit code causes the training job to be marked as Failed. 95 | sys.exit(255) 96 | 97 | # The function to execute the validation. 
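# The "auc:<value>" line logged below is what a SageMaker hyperparameter tuning
# job can scrape as its objective metric, via a metric definition regex such as
# "auc:([0-9\\.]+)" (an assumption -- the actual regex lives in the pipeline's
# tuner configuration, not in this file).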
98 | def validation(clf): 99 | print('Starting the validation.') 100 | try: 101 | 102 | # Take the set of files and read them all into a single pandas dataframe 103 | input_files = [ os.path.join(validation_path, file) for file in os.listdir(validation_path) ] 104 | if len(input_files) == 0: 105 | raise ValueError(('There are no files in {}.\n' + 106 | 'This usually indicates that the channel ({}) was incorrectly specified,\n' + 107 | 'the data specification in S3 was incorrectly specified or the role specified\n' + 108 | 'does not have permission to access the data.').format(validation_path, channel_name_validation)) 109 | raw_data = [ pd.read_csv(file, header=None) for file in input_files ] 110 | validation_data = pd.concat(raw_data) 111 | 112 | # labels are in the first column 113 | train_y = validation_data.iloc[:,0] 114 | train_X = validation_data.iloc[:,1:] 115 | 116 | predictions = clf.predict(train_X) 117 | auc = roc_auc_score(train_y, predictions) 118 | logger.info(('auc:{}').format(auc)) 119 | 120 | except Exception as e: 121 | # Write out an error file. This will be returned as the failureReason in the 122 | # DescribeTrainingJob result. 123 | trc = traceback.format_exc() 124 | os.system('sudo chown -R 1000:100 ' + output_path) 125 | with open(os.path.join(output_path, 'failure'), 'w') as s: 126 | s.write('Exception during training: ' + str(e) + '\n' + trc) 127 | # Printing this causes the exception to be in the training job logs, as well. 128 | print('Exception during training: ' + str(e) + '\n' + trc, file=sys.stderr) 129 | # A non-zero exit code causes the training job to be marked as Failed. 130 | sys.exit(255) 131 | 132 | 133 | if __name__ == '__main__': 134 | clf = train() 135 | validation(clf) 136 | 137 | # A zero exit code causes the job to be marked as Succeeded. 138 | sys.exit(0) 139 | -------------------------------------------------------------------------------- /diabetes-project-with-mlops.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "af6b42fd", 6 | "metadata": {}, 7 | "source": [ 8 | "# Multi-model SageMaker Pipeline with Hyperparameter Tuning and Experiments" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "a1e0b8cc", 14 | "metadata": {}, 15 | "source": [ 16 | "Before proceeding, please see the context of this notebook in [README.md](README.md). This notebook has been tested in a SageMaker notebook that is using a kernel with Python 3.7 installed, e.g. conda_mxnet_latest_p37, conda_python3. Make sure you have created a SageMaker project outside of this notebook with the name `diabetes`. The recommendation is to create a SageMaker project using the [SageMaker-provided MLOps template for model building, training, and deployment](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-projects-templates-sm.html#sagemaker-projects-templates-code-commit). Note that this notebook will not create the SageMaker project for you. \n" 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "f81c95b4", 22 | "metadata": {}, 23 | "source": [ 24 | "## Prepare the raw data" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "2fdd2357", 30 | "metadata": {}, 31 | "source": [ 32 | "We create an S3 bucket with encryption enabled for additional security. \n", 33 | "\n", 34 | "#### If you are running this Notebook in the us-east-1 region, don't use the 'CreateBucketConfiguration' parameter with create_bucket(). us-east-1 is the default location."
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "d9393765", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import boto3\n", 45 | "\n", 46 | "AWS_ACCOUNT = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n", 47 | "AWS_REGION = boto3.Session().region_name\n", 48 | "BUCKET_NAME = \"sagemaker-diabetes-{AWS_ACCOUNT}\".format(AWS_ACCOUNT=AWS_ACCOUNT)\n", 49 | "\n", 50 | "s3_client = boto3.client(\"s3\")\n", 51 | "location = {\"LocationConstraint\": AWS_REGION}\n", 52 | "\n", 53 | "# default location is us-east-1, so CreateBucketConfiguration is not needed\n", 54 | "s3_client.create_bucket(Bucket=BUCKET_NAME)\n", 55 | "\n", 56 | "# use this create_bucket statement for any AWS region other than us-east-1\n", 57 | "#s3_client.create_bucket(Bucket=BUCKET_NAME, CreateBucketConfiguration=location) \n", 58 | "\n", 59 | "s3_client.put_bucket_encryption(\n", 60 | " Bucket=BUCKET_NAME,\n", 61 | " ServerSideEncryptionConfiguration={\n", 62 | " \"Rules\": [\n", 63 | " {\n", 64 | " \"ApplyServerSideEncryptionByDefault\": {\"SSEAlgorithm\": \"AES256\"},\n", 65 | " },\n", 66 | " ]\n", 67 | " },\n", 68 | ")" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "id": "2404183d", 74 | "metadata": {}, 75 | "source": [ 76 | "## Dataset collection\n", 77 | "\n", 78 | "Download UCI dataset and copy to S3 bucket" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": null, 84 | "id": "feb6ba21", 85 | "metadata": { 86 | "scrolled": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "%%sh\n", 91 | "\n", 92 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 93 | "BUCKET_NAME=\"sagemaker-diabetes-${AWS_ACCOUNT}\"\n", 94 | "\n", 95 | "wget https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip\n", 96 | "unzip dataset_diabetes.zip\n", 97 | "aws s3 cp dataset_diabetes/diabetic_data.csv s3://${BUCKET_NAME}/\n", 98 | " " 99 | ] 100 | }, 101 | { 102 | "cell_type": "markdown", 103 | "id": "75078e91", 104 | "metadata": {}, 105 | "source": [ 106 | "Update diabetes.flow to use your AWS account ID. 
" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": null, 112 | "id": "bb6fa1e3", 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "%%sh\n", 117 | "\n", 118 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 119 | "sed -i \"s/AWS_ACCOUNT/${AWS_ACCOUNT}/g\" sagemaker-pipeline/diabetes.flow" 120 | ] 121 | }, 122 | { 123 | "cell_type": "markdown", 124 | "id": "3a7c5961", 125 | "metadata": {}, 126 | "source": [ 127 | "Next, Create IAM Role for ML workflow steps" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "id": "4bb6d6dd", 134 | "metadata": {}, 135 | "outputs": [], 136 | "source": [ 137 | "import json\n", 138 | "\n", 139 | "iam_client = boto3.client(\"iam\")\n", 140 | "\n", 141 | "sagemaker_assume_role_policy_document = json.dumps(\n", 142 | " {\n", 143 | " \"Version\": \"2012-10-17\",\n", 144 | " \"Statement\": [\n", 145 | " {\n", 146 | " \"Effect\": \"Allow\",\n", 147 | " \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"},\n", 148 | " \"Action\": \"sts:AssumeRole\",\n", 149 | " }\n", 150 | " ],\n", 151 | " }\n", 152 | ")\n", 153 | "\n", 154 | "response_role = iam_client.create_role(\n", 155 | " RoleName=\"AmazonSageMakerServiceCatalogProductsUseRole-diabetes\",\n", 156 | " AssumeRolePolicyDocument=sagemaker_assume_role_policy_document,\n", 157 | ")\n", 158 | "\n", 159 | "\n", 160 | "iam_client.attach_role_policy(\n", 161 | " RoleName=response_role[\"Role\"][\"RoleName\"],\n", 162 | " PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFullAccess'\n", 163 | ")\n" 164 | ] 165 | }, 166 | { 167 | "cell_type": "markdown", 168 | "id": "e127e0d0", 169 | "metadata": {}, 170 | "source": [ 171 | "## Prepare the Decision Tree custom Docker image" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "fc5881fa", 177 | "metadata": {}, 178 | "source": [ 179 | "We make a Docker image containing a custom algorithm using [Scikit-learn Decision Tree Regressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html#sklearn.tree.DecisionTreeRegressor). Note that the Docker image has been modified to support hyperparameter tuning and validation data. \n", 180 | "\n" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "id": "44e33823", 187 | "metadata": {}, 188 | "outputs": [], 189 | "source": [ 190 | "! sudo yum install docker -y" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "id": "fa53c46b", 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "%%sh\n", 201 | "\n", 202 | "# The name of our algorithm\n", 203 | "ALGORITHM_NAME=\"diabetes-decision-trees\"\n", 204 | "\n", 205 | "cd container\n", 206 | "\n", 207 | "chmod +x decision_trees/train\n", 208 | "chmod +x decision_trees/serve\n", 209 | "\n", 210 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 211 | "AWS_REGION=$(aws configure get region)\n", 212 | "\n", 213 | "IMAGE_FULLNAME=\"${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ALGORITHM_NAME}:latest\"\n", 214 | "\n", 215 | "# If the repository doesn't exist in ECR, create it.\n", 216 | "aws ecr describe-repositories --repository-names \"${ALGORITHM_NAME}\" > /dev/null 2>&1\n", 217 | "\n", 218 | "if [ $? 
-ne 0 ]\n", 219 | "then\n", 220 | " aws ecr create-repository --repository-name \"${ALGORITHM_NAME}\" > /dev/null\n", 221 | "fi\n", 222 | "\n", 223 | "# Get the login command from ECR and execute it directly\n", 224 | "aws ecr get-login-password --region ${AWS_REGION}|docker login --username AWS --password-stdin ${IMAGE_FULLNAME}\n", 225 | "\n", 226 | "# Build the docker image locally with the image name and then push it to ECR with the full name.\n", 227 | "# Ensure your notebook IAM role has the required permission for pushing the image to ECR\n", 228 | "\n", 229 | "docker build -t ${ALGORITHM_NAME} .\n", 230 | "docker tag ${ALGORITHM_NAME} ${IMAGE_FULLNAME}\n", 231 | "docker push ${IMAGE_FULLNAME}\n" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "id": "3390ff94", 237 | "metadata": {}, 238 | "source": [ 239 | "Once the Docker image is pushed to the ECR repository, we make the image accessible from SageMaker. " 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "id": "6ff0f84e", 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "%%sh\n", 250 | "\n", 251 | "# The name of our algorithm\n", 252 | "SM_IMAGE_NAME=diabetes-dtree\n", 253 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 254 | "\n", 255 | "# This assumes the role name is AmazonSageMakerServiceCatalogProductsUseRole-diabetes\n", 256 | "ROLE_ARN=\"arn:aws:iam::${AWS_ACCOUNT}:role/AmazonSageMakerServiceCatalogProductsUseRole-diabetes\"\n", 257 | "\n", 258 | "aws sagemaker create-image \\\n", 259 | " --image-name ${SM_IMAGE_NAME} \\\n", 260 | " --role-arn ${ROLE_ARN}\n", 261 | "\n", 262 | "aws sagemaker create-app-image-config \\\n", 263 | " --cli-input-json file://container/app-image-config-input.json\n" 264 | ] 265 | }, 266 | { 267 | "cell_type": "code", 268 | "execution_count": null, 269 | "id": "7e6cf39b", 270 | "metadata": {}, 271 | "outputs": [], 272 | "source": [ 273 | "%%sh\n", 274 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 275 | "ALGORITHM_NAME=diabetes-decision-trees\n", 276 | "AWS_REGION=$(aws configure get region)\n", 277 | "SM_IMAGE_NAME=diabetes-dtree\n", 278 | "SM_BASE_IMAGE=\"${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ALGORITHM_NAME}:latest\"\n", 279 | "\n", 280 | "aws sagemaker create-image-version \\\n", 281 | " --image-name ${SM_IMAGE_NAME} \\\n", 282 | " --base-image ${SM_BASE_IMAGE}" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "id": "8ecc8a74", 288 | "metadata": {}, 289 | "source": [ 290 | "## Trigger the SageMaker Pipelines pipeline\n", 291 | "\n", 292 | "Here we perform the following steps:\n", 293 | "\n", 294 | "1) Clone the SageMaker Projects model-build repo from CodeCommit\n", 295 | "\n", 296 | "2) Copy the local project sagemaker-pipeline to the SageMaker Project repo\n", 297 | "\n", 298 | "3) Commit these changes to CodeCommit\n", 299 | "\n", 300 | "The above 3 steps will trigger the SageMaker Projects model-build pipeline."
301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "id": "b9a3c5ae", 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "%%sh\n", 311 | "SAGEMAKER_PROJECT_NAME=diabetes\n", 312 | "AWS_REGION=$(aws configure get region)\n", 313 | "SAGEMAKER_PROJECT_ID=$(aws sagemaker describe-project --project-name ${SAGEMAKER_PROJECT_NAME} --query 'ProjectId' | tr -d '\"')\n", 314 | "SAGEMAKER_PROJECT_REPO=\"sagemaker-${SAGEMAKER_PROJECT_NAME}-${SAGEMAKER_PROJECT_ID}-modelbuild\"\n", 315 | "\n", 316 | "rm -rf ~/${SAGEMAKER_PROJECT_REPO}/\n", 317 | "git clone https://git-codecommit.${AWS_REGION}.amazonaws.com/v1/repos/${SAGEMAKER_PROJECT_REPO} ~/${SAGEMAKER_PROJECT_REPO}\n", 318 | "rsync -a sagemaker-pipeline/ ~/${SAGEMAKER_PROJECT_REPO}/ && rm -rf ~/${SAGEMAKER_PROJECT_REPO}/pipelines/abalone/ ~/${SAGEMAKER_PROJECT_REPO}/build/\n", 319 | "cd ~/${SAGEMAKER_PROJECT_REPO}/ && git config --global user.name \"name\" && git config --global user.email name@email.com && git config advice.addIgnoredFile false && git add --all && git commit -am \"initial commit\" && git push origin main \n", 320 | "\n" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "id": "e2a43a57", 326 | "metadata": {}, 327 | "source": [ 328 | "The commit should trigger a pipeline run. Proceed to monitor your pipeline run until completion in SageMaker Studio. " 329 | ] 330 | }, 331 | { 332 | "cell_type": "markdown", 333 | "id": "4d35bcef", 334 | "metadata": {}, 335 | "source": [ 336 | "If you inspect the pipeline, you will see that the XGBoost model performs better than Decision Tree. Therefore, the XGBoost model is registered in the model registry." 337 | ] 338 | }, 339 | { 340 | "cell_type": "markdown", 341 | "id": "1718d5a3", 342 | "metadata": {}, 343 | "source": [ 344 | "## Approve top performing model in Model registry" 345 | ] 346 | }, 347 | { 348 | "cell_type": "markdown", 349 | "id": "593b322b", 350 | "metadata": {}, 351 | "source": [ 352 | "After the SageMaker Pipeline is complete, new trained Model will be registered in Model Registry.\n", 353 | "\n", 354 | "1) Make sure to update your desired `MODEL_VERSION`. We assume we approve the model version 1. \n", 355 | "\n", 356 | "2) As EventBridge monitors Model Registry status changes, Model status change will trigger SageMaker Projects model-deploy pipeline." 
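If you are unsure which version number to use, you can list what the pipeline registered before approving anything. A short sketch, assuming the model package group follows the `<project-name>-<project-id>` naming that the approval cell below also relies on:

```python
import boto3

sm_client = boto3.client("sagemaker")

project_name = "diabetes"
project_id = sm_client.describe_project(ProjectName=project_name)["ProjectId"]

# Assumption: packages are registered under the group "<project-name>-<project-id>"
packages = sm_client.list_model_packages(
    ModelPackageGroupName=f"{project_name}-{project_id}"
)["ModelPackageSummaryList"]

for pkg in packages:
    print(pkg["ModelPackageVersion"], pkg["ModelApprovalStatus"])
```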
357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": null, 362 | "id": "f2eec1e2", 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "from sagemaker import get_execution_role, session\n", 367 | "import boto3\n", 368 | "\n", 369 | "role = get_execution_role()\n", 370 | "sm_client = boto3.client(\"sagemaker\")\n", 371 | "\n", 372 | "MODEL_VERSION = \"1\"\n", 373 | "SAGEMAKER_PROJECT_NAME = \"diabetes\"\n", 374 | "SAGEMAKER_PROJECT_ID = sm_client.describe_project(ProjectName=SAGEMAKER_PROJECT_NAME)[\"ProjectId\"]\n", 375 | "AWS_REGION = boto3.Session().region_name\n", 376 | "MODEL_PACKAGE_ARN = \"arn:aws:sagemaker:{AWS_REGION}:{AWS_ACCOUNT}:model-package/{SAGEMAKER_PROJECT_NAME}-{SAGEMAKER_PROJECT_ID}/{MODEL_VERSION}\".format(\n", 377 | " AWS_REGION=AWS_REGION,\n", 378 | " AWS_ACCOUNT=AWS_ACCOUNT,\n", 379 | " SAGEMAKER_PROJECT_NAME=SAGEMAKER_PROJECT_NAME,\n", 380 | " SAGEMAKER_PROJECT_ID=SAGEMAKER_PROJECT_ID,\n", 381 | " MODEL_VERSION=MODEL_VERSION,\n", 382 | ")\n", 383 | "\n", 384 | "\n", 385 | "model_package_update_response = sm_client.update_model_package(\n", 386 | " ModelPackageArn=MODEL_PACKAGE_ARN, ModelApprovalStatus=\"Approved\"\n", 387 | ")" 388 | ] 389 | }, 390 | { 391 | "cell_type": "markdown", 392 | "id": "4d06cf5e", 393 | "metadata": {}, 394 | "source": [ 395 | "## Run predictions on model" 396 | ] 397 | }, 398 | { 399 | "cell_type": "markdown", 400 | "id": "89d32adf", 401 | "metadata": {}, 402 | "source": [ 403 | "Wait until SageMaker Projects model-deploy pipeline has deployed the staging inference endpoint. Use the following data for inference:\n", 404 | "\n", 405 | "Example 1\n", 406 | "------------\n", 407 | "`5.0,64.0,0.0,18.0,0.0,0.0,7.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0\n", 408 | "`\n", 409 | "\n", 410 | "In summary, this is a diabetic patient that is Caucasian Female age 60-70, who has spent 5 days in the hospital under emergency care in the current encounter. Prior to this encounter, patient has spent 0 days in outpatient care, 0 days in emergency care, 7 days in inpatient care. 64 laboratory procedures have been performed on the patient. 
The patient is not using metformin, repaglinide, pioglitazone, or rosiglitazone, and the insulin prescription is steady.\n", 411 | "\n" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "id": "124746c4", 418 | "metadata": {}, 419 | "outputs": [], 420 | "source": [ 421 | "import json\n", 422 | "import boto3\n", 423 | "\n", 424 | "sm_runtime = boto3.client(\"runtime.sagemaker\")\n", 425 | "endpoint_name = \"diabetes-staging\"\n", 426 | "line = \"5.0,64.0,0.0,18.0,0.0,0.0,7.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0\"\n", 427 | "response = sm_runtime.invoke_endpoint(EndpointName=endpoint_name, ContentType=\"text/csv\", Body=line)\n", 428 | "result = json.loads(response[\"Body\"].read().decode())\n", 429 | "print(\"Predicted class: {}\".format(round(result)))" 430 | ] 431 | }, 432 | { 433 | "cell_type": "markdown", 434 | "id": "1fef914f", 435 | "metadata": {}, 436 | "source": [ 437 | "Now you try:\n", 438 | "\n", 439 | "Example 2\n", 440 | "------------\n", 441 | "\n", 442 | "`3.0,19.0,3.0,19.0,0.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0\n", 443 | "`\n", 444 | "\n", 445 | "In summary, this is a diabetic patient that is Caucasian Female age 70-80, who has spent 3 days in the hospital under elective care in the current encounter. Prior to this encounter, patient has spent 0 days in outpatient care, 0 days in emergency care, 0 days in inpatient care. 19 laboratory procedures have been performed on the patient. The patient is not using metformin, repaglinide, pioglitazone, rosiglitazone, or insulin. " 446 | ] 447 | }, 448 | { 449 | "cell_type": "markdown", 450 | "id": "4afcc66c", 451 | "metadata": {}, 452 | "source": [ 453 | "## Cleanup" 454 | ] 455 | }, 456 | { 457 | "cell_type": "markdown", 458 | "id": "710b9232", 459 | "metadata": {}, 460 | "source": [ 461 | "To avoid incurring future charges, clean up the created resources such as the S3 bucket, the ECR repository, and SageMaker Studio. Before deleting SageMaker Studio, make sure to delete the SageMaker model and endpoint resources and the entire SageMaker project diabetes, as well as its peripheral resources (CodePipeline pipelines and CodeCommit repositories).\n", 462 | "Finally, delete the Jupyter instance containing the notebook. 
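One possible cleanup sequence from the CLI, as a sketch that assumes the resource names used in this walkthrough (adjust to what actually exists in your account; endpoints created by the model-deploy pipeline live in CloudFormation stacks and may be better removed there):

```sh
AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)

aws sagemaker delete-endpoint --endpoint-name diabetes-staging
aws sagemaker delete-project --project-name diabetes
aws ecr delete-repository --repository-name diabetes-decision-trees --force

# Removes the bucket together with its contents
aws s3 rb "s3://sagemaker-diabetes-${AWS_ACCOUNT}" --force
```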
" 463 | ] 464 | } 465 | ], 466 | "metadata": { 467 | "instance_type": "ml.t3.medium", 468 | "kernelspec": { 469 | "display_name": "conda_python3", 470 | "language": "python", 471 | "name": "conda_python3" 472 | }, 473 | "language_info": { 474 | "codemirror_mode": { 475 | "name": "ipython", 476 | "version": 3 477 | }, 478 | "file_extension": ".py", 479 | "mimetype": "text/x-python", 480 | "name": "python", 481 | "nbconvert_exporter": "python", 482 | "pygments_lexer": "ipython3", 483 | "version": "3.8.12" 484 | } 485 | }, 486 | "nbformat": 4, 487 | "nbformat_minor": 5 488 | } 489 | -------------------------------------------------------------------------------- /sagemaker-pipeline/pipelines/diabetes/pipeline.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed under the Apache License, Version 2.0 (the "License"). You 3 | # may not use this file except in compliance with the License. A copy of 4 | # the License is located at 5 | # 6 | # http://aws.amazon.com/apache2.0/ 7 | # 8 | # or in the "license" file accompanying this file. This file is 9 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 10 | # ANY KIND, either express or implied. See the License for the specific 11 | # language governing permissions and limitations under the License. 12 | """Example workflow pipeline script for RESVM pipeline. 13 | . -RegisterModel 14 | . 15 | Process-> Train -> Evaluate -> Condition . 16 | . 17 | . -(stop) 18 | Implements a get_pipeline(**kwargs) method. 19 | """ 20 | 21 | import os 22 | 23 | import boto3 24 | import sagemaker 25 | import sagemaker.session 26 | from sagemaker.estimator import Estimator 27 | from sagemaker.inputs import TrainingInput 28 | from sagemaker.model_metrics import MetricsSource, ModelMetrics 29 | from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor 30 | from sagemaker.sklearn.processing import SKLearnProcessor 31 | from sagemaker.workflow.condition_step import ConditionStep, JsonGet 32 | from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo 33 | from sagemaker.workflow.parameters import ParameterInteger, ParameterString 34 | from sagemaker.workflow.pipeline import Pipeline 35 | from sagemaker.workflow.properties import PropertyFile 36 | from sagemaker.workflow.step_collections import RegisterModel 37 | from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CacheConfig, TuningStep 38 | 39 | ### 40 | from sagemaker.processing import ProcessingInput, ProcessingOutput 41 | from sagemaker.dataset_definition.inputs import ( 42 | AthenaDatasetDefinition, 43 | DatasetDefinition, 44 | RedshiftDatasetDefinition, 45 | ) 46 | 47 | 48 | import time 49 | import uuid 50 | import sagemaker 51 | 52 | import os 53 | import json 54 | import boto3 55 | 56 | from sagemaker.processing import Processor 57 | from sagemaker.network import NetworkConfig 58 | 59 | from sagemaker.workflow.steps import ProcessingStep 60 | 61 | from smexperiments.experiment import Experiment 62 | from smexperiments.trial import Trial 63 | from smexperiments.trial_component import TrialComponent 64 | from smexperiments.tracker import Tracker 65 | from sagemaker.workflow.pipeline_experiment_config import PipelineExperimentConfig 66 | 67 | from sagemaker.tuner import ( 68 | ContinuousParameter, 69 | IntegerParameter, 70 | CategoricalParameter, 71 | HyperparameterTuner, 72 | WarmStartConfig, 73 | WarmStartTypes, 74 | ) 75 | 76 | 77 | BASE_DIR = os.path.dirname(os.path.realpath(__file__)) 78 
80 | def get_session(region, default_bucket): 81 | """Gets the sagemaker session based on the region. 82 | Args: 83 | region: the aws region to start the session 84 | default_bucket: the bucket to use for storing the artifacts 85 | Returns: 86 | `sagemaker.session.Session` instance 87 | """ 88 | 89 | boto_session = boto3.Session(region_name=region) 90 | 91 | sagemaker_client = boto_session.client("sagemaker") 92 | runtime_client = boto_session.client("sagemaker-runtime") 93 | return sagemaker.session.Session( 94 | boto_session=boto_session, 95 | sagemaker_client=sagemaker_client, 96 | sagemaker_runtime_client=runtime_client, 97 | default_bucket=default_bucket, 98 | ) 99 | 100 | 101 | def get_pipeline( 102 | region, 103 | sagemaker_project_arn=None, 104 | role=None, 105 | default_bucket=None, 106 | model_package_group_name="", # Choose any name 107 | pipeline_name="", # You can find your pipeline name in the Studio UI (project -> Pipelines -> name) 108 | base_job_prefix="", # Choose any name 109 | ): 110 | """Gets a SageMaker ML Pipeline instance working with the diabetes data. 111 | Args: 112 | region: AWS region to create and run the pipeline. 113 | role: IAM role to create and run steps and pipeline. 114 | default_bucket: the bucket to use for storing the artifacts 115 | Returns: 116 | an instance of a pipeline 117 | """ 118 | sagemaker_session = get_session(region, default_bucket) 119 | if role is None: 120 | role = sagemaker.session.get_execution_role(sagemaker_session) 121 | 122 | # Parameters for pipeline execution 123 | processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1) 124 | processing_instance_type = ParameterString( 125 | name="ProcessingInstanceType", default_value="ml.m5.2xlarge" 126 | ) 127 | training_instance_type = ParameterString( 128 | name="TrainingInstanceType", default_value="ml.m5.xlarge" 129 | ) 130 | model_approval_status = ParameterString( 131 | name="ModelApprovalStatus", 132 | default_value="PendingManualApproval", # ModelApprovalStatus can be set to a default of "Approved" if you don't want manual approval. 133 | ) 134 | input_data = ParameterString( 135 | name="InputDataUrl", 136 | default_value="", # Change this to point to the s3 location of your raw input data. 137 | ) 138 | 139 | # Sagemaker session 140 | sess = sagemaker_session 141 | 142 | # You can configure this with your own bucket name, e.g. 143 | # bucket = "my-bucket" 144 | bucket = sess.default_bucket() 145 | 146 | print(f"Data Wrangler export storage bucket: {bucket}") 147 | 148 | # unique flow export ID 149 | flow_export_id = f"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}" 150 | flow_export_name = f"flow-{flow_export_id}" 151 | 152 | # Output name is auto-generated from the select node's ID + output name from the flow file.
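    # A sketch of how this value can be derived from the flow file itself,
    # assuming the export (select) node is the last entry in diabetes.flow:
    #
    #     with open("diabetes.flow") as f:
    #         last_node = json.load(f)["nodes"][-1]
    #     output_name = f"{last_node['node_id']}.{last_node['outputs'][0]['name']}"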
153 | output_name = "d593101e-278b-4330-9779-b6e02fbeb99e.default" 154 | 155 | s3_output_prefix = f"export-{flow_export_name}/output" 156 | s3_output_path = f"s3://{bucket}/{s3_output_prefix}" 157 | print(f"Flow S3 export result path: {s3_output_path}") 158 | 159 | processing_job_output = ProcessingOutput( 160 | output_name=output_name, 161 | source="/opt/ml/processing/output", 162 | destination=s3_output_path, 163 | s3_upload_mode="EndOfJob", 164 | ) 165 | 166 | # name of the flow file which should exist in the current notebook working directory 167 | flow_file_name = "diabetes.flow" 168 | 169 | # Load .flow file from current notebook working directory 170 | #!echo "Loading flow file from current notebook working directory: $PWD" 171 | 172 | with open(flow_file_name) as f: 173 | flow = json.load(f) 174 | 175 | # Upload flow to S3 176 | s3_client = boto3.client("s3") 177 | s3_client.upload_file( 178 | flow_file_name, 179 | bucket, 180 | f"data_wrangler_flows/{flow_export_name}.flow", 181 | ExtraArgs={"ServerSideEncryption": "aws:kms"}, 182 | ) 183 | 184 | flow_s3_uri = f"s3://{bucket}/data_wrangler_flows/{flow_export_name}.flow" 185 | 186 | print(f"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}") 187 | 188 | flow_input = ProcessingInput( 189 | source=flow_s3_uri, 190 | destination="/opt/ml/processing/flow", 191 | input_name="flow", 192 | s3_data_type="S3Prefix", 193 | s3_input_mode="File", 194 | s3_data_distribution_type="FullyReplicated", 195 | ) 196 | 197 | # IAM role for executing the processing job. 198 | iam_role = role 199 | 200 | # Unique processing job name. Give a unique name every time you re-execute processing jobs 201 | processing_job_name = f"data-wrangler-flow-processing-{flow_export_id}" 202 | 203 | # Data Wrangler Container URL. 204 | container_uri = sagemaker.image_uris.retrieve( 205 | framework="data-wrangler", # the Data Wrangler processing container image 206 | region=region, 207 | ) 208 | 209 | # Processing Job Instance count and instance type. 210 | instance_count = 2 211 | instance_type = "ml.m5.4xlarge" 212 | 213 | # Size in GB of the EBS volume to use for storing data during processing 214 | volume_size_in_gb = 30 215 | 216 | # Content type for each output. Data Wrangler supports CSV (default) and Parquet.
217 | output_content_type = "CSV" 218 | 219 | # Network Isolation mode; default is off 220 | enable_network_isolation = False 221 | 222 | # List of tags to be passed to the processing job 223 | user_tags = [] 224 | 225 | # Output configuration used as processing job container arguments 226 | output_config = {output_name: {"content_type": output_content_type}} 227 | 228 | # KMS key for per object encryption; default is None 229 | kms_key = None 230 | 231 | processor = Processor( 232 | role=iam_role, 233 | image_uri=container_uri, 234 | instance_count=instance_count, 235 | instance_type=instance_type, 236 | volume_size_in_gb=volume_size_in_gb, 237 | network_config=NetworkConfig(enable_network_isolation=enable_network_isolation), 238 | sagemaker_session=sess, 239 | output_kms_key=kms_key, 240 | tags=user_tags, 241 | ) 242 | 243 | data_wrangler_step = ProcessingStep( 244 | name="DataWranglerProcess", 245 | processor=processor, 246 | inputs=[flow_input], 247 | outputs=[processing_job_output], 248 | job_arguments=[f"--output-config '{json.dumps(output_config)}'"], 249 | ) 250 | 251 | # Processing step for feature engineering 252 | # this processor does not have awswrangler installed 253 | sklearn_processor = SKLearnProcessor( 254 | framework_version="0.23-1", 255 | instance_type=processing_instance_type, 256 | instance_count=processing_instance_count, 257 | base_job_name=f"{base_job_prefix}/sklearn-diabetes-preprocess", # choose any name 258 | sagemaker_session=sagemaker_session, 259 | role=role, 260 | ) 261 | 262 | step_process = ProcessingStep( 263 | name="Preprocess", # choose any name 264 | processor=sklearn_processor, 265 | inputs=[ 266 | ProcessingInput( 267 | source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[ 268 | output_name 269 | ].S3Output.S3Uri, 270 | destination="/opt/ml/processing/data/raw-data-dir", 271 | ) 272 | ], 273 | outputs=[ 274 | ProcessingOutput(output_name="train", source="/opt/ml/processing/train"), 275 | ProcessingOutput(output_name="validation", source="/opt/ml/processing/validation"), 276 | ProcessingOutput(output_name="test", source="/opt/ml/processing/test"), 277 | ], 278 | code=os.path.join(BASE_DIR, "preprocess.py"), 279 | job_arguments=[ 280 | "--input-data", 281 | data_wrangler_step.properties.ProcessingOutputConfig.Outputs[ 282 | output_name 283 | ].S3Output.S3Uri, 284 | ], 285 | ) 286 | 287 | # Training step for generating model artifacts 288 | model_path = f"s3://{sagemaker_session.default_bucket()}/{base_job_prefix}/diabetesTrain" 289 | model_bucket_key = f"{sagemaker_session.default_bucket()}/{base_job_prefix}/diabetesTrain" 290 | cache_config = CacheConfig(enable_caching=True, expire_after="30d") 291 | 292 | xgb_image_uri = sagemaker.image_uris.retrieve( 293 | framework="xgboost", # we are using the Sagemaker built in xgboost algorithm 294 | region=region, 295 | version="1.0-1", 296 | py_version="py3", 297 | instance_type=training_instance_type, 298 | ) 299 | xgb_train = Estimator( 300 | image_uri=xgb_image_uri, 301 | instance_type=training_instance_type, 302 | instance_count=1, 303 | output_path=model_path, 304 | base_job_name=f"{base_job_prefix}/diabetes-xgb-train", 305 | sagemaker_session=sagemaker_session, 306 | role=role, 307 | ) 308 | xgb_train.set_hyperparameters( 309 | num_round=50, 310 | objective="binary:logistic", 311 | ) 312 | 313 | xgb_train.set_hyperparameters(grow_policy="lossguide") 314 | 315 | xgb_objective_metric_name = "validation:auc" 316 | xgb_hyperparameter_ranges = { 317 | "max_depth": IntegerParameter(5, 10, 
scaling_type="Auto"), 318 | "min_child_weight": IntegerParameter(5, 10, scaling_type="Auto"), 319 | "eta": ContinuousParameter(0.1, 0.9, scaling_type="Auto"), 320 | "gamma": IntegerParameter(4, 9, scaling_type="Auto"), 321 | "subsample": ContinuousParameter(0.7, 0.9, scaling_type="Auto"), 322 | } 323 | 324 | xgb_tuner_log = HyperparameterTuner( 325 | xgb_train, 326 | xgb_objective_metric_name, 327 | xgb_hyperparameter_ranges, 328 | max_jobs=5, 329 | max_parallel_jobs=5, 330 | strategy="Random", 331 | objective_type="Maximize", 332 | ) 333 | 334 | xgb_step_tuning = TuningStep( 335 | name="XGBHPTune", 336 | tuner=xgb_tuner_log, 337 | inputs={ 338 | "train": TrainingInput( 339 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ 340 | "train" 341 | ].S3Output.S3Uri, 342 | content_type="text/csv", 343 | ), 344 | "validation": TrainingInput( 345 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ 346 | "validation" 347 | ].S3Output.S3Uri, 348 | content_type="text/csv", 349 | ), 350 | }, 351 | cache_config=cache_config, 352 | ) 353 | 354 | dtree_image_uri = sagemaker_session.sagemaker_client.describe_image_version( 355 | ImageName="diabetes-dtree" 356 | )["ContainerImage"] 357 | 358 | dtree_train = Estimator( 359 | image_uri=dtree_image_uri, 360 | role=role, 361 | instance_count=1, 362 | instance_type=training_instance_type, 363 | base_job_name=f"{base_job_prefix}/diabetes-dtree-train", 364 | output_path=model_path, 365 | sagemaker_session=sagemaker_session, 366 | ) 367 | 368 | dtree_objective_metric_name = "validation:auc" 369 | dtree_metric_definitions = [{"Name": "validation:auc", "Regex": "auc:(\S+)"}] 370 | 371 | dtree_hyperparameter_ranges = { 372 | "max_depth": IntegerParameter(5, 10, scaling_type="Linear"), 373 | "max_leaf_nodes": IntegerParameter(2, 10, scaling_type="Linear"), 374 | } 375 | 376 | dtree_tuner_log = HyperparameterTuner( 377 | dtree_train, 378 | dtree_objective_metric_name, 379 | dtree_hyperparameter_ranges, 380 | dtree_metric_definitions, 381 | max_jobs=5, 382 | max_parallel_jobs=5, 383 | strategy="Random", 384 | objective_type="Maximize", 385 | ) 386 | 387 | dtree_step_tuning = TuningStep( 388 | name="DTreeHPTune", 389 | tuner=dtree_tuner_log, 390 | inputs={ 391 | "training": TrainingInput( 392 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ 393 | "train" 394 | ].S3Output.S3Uri, 395 | content_type="text/csv", 396 | ), 397 | "validation": TrainingInput( 398 | s3_data=step_process.properties.ProcessingOutputConfig.Outputs[ 399 | "validation" 400 | ].S3Output.S3Uri, 401 | content_type="text/csv", 402 | ), 403 | }, 404 | cache_config=cache_config, 405 | ) 406 | 407 | dtree_script_eval = ScriptProcessor( 408 | image_uri=dtree_image_uri, 409 | command=["python3"], 410 | instance_type=processing_instance_type, 411 | instance_count=1, 412 | base_job_name=f"{base_job_prefix}/script-dtree-eval", 413 | sagemaker_session=sagemaker_session, 414 | role=role, 415 | ) 416 | 417 | dtree_evaluation_report = PropertyFile( 418 | name="EvaluationReportDTree", 419 | output_name="dtree_evaluation", 420 | path="dtree_evaluation.json", 421 | ) 422 | 423 | dtree_step_eval = ProcessingStep( 424 | name="DTreeEval", 425 | processor=dtree_script_eval, 426 | inputs=[ 427 | ProcessingInput( 428 | source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), 429 | destination="/opt/ml/processing/model", 430 | ), 431 | ProcessingInput( 432 | source=step_process.properties.ProcessingOutputConfig.Outputs[ 433 | "test" 434 | 
].S3Output.S3Uri, 435 | destination="/opt/ml/processing/test", 436 | ), 437 | ], 438 | outputs=[ 439 | ProcessingOutput( 440 | output_name="dtree_evaluation", source="/opt/ml/processing/evaluation" 441 | ), 442 | ], 443 | code=os.path.join(BASE_DIR, "dtree_evaluate.py"), 444 | property_files=[dtree_evaluation_report], 445 | ) 446 | 447 | xgb_script_eval = ScriptProcessor( 448 | image_uri=xgb_image_uri, 449 | command=["python3"], 450 | instance_type=processing_instance_type, 451 | instance_count=1, 452 | base_job_name=f"{base_job_prefix}/script-xgb-eval", 453 | sagemaker_session=sagemaker_session, 454 | role=role, 455 | ) 456 | 457 | xgb_evaluation_report = PropertyFile( 458 | name="EvaluationReportXGBoost", 459 | output_name="xgb_evaluation", 460 | path="xgb_evaluation.json", 461 | ) 462 | 463 | xgb_step_eval = ProcessingStep( 464 | name="XGBEval", 465 | processor=xgb_script_eval, 466 | inputs=[ 467 | ProcessingInput( 468 | source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), 469 | destination="/opt/ml/processing/model", 470 | ), 471 | ProcessingInput( 472 | source=step_process.properties.ProcessingOutputConfig.Outputs[ 473 | "test" 474 | ].S3Output.S3Uri, 475 | destination="/opt/ml/processing/test", 476 | ), 477 | ], 478 | outputs=[ 479 | ProcessingOutput(output_name="xgb_evaluation", source="/opt/ml/processing/evaluation"), 480 | ], 481 | code=os.path.join(BASE_DIR, "xgb_evaluate.py"), 482 | property_files=[xgb_evaluation_report], 483 | ) 484 | 485 | xgb_model_metrics = ModelMetrics( 486 | model_statistics=MetricsSource( 487 | s3_uri="{}/xgb_evaluation.json".format( 488 | xgb_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"]["S3Uri"] 489 | ), 490 | content_type="application/json", 491 | ) 492 | ) 493 | 494 | dtree_model_metrics = ModelMetrics( 495 | model_statistics=MetricsSource( 496 | s3_uri="{}/dtree_evaluation.json".format( 497 | dtree_step_eval.arguments["ProcessingOutputConfig"]["Outputs"][0]["S3Output"][ 498 | "S3Uri" 499 | ] 500 | ), 501 | content_type="application/json", 502 | ) 503 | ) 504 | 505 | xgb_eval_metrics = JsonGet( 506 | step=xgb_step_eval, 507 | property_file=xgb_evaluation_report, 508 | json_path="regression_metrics.roc.value", # This should follow the structure of your report_dict defined in the evaluate.py file. 509 | ) 510 | 511 | dtree_eval_metrics = JsonGet( 512 | step=dtree_step_eval, 513 | property_file=dtree_evaluation_report, 514 | json_path="regression_metrics.roc.value", # This should follow the structure of your report_dict defined in the evaluate.py file. 
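        # For reference, both evaluation scripts are expected to write a report
        # whose JSON shape matches the json_path above; a sketch with a made-up
        # metric value:
        #
        #     {"regression_metrics": {"roc": {"value": 0.87}}}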
515 | ) 516 | 517 | # Register model step that will be conditionally executed 518 | dtree_step_register = RegisterModel( 519 | name="DTreeReg", 520 | estimator=dtree_train, 521 | model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), 522 | content_types=["text/csv"], 523 | response_types=["text/csv"], 524 | inference_instances=["ml.t2.medium", "ml.m5.large"], 525 | transform_instances=["ml.m5.large"], 526 | model_package_group_name=model_package_group_name, 527 | approval_status=model_approval_status, 528 | model_metrics=dtree_model_metrics, 529 | ) 530 | 531 | # Register model step that will be conditionally executed 532 | xgb_step_register = RegisterModel( 533 | name="XGBReg", 534 | estimator=xgb_train, 535 | model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key), 536 | content_types=["text/csv"], 537 | response_types=["text/csv"], 538 | inference_instances=["ml.t2.medium", "ml.m5.large"], 539 | transform_instances=["ml.m5.large"], 540 | model_package_group_name=model_package_group_name, 541 | approval_status=model_approval_status, 542 | model_metrics=xgb_model_metrics, 543 | ) 544 | 545 | # Condition step for evaluating model quality and branching execution 546 | cond_lte = ConditionGreaterThanOrEqualTo( # You can change the condition here 547 | left=JsonGet( 548 | step=dtree_step_eval, 549 | property_file=dtree_evaluation_report, 550 | json_path="regression_metrics.roc.value", # This should follow the structure of your report_dict defined in the evaluate.py file. 551 | ), 552 | right=JsonGet( 553 | step=xgb_step_eval, 554 | property_file=xgb_evaluation_report, 555 | json_path="regression_metrics.roc.value" 556 | ), # You can change the threshold here 557 | ) 558 | 559 | step_cond = ConditionStep( 560 | name="AccuracyCond", 561 | conditions=[cond_lte], 562 | if_steps=[dtree_step_register], 563 | else_steps=[xgb_step_register], 564 | ) 565 | create_date = time.strftime("%Y-%m-%d-%H-%M-%S") 566 | 567 | # Pipeline instance 568 | pipeline = Pipeline( 569 | name=pipeline_name, 570 | parameters=[ 571 | processing_instance_type, 572 | processing_instance_count, 573 | training_instance_type, 574 | model_approval_status, 575 | input_data 576 | ], 577 | pipeline_experiment_config=PipelineExperimentConfig( 578 | pipeline_name + "-" + create_date, "diabetes-{}".format(create_date) 579 | ), 580 | steps=[ 581 | data_wrangler_step, 582 | step_process, 583 | dtree_step_tuning, 584 | xgb_step_tuning, 585 | dtree_step_eval, 586 | xgb_step_eval, 587 | step_cond, 588 | ], 589 | sagemaker_session=sagemaker_session, 590 | ) 591 | return pipeline 592 | -------------------------------------------------------------------------------- /sagemaker-pipeline/diabetes.flow: -------------------------------------------------------------------------------- 1 | { 2 | "metadata": { 3 | "version": 1, 4 | "disable_limits": false, 5 | "instance_type": "ml.m5.4xlarge" 6 | }, 7 | "nodes": [ 8 | { 9 | "node_id": "0b3a3943-97b0-49e3-9894-830443f522ca", 10 | "type": "SOURCE", 11 | "operator": "sagemaker.s3_source_0.1", 12 | "parameters": { 13 | "dataset_definition": { 14 | "__typename": "S3CreateDatasetDefinitionOutput", 15 | "datasetSourceType": "S3", 16 | "name": "diabetic_data.csv", 17 | "description": null, 18 | "s3ExecutionContext": { 19 | "__typename": "S3ExecutionContext", 20 | "s3Uri": "s3://sagemaker-diabetes-AWS_ACCOUNT/diabetic_data.csv", 21 | "s3ContentType": "csv", 22 | "s3HasHeader": true, 23 | "s3FieldDelimiter": ",", 24 | "s3DirIncludesNested": false, 
25 | "s3AddsFilenameColumn": false 26 | } 27 | } 28 | }, 29 | "inputs": [], 30 | "outputs": [ 31 | { 32 | "name": "default" 33 | } 34 | ] 35 | }, 36 | { 37 | "node_id": "b3d30123-1423-4472-a251-b9ff24d9d381", 38 | "type": "TRANSFORM", 39 | "operator": "sagemaker.spark.infer_and_cast_type_0.1", 40 | "parameters": {}, 41 | "trained_parameters": { 42 | "schema": { 43 | "encounter_id": "long", 44 | "patient_nbr": "long", 45 | "race": "string", 46 | "gender": "string", 47 | "age": "string", 48 | "weight": "string", 49 | "admission_type_id": "long", 50 | "discharge_disposition_id": "long", 51 | "admission_source_id": "long", 52 | "time_in_hospital": "long", 53 | "payer_code": "string", 54 | "medical_specialty": "string", 55 | "num_lab_procedures": "long", 56 | "num_procedures": "long", 57 | "num_medications": "long", 58 | "number_outpatient": "long", 59 | "number_emergency": "long", 60 | "number_inpatient": "long", 61 | "diag_1": "long", 62 | "diag_2": "long", 63 | "diag_3": "long", 64 | "number_diagnoses": "long", 65 | "max_glu_serum": "string", 66 | "A1Cresult": "string", 67 | "metformin": "string", 68 | "repaglinide": "string", 69 | "nateglinide": "string", 70 | "chlorpropamide": "string", 71 | "glimepiride": "string", 72 | "acetohexamide": "string", 73 | "glipizide": "string", 74 | "glyburide": "string", 75 | "tolbutamide": "string", 76 | "pioglitazone": "string", 77 | "rosiglitazone": "string", 78 | "acarbose": "string", 79 | "miglitol": "string", 80 | "troglitazone": "string", 81 | "tolazamide": "string", 82 | "examide": "string", 83 | "citoglipton": "string", 84 | "insulin": "string", 85 | "glyburide-metformin": "string", 86 | "glipizide-metformin": "string", 87 | "glimepiride-pioglitazone": "string", 88 | "metformin-rosiglitazone": "string", 89 | "metformin-pioglitazone": "string", 90 | "change": "string", 91 | "diabetesMed": "string", 92 | "readmitted": "string" 93 | } 94 | }, 95 | "inputs": [ 96 | { 97 | "name": "default", 98 | "node_id": "0b3a3943-97b0-49e3-9894-830443f522ca", 99 | "output_name": "default" 100 | } 101 | ], 102 | "outputs": [ 103 | { 104 | "name": "default" 105 | } 106 | ] 107 | }, 108 | { 109 | "node_id": "a0fcc4ad-932f-4c61-b04f-85165ec49f54", 110 | "type": "TRANSFORM", 111 | "operator": "sagemaker.spark.manage_columns_0.1", 112 | "parameters": { 113 | "operator": "Move column", 114 | "move_column_parameters": { 115 | "move_type": "Move to start", 116 | "move_to_start_parameters": { 117 | "column_to_move": "readmitted" 118 | } 119 | }, 120 | "drop_column_parameters": {} 121 | }, 122 | "inputs": [ 123 | { 124 | "name": "df", 125 | "node_id": "b3d30123-1423-4472-a251-b9ff24d9d381", 126 | "output_name": "default" 127 | } 128 | ], 129 | "outputs": [ 130 | { 131 | "name": "default" 132 | } 133 | ] 134 | }, 135 | { 136 | "node_id": "9c491942-6270-410a-8734-dafaa3bee672", 137 | "type": "TRANSFORM", 138 | "operator": "sagemaker.spark.custom_code_0.1", 139 | "parameters": { 140 | "operator": "Python (User-Defined Function)", 141 | "udf_parameters": { 142 | "return_type": "float", 143 | "udf_mode": "Pandas", 144 | "input_col": "readmitted", 145 | "output_col": "readmitted", 146 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n #series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1) else ('circulatory' if int(x) in range(390, 460) or int(x) == 785) else ('test'))\n series = series.apply(lambda x : 0.0 if (str(x) == 'NO') else (1.0)) 
\n return series\n " 147 | }, 148 | "pyspark_parameters": {}, 149 | "name": "readmitted" 150 | }, 151 | "inputs": [ 152 | { 153 | "name": "df", 154 | "node_id": "a0fcc4ad-932f-4c61-b04f-85165ec49f54", 155 | "output_name": "default" 156 | } 157 | ], 158 | "outputs": [ 159 | { 160 | "name": "default" 161 | } 162 | ] 163 | }, 164 | { 165 | "node_id": "e589c34a-c6ec-4d8c-9549-87550098951a", 166 | "type": "TRANSFORM", 167 | "operator": "sagemaker.spark.manage_columns_0.1", 168 | "parameters": { 169 | "operator": "Drop column", 170 | "drop_column_parameters": { 171 | "column_to_drop": [ 172 | "payer_code", 173 | "encounter_id", 174 | "patient_nbr", 175 | "weight", 176 | "medical_specialty", 177 | "acarbose", 178 | "metformin-pioglitazone", 179 | "acetohexamide", 180 | "metformin-rosiglitazone", 181 | "glimepiride", 182 | "glimepiride-pioglitazone", 183 | "glipizide", 184 | "glyburide-metformin", 185 | "examide", 186 | "troglitazone", 187 | "miglitol", 188 | "citoglipton", 189 | "glipizide-metformin", 190 | "chlorpropamide", 191 | "tolbutamide", 192 | "glyburide", 193 | "tolazamide", 194 | "nateglinide" 195 | ] 196 | } 197 | }, 198 | "inputs": [ 199 | { 200 | "name": "df", 201 | "node_id": "9c491942-6270-410a-8734-dafaa3bee672", 202 | "output_name": "default" 203 | } 204 | ], 205 | "outputs": [ 206 | { 207 | "name": "default" 208 | } 209 | ] 210 | }, 211 | { 212 | "node_id": "4ea28cf4-b062-494c-a49f-6bb840d9128b", 213 | "type": "TRANSFORM", 214 | "operator": "sagemaker.spark.handle_missing_0.1", 215 | "parameters": { 216 | "operator": "Fill missing", 217 | "fill_missing_parameters": { 218 | "input_column": [ 219 | "diag_1", 220 | "diag_2", 221 | "diag_3" 222 | ], 223 | "fill_value": "0" 224 | }, 225 | "impute_parameters": { 226 | "column_type": "Numeric", 227 | "numeric_parameters": { 228 | "strategy": "Approximate Median" 229 | } 230 | } 231 | }, 232 | "inputs": [ 233 | { 234 | "name": "df", 235 | "node_id": "e589c34a-c6ec-4d8c-9549-87550098951a", 236 | "output_name": "default" 237 | } 238 | ], 239 | "outputs": [ 240 | { 241 | "name": "default" 242 | } 243 | ] 244 | }, 245 | { 246 | "node_id": "13fa0709-b2a5-4e92-9f72-eb247015018d", 247 | "type": "TRANSFORM", 248 | "operator": "sagemaker.spark.search_and_edit_0.1", 249 | "parameters": { 250 | "operator": "Find and replace substring", 251 | "find_and_replace_substring_parameters": { 252 | "input_column": [ 253 | "race" 254 | ], 255 | "pattern": "\\?", 256 | "replacement": "Unknown" 257 | } 258 | }, 259 | "inputs": [ 260 | { 261 | "name": "df", 262 | "node_id": "4ea28cf4-b062-494c-a49f-6bb840d9128b", 263 | "output_name": "default" 264 | } 265 | ], 266 | "outputs": [ 267 | { 268 | "name": "default" 269 | } 270 | ] 271 | }, 272 | { 273 | "node_id": "e440e602-6db8-478d-a99f-82cba34c3cf3", 274 | "type": "TRANSFORM", 275 | "operator": "sagemaker.spark.custom_code_0.1", 276 | "parameters": { 277 | "operator": "Python (User-Defined Function)", 278 | "udf_parameters": { 279 | "return_type": "string", 280 | "udf_mode": "Pandas", 281 | "input_col": "diag_1", 282 | "output_col": "diag_1", 283 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n #series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1) else ('circulatory' if int(x) in range(390, 460) or int(x) == 785) else ('test'))\n series = series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1 or int(x) == 0 ) \n else ('circulatory' if 
int(x) in range(390, 460) or int(x) == 785\n else ('respiratory' if int(float(x)) in range(460, 520) or int(float(x)) == 786\n else ('digestive' if int(float(x)) in range(520, 580) or int(float(x)) == 787\n else ('diabetes' if int(float(x)) == 250\n else ('injury' if int(float(x)) in range(800, 1000)\n else ('musculoskeletal' if int(float(x)) in range(710, 740)\n else ('genitourinary' if int(float(x)) in range(580, 630) or int(float(x)) == 788\n else ('neoplasms' if int(float(x)) in range(140, 240)\n else ('pregnecy' if int(float(x)) in range(630, 680)\n else 'other'))))))))))\n return series\n \n \"\"\" The following function is applied over batches of the input. The Series that it outputs must be the same length as the input Series.\n\n Example:\n\n def lowercase(series: pd.Series) -> pd.Series:\n return series.str.lower()\n \"\"\"" 284 | }, 285 | "pyspark_parameters": {}, 286 | "name": "diag-1" 287 | }, 288 | "inputs": [ 289 | { 290 | "name": "df", 291 | "node_id": "13fa0709-b2a5-4e92-9f72-eb247015018d", 292 | "output_name": "default" 293 | } 294 | ], 295 | "outputs": [ 296 | { 297 | "name": "default" 298 | } 299 | ] 300 | }, 301 | { 302 | "node_id": "6b5b607a-03b9-4133-8ac8-12b2540e28ab", 303 | "type": "TRANSFORM", 304 | "operator": "sagemaker.spark.custom_code_0.1", 305 | "parameters": { 306 | "operator": "Python (User-Defined Function)", 307 | "udf_parameters": { 308 | "return_type": "string", 309 | "udf_mode": "Pandas", 310 | "input_col": "diag_2", 311 | "output_col": "diag_2", 312 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n #series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1) else ('circulatory' if int(x) in range(390, 460) or int(x) == 785) else ('test'))\n series = series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1 or int(x) == 0 ) \n else ('circulatory' if int(x) in range(390, 460) or int(x) == 785\n else ('respiratory' if int(float(x)) in range(460, 520) or int(float(x)) == 786\n else ('digestive' if int(float(x)) in range(520, 580) or int(float(x)) == 787\n else ('diabetes' if int(float(x)) == 250\n else ('injury' if int(float(x)) in range(800, 1000)\n else ('musculoskeletal' if int(float(x)) in range(710, 740)\n else ('genitourinary' if int(float(x)) in range(580, 630) or int(float(x)) == 788\n else ('neoplasms' if int(float(x)) in range(140, 240)\n else ('pregnecy' if int(float(x)) in range(630, 680)\n else 'other'))))))))))\n return series\n" 313 | }, 314 | "pyspark_parameters": {}, 315 | "name": "diag-2" 316 | }, 317 | "inputs": [ 318 | { 319 | "name": "df", 320 | "node_id": "e440e602-6db8-478d-a99f-82cba34c3cf3", 321 | "output_name": "default" 322 | } 323 | ], 324 | "outputs": [ 325 | { 326 | "name": "default" 327 | } 328 | ] 329 | }, 330 | { 331 | "node_id": "5e825fe5-8545-4746-b27a-ea05971880cd", 332 | "type": "TRANSFORM", 333 | "operator": "sagemaker.spark.custom_code_0.1", 334 | "parameters": { 335 | "operator": "Python (User-Defined Function)", 336 | "udf_parameters": { 337 | "return_type": "string", 338 | "udf_mode": "Pandas", 339 | "input_col": "diag_3", 340 | "output_col": "diag_3", 341 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n #series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1) else ('circulatory' if int(x) in range(390, 
460) or int(x) == 785) else ('test'))\n series = series.apply(lambda x : 'other' if (str(x).find('V') != -1 or str(x).find('E') != -1 or int(x) == 0 ) \n else ('circulatory' if int(x) in range(390, 460) or int(x) == 785\n else ('respiratory' if int(float(x)) in range(460, 520) or int(float(x)) == 786\n else ('digestive' if int(float(x)) in range(520, 580) or int(float(x)) == 787\n else ('diabetes' if int(float(x)) == 250\n else ('injury' if int(float(x)) in range(800, 1000)\n else ('musculoskeletal' if int(float(x)) in range(710, 740)\n else ('genitourinary' if int(float(x)) in range(580, 630) or int(float(x)) == 788\n else ('neoplasms' if int(float(x)) in range(140, 240)\n else ('pregnecy' if int(float(x)) in range(630, 680)\n else 'other'))))))))))\n return series\n" 342 | }, 343 | "pyspark_parameters": {}, 344 | "name": "diag-3" 345 | }, 346 | "inputs": [ 347 | { 348 | "name": "df", 349 | "node_id": "6b5b607a-03b9-4133-8ac8-12b2540e28ab", 350 | "output_name": "default" 351 | } 352 | ], 353 | "outputs": [ 354 | { 355 | "name": "default" 356 | } 357 | ] 358 | }, 359 | { 360 | "node_id": "c413d501-02e3-4817-b10a-0ac6faf7d41a", 361 | "type": "TRANSFORM", 362 | "operator": "sagemaker.spark.custom_code_0.1", 363 | "parameters": { 364 | "operator": "Python (User-Defined Function)", 365 | "udf_parameters": { 366 | "return_type": "string", 367 | "udf_mode": "Pandas", 368 | "input_col": "admission_type_id", 369 | "output_col": "admission_type_id", 370 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n series = series.apply(lambda x : 'Unknown' if (int(x) in [5,6,8] ) else ('Emergency' if int(x) == 1 else ('Urgent' if int(x) == 2 else ('Elective' if int(x) == 3 else ('Newborn' if int(x) == 4 else ('TraumaCenter'))))))\n return series\n" 371 | }, 372 | "pyspark_parameters": {}, 373 | "name": "admission-type-id" 374 | }, 375 | "inputs": [ 376 | { 377 | "name": "df", 378 | "node_id": "5e825fe5-8545-4746-b27a-ea05971880cd", 379 | "output_name": "default" 380 | } 381 | ], 382 | "outputs": [ 383 | { 384 | "name": "default" 385 | } 386 | ] 387 | }, 388 | { 389 | "node_id": "5c7bf83d-6a99-4b0e-b3e2-a6a0d3d30f05", 390 | "type": "TRANSFORM", 391 | "operator": "sagemaker.spark.custom_code_0.1", 392 | "parameters": { 393 | "operator": "Python (User-Defined Function)", 394 | "udf_parameters": { 395 | "return_type": "string", 396 | "udf_mode": "Pandas", 397 | "input_col": "discharge_disposition_id", 398 | "output_col": "discharge_disposition_id", 399 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n series = series.apply(lambda x : 'Discharged' if (int(x) in [1,2,3,4,5,6,8,10,15,16,17,22,23,24,27,28,29,30]) else ('LeftAMA' if int(x) == 7 else ('InPatient' if int(x) == 9 else ('OutPatient' if int(x) == 12 else ('Expired' if int(x) in [11,19,20,21] else ('Hospice' if int(x) in [13,14] else ('Unknown')))))))\n\n return series" 400 | }, 401 | "pyspark_parameters": {}, 402 | "name": "discharge-disposition-id" 403 | }, 404 | "inputs": [ 405 | { 406 | "name": "df", 407 | "node_id": "c413d501-02e3-4817-b10a-0ac6faf7d41a", 408 | "output_name": "default" 409 | } 410 | ], 411 | "outputs": [ 412 | { 413 | "name": "default" 414 | } 415 | ] 416 | }, 417 | { 418 | "node_id": "b6d7bd2b-611c-4969-80e5-a2c9dfea4d78", 419 | "type": "TRANSFORM", 420 | "operator": "sagemaker.spark.custom_code_0.1", 421 | 
"parameters": { 422 | "operator": "Python (User-Defined Function)", 423 | "udf_parameters": { 424 | "return_type": "string", 425 | "udf_mode": "Pandas", 426 | "input_col": "admission_source_id", 427 | "output_col": "admission_source_id", 428 | "pandas_code": "import pandas as pd\n# Add imports for sklearn, numpy, or any other packages you might need.\n\ndef custom_func(series: pd.Series) -> pd.Series:\n series = series.apply(lambda x : 'Referral' if (int(x) in [1,2,3]) else ('Transfer' if int(x) in [4,5,6,10,18,19,22,25,26] else ('Emergency' if int(x) == 7 else ('Court' if int(x) == 8 else ('Unknown' if int(x) in [9,15,17,20,21] else ('NormalDelivery' if int(x) == 11 else ('AbnormalDelivery' if int(x) in [12,13,14] else ('BornInside' if int(x) == 23 else ('BornOutside')))))))))\n return series" 429 | }, 430 | "pyspark_parameters": {}, 431 | "name": "admission-source-id" 432 | }, 433 | "inputs": [ 434 | { 435 | "name": "df", 436 | "node_id": "5c7bf83d-6a99-4b0e-b3e2-a6a0d3d30f05", 437 | "output_name": "default" 438 | } 439 | ], 440 | "outputs": [ 441 | { 442 | "name": "default" 443 | } 444 | ] 445 | }, 446 | { 447 | "node_id": "48aab63d-84d2-4eb4-aed7-23210ebc3ab9", 448 | "type": "TRANSFORM", 449 | "operator": "sagemaker.spark.manage_rows_0.1", 450 | "parameters": { 451 | "operator": "Drop duplicates", 452 | "drop_duplicates_parameters": {}, 453 | "sort_parameters": { 454 | "order": "Ascending" 455 | } 456 | }, 457 | "inputs": [ 458 | { 459 | "name": "df", 460 | "node_id": "b6d7bd2b-611c-4969-80e5-a2c9dfea4d78", 461 | "output_name": "default" 462 | } 463 | ], 464 | "outputs": [ 465 | { 466 | "name": "default" 467 | } 468 | ] 469 | }, 470 | { 471 | "node_id": "b31d277f-ecf4-48bc-bddd-fc19e8b30254", 472 | "type": "TRANSFORM", 473 | "operator": "sagemaker.spark.balance_data_0.1", 474 | "parameters": { 475 | "operator": "SMOTE", 476 | "ratio": 1, 477 | "smote_params": { 478 | "num_neighbors": 10 479 | }, 480 | "target_column": "readmitted" 481 | }, 482 | "inputs": [ 483 | { 484 | "name": "df", 485 | "node_id": "48aab63d-84d2-4eb4-aed7-23210ebc3ab9", 486 | "output_name": "default" 487 | } 488 | ], 489 | "outputs": [ 490 | { 491 | "name": "default" 492 | } 493 | ] 494 | }, 495 | { 496 | "node_id": "d593101e-278b-4330-9779-b6e02fbeb99e", 497 | "type": "TRANSFORM", 498 | "operator": "sagemaker.spark.encode_categorical_0.1", 499 | "parameters": { 500 | "operator": "One-hot encode", 501 | "one_hot_encode_parameters": { 502 | "invalid_handling_strategy": "Keep", 503 | "drop_last": false, 504 | "output_style": "Columns", 505 | "input_column": [ 506 | "race", 507 | "gender", 508 | "age", 509 | "diag_1", 510 | "diag_2", 511 | "diag_3", 512 | "max_glu_serum", 513 | "A1Cresult", 514 | "metformin", 515 | "repaglinide", 516 | "pioglitazone", 517 | "rosiglitazone", 518 | "insulin", 519 | "change", 520 | "diabetesMed", 521 | "admission_type_id", 522 | "discharge_disposition_id", 523 | "admission_source_id" 524 | ] 525 | }, 526 | "ordinal_encode_parameters": { 527 | "invalid_handling_strategy": "Replace with NaN" 528 | } 529 | }, 530 | "inputs": [ 531 | { 532 | "name": "df", 533 | "node_id": "b31d277f-ecf4-48bc-bddd-fc19e8b30254", 534 | "output_name": "default" 535 | } 536 | ], 537 | "outputs": [ 538 | { 539 | "name": "default" 540 | } 541 | ] 542 | } 543 | ] 544 | } 545 | -------------------------------------------------------------------------------- /diabetes-project.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 
5 | "id": "671a914c", 6 | "metadata": {}, 7 | "source": [ 8 | "# Multi-model SageMaker Pipeline with Hyperparamater Tuning and Experiments" 9 | ] 10 | }, 11 | { 12 | "cell_type": "markdown", 13 | "id": "d697da1a", 14 | "metadata": {}, 15 | "source": [ 16 | "This notebook has been tested in a SageMaker notebook that is using a kernel with Python 3.7 installed, e.g. conda_mxnet_latest_p37, conda_python3." 17 | ] 18 | }, 19 | { 20 | "cell_type": "markdown", 21 | "id": "bd2ffcca", 22 | "metadata": {}, 23 | "source": [ 24 | "## Prepare the dataset collection" 25 | ] 26 | }, 27 | { 28 | "cell_type": "markdown", 29 | "id": "c125794f", 30 | "metadata": {}, 31 | "source": [ 32 | "We create an S3 bucket and with encryption enabled for additional security. \n", 33 | "\n", 34 | "#### If you are running this Notebook in us-east-1 region, don't use 'CreateBucketConfiguration' parameter with create_bucket(). us-east-1 is the default location." 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": null, 40 | "id": "6f7edb84", 41 | "metadata": {}, 42 | "outputs": [], 43 | "source": [ 44 | "import boto3\n", 45 | "\n", 46 | "AWS_ACCOUNT = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n", 47 | "AWS_REGION = boto3.Session().region_name" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "id": "ef0780a3", 54 | "metadata": {}, 55 | "outputs": [], 56 | "source": [ 57 | "import boto3\n", 58 | "\n", 59 | "AWS_ACCOUNT = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n", 60 | "AWS_REGION = boto3.Session().region_name\n", 61 | "PREFIX = \"sagemaker-diabetes\"\n", 62 | "BUCKET_NAME = \"{PREFIX}-{AWS_ACCOUNT}\".format(PREFIX=PREFIX,AWS_ACCOUNT=AWS_ACCOUNT)\n", 63 | "\n", 64 | "s3_client = boto3.client(\"s3\")\n", 65 | "location = {\"LocationConstraint\": AWS_REGION}\n", 66 | "\n", 67 | "# default location is us-east-1, so CreateBucketConfiguration is not needed\n", 68 | "s3_client.create_bucket(Bucket=BUCKET_NAME)\n", 69 | "\n", 70 | "# use this create_bucket statement for any AWS region other than us-east-1\n", 71 | "#s3_client.create_bucket(Bucket=BUCKET_NAME, CreateBucketConfiguration=location) \n", 72 | "\n", 73 | "s3_client.put_bucket_encryption(\n", 74 | " Bucket=BUCKET_NAME,\n", 75 | " ServerSideEncryptionConfiguration={\n", 76 | " \"Rules\": [\n", 77 | " {\n", 78 | " \"ApplyServerSideEncryptionByDefault\": {\"SSEAlgorithm\": \"AES256\"},\n", 79 | " },\n", 80 | " ]\n", 81 | " },\n", 82 | ")" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "id": "a28a1f0e", 88 | "metadata": {}, 89 | "source": [ 90 | "Download UCI dataset and copy to S3 bucket. " 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": null, 96 | "id": "57356c93", 97 | "metadata": { 98 | "scrolled": true 99 | }, 100 | "outputs": [], 101 | "source": [ 102 | "%%sh\n", 103 | "\n", 104 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 105 | "BUCKET_NAME=\"sagemaker-diabetes-${AWS_ACCOUNT}\"\n", 106 | "\n", 107 | "wget https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip\n", 108 | "unzip dataset_diabetes.zip\n", 109 | "aws s3 cp dataset_diabetes/diabetic_data.csv s3://${BUCKET_NAME}/\n", 110 | " " 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "id": "61768499", 116 | "metadata": {}, 117 | "source": [ 118 | "Update diabetes.flow to use your AWS account ID. 
" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": null, 124 | "id": "ee47dce2", 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "%%sh\n", 129 | "\n", 130 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 131 | "sed -i \"s/AWS_ACCOUNT/${AWS_ACCOUNT}/g\" sagemaker-pipeline/diabetes.flow" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "id": "50020dd6", 137 | "metadata": {}, 138 | "source": [ 139 | "Next, Create IAM Role for ML workflow steps" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": null, 145 | "id": "c1ff2f77", 146 | "metadata": {}, 147 | "outputs": [], 148 | "source": [ 149 | "import json\n", 150 | "\n", 151 | "iam_client = boto3.client(\"iam\")\n", 152 | "\n", 153 | "sagemaker_assume_role_policy_document = json.dumps(\n", 154 | " {\n", 155 | " \"Version\": \"2012-10-17\",\n", 156 | " \"Statement\": [\n", 157 | " {\n", 158 | " \"Effect\": \"Allow\",\n", 159 | " \"Principal\": {\"Service\": \"sagemaker.amazonaws.com\"},\n", 160 | " \"Action\": \"sts:AssumeRole\",\n", 161 | " }\n", 162 | " ],\n", 163 | " }\n", 164 | ")\n", 165 | "\n", 166 | "response_role = iam_client.create_role(\n", 167 | " RoleName=\"AmazonSageMakerServiceCatalogProductsUseRole-diabetes\",\n", 168 | " AssumeRolePolicyDocument=sagemaker_assume_role_policy_document,\n", 169 | ")\n", 170 | "\n", 171 | "\n", 172 | "iam_client.attach_role_policy(\n", 173 | " RoleName=response_role[\"Role\"][\"RoleName\"],\n", 174 | " PolicyArn='arn:aws:iam::aws:policy/AmazonSageMakerFullAccess'\n", 175 | ")\n" 176 | ] 177 | }, 178 | { 179 | "cell_type": "markdown", 180 | "id": "6f41e95e", 181 | "metadata": {}, 182 | "source": [ 183 | "## Prepare the Decision Tree custom Docker image" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "id": "6c642455", 189 | "metadata": {}, 190 | "source": [ 191 | "We make a Docker image containing a custom algorithm using [Scikit-learn Decision Tree Classifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html). Note that the Docker image has been modified to support hyperparameter tuning and validation data. \n", 192 | "\n" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": null, 198 | "id": "70f0000a", 199 | "metadata": {}, 200 | "outputs": [], 201 | "source": [ 202 | "! sudo yum install docker -y" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "id": "3aec9c87", 209 | "metadata": {}, 210 | "outputs": [], 211 | "source": [ 212 | "%%sh\n", 213 | "\n", 214 | "# The name of our algorithm\n", 215 | "ALGORITHM_NAME=\"diabetes-decision-trees\"\n", 216 | "\n", 217 | "cd container\n", 218 | "\n", 219 | "chmod +x decision_trees/train\n", 220 | "chmod +x decision_trees/serve\n", 221 | "\n", 222 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 223 | "AWS_REGION=$(aws configure get region)\n", 224 | "\n", 225 | "IMAGE_FULLNAME=\"${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ALGORITHM_NAME}:latest\"\n", 226 | "\n", 227 | "# If the repository doesn't exist in ECR, create it.\n", 228 | "aws ecr describe-repositories --repository-names \"${ALGORITHM_NAME}\" > /dev/null 2>&1\n", 229 | "\n", 230 | "if [ $? 
-ne 0 ]\n", 231 | "then\n", 232 | " aws ecr create-repository --repository-name \"${ALGORITHM_NAME}\" > /dev/null\n", 233 | "fi\n", 234 | "\n", 235 | "# Get the login command from ECR and execute it directly\n", 236 | "aws ecr get-login-password --region ${AWS_REGION}|docker login --username AWS --password-stdin ${IMAGE_FULLNAME}\n", 237 | "\n", 238 | "# Build the docker image locally with the image name and then push it to ECR with the full name.\n", 239 | "# Ensure your notebook IAM role has required permission for pushing image to ECR\n", 240 | "\n", 241 | "docker build -t ${ALGORITHM_NAME} .\n", 242 | "docker tag ${ALGORITHM_NAME} ${IMAGE_FULLNAME}\n", 243 | "docker push ${IMAGE_FULLNAME}\n" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "id": "11b15c71", 249 | "metadata": {}, 250 | "source": [ 251 | "Once Docker image is pushed to ECR repository, we make the image accessible from SageMaker. " 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": null, 257 | "id": "f3e03c17", 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "%%sh\n", 262 | "\n", 263 | "# The name of our algorithm\n", 264 | "SM_IMAGE_NAME=diabetes-dtree\n", 265 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 266 | "\n", 267 | "# This assumes the role name is AmazonSageMakerServiceCatalogProductsUseRole-diabetes\n", 268 | "ROLE_ARN=\"arn:aws:iam::${AWS_ACCOUNT}:role/AmazonSageMakerServiceCatalogProductsUseRole-diabetes\"\n", 269 | "\n", 270 | "aws sagemaker create-image \\\n", 271 | " --image-name ${SM_IMAGE_NAME} \\\n", 272 | " --role-arn ${ROLE_ARN}\n", 273 | "\n", 274 | "aws sagemaker create-app-image-config \\\n", 275 | " --cli-input-json file://container/app-image-config-input.json\n" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": null, 281 | "id": "ad3a940d", 282 | "metadata": {}, 283 | "outputs": [], 284 | "source": [ 285 | "%%sh\n", 286 | "AWS_ACCOUNT=$(aws sts get-caller-identity --query Account --output text)\n", 287 | "ALGORITHM_NAME=diabetes-decision-trees\n", 288 | "AWS_REGION=$(aws configure get region)\n", 289 | "SM_IMAGE_NAME=diabetes-dtree\n", 290 | "SM_BASE_IMAGE=\"${AWS_ACCOUNT}.dkr.ecr.${AWS_REGION}.amazonaws.com/${ALGORITHM_NAME}:latest\"\n", 291 | "\n", 292 | "aws sagemaker create-image-version \\\n", 293 | " --image-name ${SM_IMAGE_NAME} \\\n", 294 | " --base-image ${SM_BASE_IMAGE}" 295 | ] 296 | }, 297 | { 298 | "cell_type": "markdown", 299 | "id": "9eb7dd2b", 300 | "metadata": {}, 301 | "source": [ 302 | "## Define and start the SageMaker pipeline" 303 | ] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "id": "4e0f7a38", 308 | "metadata": {}, 309 | "source": [ 310 | "Install the necessary Python library `awswrangler` for the SageMaker pipeline. " 311 | ] 312 | }, 313 | { 314 | "cell_type": "code", 315 | "execution_count": null, 316 | "id": "21ed4a32", 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "! pip3 install awswrangler" 321 | ] 322 | }, 323 | { 324 | "cell_type": "markdown", 325 | "id": "10abe34d", 326 | "metadata": {}, 327 | "source": [ 328 | "Import the necessary Python modules for the SageMaker pipeline. 
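One note on SDK versions: in newer releases of the SageMaker Python SDK, `JsonGet` lives in `sagemaker.workflow.functions` rather than `sagemaker.workflow.condition_step`; the commented-out import in the next cell reflects that alternative, so switch to it if the `condition_step` import fails in your environment.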
" 329 | ] 330 | }, 331 | { 332 | "cell_type": "code", 333 | "execution_count": null, 334 | "id": "faaac6eb", 335 | "metadata": {}, 336 | "outputs": [], 337 | "source": [ 338 | "import os\n", 339 | "import time\n", 340 | "import uuid\n", 341 | "import json\n", 342 | "import boto3\n", 343 | "import sagemaker\n", 344 | "import sagemaker.session\n", 345 | "from sagemaker.estimator import Estimator\n", 346 | "from sagemaker.inputs import TrainingInput\n", 347 | "from sagemaker.model_metrics import MetricsSource, ModelMetrics\n", 348 | "from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor\n", 349 | "from sagemaker.sklearn.processing import SKLearnProcessor\n", 350 | "from sagemaker.workflow.condition_step import ConditionStep, JsonGet\n", 351 | "#from sagemaker.workflow.functions import JsonGet\n", 352 | "from sagemaker.workflow.pipeline_context import PipelineSession\n", 353 | "from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo\n", 354 | "from sagemaker.workflow.parameters import ParameterInteger, ParameterString\n", 355 | "from sagemaker.workflow.pipeline import Pipeline\n", 356 | "from sagemaker.workflow.properties import PropertyFile\n", 357 | "from sagemaker.workflow.step_collections import RegisterModel\n", 358 | "from sagemaker.workflow.steps import ProcessingStep, TrainingStep, CacheConfig, TuningStep\n", 359 | "from sagemaker.processing import ProcessingInput, ProcessingOutput\n", 360 | "from sagemaker.processing import Processor\n", 361 | "from sagemaker.network import NetworkConfig\n", 362 | "from sagemaker.tuner import (\n", 363 | " ContinuousParameter,\n", 364 | " IntegerParameter,\n", 365 | " CategoricalParameter,\n", 366 | " HyperparameterTuner,\n", 367 | " WarmStartConfig,\n", 368 | " WarmStartTypes,\n", 369 | ")" 370 | ] 371 | }, 372 | { 373 | "cell_type": "markdown", 374 | "id": "a0667f2e", 375 | "metadata": {}, 376 | "source": [ 377 | "Create boto3 session and define pipeline step instance count and other configuration. " 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "id": "6a425a1e", 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "region = AWS_REGION\n", 388 | "default_bucket = BUCKET_NAME\n", 389 | "\n", 390 | "pipeline_session = PipelineSession()\n", 391 | "role = sagemaker.session.get_execution_role(pipeline_session)\n", 392 | "\n", 393 | "datawrangler_instance_count = 2\n", 394 | "datawrangler_instance_type = \"ml.m5.4xlarge\" \n", 395 | "processing_instance_count = 1\n", 396 | "processing_instance_type = \"ml.m5.2xlarge\"\n", 397 | "training_instance_count = 1\n", 398 | "training_instance_type = \"ml.m5.2xlarge\"\n", 399 | "model_approval_status = ParameterString(\n", 400 | " name=\"ModelApprovalStatus\",\n", 401 | " default_value=\"PendingManualApproval\", # ModelApprovalStatus can be set to a default of \"Approved\" if you don't want manual approval.\n", 402 | ")\n", 403 | "input_data = ParameterString(\n", 404 | " name=\"InputDataUrl\",\n", 405 | " default_value=f\"\", # Change this to point to the s3 location of your raw input data.\n", 406 | ")\n" 407 | ] 408 | }, 409 | { 410 | "cell_type": "markdown", 411 | "id": "db038a2b", 412 | "metadata": {}, 413 | "source": [ 414 | "Define and create the `DataWranglerProcess` step in the SageMaker pipeline. This step uses an existing Data Wrangler flow file `diabetes.flow` that has the following transformations:\n", 415 | "* Move column readmitted to the beginning. 
This column is to be predicted in the classification problem. \n", 416 | "* Convert readmitted column value to 0 if it is NO and 1 if it is <30 or >30.\n", 417 | "* Drop the columns that have minimal to zero prediction power based on Data Wrangler Data Quality and Insights Report, e.g. payer_code and encounter_id. \n", 418 | "* Group values into finite categories using Python custom transform in the following columns: diag_1, diag_2, diag_3, admission_type_id, admission_source_id, and discharge_disposition_id. \n", 419 | "* Fill missing values in columns diag_1, diag_2, diag_3 and replace strings in column race.\n", 420 | "* Drop duplicates, balance data using SMOTE, and one-hot encode the following columns: race, gender, age, diag_1, diag_2, diag_3, max_glu_serum, A1Cresult, metformin, repaglinide, pioglitazone, rosiglitazone, insulin, change, diabetesMed, admission_type_id, discharge_disposition_id, admission_source_id. \n", 421 | "\n", 422 | "To use your own transformations, replace `output_name` and `flow_file_name`." 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "id": "fb83bcb5", 429 | "metadata": {}, 430 | "outputs": [], 431 | "source": [ 432 | "print(f\"Data Wrangler export storage bucket: {default_bucket}\")\n", 433 | "\n", 434 | "# unique flow export ID\n", 435 | "flow_export_id = f\"{time.strftime('%d-%H-%M-%S', time.gmtime())}-{str(uuid.uuid4())[:8]}\"\n", 436 | "flow_export_name = f\"flow-{flow_export_id}\"\n", 437 | "\n", 438 | "output_name = \"d593101e-278b-4330-9779-b6e02fbeb99e.default\"\n", 439 | "\n", 440 | "s3_output_prefix = f\"export-{flow_export_name}/output\"\n", 441 | "s3_output_path = f\"s3://{default_bucket}/{s3_output_prefix}\"\n", 442 | "print(f\"Flow S3 export result path: {s3_output_path}\")\n", 443 | "\n", 444 | "processing_job_output = ProcessingOutput(\n", 445 | " output_name=output_name,\n", 446 | " source=\"/opt/ml/processing/output\",\n", 447 | " destination=s3_output_path,\n", 448 | " s3_upload_mode=\"EndOfJob\",\n", 449 | ")\n", 450 | "\n", 451 | "# name of the flow file which should exist in the current notebook working directory\n", 452 | "flow_file_name = \"sagemaker-pipeline/diabetes.flow\"\n", 453 | "\n", 454 | "# Load .flow file from current notebook working directory\n", 455 | "#!echo \"Loading flow file from current notebook working directory: $PWD\"\n", 456 | "\n", 457 | "with open(flow_file_name) as f:\n", 458 | " flow = json.load(f)\n", 459 | "\n", 460 | "# Upload flow to S3\n", 461 | "s3_client = boto3.client(\"s3\")\n", 462 | "s3_client.upload_file(\n", 463 | " flow_file_name,\n", 464 | " default_bucket,\n", 465 | " f\"data_wrangler_flows/{flow_export_name}.flow\",\n", 466 | " ExtraArgs={\"ServerSideEncryption\": \"aws:kms\"},\n", 467 | ")\n", 468 | "\n", 469 | "flow_s3_uri = f\"s3://{default_bucket}/data_wrangler_flows/{flow_export_name}.flow\"\n", 470 | "\n", 471 | "print(f\"Data Wrangler flow {flow_file_name} uploaded to {flow_s3_uri}\")\n", 472 | "\n", 473 | "flow_input = ProcessingInput(\n", 474 | " source=flow_s3_uri,\n", 475 | " destination=\"/opt/ml/processing/flow\",\n", 476 | " input_name=\"flow\",\n", 477 | " s3_data_type=\"S3Prefix\",\n", 478 | " s3_input_mode=\"File\",\n", 479 | " s3_data_distribution_type=\"FullyReplicated\",\n", 480 | ")\n", 481 | "\n", 482 | "# IAM role for executing the processing job.\n", 483 | "iam_role = role\n", 484 | "\n", 485 | "# Unique processing job name. 
Give a unique name every time you re-execute processing jobs\n", 486 | "processing_job_name = f\"data-wrangler-flow-processing-{flow_export_id}\"\n", 487 | "\n", 488 | "# Size in GB of the EBS volume to use for storing data during processing\n", 489 | "volume_size_in_gb = 30\n", 490 | "\n", 491 | "# Content type for each output. Data Wrangler supports CSV as default and Parquet.\n", 492 | "output_content_type = \"CSV\"\n", 493 | "\n", 494 | "# Network Isolation mode; default is off\n", 495 | "enable_network_isolation = False\n", 496 | "\n", 497 | "# List of tags to be passed to the processing job\n", 498 | "user_tags = []\n", 499 | "\n", 500 | "# Output configuration used as processing job container arguments\n", 501 | "output_config = {output_name: {\"content_type\": output_content_type}}\n", 502 | "\n", 503 | "# KMS key for per object encryption; default is None\n", 504 | "kms_key = None\n", 505 | "\n", 506 | "# Data Wrangler Container URL.\n", 507 | "container_uri = sagemaker.image_uris.retrieve(\n", 508 | " framework=\"data-wrangler\", \n", 509 | " region=region, \n", 510 | " version=\"1.x\",\n", 511 | ")\n", 512 | "\n", 513 | "processor = Processor(\n", 514 | " role=iam_role,\n", 515 | " image_uri=container_uri,\n", 516 | " instance_count=datawrangler_instance_count,\n", 517 | " instance_type=datawrangler_instance_type,\n", 518 | " volume_size_in_gb=volume_size_in_gb,\n", 519 | " network_config=NetworkConfig(enable_network_isolation=enable_network_isolation),\n", 520 | " sagemaker_session=pipeline_session,\n", 521 | " output_kms_key=kms_key,\n", 522 | " tags=user_tags,\n", 523 | ")\n", 524 | "\n", 525 | "data_wrangler_step = ProcessingStep(\n", 526 | " name=\"DataWranglerProcess\",\n", 527 | " processor=processor,\n", 528 | " inputs=[flow_input],\n", 529 | " outputs=[processing_job_output],\n", 530 | " job_arguments=[f\"--output-config '{json.dumps(output_config)}'\"],\n", 531 | ")\n" 532 | ] 533 | }, 534 | { 535 | "cell_type": "markdown", 536 | "id": "db4599ea", 537 | "metadata": {}, 538 | "source": [ 539 | "Define and create the `Preprocess` step in the SageMaker pipeline. This step reads the transformed data from the DataWranglerProcess, randomizes, and splits the data into train (70%), validation (10%), and test data (20%). \n", 540 | "\n", 541 | "You can also put here other necessary transformations and pre-processing changes that are done outside of Data Wrangler. 
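\n",
"\n",
"For illustration, the 70/10/20 split inside `preprocess.py` might look like the following sketch (the actual script may differ; `transformed.csv` is a placeholder for the Data Wrangler output):\n",
"\n",
"```python\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# Illustrative only: shuffle, then cut at 70% and 80% to get a 70/10/20 split.\n",
"df = pd.read_csv(\"transformed.csv\")\n",
"train, validation, test = np.split(\n",
"    df.sample(frac=1.0, random_state=42),\n",
"    [int(0.7 * len(df)), int(0.8 * len(df))],\n",
")\n",
"```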
" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": null, 547 | "id": "bfb50e53", 548 | "metadata": {}, 549 | "outputs": [], 550 | "source": [ 551 | "base_job_prefix = \"sagemaker-diabetes\"\n", 552 | "\n", 553 | "sklearn_processor = SKLearnProcessor(\n", 554 | " framework_version=\"0.23-1\",\n", 555 | " instance_type=processing_instance_type,\n", 556 | " instance_count=processing_instance_count,\n", 557 | " base_job_name=f\"{base_job_prefix}/sklearn-diabetes-preprocess\", # choose any name\n", 558 | " sagemaker_session=pipeline_session,\n", 559 | " role=role\n", 560 | ")\n", 561 | "\n", 562 | "step_process = ProcessingStep(\n", 563 | " name=\"Preprocess\", # choose any name\n", 564 | " processor=sklearn_processor,\n", 565 | " inputs=[\n", 566 | " ProcessingInput(\n", 567 | " source=data_wrangler_step.properties.ProcessingOutputConfig.Outputs[\n", 568 | " output_name\n", 569 | " ].S3Output.S3Uri,\n", 570 | " destination=\"/opt/ml/processing/data/raw-data-dir\",\n", 571 | " )\n", 572 | " ],\n", 573 | " outputs=[\n", 574 | " ProcessingOutput(output_name=\"train\", source=\"/opt/ml/processing/train\"),\n", 575 | " ProcessingOutput(output_name=\"validation\", source=\"/opt/ml/processing/validation\"),\n", 576 | " ProcessingOutput(output_name=\"test\", source=\"/opt/ml/processing/test\"),\n", 577 | " ],\n", 578 | " code=\"sagemaker-pipeline/pipelines/diabetes/preprocess.py\",\n", 579 | " job_arguments=[\n", 580 | " \"--input-data\",\n", 581 | " data_wrangler_step.properties.ProcessingOutputConfig.Outputs[\n", 582 | " output_name\n", 583 | " ].S3Output.S3Uri,\n", 584 | " ],\n", 585 | ")\n" 586 | ] 587 | }, 588 | { 589 | "cell_type": "markdown", 590 | "id": "ade846a1", 591 | "metadata": {}, 592 | "source": [ 593 | "Define and create the `XGBHPTune` step in the SageMaker pipeline. This is a hyperparameter tuning job using SageMaker XGBoost algorithm." 
594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "id": "f1815db4", 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [ 603 | "# Training step for generating model artifacts\n", 604 | "model_path = f\"s3://{pipeline_session.default_bucket()}/{base_job_prefix}/diabetesTrain\"\n", 605 | "model_bucket_key = f\"{pipeline_session.default_bucket()}/{base_job_prefix}/diabetesTrain\"\n", 606 | "cache_config = CacheConfig(enable_caching=True, expire_after=\"30d\")\n", 607 | "\n", 608 | "xgb_image_uri = sagemaker.image_uris.retrieve(\n", 609 | " framework=\"xgboost\", # we are using the Sagemaker built in xgboost algorithm\n", 610 | " region=region,\n", 611 | " version=\"1.0-1\",\n", 612 | " py_version=\"py3\",\n", 613 | " instance_type=training_instance_type,\n", 614 | " image_scope=\"training\"\n", 615 | ")\n", 616 | "xgb_train = Estimator(\n", 617 | " image_uri=xgb_image_uri,\n", 618 | " instance_type=training_instance_type,\n", 619 | " instance_count=training_instance_count,\n", 620 | " output_path=model_path,\n", 621 | " base_job_name=f\"{base_job_prefix}/diabetes-xgb-train\",\n", 622 | " sagemaker_session=pipeline_session,\n", 623 | " role=role,\n", 624 | ")\n", 625 | "xgb_train.set_hyperparameters(\n", 626 | " num_round=50,\n", 627 | " objective=\"binary:logistic\", # we are using binary:logistic as the objective function for classification \n", 628 | ")\n", 629 | "\n", 630 | "xgb_train.set_hyperparameters(grow_policy=\"lossguide\")\n", 631 | "\n", 632 | "xgb_objective_metric_name = \"validation:auc\" # we are using AUC as a performance metric \n", 633 | "xgb_hyperparameter_ranges = {\n", 634 | " \"max_depth\": IntegerParameter(5, 10, scaling_type=\"Auto\"),\n", 635 | " \"min_child_weight\": IntegerParameter(5, 10, scaling_type=\"Auto\"),\n", 636 | " \"eta\": ContinuousParameter(0.1, 0.9, scaling_type=\"Auto\"),\n", 637 | " \"gamma\": IntegerParameter(4, 9, scaling_type=\"Auto\"),\n", 638 | " \"subsample\": ContinuousParameter(0.7, 0.9, scaling_type=\"Auto\"),\n", 639 | "}\n", 640 | "\n", 641 | "xgb_tuner_log = HyperparameterTuner(\n", 642 | " xgb_train,\n", 643 | " xgb_objective_metric_name,\n", 644 | " xgb_hyperparameter_ranges,\n", 645 | " max_jobs=5,\n", 646 | " max_parallel_jobs=5,\n", 647 | " strategy=\"Random\",\n", 648 | " objective_type=\"Maximize\",\n", 649 | ")\n", 650 | "\n", 651 | "xgb_step_tuning = TuningStep(\n", 652 | " name=\"XGBHPTune\",\n", 653 | " tuner=xgb_tuner_log,\n", 654 | " inputs={\n", 655 | " \"train\": TrainingInput(\n", 656 | " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", 657 | " \"train\"\n", 658 | " ].S3Output.S3Uri,\n", 659 | " content_type=\"text/csv\",\n", 660 | " ),\n", 661 | " \"validation\": TrainingInput(\n", 662 | " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", 663 | " \"validation\"\n", 664 | " ].S3Output.S3Uri,\n", 665 | " content_type=\"text/csv\",\n", 666 | " ),\n", 667 | " },\n", 668 | " cache_config=cache_config,\n", 669 | ")\n" 670 | ] 671 | }, 672 | { 673 | "cell_type": "markdown", 674 | "id": "d480e28f", 675 | "metadata": {}, 676 | "source": [ 677 | "Define and create the `DTreeHPTune` step in the SageMaker pipeline. This is a hyperparameter tuning job using Scikit-learn Decision Tree algorithm. Note that this is in a custom Docker image pushed to the repository in section ` Prepare the Decision Tree custom Docker image`. 
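\n",
"\n",
"Because this is a custom container, SageMaker scrapes the objective metric from the training logs: the metric definition below matches the regex `auc:(\\S+)`, so the container's train script is expected to print a line in that form. An illustrative sketch (the real logic lives in `container/decision_trees/train`):\n",
"\n",
"```python\n",
"# After scoring the model on the validation channel:\n",
"validation_auc = 0.81  # placeholder for the computed AUC-ROC\n",
"print(f\"auc:{validation_auc}\")  # parsed by the tuner's metric regex\n",
"```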
" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": null, 683 | "id": "a0abbb71", 684 | "metadata": {}, 685 | "outputs": [], 686 | "source": [ 687 | "dtree_image_uri = pipeline_session.sagemaker_client.describe_image_version(ImageName=\"diabetes-dtree\")[\"ContainerImage\"]\n", 688 | "\n", 689 | "dtree_train = Estimator(\n", 690 | " image_uri=dtree_image_uri,\n", 691 | " role=role,\n", 692 | " instance_count=1,\n", 693 | " instance_type=training_instance_type,\n", 694 | " base_job_name=f\"{base_job_prefix}/diabetes-dtree-train\",\n", 695 | " output_path=model_path,\n", 696 | " sagemaker_session=pipeline_session,\n", 697 | ")\n", 698 | "\n", 699 | "dtree_objective_metric_name = \"validation:auc\"\n", 700 | "dtree_metric_definitions = [{\"Name\": \"validation:auc\", \"Regex\": \"auc:(\\S+)\"}]\n", 701 | "\n", 702 | "dtree_hyperparameter_ranges = {\n", 703 | " \"max_depth\": IntegerParameter(5, 10, scaling_type=\"Linear\"),\n", 704 | " \"max_leaf_nodes\": IntegerParameter(2, 10, scaling_type=\"Linear\"),\n", 705 | "}\n", 706 | "\n", 707 | "dtree_tuner_log = HyperparameterTuner(\n", 708 | " dtree_train,\n", 709 | " dtree_objective_metric_name,\n", 710 | " dtree_hyperparameter_ranges,\n", 711 | " dtree_metric_definitions,\n", 712 | " max_jobs=5,\n", 713 | " max_parallel_jobs=5,\n", 714 | " strategy=\"Random\",\n", 715 | " objective_type=\"Maximize\",\n", 716 | ")\n", 717 | "\n", 718 | "dtree_step_tuning = TuningStep(\n", 719 | " name=\"DTreeHPTune\",\n", 720 | " tuner=dtree_tuner_log,\n", 721 | " inputs={\n", 722 | " \"training\": TrainingInput(\n", 723 | " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", 724 | " \"train\"\n", 725 | " ].S3Output.S3Uri,\n", 726 | " content_type=\"text/csv\",\n", 727 | " ),\n", 728 | " \"validation\": TrainingInput(\n", 729 | " s3_data=step_process.properties.ProcessingOutputConfig.Outputs[\n", 730 | " \"validation\"\n", 731 | " ].S3Output.S3Uri,\n", 732 | " content_type=\"text/csv\",\n", 733 | " ),\n", 734 | " },\n", 735 | " cache_config=cache_config,\n", 736 | ")\n" 737 | ] 738 | }, 739 | { 740 | "cell_type": "markdown", 741 | "id": "581bd6de", 742 | "metadata": {}, 743 | "source": [ 744 | "Define and create the `DtreeEval` step in the SageMaker pipeline. This uses `dtree_evaluate.py` to evaluate the performance of the generated model from `DTreeHPTune` step using test data. 
" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": null, 750 | "id": "3d08243a", 751 | "metadata": {}, 752 | "outputs": [], 753 | "source": [ 754 | "dtree_script_eval = ScriptProcessor(\n", 755 | " image_uri=dtree_image_uri,\n", 756 | " command=[\"python3\"],\n", 757 | " instance_type=processing_instance_type,\n", 758 | " instance_count=1,\n", 759 | " base_job_name=f\"{base_job_prefix}/script-dtree-eval\",\n", 760 | " sagemaker_session=pipeline_session,\n", 761 | " role=role,\n", 762 | ")\n", 763 | "\n", 764 | "dtree_evaluation_report = PropertyFile(\n", 765 | " name=\"EvaluationReportDTree\",\n", 766 | " output_name=\"dtree_evaluation\",\n", 767 | " path=\"dtree_evaluation.json\",\n", 768 | ")\n", 769 | "\n", 770 | "dtree_step_eval = ProcessingStep(\n", 771 | " name=\"DTreeEval\",\n", 772 | " processor=dtree_script_eval,\n", 773 | " inputs=[\n", 774 | " ProcessingInput(\n", 775 | " source=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),\n", 776 | " destination=\"/opt/ml/processing/model\",\n", 777 | " ),\n", 778 | " ProcessingInput(\n", 779 | " source=step_process.properties.ProcessingOutputConfig.Outputs[\n", 780 | " \"test\"\n", 781 | " ].S3Output.S3Uri,\n", 782 | " destination=\"/opt/ml/processing/test\",\n", 783 | " ),\n", 784 | " ],\n", 785 | " outputs=[\n", 786 | " ProcessingOutput(\n", 787 | " output_name=\"dtree_evaluation\", source=\"/opt/ml/processing/evaluation\"\n", 788 | " ),\n", 789 | " ],\n", 790 | " code=\"sagemaker-pipeline/pipelines/diabetes/dtree_evaluate.py\",\n", 791 | " property_files=[dtree_evaluation_report],\n", 792 | ")\n" 793 | ] 794 | }, 795 | { 796 | "cell_type": "markdown", 797 | "id": "85b7e5e0", 798 | "metadata": {}, 799 | "source": [ 800 | "Define and create the `XGBEval` step in the SageMaker pipeline. This uses `xgb_evaluate.py` to evaluate the performance of the generated model from `XGBHPTune` step using test data. 
" 801 | ] 802 | }, 803 | { 804 | "cell_type": "code", 805 | "execution_count": null, 806 | "id": "6271fd78", 807 | "metadata": {}, 808 | "outputs": [], 809 | "source": [ 810 | "xgb_script_eval = ScriptProcessor(\n", 811 | " image_uri=xgb_image_uri,\n", 812 | " command=[\"python3\"],\n", 813 | " instance_type=processing_instance_type,\n", 814 | " instance_count=1,\n", 815 | " base_job_name=f\"{base_job_prefix}/script-xgb-eval\",\n", 816 | " sagemaker_session=pipeline_session,\n", 817 | " role=role,\n", 818 | ")\n", 819 | "\n", 820 | "xgb_evaluation_report = PropertyFile(\n", 821 | " name=\"EvaluationReportXGBoost\",\n", 822 | " output_name=\"xgb_evaluation\",\n", 823 | " path=\"xgb_evaluation.json\",\n", 824 | ")\n", 825 | "\n", 826 | "xgb_step_eval = ProcessingStep(\n", 827 | " name=\"XGBEval\",\n", 828 | " processor=xgb_script_eval,\n", 829 | " inputs=[\n", 830 | " ProcessingInput(\n", 831 | " source=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),\n", 832 | " destination=\"/opt/ml/processing/model\",\n", 833 | " ),\n", 834 | " ProcessingInput(\n", 835 | " source=step_process.properties.ProcessingOutputConfig.Outputs[\n", 836 | " \"test\"\n", 837 | " ].S3Output.S3Uri,\n", 838 | " destination=\"/opt/ml/processing/test\",\n", 839 | " ),\n", 840 | " ],\n", 841 | " outputs=[\n", 842 | " ProcessingOutput(output_name=\"xgb_evaluation\", source=\"/opt/ml/processing/evaluation\"),\n", 843 | " ],\n", 844 | " code=\"sagemaker-pipeline/pipelines/diabetes/xgb_evaluate.py\",\n", 845 | " property_files=[xgb_evaluation_report],\n", 846 | ")\n" 847 | ] 848 | }, 849 | { 850 | "cell_type": "markdown", 851 | "id": "694e6b79", 852 | "metadata": {}, 853 | "source": [ 854 | "Retrieve the resulting AUC-ROC score from steps `DTreeEval` and `XGBEval` in the SageMaker pipeline." 855 | ] 856 | }, 857 | { 858 | "cell_type": "code", 859 | "execution_count": null, 860 | "id": "6f7decd9", 861 | "metadata": {}, 862 | "outputs": [], 863 | "source": [ 864 | "xgb_model_metrics = ModelMetrics(\n", 865 | " model_statistics=MetricsSource(\n", 866 | " s3_uri=\"{}/xgb_evaluation.json\".format(\n", 867 | " xgb_step_eval.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\"S3Uri\"]\n", 868 | " ),\n", 869 | " content_type=\"application/json\",\n", 870 | " )\n", 871 | ")\n", 872 | "\n", 873 | "dtree_model_metrics = ModelMetrics(\n", 874 | " model_statistics=MetricsSource(\n", 875 | " s3_uri=\"{}/dtree_evaluation.json\".format(\n", 876 | " dtree_step_eval.arguments[\"ProcessingOutputConfig\"][\"Outputs\"][0][\"S3Output\"][\n", 877 | " \"S3Uri\"\n", 878 | " ]\n", 879 | " ),\n", 880 | " content_type=\"application/json\",\n", 881 | " )\n", 882 | ")\n", 883 | "\n", 884 | "xgb_eval_metrics = JsonGet(\n", 885 | " #step_name=xgb_step_eval,\n", 886 | " step=xgb_step_eval,\n", 887 | " property_file=xgb_evaluation_report,\n", 888 | " json_path=\"classification_metrics.roc.value\", # This should follow the structure of your report_dict defined in the evaluate.py file.\n", 889 | ")\n", 890 | "\n", 891 | "dtree_eval_metrics = JsonGet(\n", 892 | " #step_name=dtree_step_eval,\n", 893 | " step=dtree_step_eval,\n", 894 | " property_file=dtree_evaluation_report,\n", 895 | " json_path=\"classification_metrics.roc.value\", # This should follow the structure of your report_dict defined in the evaluate.py file.\n", 896 | ")\n" 897 | ] 898 | }, 899 | { 900 | "cell_type": "markdown", 901 | "id": "d187a447", 902 | "metadata": {}, 903 | "source": [ 904 | "Define and create ` AccuracyCond`. 
Then create the `DTreeReg-RegisterModel` and `XGBReg-RegisterModel` steps. If the AUC-ROC score of the Scikit-learn Decision Tree model is greater than or equal to that of the SageMaker XGBoost model, the Decision Tree model is registered in the model registry; otherwise, the XGBoost model is registered. " 905 | ] 906 | }, 907 | { 908 | "cell_type": "code", 909 | "execution_count": null, 910 | "id": "0e4cd8c1", 911 | "metadata": {}, 912 | "outputs": [], 913 | "source": [ 914 | "model_package_group_name = \"sagemaker-diabetes\"\n", 915 | "# Register model step that will be conditionally executed\n", 916 | "dtree_step_register = RegisterModel(\n", 917 | " name=\"DTreeReg\",\n", 918 | " estimator=dtree_train,\n", 919 | " model_data=dtree_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),\n", 920 | " content_types=[\"text/csv\"],\n", 921 | " response_types=[\"text/csv\"],\n", 922 | " inference_instances=[\"ml.t2.medium\", \"ml.m5.large\"],\n", 923 | " transform_instances=[\"ml.m5.large\"],\n", 924 | " model_package_group_name=model_package_group_name,\n", 925 | " approval_status=model_approval_status,\n", 926 | " model_metrics=dtree_model_metrics,\n", 927 | ")\n", 928 | "\n", 929 | "# Register model step that will be conditionally executed\n", 930 | "xgb_step_register = RegisterModel(\n", 931 | " name=\"XGBReg\",\n", 932 | " estimator=xgb_train,\n", 933 | " model_data=xgb_step_tuning.get_top_model_s3_uri(top_k=0, s3_bucket=model_bucket_key),\n", 934 | " content_types=[\"text/csv\"],\n", 935 | " response_types=[\"text/csv\"],\n", 936 | " inference_instances=[\"ml.t2.medium\", \"ml.m5.large\"],\n", 937 | " transform_instances=[\"ml.m5.large\"],\n", 938 | " model_package_group_name=model_package_group_name,\n", 939 | " approval_status=model_approval_status,\n", 940 | " model_metrics=xgb_model_metrics,\n", 941 | ")\n", 942 | "\n", 943 | "# Condition step for evaluating model quality and branching execution\n", 944 | "cond_gte = ConditionGreaterThanOrEqualTo( # You can change the condition here\n", 945 | " left=JsonGet(\n", 946 | " #step_name=dtree_step_eval,\n", 947 | " step=dtree_step_eval,\n", 948 | " property_file=dtree_evaluation_report,\n", 949 | " json_path=\"classification_metrics.roc.value\", # This should follow the structure of your report_dict defined in the evaluate.py file.\n", 950 | " ),\n", 951 | " right=JsonGet(\n", 952 | " #step_name=xgb_step_eval,\n", 953 | " step=xgb_step_eval,\n", 954 | " property_file=xgb_evaluation_report,\n", 955 | " json_path=\"classification_metrics.roc.value\"\n", 956 | " ), \n", 957 | ")\n", 958 | "\n", 959 | "step_cond = ConditionStep(\n", 960 | " name=\"AccuracyCond\",\n", 961 | " conditions=[cond_gte],\n", 962 | " if_steps=[dtree_step_register],\n", 963 | " else_steps=[xgb_step_register],\n", 964 | ")\n", 965 | "\n" 966 | ] 967 | }, 968 | { 969 | "cell_type": "markdown", 970 | "id": "b9d1ace6", 971 | "metadata": {}, 972 | "source": [ 973 | "Define and start the SageMaker pipeline. You should be able to see the running SageMaker pipeline in SageMaker Studio. 
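\n",
"\n",
"You can also monitor the execution from the notebook after starting it in the cell below; a minimal sketch:\n",
"\n",
"```python\n",
"# Blocks until the execution finishes (long pipelines may need a larger\n",
"# delay/max_attempts), then lists the status of each step.\n",
"execution.wait()\n",
"print(execution.list_steps())\n",
"```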
" 974 | ] 975 | }, 976 | { 977 | "cell_type": "code", 978 | "execution_count": null, 979 | "id": "5cbfb086", 980 | "metadata": {}, 981 | "outputs": [], 982 | "source": [ 983 | "pipeline_name = \"sagemaker-diabetes\"\n", 984 | "\n", 985 | "pipeline = Pipeline(\n", 986 | " name=pipeline_name,\n", 987 | " parameters=[\n", 988 | " datawrangler_instance_type,\n", 989 | " datawrangler_instance_count, \n", 990 | " processing_instance_type,\n", 991 | " processing_instance_count,\n", 992 | " training_instance_type,\n", 993 | " training_instance_count,\n", 994 | " model_approval_status,\n", 995 | " input_data\n", 996 | " ],\n", 997 | " steps=[\n", 998 | " data_wrangler_step,\n", 999 | " step_process,\n", 1000 | " dtree_step_tuning,\n", 1001 | " xgb_step_tuning,\n", 1002 | " dtree_step_eval,\n", 1003 | " xgb_step_eval,\n", 1004 | " step_cond,\n", 1005 | " ],\n", 1006 | " sagemaker_session=pipeline_session,\n", 1007 | ")\n", 1008 | "\n", 1009 | "\n", 1010 | "pipeline.upsert(role_arn=role)\n", 1011 | "execution = pipeline.start()" 1012 | ] 1013 | }, 1014 | { 1015 | "cell_type": "markdown", 1016 | "id": "858e43e8", 1017 | "metadata": {}, 1018 | "source": [ 1019 | "## Approve top performing model in SageMaker model registry" 1020 | ] 1021 | }, 1022 | { 1023 | "cell_type": "markdown", 1024 | "id": "1ff84ce4", 1025 | "metadata": {}, 1026 | "source": [ 1027 | "After the SageMaker Pipeline is complete, new trained Model will be registered in Model Registry.\n", 1028 | "\n", 1029 | "1) Make sure to update your desired `MODEL_VERSION`. We assume we approve the model version 1. \n", 1030 | "\n", 1031 | "2) As EventBridge monitors Model Registry status changes, Model status change will trigger SageMaker Projects model-deploy pipeline." 1032 | ] 1033 | }, 1034 | { 1035 | "cell_type": "code", 1036 | "execution_count": null, 1037 | "id": "698e3477", 1038 | "metadata": {}, 1039 | "outputs": [], 1040 | "source": [ 1041 | "from sagemaker import get_execution_role, session\n", 1042 | "import boto3\n", 1043 | "\n", 1044 | "role = get_execution_role()\n", 1045 | "sm_client = boto3.client(\"sagemaker\")\n", 1046 | "\n", 1047 | "MODEL_VERSION = \"2\"\n", 1048 | "AWS_REGION = boto3.Session().region_name\n", 1049 | "MODEL_PACKAGE_ARN = \"arn:aws:sagemaker:{AWS_REGION}:{AWS_ACCOUNT}:model-package/sagemaker-diabetes/{MODEL_VERSION}\".format(\n", 1050 | " AWS_REGION=AWS_REGION,\n", 1051 | " AWS_ACCOUNT=AWS_ACCOUNT, \n", 1052 | " MODEL_VERSION=MODEL_VERSION\n", 1053 | ")\n", 1054 | "\n", 1055 | "\n", 1056 | "model_package_update_response = sm_client.update_model_package(\n", 1057 | " ModelPackageArn=MODEL_PACKAGE_ARN, ModelApprovalStatus=\"Approved\"\n", 1058 | ")" 1059 | ] 1060 | }, 1061 | { 1062 | "cell_type": "markdown", 1063 | "id": "6ce11a41", 1064 | "metadata": {}, 1065 | "source": [ 1066 | "## Deploy the SageMaker inference endpoint" 1067 | ] 1068 | }, 1069 | { 1070 | "cell_type": "markdown", 1071 | "id": "e4c895fc", 1072 | "metadata": {}, 1073 | "source": [ 1074 | "Import model into hosting. Register the model with hosting. This allows the flexibility of importing models trained elsewhere." 
1075 | ] 1076 | }, 1077 | { 1078 | "cell_type": "code", 1079 | "execution_count": null, 1080 | "id": "73d09101", 1081 | "metadata": {}, 1082 | "outputs": [], 1083 | "source": [ 1084 | "from time import gmtime, strftime\n", 1085 | "\n", 1086 | "model_name = \"diabetes-modelregistry-model-\" + strftime(\"%Y-%m-%d-%H-%M-%S\", gmtime())\n", 1087 | "print(\"Model name : {}\".format(model_name))\n", 1088 | "container_list = [{\"ModelPackageName\": MODEL_PACKAGE_ARN}]\n", 1089 | "\n", 1090 | "create_model_response = sm_client.create_model(\n", 1091 | " ModelName=model_name, ExecutionRoleArn=role, Containers=container_list\n", 1092 | ")\n", 1093 | "print(\"Model arn : {}\".format(create_model_response[\"ModelArn\"]))" 1094 | ] 1095 | }, 1096 | { 1097 | "cell_type": "markdown", 1098 | "id": "4a7623ed", 1099 | "metadata": {}, 1100 | "source": [ 1101 | "Create the endpoint configuration. SageMaker supports configuring REST endpoints that host multiple models, e.g. for A/B testing purposes. To support this, you create an endpoint configuration that describes the distribution of traffic across the models, whether split, shadowed, or sampled in some way. In addition, the endpoint configuration describes the instance type required for model deployment." 1102 | ] 1103 | }, 1104 | { 1105 | "cell_type": "code", 1106 | "execution_count": null, 1107 | "id": "40838588", 1108 | "metadata": {}, 1109 | "outputs": [], 1110 | "source": [ 1111 | "endpoint_config_name = \"diabetes-modelregistry-EndpointConfig-\" + strftime(\n", 1112 | " \"%Y-%m-%d-%H-%M-%S\", gmtime()\n", 1113 | ")\n", 1114 | "print(endpoint_config_name)\n", 1115 | "create_endpoint_config_response = sm_client.create_endpoint_config(\n", 1116 | " EndpointConfigName=endpoint_config_name,\n", 1117 | " ProductionVariants=[\n", 1118 | " {\n", 1119 | " \"InstanceType\": \"ml.m5.large\",\n", 1120 | " \"InitialVariantWeight\": 1,\n", 1121 | " \"InitialInstanceCount\": 1,\n", 1122 | " \"ModelName\": model_name,\n", 1123 | " \"VariantName\": \"AllTraffic\",\n", 1124 | " }\n", 1125 | " ],\n", 1126 | ")" 1127 | ] 1128 | }, 1129 | { 1130 | "cell_type": "markdown", 1131 | "id": "c0e6759f", 1132 | "metadata": {}, 1133 | "source": [ 1134 | "Create the endpoint. Lastly, create the endpoint that serves the model by specifying the name and the configuration defined above. The end result is an endpoint that can be validated and incorporated into your applications. This takes 9-11 minutes to complete."
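,
"\n",
"\n",
"The next cell polls the endpoint status in a loop; alternatively, you can use the built-in boto3 waiter, as in this minimal sketch (assuming the endpoint name `diabetes-staging` created below):\n",
"\n",
"```python\n",
"import boto3\n",
"\n",
"sm_client = boto3.client(\"sagemaker\")\n",
"\n",
"# Blocks until the endpoint reaches InService, or raises if creation fails.\n",
"waiter = sm_client.get_waiter(\"endpoint_in_service\")\n",
"waiter.wait(EndpointName=\"diabetes-staging\")\n",
"```"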
1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": null, 1140 | "id": "ef5578c8", 1141 | "metadata": {}, 1142 | "outputs": [], 1143 | "source": [ 1144 | "endpoint_name = \"diabetes-staging\"\n", 1145 | "print(\"EndpointName={}\".format(endpoint_name))\n", 1146 | "\n", 1147 | "create_endpoint_response = sm_client.create_endpoint(\n", 1148 | " EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name\n", 1149 | ")\n", 1150 | "\n", 1151 | "\n", 1152 | "while True:\n", 1153 | " endpoint = sm_client.describe_endpoint(EndpointName=endpoint_name)\n", 1154 | " if endpoint[\"EndpointStatus\"] == \"InService\":\n", 1155 | " break\n", 1156 | " print(\"Waiting for the endpoint to be completed..\")\n", 1157 | " time.sleep(60)\n", 1158 | "\n", 1159 | "print(\"Endpoint arn : {}\".format(create_endpoint_response[\"EndpointArn\"]))" 1160 | ] 1161 | }, 1162 | { 1163 | "cell_type": "markdown", 1164 | "id": "1cd1356a", 1165 | "metadata": {}, 1166 | "source": [ 1167 | "## Run predictions on model" 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "markdown", 1172 | "id": "087b25ad", 1173 | "metadata": {}, 1174 | "source": [ 1175 | "Wait until SageMaker Projects model-deploy pipeline has deployed the staging inference endpoint. Use the following data for inference:\n", 1176 | "\n", 1177 | "Example 1\n", 1178 | "------------\n", 1179 | "`5.0,64.0,0.0,18.0,0.0,0.0,7.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0\n", 1180 | "`\n", 1181 | "\n", 1182 | "In summary, this is a diabetic patient that is Caucasian Female age 60-70, who has spent 5 days in the hospital under emergency care in the current encounter. Prior to this encounter, patient has spent 0 days in outpatient care, 0 days in emergency care, 7 days in inpatient care. 64 laboratory procedures have been performed on the patient. 
Patient is not using metformin, repaglinide, pioglitazone, rosiglitazone, and insulin prescription is steady.\n", 1183 | "\n" 1184 | ] 1185 | }, 1186 | { 1187 | "cell_type": "code", 1188 | "execution_count": null, 1189 | "id": "fa19de14", 1190 | "metadata": {}, 1191 | "outputs": [], 1192 | "source": [ 1193 | "import json\n", 1194 | "import boto3\n", 1195 | "\n", 1196 | "sm_runtime = boto3.client(\"runtime.sagemaker\")\n", 1197 | "endpoint_name =\"diabetes-staging\"\n", 1198 | "line = \"5.0,64.0,0.0,18.0,0.0,0.0,7.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0\"\n", 1199 | "response = sm_runtime.invoke_endpoint(EndpointName=endpoint_name, ContentType=\"text/csv\", Body=line)\n", 1200 | "result = json.loads(response[\"Body\"].read().decode())\n", 1201 | "print(\"Predicted class : {}\".format(round(result)))" 1202 | ] 1203 | }, 1204 | { 1205 | "cell_type": "markdown", 1206 | "id": "3b16d0f3", 1207 | "metadata": {}, 1208 | "source": [ 1209 | "Now you try:\n", 1210 | "\n", 1211 | "Example 2\n", 1212 | "------------\n", 1213 | "\n", 1214 | "`3.0,19.0,3.0,19.0,0.0,0.0,0.0,8.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0\n", 1215 | "`\n", 1216 | "\n", 1217 | "In summary, this is a diabetic patient that is Caucasian Female age 70-80, who has spent 3 days in the hospital under elective care in the current encounter. Prior to this encounter, patient has spent 0 days in outpatient care, 0 days in emergency care, 0 days in inpatient care. 19 laboratory procedures have been performed on the patient. Patient is not using metformin, repaglinide, pioglitazone, rosiglitazone. Patient is not using insulin. " 1218 | ] 1219 | }, 1220 | { 1221 | "cell_type": "markdown", 1222 | "id": "8fcb475b", 1223 | "metadata": {}, 1224 | "source": [ 1225 | "## Cleanup" 1226 | ] 1227 | }, 1228 | { 1229 | "cell_type": "markdown", 1230 | "id": "9787688e", 1231 | "metadata": {}, 1232 | "source": [ 1233 | "To avoid incurring future charges, clean up created resources such as the S3 bucket, ECR repository, and SageMaker Studio. Prior to deleting the SageMaker Studio, make sure to delete the SageMaker model and endpoint resources. \n", 1234 | "Finally, delete the Jupyter instance containing the notebook. 
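\n",
"\n",
"A minimal deletion sketch for the inference resources, assuming the `endpoint_name`, `endpoint_config_name`, and `model_name` values created above are still in scope:\n",
"\n",
"```python\n",
"import boto3\n",
"\n",
"sm_client = boto3.client(\"sagemaker\")\n",
"\n",
"# Delete the endpoint first, then its configuration and the model.\n",
"sm_client.delete_endpoint(EndpointName=endpoint_name)\n",
"sm_client.delete_endpoint_config(EndpointConfigName=endpoint_config_name)\n",
"sm_client.delete_model(ModelName=model_name)\n",
"```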
" 1235 | ] 1236 | } 1237 | ], 1238 | "metadata": { 1239 | "instance_type": "ml.t3.medium", 1240 | "kernelspec": { 1241 | "display_name": "conda_python3", 1242 | "language": "python", 1243 | "name": "conda_python3" 1244 | }, 1245 | "language_info": { 1246 | "codemirror_mode": { 1247 | "name": "ipython", 1248 | "version": 3 1249 | }, 1250 | "file_extension": ".py", 1251 | "mimetype": "text/x-python", 1252 | "name": "python", 1253 | "nbconvert_exporter": "python", 1254 | "pygments_lexer": "ipython3", 1255 | "version": "3.8.12" 1256 | } 1257 | }, 1258 | "nbformat": 4, 1259 | "nbformat_minor": 5 1260 | } 1261 | --------------------------------------------------------------------------------