├── modules ├── sagemaker │ ├── output.tf │ ├── variables.tf │ ├── template │ │ └── sagemaker_instance_init.sh │ └── main.tf ├── s3 │ ├── outputs.tf │ ├── variables.tf │ └── main.tf └── iam │ ├── variables.tf │ ├── outputs.tf │ └── main.tf ├── main ├── terraform_backend.tf.template ├── variables.tf ├── terraform.tfvars.template └── main.tf ├── .gitignore ├── LICENSE ├── README.md └── source ├── scripts └── scikit_learn_script.py └── notebooks └── Scikit-learn_Estimator_Example_With_Terraform.ipynb /modules/sagemaker/output.tf: -------------------------------------------------------------------------------- 1 | variable "bucket_name" {} 2 | -------------------------------------------------------------------------------- /modules/s3/outputs.tf: -------------------------------------------------------------------------------- 1 | output "bucket_name" { 2 | value = aws_s3_bucket.notebook.id 3 | } 4 | 5 | -------------------------------------------------------------------------------- /modules/s3/variables.tf: -------------------------------------------------------------------------------- 1 | variable "notebook_bucket_name" {} 2 | variable "sagemaker_bucket_name" {} 3 | -------------------------------------------------------------------------------- /modules/sagemaker/variables.tf: -------------------------------------------------------------------------------- 1 | variable "sagemaker_notebook_name" {} 2 | variable "aws_iam_role" {} 3 | -------------------------------------------------------------------------------- /modules/iam/variables.tf: -------------------------------------------------------------------------------- 1 | variable "aws_region" {} 2 | variable "iam_name" {} 3 | variable "identifier" {} 4 | data "aws_caller_identity" "current" {} 5 | -------------------------------------------------------------------------------- /main/terraform_backend.tf.template: -------------------------------------------------------------------------------- 1 | terraform { 2 | required_version = "0.12.6" 3 | backend "s3" { 4 | bucket = "" 5 | key = "sagemaker-sample/terraform.tfstate" 6 | region = "" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /main/variables.tf: -------------------------------------------------------------------------------- 1 | variable "aws_region" {} 2 | variable "aws_profile" {} 3 | variable "iam_name" {} 4 | variable "identifier" {} 5 | variable "notebook_bucket_name" {} 6 | variable "sagemaker_bucket_name" {} 7 | variable "sagemaker_notebook_name" {} 8 | -------------------------------------------------------------------------------- /modules/iam/outputs.tf: -------------------------------------------------------------------------------- 1 | output "iam_role_arn" { 2 | value = aws_iam_role.default.arn 3 | } 4 | 5 | output "iam_role_name" { 6 | value = aws_iam_role.default.name 7 | } 8 | 9 | output "policy_attachment_id" { 10 | value = aws_iam_role_policy_attachment.default.id 11 | } 12 | 13 | -------------------------------------------------------------------------------- /main/terraform.tfvars.template: -------------------------------------------------------------------------------- 1 | aws_region = "" 2 | aws_profile = "" 3 | iam_name = "" 4 | identifier = "sagemaker.amazonaws.com" 5 | notebook_bucket_name = "" 6 | sagemaker_bucket_name = "" 7 | sagemaker_notebook_name = "" 8 | -------------------------------------------------------------------------------- /.gitignore: 
-------------------------------------------------------------------------------- 1 | # See https://help.github.com/articles/ignoring-files for more about ignoring files. 2 | # 3 | # If you find yourself ignoring temporary files generated by your text editor 4 | # or operating system, you probably want to add a global ignore instead: 5 | # git config --global core.excludesfile '~/.gitignore_global' 6 | 7 | # Ignore build files. 8 | */.terraform 9 | */.terraform/* 10 | */terraform_backend.tf 11 | */terraform.tfvars 12 | -------------------------------------------------------------------------------- /modules/sagemaker/template/sagemaker_instance_init.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | 4 | cd /home/ec2-user/SageMaker 5 | aws s3 cp s3://${bucket_name}/sagemaker/sample/notebooks/Scikit-learn_Estimator_Example_With_Terraform.ipynb . 6 | aws s3 cp s3://${bucket_name}/sagemaker/sample/scripts/scikit_learn_script.py . 7 | 8 | ENVIRONMENT=python3 9 | NOTEBOOK_FILE=/home/ec2-user/SageMaker/Scikit-learn_Estimator_Example_With_Terraform.ipynb 10 | 11 | source /home/ec2-user/anaconda3/bin/activate "$ENVIRONMENT" 12 | nohup jupyter nbconvert "$NOTEBOOK_FILE" --ExecutePreprocessor.kernel_name=python3 --ExecutePreprocessor.timeout=1500 --execute 13 | source /home/ec2-user/anaconda3/bin/deactivate 14 | -------------------------------------------------------------------------------- /main/main.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = var.aws_region 3 | profile = var.aws_profile 4 | version = "2.23.0" 5 | } 6 | 7 | module "iam" { 8 | source = "../modules/iam" 9 | aws_region = var.aws_region 10 | 11 | iam_name = var.iam_name 12 | identifier = var.identifier 13 | } 14 | 15 | module "s3" { 16 | source = "../modules/s3" 17 | 18 | notebook_bucket_name = var.notebook_bucket_name 19 | sagemaker_bucket_name = var.sagemaker_bucket_name 20 | } 21 | 22 | module "sagemaker" { 23 | source = "../modules/sagemaker" 24 | 25 | sagemaker_notebook_name = var.sagemaker_notebook_name 26 | aws_iam_role = "${module.iam.iam_role_arn}" 27 | bucket_name = "${module.s3.bucket_name}" 28 | } 29 | 30 | -------------------------------------------------------------------------------- /modules/sagemaker/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_sagemaker_notebook_instance" "default" { 2 | name = var.sagemaker_notebook_name 3 | role_arn = var.aws_iam_role 4 | instance_type = "ml.t2.medium" 5 | lifecycle_config_name = aws_sagemaker_notebook_instance_lifecycle_configuration.default.name 6 | } 7 | 8 | data "template_file" "instance_init" { 9 | template = "${file("${path.module}/template/sagemaker_instance_init.sh")}" 10 | 11 | vars = { 12 | bucket_name = "${var.bucket_name}" 13 | } 14 | } 15 | 16 | resource "aws_sagemaker_notebook_instance_lifecycle_configuration" "default" { 17 | name = var.sagemaker_notebook_name 18 | on_start = "${base64encode(data.template_file.instance_init.rendered)}" 19 | } 20 | 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Yuya Sugano 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without 
restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Machine Learning Infrastructure with Terraform 2 | 3 | This example shows how to set up an end-to-end demo architecture for predicting house prices on the Boston Housing dataset with machine learning, using `Amazon SageMaker` and `Terraform`. 4 | 5 | ## Terraform version 6 | 7 | Ensure your `Terraform` version is as follows (other `Terraform` versions may require some modifications): 8 | ```sh 9 | $ cd main 10 | $ terraform --version 11 | Terraform v0.12.6 12 | + provider.aws v2.23.0 13 | + provider.template v2.1.2 14 | ``` 15 | To download `Terraform`, visit https://releases.hashicorp.com/terraform/ 16 | 17 | ## Setup steps 18 | 19 | From the `main` folder: 20 | 1. Copy `terraform_backend.tf.template` to `terraform_backend.tf` and modify the values accordingly (a filled-in example is shown at the end of this README). You need to manually create an S3 bucket, or use an existing one, to store the Terraform state file. 21 | 2. Copy `terraform.tfvars.template` to `terraform.tfvars` and modify the input variables accordingly. You don't need to create the buckets specified here; `terraform apply` will create them. 22 | 3. Run the following: 23 | ```sh 24 | export AWS_PROFILE= 25 | 26 | terraform init 27 | terraform validate 28 | terraform plan -var-file=terraform.tfvars 29 | terraform apply -var-file=terraform.tfvars 30 | ``` 31 | 32 | ## Clean up 33 | 34 | ``` 35 | terraform plan -destroy -var-file=terraform.tfvars 36 | terraform destroy -var-file=terraform.tfvars 37 | ``` 38 | 39 | ## License 40 | 41 | This library is licensed under the MIT License (see the LICENSE file).
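## Example configuration

For reference, a `terraform_backend.tf` created from the template might look like the following. The bucket name and region are illustrative placeholders; substitute an existing S3 bucket of your own and the region you work in.

```
terraform {
  required_version = "0.12.6"
  backend "s3" {
    bucket = "my-terraform-state-bucket"
    key    = "sagemaker-sample/terraform.tfstate"
    region = "ap-northeast-1"
  }
}
```

A matching `terraform.tfvars` might look like this (again, all names are placeholders, and S3 bucket names must be globally unique; the two data buckets are created by `terraform apply`):

```
aws_region              = "ap-northeast-1"
aws_profile             = "default"
iam_name                = "sagemaker-sample-role"
identifier              = "sagemaker.amazonaws.com"
notebook_bucket_name    = "my-sagemaker-notebook-bucket"
sagemaker_bucket_name   = "my-sagemaker-data-bucket"
sagemaker_notebook_name = "sagemaker-sample-notebook"
```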
42 | -------------------------------------------------------------------------------- /modules/s3/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_s3_bucket" "notebook" { 2 | bucket = var.notebook_bucket_name 3 | force_destroy = true 4 | acl = "private" 5 | 6 | server_side_encryption_configuration { 7 | rule { 8 | apply_server_side_encryption_by_default { 9 | sse_algorithm = "AES256" 10 | } 11 | } 12 | } 13 | } 14 | 15 | resource "aws_s3_bucket" "sagemaker" { 16 | bucket = var.sagemaker_bucket_name 17 | force_destroy = true 18 | acl = "private" 19 | 20 | server_side_encryption_configuration { 21 | rule { 22 | apply_server_side_encryption_by_default { 23 | sse_algorithm = "AES256" 24 | } 25 | } 26 | } 27 | } 28 | 29 | resource "aws_s3_bucket_object" "notebook" { 30 | bucket = aws_s3_bucket.notebook.id 31 | key = "sagemaker/sample/notebooks/Scikit-learn_Estimator_Example_With_Terraform.ipynb" 32 | source = "${path.module}/../../source/notebooks/Scikit-learn_Estimator_Example_With_Terraform.ipynb" 33 | 34 | # The filemd5() function is available in Terraform 0.11.12 and later 35 | # For Terraform 0.11.11 and earlier, use the md5() function and the file() function: 36 | # etag = "${md5(file("path/to/file"))}" 37 | etag = "${filemd5("${path.module}/../../source/notebooks/Scikit-learn_Estimator_Example_With_Terraform.ipynb")}" 38 | } 39 | 40 | resource "aws_s3_bucket_object" "script" { 41 | bucket = aws_s3_bucket.notebook.id 42 | key = "sagemaker/sample/scripts/scikit_learn_script.py" 43 | source = "${path.module}/../../source/scripts/scikit_learn_script.py" 44 | 45 | # The filemd5() function is available in Terraform 0.11.12 and later 46 | # For Terraform 0.11.11 and earlier, use the md5() function and the file() function: 47 | # etag = "${md5(file("path/to/file"))}" 48 | etag = "${filemd5("${path.module}/../../source/scripts/scikit_learn_script.py")}" 49 | } 50 | 51 | -------------------------------------------------------------------------------- /modules/iam/main.tf: -------------------------------------------------------------------------------- 1 | resource "aws_iam_role" "default" { 2 | name = var.iam_name 3 | assume_role_policy = data.aws_iam_policy_document.assume_role.json 4 | } 5 | 6 | data "aws_iam_policy_document" "assume_role" { 7 | statement { 8 | actions = ["sts:AssumeRole"] 9 | 10 | principals { 11 | type = "Service" 12 | identifiers = [var.identifier] 13 | } 14 | } 15 | } 16 | 17 | resource "aws_iam_role_policy_attachment" "default" { 18 | role = aws_iam_role.default.name 19 | policy_arn = aws_iam_policy.default.arn 20 | } 21 | 22 | resource "aws_iam_policy" "default" { 23 | name = var.iam_name 24 | path = "/" 25 | description = "Policy for the Notebook Instance to manage training jobs, models and endpoints" 26 | policy = data.aws_iam_policy_document.sagemaker_role_policy.json 27 | } 28 | 29 | data "aws_iam_policy_document" "sagemaker_role_policy" { 30 | statement { 31 | effect = "Allow" 32 | actions = [ 33 | "s3:CreateBucket", 34 | "s3:GetBucketLocation", 35 | "s3:ListBucket", 36 | "s3:ListAllMyBuckets", 37 | "s3:GetObject", 38 | "s3:PutObject", 39 | "s3:DeleteObject", 40 | "s3:GetBucketCors", 41 | "s3:PutBucketCors" 42 | ] 43 | resources = [ 44 | "arn:aws:s3:::*" 45 | ] 46 | } 47 | 48 | statement { 49 | effect = "Allow" 50 | actions = [ 51 | "sagemaker:CreateTrainingJob", 52 | "sagemaker:DescribeTrainingJob", 53 | "sagemaker:CreateModel", 54 | "sagemaker:DescribeModel", 55 | "sagemaker:DeleteModel", 56 | 
"sagemaker:CreateEndpoint", 57 | "sagemaker:CreateEndpointConfig", 58 | "sagemaker:DescribeEndpoint", 59 | "sagemaker:DescribeEndpointConfig", 60 | "sagemaker:DeleteEndpoint" 61 | ] 62 | resources = [ 63 | "arn:aws:sagemaker:${var.aws_region}:${data.aws_caller_identity.current.account_id}:*" 64 | ] 65 | } 66 | 67 | statement { 68 | effect = "Allow" 69 | actions = [ 70 | "ecr:GetDownloadUrlForLayer", 71 | "ecr:BatchGetImage", 72 | "ecr:BatchCheckLayerAvailability" 73 | ] 74 | resources = [ 75 | "arn:aws:ecr:${var.aws_region}:${data.aws_caller_identity.current.account_id}:repository/*" 76 | ] 77 | } 78 | 79 | statement { 80 | effect = "Allow" 81 | actions = [ 82 | "ec2:CreateVpcEndpoint", 83 | "ec2:DescribeRouteTables" 84 | ] 85 | resources = [ 86 | "*" 87 | ] 88 | } 89 | 90 | statement { 91 | effect = "Allow" 92 | actions = [ 93 | "cloudwatch:PutMetricData", 94 | "cloudwatch:GetMetricData", 95 | "cloudwatch:GetMetricStatistics", 96 | "cloudwatch:ListMetrics" 97 | ] 98 | resources = [ 99 | "arn:aws:cloudwatch:${var.aws_region}:${data.aws_caller_identity.current.account_id}:*" 100 | ] 101 | } 102 | 103 | statement { 104 | effect = "Allow" 105 | actions = [ 106 | "logs:CreateLogGroup", 107 | "logs:CreateLogStream", 108 | "logs:DescribeLogStreams", 109 | "logs:GetLogEvents", 110 | "logs:PutLogEvents" 111 | ] 112 | resources = [ 113 | "arn:aws:logs:${var.aws_region}:${data.aws_caller_identity.current.account_id}:log-group:/aws/sagemaker/*" 114 | ] 115 | } 116 | 117 | statement { 118 | effect = "Allow" 119 | actions = ["iam:PassRole"] 120 | resources = [ 121 | "${aws_iam_role.default.arn}" 122 | ] 123 | condition { 124 | test = "StringEquals" 125 | variable = "iam:PassedToService" 126 | values = ["sagemaker.amazonaws.com"] 127 | } 128 | } 129 | 130 | statement { 131 | effect = "Allow" 132 | actions = ["iam:GetRole"] 133 | resources = [ 134 | "${aws_iam_role.default.arn}" 135 | ] 136 | } 137 | } 138 | 139 | -------------------------------------------------------------------------------- /source/scripts/scikit_learn_script.py: -------------------------------------------------------------------------------- 1 | ''' 2 | DERIVED FROM:https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/sklearn/README.rst 3 | Preparing the Scikit-learn training script 4 | Your Scikit-learn training script must be a Python 2.7 or 3.5 compatible source file. 5 | The training script is very similar to a training script you might run outside of SageMaker, 6 | but you can access useful properties about the training environment through various environment variables, 7 | such as 8 | - SM_MODEL_DIR: 9 | A string representing the path to the directory to write model artifacts to. 10 | These artifacts are uploaded to S3 for model hosting. 11 | - SM_OUTPUT_DATA_DIR: 12 | A string representing the filesystem path to write output artifacts to. 13 | Output artifacts may include checkpoints, graphs, and other files to save, 14 | not including model artifacts. These artifacts are compressed and uploaded 15 | to S3 to the same S3 prefix as the model artifacts. 16 | Supposing two input channels, 'train' and 'test', 17 | were used in the call to the Scikit-learn estimator's fit() method, 18 | the following will be set, following the format "SM_CHANNEL_[channel_name]": 19 | - SM_CHANNEL_TRAIN: 20 | A string representing the path to the directory containing data in the 'train' channel 21 | - SM_CHANNEL_TEST: 22 | Same as above, but for the 'test' channel. 
23 | A typical training script loads data from the input channels, 24 | configures training with hyperparameters, trains a model, 25 | and saves the model to model_dir so that it can be hosted later. 26 | Hyperparameters are passed to your script as arguments and can 27 | be retrieved with an argparse.ArgumentParser instance. 28 | For example, a training script might start with an if __name__ == '__main__': block that parses its arguments, as the script below does. 29 | Because SageMaker imports your training script, 30 | you should put your training code in a main guard (if __name__ == '__main__':) 31 | if you are using the same script to host your model, 32 | so that SageMaker does not inadvertently run your training code at the wrong point in execution. 33 | For more on training environment variables, please visit https://github.com/aws/sagemaker-containers. 34 | ''' 35 | 36 | import argparse 37 | import pandas as pd 38 | import os 39 | 40 | # GradientBoosting Regressor 41 | from sklearn.ensemble import GradientBoostingRegressor 42 | from sklearn.externals import joblib 43 | 44 | # Pipeline and StandardScaler 45 | from sklearn.preprocessing import StandardScaler 46 | from sklearn.pipeline import Pipeline 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser() 50 | 51 | # Hyperparameters are described here. In this simple example we include two hyperparameters. 52 | parser.add_argument('--learning_rate', type=float, default=0.1) 53 | parser.add_argument('--n_estimators', type=int, default=100) 54 | 55 | # SageMaker-specific arguments. Defaults are taken from the environment variables. 56 | parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) 57 | parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) 58 | parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN']) 59 | 60 | args = parser.parse_args() 61 | 62 | # Take the set of input files and read them all into a single pandas dataframe 63 | input_files = [os.path.join(args.train, file) for file in os.listdir(args.train)] 64 | if len(input_files) == 0: 65 | raise ValueError(('There are no files in {}.\n' + 66 | 'This usually indicates that the channel ({}) was incorrectly specified,\n' + 67 | 'the data specification in S3 was incorrectly specified or the role specified\n' + 68 | 'does not have permission to access the data.').format(args.train, "train")) 69 | raw_data = [pd.read_csv(file, header=None, engine="python") for file in input_files] 70 | train_data = pd.concat(raw_data) 71 | 72 | # Labels are in the last column; features are in the preceding columns 73 | train_y = train_data.iloc[:,-1] 74 | train_X = train_data.iloc[:,0:-1] 75 | 76 | # Read the hyperparameters passed to the job 77 | learning_rate = args.learning_rate 78 | n_estimators = args.n_estimators 79 | 80 | # Use scikit-learn's gradient boosting regressor to train the model.
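    # Note: hyperparameters arrive through the SKLearn estimator's
    # `hyperparameters` argument. The accompanying notebook passes none
    # (its training log shows "hyperparameters": {}), so the argparse
    # defaults above are used.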
81 | clf = GradientBoostingRegressor(learning_rate=learning_rate, n_estimators=n_estimators) 82 | clf = clf.fit(train_X, train_y) 83 | print(clf) 84 | 85 | # Persist the trained model so that it can be hosted later 86 | joblib.dump(clf, os.path.join(args.model_dir, "model.joblib")) 87 | 88 | def model_fn(model_dir): 89 | """Deserialize and return the fitted model 90 | 91 | Note that the file name here must match the one used when serializing the model in the main guard 92 | """ 93 | clf = joblib.load(os.path.join(model_dir, "model.joblib")) 94 | return clf 95 | -------------------------------------------------------------------------------- /source/notebooks/Scikit-learn_Estimator_Example_With_Terraform.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "data": { 10 | "text/html": [ 11 | "
\n", 12 | "\n", 25 | "\n", 26 | " \n", 27 | " \n", 28 | " \n", 29 | " \n", 30 | " \n", 31 | " \n", 32 | " \n", 33 | " \n", 34 | " \n", 35 | " \n", 36 | " \n", 37 | " \n", 38 | " \n", 39 | " \n", 40 | " \n", 41 | " \n", 42 | " \n", 43 | " \n", 44 | " \n", 45 | " \n", 46 | " \n", 47 | " \n", 48 | " \n", 49 | " \n", 50 | " \n", 51 | " \n", 52 | " \n", 53 | " \n", 54 | " \n", 55 | " \n", 56 | " \n", 57 | " \n", 58 | " \n", 59 | " \n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | " \n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | "
CRIMZNINDUSCHASNOXRMAGEDISRADTAXPTRATIOBLSTAT
count506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000506.000000
mean3.61352411.36363611.1367790.0691700.5546956.28463468.5749013.7950439.549407408.23715418.455534356.67403212.653063
std8.60154523.3224536.8603530.2539940.1158780.70261728.1488612.1057108.707259168.5371162.16494691.2948647.141062
min0.0063200.0000000.4600000.0000000.3850003.5610002.9000001.1296001.000000187.00000012.6000000.3200001.730000
25%0.0820450.0000005.1900000.0000000.4490005.88550045.0250002.1001754.000000279.00000017.400000375.3775006.950000
50%0.2565100.0000009.6900000.0000000.5380006.20850077.5000003.2074505.000000330.00000019.050000391.44000011.360000
75%3.67708312.50000018.1000000.0000000.6240006.62350094.0750005.18842524.000000666.00000020.200000396.22500016.955000
max88.976200100.00000027.7400001.0000000.8710008.780000100.00000012.12650024.000000711.00000022.000000396.90000037.970000
\n", 175 | "
" 176 | ], 177 | "text/plain": [ 178 | " CRIM ZN INDUS CHAS NOX RM \\\n", 179 | "count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 \n", 180 | "mean 3.613524 11.363636 11.136779 0.069170 0.554695 6.284634 \n", 181 | "std 8.601545 23.322453 6.860353 0.253994 0.115878 0.702617 \n", 182 | "min 0.006320 0.000000 0.460000 0.000000 0.385000 3.561000 \n", 183 | "25% 0.082045 0.000000 5.190000 0.000000 0.449000 5.885500 \n", 184 | "50% 0.256510 0.000000 9.690000 0.000000 0.538000 6.208500 \n", 185 | "75% 3.677083 12.500000 18.100000 0.000000 0.624000 6.623500 \n", 186 | "max 88.976200 100.000000 27.740000 1.000000 0.871000 8.780000 \n", 187 | "\n", 188 | " AGE DIS RAD TAX PTRATIO B \\\n", 189 | "count 506.000000 506.000000 506.000000 506.000000 506.000000 506.000000 \n", 190 | "mean 68.574901 3.795043 9.549407 408.237154 18.455534 356.674032 \n", 191 | "std 28.148861 2.105710 8.707259 168.537116 2.164946 91.294864 \n", 192 | "min 2.900000 1.129600 1.000000 187.000000 12.600000 0.320000 \n", 193 | "25% 45.025000 2.100175 4.000000 279.000000 17.400000 375.377500 \n", 194 | "50% 77.500000 3.207450 5.000000 330.000000 19.050000 391.440000 \n", 195 | "75% 94.075000 5.188425 24.000000 666.000000 20.200000 396.225000 \n", 196 | "max 100.000000 12.126500 24.000000 711.000000 22.000000 396.900000 \n", 197 | "\n", 198 | " LSTAT \n", 199 | "count 506.000000 \n", 200 | "mean 12.653063 \n", 201 | "std 7.141062 \n", 202 | "min 1.730000 \n", 203 | "25% 6.950000 \n", 204 | "50% 11.360000 \n", 205 | "75% 16.955000 \n", 206 | "max 37.970000 " 207 | ] 208 | }, 209 | "execution_count": 5, 210 | "metadata": {}, 211 | "output_type": "execute_result" 212 | } 213 | ], 214 | "source": [ 215 | "# Reading boston housing dataset\n", 216 | "from sklearn.datasets import load_boston\n", 217 | "from sklearn.model_selection import train_test_split\n", 218 | "import pandas as pd\n", 219 | "\n", 220 | "boston = load_boston()\n", 221 | "df = pd.DataFrame(boston.data, columns=boston.feature_names)\n", 222 | "df.describe() # describe dataset overview" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 7, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "# save files as csv\n", 232 | "import os\n", 233 | "\n", 234 | "WORK_DIRECTORY='data'\n", 235 | "os.makedirs('{}'.format(WORK_DIRECTORY), exist_ok=True)\n", 236 | "df.to_csv('{}/boston_housing.csv'.format(WORK_DIRECTORY), header=False, index=False)" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 12, 242 | "metadata": {}, 243 | "outputs": [ 244 | { 245 | "name": "stdout", 246 | "output_type": "stream", 247 | "text": [ 248 | "Execution role is arn:aws:iam::251344623468:role/service-role/AmazonSageMaker-ExecutionRole-20191017T203175\n", 249 | "Success - the MySageMakerInstance is in the ap-northeast-1.\n" 250 | ] 251 | } 252 | ], 253 | "source": [ 254 | "# S3 prefix\n", 255 | "bucket = 'sagemaker-bucket-sample-test'\n", 256 | "prefix = 'sagemaker/sample'\n", 257 | "\n", 258 | "# Import libraries\n", 259 | "from sagemaker import get_execution_role\n", 260 | "import boto3, sys, os\n", 261 | "import sagemaker\n", 262 | "\n", 263 | "sagemaker_session = sagemaker.Session()\n", 264 | "\n", 265 | "# Get a SageMaker-compatible role used by this Notebook Instance.\n", 266 | "role = get_execution_role()\n", 267 | "my_region = boto3.session.Session().region_name # set the region of the instance\n", 268 | "print(\"Execution role is \" + role)\n", 269 | "print(\"Success - the MySageMakerInstance is in the \" 
+ my_region + \".\")" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 13, 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "name": "stdout", 279 | "output_type": "stream", 280 | "text": [ 281 | "S3 error: An error occurred (IllegalLocationConstraintException) when calling the CreateBucket operation: The unspecified location constraint is incompatible for the region specific endpoint this request was sent to.\n" 282 | ] 283 | } 284 | ], 285 | "source": [ 286 | "s3 = boto3.resource('s3')\n", 287 | "\n", 288 | "try:\n", 289 | " if my_region == 'ap-northeast-1':\n", 290 | " s3.create_bucket(Bucket=bucket)\n", 291 | " else:\n", 292 | " s3.create_bucket(Bucket=bucket, CreateBucketConfiguration={'LocationConstraint': my_region})\n", 293 | " print('S3 bucket created successfully')\n", 294 | "except Exception as e:\n", 295 | " print('S3 error: ', e)" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 14, 301 | "metadata": {}, 302 | "outputs": [ 303 | { 304 | "name": "stdout", 305 | "output_type": "stream", 306 | "text": [ 307 | "Uploaded training data location: s3://sagemaker-getting-start-test/sagemaker/sample/data\n", 308 | "Training artifacts will be uploaded to: s3://sagemaker-getting-start-test/sagemaker/sample/output\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "# send data to S3.SageMaker will take training data from s3\n", 314 | "training_path = sagemaker_session.upload_data(path='{}/boston_housing.csv'.format(WORK_DIRECTORY), bucket=bucket, key_prefix=prefix)\n", 315 | "s3_train_data = 's3://{}/{}/{}'.format(bucket, prefix, WORK_DIRECTORY)\n", 316 | "print('Uploaded training data location: {}'.format(s3_train_data))\n", 317 | "\n", 318 | "output_location = 's3://{}/{}/output'.format(bucket, prefix)\n", 319 | "print('Training artifacts will be uploaded to: {}'.format(output_location))" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 15, 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "Estimator object: \n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "# We use the Estimator from the SageMaker Python SDK\n", 337 | "from sagemaker.sklearn.estimator import SKLearn\n", 338 | "\n", 339 | "script_path = 'scikit_learn_script.py'\n", 340 | "\n", 341 | "# Initialise SDK\n", 342 | "sklearn_estimator = SKLearn(\n", 343 | " entry_point=script_path,\n", 344 | " role = role,\n", 345 | " train_instance_type=\"ml.c4.xlarge\",\n", 346 | " sagemaker_session=sagemaker_session,\n", 347 | " output_path=output_location\n", 348 | ")\n", 349 | "\n", 350 | "print(\"Estimator object: {}\".format(sklearn_estimator))" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 16, 356 | "metadata": {}, 357 | "outputs": [ 358 | { 359 | "name": "stdout", 360 | "output_type": "stream", 361 | "text": [ 362 | "2020-01-13 06:19:35 Starting - Starting the training job...\n", 363 | "2020-01-13 06:19:37 Starting - Launching requested ML instances......\n", 364 | "2020-01-13 06:20:40 Starting - Preparing the instances for training...\n", 365 | "2020-01-13 06:21:18 Downloading - Downloading input data...\n", 366 | "2020-01-13 06:22:03 Training - Training image download completed. 
Training in progress.\n", 367 | "2020-01-13 06:22:03 Uploading - Uploading generated training model\u001b[34m2020-01-13 06:21:58,438 sagemaker-containers INFO Imported framework sagemaker_sklearn_container.training\u001b[0m\n", 368 | "\u001b[34m2020-01-13 06:21:58,440 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", 369 | "\u001b[34m2020-01-13 06:21:58,450 sagemaker_sklearn_container.training INFO Invoking user training script.\u001b[0m\n", 370 | "\u001b[34m2020-01-13 06:21:58,782 sagemaker-containers INFO Module scikit_learn_script does not provide a setup.py. \u001b[0m\n", 371 | "\u001b[34mGenerating setup.py\u001b[0m\n", 372 | "\u001b[34m2020-01-13 06:21:58,782 sagemaker-containers INFO Generating setup.cfg\u001b[0m\n", 373 | "\u001b[34m2020-01-13 06:21:58,782 sagemaker-containers INFO Generating MANIFEST.in\u001b[0m\n", 374 | "\u001b[34m2020-01-13 06:21:58,783 sagemaker-containers INFO Installing module with the following command:\u001b[0m\n", 375 | "\u001b[34m/miniconda3/bin/python -m pip install . \u001b[0m\n", 376 | "\u001b[34mProcessing /opt/ml/code\u001b[0m\n", 377 | "\u001b[34mBuilding wheels for collected packages: scikit-learn-script\n", 378 | " Building wheel for scikit-learn-script (setup.py): started\n", 379 | " Building wheel for scikit-learn-script (setup.py): finished with status 'done'\n", 380 | " Created wheel for scikit-learn-script: filename=scikit_learn_script-1.0.0-py2.py3-none-any.whl size=8295 sha256=07fb54998da9c4d696b6abc931d34235002b0285255da2131d0051e86cb4d9e9\n", 381 | " Stored in directory: /tmp/pip-ephem-wheel-cache-vhkmjzek/wheels/35/24/16/37574d11bf9bde50616c67372a334f94fa8356bc7164af8ca3\u001b[0m\n", 382 | "\u001b[34mSuccessfully built scikit-learn-script\u001b[0m\n", 383 | "\u001b[34mInstalling collected packages: scikit-learn-script\u001b[0m\n", 384 | "\u001b[34mSuccessfully installed scikit-learn-script-1.0.0\u001b[0m\n", 385 | "\u001b[34m2020-01-13 06:22:00,086 sagemaker-containers INFO No GPUs detected (normal if no gpus installed)\u001b[0m\n", 386 | "\u001b[34m2020-01-13 06:22:00,097 sagemaker-containers INFO Invoking user script\n", 387 | "\u001b[0m\n", 388 | "\u001b[34mTraining Env:\n", 389 | "\u001b[0m\n", 390 | "\u001b[34m{\n", 391 | " \"additional_framework_parameters\": {},\n", 392 | " \"channel_input_dirs\": {\n", 393 | " \"train\": \"/opt/ml/input/data/train\"\n", 394 | " },\n", 395 | " \"current_host\": \"algo-1\",\n", 396 | " \"framework_module\": \"sagemaker_sklearn_container.training:main\",\n", 397 | " \"hosts\": [\n", 398 | " \"algo-1\"\n", 399 | " ],\n", 400 | " \"hyperparameters\": {},\n", 401 | " \"input_config_dir\": \"/opt/ml/input/config\",\n", 402 | " \"input_data_config\": {\n", 403 | " \"train\": {\n", 404 | " \"TrainingInputMode\": \"File\",\n", 405 | " \"S3DistributionType\": \"FullyReplicated\",\n", 406 | " \"RecordWrapperType\": \"None\"\n", 407 | " }\n", 408 | " },\n", 409 | " \"input_dir\": \"/opt/ml/input\",\n", 410 | " \"is_master\": true,\n", 411 | " \"job_name\": \"sagemaker-scikit-learn-2020-01-13-06-19-35-519\",\n", 412 | " \"log_level\": 20,\n", 413 | " \"master_hostname\": \"algo-1\",\n", 414 | " \"model_dir\": \"/opt/ml/model\",\n", 415 | " \"module_dir\": \"s3://sagemaker-getting-start-test/sagemaker-scikit-learn-2020-01-13-06-19-35-519/source/sourcedir.tar.gz\",\n", 416 | " \"module_name\": \"scikit_learn_script\",\n", 417 | " \"network_interface_name\": \"eth0\",\n", 418 | " \"num_cpus\": 4,\n", 419 | " \"num_gpus\": 0,\n", 420 | " \"output_data_dir\": 
\"/opt/ml/output/data\",\n", 421 | " \"output_dir\": \"/opt/ml/output\",\n", 422 | " \"output_intermediate_dir\": \"/opt/ml/output/intermediate\",\n", 423 | " \"resource_config\": {\n", 424 | " \"current_host\": \"algo-1\",\n", 425 | " \"hosts\": [\n", 426 | " \"algo-1\"\n", 427 | " ],\n", 428 | " \"network_interface_name\": \"eth0\"\n", 429 | " },\n", 430 | " \"user_entry_point\": \"scikit_learn_script.py\"\u001b[0m\n", 431 | "\u001b[34m}\n", 432 | "\u001b[0m\n", 433 | "\u001b[34mEnvironment variables:\n", 434 | "\u001b[0m\n", 435 | "\u001b[34mSM_HOSTS=[\"algo-1\"]\u001b[0m\n", 436 | "\u001b[34mSM_NETWORK_INTERFACE_NAME=eth0\u001b[0m\n", 437 | "\u001b[34mSM_HPS={}\u001b[0m\n", 438 | "\u001b[34mSM_USER_ENTRY_POINT=scikit_learn_script.py\u001b[0m\n", 439 | "\u001b[34mSM_FRAMEWORK_PARAMS={}\u001b[0m\n", 440 | "\u001b[34mSM_RESOURCE_CONFIG={\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"}\u001b[0m\n", 441 | "\u001b[34mSM_INPUT_DATA_CONFIG={\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}}\u001b[0m\n", 442 | "\u001b[34mSM_OUTPUT_DATA_DIR=/opt/ml/output/data\u001b[0m\n", 443 | "\u001b[34mSM_CHANNELS=[\"train\"]\u001b[0m\n", 444 | "\u001b[34mSM_CURRENT_HOST=algo-1\u001b[0m\n", 445 | "\u001b[34mSM_MODULE_NAME=scikit_learn_script\u001b[0m\n", 446 | "\u001b[34mSM_LOG_LEVEL=20\u001b[0m\n", 447 | "\u001b[34mSM_FRAMEWORK_MODULE=sagemaker_sklearn_container.training:main\u001b[0m\n", 448 | "\u001b[34mSM_INPUT_DIR=/opt/ml/input\u001b[0m\n", 449 | "\u001b[34mSM_INPUT_CONFIG_DIR=/opt/ml/input/config\u001b[0m\n", 450 | "\u001b[34mSM_OUTPUT_DIR=/opt/ml/output\u001b[0m\n", 451 | "\u001b[34mSM_NUM_CPUS=4\u001b[0m\n", 452 | "\u001b[34mSM_NUM_GPUS=0\u001b[0m\n", 453 | "\u001b[34mSM_MODEL_DIR=/opt/ml/model\u001b[0m\n", 454 | "\u001b[34mSM_MODULE_DIR=s3://sagemaker-getting-start-test/sagemaker-scikit-learn-2020-01-13-06-19-35-519/source/sourcedir.tar.gz\u001b[0m\n", 455 | "\u001b[34mSM_TRAINING_ENV={\"additional_framework_parameters\":{},\"channel_input_dirs\":{\"train\":\"/opt/ml/input/data/train\"},\"current_host\":\"algo-1\",\"framework_module\":\"sagemaker_sklearn_container.training:main\",\"hosts\":[\"algo-1\"],\"hyperparameters\":{},\"input_config_dir\":\"/opt/ml/input/config\",\"input_data_config\":{\"train\":{\"RecordWrapperType\":\"None\",\"S3DistributionType\":\"FullyReplicated\",\"TrainingInputMode\":\"File\"}},\"input_dir\":\"/opt/ml/input\",\"is_master\":true,\"job_name\":\"sagemaker-scikit-learn-2020-01-13-06-19-35-519\",\"log_level\":20,\"master_hostname\":\"algo-1\",\"model_dir\":\"/opt/ml/model\",\"module_dir\":\"s3://sagemaker-getting-start-test/sagemaker-scikit-learn-2020-01-13-06-19-35-519/source/sourcedir.tar.gz\",\"module_name\":\"scikit_learn_script\",\"network_interface_name\":\"eth0\",\"num_cpus\":4,\"num_gpus\":0,\"output_data_dir\":\"/opt/ml/output/data\",\"output_dir\":\"/opt/ml/output\",\"output_intermediate_dir\":\"/opt/ml/output/intermediate\",\"resource_config\":{\"current_host\":\"algo-1\",\"hosts\":[\"algo-1\"],\"network_interface_name\":\"eth0\"},\"user_entry_point\":\"scikit_learn_script.py\"}\u001b[0m\n", 456 | "\u001b[34mSM_USER_ARGS=[]\u001b[0m\n", 457 | "\u001b[34mSM_OUTPUT_INTERMEDIATE_DIR=/opt/ml/output/intermediate\u001b[0m\n", 458 | "\u001b[34mSM_CHANNEL_TRAIN=/opt/ml/input/data/train\u001b[0m\n", 459 | 
"\u001b[34mPYTHONPATH=/miniconda3/bin:/miniconda3/lib/python37.zip:/miniconda3/lib/python3.7:/miniconda3/lib/python3.7/lib-dynload:/miniconda3/lib/python3.7/site-packages\n", 460 | "\u001b[0m\n", 461 | "\u001b[34mInvoking script with the following command:\n", 462 | "\u001b[0m\n", 463 | "\u001b[34m/miniconda3/bin/python -m scikit_learn_script\n", 464 | "\n", 465 | "\u001b[0m\n", 466 | "\u001b[34m/miniconda3/lib/python3.7/site-packages/sklearn/externals/joblib/externals/cloudpickle/cloudpickle.py:47: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses\n", 467 | " import imp\u001b[0m\n", 468 | "\u001b[34mGradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n", 469 | " learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n", 470 | " max_leaf_nodes=None, min_impurity_decrease=0.0,\n", 471 | " min_impurity_split=None, min_samples_leaf=1,\n", 472 | " min_samples_split=2, min_weight_fraction_leaf=0.0,\n", 473 | " n_estimators=100, n_iter_no_change=None, presort='auto',\n", 474 | " random_state=None, subsample=1.0, tol=0.0001,\n", 475 | " validation_fraction=0.1, verbose=0, warm_start=False)\u001b[0m\n", 476 | "\u001b[34m2020-01-13 06:22:01,491 sagemaker-containers INFO Reporting training SUCCESS\u001b[0m\n", 477 | "\n", 478 | "2020-01-13 06:22:10 Completed - Training job completed\n", 479 | "Training seconds: 52\n", 480 | "Billable seconds: 52\n" 481 | ] 482 | } 483 | ], 484 | "source": [ 485 | "# Run model training job\n", 486 | "sklearn_estimator.fit({'train': training_path})" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 17, 492 | "metadata": {}, 493 | "outputs": [ 494 | { 495 | "name": "stdout", 496 | "output_type": "stream", 497 | "text": [ 498 | "---------------------------------------------------------------------------!" 499 | ] 500 | } 501 | ], 502 | "source": [ 503 | "# Deploy an estimator and endpoint\n", 504 | "from sagemaker.predictor import csv_serializer, json_deserializer\n", 505 | "predictor = sklearn_estimator.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\", endpoint_name=\"sagemaker-terraform-test\")\n", 506 | "\n", 507 | "# Specify input and output formats.\n", 508 | "predictor.content_type = 'text/csv'\n", 509 | "predictor.serializer = csv_serializer\n", 510 | "predictor.deserializer = json_deserializer" 511 | ] 512 | }, 513 | { 514 | "cell_type": "code", 515 | "execution_count": 18, 516 | "metadata": {}, 517 | "outputs": [], 518 | "source": [ 519 | "# predictor.delete_endpoint()" 520 | ] 521 | }, 522 | { 523 | "cell_type": "code", 524 | "execution_count": null, 525 | "metadata": {}, 526 | "outputs": [], 527 | "source": [] 528 | } 529 | ], 530 | "metadata": { 531 | "kernelspec": { 532 | "display_name": "conda_python3", 533 | "language": "python", 534 | "name": "conda_python3" 535 | }, 536 | "language_info": { 537 | "codemirror_mode": { 538 | "name": "ipython", 539 | "version": 3 540 | }, 541 | "file_extension": ".py", 542 | "mimetype": "text/x-python", 543 | "name": "python", 544 | "nbconvert_exporter": "python", 545 | "pygments_lexer": "ipython3", 546 | "version": "3.6.5" 547 | } 548 | }, 549 | "nbformat": 4, 550 | "nbformat_minor": 2 551 | } 552 | --------------------------------------------------------------------------------