├── 50_Startups.csv ├── AWS_Train.ipynb ├── README.md ├── sagemaker_lambda.py └── startup_prediction.py /50_Startups.csv: -------------------------------------------------------------------------------- 1 | R&D Spend,Administration,Marketing Spend,State,Profit 2 | 165349.2,136897.8,471784.1,New York,192261.83 3 | 162597.7,151377.59,443898.53,California,191792.06 4 | 153441.51,101145.55,407934.54,Florida,191050.39 5 | 144372.41,118671.85,383199.62,New York,182901.99 6 | 142107.34,91391.77,366168.42,Florida,166187.94 7 | 131876.9,99814.71,362861.36,New York,156991.12 8 | 134615.46,147198.87,127716.82,California,156122.51 9 | 130298.13,145530.06,323876.68,Florida,155752.6 10 | 120542.52,148718.95,311613.29,New York,152211.77 11 | 123334.88,108679.17,304981.62,California,149759.96 12 | 101913.08,110594.11,229160.95,Florida,146121.95 13 | 100671.96,91790.61,249744.55,California,144259.4 14 | 93863.75,127320.38,249839.44,Florida,141585.52 15 | 91992.39,135495.07,252664.93,California,134307.35 16 | 119943.24,156547.42,256512.92,Florida,132602.65 17 | 114523.61,122616.84,261776.23,New York,129917.04 18 | 78013.11,121597.55,264346.06,California,126992.93 19 | 94657.16,145077.58,282574.31,New York,125370.37 20 | 91749.16,114175.79,294919.57,Florida,124266.9 21 | 86419.7,153514.11,0,New York,122776.86 22 | 76253.86,113867.3,298664.47,California,118474.03 23 | 78389.47,153773.43,299737.29,New York,111313.02 24 | 73994.56,122782.75,303319.26,Florida,110352.25 25 | 67532.53,105751.03,304768.73,Florida,108733.99 26 | 77044.01,99281.34,140574.81,New York,108552.04 27 | 64664.71,139553.16,137962.62,California,107404.34 28 | 75328.87,144135.98,134050.07,Florida,105733.54 29 | 72107.6,127864.55,353183.81,New York,105008.31 30 | 66051.52,182645.56,118148.2,Florida,103282.38 31 | 65605.48,153032.06,107138.38,New York,101004.64 32 | 61994.48,115641.28,91131.24,Florida,99937.59 33 | 61136.38,152701.92,88218.23,New York,97483.56 34 | 63408.86,129219.61,46085.25,California,97427.84 35 | 55493.95,103057.49,214634.81,Florida,96778.92 36 | 46426.07,157693.92,210797.67,California,96712.8 37 | 46014.02,85047.44,205517.64,New York,96479.51 38 | 28663.76,127056.21,201126.82,Florida,90708.19 39 | 44069.95,51283.14,197029.42,California,89949.14 40 | 20229.59,65947.93,185265.1,New York,81229.06 41 | 38558.51,82982.09,174999.3,California,81005.76 42 | 28754.33,118546.05,172795.67,California,78239.91 43 | 27892.92,84710.77,164470.71,Florida,77798.83 44 | 23640.93,96189.63,148001.11,California,71498.49 45 | 15505.73,127382.3,35534.17,New York,69758.98 46 | 22177.74,154806.14,28334.72,California,65200.33 47 | 1000.23,124153.04,1903.93,New York,64926.08 48 | 1315.46,115816.21,297114.46,Florida,49490.75 49 | 0,135426.92,0,California,42559.73 50 | 542.05,51743.15,0,New York,35673.41 51 | 0,116983.8,45173.06,California,14681.4 52 | -------------------------------------------------------------------------------- /AWS_Train.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import sagemaker\n", 10 | "from sagemaker import get_execution_role" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "sagemaker_session = sagemaker.Session()" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": null, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "# Get a SageMaker-compatible role used by this Notebook Instance.\n", 29 | "role = get_execution_role()" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "role" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## Upload the data for training \n" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "train_input = sagemaker_session.upload_data(\"data\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "train_input" 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "# Create SageMaker Scikit Estimator" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "from sagemaker.sklearn.estimator import SKLearn\n", 80 | "\n", 81 | "script_path = 'startup_prediction.py'\n", 82 | "\n", 83 | "sklearn = SKLearn(\n", 84 | " entry_point=script_path,\n", 85 | " instance_type=\"ml.m4.xlarge\",\n", 86 | " framework_version=\"0.20.0\",\n", 87 | " py_version=\"py3\",\n", 88 | " role=role,\n", 89 | " sagemaker_session=sagemaker_session)" 90 | ] 91 | }, 92 | { 93 | "cell_type": "markdown", 94 | "metadata": {}, 95 | "source": [ 96 | "# Train SKLearn Estimator on Startup data \n" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": {}, 103 | "outputs": [], 104 | "source": [ 105 | "sklearn.fit({'train': train_input})" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "# Deploy the model " 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": {}, 119 | "outputs": [], 120 | "source": [ 121 | "deployment = sklearn.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "deployment.endpoint" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "deployment.predict([[1,0,50000,25000,40000]])" 140 | ] 141 | } 142 | ], 143 | "metadata": { 144 | "kernelspec": { 145 | "display_name": "Python 3", 146 | "language": "python", 147 | "name": "python3" 148 | }, 149 | "language_info": { 150 | "codemirror_mode": { 151 | "name": "ipython", 152 | "version": 3 153 | }, 154 | "file_extension": ".py", 155 | "mimetype": "text/x-python", 156 | "name": "python", 157 | "nbconvert_exporter": "python", 158 | "pygments_lexer": "ipython3", 159 | "version": "3.7.4" 160 | } 161 | }, 162 | "nbformat": 4, 163 | "nbformat_minor": 4 164 | } 165 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Build & Deploy SciKit Learn Machine Learning Model with AWS Sagemaker and Integrate it to Lambda, API Gatway 2 | 3 | Amazon SageMaker is a fully-managed platform that enables developers and data scientists to quickly and easily build, train, and deploy machine learning (ML) models at any scale. Amazon SageMaker removes all the barriers that typically slow down developers who want to use machine learning. In this tech talk, we will introduce you to the concepts of Amazon SageMaker including a one-click training environment, highly-optimized machine learning algorithms with built-in model tuning, and deployment of ML models. With zero setup required, Amazon SageMaker significantly decreases your training time and the overall cost of getting ML models from concept to production. 4 | 5 | AWS Lambda lets you run code without provisioning or managing servers. You pay only for the compute time you consume. 6 | 7 | Amazon API Gateway is a fully managed service that makes it easy for developers to create, publish, maintain, monitor, and secure APIs at any scale. APIs act as the "front door" for applications to access data, business logic, or functionality from your backend services. 8 | 9 | Check out this tutorial for this code : https://youtu.be/2-mCo7q2Iw4 10 | -------------------------------------------------------------------------------- /sagemaker_lambda.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import boto3 4 | import json 5 | 6 | # grab environment variables 7 | ENDPOINT_NAME = "{SAGEMAKER ENDPOINT}" 8 | runtime= boto3.client('runtime.sagemaker') 9 | 10 | def lambda_handler(event, context): 11 | print("Received event: " + json.dumps(event, indent=2)) 12 | 13 | data = json.loads(json.dumps(event)) 14 | payload = data['data'] 15 | print(payload) 16 | 17 | response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME, 18 | Body=json.dumps(payload)) 19 | print(response) 20 | result = json.loads(response['Body'].read().decode()) 21 | print(result) 22 | 23 | return result[0] 24 | -------------------------------------------------------------------------------- /startup_prediction.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import argparse 4 | import os 5 | import pandas as pd 6 | 7 | from sklearn import tree 8 | from sklearn.externals import joblib 9 | 10 | 11 | if __name__ == '__main__': 12 | parser = argparse.ArgumentParser() 13 | # Sagemaker specific arguments. Defaults are set in the environment variables. 14 | 15 | #Saves Checkpoints and graphs 16 | parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR']) 17 | 18 | #Save model artifacts 19 | parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR']) 20 | 21 | #Train data 22 | parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN']) 23 | 24 | args = parser.parse_args() 25 | 26 | file = os.path.join(args.train, "50_Startups.csv") 27 | dataset = pd.read_csv(file, engine="python") 28 | 29 | # labels are in the first column 30 | X = dataset.iloc[:, :-1].values 31 | y = dataset.iloc[:, 4].values 32 | 33 | # Encoding categorical data 34 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 35 | labelencoder = LabelEncoder() 36 | X[:, 3] = labelencoder.fit_transform(X[:, 3]) 37 | onehotencoder = OneHotEncoder(categorical_features = [3]) 38 | X = onehotencoder.fit_transform(X).toarray() 39 | 40 | # Avoiding the Dummy Variable Trap 41 | X = X[:, 1:] 42 | 43 | from sklearn.model_selection import train_test_split 44 | X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0) 45 | from sklearn.linear_model import LinearRegression 46 | regressor = LinearRegression() 47 | regressor.fit(X_train, y_train) 48 | 49 | # Print the coefficients of the trained classifier, and save the coefficients 50 | joblib.dump(regressor, os.path.join(args.model_dir, "model.joblib")) 51 | 52 | 53 | def model_fn(model_dir): 54 | """Deserialized and return fitted model 55 | 56 | Note that this should have the same name as the serialized model in the main method 57 | """ 58 | regressor = joblib.load(os.path.join(model_dir, "model.joblib")) 59 | return regressor 60 | --------------------------------------------------------------------------------