├── 50_Startups.csv
├── AWS_Train.ipynb
├── README.md
├── sagemaker_lambda.py
└── startup_prediction.py


/50_Startups.csv:
--------------------------------------------------------------------------------
 1 | R&D Spend,Administration,Marketing Spend,State,Profit
 2 | 165349.2,136897.8,471784.1,New York,192261.83
 3 | 162597.7,151377.59,443898.53,California,191792.06
 4 | 153441.51,101145.55,407934.54,Florida,191050.39
 5 | 144372.41,118671.85,383199.62,New York,182901.99
 6 | 142107.34,91391.77,366168.42,Florida,166187.94
 7 | 131876.9,99814.71,362861.36,New York,156991.12
 8 | 134615.46,147198.87,127716.82,California,156122.51
 9 | 130298.13,145530.06,323876.68,Florida,155752.6
10 | 120542.52,148718.95,311613.29,New York,152211.77
11 | 123334.88,108679.17,304981.62,California,149759.96
12 | 101913.08,110594.11,229160.95,Florida,146121.95
13 | 100671.96,91790.61,249744.55,California,144259.4
14 | 93863.75,127320.38,249839.44,Florida,141585.52
15 | 91992.39,135495.07,252664.93,California,134307.35
16 | 119943.24,156547.42,256512.92,Florida,132602.65
17 | 114523.61,122616.84,261776.23,New York,129917.04
18 | 78013.11,121597.55,264346.06,California,126992.93
19 | 94657.16,145077.58,282574.31,New York,125370.37
20 | 91749.16,114175.79,294919.57,Florida,124266.9
21 | 86419.7,153514.11,0,New York,122776.86
22 | 76253.86,113867.3,298664.47,California,118474.03
23 | 78389.47,153773.43,299737.29,New York,111313.02
24 | 73994.56,122782.75,303319.26,Florida,110352.25
25 | 67532.53,105751.03,304768.73,Florida,108733.99
26 | 77044.01,99281.34,140574.81,New York,108552.04
27 | 64664.71,139553.16,137962.62,California,107404.34
28 | 75328.87,144135.98,134050.07,Florida,105733.54
29 | 72107.6,127864.55,353183.81,New York,105008.31
30 | 66051.52,182645.56,118148.2,Florida,103282.38
31 | 65605.48,153032.06,107138.38,New York,101004.64
32 | 61994.48,115641.28,91131.24,Florida,99937.59
33 | 61136.38,152701.92,88218.23,New York,97483.56
34 | 63408.86,129219.61,46085.25,California,97427.84
35 | 55493.95,103057.49,214634.81,Florida,96778.92
36 | 46426.07,157693.92,210797.67,California,96712.8
37 | 46014.02,85047.44,205517.64,New York,96479.51
38 | 28663.76,127056.21,201126.82,Florida,90708.19
39 | 44069.95,51283.14,197029.42,California,89949.14
40 | 20229.59,65947.93,185265.1,New York,81229.06
41 | 38558.51,82982.09,174999.3,California,81005.76
42 | 28754.33,118546.05,172795.67,California,78239.91
43 | 27892.92,84710.77,164470.71,Florida,77798.83
44 | 23640.93,96189.63,148001.11,California,71498.49
45 | 15505.73,127382.3,35534.17,New York,69758.98
46 | 22177.74,154806.14,28334.72,California,65200.33
47 | 1000.23,124153.04,1903.93,New York,64926.08
48 | 1315.46,115816.21,297114.46,Florida,49490.75
49 | 0,135426.92,0,California,42559.73
50 | 542.05,51743.15,0,New York,35673.41
51 | 0,116983.8,45173.06,California,14681.4
52 | 


--------------------------------------------------------------------------------
/AWS_Train.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": null,
  6 |    "metadata": {},
  7 |    "outputs": [],
  8 |    "source": [
  9 |     "import sagemaker\n",
 10 |     "from sagemaker import get_execution_role"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": null,
 16 |    "metadata": {},
 17 |    "outputs": [],
 18 |    "source": [
 19 |     "sagemaker_session = sagemaker.Session()"
 20 |    ]
 21 |   },
 22 |   {
 23 |    "cell_type": "code",
 24 |    "execution_count": null,
 25 |    "metadata": {},
 26 |    "outputs": [],
 27 |    "source": [
 28 |     "# Get a SageMaker-compatible role used by this Notebook Instance.\n",
 29 |     "role = get_execution_role()"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": null,
 35 |    "metadata": {},
 36 |    "outputs": [],
 37 |    "source": [
 38 |     "role"
 39 |    ]
 40 |   },
 41 |   {
 42 |    "cell_type": "markdown",
 43 |    "metadata": {},
 44 |    "source": [
 45 |     "## Upload the data for training \n"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": null,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "train_input = sagemaker_session.upload_data(\"data\")"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": null,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "train_input"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "# Create SageMaker Scikit Estimator"
 71 |    ]
 72 |   },
 73 |   {
 74 |    "cell_type": "code",
 75 |    "execution_count": null,
 76 |    "metadata": {},
 77 |    "outputs": [],
 78 |    "source": [
 79 |     "from sagemaker.sklearn.estimator import SKLearn\n",
 80 |     "\n",
 81 |     "script_path = 'startup_prediction.py'\n",
 82 |     "\n",
 83 |     "sklearn = SKLearn(\n",
 84 |     "    entry_point=script_path,\n",
 85 |     "    instance_type=\"ml.m4.xlarge\",\n",
 86 |     "    framework_version=\"0.20.0\",\n",
 87 |     "    py_version=\"py3\",\n",
 88 |     "    role=role,\n",
 89 |     "    sagemaker_session=sagemaker_session)"
 90 |    ]
 91 |   },
 92 |   {
 93 |    "cell_type": "markdown",
 94 |    "metadata": {},
 95 |    "source": [
 96 |     "# Train SKLearn Estimator on Startup data \n"
 97 |    ]
 98 |   },
 99 |   {
100 |    "cell_type": "code",
101 |    "execution_count": null,
102 |    "metadata": {},
103 |    "outputs": [],
104 |    "source": [
105 |     "sklearn.fit({'train': train_input})"
106 |    ]
107 |   },
108 |   {
109 |    "cell_type": "markdown",
110 |    "metadata": {},
111 |    "source": [
112 |     "# Deploy the model "
113 |    ]
114 |   },
115 |   {
116 |    "cell_type": "code",
117 |    "execution_count": null,
118 |    "metadata": {},
119 |    "outputs": [],
120 |    "source": [
121 |     "deployment = sklearn.deploy(initial_instance_count=1, instance_type=\"ml.m4.xlarge\")"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": null,
127 |    "metadata": {},
128 |    "outputs": [],
129 |    "source": [
130 |     "deployment.endpoint"
131 |    ]
132 |   },
133 |   {
134 |    "cell_type": "code",
135 |    "execution_count": null,
136 |    "metadata": {},
137 |    "outputs": [],
138 |    "source": [
139 |     "deployment.predict([[1,0,50000,25000,40000]])"
140 |    ]
141 |   }
142 |  ],
143 |  "metadata": {
144 |   "kernelspec": {
145 |    "display_name": "Python 3",
146 |    "language": "python",
147 |    "name": "python3"
148 |   },
149 |   "language_info": {
150 |    "codemirror_mode": {
151 |     "name": "ipython",
152 |     "version": 3
153 |    },
154 |    "file_extension": ".py",
155 |    "mimetype": "text/x-python",
156 |    "name": "python",
157 |    "nbconvert_exporter": "python",
158 |    "pygments_lexer": "ipython3",
159 |    "version": "3.7.4"
160 |   }
161 |  },
162 |  "nbformat": 4,
163 |  "nbformat_minor": 4
164 | }
165 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Build & Deploy SciKit Learn Machine Learning Model with AWS Sagemaker and Integrate it to Lambda, API Gatway
 2 | 
 3 | Amazon SageMaker is a fully-managed platform that enables developers and data scientists to quickly and easily build, train, and deploy machine learning (ML) models at any scale. Amazon SageMaker removes all the barriers that typically slow down developers who want to use machine learning. In this tech talk, we will introduce you to the concepts of Amazon SageMaker including a one-click training environment, highly-optimized machine learning algorithms with built-in model tuning, and deployment of ML models. With zero setup required, Amazon SageMaker significantly decreases your training time and the overall cost of getting ML models from concept to production.
 4 | 
 5 | AWS Lambda lets you run code without provisioning or managing servers. You pay only for the compute time you consume.
 6 | 
 7 | Amazon API Gateway is a fully managed service that makes it easy for developers to create, publish, maintain, monitor, and secure APIs at any scale. APIs act as the "front door" for applications to access data, business logic, or functionality from your backend services.
 8 | 
 9 | Check out this tutorial for this code : https://youtu.be/2-mCo7q2Iw4
10 | 


--------------------------------------------------------------------------------
/sagemaker_lambda.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import io
 3 | import boto3
 4 | import json
 5 | 
 6 | # grab environment variables
 7 | ENDPOINT_NAME = "{SAGEMAKER ENDPOINT}"
 8 | runtime= boto3.client('runtime.sagemaker')
 9 | 
10 | def lambda_handler(event, context):
11 |     print("Received event: " + json.dumps(event, indent=2))
12 |     
13 |     data = json.loads(json.dumps(event))
14 |     payload = data['data']
15 |     print(payload)
16 |     
17 |     response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME,
18 |                                        Body=json.dumps(payload))
19 |     print(response)
20 |     result = json.loads(response['Body'].read().decode())
21 |     print(result)
22 |     
23 |     return result[0]
24 | 


--------------------------------------------------------------------------------
/startup_prediction.py:
--------------------------------------------------------------------------------
 1 | from __future__ import print_function
 2 | 
 3 | import argparse
 4 | import os
 5 | import pandas as pd
 6 | 
 7 | from sklearn import tree
 8 | from sklearn.externals import joblib
 9 | 
10 | 
11 | if __name__ == '__main__':
12 |     parser = argparse.ArgumentParser()
13 |     # Sagemaker specific arguments. Defaults are set in the environment variables.
14 | 
15 |     #Saves Checkpoints and graphs
16 |     parser.add_argument('--output-data-dir', type=str, default=os.environ['SM_OUTPUT_DATA_DIR'])
17 | 
18 |     #Save model artifacts
19 |     parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
20 | 
21 |     #Train data
22 |     parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
23 | 
24 |     args = parser.parse_args()
25 | 
26 |     file = os.path.join(args.train, "50_Startups.csv")
27 |     dataset = pd.read_csv(file, engine="python")
28 | 
29 |     # labels are in the first column
30 |     X = dataset.iloc[:, :-1].values
31 |     y = dataset.iloc[:, 4].values
32 | 
33 |     # Encoding categorical data
34 |     from sklearn.preprocessing import LabelEncoder, OneHotEncoder
35 |     labelencoder = LabelEncoder()
36 |     X[:, 3] = labelencoder.fit_transform(X[:, 3])
37 |     onehotencoder = OneHotEncoder(categorical_features = [3])
38 |     X = onehotencoder.fit_transform(X).toarray()
39 | 
40 |     # Avoiding the Dummy Variable Trap
41 |     X = X[:, 1:]
42 |     
43 |     from sklearn.model_selection import train_test_split
44 |     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
45 |     from sklearn.linear_model import LinearRegression
46 |     regressor = LinearRegression()
47 |     regressor.fit(X_train, y_train)
48 | 
49 |     # Print the coefficients of the trained classifier, and save the coefficients
50 |     joblib.dump(regressor, os.path.join(args.model_dir, "model.joblib"))
51 | 
52 | 
53 | def model_fn(model_dir):
54 |     """Deserialized and return fitted model
55 |     
56 |     Note that this should have the same name as the serialized model in the main method
57 |     """
58 |     regressor = joblib.load(os.path.join(model_dir, "model.joblib"))
59 |     return regressor
60 | 


--------------------------------------------------------------------------------