├── Chapter3 ├── create-subset-transformation-query.sql ├── index.js ├── put-record-python-program.py └── setup-data-producer.yml ├── Chapter4 ├── age-groupings.sql ├── gender-mapping.py ├── gender-percentages.sql ├── most-common-ages.sql └── user-data ├── Chapter5 ├── box-plot-example.ipynb ├── car_data.csv └── my-manifest.json ├── Chapter6 ├── ufo-modeling-lab.ipynb └── ufo_fullset.csv ├── Chapter7 ├── ufo-algorithms-lab.ipynb └── ufo_fullset.csv ├── Chapter8 ├── ufo-evaluation-optimization-lab.ipynb ├── ufo_sightings_train_recordIO_protobuf.data └── ufo_sightings_validatioin_recordIO_protobuf.data ├── Chapter9 ├── lambda_function.py ├── sample_request.json ├── ufo-implementation-operations-lab.ipynb └── ufo_fullset.csv ├── LAB-PerformRealTimeDataAnalysisWithKinesis ├── aws-config.txt ├── kinesis-analytics-popular-captains.sql.txt ├── kinesis-analytics-rating-anomaly.sql.txt └── send_captains_to_cloud.py ├── LICENSE └── README.md /Chapter3/create-subset-transformation-query.sql: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE STREAM "DESTINATION_USER_DATA" ( 2 | first VARCHAR(16), 3 | last VARCHAR(16), 4 | age INTEGER, 5 | gender VARCHAR(16), 6 | latitude FLOAT, 7 | longitude FLOAT 8 | ); 9 | CREATE OR REPLACE PUMP "STREAM_PUMP" AS INSERT INTO "DESTINATION_USER_DATA" 10 | 11 | SELECT STREAM "first", "last", "age", "gender", "latitude", "longitude" 12 | FROM "SOURCE_SQL_STREAM_001" 13 | WHERE "age" >= 21; -------------------------------------------------------------------------------- /Chapter3/index.js: -------------------------------------------------------------------------------- 1 | 'use strict'; 2 | console.log('Loading function'); 3 | 4 | exports.handler = (event, context, callback) => { 5 | /* Process the list of records and transform them */ 6 | 7 | let buff = Buffer.from('\n'); 8 | let base64data = buff.toString('base64'); 9 | 10 | const output = event.records.map((record) => ({ 11 | /* This transformation is the "identity" transformation, the data is left intact */ 12 | recordId: record.recordId, 13 | result: 'Ok', 14 | data: record.data + base64data, 15 | })); 16 | 17 | console.log(`Processing completed. Successful records ${output.length}.`); 18 | callback(null, { records: output }); 19 | }; 20 | -------------------------------------------------------------------------------- /Chapter3/put-record-python-program.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import boto3 3 | import uuid 4 | import time 5 | import random 6 | import json 7 | 8 | client = boto3.client('kinesis', region_name='') 9 | partition_key = str(uuid.uuid4()) 10 | 11 | # Added 08/2020 since randomuser.me is starting to throttle API calls 12 | # The following code loads 500 random users into memory 13 | number_of_results = 500 14 | r = requests.get('https://randomuser.me/api/?exc=login&results=' + str(number_of_results)) 15 | data = r.json()["results"] 16 | 17 | while True: 18 | # The following chooses a random user from the 500 random users pulled from the API in a single API call. 
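    # Note: partition_key is generated once, outside the loop, so every record written
    # below hashes to the same shard. Generating a new key per record (for example,
    # str(uuid.uuid4()) inside the loop) would spread writes across the shards of a
    # multi-shard stream.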
19 | random_user_index = int(random.uniform(0, (number_of_results - 1))) 20 | random_user = data[random_user_index] 21 | random_user = json.dumps(data[random_user_index]) 22 | client.put_record( 23 | StreamName='', 24 | Data=random_user, 25 | PartitionKey=partition_key) 26 | time.sleep(random.uniform(0, 1)) -------------------------------------------------------------------------------- /Chapter3/setup-data-producer.yml: -------------------------------------------------------------------------------- 1 | Parameters: 2 | KinesisDataStream: 3 | Description: The name of your Kinesis Data Stream. 4 | Type: String 5 | Mappings: 6 | RegionMap: 7 | us-east-1: 8 | AMI: ami-0080e4c5bc078760e 9 | us-east-2: 10 | AMI: ami-0cd3dfa4e37921605 11 | us-west-1: 12 | AMI: ami-0ec6517f6edbf8044 13 | us-west-2: 14 | AMI: ami-01e24be29428c15b2 15 | Resources: 16 | LnDataProducerInstance: 17 | Type: AWS::EC2::Instance 18 | Properties: 19 | InstanceType: t2.micro 20 | ImageId: 21 | Fn::FindInMap: 22 | - RegionMap 23 | - !Ref AWS::Region 24 | - AMI 25 | IamInstanceProfile: !Ref LnInstanceProfiler 26 | UserData: 27 | Fn::Base64: 28 | !Join [ "", [ 29 | "#!/bin/bash -xe\n", 30 | "sudo /opt/aws/bin/cfn-init -v ", #use cfn-init to install packages in cloudformation init 31 | !Sub "--stack ${AWS::StackName} ", 32 | "--resource LnDataProducerInstance ", 33 | "--configsets InstallAndConfigure ", 34 | !Sub "--region ${AWS::Region}", 35 | "\n"] ] 36 | Metadata: 37 | AWS::CloudFormation::Init: 38 | configSets: 39 | InstallAndConfigure: 40 | - "install_boto3" 41 | - "install_requests" 42 | - "create_script" 43 | install_boto3: 44 | commands: 45 | test: 46 | command: sudo pip install boto3 47 | install_requests: 48 | commands: 49 | test: 50 | command: sudo pip install requests 51 | create_script: 52 | files: 53 | "/tmp/stream.py": 54 | content: !Join [ "", [ 55 | "import requests\n", 56 | "import json\n", 57 | "import boto3\n", 58 | "import uuid\n", 59 | "import time\n", 60 | "import random\n", 61 | 62 | "client = boto3.client('kinesis', region_name='", !Sub "${AWS::Region}", "')\n", 63 | "partition_key = str(uuid.uuid4())\n", 64 | "number_of_results = 500\n", 65 | "r = requests.get('https://randomuser.me/api/?exc=login&results=' + str(number_of_results))\n", 66 | "data = r.json()['results']\n", 67 | "while True:\n", 68 | "\trandom_user_index = int(random.uniform(0, (number_of_results - 1)))\n", 69 | "\trandom_user = json.dumps(data[random_user_index])\n", 70 | "\tresponse = client.put_record(StreamName='", !Sub "${KinesisDataStream}", "', Data=random_user, PartitionKey=partition_key)\n", 71 | "\ttime.sleep(random.uniform(0, 1))\n", 72 | "\n" ] ] 73 | mode: "000600" 74 | owner: "ec2-user" 75 | group: "ec2-user" 76 | commands: 77 | test: 78 | command: python /tmp/stream.py 79 | LnInstanceProfiler: 80 | Type: AWS::IAM::InstanceProfile 81 | Properties: 82 | Roles: 83 | - !Ref LnDataProducerRole 84 | LnKinesisPolicy: 85 | Type: AWS::IAM::Policy 86 | Properties: 87 | PolicyDocument: 88 | Statement: 89 | - 90 | Effect: "Allow" 91 | Action: 92 | - "kinesis:DescribeStream" 93 | - "kinesis:PutRecord" 94 | - "kinesis:PutRecords" 95 | Resource: 96 | - !Sub "arn:aws:kinesis:${AWS::Region}:${AWS::AccountId}:stream/${KinesisDataStream}" 97 | PolicyName: "kinesis-put-records-policy-lab-ml-specialty-course" 98 | Roles: 99 | - !Ref LnDataProducerRole 100 | LnDataProducerRole: 101 | Type: AWS::IAM::Role 102 | Properties: 103 | AssumeRolePolicyDocument: 104 | Statement: 105 | - 106 | Effect: "Allow" 107 | Principal: 108 | Service: 109 | - 
"ec2.amazonaws.com" 110 | Action: 111 | - "sts:AssumeRole" 112 | RoleName: data-producer-role-kinesis-lab-ml-specialty-course -------------------------------------------------------------------------------- /Chapter4/age-groupings.sql: -------------------------------------------------------------------------------- 1 | SELECT SUM(CASE WHEN age BETWEEN 21 AND 29 THEN 1 ELSE 0 END) AS "21-29", 2 | SUM(CASE WHEN age BETWEEN 30 AND 39 THEN 1 ELSE 0 END) AS "30-39", 3 | SUM(CASE WHEN age BETWEEN 40 AND 49 THEN 1 ELSE 0 END) AS "40-49", 4 | SUM(CASE WHEN age BETWEEN 50 AND 59 THEN 1 ELSE 0 END) AS "50-59", 5 | SUM(CASE WHEN age BETWEEN 60 AND 69 THEN 1 ELSE 0 END) AS "60-69", 6 | SUM(CASE WHEN age BETWEEN 70 AND 79 THEN 1 ELSE 0 END) AS "70-79" 7 | FROM ; -------------------------------------------------------------------------------- /Chapter4/gender-mapping.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from awsglue.transforms import * 3 | from awsglue.utils import getResolvedOptions 4 | from pyspark.context import SparkContext 5 | from awsglue.context import GlueContext 6 | from awsglue.job import Job 7 | from itertools import chain 8 | from pyspark.sql.functions import create_map, lit 9 | from awsglue.dynamicframe import DynamicFrame 10 | 11 | args = getResolvedOptions(sys.argv, ['JOB_NAME']) 12 | 13 | sc = SparkContext() 14 | glueContext = GlueContext(sc) 15 | spark = glueContext.spark_session 16 | job = Job(glueContext) 17 | job.init(args['JOB_NAME'], args) 18 | 19 | datasource0 = glueContext.create_dynamic_frame.from_catalog(database = "", table_name = "", transformation_ctx = "datasource0") 20 | 21 | # Here is the custom gender mapping transformation 22 | df = datasource0.toDF() 23 | gender_dict = { 'male': 1, 'female':0 } 24 | mapping_expr = create_map([lit(x) for x in chain(*gender_dict.items())]) 25 | df = df.withColumn('gender', mapping_expr[df['gender']]) 26 | datasource_transformed = DynamicFrame.fromDF(df, glueContext, "datasource0") 27 | 28 | applymapping1 = ApplyMapping.apply(frame = datasource_transformed, mappings = [("first", "string", "first", "string"), ("last", "string", "last", "string"), ("age", "int", "age", "int"), ("gender", "string", "gender", "string"), ("latitude", "double", "latitude", "double"), ("longitude", "double", "longitude", "double")], transformation_ctx = "applymapping1") 29 | 30 | datasink2 = glueContext.write_dynamic_frame.from_options(frame = applymapping1, connection_type = "s3", connection_options = {"path": "s3://"}, format = "csv", transformation_ctx = "datasink2") 31 | job.commit() -------------------------------------------------------------------------------- /Chapter4/gender-percentages.sql: -------------------------------------------------------------------------------- 1 | SELECT gender, (COUNT(gender) * 100.0 / (SELECT COUNT(*) FROM )) AS percent 2 | FROM 3 | GROUP BY gender; -------------------------------------------------------------------------------- /Chapter4/most-common-ages.sql: -------------------------------------------------------------------------------- 1 | SELECT age, COUNT(age) AS occurances 2 | FROM 3 | GROUP BY age 4 | ORDER BY occurances DESC 5 | LIMIT 5; -------------------------------------------------------------------------------- /Chapter5/box-plot-example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 5, 6 | "metadata": { 7 | "scrolled": true 8 | 
}, 9 | "outputs": [ 10 | { 11 | "name": "stdout", 12 | "output_type": "stream", 13 | "text": [ 14 | "Python 3.6.5 :: Anaconda custom (64-bit)\r\n" 15 | ] 16 | } 17 | ], 18 | "source": [ 19 | "!python --version" 20 | ] 21 | }, 22 | { 23 | "cell_type": "markdown", 24 | "metadata": {}, 25 | "source": [ 26 | "Importing the important libraries" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 6, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "import boto3\n", 36 | "import pandas as pd\n", 37 | "from sagemaker import get_execution_role\n", 38 | "import numpy as np\n", 39 | "import matplotlib.pyplot as plt\n", 40 | "%matplotlib inline" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "Getting the car data from S3" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 7, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "name": "stdout", 57 | "output_type": "stream", 58 | "text": [ 59 | "s3://car-data-analysis-acg/car_data.csv\n" 60 | ] 61 | } 62 | ], 63 | "source": [ 64 | "role = get_execution_role()\n", 65 | "bucket=''\n", 66 | "data_key = 'car_data.csv'\n", 67 | "data_location = 's3://{}/{}'.format(bucket, data_key)\n", 68 | "print(data_location)" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 8, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "df = pd.read_csv(data_location)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 10, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "data": { 87 | "text/html": [ 88 | "
\n", 89 | "\n", 102 | "\n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | "
caryearengine_hpavg_mpgpricesalesmandealershipsold_datesold_month
0Corvette2011335.022.5461352Big Bobs2012-05-065
1Corvette2011300.023.5406502Uptown Cars2011-05-165
2Corvette2011300.024.0363502Uptown Cars2013-07-317
3Corvette2011230.023.0294502Uptown Cars2014-07-057
4Corvette2011230.023.0345002Uptown Cars2013-05-205
\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " car year engine_hp avg_mpg price salesman dealership \\\n", 184 | "0 Corvette 2011 335.0 22.5 46135 2 Big Bobs \n", 185 | "1 Corvette 2011 300.0 23.5 40650 2 Uptown Cars \n", 186 | "2 Corvette 2011 300.0 24.0 36350 2 Uptown Cars \n", 187 | "3 Corvette 2011 230.0 23.0 29450 2 Uptown Cars \n", 188 | "4 Corvette 2011 230.0 23.0 34500 2 Uptown Cars \n", 189 | "\n", 190 | " sold_date sold_month \n", 191 | "0 2012-05-06 5 \n", 192 | "1 2011-05-16 5 \n", 193 | "2 2013-07-31 7 \n", 194 | "3 2014-07-05 7 \n", 195 | "4 2013-05-20 5 " 196 | ] 197 | }, 198 | "execution_count": 10, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "df.head()" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "Let's build out our box plot" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 11, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "df_vet = df[df['car'] == 'Corvette']\n", 221 | "df_mustang = df[df['car'] == 'Mustang']\n", 222 | "df_camaro = df[df['car'] == 'Camaro']\n" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 13, 228 | "metadata": {}, 229 | "outputs": [ 230 | { 231 | "name": "stderr", 232 | "output_type": "stream", 233 | "text": [ 234 | "/home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/numpy/core/fromnumeric.py:51: FutureWarning: reshape is deprecated and will raise in a subsequent release. Please use .values.reshape(...) instead\n", 235 | " return getattr(obj, method)(*args, **kwds)\n" 236 | ] 237 | }, 238 | { 239 | "data": { 240 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEWCAYAAABMoxE0AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDIuMi4yLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvhp/UCwAAGQtJREFUeJzt3XucXWdd7/HPtxemhUKaacKdEEAQSRNKOyA9FixCE+BACooWRRHtaU9bmiMea2s9viR9qRWEFjheEitgPcLhKnjCRSmUFgWVkvRCGrlLQ6lcUpuUNlWg4Xf+2GtgJyaZyTw72bNnPu/Xa71mrWc9a63n2XvPfPd61t5rUlVIkjRThw27AZKk0WaQSJKaGCSSpCYGiSSpiUEiSWpikEiSmhgkmreSLElyd5LDh92WUZDk1CRfHXY7NPsYJJr1ktyS5N+7P/qT0x+17reqvlJVx1TVrkG0s1/X5mftUfayJB/fo85kv76R5Mokx+xnf5N1tyf5QJJHHIR2vyzJru4430pyY5LnzWA/Vyb53UG3T7OTQaJR8fzuj/7kdP6wGzQgz6+qY4ATgQngt6ZR9yHAN4A/PEht+sfuOMcCbwLemWThQTqW5gCDRCNt8l1+ktd279S/nOQ5fesfleTvktyV5CNJ/jjJW7p1S5NUkiO65WuT/E6ST3T1r0qyqG9fT03yD0l2JLkpyamD6kdV3Qb8DXD8NOr+B/Bu4Al9bVuQ5P8k2ZZka5LfSnJYt25dkr/qq/vqJFcnyRTH+R7wZuBo4DF7rk/yI91jtiPJliSru/KzgZcAF3ZnNu+bxkOgEWaQaC74UeBzwCLgD4A39f2R/L/AdcBxwFrgF6bY188BvwQ8ELgPcAFAkocBHwB+Fxjvyv8qyeJBdKAbpnoucMM06t4XOAP4p77iPwQWAI8Gfhx4adcPgF8Dlneh+zTgTOAXa4r7I3UB+9+Au4Ev7LHuSOB9wFX0Hqs1wFuT/HBVXQG8FfiD7uzx+VP1SaPtiGE3QJqmv05yb9/yr1fVn3XzWyfnk/wF8CfAg5LcB3gy8Myq+g7w8SQbpjjOn1fV57t9vRNY3ZX/PPDBqvpgt/zhJBvp/fH/i2m2+T7A9fuocye9oLp0P22brHs/YBuwqmvn4cCLgROq6i7griSX0QvNN1XVPUl+gd4Zz13Amqra30XzpybZAdwLfBF4YVXduccJzFOBY4BXdWcuH03yfuBn6QW25hGDRKPiBVX1kX2s+/rkTPdHE3p/5BYBd1TVPX11bwX2d5H6633z93T7AXgk8NNJ+t9dHwlcM902J3kZvXf4+6wzhRdU1Ue64Dgd+FiSJwDVtWVrX92twMMmF6rqk0n+hd7ZwzunOM4/VdUpU9R5KHBrFyJ7PabmD4e2NJd9DRjvhoImzfSTTrcCf1lVx/ZN96uqV7U388BU1a6qeg+wCzgFuB34Lr2wm7QEuG1yIcnLgTHgX4ELB9CMfwUeMXkdZi/H9Lbi84hBojmrqrYCG4G1Se6T5GRgpuP1bwGen2RVksOTHNV9r+LhA2vwNKXndGAh8Jnu48vvBH4vyf2TPBL4n12bSfI4etd2fp7ecNeFSU5obMYn6Z2xXZjkyO6DB88H3t6t/wa96zWaBwwSjYr3Zffvkbx3mtu9BDgZ+Dd6f0zfAXz7QA9eVbfSG076TXrXJ24Ffp1D+zv0viR3A98Cfo/eBfMt3bo1wE7gX4CP0/uQwZu7C+ZvAV5dVTdV1Re6PvxlkrGZNqS75vR84Dn0zoj+BHhpVX22q/Im4AndJ7r+eqbH0WiI/9hK80mSdwCfrapXDrst0lzhGYnmtCRPTvKYJIcleTa9swr
fIUsD5Ke2NNc9GHgPve+RfBU4t6qm/K6GpOlzaEuS1MShLUlSk3kxtLVo0aJaunTpsJshSSNl06ZNt1fVlLcBmhdBsnTpUjZu3DjsZkjSSEmydepaDm1JkhoZJJKkJgaJJKmJQSJJamKQSJKaGCSSpCYGiSSpiUEiSWpikEiSmhgkkqQmBokkqYlBIklqYpBIkpoYJJKkJgaJJKmJQSJJamKQSJKaGCSSpCYGiSSpiUEiSWpikEiSmhgkkqQmBokkqYlBIklqYpBov8bHx0kykIm1Cwa2r5lM4+Pjw344pTnpiGE3QLPb9u3bqarB7GztgsHtawaSDO3Y0lzmGYkkqYlBIklqYpBIkpoYJFNwXF3zja95HSiDRJLUZMogSVJJ3tK3fESSbUneP5MDJnlFkvvOZFtJ0uwznTOSncDxSY7ulk8Dbms45isAg0QaMWvWrOGoo44iCUcddRRr1qwZdpO+77jjjtvtO0PHHXfcfuuvWrWKww47jCQcdthhrFq16hC19NBZsWLFbo/JihUrDtqxpju09UHgv3bzPwu8bXJFkrVJLuhbvjnJ0iT3S/KBJDd1ZWck+R/AQ4FrklzT1V+XZGOSLUku6dvPLUkuSXJ9ks1JHt+VL07y4a7+G5NsTbKo7WGQtD9r1qxh/fr1XHrppezcuZNLL72U9evXz4owOe6447jjjjtYtmwZW7duZdmyZdxxxx37DJNVq1Zx1VVXcc4557Bjxw7OOeccrrrqqjkVJitWrGDz5s2sXr2abdu2sXr1ajZv3nzwwqSq9jsBdwMrgHcDRwE3AqcC7+/WrwUu6Kt/M7AU+Cngz/rKF3Q/bwEW9ZWPdz8PB64FVvTVW9PNnwe8sZv/I+Dibv7ZQPXvb2/TSSedVDPVe4jmr4H2/5UPGNy+ZmC+P5fTtbfHaWxsrC677LLdyi677LIaGxs7VM3aJ6CWLVu2W9myZcv2+XwnqXPPPXe3snPPPbeSHLQ2HmpArV69erey1atXH/DvALCxpsiIqppekHQ/NwK/BFw6zSB5XBcGrwae1rd+zyA5B7ge+DSwDXhxX72HdfM/Cnykm78ReFTf9nfsLUiAs7s2b1yyZMkBPXh7PJDzfhqYWRAkTjN7zoHauXPnbmU7d+4c7OtjhoDaunXrbmVbt27dZ9uA2rFjx25lO3bsmBV9GRSgtm3btlvZtm3bDriPTDNIDuRTWxuA19I3rNW5l92HyI6i19rPAycCm4HfTfLbe+4wyaOAC4BnVtUK4AOT23e+3f3cxQHezqWqrqiqiaqaWLx48YFsurd9zdtprhn24zkK096MjY2xfv363crWr1/P2NjYoXjapvTc5z53v8v9knDxxRfvVnbxxRfPuY89n3nmmftdHqQDCZI3A5dU1eY9ym+hFxgkORF4VDf/UOCeqnoL8JrJOsBdwP27+QfQu5h/Z5IHAc+ZRjs+AfxMd4yVwMID6IOkGTjrrLO46KKLuPzyy7nnnnu4/PLLueiiizjrrLOG3TTGx8fZsmULxx9/PF/5ylc4/vjj2bJlyz5v0nnaaaexbt06zjvvPO68807OO+881q1bx2mnnXaIW37wLF++nA0bNnD66adz++23c/rpp7NhwwaWL19+cA44jXcnd++l7FR+MLR1NHAVsIVe2HyG3tDWKnrDVTcCnwImuvprgM8B13TLVwKfB64G3gO8rCu/hW7ICpgAru3mH9jVvRn4M+BrwNj++uA1kpkbaP9nwdCWpravx+n888+vsbGxAmpsbKzOP//8Q9yyfRsfH99taG58fHy/9VeuXFlJCqgktXLlykPU0kNn+fLluz0my5cvP+B9MM2hrdSIDV8kGQN2VdW9SU4G1lXVCfvbZmJiojZu3DjT483JIZ7pGmj/1y6AtXcOZl8zMN+fy+nycdKkJJuqamKqeqN4G/klwDuTHAZ8Bxj+ubUkzWMjFyRV9QXgSYfweIfqUNKs4GteB8p7bUmSmhgkkqQmBokkqYlBoin13/itZRrkvmYyLVzoV46kg2HkLrbr0Br0hddaO9DdSZoFPCORJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSDQrjI+Pk2QoE2sXDO3Yg5jGx8eH/fRpnjti2A2QALZv305VDefgaxcM79gDkGTYTdA85xmJJKmJQSJJamKQSJKaGCSSpCYGyTzmRVrp4JsPv2cGiSSpybSCJMmDk7w9yZeSbErywSSPOxgNSnJskvP6lpcm+bmDcSxJUrspgyS987L3AtdW1WOq6iTgYuBB09h2Jt9TORY4r295KWCQSNIsNZ0zkmcA362q9ZMFVXUT8PEkr0lyc5LNSc4ASHJqkr9PsgH45ySvSvLyyW2TrE1yQTf/60k+leTTSS7pqrwKeEySG5O8plt+Wrf8q0kO7447ud1/H8xDIUmaiemcMRwPbNpL+U8CJwBPBBYBn0ryd926E4Hjq+rLSZ4EvB74427dzwCrkqwEHgs8BQiwIcnTgd/otj0BesEEXFBVz+uWzwburKonJxkDPpHkqqr6cn/junpnAyxZsmQa3Zyf5sOFwPnA51HD1HKLlFOAt1XVLuAbST4GPBn4FnDd5B/2qrohyQOTPBRYDGyvqluT/AqwErih298x9ILlK1McdyWwIsmLuuUF3Xa7BUlVXQFcATAxMTG69784yGbLrUH8Q9hmtjyP+s/mw2t7OkGyBXjRlLV2t3OP5Xd1+3gw8I6uLMDvV9Wf9ldMsnSKfQdYU1UfOsA2SZIOgulcI/koMNYNFQGQZAWwAziju2axGHg6cN0+9vEO4MX0wuRdXdmHgF9Ocky3z4cleSBwF3D/vm33XP4QcG6SI7vtHpfkftPohyTpIJjyjKSqKskLgdcnuQj4D+AW4BX0hqNuAgq4sKq+nuTxe9nHliT3B26rqq91ZVcl+RHgH7tTv7uBn6+qLyX5RJKbgb8BfhPYleQm4ErgDfQ+yXV994mybcALGh4DSVKDzIex1YmJidq4ceOwmzHrJJk1Y+tDbcvaBbD2zuEcewBm0/Oo/2yUn58km6pqYqp6frNdktTEIJnHRvVdkjRK5sPvmUEiSWpikEiSmhgkkqQmLd9slwZqWN8Arlc+YKS/fbxw4cJhN0HznEGiWWHYFyRr7VAPL400h7YkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSV
ITg0SS1MQgkSQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU0MkjlsfHycJLN6Yu2CoR17fHx82E+RNCccMewG6ODZvn07VTXsZuzf2gVDa2OSoRxXmms8I5EkNTFIJElNDBJJUhOD5CBzHF6D4OtIs5lBIklqMqMgSfLgJG9P8qUkm5J8MMnjBt04SdLsd8BBkt459nuBa6vqMVV1EnAx8KBBN24vx/bjypI0y8zkjOQZwHerav1kQVXdBNyQ5Ook1yfZnOR0gCRLk3w2yZVJPp/krUmeleQTSb6Q5Cldvack+cckNyT5hyQ/3JW/LMmGJB8Frk7Pa5Lc3B3njPaHQZI0UzN5h388sGkv5f8BvLCqvpVkEfBPSTZ0634I+Gngl4FPAT8HnAKsBn4TeAHwWeBpVXVvkmcBlwI/1W1/IrCiqu5I8lPACcATgUXAp5L8XVV9bQZ9kSQ1GuRQUYBLkzwd+B7wMH4w3PXlqtoMkGQLcHVVVZLNwNKuzgLgL5I8FijgyL59f7iq7ujmTwHeVlW7gG8k+RjwZGBDX32SnA2cDbBkyZIBdvPA+Ymb2cvnRmo3kyDZArxoL+UvARYDJ1XVd5PcAhzVrft2X73v9S1/r68NvwNcU1UvTLIUuLZvm50H2siqugK4AmBiYmKo9wnxFiCz16y/hUzH51Kz2UyukXwUGOve8QOQZAXwSOCbXYg8o1s+EAuA27r5l+2n3t8DZyQ5PMli4OnAdQd4LEnSgBxwkFTvLdwLgWd1H//dAvw+8EFgohuueim9ax4H4g+A309yA/s/U3ov8GngJnqhdmFVff0AjyVJGpCMyql9i4mJidq4ceNQjp1kqENbs/75XbsA1t45lEOPxOPTGaW2au5IsqmqJqaq5zfbJUlNDJKDzHeRGgRfR5rNDBJJUhODRJLUxCCRJDXxJohz3Gz/Ilu98gFDa+PChQuHclxprjFI5rBRuUBba4fdAkktHNqSJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUxCCRJDUxSCRJTQwSSVITg0SS1MQgkSQ1MUgkSU0MEklSE4NEktTEIJEkNTFIJElNDBJJUhODRJLUJFU17DYcdEm2AVuH3Q5gEXD7sBsxIPZldporfZkr/YDR7ssjq2rxVJXmRZDMFkk2VtXEsNsxCPZldporfZkr/YC51Zd9cWhLktTEIJEkNTFIDq0rht2AAbIvs9Nc6ctc6QfMrb7slddIJElNPCORJDUxSCRJTQySAUry5iTfTHJzX9l4kg8n+UL3c2FXniT/O8kXk3w6yYnDa/nukjwiyTVJ/jnJliS/0pWPYl+OSnJdkpu6vlzSlT8qySe7Nr8jyX268rFu+Yvd+qXDbP/eJDk8yQ1J3t8tj2RfktySZHOSG5Ns7MpG7jUGkOTYJO9O8tkkn0ly8qj2ZSYMksG6Enj2HmW/AVxdVY8Fru6WAZ4DPLabzgbWHaI2Tse9wK9V1ROApwIvT/IERrMv3wZ+oqqeCJwAPDvJU4FXA6+rqh8CtgNndvXPBLZ35a/r6s02vwJ8pm95lPvyjKo6oe97FqP4GgN4A/C3VfV44In0np9R7cuBqyqnAU7AUuDmvuXPAQ/p5h8CfK6b/1PgZ/dWb7ZNwP8DThv1vgD3Ba4HfpTeN42P6MpPBj7UzX8IOLmbP6Krl2G3va8PD6f3R+kngPcDGeG+3AIs2qNs5F5jwALgy3s+tqPYl5lOnpEcfA+qqq91818HHtTNPwy4ta/eV7uyWaUbDnkS8ElGtC/dUNCNwDeBDwNfAnZU1b1dlf72fr8v3fo7geMObYv36/XAhcD3uuXjGN2+FHBVkk1Jzu7KRvE19ihgG/Dn3ZDjG5Pcj9Hsy4wYJIdQ9d5+jMznrZMcA/wV8Iqq+lb/ulHqS1XtqqoT6L2bfwrw+CE3aUaSPA/4ZlVtGnZbBuSUqjqR3lDPy5M8vX/lCL3GjgBOBNZV1ZOAnfxgGAsYqb7MiEFy8H0jyUMAup/f7MpvAx7RV+/hXdmskORIeiHy1qp6T1c8kn2ZVFU7gGvoDf8cm+SIblV/e7/fl279AuDfDnFT9+XHgNVJbgHeTm946w2MZl+oqtu6n98E3ksv5EfxNfZV4KtV9clu+d30gmUU+zIjBsnBtwH4xW7+F+ldb5gsf2n3CY6nAnf2nQYPVZIAbwI+U1WX960axb4sTnJsN380vWs9n6EXKC/qqu3Zl8k+vgj4aPducuiq6uKqenhVLQVeTK9tL2EE+5LkfknuPzkPrARuZgRfY1X1deDWJD/cFT0T+GdGsC8zNuyLNHNpAt4GfA34Lr13KWfSG5O+GvgC8BFgvKsb4I/pjddvBiaG3f6+fpxC7zT808CN3fTcEe3LCuCGri83A7/dlT8auA74IvAuYKwrP6pb/mK3/tHD7sM++nUq8P5R7UvX5pu6aQvwv7rykXuNde07AdjYvc7+Glg4qn2ZyeQtUiRJTRzakiQ1MUgkSU0MEklSE4NEktTEIJEkNTFIpP1Isqu7O+3k9BtTb7XPff3DgNq0NskFe5TdkmRRNz/Z5puTvCvJfQdxXGlfjpi6ijSv/Xv1bq/SrKr+yyD2Mw3fb3OStwLnAJfvfxNp5jwjkWagOwO4JMn13f/UeHxXvrj73xNbupv3be07U7i7+3lqkmv7/n/FW7u7CZDkpCQf625k+KHJW2w0+Hvghxr3Ie2XQSLt39F7DG2d0bfu9urddHAdMDnU9Ep6tyJZRu+eS0v2sd8nAa8AnkDvW94/1t3f7A+BF1XVScCbgd/bx/a/2t8u4KF7Vujur/Ucet+elg4ah7ak/dvf0NbkzSw3AT/ZzZ8CvBCgqv42yfZ9bHtdVX0VoAuCpcAO4Hjgw90JyuH0brmzN6+rqtdOLnQ3cpx0dLdP6J2RvGkf+5AGwiCRZu7b3c9dHPjv0rf75ie3D7Clqk5ubNfArutI0+HQljRYnwB+BiDJSno375uuzwGLk5zcbX9kkmWDb6I0WAaJtH97XiN51RT1LwFWJrkZ+Gl6/xnvrukcqKq+Q+92769OchO9uy4fqk96STPm3X+lAUoyBuyqqnu7M4t1DjNprvMaiTRYS4B3JjkM+A5w1pDbIx10npFIkpp4jUSS1MQgkSQ1MUgkSU0MEklSE4NEktTk/wO3+onxbkLKtwAAAABJRU5ErkJggg==\n", 241 | "text/plain": [ 242 | "
" 243 | ] 244 | }, 245 | "metadata": {}, 246 | "output_type": "display_data" 247 | } 248 | ], 249 | "source": [ 250 | "data = [df_camaro['engine_hp'], df_vet['engine_hp'], df_mustang['engine_hp']]\n", 251 | "plt.boxplot(data, vert=False)\n", 252 | "plt.title('Engine HP Box Plot')\n", 253 | "plt.xlabel('Engine HP')\n", 254 | "plt.yticks([1, 2, 3], ['Camaro', 'Corvette', 'Mustang'])\n", 255 | "plt.show()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [] 264 | } 265 | ], 266 | "metadata": { 267 | "kernelspec": { 268 | "display_name": "Python 2", 269 | "language": "python", 270 | "name": "python2" 271 | }, 272 | "language_info": { 273 | "codemirror_mode": { 274 | "name": "ipython", 275 | "version": 2 276 | }, 277 | "file_extension": ".py", 278 | "mimetype": "text/x-python", 279 | "name": "python", 280 | "nbconvert_exporter": "python", 281 | "pygments_lexer": "ipython2", 282 | "version": "2.7.15" 283 | } 284 | }, 285 | "nbformat": 4, 286 | "nbformat_minor": 2 287 | } 288 | -------------------------------------------------------------------------------- /Chapter5/my-manifest.json: -------------------------------------------------------------------------------- 1 | { 2 | "fileLocations": [ 3 | { 4 | "URIPrefixes": [ 5 | "s3:///" 6 | ] 7 | } 8 | ], 9 | "globalUploadSettings": { 10 | "format": "CSV", 11 | "delimiter": ",", 12 | "textqualifier": "'", 13 | "containsHeader": "true" 14 | } 15 | } -------------------------------------------------------------------------------- /Chapter6/ufo-modeling-lab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## UFO Sightings K-Means Clustering\n", 8 | "### Modeling Lab\n", 9 | "\n", 10 | "The goal of this notebook is to analyze where Mr. K should build his extraterrestrial life facilities using the K-Means algorithm. \n", 11 | "\n", 12 | "What we plan on accomplishing is the following:\n", 13 | "1. [Load dataset onto Notebook instance from S3](#Step-1:-Loading-the-data-from-Amazon-S3)\n", 14 | "2. [Cleaning, transforming, and preparing the data](#Step-2:-Cleaning,-transforming,-and-preparing-the-data)\n", 15 | "3. [Create and train our model](#Step-3:-Create-and-train-our-model)\n", 16 | "4. [Viewing the results](#Step-4:-Viewing-the-results)\n", 17 | "5. [Visualize using QuickSight](https://docs.aws.amazon.com/quicksight/latest/user/create-a-data-set-s3.html)" 18 | ] 19 | }, 20 | { 21 | "cell_type": "markdown", 22 | "metadata": {}, 23 | "source": [ 24 | "First let's go ahead and import all the needed libraries." 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 1, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "import pandas as pd\n", 34 | "import numpy as np\n", 35 | "from datetime import datetime\n", 36 | "\n", 37 | "import boto3\n", 38 | "from sagemaker import get_execution_role\n", 39 | "import sagemaker.amazon.common as smac" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Step 1: Loading the data from Amazon S3\n", 47 | "Next, lets get the UFO sightings data that is stored in S3." 
48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "role = get_execution_role()\n", 57 | "bucket = ''\n", 58 | "prefix = 'ufo_dataset'\n", 59 | "data_key = 'ufo_fullset.csv'\n", 60 | "data_location = 's3://{}/{}/{}'.format(bucket, prefix, data_key)\n", 61 | "\n", 62 | "df = pd.read_csv(data_location, low_memory=False)" 63 | ] 64 | }, 65 | { 66 | "cell_type": "code", 67 | "execution_count": 3, 68 | "metadata": { 69 | "scrolled": true 70 | }, 71 | "outputs": [ 72 | { 73 | "data": { 74 | "text/html": [ 75 | "
\n", 76 | "\n", 89 | "\n", 90 | " \n", 91 | " \n", 92 | " \n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | " \n", 142 | " \n", 143 | " \n", 144 | " \n", 145 | " \n", 146 | " \n", 147 | " \n", 148 | " \n", 149 | " \n", 150 | " \n", 151 | " \n", 152 | " \n", 153 | " \n", 154 | " \n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | " \n", 191 | " \n", 192 | " \n", 193 | " \n", 194 | " \n", 195 | " \n", 196 | " \n", 197 | " \n", 198 | " \n", 199 | " \n", 200 | " \n", 201 | " \n", 202 | "
reportedTimestampeventDateeventTimeshapedurationwitnessesweatherfirstNamelastNamelatitudelongitudesightingphysicalEvidencecontactresearchOutcome
01982-11-29T10:01:48.297Z1982-11-2803:17oval711snowMurielBartell28.039167-81.950000YNNexplained
12006-03-05T18:36:08.186Z2006-03-0504:56light751partly cloudyFloyHeaney33.660278-117.998333YYNexplained
22002-07-31T23:33:55.223Z2002-07-2613:43oval251rainEvelynChamplin41.325278-72.193611YYYprobable
31986-08-31T00:50:08.017Z1986-08-2716:12sphere471mostly cloudyHoldenWard38.254167-85.759444YNNexplained
42004-09-26T08:47:39.860Z2004-09-2517:21disk591rainAbigayleGrady22.30808569.600603YNNunexplained
\n", 203 | "
" 204 | ], 205 | "text/plain": [ 206 | " reportedTimestamp eventDate eventTime shape duration \\\n", 207 | "0 1982-11-29T10:01:48.297Z 1982-11-28 03:17 oval 71 \n", 208 | "1 2006-03-05T18:36:08.186Z 2006-03-05 04:56 light 75 \n", 209 | "2 2002-07-31T23:33:55.223Z 2002-07-26 13:43 oval 25 \n", 210 | "3 1986-08-31T00:50:08.017Z 1986-08-27 16:12 sphere 47 \n", 211 | "4 2004-09-26T08:47:39.860Z 2004-09-25 17:21 disk 59 \n", 212 | "\n", 213 | " witnesses weather firstName lastName latitude longitude \\\n", 214 | "0 1 snow Muriel Bartell 28.039167 -81.950000 \n", 215 | "1 1 partly cloudy Floy Heaney 33.660278 -117.998333 \n", 216 | "2 1 rain Evelyn Champlin 41.325278 -72.193611 \n", 217 | "3 1 mostly cloudy Holden Ward 38.254167 -85.759444 \n", 218 | "4 1 rain Abigayle Grady 22.308085 69.600603 \n", 219 | "\n", 220 | " sighting physicalEvidence contact researchOutcome \n", 221 | "0 Y N N explained \n", 222 | "1 Y Y N explained \n", 223 | "2 Y Y Y probable \n", 224 | "3 Y N N explained \n", 225 | "4 Y N N unexplained " 226 | ] 227 | }, 228 | "execution_count": 3, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "df.head()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 4, 240 | "metadata": {}, 241 | "outputs": [ 242 | { 243 | "data": { 244 | "text/plain": [ 245 | "(18000, 15)" 246 | ] 247 | }, 248 | "execution_count": 4, 249 | "metadata": {}, 250 | "output_type": "execute_result" 251 | } 252 | ], 253 | "source": [ 254 | "df.shape" 255 | ] 256 | }, 257 | { 258 | "cell_type": "markdown", 259 | "metadata": {}, 260 | "source": [ 261 | "## Step 2: Cleaning, transforming, and preparing the data\n", 262 | "Create another DataFrame with just the latitude and longitude attributes" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 5, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "df_geo = df[['latitude', 'longitude']]" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 6, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/html": [ 282 | "
\n", 283 | "\n", 296 | "\n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | " \n", 309 | " \n", 310 | " \n", 311 | " \n", 312 | " \n", 313 | " \n", 314 | " \n", 315 | " \n", 316 | " \n", 317 | " \n", 318 | " \n", 319 | " \n", 320 | " \n", 321 | " \n", 322 | " \n", 323 | " \n", 324 | " \n", 325 | " \n", 326 | " \n", 327 | " \n", 328 | " \n", 329 | " \n", 330 | " \n", 331 | "
latitudelongitude
028.039167-81.950000
133.660278-117.998333
241.325278-72.193611
338.254167-85.759444
422.30808569.600603
\n", 332 | "
" 333 | ], 334 | "text/plain": [ 335 | " latitude longitude\n", 336 | "0 28.039167 -81.950000\n", 337 | "1 33.660278 -117.998333\n", 338 | "2 41.325278 -72.193611\n", 339 | "3 38.254167 -85.759444\n", 340 | "4 22.308085 69.600603" 341 | ] 342 | }, 343 | "execution_count": 6, 344 | "metadata": {}, 345 | "output_type": "execute_result" 346 | } 347 | ], 348 | "source": [ 349 | "df_geo.head()" 350 | ] 351 | }, 352 | { 353 | "cell_type": "code", 354 | "execution_count": 7, 355 | "metadata": {}, 356 | "outputs": [ 357 | { 358 | "name": "stdout", 359 | "output_type": "stream", 360 | "text": [ 361 | "\n", 362 | "RangeIndex: 18000 entries, 0 to 17999\n", 363 | "Data columns (total 2 columns):\n", 364 | "latitude 18000 non-null float64\n", 365 | "longitude 18000 non-null float64\n", 366 | "dtypes: float64(2)\n", 367 | "memory usage: 281.3 KB\n" 368 | ] 369 | } 370 | ], 371 | "source": [ 372 | "df_geo.info()" 373 | ] 374 | }, 375 | { 376 | "cell_type": "code", 377 | "execution_count": 8, 378 | "metadata": { 379 | "scrolled": true 380 | }, 381 | "outputs": [ 382 | { 383 | "name": "stdout", 384 | "output_type": "stream", 385 | "text": [ 386 | "Are there any missing values? False\n" 387 | ] 388 | } 389 | ], 390 | "source": [ 391 | "missing_values = df_geo.isnull().values.any()\n", 392 | "print('Are there any missing values? {}'.format(missing_values))\n", 393 | "if(missing_values):\n", 394 | " df_geo[df_geo.isnull().any(axis=1)]" 395 | ] 396 | }, 397 | { 398 | "cell_type": "markdown", 399 | "metadata": {}, 400 | "source": [ 401 | "Next, let's go ahead and transform the pandas DataFrame (our dataset) into a numpy.ndarray. When we do this each row is converted to a Record object. According to the documentation, this is what the K-Means algorithm expects as training data. This is what we will use as training data for our model.\n", 402 | "\n", 403 | "[See the documentation for input training](https://sagemaker.readthedocs.io/en/stable/kmeans.html)" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 9, 409 | "metadata": {}, 410 | "outputs": [ 411 | { 412 | "data": { 413 | "text/plain": [ 414 | "array([[ 28.039167, -81.95 ],\n", 415 | " [ 33.66028 , -117.99834 ],\n", 416 | " [ 41.32528 , -72.19361 ],\n", 417 | " ...,\n", 418 | " [ 37.49472 , -120.84556 ],\n", 419 | " [ 40.771946, -73.93056 ],\n", 420 | " [ 64.837776, -147.71638 ]], dtype=float32)" 421 | ] 422 | }, 423 | "execution_count": 9, 424 | "metadata": {}, 425 | "output_type": "execute_result" 426 | } 427 | ], 428 | "source": [ 429 | "data_train = df_geo.values.astype('float32')\n", 430 | "data_train" 431 | ] 432 | }, 433 | { 434 | "cell_type": "markdown", 435 | "metadata": {}, 436 | "source": [ 437 | "## Step 3: Create and train our model\n", 438 | "In this step we will import and use the built-in SageMaker K-Means algorithm. We will set the number of cluster to 10 (for our 10 sensors), specify the instance type we want to train on, and the location of where we want our model artifact to live. 
\n", 439 | "\n", 440 | "[See the documentation of hyperparameters here](https://docs.aws.amazon.com/sagemaker/latest/dg/k-means-api-config.html)" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 10, 446 | "metadata": {}, 447 | "outputs": [], 448 | "source": [ 449 | "from sagemaker import KMeans\n", 450 | "\n", 451 | "num_clusters = 10\n", 452 | "output_location = 's3://' + bucket + '/model-artifacts'\n", 453 | "\n", 454 | "kmeans = KMeans(role=role,\n", 455 | " instance_count=1,\n", 456 | " instance_type='ml.c4.xlarge',\n", 457 | " output_path=output_location,\n", 458 | " k=num_clusters)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 11, 464 | "metadata": {}, 465 | "outputs": [ 466 | { 467 | "name": "stdout", 468 | "output_type": "stream", 469 | "text": [ 470 | "Here is the job name kmeans-geo-job-20190517133512\n" 471 | ] 472 | } 473 | ], 474 | "source": [ 475 | "job_name = 'kmeans-geo-job-{}'.format(datetime.now().strftime(\"%Y%m%d%H%M%S\"))\n", 476 | "print('Here is the job name {}'.format(job_name))" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 14, 482 | "metadata": {}, 483 | "outputs": [ 484 | { 485 | "name": "stdout", 486 | "output_type": "stream", 487 | "text": [ 488 | "2019-05-17 13:37:19 Starting - Starting the training job...\n", 489 | "2019-05-17 13:37:21 Starting - Launching requested ML instances......\n", 490 | "2019-05-17 13:38:26 Starting - Preparing the instances for training......\n", 491 | "2019-05-17 13:39:44 Downloading - Downloading input data..\n", 492 | "\n", 493 | "2019-05-17 13:40:12 Training - Training image download completed. Training in progress.\n", 494 | "2019-05-17 13:40:12 Uploading - Uploading generated training model\n", 495 | "2019-05-17 13:40:12 Completed - Training job completed\n", 496 | "\u001b[31mDocker entrypoint called with argument(s): train\u001b[0m\n", 497 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'_enable_profiler': u'false', u'_tuning_objective_metric': u'', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'_kvstore': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'true', u'epochs': u'1', u'init_method': u'random', u'local_lloyd_tol': u'0.0001', u'local_lloyd_max_iter': u'300', u'_disable_wait_to_read': u'false', u'extra_center_factor': u'auto', u'eval_metrics': u'[\"msd\"]', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000', u'half_life_time_size': u'0', u'_num_slices': u'1'}\u001b[0m\n", 498 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Reading provided configuration from /opt/ml/input/config/hyperparameters.json: {u'feature_dim': u'2', u'k': u'10', u'force_dense': u'True'}\u001b[0m\n", 499 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Final configuration: {u'_tuning_objective_metric': u'', u'extra_center_factor': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'True', u'epochs': u'1', u'feature_dim': u'2', u'local_lloyd_tol': u'0.0001', u'_disable_wait_to_read': u'false', u'eval_metrics': u'[\"msd\"]', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000', u'_enable_profiler': u'false', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'init_method': u'random', u'half_life_time_size': u'0', u'local_lloyd_max_iter': u'300', u'_kvstore': u'auto', u'k': u'10', u'_num_slices': u'1'}\u001b[0m\n", 
500 | "\u001b[31m[05/17/2019 13:40:02 WARNING 140619722299200] Loggers have already been setup.\u001b[0m\n", 501 | "\u001b[31mProcess 1 is a worker.\u001b[0m\n", 502 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Using default worker.\u001b[0m\n", 503 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Loaded iterator creator application/x-recordio-protobuf for content type ('application/x-recordio-protobuf', '1.0')\u001b[0m\n", 504 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Create Store: local\u001b[0m\n", 505 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] nvidia-smi took: 0.0252430438995 secs to identify 0 gpus\u001b[0m\n", 506 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Number of GPUs being used: 0\u001b[0m\n", 507 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Setting up with params: {u'_tuning_objective_metric': u'', u'extra_center_factor': u'auto', u'local_lloyd_init_method': u'kmeans++', u'force_dense': u'True', u'epochs': u'1', u'feature_dim': u'2', u'local_lloyd_tol': u'0.0001', u'_disable_wait_to_read': u'false', u'eval_metrics': u'[\"msd\"]', u'_num_kv_servers': u'1', u'mini_batch_size': u'5000', u'_enable_profiler': u'false', u'_num_gpus': u'auto', u'local_lloyd_num_trials': u'auto', u'_log_level': u'info', u'init_method': u'random', u'half_life_time_size': u'0', u'local_lloyd_max_iter': u'300', u'_kvstore': u'auto', u'k': u'10', u'_num_slices': u'1'}\u001b[0m\n", 508 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Number of GPUs being used: 0\u001b[0m\n", 509 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] number of center slices 1\u001b[0m\n", 510 | "\u001b[31m#metrics {\"Metrics\": {\"Max Batches Seen Between Resets\": {\"count\": 1, \"max\": 1, \"sum\": 1.0, \"min\": 1}, \"Number of Batches Since Last Reset\": {\"count\": 1, \"max\": 1, \"sum\": 1.0, \"min\": 1}, \"Number of Records Since Last Reset\": {\"count\": 1, \"max\": 5000, \"sum\": 5000.0, \"min\": 5000}, \"Total Batches Seen\": {\"count\": 1, \"max\": 1, \"sum\": 1.0, \"min\": 1}, \"Total Records Seen\": {\"count\": 1, \"max\": 5000, \"sum\": 5000.0, \"min\": 5000}, \"Max Records Seen Between Resets\": {\"count\": 1, \"max\": 5000, \"sum\": 5000.0, \"min\": 5000}, \"Reset Count\": {\"count\": 1, \"max\": 0, \"sum\": 0.0, \"min\": 0}}, \"EndTime\": 1558100402.361402, \"Dimensions\": {\"Host\": \"algo-1\", \"Meta\": \"init_train_data_iter\", \"Operation\": \"training\", \"Algorithm\": \"AWS/KMeansWebscale\"}, \"StartTime\": 1558100402.361373}\n", 511 | "\u001b[0m\n", 512 | "\u001b[31m[2019-05-17 13:40:02.361] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 0, \"duration\": 45, \"num_examples\": 1, \"num_bytes\": 160000}\u001b[0m\n", 513 | "\u001b[31m[2019-05-17 13:40:02.442] [tensorio] [info] epoch_stats={\"data_pipeline\": \"/opt/ml/input/data/train\", \"epoch\": 1, \"duration\": 80, \"num_examples\": 4, \"num_bytes\": 576000}\u001b[0m\n", 514 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] processed a total of 18000 examples\u001b[0m\n", 515 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] #progress_metric: host=algo-1, completed 100 % of epochs\u001b[0m\n", 516 | "\u001b[31m#metrics {\"Metrics\": {\"Max Batches Seen Between Resets\": {\"count\": 1, \"max\": 4, \"sum\": 4.0, \"min\": 4}, \"Number of Batches Since Last Reset\": {\"count\": 1, \"max\": 4, \"sum\": 4.0, \"min\": 4}, \"Number of Records Since Last Reset\": {\"count\": 1, \"max\": 18000, \"sum\": 18000.0, \"min\": 18000}, 
\"Total Batches Seen\": {\"count\": 1, \"max\": 5, \"sum\": 5.0, \"min\": 5}, \"Total Records Seen\": {\"count\": 1, \"max\": 23000, \"sum\": 23000.0, \"min\": 23000}, \"Max Records Seen Between Resets\": {\"count\": 1, \"max\": 18000, \"sum\": 18000.0, \"min\": 18000}, \"Reset Count\": {\"count\": 1, \"max\": 1, \"sum\": 1.0, \"min\": 1}}, \"EndTime\": 1558100402.443082, \"Dimensions\": {\"Host\": \"algo-1\", \"Meta\": \"training_data_iter\", \"Operation\": \"training\", \"Algorithm\": \"AWS/KMeansWebscale\", \"epoch\": 0}, \"StartTime\": 1558100402.361611}\n", 517 | "\u001b[0m\n", 518 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] #throughput_metric: host=algo-1, train throughput=220579.989949 records/second\u001b[0m\n", 519 | "\u001b[31m[05/17/2019 13:40:02 WARNING 140619722299200] wait_for_all_workers will not sync workers since the kv store is not running distributed\u001b[0m\n", 520 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] shrinking 100 centers into 10\u001b[0m\n", 521 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] local kmeans attempt #0. Current mean square distance 27.843344\u001b[0m\n", 522 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] local kmeans attempt #1. Current mean square distance 28.573576\u001b[0m\n", 523 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] local kmeans attempt #2. Current mean square distance 25.915253\u001b[0m\n", 524 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] local kmeans attempt #3. Current mean square distance 26.098783\u001b[0m\n", 525 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] local kmeans attempt #4. Current mean square distance 27.912024\u001b[0m\n", 526 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] local kmeans attempt #5. Current mean square distance 25.231770\u001b[0m\n", 527 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] local kmeans attempt #6. Current mean square distance 26.845171\u001b[0m\n", 528 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] local kmeans attempt #7. Current mean square distance 27.047560\u001b[0m\n", 529 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] local kmeans attempt #8. Current mean square distance 28.046276\u001b[0m\n", 530 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] local kmeans attempt #9. Current mean square distance 25.349459\u001b[0m\n", 531 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] finished shrinking process. 
Mean Square Distance = 25\u001b[0m\n", 532 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] #quality_metric: host=algo-1, train msd =25.2317695618\u001b[0m\n", 533 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] compute all data-center distances: inner product took: 24.4517%, (0.021202 secs)\u001b[0m\n", 534 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] predict compute msd took: 18.1709%, (0.015756 secs)\u001b[0m\n", 535 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] gradient: cluster size took: 9.9631%, (0.008639 secs)\u001b[0m\n", 536 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] gradient: cluster center took: 8.7569%, (0.007593 secs)\u001b[0m\n", 537 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] collect from kv store took: 8.4902%, (0.007362 secs)\u001b[0m\n", 538 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] splitting centers key-value pair took: 8.0024%, (0.006939 secs)\u001b[0m\n", 539 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] batch data loading with context took: 7.8072%, (0.006770 secs)\u001b[0m\n", 540 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] compute all data-center distances: point norm took: 6.0801%, (0.005272 secs)\u001b[0m\n", 541 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] gradient: one_hot took: 5.3672%, (0.004654 secs)\u001b[0m\n", 542 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] update state and report convergance took: 1.9272%, (0.001671 secs)\u001b[0m\n", 543 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] compute all data-center distances: center norm took: 0.6530%, (0.000566 secs)\u001b[0m\n", 544 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] update set-up time took: 0.2343%, (0.000203 secs)\u001b[0m\n", 545 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] predict minus dist took: 0.0960%, (0.000083 secs)\u001b[0m\n", 546 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] TOTAL took: 0.0867109298706\u001b[0m\n", 547 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Number of GPUs being used: 0\u001b[0m\n", 548 | "\u001b[31m#metrics {\"Metrics\": {\"finalize.time\": {\"count\": 1, \"max\": 349.2088317871094, \"sum\": 349.2088317871094, \"min\": 349.2088317871094}, \"initialize.time\": {\"count\": 1, \"max\": 38.21587562561035, \"sum\": 38.21587562561035, \"min\": 38.21587562561035}, \"model.serialize.time\": {\"count\": 1, \"max\": 0.14901161193847656, \"sum\": 0.14901161193847656, \"min\": 0.14901161193847656}, \"update.time\": {\"count\": 1, \"max\": 81.25615119934082, \"sum\": 81.25615119934082, \"min\": 81.25615119934082}, \"epochs\": {\"count\": 1, \"max\": 1, \"sum\": 1.0, \"min\": 1}, \"state.serialize.time\": {\"count\": 1, \"max\": 2.516031265258789, \"sum\": 2.516031265258789, \"min\": 2.516031265258789}, \"_shrink.time\": {\"count\": 1, \"max\": 347.4760055541992, \"sum\": 347.4760055541992, \"min\": 347.4760055541992}}, \"EndTime\": 1558100402.795428, \"Dimensions\": {\"Host\": \"algo-1\", \"Operation\": \"training\", \"Algorithm\": \"AWS/KMeansWebscale\"}, \"StartTime\": 1558100402.316022}\n", 549 | "\u001b[0m\n", 550 | "\u001b[31m[05/17/2019 13:40:02 INFO 140619722299200] Test data is not provided.\u001b[0m\n", 551 | "\u001b[31m#metrics {\"Metrics\": {\"totaltime\": {\"count\": 1, \"max\": 547.003984451294, \"sum\": 547.003984451294, \"min\": 547.003984451294}, \"setuptime\": {\"count\": 1, \"max\": 15.492916107177734, \"sum\": 15.492916107177734, \"min\": 15.492916107177734}}, \"EndTime\": 
1558100402.795768, \"Dimensions\": {\"Host\": \"algo-1\", \"Operation\": \"training\", \"Algorithm\": \"AWS/KMeansWebscale\"}, \"StartTime\": 1558100402.795523}\n", 552 | "\u001b[0m\n" 553 | ] 554 | }, 555 | { 556 | "name": "stdout", 557 | "output_type": "stream", 558 | "text": [ 559 | "Billable seconds: 28\n", 560 | "CPU times: user 695 ms, sys: 6.68 ms, total: 702 ms\n", 561 | "Wall time: 3min 12s\n" 562 | ] 563 | } 564 | ], 565 | "source": [ 566 | "%%time\n", 567 | "kmeans.fit(kmeans.record_set(data_train), job_name=job_name)" 568 | ] 569 | }, 570 | { 571 | "cell_type": "markdown", 572 | "metadata": {}, 573 | "source": [ 574 | "## Step 4: Viewing the results\n", 575 | "In this step we will take a look at the model artifact SageMaker created for us and stored onto S3. We have to do a few special things to see the latitude and longitude for our 10 clusters (and the center points of those clusters)\n", 576 | "\n", 577 | "[See the documentation of deserilization here](https://docs.aws.amazon.com/sagemaker/latest/dg/cdf-training.html#td-deserialization)\n", 578 | "\n", 579 | "At this point we need to \"deserilize\" the model artifact. Here we are going to open and review them in our notebook instance. We can unzip the model artifact which will contain model_algo-1. This is just a serialized Apache MXNet object. From here we can load that serialized object into a numpy.ndarray and then extract the clustered centroids from the numpy.ndarray.\n", 580 | "\n", 581 | "After we extract the results into a DataFrame of latitudes and longitudes, we can create a CSV with that data, load it onto S3 and then visualize it with QuickSight." 582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": 17, 587 | "metadata": {}, 588 | "outputs": [ 589 | { 590 | "data": { 591 | "text/plain": [ 592 | "2304" 593 | ] 594 | }, 595 | "execution_count": 17, 596 | "metadata": {}, 597 | "output_type": "execute_result" 598 | } 599 | ], 600 | "source": [ 601 | "import os\n", 602 | "model_key = 'model-artifacts/' + job_name + '/output/model.tar.gz'\n", 603 | "\n", 604 | "boto3.resource('s3').Bucket(bucket).download_file(model_key, 'model.tar.gz')\n", 605 | "os.system('tar -zxvf model.tar.gz')\n", 606 | "os.system('unzip model_algo-1')" 607 | ] 608 | }, 609 | { 610 | "cell_type": "code", 611 | "execution_count": 18, 612 | "metadata": { 613 | "scrolled": true 614 | }, 615 | "outputs": [ 616 | { 617 | "name": "stdout", 618 | "output_type": "stream", 619 | "text": [ 620 | "Collecting mxnet\n", 621 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/58/f4/bc147a1ba7175f9890523ff8f1a928a43ac8a79d5897a067158cac4d092f/mxnet-1.4.1-py2.py3-none-manylinux1_x86_64.whl (28.4MB)\n", 622 | "\u001b[K 100% |████████████████████████████████| 28.4MB 1.6MB/s eta 0:00:01 0% |▎ | 235kB 35.0MB/s eta 0:00:01 68% |██████████████████████ | 19.6MB 45.2MB/s eta 0:00:01\n", 623 | "\u001b[?25hCollecting numpy<1.15.0,>=1.8.2 (from mxnet)\n", 624 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/e5/c4/395ebb218053ba44d64935b3729bc88241ec279915e72100c5979db10945/numpy-1.14.6-cp36-cp36m-manylinux1_x86_64.whl (13.8MB)\n", 625 | "\u001b[K 100% |████████████████████████████████| 13.8MB 5.4MB/s eta 0:00:01\n", 626 | "\u001b[?25hRequirement already satisfied: requests>=2.20.0 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from mxnet) (2.20.1)\n", 627 | "Collecting graphviz<0.9.0,>=0.8.1 (from mxnet)\n", 628 | " Downloading 
https://files.pythonhosted.org/packages/53/39/4ab213673844e0c004bed8a0781a0721a3f6bb23eb8854ee75c236428892/graphviz-0.8.4-py2.py3-none-any.whl\n", 629 | "Requirement already satisfied: urllib3<1.25,>=1.21.1 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from requests>=2.20.0->mxnet) (1.23)\n", 630 | "Requirement already satisfied: idna<2.8,>=2.5 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from requests>=2.20.0->mxnet) (2.6)\n", 631 | "Requirement already satisfied: certifi>=2017.4.17 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from requests>=2.20.0->mxnet) (2019.3.9)\n", 632 | "Requirement already satisfied: chardet<3.1.0,>=3.0.2 in /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages (from requests>=2.20.0->mxnet) (3.0.4)\n", 633 | "Installing collected packages: numpy, graphviz, mxnet\n", 634 | " Found existing installation: numpy 1.15.4\n", 635 | " Uninstalling numpy-1.15.4:\n", 636 | " Successfully uninstalled numpy-1.15.4\n", 637 | "Successfully installed graphviz-0.8.4 mxnet-1.4.1 numpy-1.14.6\n", 638 | "\u001b[33mYou are using pip version 10.0.1, however version 19.1.1 is available.\n", 639 | "You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n" 640 | ] 641 | } 642 | ], 643 | "source": [ 644 | "!pip install mxnet" 645 | ] 646 | }, 647 | { 648 | "cell_type": "code", 649 | "execution_count": 19, 650 | "metadata": {}, 651 | "outputs": [], 652 | "source": [ 653 | "import mxnet as mx\n", 654 | "Kmeans_model_params = mx.ndarray.load('model_algo-1')" 655 | ] 656 | }, 657 | { 658 | "cell_type": "code", 659 | "execution_count": 20, 660 | "metadata": { 661 | "scrolled": true 662 | }, 663 | "outputs": [ 664 | { 665 | "data": { 666 | "text/html": [ 667 | "
\n", 668 | "\n", 681 | "\n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | "
latitudelongitude
041.286369-74.856453
1-3.558636115.825752
235.375927-117.235794
348.4778523.664200
436.134438-97.897385
526.707329-81.378113
646.405426-120.561981
725.992533-146.748108
838.832069-85.299072
961.760826-148.924332
\n", 742 | "
" 743 | ], 744 | "text/plain": [ 745 | " latitude longitude\n", 746 | "0 41.286369 -74.856453\n", 747 | "1 -3.558636 115.825752\n", 748 | "2 35.375927 -117.235794\n", 749 | "3 48.477852 3.664200\n", 750 | "4 36.134438 -97.897385\n", 751 | "5 26.707329 -81.378113\n", 752 | "6 46.405426 -120.561981\n", 753 | "7 25.992533 -146.748108\n", 754 | "8 38.832069 -85.299072\n", 755 | "9 61.760826 -148.924332" 756 | ] 757 | }, 758 | "execution_count": 20, 759 | "metadata": {}, 760 | "output_type": "execute_result" 761 | } 762 | ], 763 | "source": [ 764 | "cluster_centroids_kmeans = pd.DataFrame(Kmeans_model_params[0].asnumpy())\n", 765 | "cluster_centroids_kmeans.columns=df_geo.columns\n", 766 | "cluster_centroids_kmeans" 767 | ] 768 | }, 769 | { 770 | "cell_type": "markdown", 771 | "metadata": {}, 772 | "source": [ 773 | "Let's go ahead and upload this dataset onto S3 and view within QuickSight" 774 | ] 775 | }, 776 | { 777 | "cell_type": "code", 778 | "execution_count": 23, 779 | "metadata": {}, 780 | "outputs": [ 781 | { 782 | "data": { 783 | "text/plain": [ 784 | "{'ResponseMetadata': {'RequestId': '5B2C5F338C4D94A2',\n", 785 | " 'HostId': 'EBaTdqW46uapRaIWfzr0UMENSLV4vuhXsUML53S9b4QC4MP0heG2FEcRYKJqYeSum2J8ikhHdrY=',\n", 786 | " 'HTTPStatusCode': 200,\n", 787 | " 'HTTPHeaders': {'x-amz-id-2': 'EBaTdqW46uapRaIWfzr0UMENSLV4vuhXsUML53S9b4QC4MP0heG2FEcRYKJqYeSum2J8ikhHdrY=',\n", 788 | " 'x-amz-request-id': '5B2C5F338C4D94A2',\n", 789 | " 'date': 'Fri, 17 May 2019 13:53:38 GMT',\n", 790 | " 'etag': '\"51e129efa7a05a163e90bd3fd0433c70\"',\n", 791 | " 'content-length': '0',\n", 792 | " 'server': 'AmazonS3'},\n", 793 | " 'RetryAttempts': 0},\n", 794 | " 'ETag': '\"51e129efa7a05a163e90bd3fd0433c70\"'}" 795 | ] 796 | }, 797 | "execution_count": 23, 798 | "metadata": {}, 799 | "output_type": "execute_result" 800 | } 801 | ], 802 | "source": [ 803 | "from io import StringIO\n", 804 | "\n", 805 | "csv_buffer = StringIO()\n", 806 | "cluster_centroids_kmeans.to_csv(csv_buffer, index=False)\n", 807 | "s3_resource = boto3.resource('s3')\n", 808 | "s3_resource.Object(bucket, 'results/ten_locations_kmeans.csv').put(Body=csv_buffer.getvalue())" 809 | ] 810 | }, 811 | { 812 | "cell_type": "code", 813 | "execution_count": null, 814 | "metadata": {}, 815 | "outputs": [], 816 | "source": [] 817 | } 818 | ], 819 | "metadata": { 820 | "kernelspec": { 821 | "display_name": "conda_python3", 822 | "language": "python", 823 | "name": "conda_python3" 824 | }, 825 | "language_info": { 826 | "codemirror_mode": { 827 | "name": "ipython", 828 | "version": 3 829 | }, 830 | "file_extension": ".py", 831 | "mimetype": "text/x-python", 832 | "name": "python", 833 | "nbconvert_exporter": "python", 834 | "pygments_lexer": "ipython3", 835 | "version": "3.6.5" 836 | } 837 | }, 838 | "nbformat": 4, 839 | "nbformat_minor": 2 840 | } 841 | -------------------------------------------------------------------------------- /Chapter7/ufo-algorithms-lab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## UFO Sightings Algorithms Lab\n", 8 | "\n", 9 | "The goal of this notebook is to build out models to use for predicting the legitimacy of a UFO sighting using the XGBoost and Linear Learner algorithm.\n", 10 | "\n", 11 | "What we plan on accompishling is the following:\n", 12 | "1. [Load dataset onto Notebook instance memory from S3](#Step-1:-Load-the-data-from-Amazon-S3)\n", 13 | "1. 
[Cleaning, transforming, analyize, and preparing the dataset](#Step-2:-Cleaning,-transforming,-analyize,-and-preparing-the-dataset)\n", 14 | "1. [Create and train our model (XGBoost)](#Step-3:-Creating-and-training-our-model-(XGBoost))\n", 15 | "1. [Create and train our model (Linear Learner)](#Step-4:-Creating-and-training-our-model-(Linear-Learner))" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "First let's go ahead and import all the needed libraries." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "import numpy as np\n", 33 | "from datetime import datetime\n", 34 | "import io\n", 35 | "import sagemaker.amazon.common as smac\n", 36 | "\n", 37 | "import boto3\n", 38 | "from sagemaker import get_execution_role\n", 39 | "import sagemaker\n", 40 | "\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import seaborn as sns" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Step 1: Loading the data from Amazon S3\n", 50 | "Let's get the UFO sightings data that is stored in S3 and load it into memory." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "role = get_execution_role()\n", 60 | "bucket=''\n", 61 | "sub_folder = 'ufo_dataset'\n", 62 | "data_key = 'ufo_fullset.csv'\n", 63 | "data_location = 's3://{}/{}/{}'.format(bucket, sub_folder, data_key)\n", 64 | "\n", 65 | "df = pd.read_csv(data_location, low_memory=False)\n", 66 | "df.head()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "
" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Step 2: Cleaning, transforming, analyize, and preparing the dataset\n", 81 | "This step is so important. It's crucial that we clean and prepare our data before we do anything else." 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": { 88 | "scrolled": true 89 | }, 90 | "outputs": [], 91 | "source": [ 92 | "# Let's check to see if there are any missing values\n", 93 | "missing_values = df.isnull().values.any()\n", 94 | "if(missing_values):\n", 95 | " display(df[df.isnull().any(axis=1)])" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": null, 101 | "metadata": {}, 102 | "outputs": [], 103 | "source": [ 104 | "df['shape'].value_counts()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# Replace the missing values with the most common shape\n", 114 | "df['shape'] = df['shape'].fillna(df['shape'].value_counts().index[0])" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "metadata": { 120 | "scrolled": true 121 | }, 122 | "source": [ 123 | "Let's go ahead and start preparing our dataset by transforming some of the values into the correct data types. Here is what we are going to take care of.\n", 124 | "1. Convert the `reportedTimestamp` and `eventDate` to a datetime data types.\n", 125 | "1. Convert the `shape` and `weather` to a category data type.\n", 126 | "1. Map the `physicalEvidence` and `contact` from 'Y', 'N' to `0`, `1`.\n", 127 | "1. Convert the `researchOutcome` to a category data type (target attribute)." 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "df['reportedTimestamp'] = pd.to_datetime(df['reportedTimestamp'])\n", 137 | "df['eventDate'] = pd.to_datetime(df['eventDate'])\n", 138 | "\n", 139 | "df['shape'] = df['shape'].astype('category')\n", 140 | "df['weather'] = df['weather'].astype('category')\n", 141 | "\n", 142 | "df['physicalEvidence'] = df['physicalEvidence'].replace({'Y': 1, 'N': 0})\n", 143 | "df['contact'] = df['contact'].replace({'Y': 1, 'N': 0})\n", 144 | "\n", 145 | "df['researchOutcome'] = df['researchOutcome'].astype('category')" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "df.dtypes" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "Let's visualize some of the data to see if we can find out any important information." 
162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": {}, 168 | "outputs": [], 169 | "source": [ 170 | "%matplotlib inline\n", 171 | "sns.set_context(\"paper\", font_scale=1.4)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": { 178 | "scrolled": false 179 | }, 180 | "outputs": [], 181 | "source": [ 182 | "m_cts = (df['contact'].value_counts())\n", 183 | "m_ctsx = m_cts.index\n", 184 | "m_ctsy = m_cts.to_numpy()\n", 185 | "f, ax = plt.subplots(figsize=(5,5))\n", 186 | "\n", 187 | "sns.barplot(x=m_ctsx, y=m_ctsy)\n", 188 | "ax.set_title('UFO Sightings and Contact')\n", 189 | "ax.set_xlabel('Was contact made?')\n", 190 | "ax.set_ylabel('Number of Sightings')\n", 191 | "ax.set_xticklabels(['No', 'Yes'])\n", 192 | "plt.xticks(rotation=45)\n", 193 | "plt.show()" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "scrolled": false 201 | }, 202 | "outputs": [], 203 | "source": [ 204 | "m_cts = (df['physicalEvidence'].value_counts())\n", 205 | "m_ctsx = m_cts.index\n", 206 | "m_ctsy = m_cts.to_numpy()\n", 207 | "f, ax = plt.subplots(figsize=(5,5))\n", 208 | "\n", 209 | "sns.barplot(x=m_ctsx, y=m_ctsy)\n", 210 | "ax.set_title('UFO Sightings and Physical Evidence')\n", 211 | "ax.set_xlabel('Was there physical evidence?')\n", 212 | "ax.set_ylabel('Number of Sightings')\n", 213 | "ax.set_xticklabels(['No', 'Yes'])\n", 214 | "plt.xticks(rotation=45)\n", 215 | "plt.show()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": null, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "m_cts = (df['shape'].value_counts())\n", 225 | "m_ctsx = m_cts.index\n", 226 | "m_ctsy = m_cts.to_numpy()\n", 227 | "f, ax = plt.subplots(figsize=(9,5))\n", 228 | "\n", 229 | "sns.barplot(x=m_ctsx, y=m_ctsy)\n", 230 | "ax.set_title('UFO Sightings by Shape')\n", 231 | "ax.set_xlabel('UFO Shape')\n", 232 | "ax.set_ylabel('Number of Sightings')\n", 233 | "plt.xticks(rotation=45)\n", 234 | "plt.show()" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": { 241 | "scrolled": false 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "m_cts = (df['weather'].value_counts())\n", 246 | "m_ctsx = m_cts.index\n", 247 | "m_ctsy = m_cts.to_numpy()\n", 248 | "f, ax = plt.subplots(figsize=(5,5))\n", 249 | "\n", 250 | "sns.barplot(x=m_ctsx, y=m_ctsy)\n", 251 | "ax.set_title('UFO Sightings by Weather')\n", 252 | "ax.set_xlabel('Weather')\n", 253 | "ax.set_ylabel('Number of Sightings')\n", 254 | "plt.xticks(rotation=45)\n", 255 | "plt.show()" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": null, 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "m_cts = (df['researchOutcome'].value_counts())\n", 265 | "m_ctsx = m_cts.index\n", 266 | "m_ctsy = m_cts.to_numpy()\n", 267 | "f, ax = plt.subplots(figsize=(5,5))\n", 268 | "\n", 269 | "sns.barplot(x=m_ctsx, y=m_ctsy)\n", 270 | "ax.set_title('UFO Sightings and Research Outcome')\n", 271 | "ax.set_xlabel('Research Outcome')\n", 272 | "ax.set_ylabel('Number of Sightings')\n", 273 | "plt.xticks(rotation=45)\n", 274 | "plt.show()" 275 | ] 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "scrolled": true 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "ufo_yr = df['eventDate'].dt.year # series with the year exclusively\n", 286 | "\n", 287 | "## Set axes ##\n", 288 
| "years_data = ufo_yr.value_counts()\n", 289 | "years_index = years_data.index # x ticks\n", 290 | "years_values = years_data.to_numpy()\n", 291 | "\n", 292 | "## Create Bar Plot ##\n", 293 | "plt.figure(figsize=(15,8))\n", 294 | "plt.xticks(rotation = 60)\n", 295 | "plt.title('UFO Sightings by Year')\n", 296 | "plt.ylabel('Number of Sightings')\n", 297 | "plt.xlabel('Year')\n", 298 | "\n", 299 | "years_plot = sns.barplot(x=years_index[:60],y=years_values[:60])" 300 | ] 301 | }, 302 | { 303 | "cell_type": "code", 304 | "execution_count": null, 305 | "metadata": { 306 | "scrolled": true 307 | }, 308 | "outputs": [], 309 | "source": [ 310 | "df.corr()" 311 | ] 312 | }, 313 | { 314 | "cell_type": "markdown", 315 | "metadata": {}, 316 | "source": [ 317 | "Let's drop the columns that are not important. \n", 318 | "1. We can drop `sighting` becuase it is always 'Y' or Yes. \n", 319 | "1. Let's drop the `firstName` and `lastName` becuase they are not important in determining the `researchOutcome`.\n", 320 | "1. Let's drop the `reportedTimestamp` becuase when the sighting was reporting isn't going to help us determine the legitimacy of the sighting.\n", 321 | "1. We would need to create some sort of buckets for the `eventDate` and `eventTime`, like seasons for example, but since the distribution of dates is pretty even, let's go ahead and drop them." 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": null, 327 | "metadata": {}, 328 | "outputs": [], 329 | "source": [ 330 | "df.drop(columns=['firstName', 'lastName', 'sighting', 'reportedTimestamp', 'eventDate', 'eventTime'], inplace=True)" 331 | ] 332 | }, 333 | { 334 | "cell_type": "code", 335 | "execution_count": null, 336 | "metadata": {}, 337 | "outputs": [], 338 | "source": [ 339 | "df.head()" 340 | ] 341 | }, 342 | { 343 | "cell_type": "markdown", 344 | "metadata": {}, 345 | "source": [ 346 | "Let's apply one-hot encoding\n", 347 | "1. We need to one-hot both the `weather` attribute and the `shape` attribute. \n", 348 | "1. We also need to transform or map the researchOutcome (target) attribute into numeric values. This is what the alogrithm is expecting. We can do this by mapping unexplained, explained, and probable to 0, 1, 2." 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": null, 354 | "metadata": {}, 355 | "outputs": [], 356 | "source": [ 357 | "# Let's one-hot the weather and shape attribute\n", 358 | "df = pd.get_dummies(df, columns=['weather', 'shape'])\n", 359 | "\n", 360 | "# Let's replace the researchOutcome values with 0, 1, 2 for Unexplained, Explained, and Probable\n", 361 | "df['researchOutcome'] = df['researchOutcome'].replace({'unexplained': 0, 'explained': 1, 'probable': 2})" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": { 368 | "scrolled": false 369 | }, 370 | "outputs": [], 371 | "source": [ 372 | "display(df.head())\n", 373 | "display(df.shape)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": {}, 379 | "source": [ 380 | "
" 381 | ] 382 | }, 383 | { 384 | "cell_type": "markdown", 385 | "metadata": {}, 386 | "source": [ 387 | "Let's randomize and split the data into training, validation, and testing.\n", 388 | "1. First we need to randomize the data.\n", 389 | "1. Next Let's use 80% of the dataset for our training set.\n", 390 | "1. Then use 10% for validation during training.\n", 391 | "1. Finally we will use 10% for testing our model after it is deployed." 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": null, 397 | "metadata": {}, 398 | "outputs": [], 399 | "source": [ 400 | "# Let's go ahead and randomize our data.\n", 401 | "df = df.sample(frac=1).reset_index(drop=True)\n", 402 | "\n", 403 | "# Next, Let's split the data into a training, validation, and testing.\n", 404 | "rand_split = np.random.rand(len(df))\n", 405 | "train_list = rand_split < 0.8 # 80% for training\n", 406 | "val_list = (rand_split >= 0.8) & (rand_split < 0.9) # 10% for validation\n", 407 | "test_list = rand_split >= 0.9 # 10% for testing\n", 408 | "\n", 409 | " # This dataset will be used to train the model.\n", 410 | "data_train = df[train_list]\n", 411 | "\n", 412 | "# This dataset will be used to validate the model.\n", 413 | "data_val = df[val_list]\n", 414 | "\n", 415 | "# This dataset will be used to test the model.\n", 416 | "data_test = df[test_list]" 417 | ] 418 | }, 419 | { 420 | "cell_type": "markdown", 421 | "metadata": {}, 422 | "source": [ 423 | "Next, let's go ahead and rearrange our attributes so the first attribute is our target attribute `researchOutcome`. This is what AWS requires and the XGBoost algorithms expects. You can read all about it here in the [documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html#InputOutput-XGBoost).\n", 424 | "\n", 425 | "After that we will go ahead and create those files on our Notebook instance (stored as CSV) and then upload them to S3. " 426 | ] 427 | }, 428 | { 429 | "cell_type": "code", 430 | "execution_count": null, 431 | "metadata": {}, 432 | "outputs": [], 433 | "source": [ 434 | "# Simply moves the researchOutcome attribute to the first position before creating CSV files\n", 435 | "pd.concat([data_train['researchOutcome'], data_train.drop(['researchOutcome'], axis=1)], axis=1).to_csv('train.csv', index=False, header=False)\n", 436 | "pd.concat([data_val['researchOutcome'], data_val.drop(['researchOutcome'], axis=1)], axis=1).to_csv('validation.csv', index=False, header=False)\n", 437 | "\n", 438 | "# Next we can take the files we just stored onto our Notebook instance and upload them to S3.\n", 439 | "boto3.Session().resource('s3').Bucket(bucket).Object('algorithms_lab/xgboost_train/train.csv').upload_file('train.csv')\n", 440 | "boto3.Session().resource('s3').Bucket(bucket).Object('algorithms_lab/xgboost_validation/validation.csv').upload_file('validation.csv')\n" 441 | ] 442 | }, 443 | { 444 | "cell_type": "markdown", 445 | "metadata": {}, 446 | "source": [ 447 | "
" 448 | ] 449 | }, 450 | { 451 | "cell_type": "markdown", 452 | "metadata": {}, 453 | "source": [ 454 | "## Step 3: Creating and training our model (XGBoost)\n", 455 | "\n", 456 | "This is where the magic happens. We will get the ECR container hosted in ECR for the XGBoost algorithm. " 457 | ] 458 | }, 459 | { 460 | "cell_type": "code", 461 | "execution_count": null, 462 | "metadata": {}, 463 | "outputs": [], 464 | "source": [ 465 | "from sagemaker import image_uris\n", 466 | "container = image_uris.retrieve('xgboost', boto3.Session().region_name, '1')" 467 | ] 468 | }, 469 | { 470 | "cell_type": "markdown", 471 | "metadata": {}, 472 | "source": [ 473 | "Next, because we're training with the CSV file format, we'll create inputs that our training function can use as a pointer to the files in S3, which also specify that the content type is CSV." 474 | ] 475 | }, 476 | { 477 | "cell_type": "code", 478 | "execution_count": null, 479 | "metadata": {}, 480 | "outputs": [], 481 | "source": [ 482 | "s3_input_train = sagemaker.inputs.TrainingInput(s3_data='s3://{}/algorithms_lab/xgboost_train'.format(bucket), content_type='csv')\n", 483 | "s3_input_validation = sagemaker.inputs.TrainingInput(s3_data='s3://{}/algorithms_lab/xgboost_validation'.format(bucket), content_type='csv')\n" 484 | ] 485 | }, 486 | { 487 | "cell_type": "markdown", 488 | "metadata": {}, 489 | "source": [ 490 | "
" 491 | ] 492 | }, 493 | { 494 | "cell_type": "markdown", 495 | "metadata": {}, 496 | "source": [ 497 | "Next we start building out our model by using the SageMaker Python SDK and passing in everything that is required to create a XGBoost model.\n", 498 | "\n", 499 | "First I like to always create a specific job name.\n", 500 | "\n", 501 | "Next, we'll need to specify training parameters.\n", 502 | "1. The `xgboost` algorithm container\n", 503 | "1. The IAM role to use\n", 504 | "1. Training instance type and count\n", 505 | "1. S3 location for output data/model artifact\n", 506 | "1. [XGBoost Hyperparameters](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost_hyperparameters.html)\n", 507 | "\n", 508 | "\n", 509 | "Finally, after everything is included and ready, then we can call the `.fit()` function which specifies the S3 location for training and validation data." 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": {}, 516 | "outputs": [], 517 | "source": [ 518 | "# Create a training job name\n", 519 | "job_name = 'ufo-xgboost-job-{}'.format(datetime.now().strftime(\"%Y%m%d%H%M%S\"))\n", 520 | "print('Here is the job name {}'.format(job_name))\n", 521 | "\n", 522 | "# Here is where the model artifact will be stored\n", 523 | "output_location = 's3://{}/algorithms_lab/xgboost_output'.format(bucket)" 524 | ] 525 | }, 526 | { 527 | "cell_type": "code", 528 | "execution_count": null, 529 | "metadata": { 530 | "scrolled": true 531 | }, 532 | "outputs": [], 533 | "source": [ 534 | "sess = sagemaker.Session()\n", 535 | "\n", 536 | "xgb = sagemaker.estimator.Estimator(container,\n", 537 | " role, \n", 538 | " instance_count=1, \n", 539 | " instance_type='ml.m4.xlarge',\n", 540 | " output_path=output_location,\n", 541 | " sagemaker_session=sess)\n", 542 | "\n", 543 | "xgb.set_hyperparameters(objective='multi:softmax',\n", 544 | " num_class=3,\n", 545 | " num_round=100)\n", 546 | "\n", 547 | "data_channels = {\n", 548 | " 'train': s3_input_train,\n", 549 | " 'validation': s3_input_validation\n", 550 | "}\n", 551 | "xgb.fit(data_channels, job_name=job_name) " 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": { 558 | "scrolled": true 559 | }, 560 | "outputs": [], 561 | "source": [ 562 | "print('Here is the location of the trained XGBoost model: {}/{}/output/model.tar.gz'.format(output_location, job_name))" 563 | ] 564 | }, 565 | { 566 | "cell_type": "markdown", 567 | "metadata": {}, 568 | "source": [ 569 | "After we train our model we can see the default evaluation metric in the logs. The `merror` is used in multiclass classification error rate. It is calculated as #(wrong cases)/#(all cases). We want this to be minimized (so we want this to be super small)." 570 | ] 571 | }, 572 | { 573 | "cell_type": "markdown", 574 | "metadata": {}, 575 | "source": [ 576 | "---\n", 577 | "\n", 578 | "## Step 4: Creating and training our model (Linear Learner)\n", 579 | "\n", 580 | "Let's evaluate the Linear Learner algorithm as well. Let's go ahead and randomize the data again and get it ready for the Linear Leaner algorithm. 
We will also rearrange the columns so it is ready for the algorithm (it expects the first column to be the target attribute)" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": null, 586 | "metadata": {}, 587 | "outputs": [], 588 | "source": [ 589 | "np.random.seed(0)\n", 590 | "rand_split = np.random.rand(len(df))\n", 591 | "train_list = rand_split < 0.8\n", 592 | "val_list = (rand_split >= 0.8) & (rand_split < 0.9)\n", 593 | "test_list = rand_split >= 0.9\n", 594 | "\n", 595 | " # This dataset will be used to train the model.\n", 596 | "data_train = df[train_list]\n", 597 | "\n", 598 | "# This dataset will be used to validate the model.\n", 599 | "data_val = df[val_list]\n", 600 | "\n", 601 | "# This dataset will be used to test the model.\n", 602 | "data_test = df[test_list]\n", 603 | "\n", 604 | "# This rearranges the columns\n", 605 | "cols = list(data_train)\n", 606 | "cols.insert(0, cols.pop(cols.index('researchOutcome')))\n", 607 | "data_train = data_train[cols]\n", 608 | "\n", 609 | "cols = list(data_val)\n", 610 | "cols.insert(0, cols.pop(cols.index('researchOutcome')))\n", 611 | "data_val = data_val[cols]\n", 612 | "\n", 613 | "cols = list(data_test)\n", 614 | "cols.insert(0, cols.pop(cols.index('researchOutcome')))\n", 615 | "data_test = data_test[cols]\n", 616 | "\n", 617 | "# Breaks the datasets into attribute numpy.ndarray and the same for target attribute. \n", 618 | "train_X = data_train.drop(columns='researchOutcome').values\n", 619 | "train_y = data_train['researchOutcome'].values\n", 620 | "\n", 621 | "val_X = data_val.drop(columns='researchOutcome').values\n", 622 | "val_y = data_val['researchOutcome'].values\n", 623 | "\n", 624 | "test_X = data_test.drop(columns='researchOutcome').values\n", 625 | "test_y = data_test['researchOutcome'].values" 626 | ] 627 | }, 628 | { 629 | "cell_type": "markdown", 630 | "metadata": {}, 631 | "source": [ 632 | "Next, Let's create recordIO file for the training data and upload it to S3." 
633 | ] 634 | }, 635 | { 636 | "cell_type": "code", 637 | "execution_count": null, 638 | "metadata": {}, 639 | "outputs": [], 640 | "source": [ 641 | "train_file = 'ufo_sightings_train_recordIO_protobuf.data'\n", 642 | "\n", 643 | "f = io.BytesIO()\n", 644 | "smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'), train_y.astype('float32'))\n", 645 | "f.seek(0)\n", 646 | "\n", 647 | "boto3.Session().resource('s3').Bucket(bucket).Object('algorithms_lab/linearlearner_train/{}'.format(train_file)).upload_fileobj(f)\n", 648 | "training_recordIO_protobuf_location = 's3://{}/algorithms_lab/linearlearner_train/{}'.format(bucket, train_file)\n", 649 | "print('The Pipe mode recordIO protobuf training data: {}'.format(training_recordIO_protobuf_location))" 650 | ] 651 | }, 652 | { 653 | "cell_type": "markdown", 654 | "metadata": {}, 655 | "source": [ 656 | "Let's create recordIO file for the validation data and upload it to S3" 657 | ] 658 | }, 659 | { 660 | "cell_type": "code", 661 | "execution_count": null, 662 | "metadata": {}, 663 | "outputs": [], 664 | "source": [ 665 | "validation_file = 'ufo_sightings_validatioin_recordIO_protobuf.data'\n", 666 | "\n", 667 | "f = io.BytesIO()\n", 668 | "smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'), val_y.astype('float32'))\n", 669 | "f.seek(0)\n", 670 | "\n", 671 | "boto3.Session().resource('s3').Bucket(bucket).Object('algorithms_lab/linearlearner_validation/{}'.format(validation_file)).upload_fileobj(f)\n", 672 | "validate_recordIO_protobuf_location = 's3://{}/algorithms_lab/linearlearner_validation/{}'.format(bucket, validation_file)\n", 673 | "print('The Pipe mode recordIO protobuf validation data: {}'.format(validate_recordIO_protobuf_location))" 674 | ] 675 | }, 676 | { 677 | "cell_type": "markdown", 678 | "metadata": {}, 679 | "source": [ 680 | "---\n", 681 | "\n", 682 | "Alright we are good to go for the Linear Learner algorithm. Let's get everything we need from the ECR repository to call the Linear Learner algorithm." 683 | ] 684 | }, 685 | { 686 | "cell_type": "code", 687 | "execution_count": null, 688 | "metadata": {}, 689 | "outputs": [], 690 | "source": [ 691 | "from sagemaker import image_uris\n", 692 | "container = image_uris.retrieve('linear-learner', boto3.Session().region_name, '1')" 693 | ] 694 | }, 695 | { 696 | "cell_type": "code", 697 | "execution_count": null, 698 | "metadata": {}, 699 | "outputs": [], 700 | "source": [ 701 | "# Create a training job name\n", 702 | "job_name = 'ufo-linear-learner-job-{}'.format(datetime.now().strftime(\"%Y%m%d%H%M%S\"))\n", 703 | "print('Here is the job name {}'.format(job_name))\n", 704 | "\n", 705 | "# Here is where the model-artifact will be stored\n", 706 | "output_location = 's3://{}/algorithms_lab/linearlearner_output'.format(bucket)" 707 | ] 708 | }, 709 | { 710 | "cell_type": "markdown", 711 | "metadata": {}, 712 | "source": [ 713 | "Next we start building out our model by using the SageMaker Python SDK and passing in everything that is required to create a Linear Learner model.\n", 714 | "\n", 715 | "First I like to always create a specific job name.\n", 716 | "\n", 717 | "Next, we'll need to specify training parameters.\n", 718 | "1. The `linear-learner` algorithm container\n", 719 | "1. The IAM role to use\n", 720 | "1. Training instance type and count\n", 721 | "1. S3 location for output data/model artifact\n", 722 | "1. [The input type (Pipe)](https://docs.aws.amazon.com/sagemaker/latest/dg/linear-learner.html)\n", 723 | "1. 
[Linear Learner Hyperparameters](https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html)\n", 724 | "\n", 725 | "\n", 726 | "Finally, after everything is included and ready, then we can call the `.fit()` function which specifies the S3 location for training and validation data." 727 | ] 728 | }, 729 | { 730 | "cell_type": "code", 731 | "execution_count": null, 732 | "metadata": {}, 733 | "outputs": [], 734 | "source": [ 735 | "print('The feature_dim hyperparameter needs to be set to {}.'.format(data_train.shape[1] - 1))" 736 | ] 737 | }, 738 | { 739 | "cell_type": "code", 740 | "execution_count": null, 741 | "metadata": { 742 | "scrolled": true 743 | }, 744 | "outputs": [], 745 | "source": [ 746 | "sess = sagemaker.Session()\n", 747 | "\n", 748 | "# Setup the LinearLeaner algorithm from the ECR container\n", 749 | "linear = sagemaker.estimator.Estimator(container,\n", 750 | " role, \n", 751 | " instance_count=1, \n", 752 | " instance_type='ml.c4.xlarge',\n", 753 | " output_path=output_location,\n", 754 | " sagemaker_session=sess,\n", 755 | " input_mode='Pipe')\n", 756 | "# Setup the hyperparameters\n", 757 | "linear.set_hyperparameters(feature_dim=22, # number of attributes (minus the researchOutcome attribute)\n", 758 | " predictor_type='multiclass_classifier', # type of classification problem\n", 759 | " num_classes=3) # number of classes in out researchOutcome (explained, unexplained, probable)\n", 760 | "\n", 761 | "\n", 762 | "# Launch a training job. This method calls the CreateTrainingJob API call\n", 763 | "data_channels = {\n", 764 | " 'train': training_recordIO_protobuf_location,\n", 765 | " 'validation': validate_recordIO_protobuf_location\n", 766 | "}\n", 767 | "linear.fit(data_channels, job_name=job_name)" 768 | ] 769 | }, 770 | { 771 | "cell_type": "code", 772 | "execution_count": null, 773 | "metadata": {}, 774 | "outputs": [], 775 | "source": [ 776 | "print('Here is the location of the trained Linear Learner model: {}/{}/output/model.tar.gz'.format(output_location, job_name))" 777 | ] 778 | }, 779 | { 780 | "cell_type": "markdown", 781 | "metadata": {}, 782 | "source": [ 783 | "From here we have two trained models to present to Mr. K. Congratulations!" 784 | ] 785 | }, 786 | { 787 | "cell_type": "code", 788 | "execution_count": null, 789 | "metadata": {}, 790 | "outputs": [], 791 | "source": [] 792 | } 793 | ], 794 | "metadata": { 795 | "kernelspec": { 796 | "display_name": "conda_python3", 797 | "language": "python", 798 | "name": "conda_python3" 799 | }, 800 | "language_info": { 801 | "codemirror_mode": { 802 | "name": "ipython", 803 | "version": 3 804 | }, 805 | "file_extension": ".py", 806 | "mimetype": "text/x-python", 807 | "name": "python", 808 | "nbconvert_exporter": "python", 809 | "pygments_lexer": "ipython3", 810 | "version": "3.6.5" 811 | } 812 | }, 813 | "nbformat": 4, 814 | "nbformat_minor": 2 815 | } 816 | -------------------------------------------------------------------------------- /Chapter8/ufo-evaluation-optimization-lab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## UFO Sightings Evaluation and Optimization Lab\n", 8 | "\n", 9 | "The goal of this notebook is to find out if our optimized model hyperparmeters out performs the training of our baseline Linear Learner model. 
We can also compare things like accurary and see if they differ.\n", 10 | "\n", 11 | "What we plan on accompishling is the following:\n", 12 | "1. [Create and train our \"optimized\" model (Linear Learner)](#1.-Create-and-train-our-%22optimized%22-model-(Linear-Learner))\n", 13 | "1. Compare the results!" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "First let's go ahead and import all the needed libraries." 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "import pandas as pd\n", 30 | "import numpy as np\n", 31 | "from datetime import datetime\n", 32 | "\n", 33 | "\n", 34 | "import boto3\n", 35 | "from sagemaker import get_execution_role\n", 36 | "import sagemaker" 37 | ] 38 | }, 39 | { 40 | "cell_type": "code", 41 | "execution_count": null, 42 | "metadata": {}, 43 | "outputs": [], 44 | "source": [ 45 | "role = get_execution_role()\n", 46 | "bucket=''" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "---\n", 54 | "\n", 55 | "### 1. Create and train our \"optimized\" model (Linear Learner)\n", 56 | "\n", 57 | "Let's evaluate the Linear Learner algorithm with the new optimized hyperparameters. Let's go ahead and get the data that we already stored into S3 as recordIO protobuf data." 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "Let's get the recordIO file for the training data that is in S3" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "train_file = 'ufo_sightings_train_recordIO_protobuf.data'\n", 74 | "training_recordIO_protobuf_location = 's3://{}/algorithms_lab/linearlearner_train/{}'.format(bucket, train_file)\n", 75 | "print('The Pipe mode recordIO protobuf training data: {}'.format(training_recordIO_protobuf_location))" 76 | ] 77 | }, 78 | { 79 | "cell_type": "markdown", 80 | "metadata": {}, 81 | "source": [ 82 | "Let's get the recordIO file for the validation data that is in S3" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "validation_file = 'ufo_sightings_validatioin_recordIO_protobuf.data'\n", 92 | "validate_recordIO_protobuf_location = 's3://{}/algorithms_lab/linearlearner_validation/{}'.format(bucket, validation_file)\n", 93 | "print('The Pipe mode recordIO protobuf validation data: {}'.format(validate_recordIO_protobuf_location))" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "---\n", 101 | "\n", 102 | "Alright we are good to go for the Linear Learner algorithm. Let's get everything we need from the ECR repository to call the Linear Learner algorithm." 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "from sagemaker import image_uris\n", 112 | "container = image_uris.retrieve('linear-learner', boto3.Session().region_name, '1')" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "Let's create a job and use the optimzed hyperparamters." 
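A minimal sketch of where those optimized hyperparameters could come from, assuming the container, role, bucket, and recordIO channel locations defined above; the tuning output path, parameter ranges, and objective metric shown here are illustrative assumptions rather than the course's actual tuning setup:

from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter

# Assumed output location for tuning artifacts under the lab bucket
tuning_output = 's3://{}/optimization_evaluation_lab/linearlearner_tuning_output'.format(bucket)

# Baseline estimator mirroring the settings used elsewhere in this lab
base_estimator = sagemaker.estimator.Estimator(container,
                                               role,
                                               instance_count=1,
                                               instance_type='ml.c4.xlarge',
                                               output_path=tuning_output,
                                               sagemaker_session=sagemaker.Session(),
                                               input_mode='Pipe')
base_estimator.set_hyperparameters(feature_dim=22,
                                   predictor_type='multiclass_classifier',
                                   num_classes=3)

# Illustrative search space; the ranges you tune may differ
hyperparameter_ranges = {
    'learning_rate': ContinuousParameter(0.0001, 0.1),
    'mini_batch_size': IntegerParameter(100, 1000),
    'wd': ContinuousParameter(0.000001, 0.01)
}

tuner = HyperparameterTuner(base_estimator,
                            objective_metric_name='validation:objective_loss',
                            hyperparameter_ranges=hyperparameter_ranges,
                            objective_type='Minimize',
                            max_jobs=10,
                            max_parallel_jobs=2)

# tuner.fit({'train': training_recordIO_protobuf_location,
#            'validation': validate_recordIO_protobuf_location})
# tuner.analytics().dataframe() lists every trial, and tuner.best_training_job()
# names the winner whose hyperparameters you would paste into the cell below.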
120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": { 126 | "scrolled": true 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "# Create a training job name\n", 131 | "job_name = 'ufo-linear-learner-job-optimized-{}'.format(datetime.now().strftime(\"%Y%m%d%H%M%S\"))\n", 132 | "print('Here is the job name {}'.format(job_name))\n", 133 | "\n", 134 | "# Here is where the model-artifact will be stored\n", 135 | "output_location = 's3://{}/optimization_evaluation_lab/linearlearner_optimized_output'.format(bucket)" 136 | ] 137 | }, 138 | { 139 | "cell_type": "markdown", 140 | "metadata": {}, 141 | "source": [ 142 | "Next we can start building out our model by using the SageMaker Python SDK and passing in everything that is required to create a Linear Learner training job.\n", 143 | "\n", 144 | "Here are the [linear learner hyperparameters](https://docs.aws.amazon.com/sagemaker/latest/dg/ll_hyperparameters.html) that we can use within our training job.\n", 145 | "\n", 146 | "After we run this job we can view the results." 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "%%time\n", 156 | "sess = sagemaker.Session()\n", 157 | "\n", 158 | "# Setup the LinearLeaner algorithm from the ECR container\n", 159 | "linear = sagemaker.estimator.Estimator(container,\n", 160 | " role, \n", 161 | " instance_count=1, \n", 162 | " instance_type='ml.c4.xlarge',\n", 163 | " output_path=output_location,\n", 164 | " sagemaker_session=sess,\n", 165 | " input_mode='Pipe')\n", 166 | "# Setup the hyperparameters\n", 167 | "linear.set_hyperparameters( feature_dim=22, \n", 168 | " predictor_type='multiclass_classifier',\n", 169 | " num_classes=3,\n", 170 | " ## enter optimized hyperparameters here\n", 171 | " ## enter optimized hyperparameters here\n", 172 | " ## enter optimized hyperparameters here\n", 173 | " ## enter optimized hyperparameters here\n", 174 | " ## enter optimized hyperparameters here\n", 175 | " ## enter optimized hyperparameters here\n", 176 | " ## enter optimized hyperparameters here\n", 177 | " ## enter optimized hyperparameters here\n", 178 | " ## enter optimized hyperparameters here\n", 179 | " ## enter optimized hyperparameters here)\n", 180 | " )\n", 181 | "\n", 182 | "\n", 183 | "# Launch a training job. This method calls the CreateTrainingJob API call\n", 184 | "data_channels = {\n", 185 | " 'train': training_recordIO_protobuf_location,\n", 186 | " 'validation': validate_recordIO_protobuf_location\n", 187 | "}\n", 188 | "linear.fit(data_channels, job_name=job_name)" 189 | ] 190 | }, 191 | { 192 | "cell_type": "markdown", 193 | "metadata": {}, 194 | "source": [ 195 | "Now we can compare the amount of time billed and the accuracy compared to our baseline model." 
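One way to make that comparison, sketched with the DescribeTrainingJob API; `job_name` is defined above, while `baseline_job_name` is a hypothetical placeholder for the baseline training job name from the earlier algorithms lab:

sm_client = boto3.client('sagemaker')

# Placeholder: substitute the baseline Linear Learner job name from the algorithms lab
baseline_job_name = 'ufo-linear-learner-job-YYYYMMDDHHMMSS'

for name in [baseline_job_name, job_name]:
    desc = sm_client.describe_training_job(TrainingJobName=name)
    # FinalMetricDataList holds the metrics the algorithm emitted, e.g. validation:objective_loss
    final_metrics = {m['MetricName']: m['Value'] for m in desc.get('FinalMetricDataList', [])}
    print(name)
    print('  Billable seconds: {}'.format(desc.get('BillableTimeInSeconds')))
    print('  Final metrics: {}'.format(final_metrics))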
196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": null, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [] 204 | } 205 | ], 206 | "metadata": { 207 | "kernelspec": { 208 | "display_name": "conda_python3", 209 | "language": "python", 210 | "name": "conda_python3" 211 | }, 212 | "language_info": { 213 | "codemirror_mode": { 214 | "name": "ipython", 215 | "version": 3 216 | }, 217 | "file_extension": ".py", 218 | "mimetype": "text/x-python", 219 | "name": "python", 220 | "nbconvert_exporter": "python", 221 | "pygments_lexer": "ipython3", 222 | "version": "3.6.5" 223 | } 224 | }, 225 | "nbformat": 4, 226 | "nbformat_minor": 2 227 | } 228 | -------------------------------------------------------------------------------- /Chapter8/ufo_sightings_train_recordIO_protobuf.data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ACloudGuru-Resources/Course_AWS_Certified_Machine_Learning/c132e72de12f01b80332fccc67905058a7a67ea6/Chapter8/ufo_sightings_train_recordIO_protobuf.data -------------------------------------------------------------------------------- /Chapter8/ufo_sightings_validatioin_recordIO_protobuf.data: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ACloudGuru-Resources/Course_AWS_Certified_Machine_Learning/c132e72de12f01b80332fccc67905058a7a67ea6/Chapter8/ufo_sightings_validatioin_recordIO_protobuf.data -------------------------------------------------------------------------------- /Chapter9/lambda_function.py: -------------------------------------------------------------------------------- 1 | import os 2 | import io 3 | import boto3 4 | import json 5 | import csv 6 | 7 | # grab environment variables 8 | ENDPOINT_NAME = os.environ['ENDPOINT_NAME'] 9 | runtime= boto3.client('runtime.sagemaker') 10 | 11 | def lambda_handler(event, context): 12 | print("Received event: " + json.dumps(event, indent=2)) 13 | 14 | data = json.loads(json.dumps(event)) 15 | payload = data['data'] 16 | print(payload) 17 | 18 | response = runtime.invoke_endpoint(EndpointName=ENDPOINT_NAME, 19 | ContentType='text/csv', 20 | Body=payload) 21 | print(response) 22 | result = json.loads(response['Body'].read().decode()) 23 | print(result) 24 | pred = int(result['predictions'][0]['predicted_label']) 25 | 26 | if(pred == 0): 27 | return 'Unexplained' 28 | if(pred == 1): 29 | return 'Explained' 30 | if(pred == 2): 31 | return 'Probable' -------------------------------------------------------------------------------- /Chapter9/sample_request.json: -------------------------------------------------------------------------------- 1 | { 2 | "data": "45.0, 10.0,38.5816667,-121.49333329999999,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0" 3 | } -------------------------------------------------------------------------------- /Chapter9/ufo-implementation-operations-lab.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## UFO Sightings Implementation and Operations Lab\n", 8 | "\n", 9 | "The goal of this notebook is to train and deploy our model into SageMaker online hosting with 1 variant. \n", 10 | "\n", 11 | "What we plan on accompishling is the following:\n", 12 | "1. [Load dataset onto Notebook instance memory from S3](#Step-1:-Load-the-data-from-Amazon-S3)\n", 13 | "1. 
[Cleaning, transforming and preparing the dataset](#Step-2:-Cleaning,-transforming-and-preparing-the-dataset)\n", 14 | "1. [Create and train our model (Linear Learner)](#Step-4:-Creating-and-training-our-model-(Linear-Learner))\n", 15 | "1. [Deploying the model into SageMaker hosting](#Step-4:-Deploying-the-model-into-SageMaker-hosting)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "markdown", 20 | "metadata": {}, 21 | "source": [ 22 | "First let's go ahead and import all the needed libraries." 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "import pandas as pd\n", 32 | "import numpy as np\n", 33 | "from datetime import datetime\n", 34 | "import io\n", 35 | "import sagemaker.amazon.common as smac\n", 36 | "\n", 37 | "import boto3\n", 38 | "from sagemaker import get_execution_role\n", 39 | "import sagemaker\n", 40 | "\n", 41 | "import matplotlib.pyplot as plt\n", 42 | "import seaborn as sns" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Step 1: Loading the data from Amazon S3\n", 50 | "Let's get the UFO sightings data that is stored in S3 and load it into memory." 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": null, 56 | "metadata": {}, 57 | "outputs": [], 58 | "source": [ 59 | "role = get_execution_role()\n", 60 | "bucket=''\n", 61 | "sub_folder = 'ufo_dataset'\n", 62 | "data_key = 'ufo_fullset.csv'\n", 63 | "data_location = 's3://{}/{}/{}'.format(bucket, sub_folder, data_key)\n", 64 | "\n", 65 | "df = pd.read_csv(data_location, low_memory=False)\n", 66 | "df.head()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "markdown", 71 | "metadata": {}, 72 | "source": [ 73 | "
" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "## Step 2: Cleaning, transforming and preparing the dataset\n", 81 | "This step is so important. It's crucial that we clean and prepare our data before we do anything else." 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": { 87 | "scrolled": true 88 | }, 89 | "source": [ 90 | "Let's go ahead and start preparing our dataset by transforming some of the values into the correct data types. Here is what we are going to take care of.\n", 91 | "1. Convert the `reportedTimestamp` and `eventDate` to a datetime data types.\n", 92 | "1. Convert the `shape` and `weather` to a category data type.\n", 93 | "1. Map the `physicalEvidence` and `contact` from 'Y', 'N' to `0`, `1`.\n", 94 | "1. Convert the `researchOutcome` to a category data type (target attribute).\n", 95 | "\n", 96 | "Let's also drop the columns that are not important. \n", 97 | "1. We can drop `sighting` becuase it is always 'Y' or Yes. \n", 98 | "1. Let's drop the `firstName` and `lastName` becuase they are not important in determining the `researchOutcome`.\n", 99 | "1. Let's drop the `reportedTimestamp` becuase when the sighting was reporting isn't going to help us determine the legitimacy of the sighting.\n", 100 | "1. We would need to create some sort of buckets for the `eventDate` and `eventTime`, like seasons for example, but since the distribution of dates is pretty even, let's go ahead and drop them.\n", 101 | "\n", 102 | "Finally, let's apply one-hot encoding\n", 103 | "1. We need to one-hot both the `weather` attribute and the `shape` attribute. \n", 104 | "1. We also need to transform or map the researchOutcome (target) attribute into numeric values. This is what the alogrithm is expecting. We can do this by mapping unexplained, explained, and probable to 0, 1, 2." 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": {}, 111 | "outputs": [], 112 | "source": [ 113 | "# Replace the missing values with the most common shape\n", 114 | "df['shape'] = df['shape'].fillna(df['shape'].value_counts().index[0])\n", 115 | "\n", 116 | "df['reportedTimestamp'] = pd.to_datetime(df['reportedTimestamp'])\n", 117 | "df['eventDate'] = pd.to_datetime(df['eventDate'])\n", 118 | "\n", 119 | "df['shape'] = df['shape'].astype('category')\n", 120 | "df['weather'] = df['weather'].astype('category')\n", 121 | "\n", 122 | "df['physicalEvidence'] = df['physicalEvidence'].replace({'Y': 1, 'N': 0})\n", 123 | "df['contact'] = df['contact'].replace({'Y': 1, 'N': 0})\n", 124 | "\n", 125 | "df['researchOutcome'] = df['researchOutcome'].astype('category')\n", 126 | "\n", 127 | "df.drop(columns=['firstName', 'lastName', 'sighting', 'reportedTimestamp', 'eventDate', 'eventTime'], inplace=True)\n", 128 | "\n", 129 | "# Let's one-hot the weather and shape attribute\n", 130 | "df = pd.get_dummies(df, columns=['weather', 'shape'])\n", 131 | "\n", 132 | "# Let's replace the researchOutcome values with 0, 1, 2 for Unexplained, Explained, and Probable\n", 133 | "df['researchOutcome'] = df['researchOutcome'].replace({'unexplained': 0, 'explained': 1, 'probable': 2})" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "scrolled": true 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "display(df.head())\n", 145 | "display(df.shape)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "markdown", 150 | "metadata": {}, 151 | "source": [ 152 | "
" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "---\n", 160 | "\n", 161 | "## Step 3: Creating and training our model (Linear Learner)\n", 162 | "\n", 163 | "Let's evaluate the Linear Learner algorithm as well. Let's go ahead and randomize the data again and get it ready for the Linear Leaner algorithm. We will also rearrange the columns so it is ready for the algorithm (it expects the first column to be the target attribute)" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "np.random.seed(0)\n", 173 | "rand_split = np.random.rand(len(df))\n", 174 | "train_list = rand_split < 0.8\n", 175 | "val_list = (rand_split >= 0.8) & (rand_split < 0.9)\n", 176 | "test_list = rand_split >= 0.9\n", 177 | "\n", 178 | " # This dataset will be used to train the model.\n", 179 | "data_train = df[train_list]\n", 180 | "\n", 181 | "# This dataset will be used to validate the model.\n", 182 | "data_val = df[val_list]\n", 183 | "\n", 184 | "# This dataset will be used to test the model.\n", 185 | "data_test = df[test_list]\n", 186 | "\n", 187 | "# Breaks the datasets into attribute numpy.ndarray and the same for target attribute. \n", 188 | "train_X = data_train.drop(columns='researchOutcome').values\n", 189 | "train_y = data_train['researchOutcome'].values\n", 190 | "\n", 191 | "val_X = data_val.drop(columns='researchOutcome').values\n", 192 | "val_y = data_val['researchOutcome'].values\n", 193 | "\n", 194 | "test_X = data_test.drop(columns='researchOutcome').values\n", 195 | "test_y = data_test['researchOutcome'].values" 196 | ] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": {}, 201 | "source": [ 202 | "Next, Let's create recordIO file for the training data and upload it to S3." 
203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": null, 208 | "metadata": {}, 209 | "outputs": [], 210 | "source": [ 211 | "train_file = 'ufo_sightings_train_recordIO_protobuf.data'\n", 212 | "\n", 213 | "f = io.BytesIO()\n", 214 | "smac.write_numpy_to_dense_tensor(f, train_X.astype('float32'), train_y.astype('float32'))\n", 215 | "f.seek(0)\n", 216 | "\n", 217 | "boto3.Session().resource('s3').Bucket(bucket).Object('implementation_operations_lab/linearlearner_train/{}'.format(train_file)).upload_fileobj(f)\n", 218 | "training_recordIO_protobuf_location = 's3://{}/implementation_operations_lab/linearlearner_train/{}'.format(bucket, train_file)\n", 219 | "print('The Pipe mode recordIO protobuf training data: {}'.format(training_recordIO_protobuf_location))" 220 | ] 221 | }, 222 | { 223 | "cell_type": "markdown", 224 | "metadata": {}, 225 | "source": [ 226 | "Let's create recordIO file for the validation data and upload it to S3" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "validation_file = 'ufo_sightings_validatioin_recordIO_protobuf.data'\n", 236 | "\n", 237 | "f = io.BytesIO()\n", 238 | "smac.write_numpy_to_dense_tensor(f, val_X.astype('float32'), val_y.astype('float32'))\n", 239 | "f.seek(0)\n", 240 | "\n", 241 | "boto3.Session().resource('s3').Bucket(bucket).Object('implementation_operations_lab/linearlearner_validation/{}'.format(validation_file)).upload_fileobj(f)\n", 242 | "validate_recordIO_protobuf_location = 's3://{}/implementation_operations_lab/linearlearner_validation/{}'.format(bucket, validation_file)\n", 243 | "print('The Pipe mode recordIO protobuf validation data: {}'.format(validate_recordIO_protobuf_location))" 244 | ] 245 | }, 246 | { 247 | "cell_type": "markdown", 248 | "metadata": {}, 249 | "source": [ 250 | "---\n", 251 | "\n", 252 | "Alright we are good to go for the Linear Learner algorithm. Let's get everything we need from the ECR repository to call the Linear Learner algorithm." 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "from sagemaker import image_uris\n", 262 | "container = image_uris.retrieve('linear-learner', boto3.Session().region_name, '1')" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "metadata": {}, 269 | "outputs": [], 270 | "source": [ 271 | "# Create a training job name\n", 272 | "job_name = 'ufo-linear-learner-job-{}'.format(datetime.now().strftime(\"%Y%m%d%H%M%S\"))\n", 273 | "print('Here is the job name {}'.format(job_name))\n", 274 | "\n", 275 | "# Here is where the model-artifact will be stored\n", 276 | "output_location = 's3://{}/implementation_operations_lab/linearlearner_output'.format(bucket)" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": {}, 282 | "source": [ 283 | "Next we start building out our model by using the SageMaker Python SDK and passing in everything that is required to create a Linear Learner model.\n", 284 | "\n", 285 | "First I like to always create a specific job name. Next, we'll need to specify training parameters.\n", 286 | "\n", 287 | "Finally, after everything is included and ready, then we can call the `.fit()` function which specifies the S3 location for training and validation data." 
288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": { 294 | "scrolled": true 295 | }, 296 | "outputs": [], 297 | "source": [ 298 | "print('The feature_dim hyperparameter needs to be set to {}.'.format(data_train.shape[1] - 1))" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": {}, 305 | "outputs": [], 306 | "source": [ 307 | "sess = sagemaker.Session()\n", 308 | "\n", 309 | "# Setup the LinearLeaner algorithm from the ECR container\n", 310 | "linear = sagemaker.estimator.Estimator(container,\n", 311 | " role, \n", 312 | " instance_count=1, \n", 313 | " instance_type='ml.c4.xlarge',\n", 314 | " output_path=output_location,\n", 315 | " sagemaker_session=sess,\n", 316 | " input_mode='Pipe')\n", 317 | "# Setup the hyperparameters\n", 318 | "linear.set_hyperparameters(feature_dim=22,\n", 319 | " predictor_type='multiclass_classifier',\n", 320 | " num_classes=3\n", 321 | " # add optimized hyperparmeters here \n", 322 | " # add optimized hyperparmeters here \n", 323 | " # add optimized hyperparmeters here \n", 324 | " # add optimized hyperparmeters here \n", 325 | " # add optimized hyperparmeters here \n", 326 | " # add optimized hyperparmeters here\n", 327 | " )\n", 328 | "\n", 329 | "# Launch a training job. This method calls the CreateTrainingJob API call\n", 330 | "data_channels = {\n", 331 | " 'train': training_recordIO_protobuf_location,\n", 332 | " 'validation': validate_recordIO_protobuf_location\n", 333 | "}\n", 334 | "linear.fit(data_channels, job_name=job_name)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": null, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "print('Here is the location of the trained Linear Learner model: {}/{}/output/model.tar.gz'.format(output_location, job_name))" 344 | ] 345 | }, 346 | { 347 | "cell_type": "markdown", 348 | "metadata": {}, 349 | "source": [ 350 | "From here we have our trained model we can deploy into production!" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "---\n", 358 | "\n", 359 | "## Step 4: Deploying the model into SageMaker hosting\n", 360 | "\n", 361 | "Next, let's deploy the model into SageMaker hosting onto a single m4 instance. We can then use this instance to test the model with the test data that we help out at the beginning of the notebook. We can then evaluate things like accuracy, precision, recall, and f1 score. \n", 362 | "\n", 363 | "We can use some fancy libraries to build out a confusion matrix/heatmap to see how accurate our model is. " 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "metadata": { 370 | "scrolled": false 371 | }, 372 | "outputs": [], 373 | "source": [ 374 | "multiclass_predictor = linear.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')" 375 | ] 376 | }, 377 | { 378 | "cell_type": "markdown", 379 | "metadata": {}, 380 | "source": [ 381 | "This next code is just setup code to allow us to draw out nice and pretty confusion matrix/heatmap. 
" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": null, 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "from sklearn.metrics import confusion_matrix\n", 391 | "from sklearn.utils.multiclass import unique_labels\n", 392 | "\n", 393 | "def plot_confusion_matrix(y_true, y_pred, classes,\n", 394 | " normalize=False,\n", 395 | " title=None, \n", 396 | " cmap=None):\n", 397 | " \"\"\"\n", 398 | " This function prints and plots the confusion matrix.\n", 399 | " Normalization can be applied by setting `normalize=True`.\n", 400 | " \"\"\"\n", 401 | " if not title:\n", 402 | " if normalize:\n", 403 | " title = 'Normalized confusion matrix'\n", 404 | " plt.cm.Greens\n", 405 | " else:\n", 406 | " title = 'Confusion matrix, without normalization'\n", 407 | "\n", 408 | " # Compute confusion matrix\n", 409 | " cm = confusion_matrix(y_true, y_pred)\n", 410 | " # Only use the labels that appear in the data\n", 411 | " classes = classes[unique_labels(y_true, y_pred)]\n", 412 | " if normalize:\n", 413 | " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", 414 | "# print(\"Normalized confusion matrix\")\n", 415 | "# else:\n", 416 | "# print('Confusion matrix, without normalization')\n", 417 | "\n", 418 | "# print(cm)\n", 419 | "\n", 420 | " fig, ax = plt.subplots()\n", 421 | " im = ax.imshow(cm, interpolation='nearest', cmap=cmap)\n", 422 | " ax.figure.colorbar(im, ax=ax)\n", 423 | " # We want to show all ticks...\n", 424 | " ax.set(xticks=np.arange(cm.shape[1]),\n", 425 | " yticks=np.arange(cm.shape[0]),\n", 426 | " # ... and label them with the respective list entries\n", 427 | " xticklabels=classes, yticklabels=classes,\n", 428 | " title=title,\n", 429 | " ylabel='Actual',\n", 430 | " xlabel='Predicted')\n", 431 | "\n", 432 | " # Rotate the tick labels and set their alignment.\n", 433 | " plt.setp(ax.get_xticklabels(), rotation=45, ha=\"right\",\n", 434 | " rotation_mode=\"anchor\")\n", 435 | "\n", 436 | " # Loop over data dimensions and create text annotations.\n", 437 | " fmt = '.2f' if normalize else 'd'\n", 438 | " thresh = cm.max() / 2.\n", 439 | " for i in range(cm.shape[0]):\n", 440 | " for j in range(cm.shape[1]):\n", 441 | " ax.text(j, i, format(cm[i, j], fmt),\n", 442 | " ha=\"center\", va=\"center\",\n", 443 | " color=\"white\" if cm[i, j] > thresh else \"black\")\n", 444 | " fig.tight_layout()\n", 445 | " return ax\n", 446 | "\n", 447 | "\n", 448 | "np.set_printoptions(precision=2)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": null, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# from sagemaker.predictor import json_deserializer, csv_serializer\n", 458 | "\n", 459 | "# multiclass_predictor.content_type = 'text/csv'\n", 460 | "multiclass_predictor.serializer = sagemaker.serializers.CSVSerializer()\n", 461 | "multiclass_predictor.deserializer = sagemaker.deserializers.JSONDeserializer()\n", 462 | "\n", 463 | "predictions = []\n", 464 | "results = multiclass_predictor.predict(test_X)\n", 465 | "predictions += [r['predicted_label'] for r in results['predictions']]\n", 466 | "predictions = np.array(predictions)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": {}, 473 | "outputs": [], 474 | "source": [ 475 | "%matplotlib inline\n", 476 | "sns.set_context(\"paper\", font_scale=1.4)\n", 477 | "\n", 478 | "y_test = test_y\n", 479 | "y_pred = predictions\n", 480 | "\n", 481 | "class_names = np.array(['Unexplained', 'Explained', 
'Probable'])\n", 482 | "\n", 483 | "# Plot non-normalized confusion matrix\n", 484 | "plot_confusion_matrix(y_test, y_pred, classes=class_names,\n", 485 | " title='Confusion matrix',\n", 486 | " cmap=plt.cm.Blues)\n", 487 | "plt.grid(False)\n", 488 | "plt.show()" 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": {}, 495 | "outputs": [], 496 | "source": [ 497 | "from sklearn.metrics import precision_recall_fscore_support\n", 498 | "from sklearn.metrics import accuracy_score\n", 499 | "\n", 500 | "y_test = data_test['researchOutcome']\n", 501 | "y_pred = predictions\n", 502 | "scores = precision_recall_fscore_support(y_test, y_pred, average='macro', labels=np.unique(y_pred))\n", 503 | "acc = accuracy_score(y_test, y_pred)\n", 504 | "print('Accuracy is: {}'.format(acc))\n", 505 | "print('Precision is: {}'.format(scores[0]))\n", 506 | "print('Recall is: {}'.format(scores[1]))\n", 507 | "print('F1 score is: {}'.format(scores[2]))" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": null, 513 | "metadata": {}, 514 | "outputs": [], 515 | "source": [] 516 | } 517 | ], 518 | "metadata": { 519 | "kernelspec": { 520 | "display_name": "conda_python3", 521 | "language": "python", 522 | "name": "conda_python3" 523 | }, 524 | "language_info": { 525 | "codemirror_mode": { 526 | "name": "ipython", 527 | "version": 3 528 | }, 529 | "file_extension": ".py", 530 | "mimetype": "text/x-python", 531 | "name": "python", 532 | "nbconvert_exporter": "python", 533 | "pygments_lexer": "ipython3", 534 | "version": "3.6.5" 535 | } 536 | }, 537 | "nbformat": 4, 538 | "nbformat_minor": 2 539 | } 540 | -------------------------------------------------------------------------------- /LAB-PerformRealTimeDataAnalysisWithKinesis/aws-config.txt: -------------------------------------------------------------------------------- 1 | [default] 2 | region = us-east-1 3 | -------------------------------------------------------------------------------- /LAB-PerformRealTimeDataAnalysisWithKinesis/kinesis-analytics-popular-captains.sql.txt: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE STREAM "CAPTAIN_SCORES" ("favoritecaptain" VARCHAR(32), average_rating DOUBLE, total_rating INTEGER); 2 | 3 | CREATE OR REPLACE PUMP "STREAM_PUMP" AS 4 | INSERT INTO "CAPTAIN_SCORES" 5 | SELECT STREAM "favoritecaptain", avg("rating") as average_rating, sum("rating") as total_rating 6 | FROM "SOURCE_SQL_STREAM_001" 7 | GROUP BY "favoritecaptain", STEP("SOURCE_SQL_STREAM_001".ROWTIME BY INTERVAL '1' MINUTE) 8 | ORDER BY STEP("SOURCE_SQL_STREAM_001".ROWTIME BY INTERVAL '1' MINUTE), avg("rating") DESC; 9 | -------------------------------------------------------------------------------- /LAB-PerformRealTimeDataAnalysisWithKinesis/kinesis-analytics-rating-anomaly.sql.txt: -------------------------------------------------------------------------------- 1 | CREATE OR REPLACE STREAM "RAW_ANOMALY_STREAM" ( 2 | "favoritecaptain" VARCHAR(32), 3 | "rating" INTEGER, 4 | "ANOMALY_SCORE" DOUBLE); 5 | 6 | CREATE OR REPLACE PUMP "RAW_PUMP" AS INSERT INTO "RAW_ANOMALY_STREAM" 7 | SELECT STREAM "favoritecaptain", "rating", "ANOMALY_SCORE" FROM 8 | TABLE(RANDOM_CUT_FOREST( 9 | CURSOR(SELECT STREAM "favoritecaptain", "rating" FROM "SOURCE_SQL_STREAM_001") 10 | )); 11 | 12 | CREATE OR REPLACE STREAM "ORDERED_ANOMALY_STREAM" ( 13 | "favoritecaptain" VARCHAR(32), 14 | "rating" INTEGER, 15 | "ANOMALY_SCORE" DOUBLE); 16 | 17 | -- Sort records by 
CREATE OR REPLACE PUMP "ORDERED_PUMP" AS INSERT INTO "ORDERED_ANOMALY_STREAM"
SELECT STREAM * FROM "RAW_ANOMALY_STREAM"
ORDER BY FLOOR("RAW_ANOMALY_STREAM".ROWTIME TO SECOND), "ANOMALY_SCORE" DESC;

--------------------------------------------------------------------------------
/LAB-PerformRealTimeDataAnalysisWithKinesis/send_captains_to_cloud.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
import boto3
from faker import Faker
import random
import time
import json

DeliveryStreamName = 'captains-kfh'
client = boto3.client('firehose')
fake = Faker()

captains = [
    "Jean-Luc Picard",
    "James T. Kirk",
    "Han Solo",
    "Kathryn Janeway",
    "Malcolm Reynolds",
    "William Adama",
    "Turanga Leela",
    "Jacob Keyes",
    "Wilhuff Tarkin",
    "Christopher Pike",
    "David Bowman",
    "The Doctor",
    "John Robinson",
    "Khan Noonien Singh"
]

record = {}
while True:
    record['user'] = fake.name()
    # A small percentage of records get an out-of-range rating for a captain who is
    # not in the list, so the anomaly-detection query has something to flag.
    if random.randint(1, 100) < 5:
        record['favoritecaptain'] = "Neil Armstrong"
        record['rating'] = random.randint(7000, 9000)
    else:
        record['favoritecaptain'] = random.choice(captains)
        record['rating'] = random.randint(1, 1000)
    record['timestamp'] = time.time()
    response = client.put_record(
        DeliveryStreamName=DeliveryStreamName,
        Record={
            'Data': json.dumps(record)
        }
    )
    print('Record: ' + str(record))

--------------------------------------------------------------------------------
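The producer script above assumes a Kinesis Data Firehose delivery stream named `captains-kfh` already exists; the lab sets it up in the console. Purely as a hedged sketch, a stream with the same name could also be created from code along these lines. The IAM role and S3 bucket ARNs below are placeholders and are not values from the lab.

```python
# A minimal sketch, not part of the lab files: create the Firehose delivery stream
# that send_captains_to_cloud.py writes to. The ARNs are hypothetical placeholders
# and must point at a role and bucket in your own account.
import boto3

firehose = boto3.client('firehose')

firehose.create_delivery_stream(
    DeliveryStreamName='captains-kfh',  # matches the producer script
    DeliveryStreamType='DirectPut',     # the script calls put_record directly
    ExtendedS3DestinationConfiguration={
        'RoleARN': 'arn:aws:iam::123456789012:role/firehose-delivery-role',  # placeholder
        'BucketARN': 'arn:aws:s3:::my-captains-lab-bucket',                  # placeholder
    },
)

# Check the stream status; start the producer once it reports ACTIVE.
description = firehose.describe_delivery_stream(DeliveryStreamName='captains-kfh')
print(description['DeliveryStreamDescription']['DeliveryStreamStatus'])
```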
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 A Cloud Guru

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# COURSE_AWS_CERTIFIED_MACHINE_LEARNING
An A Cloud Guru Course -
[AWS Certified Machine Learning](https://acloud.guru/learn/aws-certified-machine-learning-specialty)

Your executive board members are asking you to do something with it. Your grandmother is asking you if it will put you out of a job. Your deadbeat college roommate is asking if you can help him find a date with it. Everyone seems to fuss over Machine Learning, but how many of us truly understand it? Too few.

Fortunately, ACG has your back yet again with a fresh course focused on helping you outsmart the new AWS Certified Machine Learning Specialty exam. In typical ACG manner, we have created a course that confronts the potentially dull and boring topic of machine learning head-on with quirky and engaging lectures, interactive labs and plenty of real-world, plain-speak examples.

Prepared by [Brock Tubre](https://learn.acloud.guru/profile/brock-tubre) and [Scott Pletcher](https://learn.acloud.guru/profile/scott-pletcher), 2019

Lab files for the A Cloud Guru course [AWS Certified Machine Learning](https://acloud.guru/learn/aws-certified-machine-learning-specialty)

## In this course you'll learn:
- The domains of knowledge for the AWS Certified Machine Learning Specialty exam.
- Best practices for using the tools and platforms of AWS for data engineering, data analysis, machine learning modeling, model evaluation and deployment.
- Hands-on labs designed to challenge your intuition, creativity and knowledge of the AWS platform.

With this course you'll get a solid understanding of the services and platforms available on AWS for Machine Learning projects, build a foundation to pass the certification exam and feel equipped to use the AWS ML portfolio in your own real-world applications.

Don’t just sit idly by, watching as robotic overlords take over the world. Create your own army of sentient machines and beat them at their own game! And keep being awesome, cloud gurus!


## IMPORTANT
Please note, this code is provided as-is; neither I nor A Cloud Guru supports it. If you do identify any errors, please let us know and we will attempt to fix them on a best-efforts basis.

IMPORTANT - We recommend using a new account or a dedicated lab environment for this workshop. Using an existing account could cause damage or disruption to the resources in that account.

These files are distributed on an AS IS BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.


## May 2019
Initial Creation.

--------------------------------------------------------------------------------