├── .gitignore ├── README.md ├── cfn_setup.yaml ├── decks ├── 01-Recommender_Workshop_01_Introduction.pdf ├── 02-Recommender_Workshop_02_Collaborative_Filtering.pdf ├── 03-Recommender_Workshop_03_Matrix_Factorization.pdf └── 04-Recommender_Workshop_04_Tuning.pdf └── notebooks ├── 01_exploring_data.ipynb ├── 02_clustering_users.ipynb ├── 03_factorization_machines.ipynb └── 03_factorization_machines_regression.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb*/** 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon SageMaker Recommender Workshop 2 | 3 | ## Agenda 4 | 5 | - [0.75] Introduction 6 | - [0.5] Overview of ML workflow -> decks/01-Recommender_Workshop_01_Introduction.pdf 7 | - [0.25] MovieLens data intro -> notebooks/01_exploring_data.ipynb 8 | - preflight: CFN template check -> cfn_setup.yaml 9 | - launch individual notebook instances 10 | - which notebook instance sizes? 11 | - clone repo -> http://bit.ly/2wkaV0N (this repo) 12 | - [0.5] Unsupervised ML 13 | - concepts 14 | - hands-on 15 | - [0.25] **Break** 16 | - [1] Recommendation Engine 17 | - [0.25] concepts 18 | - [0.75] hands-on 19 | - show movie recommender website: http://sagemaker-nab-demo.s3-website-us-west-2.amazonaws.com/ 20 | - clone repo to individual notebook instances 21 | - run all at beginning of workshop 22 | - TBD: perform training locally? 23 | - during training: discussion of underlying mechanisms for SageMaker (Batch, ECR, etc.) 24 | 25 | - [0.5] Hyperparameter Optimization (PPT+Demo) 26 | - overview 27 | - demo w/ XGBoost 28 | - [0.25] Wrap-up (PPT) 29 | - Other resources 30 | - Next steps 31 | 32 | 33 | ## References 34 | 35 | - https://grouplens.org/datasets/movielens/ 36 | - https://medium.com/@julsimon/building-a-movie-recommender-with-factorization-machines-on-amazon-sagemaker-cedbfc8c93d8 37 | - http://sagemaker-nab-demo.s3-website-us-west-2.amazonaws.com/ 38 | - https://sagemaker.readthedocs.io/en/latest/factorization_machines.html -------------------------------------------------------------------------------- /cfn_setup.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Parameters: 3 | InputBucketName: 4 | Type: "String" 5 | Description: "Enter a globally unique name for the S3 bucket that will store workshop data" 6 | Resources: 7 | InputBucket: 8 | Type: 'AWS::S3::Bucket' 9 | Properties: 10 | BucketName: 11 | Ref: InputBucketName 12 | DeletionPolicy: Delete 13 | SagemakerRole: 14 | Type: "AWS::IAM::Role" 15 | Properties: 16 | AssumeRolePolicyDocument: 17 | Version: "2012-10-17" 18 | Statement: 19 | - 20 | Effect: "Allow" 21 | Principal: 22 | Service: 23 | - "sagemaker.amazonaws.com" 24 | Action: 25 | - "sts:AssumeRole" 26 | Path: "/" 27 | Policies: 28 | - 29 | PolicyName: "SagemakerAccessResources" 30 | PolicyDocument: 31 | Version: "2012-10-17" 32 | Statement: 33 | - 34 | Effect: "Allow" 35 | Action: 36 | - "sagemaker:*" 37 | - "ecr:GetAuthorizationToken" 38 | - "cloudwatch:PutMetricData" 39 | - "logs:CreateLogGroup" 40 | - "logs:CreateLogStream" 41 | - "logs:DescribeLogStreams" 42 | - "logs:PutLogEvents" 43 | - "logs:GetLogEvents" 44 | - "ec2:CreateNetworkInterface" 45 | - "ec2:CreateNetworkInterfacePermission" 46 | - "ec2:DeleteNetworkInterface" 47 | - "ec2:DeleteNetworkInterfacePermission" 48 | - "ec2:DescribeNetworkInterfaces" 49 | - "ec2:DescribeVpcs"
50 | - "ec2:DescribeDhcpOptions" 51 | - "ec2:DescribeSubnets" 52 | - "ec2:DescribeSecurityGroups" 53 | - "application-autoscaling:DeleteScalingPolicy" 54 | - "application-autoscaling:DeleteScheduledAction" 55 | - "application-autoscaling:DeregisterScalableTarget" 56 | - "application-autoscaling:DescribeScalableTargets" 57 | - "application-autoscaling:DescribeScalingActivities" 58 | - "application-autoscaling:DescribeScalingPolicies" 59 | - "application-autoscaling:DescribeScheduledActions" 60 | - "application-autoscaling:PutScalingPolicy" 61 | - "application-autoscaling:PutScheduledAction" 62 | - "application-autoscaling:RegisterScalableTarget" 63 | Resource: "*" 64 | - 65 | Effect: "Allow" 66 | Action: 67 | - "iam:PassRole" 68 | Resource: "*" 69 | Condition: 70 | StringEquals: 71 | "iam:PassedToService": "sagemaker.amazonaws.com" 72 | - 73 | Effect: "Allow" 74 | Action: 75 | - "s3:ListBucket" 76 | Resource: 77 | - 78 | !Join 79 | - '' 80 | - - 'arn:aws:s3:::' 81 | - !Ref InputBucket 82 | - 83 | Effect: "Allow" 84 | Action: 85 | - "s3:*" 86 | Resource: 87 | - 88 | !Join 89 | - '' 90 | - - 'arn:aws:s3:::' 91 | - !Ref InputBucket 92 | - '/*' 93 | - 94 | Effect: "Allow" 95 | Action: 96 | - "ecr:BatchCheckLayerAvailability" 97 | - "ecr:GetDownloadUrlForLayer" 98 | - "ecr:BatchGetImage" 99 | Resource: 100 | - "arn:aws:ecr:::repository/*" 101 | - 102 | Effect: "Allow" 103 | Action: 104 | - "iam:CreateServiceLinkedRole" 105 | Resource: 106 | - "arn:aws:iam::*:role/aws-service-role/sagemaker.application-autoscaling.amazonaws.com/AWSServiceRoleForApplicationAutoScaling_SageMakerEndpoint" 107 | Condition: 108 | StringLike: 109 | "iam:AWSServiceName": "sagemaker.application-autoscaling.amazonaws.com" 110 | Outputs: 111 | SageMakerRoleArn: 112 | Description: The Arn IAM Role for SageMaker to access S3 buckets and other resources 113 | Value: !GetAtt [ SagemakerRole, Arn ] -------------------------------------------------------------------------------- /decks/01-Recommender_Workshop_01_Introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shirkeyaws/sagemaker-recommender-workshop/90b80dd3e7326386ec6174b0dff49c5044ced15b/decks/01-Recommender_Workshop_01_Introduction.pdf -------------------------------------------------------------------------------- /decks/02-Recommender_Workshop_02_Collaborative_Filtering.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shirkeyaws/sagemaker-recommender-workshop/90b80dd3e7326386ec6174b0dff49c5044ced15b/decks/02-Recommender_Workshop_02_Collaborative_Filtering.pdf -------------------------------------------------------------------------------- /decks/03-Recommender_Workshop_03_Matrix_Factorization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shirkeyaws/sagemaker-recommender-workshop/90b80dd3e7326386ec6174b0dff49c5044ced15b/decks/03-Recommender_Workshop_03_Matrix_Factorization.pdf -------------------------------------------------------------------------------- /decks/04-Recommender_Workshop_04_Tuning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shirkeyaws/sagemaker-recommender-workshop/90b80dd3e7326386ec6174b0dff49c5044ced15b/decks/04-Recommender_Workshop_04_Tuning.pdf -------------------------------------------------------------------------------- 
/notebooks/01_exploring_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exploring data\n", 8 | "\n", 9 | "We will use the MovieLens 100K dataset for performing our initial survey of user and movie data. \n", 10 | "\n", 11 | "See http://files.grouplens.org/datasets/movielens/ml-100k-README.txt for more information on this dataset." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "%pylab inline" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "!rm -f /tmp/ml-100k.zip\n", 32 | "!rm -rf /tmp/ml-100k\n", 33 | "!wget -O /tmp/ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip\n", 34 | "!unzip -j -o /tmp/ml-100k.zip -d /tmp/ml-100k" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "# MovieLens Dataset Info\n", 42 | "\n", 43 | "We can parse the file u.info to find details about this dataset\n", 44 | "\n", 45 | "```\n", 46 | "u.info -- The number of users, items, and ratings in the u data set.\n", 47 | "```" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "%cd /tmp/ml-100k\n", 57 | "!cat u.info" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Movies\n", 65 | "\n", 66 | "Let's start by looking at the movie data which represent the items that are rated and recommended\n", 67 | "\n", 68 | "First, we will look at the u.genre to understand how these movies are categorized" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "!cat u.genre" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "To find the information about each movie title in our dataset, we check the u.item file\n", 85 | "\n", 86 | "```\n", 87 | "u.item -- Information about the items (movies); this is a tab separated\n", 88 | " list of\n", 89 | " movie id | movie title | release date | video release date |\n", 90 | " IMDb URL | unknown | Action | Adventure | Animation |\n", 91 | " Children's | Comedy | Crime | Documentary | Drama | Fantasy |\n", 92 | " Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |\n", 93 | " Thriller | War | Western |\n", 94 | " The last 19 fields are the genres, a 1 indicates the movie\n", 95 | " is of that genre, a 0 indicates it is not; movies can be in\n", 96 | " several genres at once.\n", 97 | " The movie ids are the ones used in the u.data data set.\n", 98 | "```" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "!head -10 u.item" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "The list of items has several columns of 0/1 which represent the one-hot encoding of genere information -- we'll add that to our table header as we load the data in a Pandas dataframe" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "items = 
pd.read_csv(\"u.item\", encoding = \"ISO-8859-1\", sep='|', header=None,\n", 124 | " names=['title', 'published','', 'url','genre_unknown', 'genre_action', 'genre_adventure', 'genre_animation', 'genre_childrens','genre_comedy','genre_crime','genre_documentary','genre_drama','genre_fantasy','genre_film-noir','genre_horror','genre_musical','genre_mystery','genre_romance','genre_scifi','genre_thriller','genre_war','genre_western'])\n", 125 | "items.head()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Users\n", 133 | "\n", 134 | "We have some limited demographics data about users:\n", 135 | "\n", 136 | "```\n", 137 | "u.user -- Demographic information about the users; this is a tab\n", 138 | " separated list of\n", 139 | " user id | age | gender | occupation | zip code\n", 140 | " The user ids are the ones used in the u.data data set.\n", 141 | "```\n", 142 | "Let's have a look" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "!cat u.user" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Users are identified by age, gender, occupation and postal code" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "users = pd.read_csv(\"u.user\", sep='|', header=None, \n", 168 | " names=['userid', 'age', 'gender', 'occupation', 'postal_code'])\n", 169 | "users.head()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "users['age'].hist(bins=10)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "users['gender'].hist(bins=2)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "*We'll return to users analysis later when we perform user clustering in 02_clustering_users.ipynb*" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "# Ratings\n", 202 | "\n", 203 | "Let's now have a look at the ratings data -- the u.data file contains individual user preference for movies/items as specified in 1 (least liked) to 5(most liked) ratings for a limited set of movies.\n", 204 | "\n", 205 | "```\n", 206 | "u.data -- The full u data set, 100000 ratings by 943 users on 1682 items.\n", 207 | " Each user has rated at least 20 movies. Users and items are\n", 208 | " numbered consecutively from 1. The data is randomly\n", 209 | " ordered. This is a tab separated list of \n", 210 | "\t user id | item id | rating | timestamp. 
\n", 211 | " The time stamps are unix seconds since 1/1/1970 UTC \n", 212 | "```" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "data = pd.read_csv(\"u.data\", sep='\\t', header=None, \n", 222 | " names=['userid', 'movieid', 'rating', 'timestamp'])\n", 223 | "data.head()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "print(\"Number of Users: %d\" % (data['userid'].max()))\n", 233 | "print(\"Number of Movies: %d\" % (data['movieid'].max()))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "by_user = data.groupby('userid')\n", 243 | "ratings_hist = by_user['movieid'].count().hist(bins=100)\n", 244 | "ratings_hist.set_title('Ratings count distribution')" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "cust_size = by_user.size()\n", 254 | "cust_size.sample(random_state=42)\n", 255 | "\n", 256 | "#print(cust_size)\n", 257 | "#cust_size.plot(kind='bar')\n", 258 | "#cust_size.count()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "data['rating'].describe()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "toy_story=data[(data.movieid == 1)]\n", 277 | "toy_story.head()\n", 278 | "#toy_story['rating'].hist()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "toy_story['rating'].mean()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "toy_story['rating'].hist()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [] 305 | } 306 | ], 307 | "metadata": { 308 | "kernelspec": { 309 | "display_name": "conda_python3", 310 | "language": "python", 311 | "name": "conda_python3" 312 | }, 313 | "language_info": { 314 | "codemirror_mode": { 315 | "name": "ipython", 316 | "version": 3 317 | }, 318 | "file_extension": ".py", 319 | "mimetype": "text/x-python", 320 | "name": "python", 321 | "nbconvert_exporter": "python", 322 | "pygments_lexer": "ipython3", 323 | "version": "3.6.4" 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 2 328 | } 329 | -------------------------------------------------------------------------------- /notebooks/02_clustering_users.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import sagemaker\n", 12 | "from sagemaker import get_execution_role\n", 13 | "\n", 14 | "role = get_execution_role()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "sess = sagemaker.Session()\n", 24 | "bucket = sess.default_bucket()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 
29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# NOTE: this step is not strictly required if you've already run the 01_... notebook\n", 34 | "!rm -f /tmp/ml-100k.zip\n", 35 | "!rm -rf /tmp/ml-100k\n", 36 | "!wget -O /tmp/ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip\n", 37 | "!unzip -j -o /tmp/ml-100k.zip -d /tmp/ml-100k" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "users = pd.read_csv(\"/tmp/ml-100k/u.user\", sep='|', header=None, index_col=['userid'],\n", 47 | " names=['userid', 'age', 'gender', 'occupation', 'postal_code'])\n", 48 | "users = users.drop('postal_code',1)\n", 49 | "users.head()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "users_onehot=pd.get_dummies(users)\n", 59 | "users_onehot.head()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "users_onehot_df = users_onehot.values.astype(np.float32) # built-in K-Means requires float32\n", 69 | "print(users_onehot_df)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from sagemaker import KMeans\n", 79 | "\n", 80 | "data_location = 's3://{}/recommender_workshop/kmeans/data'.format(bucket)\n", 81 | "output_location = 's3://{}/recommender_workshop/kmeans/output'.format(bucket)\n", 82 | "\n", 83 | "print('training data will be uploaded to: {}'.format(data_location))\n", 84 | "print('training artifacts will be uploaded to: {}'.format(output_location))\n", 85 | "\n", 86 | "#!aws s3 cp /tmp/ml-100k/u.user $data_location/u.user\n", 87 | "\n", 88 | "k_value=5 #number of clusters\n", 89 | "kmeans = KMeans(role=role,\n", 90 | " train_instance_count=1,\n", 91 | " train_instance_type='ml.c4.2xlarge',\n", 92 | " output_path=output_location,\n", 93 | " k=k_value,\n", 94 | " data_location=data_location)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "%%time\n", 104 | "\n", 105 | "kmeans.fit(kmeans.record_set(users_onehot_df))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "%%time\n", 115 | "\n", 116 | "kmeans_predictor = kmeans.deploy(initial_instance_count=1,\n", 117 | " instance_type='ml.m4.xlarge')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "pd.get_dummies(users.head())" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "result = kmeans_predictor.predict(users_onehot_df[0:5])\n", 136 | "print(result)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "result = kmeans_predictor.predict(users_onehot_df)\n", 146 | "\n", 147 | "cluster=[]\n", 148 | "for i in range(k_value):\n", 149 | " cluster.append([r.label['distance_to_cluster'].float32_tensor.values[0] for r in result if r.label['closest_cluster'].float32_tensor.values[0] == i])\n", 150 | "\n", 151 | "cluster_zip = sorted(zip())\n" 152 | ] 153 | }, 
154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "%matplotlib inline\n", 161 | "import matplotlib.pyplot as plt\n", 162 | "\n", 163 | "for i in range(k_value):\n", 164 | " fig,ax = plt.subplots()\n", 165 | " ax.hist(cluster[i])\n", 166 | "\n", 167 | "plt.plot()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "import sagemaker\n", 177 | "sagemaker.Session().delete_endpoint(kmeans_predictor.endpoint)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "conda_python3", 191 | "language": "python", 192 | "name": "conda_python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.6.4" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /notebooks/03_factorization_machines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Movie recommendation on Amazon SageMaker with Factorization Machines\n", 8 | "\n", 9 | "See Julien's original post here:\n", 10 | "https://medium.com/@julsimon/building-a-movie-recommender-with-factorization-machines-on-amazon-sagemaker-cedbfc8c93d8\n", 11 | "\n", 12 | "This notebook:\n", 13 | "https://raw.githubusercontent.com/juliensimon/dlnotebooks/master/sagemaker/03-Factorization-Machines-Movielens.ipynb" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### Download ml-100k dataset" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "!rm -f /tmp/dataset.zip\n", 30 | "!rm -rf /tmp/dataset\n", 31 | "!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip -O /tmp/dataset.zip\n", 32 | "!unzip -j -o /tmp/dataset.zip -d /tmp/dataset" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "%cd /tmp/dataset\n", 42 | "!shuf ua.base -o ua.base.shuffled\n", 43 | "!head -10 ua.base.shuffled" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "!head -10 ua.test" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import sagemaker\n", 62 | "import sagemaker.amazon.common as smac\n", 63 | "from sagemaker import get_execution_role\n", 64 | "from sagemaker.predictor import json_deserializer\n", 65 | "\n", 66 | "import boto3, csv, io, json\n", 67 | "import numpy as np\n", 68 | "from scipy.sparse import lil_matrix" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Build training set and test set" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | 
"outputs": [], 83 | "source": [ 84 | "nbUsers=943\n", 85 | "nbMovies=1682\n", 86 | "nbFeatures=nbUsers+nbMovies\n", 87 | "\n", 88 | "nbRatingsTrain=90570\n", 89 | "nbRatingsTest=9430" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# For each user, build a list of rated movies.\n", 99 | "# We'd need this to add random negative samples.\n", 100 | "moviesByUser = {}\n", 101 | "for userId in range(nbUsers):\n", 102 | " moviesByUser[str(userId)]=[]\n", 103 | " \n", 104 | "with open('ua.base.shuffled','r') as f:\n", 105 | " samples=csv.reader(f,delimiter='\\t')\n", 106 | " for userId,movieId,rating,timestamp in samples:\n", 107 | " moviesByUser[str(int(userId)-1)].append(int(movieId)-1) " 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "def loadDataset(filename, lines, columns):\n", 117 | " # Features are one-hot encoded in a sparse matrix\n", 118 | " X = lil_matrix((lines, columns)).astype('float32')\n", 119 | " # Labels are stored in a vector\n", 120 | " Y = []\n", 121 | " line=0\n", 122 | " with open(filename,'r') as f:\n", 123 | " samples=csv.reader(f,delimiter='\\t')\n", 124 | " for userId,movieId,rating,timestamp in samples:\n", 125 | " X[line,int(userId)-1] = 1\n", 126 | " X[line,int(nbUsers)+int(movieId)-1] = 1\n", 127 | " if int(rating) >= 4:\n", 128 | " Y.append(1)\n", 129 | " else:\n", 130 | " Y.append(0)\n", 131 | " line=line+1\n", 132 | " \n", 133 | " Y=np.array(Y).astype('float32')\n", 134 | " return X,Y" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "X_train, Y_train = loadDataset('ua.base.shuffled', nbRatingsTrain, nbFeatures)\n", 144 | "X_test, Y_test = loadDataset('ua.test',nbRatingsTest,nbFeatures)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "print(X_train.shape)\n", 154 | "print(Y_train.shape)\n", 155 | "assert X_train.shape == (nbRatingsTrain, nbFeatures)\n", 156 | "assert Y_train.shape == (nbRatingsTrain, )\n", 157 | "zero_labels = np.count_nonzero(Y_train)\n", 158 | "print(\"Training labels: %d zeros, %d ones\" % (zero_labels, nbRatingsTrain-zero_labels))\n", 159 | "\n", 160 | "print(X_test.shape)\n", 161 | "print(Y_test.shape)\n", 162 | "assert X_test.shape == (nbRatingsTest, nbFeatures)\n", 163 | "assert Y_test.shape == (nbRatingsTest, )\n", 164 | "zero_labels = np.count_nonzero(Y_test)\n", 165 | "print(\"Test labels: %d zeros, %d ones\" % (zero_labels, nbRatingsTest-zero_labels))" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### Convert to protobuf and save to S3" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "import sagemaker\n", 182 | "\n", 183 | "sess = sagemaker.Session()\n", 184 | "bucket = sess.default_bucket()\n", 185 | "\n", 186 | "prefix = 'sagemaker/fm-movielens'\n", 187 | "\n", 188 | "train_key = 'train.protobuf'\n", 189 | "train_prefix = '{}/{}'.format(prefix, 'train3')\n", 190 | "\n", 191 | "test_key = 'test.protobuf'\n", 192 | "test_prefix = '{}/{}'.format(prefix, 'test3')\n", 193 | "\n", 194 | "output_prefix = 's3://{}/{}/output'.format(bucket, prefix)" 195 | ] 196 | }, 197 | { 198 | "cell_type": 
"code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "def writeDatasetToProtobuf(X, Y, bucket, prefix, key):\n", 204 | " buf = io.BytesIO()\n", 205 | " smac.write_spmatrix_to_sparse_tensor(buf, X, Y)\n", 206 | " buf.seek(0)\n", 207 | " obj = '{}/{}'.format(prefix, key)\n", 208 | " boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)\n", 209 | " return 's3://{}/{}'.format(bucket,obj)\n", 210 | " \n", 211 | "train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key) \n", 212 | "test_data = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key) \n", 213 | " \n", 214 | "print(train_data)\n", 215 | "print(test_data)\n", 216 | "print('Output: {}'.format(output_prefix))" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "### Run training job" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',\n", 233 | " 'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',\n", 234 | " 'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',\n", 235 | " 'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest'}" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "%%time\n", 245 | "\n", 246 | "fm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],\n", 247 | " get_execution_role(), \n", 248 | " train_instance_count=1, \n", 249 | " train_instance_type='ml.c4.xlarge',\n", 250 | " output_path=output_prefix,\n", 251 | " sagemaker_session=sagemaker.Session())\n", 252 | "\n", 253 | "fm.set_hyperparameters(feature_dim=nbFeatures,\n", 254 | " predictor_type='binary_classifier',\n", 255 | " mini_batch_size=1000,\n", 256 | " num_factors=64,\n", 257 | " epochs=100)\n", 258 | "\n", 259 | "fm.fit({'train': train_data, 'test': test_data})" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "### Deploy model" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "scrolled": true 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "%%time\n", 278 | "\n", 279 | "fm_predictor = fm.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "def fm_serializer(data):\n", 289 | " js = {'instances': []}\n", 290 | " for row in data:\n", 291 | " js['instances'].append({'features': row.tolist()})\n", 292 | " #print js\n", 293 | " return json.dumps(js)\n", 294 | "\n", 295 | "fm_predictor.content_type = 'application/json'\n", 296 | "fm_predictor.serializer = fm_serializer\n", 297 | "fm_predictor.deserializer = json_deserializer" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "## Run predictions\n", 305 | "First let's run a prediction test against our set-aside data" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "result = 
fm_predictor.predict(X_test[1000:1010].toarray())\n", 315 | "print(result)\n", 316 | "print (Y_test[1000:1010])" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "%%time\n", 326 | "\n", 327 | "correct_predictions = 0\n", 328 | "total_predictions = X_test.shape[0]\n", 329 | "for i in range(total_predictions):\n", 330 | " result = fm_predictor.predict(X_test[i].toarray())\n", 331 | " if(int(result['predictions'][0]['predicted_label']) == int(Y_test[i])):\n", 332 | " correct_predictions += 1\n", 333 | " #print(\"match: \" + str(result['predictions'][0]['predicted_label']) + \" \" + str(Y_test[i]))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "accuracy_predictions = correct_predictions/(total_predictions*1.0)\n", 343 | "print('Total predictions: {}'.format(total_predictions)) \n", 344 | "print('Correct predictions: {}'.format(correct_predictions))\n", 345 | "print('Accuracy: {}%'.format(accuracy_predictions*100))" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "import sagemaker\n", 355 | "sagemaker.Session().delete_endpoint(fm_predictor.endpoint)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [] 364 | } 365 | ], 366 | "metadata": { 367 | "kernelspec": { 368 | "display_name": "conda_python2", 369 | "language": "python", 370 | "name": "conda_python2" 371 | }, 372 | "language_info": { 373 | "codemirror_mode": { 374 | "name": "ipython", 375 | "version": 2 376 | }, 377 | "file_extension": ".py", 378 | "mimetype": "text/x-python", 379 | "name": "python", 380 | "nbconvert_exporter": "python", 381 | "pygments_lexer": "ipython2", 382 | "version": "2.7.14" 383 | } 384 | }, 385 | "nbformat": 4, 386 | "nbformat_minor": 2 387 | } 388 | -------------------------------------------------------------------------------- /notebooks/03_factorization_machines_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Movie recommendation on Amazon SageMaker with Factorization Machines\n", 8 | "\n", 9 | "See Julien's original post here:\n", 10 | "https://medium.com/@julsimon/building-a-movie-recommender-with-factorization-machines-on-amazon-sagemaker-cedbfc8c93d8\n", 11 | "\n", 12 | "This notebook:\n", 13 | "https://raw.githubusercontent.com/juliensimon/dlnotebooks/master/sagemaker/03-Factorization-Machines-Movielens.ipynb" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### Download ml-100k dataset" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "!rm -f /tmp/dataset.zip\n", 30 | "!rm -rf /tmp/dataset\n", 31 | "!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip -O /tmp/dataset.zip\n", 32 | "!unzip -j -o /tmp/dataset.zip -d /tmp/dataset" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "%cd /tmp/dataset\n", 42 | "!shuf ua.base -o ua.base.shuffled\n", 43 | "!head -10 ua.base.shuffled" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | 
"execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "!head -10 ua.test" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import sagemaker\n", 62 | "import sagemaker.amazon.common as smac\n", 63 | "from sagemaker import get_execution_role\n", 64 | "from sagemaker.predictor import json_deserializer\n", 65 | "\n", 66 | "import boto3, csv, io, json\n", 67 | "import numpy as np\n", 68 | "from scipy.sparse import lil_matrix" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Build training set and test set" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "nbUsers=943\n", 85 | "nbMovies=1682\n", 86 | "nbFeatures=nbUsers+nbMovies\n", 87 | "\n", 88 | "nbRatingsTrain=90570\n", 89 | "nbRatingsTest=9430" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# For each user, build a list of rated movies.\n", 99 | "# We'd need this to add random negative samples.\n", 100 | "moviesByUser = {}\n", 101 | "for userId in range(nbUsers):\n", 102 | " moviesByUser[str(userId)]=[]\n", 103 | " \n", 104 | "with open('ua.base.shuffled','r') as f:\n", 105 | " samples=csv.reader(f,delimiter='\\t')\n", 106 | " for userId,movieId,rating,timestamp in samples:\n", 107 | " moviesByUser[str(int(userId)-1)].append(int(movieId)-1) " 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "def loadDataset(filename, lines, columns):\n", 117 | " # Features are one-hot encoded in a sparse matrix\n", 118 | " X = lil_matrix((lines, columns)).astype('float32')\n", 119 | " # Labels are stored in a vector\n", 120 | " Y = []\n", 121 | " line=0\n", 122 | " with open(filename,'r') as f:\n", 123 | " samples=csv.reader(f,delimiter='\\t')\n", 124 | " for userId,movieId,rating,timestamp in samples:\n", 125 | " X[line,int(userId)-1] = 1\n", 126 | " X[line,int(nbUsers)+int(movieId)-1] = 1\n", 127 | " if int(rating) >= 4:\n", 128 | " Y.append(1)\n", 129 | " else:\n", 130 | " Y.append(0)\n", 131 | " line=line+1\n", 132 | " \n", 133 | " Y=np.array(Y).astype('float32')\n", 134 | " return X,Y" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "X_train, Y_train = loadDataset('ua.base.shuffled', nbRatingsTrain, nbFeatures)\n", 144 | "X_test, Y_test = loadDataset('ua.test',nbRatingsTest,nbFeatures)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "print(X_train.shape)\n", 154 | "print(Y_train.shape)\n", 155 | "assert X_train.shape == (nbRatingsTrain, nbFeatures)\n", 156 | "assert Y_train.shape == (nbRatingsTrain, )\n", 157 | "zero_labels = np.count_nonzero(Y_train)\n", 158 | "print(\"Training labels: %d zeros, %d ones\" % (zero_labels, nbRatingsTrain-zero_labels))\n", 159 | "\n", 160 | "print(X_test.shape)\n", 161 | "print(Y_test.shape)\n", 162 | "assert X_test.shape == (nbRatingsTest, nbFeatures)\n", 163 | "assert Y_test.shape == (nbRatingsTest, )\n", 164 | "zero_labels = np.count_nonzero(Y_test)\n", 165 | "print(\"Test labels: %d zeros, %d ones\" % (zero_labels, nbRatingsTest-zero_labels))" 166 | ] 167 | 
}, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### Convert to protobuf and save to S3" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "bucket = 'change-me'\n", 182 | "prefix = 'sagemaker/fm-movielens'\n", 183 | "\n", 184 | "train_key = 'train.protobuf'\n", 185 | "train_prefix = '{}/{}'.format(prefix, 'train3')\n", 186 | "\n", 187 | "test_key = 'test.protobuf'\n", 188 | "test_prefix = '{}/{}'.format(prefix, 'test3')\n", 189 | "\n", 190 | "output_prefix = 's3://{}/{}/output'.format(bucket, prefix)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "assert(bucket!='change-me'), \"Please change your bucket id\"" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "def writeDatasetToProtobuf(X, Y, bucket, prefix, key):\n", 209 | " buf = io.BytesIO()\n", 210 | " smac.write_spmatrix_to_sparse_tensor(buf, X, Y)\n", 211 | " buf.seek(0)\n", 212 | " obj = '{}/{}'.format(prefix, key)\n", 213 | " boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)\n", 214 | " return 's3://{}/{}'.format(bucket,obj)\n", 215 | " \n", 216 | "train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key) \n", 217 | "test_data = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key) \n", 218 | " \n", 219 | "print(train_data)\n", 220 | "print(test_data)\n", 221 | "print('Output: {}'.format(output_prefix))" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "### Run training job" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',\n", 238 | " 'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',\n", 239 | " 'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',\n", 240 | " 'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest'}" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "%%time\n", 250 | "\n", 251 | "fm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],\n", 252 | " get_execution_role(), \n", 253 | " train_instance_count=1, \n", 254 | " train_instance_type='ml.c4.xlarge',\n", 255 | " output_path=output_prefix,\n", 256 | " sagemaker_session=sagemaker.Session())\n", 257 | "\n", 258 | "fm.set_hyperparameters(feature_dim=nbFeatures,\n", 259 | " predictor_type='binary_classifier',\n", 260 | " mini_batch_size=1000,\n", 261 | " num_factors=64,\n", 262 | " _speedometer_period=10,\n", 263 | " epochs=100)\n", 264 | "\n", 265 | "fm.fit({'train': train_data, 'test': test_data})" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "### Deploy model" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": { 279 | "scrolled": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "%%time\n", 284 | "\n", 285 | "fm_predictor = fm.deploy(instance_type='ml.c4.xlarge', 
initial_instance_count=1)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "def fm_serializer(data):\n", 295 | " js = {'instances': []}\n", 296 | " for row in data:\n", 297 | " js['instances'].append({'features': row.tolist()})\n", 298 | " #print js\n", 299 | " return json.dumps(js)\n", 300 | "\n", 301 | "fm_predictor.content_type = 'application/json'\n", 302 | "fm_predictor.serializer = fm_serializer\n", 303 | "fm_predictor.deserializer = json_deserializer" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## Run predictions\n", 311 | "First let's run a prediction test against our set-aside data" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "result = fm_predictor.predict(X_test[1000:1010].toarray())\n", 321 | "print(result)\n", 322 | "print (Y_test[1000:1010])" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "%%time\n", 332 | "\n", 333 | "correct_predictions = 0\n", 334 | "total_predictions = X_test.shape[0]\n", 335 | "for i in range(total_predictions):\n", 336 | " result = fm_predictor.predict(X_test[i].toarray())\n", 337 | " if(int(result['predictions'][0]['predicted_label']) == int(Y_test[i])):\n", 338 | " correct_predictions += 1\n", 339 | " #print(\"match: \" + str(result['predictions'][0]['predicted_label']) + \" \" + str(Y_test[i]))" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "accuracy_predictions = correct_predictions/(total_predictions*1.0)\n", 349 | "print('Total predictions: {}'.format(total_predictions)) \n", 350 | "print('Correct predictions: {}'.format(correct_predictions))\n", 351 | "print('Accuracy: {}%'.format(accuracy_predictions*100))" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "import sagemaker\n", 361 | "sagemaker.Session().delete_endpoint(fm_predictor.endpoint)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [] 370 | } 371 | ], 372 | "metadata": { 373 | "kernelspec": { 374 | "display_name": "conda_python2", 375 | "language": "python", 376 | "name": "conda_python2" 377 | }, 378 | "language_info": { 379 | "codemirror_mode": { 380 | "name": "ipython", 381 | "version": 2 382 | }, 383 | "file_extension": ".py", 384 | "mimetype": "text/x-python", 385 | "name": "python", 386 | "nbconvert_exporter": "python", 387 | "pygments_lexer": "ipython2", 388 | "version": "2.7.14" 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 2 393 | } 394 | --------------------------------------------------------------------------------
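A closing note on `03_factorization_machines_regression.ipynb`: the built-in factorization machines algorithm also supports `predictor_type='regressor'`, which predicts the 1-5 rating directly instead of a like/dislike label. The sketch below is an illustrative variant, not the notebook's exact code: it reuses the notebook's `nbUsers`, `nbFeatures`, `nbRatingsTrain`, `nbRatingsTest`, `bucket`, `train_prefix`, `test_prefix`, `writeDatasetToProtobuf` and `fm` names, while the `loadDatasetRegression` helper and the `*_reg.protobuf` keys are made up for this example.

```python
# Illustrative sketch (assumed variant, not the notebook's code): rating regression with FM.
import csv
import numpy as np
from scipy.sparse import lil_matrix

def loadDatasetRegression(filename, lines, columns):
    # Same one-hot user/movie encoding as loadDataset above, but the label
    # keeps the raw 1-5 rating instead of being binarized at rating >= 4.
    X = lil_matrix((lines, columns)).astype('float32')
    Y = []
    with open(filename, 'r') as f:
        for line, (userId, movieId, rating, timestamp) in enumerate(csv.reader(f, delimiter='\t')):
            X[line, int(userId) - 1] = 1
            X[line, nbUsers + int(movieId) - 1] = 1
            Y.append(float(rating))
    return X, np.array(Y).astype('float32')

X_train, Y_train = loadDatasetRegression('ua.base.shuffled', nbRatingsTrain, nbFeatures)
X_test, Y_test = loadDatasetRegression('ua.test', nbRatingsTest, nbFeatures)

# Re-upload the protobuf datasets under new keys so the binarized files are untouched.
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, 'train_reg.protobuf')
test_data = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, 'test_reg.protobuf')

# Only predictor_type changes on the estimator; the algorithm then reports RMSE
# on the test channel instead of binary classification accuracy.
fm.set_hyperparameters(feature_dim=nbFeatures,
                       predictor_type='regressor',
                       mini_batch_size=1000,
                       num_factors=64,
                       epochs=100)
fm.fit({'train': train_data, 'test': test_data})
```

For a regressor, the deployed endpoint's JSON response should carry a `score` per instance rather than a `predicted_label`, so the evaluation loop would compute RMSE against the raw test ratings instead of accuracy.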