├── .gitignore ├── README.md ├── cfn_setup.yaml ├── decks ├── 01-Recommender_Workshop_01_Introduction.pdf ├── 02-Recommender_Workshop_02_Collaborative_Filtering.pdf ├── 03-Recommender_Workshop_03_Matrix_Factorization.pdf └── 04-Recommender_Workshop_04_Tuning.pdf └── notebooks ├── 01_exploring_data.ipynb ├── 02_clustering_users.ipynb ├── 03_factorization_machines.ipynb └── 03_factorization_machines_regression.ipynb /.gitignore: -------------------------------------------------------------------------------- 1 | .ipynb*/** 2 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon SageMaker Recommender Workshop 2 | 3 | ## Agenda 4 | 5 | - [0.75] Introduction 6 | - [0.5] Overview of ML workflow -> decks/01-Recommender_Workshop_01_Introduction.pdf 7 | - [0.25] MovieLens data intro -> notebooks/01_exploring_data.ipynb 8 | - preflight: CFN template check -> cfn_setup.yaml 9 | - launch individual notebook instances 10 | - which notebook instance sizes? 11 | - clone repo -> http://bit.ly/2wkaV0N (this repo) 12 | - [0.5] Unsupervised ML 13 | - concepts 14 | - hands-on 15 | - [0.25] **Break** 16 | - [1] Recommendation Engine 17 | - [0.25] concepts 18 | - [0.75] hands-on 19 | - show movie recommender website: http://sagemaker-nab-demo.s3-website-us-west-2.amazonaws.com/ 20 | - clone repo to individual notebook instances 21 | - run all at beginning of workshop 22 | - TBD: perform training locally? 23 | - during training: discussion of underlying mechanisms for SageMaker (Batch, ECR, etc.) 24 | 25 | - [0.5] Hyperparameter Optimization (PPT+Demo) 26 | - overview 27 | - demo w/ XGBoost 28 | - [0.25] Wrap-up (PPT) 29 | - Other resources 30 | - Next steps 31 | 32 | 33 | ## References 34 | 35 | - https://grouplens.org/datasets/movielens/ 36 | - https://medium.com/@julsimon/building-a-movie-recommender-with-factorization-machines-on-amazon-sagemaker-cedbfc8c93d8 37 | - http://sagemaker-nab-demo.s3-website-us-west-2.amazonaws.com/ 38 | - https://sagemaker.readthedocs.io/en/latest/factorization_machines.html -------------------------------------------------------------------------------- /cfn_setup.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Parameters: 3 | InputBucketName: 4 | Type: "String" 5 | Description: "Enter a globally unique name for the S3 bucket that will store workshop data" 6 | Resources: 7 | InputBucket: 8 | Type: 'AWS::S3::Bucket' 9 | Properties: 10 | BucketName: 11 | Ref: InputBucketName 12 | DeletionPolicy: Delete 13 | SagemakerRole: 14 | Type: "AWS::IAM::Role" 15 | Properties: 16 | AssumeRolePolicyDocument: 17 | Version: "2012-10-17" 18 | Statement: 19 | - 20 | Effect: "Allow" 21 | Principal: 22 | Service: 23 | - "sagemaker.amazonaws.com" 24 | Action: 25 | - "sts:AssumeRole" 26 | Path: "/" 27 | Policies: 28 | - 29 | PolicyName: "SagemakerAccessResources" 30 | PolicyDocument: 31 | Version: "2012-10-17" 32 | Statement: 33 | - 34 | Effect: "Allow" 35 | Action: 36 | - "sagemaker:*" 37 | - "ecr:GetAuthorizationToken" 38 | - "cloudwatch:PutMetricData" 39 | - "logs:CreateLogGroup" 40 | - "logs:CreateLogStream" 41 | - "logs:DescribeLogStreams" 42 | - "logs:PutLogEvents" 43 | - "logs:GetLogEvents" 44 | - "ec2:CreateNetworkInterface" 45 | - "ec2:CreateNetworkInterfacePermission" 46 | - "ec2:DeleteNetworkInterface" 47 | - "ec2:DeleteNetworkInterfacePermission" 48 | - "ec2:DescribeNetworkInterfaces" 49 | - "ec2:DescribeVpcs"
50 | - "ec2:DescribeDhcpOptions" 51 | - "ec2:DescribeSubnets" 52 | - "ec2:DescribeSecurityGroups" 53 | - "application-autoscaling:DeleteScalingPolicy" 54 | - "application-autoscaling:DeleteScheduledAction" 55 | - "application-autoscaling:DeregisterScalableTarget" 56 | - "application-autoscaling:DescribeScalableTargets" 57 | - "application-autoscaling:DescribeScalingActivities" 58 | - "application-autoscaling:DescribeScalingPolicies" 59 | - "application-autoscaling:DescribeScheduledActions" 60 | - "application-autoscaling:PutScalingPolicy" 61 | - "application-autoscaling:PutScheduledAction" 62 | - "application-autoscaling:RegisterScalableTarget" 63 | Resource: "*" 64 | - 65 | Effect: "Allow" 66 | Action: 67 | - "iam:PassRole" 68 | Resource: "*" 69 | Condition: 70 | StringEquals: 71 | "iam:PassedToService": "sagemaker.amazonaws.com" 72 | - 73 | Effect: "Allow" 74 | Action: 75 | - "s3:ListBucket" 76 | Resource: 77 | - 78 | !Join 79 | - '' 80 | - - 'arn:aws:s3:::' 81 | - !Ref InputBucket 82 | - 83 | Effect: "Allow" 84 | Action: 85 | - "s3:*" 86 | Resource: 87 | - 88 | !Join 89 | - '' 90 | - - 'arn:aws:s3:::' 91 | - !Ref InputBucket 92 | - '/*' 93 | - 94 | Effect: "Allow" 95 | Action: 96 | - "ecr:BatchCheckLayerAvailability" 97 | - "ecr:GetDownloadUrlForLayer" 98 | - "ecr:BatchGetImage" 99 | Resource: 100 | - "arn:aws:ecr:::repository/*" 101 | - 102 | Effect: "Allow" 103 | Action: 104 | - "iam:CreateServiceLinkedRole" 105 | Resource: 106 | - "arn:aws:iam::*:role/aws-service-role/sagemaker.application-autoscaling.amazonaws.com/AWSServiceRoleForApplicationAutoScaling_SageMakerEndpoint" 107 | Condition: 108 | StringLike: 109 | "iam:AWSServiceName": "sagemaker.application-autoscaling.amazonaws.com" 110 | Outputs: 111 | SageMakerRoleArn: 112 | Description: The Arn IAM Role for SageMaker to access S3 buckets and other resources 113 | Value: !GetAtt [ SagemakerRole, Arn ] -------------------------------------------------------------------------------- /decks/01-Recommender_Workshop_01_Introduction.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shirkeyaws/sagemaker-recommender-workshop/90b80dd3e7326386ec6174b0dff49c5044ced15b/decks/01-Recommender_Workshop_01_Introduction.pdf -------------------------------------------------------------------------------- /decks/02-Recommender_Workshop_02_Collaborative_Filtering.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shirkeyaws/sagemaker-recommender-workshop/90b80dd3e7326386ec6174b0dff49c5044ced15b/decks/02-Recommender_Workshop_02_Collaborative_Filtering.pdf -------------------------------------------------------------------------------- /decks/03-Recommender_Workshop_03_Matrix_Factorization.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shirkeyaws/sagemaker-recommender-workshop/90b80dd3e7326386ec6174b0dff49c5044ced15b/decks/03-Recommender_Workshop_03_Matrix_Factorization.pdf -------------------------------------------------------------------------------- /decks/04-Recommender_Workshop_04_Tuning.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shirkeyaws/sagemaker-recommender-workshop/90b80dd3e7326386ec6174b0dff49c5044ced15b/decks/04-Recommender_Workshop_04_Tuning.pdf -------------------------------------------------------------------------------- 
/notebooks/01_exploring_data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Exploring data\n", 8 | "\n", 9 | "We will use the MovieLens 100K dataset for performing our initial survey of user and movie data. \n", 10 | "\n", 11 | "See http://files.grouplens.org/datasets/movielens/ml-100k-README.txt for more information on this dataset." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pandas as pd\n", 21 | "import matplotlib.pyplot as plt\n", 22 | "%pylab inline" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "!rm -f /tmp/ml-100k.zip\n", 32 | "!rm -rf /tmp/ml-100k\n", 33 | "!wget -O /tmp/ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip\n", 34 | "!unzip -j -o /tmp/ml-100k.zip -d /tmp/ml-100k" 35 | ] 36 | }, 37 | { 38 | "cell_type": "markdown", 39 | "metadata": {}, 40 | "source": [ 41 | "# MovieLens Dataset Info\n", 42 | "\n", 43 | "We can parse the file u.info to find details about this dataset\n", 44 | "\n", 45 | "```\n", 46 | "u.info -- The number of users, items, and ratings in the u data set.\n", 47 | "```" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "%cd /tmp/ml-100k\n", 57 | "!cat u.info" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Movies\n", 65 | "\n", 66 | "Let's start by looking at the movie data which represent the items that are rated and recommended\n", 67 | "\n", 68 | "First, we will look at the u.genre to understand how these movies are categorized" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [ 77 | "!cat u.genre" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "To find the information about each movie title in our dataset, we check the u.item file\n", 85 | "\n", 86 | "```\n", 87 | "u.item -- Information about the items (movies); this is a tab separated\n", 88 | " list of\n", 89 | " movie id | movie title | release date | video release date |\n", 90 | " IMDb URL | unknown | Action | Adventure | Animation |\n", 91 | " Children's | Comedy | Crime | Documentary | Drama | Fantasy |\n", 92 | " Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |\n", 93 | " Thriller | War | Western |\n", 94 | " The last 19 fields are the genres, a 1 indicates the movie\n", 95 | " is of that genre, a 0 indicates it is not; movies can be in\n", 96 | " several genres at once.\n", 97 | " The movie ids are the ones used in the u.data data set.\n", 98 | "```" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": {}, 105 | "outputs": [], 106 | "source": [ 107 | "!head -10 u.item" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "The list of items has several columns of 0/1 which represent the one-hot encoding of genere information -- we'll add that to our table header as we load the data in a Pandas dataframe" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": null, 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "items = 
pd.read_csv(\"u.item\", encoding = \"ISO-8859-1\", sep='|', header=None,\n", 124 | " names=['title', 'published','', 'url','genre_unknown', 'genre_action', 'genre_adventure', 'genre_animation', 'genre_childrens','genre_comedy','genre_crime','genre_documentary','genre_drama','genre_fantasy','genre_film-noir','genre_horror','genre_musical','genre_mystery','genre_romance','genre_scifi','genre_thriller','genre_war','genre_western'])\n", 125 | "items.head()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## Users\n", 133 | "\n", 134 | "We have some limited demographics data about users:\n", 135 | "\n", 136 | "```\n", 137 | "u.user -- Demographic information about the users; this is a tab\n", 138 | " separated list of\n", 139 | " user id | age | gender | occupation | zip code\n", 140 | " The user ids are the ones used in the u.data data set.\n", 141 | "```\n", 142 | "Let's have a look" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "!cat u.user" 152 | ] 153 | }, 154 | { 155 | "cell_type": "markdown", 156 | "metadata": {}, 157 | "source": [ 158 | "Users are identified by age, gender, occupation and postal code" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": null, 164 | "metadata": {}, 165 | "outputs": [], 166 | "source": [ 167 | "users = pd.read_csv(\"u.user\", sep='|', header=None, \n", 168 | " names=['userid', 'age', 'gender', 'occupation', 'postal_code'])\n", 169 | "users.head()" 170 | ] 171 | }, 172 | { 173 | "cell_type": "code", 174 | "execution_count": null, 175 | "metadata": {}, 176 | "outputs": [], 177 | "source": [ 178 | "users['age'].hist(bins=10)" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "users['gender'].hist(bins=2)" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": {}, 193 | "source": [ 194 | "*We'll return to users analysis later when we perform user clustering in 02_clustering_users.ipynb*" 195 | ] 196 | }, 197 | { 198 | "cell_type": "markdown", 199 | "metadata": {}, 200 | "source": [ 201 | "# Ratings\n", 202 | "\n", 203 | "Let's now have a look at the ratings data -- the u.data file contains individual user preference for movies/items as specified in 1 (least liked) to 5(most liked) ratings for a limited set of movies.\n", 204 | "\n", 205 | "```\n", 206 | "u.data -- The full u data set, 100000 ratings by 943 users on 1682 items.\n", 207 | " Each user has rated at least 20 movies. Users and items are\n", 208 | " numbered consecutively from 1. The data is randomly\n", 209 | " ordered. This is a tab separated list of \n", 210 | "\t user id | item id | rating | timestamp. 
\n", 211 | " The time stamps are unix seconds since 1/1/1970 UTC \n", 212 | "```" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "data = pd.read_csv(\"u.data\", sep='\\t', header=None, \n", 222 | " names=['userid', 'movieid', 'rating', 'timestamp'])\n", 223 | "data.head()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "print(\"Number of Users: %d\" % (data['userid'].max()))\n", 233 | "print(\"Number of Movies: %d\" % (data['movieid'].max()))" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "by_user = data.groupby('userid')\n", 243 | "ratings_hist = by_user['movieid'].count().hist(bins=100)\n", 244 | "ratings_hist.set_title('Ratings count distribution')" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": null, 250 | "metadata": {}, 251 | "outputs": [], 252 | "source": [ 253 | "cust_size = by_user.size()\n", 254 | "cust_size.sample(random_state=42)\n", 255 | "\n", 256 | "#print(cust_size)\n", 257 | "#cust_size.plot(kind='bar')\n", 258 | "#cust_size.count()" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": {}, 265 | "outputs": [], 266 | "source": [ 267 | "data['rating'].describe()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": {}, 274 | "outputs": [], 275 | "source": [ 276 | "toy_story=data[(data.movieid == 1)]\n", 277 | "toy_story.head()\n", 278 | "#toy_story['rating'].hist()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": {}, 285 | "outputs": [], 286 | "source": [ 287 | "toy_story['rating'].mean()" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "toy_story['rating'].hist()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": null, 302 | "metadata": {}, 303 | "outputs": [], 304 | "source": [] 305 | } 306 | ], 307 | "metadata": { 308 | "kernelspec": { 309 | "display_name": "conda_python3", 310 | "language": "python", 311 | "name": "conda_python3" 312 | }, 313 | "language_info": { 314 | "codemirror_mode": { 315 | "name": "ipython", 316 | "version": 3 317 | }, 318 | "file_extension": ".py", 319 | "mimetype": "text/x-python", 320 | "name": "python", 321 | "nbconvert_exporter": "python", 322 | "pygments_lexer": "ipython3", 323 | "version": "3.6.4" 324 | } 325 | }, 326 | "nbformat": 4, 327 | "nbformat_minor": 2 328 | } 329 | -------------------------------------------------------------------------------- /notebooks/02_clustering_users.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pandas as pd\n", 10 | "import numpy as np\n", 11 | "import sagemaker\n", 12 | "from sagemaker import get_execution_role\n", 13 | "\n", 14 | "role = get_execution_role()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": null, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "sess = sagemaker.Session()\n", 24 | "bucket = sess.default_bucket()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 
29 | "execution_count": null, 30 | "metadata": {}, 31 | "outputs": [], 32 | "source": [ 33 | "# NOTE: this step is not strictly required if you've already run the 01_... notebook\n", 34 | "!rm -f /tmp/ml-100k.zip\n", 35 | "!rm -rf /tmp/ml-100k\n", 36 | "!wget -O /tmp/ml-100k.zip http://files.grouplens.org/datasets/movielens/ml-100k.zip\n", 37 | "!unzip -j -o /tmp/ml-100k.zip -d /tmp/ml-100k" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "users = pd.read_csv(\"/tmp/ml-100k/u.user\", sep='|', header=None, index_col=['userid'],\n", 47 | " names=['userid', 'age', 'gender', 'occupation', 'postal_code'])\n", 48 | "users = users.drop('postal_code',1)\n", 49 | "users.head()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "users_onehot=pd.get_dummies(users)\n", 59 | "users_onehot.head()" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": {}, 66 | "outputs": [], 67 | "source": [ 68 | "users_onehot_df = users_onehot.values.astype(np.float32) # built-in K-Means requires float32\n", 69 | "print(users_onehot_df)" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": null, 75 | "metadata": {}, 76 | "outputs": [], 77 | "source": [ 78 | "from sagemaker import KMeans\n", 79 | "\n", 80 | "data_location = 's3://{}/recommender_workshop/kmeans/data'.format(bucket)\n", 81 | "output_location = 's3://{}/recommender_workshop/kmeans/output'.format(bucket)\n", 82 | "\n", 83 | "print('training data will be uploaded to: {}'.format(data_location))\n", 84 | "print('training artifacts will be uploaded to: {}'.format(output_location))\n", 85 | "\n", 86 | "#!aws s3 cp /tmp/ml-100k/u.user $data_location/u.user\n", 87 | "\n", 88 | "k_value=5 #number of clusters\n", 89 | "kmeans = KMeans(role=role,\n", 90 | " train_instance_count=1,\n", 91 | " train_instance_type='ml.c4.2xlarge',\n", 92 | " output_path=output_location,\n", 93 | " k=k_value,\n", 94 | " data_location=data_location)" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "%%time\n", 104 | "\n", 105 | "kmeans.fit(kmeans.record_set(users_onehot_df))" 106 | ] 107 | }, 108 | { 109 | "cell_type": "code", 110 | "execution_count": null, 111 | "metadata": {}, 112 | "outputs": [], 113 | "source": [ 114 | "%%time\n", 115 | "\n", 116 | "kmeans_predictor = kmeans.deploy(initial_instance_count=1,\n", 117 | " instance_type='ml.m4.xlarge')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": null, 123 | "metadata": {}, 124 | "outputs": [], 125 | "source": [ 126 | "pd.get_dummies(users.head())" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "result = kmeans_predictor.predict(users_onehot_df[0:5])\n", 136 | "print(result)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "result = kmeans_predictor.predict(users_onehot_df)\n", 146 | "\n", 147 | "cluster=[]\n", 148 | "for i in range(k_value):\n", 149 | " cluster.append([r.label['distance_to_cluster'].float32_tensor.values[0] for r in result if r.label['closest_cluster'].float32_tensor.values[0] == i])\n", 150 | "\n", 151 | "cluster_zip = sorted(zip())\n" 152 | ] 153 | }, 
154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "%matplotlib inline\n", 161 | "import matplotlib.pyplot as plt\n", 162 | "\n", 163 | "for i in range(k_value):\n", 164 | " fig,ax = plt.subplots()\n", 165 | " ax.hist(cluster[i])\n", 166 | "\n", 167 | "plt.plot()" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "import sagemaker\n", 177 | "sagemaker.Session().delete_endpoint(kmeans_predictor.endpoint)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": {}, 184 | "outputs": [], 185 | "source": [] 186 | } 187 | ], 188 | "metadata": { 189 | "kernelspec": { 190 | "display_name": "conda_python3", 191 | "language": "python", 192 | "name": "conda_python3" 193 | }, 194 | "language_info": { 195 | "codemirror_mode": { 196 | "name": "ipython", 197 | "version": 3 198 | }, 199 | "file_extension": ".py", 200 | "mimetype": "text/x-python", 201 | "name": "python", 202 | "nbconvert_exporter": "python", 203 | "pygments_lexer": "ipython3", 204 | "version": "3.6.4" 205 | } 206 | }, 207 | "nbformat": 4, 208 | "nbformat_minor": 2 209 | } 210 | -------------------------------------------------------------------------------- /notebooks/03_factorization_machines.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Movie recommendation on Amazon SageMaker with Factorization Machines\n", 8 | "\n", 9 | "See Julien's original post here:\n", 10 | "https://medium.com/@julsimon/building-a-movie-recommender-with-factorization-machines-on-amazon-sagemaker-cedbfc8c93d8\n", 11 | "\n", 12 | "This notebook:\n", 13 | "https://raw.githubusercontent.com/juliensimon/dlnotebooks/master/sagemaker/03-Factorization-Machines-Movielens.ipynb" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### Download ml-100k dataset" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "!rm -f /tmp/dataset.zip\n", 30 | "!rm -rf /tmp/dataset\n", 31 | "!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip -O /tmp/dataset.zip\n", 32 | "!unzip -j -o /tmp/dataset.zip -d /tmp/dataset" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "%cd /tmp/dataset\n", 42 | "!shuf ua.base -o ua.base.shuffled\n", 43 | "!head -10 ua.base.shuffled" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "!head -10 ua.test" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import sagemaker\n", 62 | "import sagemaker.amazon.common as smac\n", 63 | "from sagemaker import get_execution_role\n", 64 | "from sagemaker.predictor import json_deserializer\n", 65 | "\n", 66 | "import boto3, csv, io, json\n", 67 | "import numpy as np\n", 68 | "from scipy.sparse import lil_matrix" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Build training set and test set" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | 
"outputs": [], 83 | "source": [ 84 | "nbUsers=943\n", 85 | "nbMovies=1682\n", 86 | "nbFeatures=nbUsers+nbMovies\n", 87 | "\n", 88 | "nbRatingsTrain=90570\n", 89 | "nbRatingsTest=9430" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# For each user, build a list of rated movies.\n", 99 | "# We'd need this to add random negative samples.\n", 100 | "moviesByUser = {}\n", 101 | "for userId in range(nbUsers):\n", 102 | " moviesByUser[str(userId)]=[]\n", 103 | " \n", 104 | "with open('ua.base.shuffled','r') as f:\n", 105 | " samples=csv.reader(f,delimiter='\\t')\n", 106 | " for userId,movieId,rating,timestamp in samples:\n", 107 | " moviesByUser[str(int(userId)-1)].append(int(movieId)-1) " 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "def loadDataset(filename, lines, columns):\n", 117 | " # Features are one-hot encoded in a sparse matrix\n", 118 | " X = lil_matrix((lines, columns)).astype('float32')\n", 119 | " # Labels are stored in a vector\n", 120 | " Y = []\n", 121 | " line=0\n", 122 | " with open(filename,'r') as f:\n", 123 | " samples=csv.reader(f,delimiter='\\t')\n", 124 | " for userId,movieId,rating,timestamp in samples:\n", 125 | " X[line,int(userId)-1] = 1\n", 126 | " X[line,int(nbUsers)+int(movieId)-1] = 1\n", 127 | " if int(rating) >= 4:\n", 128 | " Y.append(1)\n", 129 | " else:\n", 130 | " Y.append(0)\n", 131 | " line=line+1\n", 132 | " \n", 133 | " Y=np.array(Y).astype('float32')\n", 134 | " return X,Y" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "X_train, Y_train = loadDataset('ua.base.shuffled', nbRatingsTrain, nbFeatures)\n", 144 | "X_test, Y_test = loadDataset('ua.test',nbRatingsTest,nbFeatures)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "print(X_train.shape)\n", 154 | "print(Y_train.shape)\n", 155 | "assert X_train.shape == (nbRatingsTrain, nbFeatures)\n", 156 | "assert Y_train.shape == (nbRatingsTrain, )\n", 157 | "zero_labels = np.count_nonzero(Y_train)\n", 158 | "print(\"Training labels: %d zeros, %d ones\" % (zero_labels, nbRatingsTrain-zero_labels))\n", 159 | "\n", 160 | "print(X_test.shape)\n", 161 | "print(Y_test.shape)\n", 162 | "assert X_test.shape == (nbRatingsTest, nbFeatures)\n", 163 | "assert Y_test.shape == (nbRatingsTest, )\n", 164 | "zero_labels = np.count_nonzero(Y_test)\n", 165 | "print(\"Test labels: %d zeros, %d ones\" % (zero_labels, nbRatingsTest-zero_labels))" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### Convert to protobuf and save to S3" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "import sagemaker\n", 182 | "\n", 183 | "sess = sagemaker.Session()\n", 184 | "bucket = sess.default_bucket()\n", 185 | "\n", 186 | "prefix = 'sagemaker/fm-movielens'\n", 187 | "\n", 188 | "train_key = 'train.protobuf'\n", 189 | "train_prefix = '{}/{}'.format(prefix, 'train3')\n", 190 | "\n", 191 | "test_key = 'test.protobuf'\n", 192 | "test_prefix = '{}/{}'.format(prefix, 'test3')\n", 193 | "\n", 194 | "output_prefix = 's3://{}/{}/output'.format(bucket, prefix)" 195 | ] 196 | }, 197 | { 198 | "cell_type": 
"code", 199 | "execution_count": null, 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "def writeDatasetToProtobuf(X, Y, bucket, prefix, key):\n", 204 | " buf = io.BytesIO()\n", 205 | " smac.write_spmatrix_to_sparse_tensor(buf, X, Y)\n", 206 | " buf.seek(0)\n", 207 | " obj = '{}/{}'.format(prefix, key)\n", 208 | " boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)\n", 209 | " return 's3://{}/{}'.format(bucket,obj)\n", 210 | " \n", 211 | "train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key) \n", 212 | "test_data = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key) \n", 213 | " \n", 214 | "print(train_data)\n", 215 | "print(test_data)\n", 216 | "print('Output: {}'.format(output_prefix))" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "### Run training job" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": null, 229 | "metadata": {}, 230 | "outputs": [], 231 | "source": [ 232 | "containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',\n", 233 | " 'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',\n", 234 | " 'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',\n", 235 | " 'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest'}" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "%%time\n", 245 | "\n", 246 | "fm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],\n", 247 | " get_execution_role(), \n", 248 | " train_instance_count=1, \n", 249 | " train_instance_type='ml.c4.xlarge',\n", 250 | " output_path=output_prefix,\n", 251 | " sagemaker_session=sagemaker.Session())\n", 252 | "\n", 253 | "fm.set_hyperparameters(feature_dim=nbFeatures,\n", 254 | " predictor_type='binary_classifier',\n", 255 | " mini_batch_size=1000,\n", 256 | " num_factors=64,\n", 257 | " epochs=100)\n", 258 | "\n", 259 | "fm.fit({'train': train_data, 'test': test_data})" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "### Deploy model" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": null, 272 | "metadata": { 273 | "scrolled": true 274 | }, 275 | "outputs": [], 276 | "source": [ 277 | "%%time\n", 278 | "\n", 279 | "fm_predictor = fm.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": null, 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "def fm_serializer(data):\n", 289 | " js = {'instances': []}\n", 290 | " for row in data:\n", 291 | " js['instances'].append({'features': row.tolist()})\n", 292 | " #print js\n", 293 | " return json.dumps(js)\n", 294 | "\n", 295 | "fm_predictor.content_type = 'application/json'\n", 296 | "fm_predictor.serializer = fm_serializer\n", 297 | "fm_predictor.deserializer = json_deserializer" 298 | ] 299 | }, 300 | { 301 | "cell_type": "markdown", 302 | "metadata": {}, 303 | "source": [ 304 | "## Run predictions\n", 305 | "First let's run a prediction test against our set-aside data" 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": {}, 312 | "outputs": [], 313 | "source": [ 314 | "result = 
fm_predictor.predict(X_test[1000:1010].toarray())\n", 315 | "print(result)\n", 316 | "print (Y_test[1000:1010])" 317 | ] 318 | }, 319 | { 320 | "cell_type": "code", 321 | "execution_count": null, 322 | "metadata": {}, 323 | "outputs": [], 324 | "source": [ 325 | "%%time\n", 326 | "\n", 327 | "correct_predictions = 0\n", 328 | "total_predictions = X_test.shape[0]\n", 329 | "for i in range(total_predictions):\n", 330 | " result = fm_predictor.predict(X_test[i].toarray())\n", 331 | " if(int(result['predictions'][0]['predicted_label']) == int(Y_test[i])):\n", 332 | " correct_predictions += 1\n", 333 | " #print(\"match: \" + str(result['predictions'][0]['predicted_label']) + \" \" + str(Y_test[i]))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": {}, 340 | "outputs": [], 341 | "source": [ 342 | "accuracy_predictions = correct_predictions/(total_predictions*1.0)\n", 343 | "print('Total predictions: {}'.format(total_predictions)) \n", 344 | "print('Correct predictions: {}'.format(correct_predictions))\n", 345 | "print('Accuracy: {}%'.format(accuracy_predictions*100))" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": null, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "import sagemaker\n", 355 | "sagemaker.Session().delete_endpoint(fm_predictor.endpoint)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": {}, 362 | "outputs": [], 363 | "source": [] 364 | } 365 | ], 366 | "metadata": { 367 | "kernelspec": { 368 | "display_name": "conda_python2", 369 | "language": "python", 370 | "name": "conda_python2" 371 | }, 372 | "language_info": { 373 | "codemirror_mode": { 374 | "name": "ipython", 375 | "version": 2 376 | }, 377 | "file_extension": ".py", 378 | "mimetype": "text/x-python", 379 | "name": "python", 380 | "nbconvert_exporter": "python", 381 | "pygments_lexer": "ipython2", 382 | "version": "2.7.14" 383 | } 384 | }, 385 | "nbformat": 4, 386 | "nbformat_minor": 2 387 | } 388 | -------------------------------------------------------------------------------- /notebooks/03_factorization_machines_regression.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Movie recommendation on Amazon SageMaker with Factorization Machines\n", 8 | "\n", 9 | "See Julien's original post here:\n", 10 | "https://medium.com/@julsimon/building-a-movie-recommender-with-factorization-machines-on-amazon-sagemaker-cedbfc8c93d8\n", 11 | "\n", 12 | "This notebook:\n", 13 | "https://raw.githubusercontent.com/juliensimon/dlnotebooks/master/sagemaker/03-Factorization-Machines-Movielens.ipynb" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": {}, 19 | "source": [ 20 | "### Download ml-100k dataset" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": null, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "!rm -f /tmp/dataset.zip\n", 30 | "!rm -rf /tmp/dataset\n", 31 | "!wget http://files.grouplens.org/datasets/movielens/ml-100k.zip -O /tmp/dataset.zip\n", 32 | "!unzip -j -o /tmp/dataset.zip -d /tmp/dataset" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "%cd /tmp/dataset\n", 42 | "!shuf ua.base -o ua.base.shuffled\n", 43 | "!head -10 ua.base.shuffled" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | 
"execution_count": null, 49 | "metadata": {}, 50 | "outputs": [], 51 | "source": [ 52 | "!head -10 ua.test" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": null, 58 | "metadata": {}, 59 | "outputs": [], 60 | "source": [ 61 | "import sagemaker\n", 62 | "import sagemaker.amazon.common as smac\n", 63 | "from sagemaker import get_execution_role\n", 64 | "from sagemaker.predictor import json_deserializer\n", 65 | "\n", 66 | "import boto3, csv, io, json\n", 67 | "import numpy as np\n", 68 | "from scipy.sparse import lil_matrix" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": {}, 74 | "source": [ 75 | "### Build training set and test set" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": null, 81 | "metadata": {}, 82 | "outputs": [], 83 | "source": [ 84 | "nbUsers=943\n", 85 | "nbMovies=1682\n", 86 | "nbFeatures=nbUsers+nbMovies\n", 87 | "\n", 88 | "nbRatingsTrain=90570\n", 89 | "nbRatingsTest=9430" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "# For each user, build a list of rated movies.\n", 99 | "# We'd need this to add random negative samples.\n", 100 | "moviesByUser = {}\n", 101 | "for userId in range(nbUsers):\n", 102 | " moviesByUser[str(userId)]=[]\n", 103 | " \n", 104 | "with open('ua.base.shuffled','r') as f:\n", 105 | " samples=csv.reader(f,delimiter='\\t')\n", 106 | " for userId,movieId,rating,timestamp in samples:\n", 107 | " moviesByUser[str(int(userId)-1)].append(int(movieId)-1) " 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "def loadDataset(filename, lines, columns):\n", 117 | " # Features are one-hot encoded in a sparse matrix\n", 118 | " X = lil_matrix((lines, columns)).astype('float32')\n", 119 | " # Labels are stored in a vector\n", 120 | " Y = []\n", 121 | " line=0\n", 122 | " with open(filename,'r') as f:\n", 123 | " samples=csv.reader(f,delimiter='\\t')\n", 124 | " for userId,movieId,rating,timestamp in samples:\n", 125 | " X[line,int(userId)-1] = 1\n", 126 | " X[line,int(nbUsers)+int(movieId)-1] = 1\n", 127 | " if int(rating) >= 4:\n", 128 | " Y.append(1)\n", 129 | " else:\n", 130 | " Y.append(0)\n", 131 | " line=line+1\n", 132 | " \n", 133 | " Y=np.array(Y).astype('float32')\n", 134 | " return X,Y" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": null, 140 | "metadata": {}, 141 | "outputs": [], 142 | "source": [ 143 | "X_train, Y_train = loadDataset('ua.base.shuffled', nbRatingsTrain, nbFeatures)\n", 144 | "X_test, Y_test = loadDataset('ua.test',nbRatingsTest,nbFeatures)" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "print(X_train.shape)\n", 154 | "print(Y_train.shape)\n", 155 | "assert X_train.shape == (nbRatingsTrain, nbFeatures)\n", 156 | "assert Y_train.shape == (nbRatingsTrain, )\n", 157 | "zero_labels = np.count_nonzero(Y_train)\n", 158 | "print(\"Training labels: %d zeros, %d ones\" % (zero_labels, nbRatingsTrain-zero_labels))\n", 159 | "\n", 160 | "print(X_test.shape)\n", 161 | "print(Y_test.shape)\n", 162 | "assert X_test.shape == (nbRatingsTest, nbFeatures)\n", 163 | "assert Y_test.shape == (nbRatingsTest, )\n", 164 | "zero_labels = np.count_nonzero(Y_test)\n", 165 | "print(\"Test labels: %d zeros, %d ones\" % (zero_labels, nbRatingsTest-zero_labels))" 166 | ] 167 | 
}, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "### Convert to protobuf and save to S3" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": null, 178 | "metadata": {}, 179 | "outputs": [], 180 | "source": [ 181 | "bucket = 'change-me'\n", 182 | "prefix = 'sagemaker/fm-movielens'\n", 183 | "\n", 184 | "train_key = 'train.protobuf'\n", 185 | "train_prefix = '{}/{}'.format(prefix, 'train3')\n", 186 | "\n", 187 | "test_key = 'test.protobuf'\n", 188 | "test_prefix = '{}/{}'.format(prefix, 'test3')\n", 189 | "\n", 190 | "output_prefix = 's3://{}/{}/output'.format(bucket, prefix)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "assert(bucket!='change-me'), \"Please change your bucket id\"" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "def writeDatasetToProtobuf(X, Y, bucket, prefix, key):\n", 209 | " buf = io.BytesIO()\n", 210 | " smac.write_spmatrix_to_sparse_tensor(buf, X, Y)\n", 211 | " buf.seek(0)\n", 212 | " obj = '{}/{}'.format(prefix, key)\n", 213 | " boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)\n", 214 | " return 's3://{}/{}'.format(bucket,obj)\n", 215 | " \n", 216 | "train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, train_key) \n", 217 | "test_data = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, test_key) \n", 218 | " \n", 219 | "print(train_data)\n", 220 | "print(test_data)\n", 221 | "print('Output: {}'.format(output_prefix))" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "### Run training job" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": null, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "containers = {'us-west-2': '174872318107.dkr.ecr.us-west-2.amazonaws.com/factorization-machines:latest',\n", 238 | " 'us-east-1': '382416733822.dkr.ecr.us-east-1.amazonaws.com/factorization-machines:latest',\n", 239 | " 'us-east-2': '404615174143.dkr.ecr.us-east-2.amazonaws.com/factorization-machines:latest',\n", 240 | " 'eu-west-1': '438346466558.dkr.ecr.eu-west-1.amazonaws.com/factorization-machines:latest'}" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": null, 246 | "metadata": {}, 247 | "outputs": [], 248 | "source": [ 249 | "%%time\n", 250 | "\n", 251 | "fm = sagemaker.estimator.Estimator(containers[boto3.Session().region_name],\n", 252 | " get_execution_role(), \n", 253 | " train_instance_count=1, \n", 254 | " train_instance_type='ml.c4.xlarge',\n", 255 | " output_path=output_prefix,\n", 256 | " sagemaker_session=sagemaker.Session())\n", 257 | "\n", 258 | "fm.set_hyperparameters(feature_dim=nbFeatures,\n", 259 | " predictor_type='binary_classifier',\n", 260 | " mini_batch_size=1000,\n", 261 | " num_factors=64,\n", 262 | " _speedometer_period=10,\n", 263 | " epochs=100)\n", 264 | "\n", 265 | "fm.fit({'train': train_data, 'test': test_data})" 266 | ] 267 | }, 268 | { 269 | "cell_type": "markdown", 270 | "metadata": {}, 271 | "source": [ 272 | "### Deploy model" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": { 279 | "scrolled": true 280 | }, 281 | "outputs": [], 282 | "source": [ 283 | "%%time\n", 284 | "\n", 285 | "fm_predictor = fm.deploy(instance_type='ml.c4.xlarge', 
initial_instance_count=1)" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": null, 291 | "metadata": {}, 292 | "outputs": [], 293 | "source": [ 294 | "def fm_serializer(data):\n", 295 | " js = {'instances': []}\n", 296 | " for row in data:\n", 297 | " js['instances'].append({'features': row.tolist()})\n", 298 | " #print js\n", 299 | " return json.dumps(js)\n", 300 | "\n", 301 | "fm_predictor.content_type = 'application/json'\n", 302 | "fm_predictor.serializer = fm_serializer\n", 303 | "fm_predictor.deserializer = json_deserializer" 304 | ] 305 | }, 306 | { 307 | "cell_type": "markdown", 308 | "metadata": {}, 309 | "source": [ 310 | "## Run predictions\n", 311 | "First let's run a prediction test against our set-aside data" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "result = fm_predictor.predict(X_test[1000:1010].toarray())\n", 321 | "print(result)\n", 322 | "print (Y_test[1000:1010])" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": {}, 329 | "outputs": [], 330 | "source": [ 331 | "%%time\n", 332 | "\n", 333 | "correct_predictions = 0\n", 334 | "total_predictions = X_test.shape[0]\n", 335 | "for i in range(total_predictions):\n", 336 | " result = fm_predictor.predict(X_test[i].toarray())\n", 337 | " if(int(result['predictions'][0]['predicted_label']) == int(Y_test[i])):\n", 338 | " correct_predictions += 1\n", 339 | " #print(\"match: \" + str(result['predictions'][0]['predicted_label']) + \" \" + str(Y_test[i]))" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "accuracy_predictions = correct_predictions/(total_predictions*1.0)\n", 349 | "print('Total predictions: {}'.format(total_predictions)) \n", 350 | "print('Correct predictions: {}'.format(correct_predictions))\n", 351 | "print('Accuracy: {}%'.format(accuracy_predictions*100))" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": null, 357 | "metadata": {}, 358 | "outputs": [], 359 | "source": [ 360 | "import sagemaker\n", 361 | "sagemaker.Session().delete_endpoint(fm_predictor.endpoint)" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": null, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [] 370 | } 371 | ], 372 | "metadata": { 373 | "kernelspec": { 374 | "display_name": "conda_python2", 375 | "language": "python", 376 | "name": "conda_python2" 377 | }, 378 | "language_info": { 379 | "codemirror_mode": { 380 | "name": "ipython", 381 | "version": 2 382 | }, 383 | "file_extension": ".py", 384 | "mimetype": "text/x-python", 385 | "name": "python", 386 | "nbconvert_exporter": "python", 387 | "pygments_lexer": "ipython2", 388 | "version": "2.7.14" 389 | } 390 | }, 391 | "nbformat": 4, 392 | "nbformat_minor": 2 393 | } 394 | --------------------------------------------------------------------------------
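A closing note on `03_factorization_machines_regression.ipynb`: the built-in factorization machines algorithm also supports `predictor_type='regressor'`, which predicts the 1-5 rating directly instead of a like/dislike label. The sketch below is an illustrative variant, not the notebook's exact code: it reuses the notebook's `nbUsers`, `nbFeatures`, `nbRatingsTrain`, `nbRatingsTest`, `bucket`, `train_prefix`, `test_prefix`, `writeDatasetToProtobuf` and `fm` names, while the `loadDatasetRegression` helper and the `*_reg.protobuf` keys are made up for this example.

```python
# Illustrative sketch (assumed variant, not the notebook's code): rating regression with FM.
import csv
import numpy as np
from scipy.sparse import lil_matrix

def loadDatasetRegression(filename, lines, columns):
    # Same one-hot user/movie encoding as loadDataset above, but the label
    # keeps the raw 1-5 rating instead of being binarized at rating >= 4.
    X = lil_matrix((lines, columns)).astype('float32')
    Y = []
    with open(filename, 'r') as f:
        for line, (userId, movieId, rating, timestamp) in enumerate(csv.reader(f, delimiter='\t')):
            X[line, int(userId) - 1] = 1
            X[line, nbUsers + int(movieId) - 1] = 1
            Y.append(float(rating))
    return X, np.array(Y).astype('float32')

X_train, Y_train = loadDatasetRegression('ua.base.shuffled', nbRatingsTrain, nbFeatures)
X_test, Y_test = loadDatasetRegression('ua.test', nbRatingsTest, nbFeatures)

# Re-upload the protobuf datasets under new keys so the binarized files are untouched.
train_data = writeDatasetToProtobuf(X_train, Y_train, bucket, train_prefix, 'train_reg.protobuf')
test_data = writeDatasetToProtobuf(X_test, Y_test, bucket, test_prefix, 'test_reg.protobuf')

# Only predictor_type changes on the estimator; the algorithm then reports RMSE
# on the test channel instead of binary classification accuracy.
fm.set_hyperparameters(feature_dim=nbFeatures,
                       predictor_type='regressor',
                       mini_batch_size=1000,
                       num_factors=64,
                       epochs=100)
fm.fit({'train': train_data, 'test': test_data})
```

For a regressor, the deployed endpoint's JSON response should carry a `score` per instance rather than a `predicted_label`, so the evaluation loop would compute RMSE against the raw test ratings instead of accuracy.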