├── src
│   ├── pipeline
│   │   ├── __init__.py
│   │   ├── preprocess.py
│   │   └── prepare.py
│   ├── config.py
│   └── dag_ml_pipeline_amazon_video_reviews.py
├── .gitignore
├── images
│   ├── LaunchStack.png
│   ├── cfn_output.png
│   ├── airflow-sagemaker-dag.png
│   ├── airflow-sagemaker-airflow-dag.png
│   ├── airflow-sagemaker-airflow_setup.png
│   └── airflow-sagemaker-ml-workflow.png
├── .github
│   └── PULL_REQUEST_TEMPLATE.md
├── CODE_OF_CONDUCT.md
├── LICENSE
├── CONTRIBUTING.md
├── README.md
├── cfn
│   ├── airflow-ec2-1.10.12-RDS.yaml
│   ├── airflow-ec2-1.10.12-Aurora-Serverless.yaml
│   ├── airflow-ec2-2.0.2-RDS.yaml
│   └── airflow-ec2-2.0.2-Aurora-Serverless.yaml
└── notebooks
    └── amazon-video-recommender_using_fm_algo.ipynb

/src/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | src/pipeline/__pycache__/*
2 | *.pyc
3 | .vscode/
4 | notebooks/.ipynb_checkpoints/
--------------------------------------------------------------------------------
/images/LaunchStack.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/LaunchStack.png
--------------------------------------------------------------------------------
/images/cfn_output.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/cfn_output.png
--------------------------------------------------------------------------------
/images/airflow-sagemaker-dag.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/airflow-sagemaker-dag.png
--------------------------------------------------------------------------------
/images/airflow-sagemaker-airflow-dag.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/airflow-sagemaker-airflow-dag.png
--------------------------------------------------------------------------------
/images/airflow-sagemaker-airflow_setup.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/airflow-sagemaker-airflow_setup.png
--------------------------------------------------------------------------------
/images/airflow-sagemaker-ml-workflow.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/airflow-sagemaker-ml-workflow.png
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | *Issue #, if available:*
2 |
3 | *Description of changes:*
4 |
5 |
6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from sagemaker.tuner import ContinuousParameter 3 | 4 | config = {} 5 | 6 | config["job_level"] = { 7 | "region_name": "", 8 | "run_hyperparameter_opt": "no" 9 | } 10 | 11 | config["preprocess_data"] = { 12 | "s3_in_url": "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz", 13 | "s3_out_bucket": "", # replace 14 | "s3_out_prefix": "preprocess/", 15 | "delimiter": "\t" 16 | } 17 | 18 | config["prepare_data"] = { 19 | "s3_in_bucket": "", # replace 20 | "s3_in_prefix": "preprocess/", 21 | "s3_out_bucket": "", # replace 22 | "s3_out_prefix": "prepare/", 23 | "delimiter": "\t" 24 | } 25 | 26 | config["train_model"] = { 27 | "sagemaker_role": "AirflowSageMakerExecutionRole", 28 | "estimator_config": { 29 | "train_instance_count": 1, 30 | "train_instance_type": "ml.c5.4xlarge", 31 | "train_volume_size": 30, 32 | "train_max_run": 3600, 33 | "output_path": "s3:///train/", # replace 34 | "base_job_name": "trng-recommender", 35 | "hyperparameters": { 36 | "feature_dim": "178729", 37 | "epochs": "10", 38 | "mini_batch_size": "200", 39 | "num_factors": "64", 40 | "predictor_type": 'regressor' 41 | } 42 | }, 43 | "inputs": { 44 | "train": "s3:///prepare/train/train.protobuf", # replace 45 | } 46 | } 47 | 48 | config["tune_model"] = { 49 | "tuner_config": { 50 | "objective_metric_name": "test:rmse", 51 | "objective_type": "Minimize", 52 | "hyperparameter_ranges": { 53 | "factors_lr": ContinuousParameter(0.0001, 0.2), 54 | "factors_init_sigma": ContinuousParameter(0.0001, 1) 55 | }, 56 | "max_jobs": 20, 57 | "max_parallel_jobs": 2, 58 | "base_tuning_job_name": "hpo-recommender" 59 | }, 60 | "inputs": { 61 | 
"train": "s3:///prepare/train/train.protobuf", # replace 62 | "test": "s3:///prepare/validate/validate.protobuf" # replace 63 | } 64 | } 65 | 66 | config["batch_transform"] = { 67 | "transform_config": { 68 | "instance_count": 1, 69 | "instance_type": "ml.c4.xlarge", 70 | "data": "s3:///prepare/test/", 71 | "data_type": "S3Prefix", 72 | "content_type": "application/x-recordio-protobuf", 73 | "strategy": "MultiRecord", 74 | "output_path": "s3:///transform/" 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/issues), or [recently closed](https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. 
As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /src/pipeline/preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import s3fs 4 | 5 | 6 | def preprocess(s3_in_url, 7 | s3_out_bucket, 8 | s3_out_prefix, 9 | delimiter=","): 10 | """Preprocesses data based on business logic 11 | 12 | - Reads delimited file passed as s3_url and preprocess data by filtering 13 | long tail in the customer ratings data i.e. keep customers who have rated 5 14 | or more videos, and videos that have been rated by 9+ customers 15 | - Preprocessed data is then written to output 16 | 17 | Args: 18 | s3_in_url: 19 | s3 url to the delimited file to be processed 20 | e.g. s3://amazon-reviews-pds/tsv/reviews.tsv.gz 21 | s3_out_bucket: 22 | s3 bucket where preprocessed data will be staged 23 | e.g. mybucket 24 | s3_out_prefix: 25 | s3 url prefix to stage preprocessed data to use later in the pipeline 26 | e.g. amazon-reviews-pds/preprocess/ 27 | delimiter: 28 | delimiter to be used for parsing the file. 
Defaults to "," if none 29 | provided 30 | 31 | Returns: 32 | status of preprocessed data 33 | 34 | Raises: 35 | IOError: An error occurred accessing the s3 file 36 | """ 37 | try: 38 | print("preprocessing data from {}".format(s3_in_url)) 39 | # read s3 url into pandas dataframe 40 | # pandas internally uses s3fs to read s3 file directory 41 | df = pd.read_csv(s3_in_url, delimiter, error_bad_lines=False) 42 | 43 | # limit dataframe to customer_id, product_id, and star_rating 44 | # `product_title` will be useful validating recommendations 45 | df = df[['customer_id', 'product_id', 'star_rating', 'product_title']] 46 | 47 | # clean out the long tail because most people haven't seen most videos, 48 | # and people rate fewer videos than they actually watch 49 | customers = df['customer_id'].value_counts() 50 | products = df['product_id'].value_counts() 51 | 52 | # based on data exploration only about 5% of customers have rated 5 or 53 | # more videos, and only 25% of videos have been rated by 9+ customers 54 | customers = customers[customers >= 5] 55 | products = products[products >= 10] 56 | print("# of rows before the long tail = {:10d}".format(df.shape[0])) 57 | reduced_df = df \ 58 | .merge(pd.DataFrame({'customer_id': customers.index})) \ 59 | .merge(pd.DataFrame({'product_id': products.index})) 60 | print("# of rows after the long tail = {:10d}".format( 61 | reduced_df.shape[0])) 62 | reduced_df = reduced_df.drop_duplicates(['customer_id', 'product_id']) 63 | print("# of rows after removing duplicates = {:10d}".format( 64 | reduced_df.shape[0])) 65 | 66 | # recreate customer and product lists since there are customers with 67 | # more than 5 reviews, but all of their reviews are on products with 68 | # less than 5 reviews (and vice versa) 69 | customers = reduced_df['customer_id'].value_counts() 70 | products = reduced_df['product_id'].value_counts() 71 | 72 | # sequentially index each user and item to hold the sparse format where 73 | # the indices indicate the row and column in our ratings matrix 74 | customer_index = pd.DataFrame({ 75 | 'customer_id': customers.index, 76 | 'customer': np.arange(customers.shape[0])}) 77 | product_index = pd.DataFrame({ 78 | 'product_id': products.index, 79 | 'product': np.arange(products.shape[0])}) 80 | reduced_df = reduced_df \ 81 | .merge(customer_index) \ 82 | .merge(product_index) 83 | 84 | nb_customer = reduced_df['customer'].max() + 1 85 | nb_products = reduced_df['product'].max() + 1 86 | feature_dim = nb_customer + nb_products 87 | print(nb_customer, nb_products, feature_dim) 88 | 89 | product_df = reduced_df[['customer', 'product', 'star_rating']] 90 | 91 | # split into train, validation and test data sets 92 | train_df, validate_df, test_df = np.split( 93 | product_df.sample(frac=1), 94 | [int(.6*len(product_df)), int(.8*len(product_df))] 95 | ) 96 | 97 | print("# of rows train data set = {:10d}".format( 98 | train_df.shape[0])) 99 | print("# of rows validation data set = {:10d}".format( 100 | validate_df.shape[0])) 101 | print("# of rows test data set = {:10d}".format( 102 | test_df.shape[0])) 103 | 104 | # select columns required for training the model 105 | # excluding columns "customer_id", "product_id", "product_title" to 106 | # keep files small 107 | cols = ["customer", "product", "star_rating"] 108 | train_df = train_df[cols] 109 | validate_df = validate_df[cols] 110 | test_df = test_df[cols] 111 | 112 | # write output to s3 as delimited file 113 | fs = s3fs.S3FileSystem(anon=False) 114 | s3_out_prefix = s3_out_prefix[:-1] \ 115 | 
if s3_out_prefix[-1] == "/" else s3_out_prefix 116 | s3_out_train = "s3://{}/{}/{}".format( 117 | s3_out_bucket, s3_out_prefix, "train/train.csv") 118 | print("writing training data to {}".format(s3_out_train)) 119 | with fs.open(s3_out_train, "w") as f: 120 | train_df.to_csv(f, sep=str(','), index=False) 121 | 122 | s3_out_validate = "s3://{}/{}/{}".format( 123 | s3_out_bucket, s3_out_prefix, "validate/validate.csv") 124 | print("writing test data to {}".format(s3_out_validate)) 125 | with fs.open(s3_out_validate, "w") as f: 126 | validate_df.to_csv(f, sep=str(','), index=False) 127 | 128 | s3_out_test = "s3://{}/{}/{}".format( 129 | s3_out_bucket, s3_out_prefix, "test/test.csv") 130 | print("writing test data to {}".format(s3_out_test)) 131 | with fs.open(s3_out_test, "w") as f: 132 | test_df.to_csv(f, sep=str(','), index=False) 133 | 134 | print("preprocessing completed") 135 | return "SUCCESS" 136 | except Exception as e: 137 | raise e 138 | -------------------------------------------------------------------------------- /src/pipeline/prepare.py: -------------------------------------------------------------------------------- 1 | import sagemaker.amazon.common as smac 2 | import pandas as pd 3 | import numpy as np 4 | import boto3 5 | import s3fs 6 | import io 7 | 8 | from scipy.sparse import lil_matrix 9 | 10 | 11 | def convert_sparse_matrix(df, nb_rows, nb_customer, nb_products): 12 | # dataframe to array 13 | df_val = df.values 14 | 15 | # determine feature size 16 | nb_cols = nb_customer + nb_products 17 | print("# of rows = {}".format(str(nb_rows))) 18 | print("# of cols = {}".format(str(nb_cols))) 19 | 20 | # extract customers and ratings 21 | df_X = df_val[:, 0:2] 22 | # Features are one-hot encoded in a sparse matrix 23 | X = lil_matrix((nb_rows, nb_cols)).astype('float32') 24 | df_X[:, 1] = nb_customer + df_X[:, 1] 25 | coords = df_X[:, 0:2] 26 | X[np.arange(nb_rows), coords[:, 0]] = 1 27 | X[np.arange(nb_rows), coords[:, 1]] = 1 28 | 29 | # create label with ratings 30 | Y = df_val[:, 2].astype('float32') 31 | 32 | # validate size and shape 33 | print(X.shape) 34 | print(Y.shape) 35 | assert X.shape == (nb_rows, nb_cols) 36 | assert Y.shape == (nb_rows, ) 37 | 38 | return X, Y 39 | 40 | 41 | def save_as_protobuf(X, Y, bucket, key): 42 | """Converts features and predictions matrices to recordio protobuf and 43 | writes to S3 44 | 45 | Args: 46 | X: 47 | 2D numpy matrix with features 48 | Y: 49 | 1D numpy matrix with predictions 50 | bucket: 51 | s3 bucket where recordio protobuf file will be staged 52 | prefix: 53 | s3 url prefix to stage prepared data to use for training the model 54 | key: 55 | protobuf file name to be staged 56 | 57 | Returns: 58 | s3 url with key to the protobuf data 59 | """ 60 | buf = io.BytesIO() 61 | smac.write_spmatrix_to_sparse_tensor(buf, X, Y) 62 | buf.seek(0) 63 | obj = '{}'.format(key) 64 | boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf) 65 | return 's3://{}/{}'.format(bucket, obj) 66 | 67 | 68 | def chunk(x, batch_size): 69 | """split array into chunks of batch_size 70 | """ 71 | chunk_range = range(0, x.shape[0], batch_size) 72 | chunks = [x[p: p + batch_size] for p in chunk_range] 73 | return chunks 74 | 75 | 76 | def prepare(s3_in_bucket, 77 | s3_in_prefix, 78 | s3_out_bucket, 79 | s3_out_prefix, 80 | delimiter=","): 81 | """Prepare data for training with Sagemaker algorithms 82 | 83 | - Read preprocessed data and converts to ProtoBuf format to prepare for 84 | training with Sagemaker algorithms 85 | 86 | Args: 87 | 
s3_in_bucket: 88 | s3 bucket where preprocessed files are staged 89 | e.g. mybucket 90 | s3_in_prefix: 91 | s3 prefix to the files to be used for training 92 | e.g. amazon-reviews-pds/preprocess/ 93 | it's expected to have train and test folders in this prefix that will 94 | be staged by preprocessor 95 | s3_out_bucket: 96 | s3 bucket where training and test files will be staged 97 | e.g. mybucket 98 | s3_out_prefix: 99 | s3 url prefix to stage prepared data to use for training the model 100 | e.g. amazon-reviews-pds/prepare/ 101 | delimiter: 102 | delimiter to be used for parsing the file. Defaults to "," if none 103 | provided 104 | 105 | Returns: 106 | s3 url with key to the prepared data 107 | 108 | Raises: 109 | IOError: An error occurred accessing the s3 file 110 | """ 111 | try: 112 | print("preparing data from {}".format(s3_in_prefix)) 113 | 114 | # prepare training data set 115 | if s3_in_prefix[-1] == "/": 116 | s3_in_prefix = s3_in_prefix[:-1] 117 | s3_train_url = "s3://{}/{}/{}".format( 118 | s3_in_bucket, s3_in_prefix, 'train/train.csv') 119 | train_df = pd.read_csv(s3_train_url, 120 | sep=str(','), error_bad_lines=False) 121 | 122 | # prepare validateion dataset 123 | s3_validate_url = "s3://{}/{}/{}".format( 124 | s3_in_bucket, s3_in_prefix, 'validate/validate.csv') 125 | validate_df = pd.read_csv(s3_validate_url, 126 | sep=str(','), error_bad_lines=False) 127 | 128 | # prepare test dataset 129 | s3_test_url = "s3://{}/{}/{}".format( 130 | s3_in_bucket, s3_in_prefix, 'test/test.csv') 131 | test_df = pd.read_csv(s3_test_url, 132 | sep=str(','), error_bad_lines=False) 133 | 134 | # get feature dimension 135 | all_df = pd.concat([train_df, validate_df, test_df]) 136 | nb_customer = np.unique(all_df['customer'].values).shape[0] 137 | nb_products = np.unique(all_df['product'].values).shape[0] 138 | feature_dim = nb_customer + nb_products 139 | print(nb_customer, nb_products, feature_dim) 140 | 141 | train_X, train_Y = convert_sparse_matrix( 142 | train_df, train_df.shape[0], nb_customer, nb_products) 143 | validate_X, validate_Y = convert_sparse_matrix( 144 | validate_df, validate_df.shape[0], nb_customer, nb_products) 145 | test_X, test_Y = convert_sparse_matrix( 146 | test_df, test_df.shape[0], nb_customer, nb_products) 147 | 148 | # write train and test in protobuf format to s3 149 | if s3_out_prefix[-1] == "/": 150 | s3_out_prefix = s3_out_prefix[:-1] 151 | train_data = save_as_protobuf( 152 | train_X, train_Y, s3_out_bucket, 153 | s3_out_prefix + "/" + "train/train.protobuf") 154 | print(train_data) 155 | validate_data = save_as_protobuf( 156 | validate_X, validate_Y, s3_out_bucket, 157 | s3_out_prefix + "/" + "validate/validate.protobuf") 158 | print(validate_data) 159 | 160 | # chunk test data to avoid payload size issues when batch transforming 161 | test_x_chunks = chunk(test_X, 10000) 162 | test_y_chunks = chunk(test_Y, 10000) 163 | N = len(test_x_chunks) 164 | for i in range(N): 165 | test_data = save_as_protobuf( 166 | test_x_chunks[i], 167 | test_y_chunks[i], 168 | s3_out_bucket, 169 | s3_out_prefix + "/" + "test/test_" + str(i) + ".protobuf") 170 | print(test_data) 171 | 172 | return "SUCCESS" 173 | except Exception as e: 174 | raise e 175 | -------------------------------------------------------------------------------- /src/dag_ml_pipeline_amazon_video_reviews.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import json 3 | import requests 4 | from datetime import datetime 5 | 6 | # 
airflow operators 7 | import airflow 8 | from airflow.models import DAG 9 | from airflow.utils.trigger_rule import TriggerRule 10 | from airflow.operators.python_operator import BranchPythonOperator 11 | from airflow.operators.dummy_operator import DummyOperator 12 | from airflow.operators.python_operator import PythonOperator 13 | 14 | # airflow sagemaker operators 15 | from airflow.contrib.operators.sagemaker_training_operator \ 16 | import SageMakerTrainingOperator 17 | from airflow.contrib.operators.sagemaker_tuning_operator \ 18 | import SageMakerTuningOperator 19 | from airflow.contrib.operators.sagemaker_transform_operator \ 20 | import SageMakerTransformOperator 21 | from airflow.contrib.hooks.aws_hook import AwsHook 22 | 23 | # sagemaker sdk 24 | import boto3 25 | import sagemaker 26 | from sagemaker.amazon.amazon_estimator import get_image_uri 27 | from sagemaker.estimator import Estimator 28 | from sagemaker.tuner import HyperparameterTuner 29 | 30 | # airflow sagemaker configuration 31 | from sagemaker.workflow.airflow import training_config 32 | from sagemaker.workflow.airflow import tuning_config 33 | from sagemaker.workflow.airflow import transform_config_from_estimator 34 | 35 | # ml workflow specific 36 | from pipeline import prepare, preprocess 37 | import config as cfg 38 | 39 | # ============================================================================= 40 | # functions 41 | # ============================================================================= 42 | 43 | 44 | def is_hpo_enabled(): 45 | """check if hyper-parameter optimization is enabled in the config 46 | """ 47 | hpo_enabled = False 48 | if "job_level" in config and \ 49 | "run_hyperparameter_opt" in config["job_level"]: 50 | run_hpo_config = config["job_level"]["run_hyperparameter_opt"] 51 | if run_hpo_config.lower() == "yes": 52 | hpo_enabled = True 53 | return hpo_enabled 54 | 55 | 56 | def get_sagemaker_role_arn(role_name, region_name): 57 | iam = boto3.client('iam', region_name=region_name) 58 | response = iam.get_role(RoleName=role_name) 59 | return response["Role"]["Arn"] 60 | 61 | # ============================================================================= 62 | # setting up training, tuning and transform configuration 63 | # ============================================================================= 64 | 65 | 66 | # read config file 67 | config = cfg.config 68 | 69 | # set configuration for tasks 70 | hook = AwsHook(aws_conn_id='airflow-sagemaker') 71 | region = config["job_level"]["region_name"] 72 | sess = hook.get_session(region_name=region) 73 | role = get_sagemaker_role_arn( 74 | config["train_model"]["sagemaker_role"], 75 | sess.region_name) 76 | container = get_image_uri(sess.region_name, 'factorization-machines') 77 | hpo_enabled = is_hpo_enabled() 78 | 79 | # create estimator 80 | fm_estimator = Estimator( 81 | image_name=container, 82 | role=role, 83 | sagemaker_session=sagemaker.session.Session(sess), 84 | **config["train_model"]["estimator_config"] 85 | ) 86 | 87 | # train_config specifies SageMaker training configuration 88 | train_config = training_config( 89 | estimator=fm_estimator, 90 | inputs=config["train_model"]["inputs"]) 91 | 92 | # create tuner 93 | fm_tuner = HyperparameterTuner( 94 | estimator=fm_estimator, 95 | **config["tune_model"]["tuner_config"] 96 | ) 97 | 98 | # create tuning config 99 | tuner_config = tuning_config( 100 | tuner=fm_tuner, 101 | inputs=config["tune_model"]["inputs"]) 102 | 103 | # create transform config 104 | transform_config = 
transform_config_from_estimator( 105 | estimator=fm_estimator, 106 | task_id="model_tuning" if hpo_enabled else "model_training", 107 | task_type="tuning" if hpo_enabled else "training", 108 | **config["batch_transform"]["transform_config"] 109 | ) 110 | 111 | # ============================================================================= 112 | # define airflow DAG and tasks 113 | # ============================================================================= 114 | 115 | # define airflow DAG 116 | 117 | args = { 118 | 'owner': 'airflow', 119 | 'start_date': airflow.utils.dates.days_ago(2) 120 | } 121 | 122 | dag = DAG( 123 | dag_id='sagemaker-ml-pipeline', 124 | default_args=args, 125 | schedule_interval=None, 126 | concurrency=1, 127 | max_active_runs=1, 128 | user_defined_filters={'tojson': lambda s: json.JSONEncoder().encode(s)} 129 | ) 130 | 131 | # set the tasks in the DAG 132 | 133 | # dummy operator 134 | init = DummyOperator( 135 | task_id='start', 136 | dag=dag 137 | ) 138 | 139 | # preprocess the data 140 | preprocess_task = PythonOperator( 141 | task_id='preprocessing', 142 | dag=dag, 143 | provide_context=False, 144 | python_callable=preprocess.preprocess, 145 | op_kwargs=config["preprocess_data"]) 146 | 147 | # prepare the data for training 148 | prepare_task = PythonOperator( 149 | task_id='preparing', 150 | dag=dag, 151 | provide_context=False, 152 | python_callable=prepare.prepare, 153 | op_kwargs=config["prepare_data"] 154 | ) 155 | 156 | branching = BranchPythonOperator( 157 | task_id='branching', 158 | dag=dag, 159 | python_callable=lambda: "model_tuning" if hpo_enabled else "model_training") 160 | 161 | # launch sagemaker training job and wait until it completes 162 | train_model_task = SageMakerTrainingOperator( 163 | task_id='model_training', 164 | dag=dag, 165 | config=train_config, 166 | aws_conn_id='airflow-sagemaker', 167 | wait_for_completion=True, 168 | check_interval=30 169 | ) 170 | 171 | # launch sagemaker hyperparameter job and wait until it completes 172 | tune_model_task = SageMakerTuningOperator( 173 | task_id='model_tuning', 174 | dag=dag, 175 | config=tuner_config, 176 | aws_conn_id='airflow-sagemaker', 177 | wait_for_completion=True, 178 | check_interval=30 179 | ) 180 | 181 | # launch sagemaker batch transform job and wait until it completes 182 | batch_transform_task = SageMakerTransformOperator( 183 | task_id='predicting', 184 | dag=dag, 185 | config=transform_config, 186 | aws_conn_id='airflow-sagemaker', 187 | wait_for_completion=True, 188 | check_interval=30, 189 | trigger_rule=TriggerRule.ONE_SUCCESS 190 | ) 191 | 192 | cleanup_task = DummyOperator( 193 | task_id='cleaning_up', 194 | dag=dag) 195 | 196 | # set the dependencies between tasks 197 | 198 | init.set_downstream(preprocess_task) 199 | preprocess_task.set_downstream(prepare_task) 200 | prepare_task.set_downstream(branching) 201 | branching.set_downstream(tune_model_task) 202 | branching.set_downstream(train_model_task) 203 | tune_model_task.set_downstream(batch_transform_task) 204 | train_model_task.set_downstream(batch_transform_task) 205 | batch_transform_task.set_downstream(cleanup_task) 206 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Build End-to-End Machine Learning (ML) Workflows with Amazon SageMaker and Apache Airflow 2 | 3 | This repository contains the assets for the Amazon Sagemaker and Apache Airflow integration sample described in this 
[ML blog post](#TODO).
4 |
5 | ## Overview
6 |
7 | This repository shows a sample of how to build, manage, and orchestrate ML workflows using Amazon SageMaker and Apache Airflow. We will build a recommender system to predict a customer's rating for a certain video, based on the customer's historical ratings of similar videos as well as the behavior of other, similar customers. We'll use historical star ratings from over 2M Amazon customers on over 160K digital videos. More details on this dataset can be found at its [AWS Public Datasets page](https://s3.amazonaws.com/amazon-reviews-pds/readme.html).
8 |
9 | ### Repository Structure
10 |
11 | The repository contains:
12 |
13 | - [AWS CloudFormation Templates](./cfn/) to launch the AWS services required to create the components
14 | - [Airflow DAG Python Script](./src/dag_ml_pipeline_amazon_video_reviews.py) that integrates and orchestrates all the ML tasks in an ML workflow for building a recommender system.
15 | - A companion [Jupyter Notebook](./notebooks/amazon-video-recommender_using_fm_algo.ipynb) to understand the individual ML tasks in detail, such as data exploration, data preparation, model training/tuning and inference.
16 |
17 |
18 | ```text
19 | .
20 | ├── README.md                                     About the repository
21 | ├── cfn                                           AWS CloudFormation Templates
22 | │   └── airflow-ec2.yaml                          CloudFormation for installing Airflow instance backed by RDS
23 | ├── notebooks                                     Jupyter Notebooks
24 | │   └── amazon-video-recommender_using_fm_algo.ipynb
25 | └── src                                           Source code for Airflow DAG definition
26 |     ├── config.py                                 Config file to configure SageMaker jobs and other ML tasks
27 |     ├── dag_ml_pipeline_amazon_video_reviews.py   Airflow DAG definition for ML workflow
28 |     └── pipeline                                  Python module used in Airflow DAG for data preparation
29 |         ├── __init__.py
30 |         ├── prepare.py                            Data preparation script
31 |         └── preprocess.py                         Data pre-processing script
32 | ```
33 |
34 | ### High Level Solution
35 |
36 | Here is the high-level depiction of the ML workflow we will implement for building the recommender system:
37 |
38 | ![airflow_dag_workflow](./images/airflow-sagemaker-airflow-dag.png)
39 |
40 | The workflow performs the following tasks:
41 |
42 | 1. **Data Pre-processing:** Extract and pre-process data from S3 to prepare the training data.
43 | 2. **Prepare Training Data:** To build the recommender system, we will use SageMaker's built-in algorithm, Factorization Machines. The algorithm expects training data only in RecordIO Protobuf format with Float32 tensors. In this task, the pre-processed data will be transformed to RecordIO Protobuf format (see the sketch after this list).
44 | 3. **Training the Model:** Train SageMaker's built-in Factorization Machines model with the training data and generate model artifacts. The training job will be launched by the Airflow SageMaker operator `SageMakerTrainingOperator`.
45 | 4. **Tune the Model Hyper-parameters:** A conditional/optional task to tune the hyper-parameters of the Factorization Machines model to find the best model. The hyper-parameter tuning job will be launched by the Airflow SageMaker operator `SageMakerTuningOperator`.
46 | 5. **Batch inference:** Using the trained model, get inferences on the test dataset stored in Amazon S3 using the Airflow SageMaker operator `SageMakerTransformOperator`.
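Step 2 above boils down to serializing a one-hot encoded ratings matrix with the SageMaker SDK's sparse-tensor writer, which is what `src/pipeline/prepare.py` in this repository does. The snippet below is a minimal sketch of that conversion, using a tiny hypothetical ratings matrix and a placeholder bucket name rather than the data and bucket the pipeline actually uses:

```python
import io

import boto3
import numpy as np
import sagemaker.amazon.common as smac
from scipy.sparse import lil_matrix

# Hypothetical toy example: 3 ratings over 3 customers + 2 products = 5 one-hot columns.
# Each row marks one customer column and one product column, as prepare.py does.
X = lil_matrix((3, 5), dtype="float32")
X[0, 0] = X[0, 3] = 1.0
X[1, 1] = X[1, 4] = 1.0
X[2, 2] = X[2, 3] = 1.0
Y = np.array([5.0, 3.0, 4.0], dtype="float32")  # star ratings as float32 labels

# Serialize the sparse features and labels to RecordIO Protobuf in memory,
# then upload the buffer to S3 for the Factorization Machines training job.
buf = io.BytesIO()
smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
buf.seek(0)
boto3.resource("s3").Bucket("my-airflow-sagemaker-bucket").Object(  # placeholder bucket
    "prepare/train/train.protobuf").upload_fileobj(buf)
```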
47 |
48 | ### CloudFormation Template Resources
49 |
50 | We will set up a simple Airflow architecture with the scheduler, worker, and web server running on the same instance. Typically, you would not use this setup for production workloads. We will use AWS CloudFormation to launch the AWS services required to create the components in the blog post. The stack includes the following:
51 |
52 | - Amazon EC2 instance to set up the Airflow components
53 | - Amazon Relational Database Service (RDS) Postgres or Aurora Serverless instance to host the Airflow metadata database.
54 | - Amazon S3 bucket to store the SageMaker model artifacts, outputs, and the Airflow DAG with the ML workflow. The template will prompt for the S3 bucket name
55 | - AWS IAM roles and EC2 security groups to allow the Airflow components to interact with the metadata database, S3 bucket, and Amazon SageMaker
56 |
57 | If you want to troubleshoot or add custom operators, you can connect directly to the instance through the Session Manager console. You can also launch different stable versions of Airflow (1.10.12 and 2.0.2).
58 | - Airflow 1.10.12 RDS: [![cfn-launch-stack](./images/LaunchStack.png)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=airflow-sagemaker&templateURL=https://s3.amazonaws.com/aws-machine-learning-blog/artifacts/sagemaker-ml-workflow-with-apache-airflow/v1/cfn/airflow-ec2-1.10.12-RDS.yaml)
59 | - Airflow 1.10.12 Aurora Serverless: [![cfn-launch-stack](./images/LaunchStack.png)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=airflow-sagemaker&templateURL=https://s3.amazonaws.com/aws-machine-learning-blog/artifacts/sagemaker-ml-workflow-with-apache-airflow/v1/cfn/airflow-ec2-1.10.12-Aurora-Serverless.yaml)
60 | - Airflow 2.0.2 RDS: [![cfn-launch-stack](./images/LaunchStack.png)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=airflow-sagemaker&templateURL=https://s3.amazonaws.com/aws-machine-learning-blog/artifacts/sagemaker-ml-workflow-with-apache-airflow/v1/cfn/airflow-ec2-2.0.2-RDS.yaml)
61 | - Airflow 2.0.2 Aurora Serverless: [![cfn-launch-stack](./images/LaunchStack.png)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=airflow-sagemaker&templateURL=https://s3.amazonaws.com/aws-machine-learning-blog/artifacts/sagemaker-ml-workflow-with-apache-airflow/v1/cfn/airflow-ec2-2.0.2-Aurora-Serverless.yaml)
62 |
63 | It might take up to 10 minutes for the CloudFormation stack to create the resources. After resource creation completes, you should be able to log in to the Airflow web UI with the credentials specified in the parameters of the CloudFormation stack. The Airflow web server runs on port 8080 by default. To open the Airflow web UI, open any browser and enter http://ec2-public-dns-name:8080. The public DNS name of the EC2 instance can be found on the Outputs tab of the CloudFormation stack on the AWS CloudFormation console.
64 |
65 | ### Airflow DAG for ML Workflow
66 |
67 | The Airflow DAG integrates all the ML tasks into one ML workflow. An Airflow DAG is a Python script in which you express individual tasks as Airflow operators, set task dependencies, and associate the tasks with the DAG to run either on demand or at a scheduled interval. The Airflow DAG script is divided into the following sections:
68 |
69 | 1. Set up the DAG with parameters such as `schedule_interval` to run the workflow at a scheduled time
70 | 2. Set up the training, tuning, and inference configuration for each operator using the SageMaker Python SDK for Airflow operators
71 | 3. Create the individual tasks as Airflow operators, defining trigger rules and associating them with the DAG object. Refer to the previous section for how the individual tasks are defined
72 | 4. Specify the task dependencies (see the sketch below)
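The sketch below condenses that structure from the full DAG definition in `src/dag_ml_pipeline_amazon_video_reviews.py` shown earlier in this repository. It assumes the Airflow 1.10.x import paths and the `airflow-sagemaker` connection created by the CloudFormation stack, uses a placeholder IAM role ARN instead of the role lookup the real DAG performs, and shows only the training path; the tuning, branching, and batch-transform tasks follow the same pattern.

```python
import airflow
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator

from sagemaker.estimator import Estimator
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.workflow.airflow import training_config

from pipeline import preprocess, prepare
import config as cfg

config = cfg.config

# 1. The DAG object: run on demand (no schedule_interval)
dag = DAG(
    dag_id="sagemaker-ml-pipeline",
    default_args={"owner": "airflow", "start_date": airflow.utils.dates.days_ago(2)},
    schedule_interval=None,
)

# 2. Training configuration generated by the SageMaker Python SDK for Airflow
fm_estimator = Estimator(
    image_name=get_image_uri(config["job_level"]["region_name"], "factorization-machines"),
    role="arn:aws:iam::123456789012:role/AirflowSageMakerExecutionRole",  # placeholder ARN
    **config["train_model"]["estimator_config"]
)
train_config = training_config(estimator=fm_estimator, inputs=config["train_model"]["inputs"])

# 3. Tasks as Airflow operators associated with the DAG
preprocess_task = PythonOperator(
    task_id="preprocessing", dag=dag,
    python_callable=preprocess.preprocess, op_kwargs=config["preprocess_data"],
)
prepare_task = PythonOperator(
    task_id="preparing", dag=dag,
    python_callable=prepare.prepare, op_kwargs=config["prepare_data"],
)
train_model_task = SageMakerTrainingOperator(
    task_id="model_training", dag=dag, config=train_config,
    aws_conn_id="airflow-sagemaker", wait_for_completion=True, check_interval=30,
)

# 4. Task dependencies
preprocess_task.set_downstream(prepare_task)
prepare_task.set_downstream(train_model_task)
```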
73 |
74 | ![airflow_dag](./images/airflow-sagemaker-dag.png)
75 |
76 | You can find the Airflow DAG code [here](./src/dag_ml_pipeline_amazon_video_reviews.py) in the repo.
77 |
78 | ### Cleaning Up the Stack Resources
79 |
80 | The final step is to clean up. To avoid unnecessary charges:
81 |
82 | 1. Destroy all of the resources created by the CloudFormation stack for the Airflow setup by deleting the stack after you're done experimenting with it. You can follow the steps here to [delete the stack](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/cfn-console-delete-stack.html).
83 | 2. Manually [delete the S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/user-guide/delete-bucket.html) that was created, because AWS CloudFormation cannot delete a non-empty S3 bucket.
84 |
85 | ## References
86 |
87 | - Refer to the [SageMaker SDK documentation](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/workflow/README.rst) and the [Airflow documentation](https://airflow.apache.org/integration.html?highlight=sagemaker#amazon-sagemaker) for additional details on the Airflow SageMaker operators.
88 | - Refer to the [SageMaker documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/fact-machines.html) to learn more about the Factorization Machines algorithm used in the blog post.
89 |
90 | ## License Summary
91 |
92 | This sample code is made available under a modified MIT license. See the [LICENSE](./LICENSE) file.
93 |
--------------------------------------------------------------------------------
/cfn/airflow-ec2-1.10.12-RDS.yaml:
--------------------------------------------------------------------------------
1 | AWSTemplateFormatVersion: "2010-09-09"
2 |
3 | Description: Airflow server v1.10.12 on EC2 Amazon Linux 2 backed by Postgres RDS
4 |
5 | Parameters:
6 |   AirflowUser:
7 |     NoEcho: "false"
8 |     Description: Airflow UI admin account username
9 |     Type: String
10 |     MinLength: "4"
11 |     MaxLength: "41"
12 |     AllowedPattern: "[a-zA-Z0-9]*"
13 |     ConstraintDescription: Must contain only alphanumeric characters
14 |   AirflowPassword:
15 |     NoEcho: "false"
16 |     Description: Airflow UI admin account password
17 |     Type: String
18 |     MinLength: "8"
19 |     MaxLength: "41"
20 |     AllowedPattern: "[a-zA-Z0-9]*"
21 |     ConstraintDescription: Must contain only alphanumeric characters
22 |   DBPassword:
23 |     NoEcho: "false"
24 |     Description: Airflow database admin account password
25 |     Type: String
26 |     MinLength: "8"
27 |     MaxLength: "41"
28 |     AllowedPattern: "[a-zA-Z0-9]*"
29 |     ConstraintDescription: Must contain only alphanumeric characters
30 |
31 | # Mapping to find the Amazon Linux AMI in each region.
32 | Mappings: 33 | RegionMap: 34 | ap-northeast-1: 35 | AMI: "ami-09ebacdc178ae23b7" 36 | ap-northeast-2: 37 | AMI: "ami-0a0de518b1fc4524c" 38 | ap-northeast-3: 39 | AMI: "ami-0e787554e61105680" 40 | ap-south-1: 41 | AMI: "ami-04db49c0fb2215364" 42 | ap-southeast-1: 43 | AMI: "ami-0f511ead81ccde020" 44 | ap-southeast-2: 45 | AMI: "ami-0aab712d6363da7f9" 46 | ca-central-1: 47 | AMI: "ami-02f84cf47c23f1769" 48 | eu-central-1: 49 | AMI: "ami-0453cb7b5f2b7fca2" 50 | eu-west-1: 51 | AMI: "ami-02b4e72b17337d6c1" 52 | eu-west-2: 53 | AMI: "ami-0d26eb3972b7f8c96" 54 | eu-west-3: 55 | AMI: "ami-0d49cec198762b78c" 56 | sa-east-1: 57 | AMI: "ami-0f8243a5175208e08" 58 | us-east-1: 59 | AMI: "ami-0c2b8ca1dad447f8a" 60 | us-east-2: 61 | AMI: "ami-0443305dabd4be2bc" 62 | us-west-1: 63 | AMI: "ami-04b6c97b14c54de18" 64 | us-west-2: 65 | AMI: "ami-083ac7c7ecf9bb9b0" 66 | 67 | Resources: 68 | EC2Instance: 69 | Type: AWS::EC2::Instance 70 | CreationPolicy: 71 | ResourceSignal: 72 | Timeout: PT10M 73 | Properties: 74 | SecurityGroups: [!Ref "AirflowEC2SecurityGroup"] 75 | InstanceType: "m4.xlarge" 76 | IamInstanceProfile: 77 | Ref: EC2InstanceProfile 78 | Tags: 79 | - Key: Name 80 | Value: Airflow 81 | ImageId: !FindInMap 82 | - RegionMap 83 | - !Ref "AWS::Region" 84 | - AMI 85 | UserData: 86 | Fn::Base64: !Sub | 87 | #!/bin/bash 88 | set -x 89 | exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 90 | ln -s /root/user-data.log /var/log/user-data.log 91 | # Get right version of pip 92 | yum install aws-cfn-bootstrap -y 93 | python3 -m pip install pip==20.2.4 --user 94 | # Start cfn-init 95 | /opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region} 96 | yum remove python3-docutils -y 97 | echo "Installing s3fs" 98 | python3 -m pip install --upgrade s3fs==0.4.2 99 | python3 -m pip install psycopg2 wheel 100 | echo "Installing sagemaker sdk" 101 | python3 -m pip install sagemaker==v1.72 102 | # Install airflow using pip 103 | echo "Installing Apache Airflow" 104 | export AIRFLOW_GPL_UNIDECODE=yes 105 | python3 -m pip install apache-airflow[crypto,postgres]==1.10.12 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1.10.12/constraints-3.7.txt" 106 | # Create Fernet Key 107 | export FERNET_KEY=`openssl rand -base64 32` 108 | sed -i 's|fernet_key =|fernet_key = '$FERNET'|g' ~/airflow/airflow.cfg 109 | # Postgres operators and hook, support as an Airflow backend 110 | echo 'export PATH=/usr/local/bin:~/.local/bin:$PATH' >> ~/.bash_profile 111 | source ~/.bash_profile 112 | # Initialize Airflow 113 | airflow initdb 114 | # Update the RDS connection in the Airflow Config file 115 | sed -i '/sql_alchemy_conn/s/^/#/g' ~/airflow/airflow.cfg 116 | sed -i '/#sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${DBPassword}@${DBInstance.Endpoint.Address}:${DBInstance.Endpoint.Port}/airflowdb' ~/airflow/airflow.cfg 117 | # Update the type of executor in the Airflow Config file 118 | sed -i '/executor = SequentialExecutor/s/^/#/g' ~/airflow/airflow.cfg 119 | sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' ~/airflow/airflow.cfg 120 | sed -i 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 121 | sed -i 's/rbac = False/rbac = True/g' ~/airflow/airflow.cfg 122 | airflow initdb 123 | # create airflow connection to sagemaker 124 | cat >> /tmp/airflow_conn.py << EOF 125 | from airflow import settings 126 | from airflow.models import Connection 127 | #create a connection object 128 | extra = 
'{"region_name": "${AWS::Region}"}' 129 | conn_id = 'airflow-sagemaker' 130 | conn = Connection(conn_id=conn_id,conn_type='aws', extra=extra) 131 | # get the session 132 | session = settings.Session() 133 | session.add(conn) 134 | session.commit() 135 | EOF 136 | python3 /tmp/airflow_conn.py 137 | # create directories 138 | mkdir -p ~/airflow/dags/sm-ml-pipeline 139 | # clone the git repository 140 | cd ~ 141 | git clone https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow.git 142 | mv ~/sagemaker-ml-workflow-with-apache-airflow ~/sm-ml-pipeline 143 | cd ~/sm-ml-pipeline/src 144 | # prepare airflow dag definition for sagemaker blog post 145 | sed -i 's//${S3BucketName}/g' ./*.* 146 | sed -i 's//${AWS::Region}/g' ./*.* 147 | sed -i 's//${AWS::AccountId}/g' ~/sm-ml-pipeline/src/config.py 148 | sed -i 's/AirflowSageMakerExecutionRole/AirflowSageMakerExecutionRole-${AWS::StackName}/g' ~/sm-ml-pipeline/src/config.py 149 | zip -r dag.zip * 150 | cp dag.zip ~/airflow/dags/sm-ml-pipeline/dag.zip 151 | cd - 152 | # Run Airflow webserver and scheduler 153 | airflow create_user -e admin@example.com -f admin -l airflow -p ${AirflowPassword} -r Admin -u ${AirflowUser} 154 | airflow list_dags 155 | airflow webserver -D 156 | airflow scheduler -D 157 | yum install aws-cfn-bootstrap -y 158 | /opt/aws/bin/cfn-signal --exit-code 0 --resource EC2Instance --region ${AWS::Region} --stack ${AWS::StackName} 159 | Metadata: 160 | AWS::CloudFormation::Init: 161 | configSets: 162 | install: 163 | - installpackages 164 | installpackages: 165 | packages: 166 | yum: 167 | python3: [] 168 | python3-devel: [] 169 | gcc: [] 170 | gcc-c++: [] 171 | postgresql-devel: [] 172 | openssl-devel: [] 173 | git: [] 174 | DependsOn: 175 | - DBInstance 176 | - AirflowEC2SecurityGroup 177 | DBInstance: 178 | Type: AWS::RDS::DBInstance 179 | DeletionPolicy: Delete 180 | Properties: 181 | DBName: airflowdb 182 | Engine: postgres 183 | MasterUsername: airflow 184 | MasterUserPassword: !Ref "DBPassword" 185 | DBInstanceClass: db.t3.small 186 | AllocatedStorage: 5 187 | DBSecurityGroups: 188 | - Ref: DBSecurityGroup 189 | AirflowEC2SecurityGroup: 190 | Type: AWS::EC2::SecurityGroup 191 | Properties: 192 | GroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 193 | GroupDescription: Enable HTTP access via port 8080 194 | SecurityGroupIngress: 195 | - IpProtocol: tcp 196 | FromPort: 8080 197 | ToPort: 8080 198 | CidrIp: 0.0.0.0/0 199 | DBSecurityGroup: 200 | Type: AWS::RDS::DBSecurityGroup 201 | Properties: 202 | GroupDescription: Frontend Access 203 | DBSecurityGroupIngress: 204 | EC2SecurityGroupName: 205 | Ref: AirflowEC2SecurityGroup 206 | EC2Role: 207 | Type: AWS::IAM::Role 208 | Properties: 209 | RoleName: !Sub 'AirflowInstanceRole-${AWS::StackName}' 210 | AssumeRolePolicyDocument: 211 | Version: "2012-10-17" 212 | Statement: 213 | - Effect: "Allow" 214 | Principal: 215 | Service: 216 | - "ec2.amazonaws.com" 217 | Action: 218 | - "sts:AssumeRole" 219 | Path: "/" 220 | ManagedPolicyArns: 221 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 222 | - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore 223 | Policies: 224 | - PolicyName: !Sub 'AirflowResourceAccess-${AWS::StackName}' 225 | PolicyDocument: 226 | Version: "2012-10-17" 227 | Statement: 228 | - Effect: Allow 229 | Action: 230 | - s3:* 231 | Resource: 232 | - !Sub "arn:aws:s3:::${S3BucketName}" 233 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 234 | - Effect: Allow 235 | Action: 236 | - iam:GetRole 237 | Resource: "*" 238 | EC2InstanceProfile: 239 | Type: 
AWS::IAM::InstanceProfile 240 | Properties: 241 | InstanceProfileName: !Sub 'AirflowInstanceProfile-${AWS::StackName}' 242 | Roles: 243 | - Ref: EC2Role 244 | S3BucketName: 245 | Type: AWS::S3::Bucket 246 | DeletionPolicy: Delete 247 | Properties: 248 | AccessControl: BucketOwnerFullControl 249 | BucketName: !Join 250 | - "-" 251 | - - "airflow-sagemaker" 252 | - !Select 253 | - 0 254 | - !Split 255 | - "-" 256 | - !Select 257 | - 2 258 | - !Split 259 | - "/" 260 | - !Ref "AWS::StackId" 261 | AirflowSageMakerExecutionRole: 262 | Type: AWS::IAM::Role 263 | Properties: 264 | RoleName: !Sub 'AirflowSageMakerExecutionRole-${AWS::StackName}' 265 | AssumeRolePolicyDocument: 266 | Version: "2012-10-17" 267 | Statement: 268 | - Effect: "Allow" 269 | Principal: 270 | Service: 271 | - "sagemaker.amazonaws.com" 272 | Action: 273 | - "sts:AssumeRole" 274 | ManagedPolicyArns: 275 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 276 | Path: "/service-role/" 277 | Policies: 278 | - PolicyName: !Sub 'SageMakerS3BucketAccess-${AWS::StackName}' 279 | PolicyDocument: 280 | Version: "2012-10-17" 281 | Statement: 282 | - Effect: Allow 283 | Action: 284 | - s3:* 285 | Resource: 286 | - !Sub "arn:aws:s3:::${S3BucketName}" 287 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 288 | Outputs: 289 | AirflowEC2PublicDNSName: 290 | Description: Public DNS Name of the Airflow EC2 instance 291 | Value: !Join ["", ["http://", !GetAtt EC2Instance.PublicDnsName, ":8080"]] -------------------------------------------------------------------------------- /cfn/airflow-ec2-1.10.12-Aurora-Serverless.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | 3 | Description: Airflow server v1.10.12 on EC2 Amazon Linux 2 backed by Postgres Aurora Serverless 4 | 5 | Parameters: 6 | AirflowUser: 7 | NoEcho: "false" 8 | Description: Airflow UI admin account username 9 | Type: String 10 | MinLength: "4" 11 | MaxLength: "41" 12 | AllowedPattern: "[a-zA-Z0-9]*" 13 | ConstraintDescription: Must contain only alphanumeric characters 14 | AirflowPassword: 15 | NoEcho: "false" 16 | Description: Airflow UI admin account password 17 | Type: String 18 | MinLength: "8" 19 | MaxLength: "41" 20 | AllowedPattern: "[a-zA-Z0-9]*" 21 | ConstraintDescription: Must contain only alphanumeric characters 22 | DBPassword: 23 | NoEcho: "false" 24 | Description: Airflow database admin account password 25 | Type: String 26 | MinLength: "8" 27 | MaxLength: "41" 28 | AllowedPattern: "[a-zA-Z0-9]*" 29 | ConstraintDescription: Must contain only alphanumeric characters 30 | 31 | # Mapping to find the Amazon Linux AMI in each region. 
32 | Mappings: 33 | RegionMap: 34 | ap-northeast-1: 35 | AMI: "ami-09ebacdc178ae23b7" 36 | ap-northeast-2: 37 | AMI: "ami-0a0de518b1fc4524c" 38 | ap-northeast-3: 39 | AMI: "ami-0e787554e61105680" 40 | ap-south-1: 41 | AMI: "ami-04db49c0fb2215364" 42 | ap-southeast-1: 43 | AMI: "ami-0f511ead81ccde020" 44 | ap-southeast-2: 45 | AMI: "ami-0aab712d6363da7f9" 46 | ca-central-1: 47 | AMI: "ami-02f84cf47c23f1769" 48 | eu-central-1: 49 | AMI: "ami-0453cb7b5f2b7fca2" 50 | eu-west-1: 51 | AMI: "ami-02b4e72b17337d6c1" 52 | eu-west-2: 53 | AMI: "ami-0d26eb3972b7f8c96" 54 | eu-west-3: 55 | AMI: "ami-0d49cec198762b78c" 56 | sa-east-1: 57 | AMI: "ami-0f8243a5175208e08" 58 | us-east-1: 59 | AMI: "ami-0c2b8ca1dad447f8a" 60 | us-east-2: 61 | AMI: "ami-0443305dabd4be2bc" 62 | us-west-1: 63 | AMI: "ami-04b6c97b14c54de18" 64 | us-west-2: 65 | AMI: "ami-083ac7c7ecf9bb9b0" 66 | 67 | Resources: 68 | EC2Instance: 69 | Type: AWS::EC2::Instance 70 | CreationPolicy: 71 | ResourceSignal: 72 | Timeout: PT10M 73 | Properties: 74 | SecurityGroups: [!Ref "AirflowEC2SecurityGroup"] 75 | InstanceType: "m4.xlarge" 76 | IamInstanceProfile: 77 | Ref: EC2InstanceProfile 78 | Tags: 79 | - Key: Name 80 | Value: Airflow 81 | ImageId: !FindInMap 82 | - RegionMap 83 | - !Ref "AWS::Region" 84 | - AMI 85 | UserData: 86 | Fn::Base64: !Sub | 87 | #!/bin/bash 88 | set -x 89 | exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 90 | ln -s /root/user-data.log /var/log/user-data.log 91 | # Get right version of pip 92 | yum install aws-cfn-bootstrap -y 93 | python3 -m pip install pip==20.2.4 --user 94 | # Start cfn-init 95 | /opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region} 96 | yum remove python3-docutils -y 97 | echo "Installing s3fs" 98 | python3 -m pip install --upgrade s3fs==0.4.2 99 | python3 -m pip install psycopg2 wheel 100 | echo "Installing sagemaker sdk" 101 | python3 -m pip install sagemaker==v1.72 102 | # Install airflow using pip 103 | echo "Installing Apache Airflow" 104 | export AIRFLOW_GPL_UNIDECODE=yes 105 | python3 -m pip install apache-airflow[crypto,postgres]==1.10.12 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1.10.12/constraints-3.7.txt" 106 | # Create Fernet Key 107 | export FERNET_KEY=`openssl rand -base64 32` 108 | sed -i 's|fernet_key =|fernet_key = '$FERNET'|g' ~/airflow/airflow.cfg 109 | # Postgres operators and hook, support as an Airflow backend 110 | echo 'export PATH=/usr/local/bin:~/.local/bin:$PATH' >> ~/.bash_profile 111 | source ~/.bash_profile 112 | # Initialize Airflow 113 | airflow initdb 114 | # Update the RDS connection in the Airflow Config file 115 | sed -i '/sql_alchemy_conn/s/^/#/g' ~/airflow/airflow.cfg 116 | sed -i '/#sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${DBPassword}@${DBInstance.Endpoint.Address}:${DBInstance.Endpoint.Port}/airflowdb' ~/airflow/airflow.cfg 117 | # Update the type of executor in the Airflow Config file 118 | sed -i '/executor = SequentialExecutor/s/^/#/g' ~/airflow/airflow.cfg 119 | sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' ~/airflow/airflow.cfg 120 | sed -i 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 121 | sed -i 's/rbac = False/rbac = True/g' ~/airflow/airflow.cfg 122 | airflow initdb 123 | # create airflow connection to sagemaker 124 | cat >> /tmp/airflow_conn.py << EOF 125 | from airflow import settings 126 | from airflow.models import Connection 127 | #create a connection object 128 | extra = 
'{"region_name": "${AWS::Region}"}' 129 | conn_id = 'airflow-sagemaker' 130 | conn = Connection(conn_id=conn_id,conn_type='aws', extra=extra) 131 | # get the session 132 | session = settings.Session() 133 | session.add(conn) 134 | session.commit() 135 | EOF 136 | python3 /tmp/airflow_conn.py 137 | # create directories 138 | mkdir -p ~/airflow/dags/sm-ml-pipeline 139 | # clone the git repository 140 | cd ~ 141 | git clone https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow.git 142 | mv ~/sagemaker-ml-workflow-with-apache-airflow ~/sm-ml-pipeline 143 | cd ~/sm-ml-pipeline/src 144 | # prepare airflow dag definition for sagemaker blog post 145 | sed -i 's//${S3BucketName}/g' ./*.* 146 | sed -i 's//${AWS::Region}/g' ./*.* 147 | sed -i 's//${AWS::AccountId}/g' ~/sm-ml-pipeline/src/config.py 148 | sed -i 's/AirflowSageMakerExecutionRole/AirflowSageMakerExecutionRole-${AWS::StackName}/g' ~/sm-ml-pipeline/src/config.py 149 | zip -r dag.zip * 150 | cp dag.zip ~/airflow/dags/sm-ml-pipeline/dag.zip 151 | cd - 152 | # Run Airflow webserver and scheduler 153 | airflow create_user -e admin@example.com -f admin -l airflow -p ${AirflowPassword} -r Admin -u ${AirflowUser} 154 | airflow list_dags 155 | airflow webserver -D 156 | airflow scheduler -D 157 | yum install aws-cfn-bootstrap -y 158 | /opt/aws/bin/cfn-signal --exit-code 0 --resource EC2Instance --region ${AWS::Region} --stack ${AWS::StackName} 159 | Metadata: 160 | AWS::CloudFormation::Init: 161 | configSets: 162 | install: 163 | - installpackages 164 | installpackages: 165 | packages: 166 | yum: 167 | python3: [] 168 | python3-devel: [] 169 | gcc: [] 170 | gcc-c++: [] 171 | postgresql-devel: [] 172 | openssl-devel: [] 173 | git: [] 174 | DependsOn: 175 | - DBInstance 176 | - AirflowEC2SecurityGroup 177 | DBInstance: 178 | Type: AWS::RDS::DBCluster 179 | DeletionPolicy: Delete 180 | Properties: 181 | DatabaseName: airflowdb 182 | Engine: aurora-postgresql 183 | MasterUsername: airflow 184 | MasterUserPassword: !Ref "DBPassword" 185 | EngineMode: serverless 186 | ScalingConfiguration: 187 | AutoPause: true 188 | MaxCapacity: 16 189 | MinCapacity: 2 190 | SecondsUntilAutoPause: 300 191 | VpcSecurityGroupIds: 192 | - !GetAtt AirflowEC2SecurityGroup.GroupId 193 | AirflowEC2SecurityGroup: 194 | Type: AWS::EC2::SecurityGroup 195 | Properties: 196 | GroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 197 | GroupDescription: Enable HTTP access via port 80 198 | SecurityGroupIngress: 199 | - IpProtocol: tcp 200 | FromPort: 8080 201 | ToPort: 8080 202 | CidrIp: 0.0.0.0/0 203 | - IpProtocol: tcp 204 | FromPort: 5432 205 | ToPort: 5432 206 | SourceSecurityGroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 207 | EC2Role: 208 | Type: AWS::IAM::Role 209 | Properties: 210 | RoleName: !Sub 'AirflowInstanceRole-${AWS::StackName}' 211 | AssumeRolePolicyDocument: 212 | Version: "2012-10-17" 213 | Statement: 214 | - Effect: "Allow" 215 | Principal: 216 | Service: 217 | - "ec2.amazonaws.com" 218 | Action: 219 | - "sts:AssumeRole" 220 | Path: "/" 221 | ManagedPolicyArns: 222 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 223 | - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore 224 | Policies: 225 | - PolicyName: !Sub 'AirflowResourceAccess-${AWS::StackName}' 226 | PolicyDocument: 227 | Version: "2012-10-17" 228 | Statement: 229 | - Effect: Allow 230 | Action: 231 | - s3:* 232 | Resource: 233 | - !Sub "arn:aws:s3:::${S3BucketName}" 234 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 235 | - Effect: Allow 236 | Action: 237 | - iam:GetRole 238 | 
Resource: "*" 239 | EC2InstanceProfile: 240 | Type: AWS::IAM::InstanceProfile 241 | Properties: 242 | InstanceProfileName: !Sub 'AirflowInstanceProfile-${AWS::StackName}' 243 | Roles: 244 | - Ref: EC2Role 245 | S3BucketName: 246 | Type: AWS::S3::Bucket 247 | DeletionPolicy: Delete 248 | Properties: 249 | AccessControl: BucketOwnerFullControl 250 | BucketName: !Join 251 | - "-" 252 | - - "airflow-sagemaker" 253 | - !Select 254 | - 0 255 | - !Split 256 | - "-" 257 | - !Select 258 | - 2 259 | - !Split 260 | - "/" 261 | - !Ref "AWS::StackId" 262 | AirflowSageMakerExecutionRole: 263 | Type: AWS::IAM::Role 264 | Properties: 265 | RoleName: !Sub 'AirflowSageMakerExecutionRole-${AWS::StackName}' 266 | AssumeRolePolicyDocument: 267 | Version: "2012-10-17" 268 | Statement: 269 | - Effect: "Allow" 270 | Principal: 271 | Service: 272 | - "sagemaker.amazonaws.com" 273 | Action: 274 | - "sts:AssumeRole" 275 | ManagedPolicyArns: 276 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 277 | Path: "/service-role/" 278 | Policies: 279 | - PolicyName: !Sub 'SageMakerS3BucketAccess-${AWS::StackName}' 280 | PolicyDocument: 281 | Version: "2012-10-17" 282 | Statement: 283 | - Effect: Allow 284 | Action: 285 | - s3:* 286 | Resource: 287 | - !Sub "arn:aws:s3:::${S3BucketName}" 288 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 289 | Outputs: 290 | AirflowEC2PublicDNSName: 291 | Description: Public DNS Name of the Airflow EC2 instance 292 | Value: !Join ["", ["http://", !GetAtt EC2Instance.PublicDnsName, ":8080"]] -------------------------------------------------------------------------------- /cfn/airflow-ec2-2.0.2-RDS.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | 3 | Description: Airflow server v2.0.2 on EC2 Amazon Linux 2 backed by Postgres RDS 4 | 5 | Parameters: 6 | AirflowUser: 7 | NoEcho: "false" 8 | Description: Airflow UI admin account username 9 | Type: String 10 | MinLength: "4" 11 | MaxLength: "41" 12 | AllowedPattern: "[a-zA-Z0-9]*" 13 | ConstraintDescription: Must contain only alphanumeric characters 14 | AirflowPassword: 15 | NoEcho: "false" 16 | Description: Airflow UI admin account password 17 | Type: String 18 | MinLength: "8" 19 | MaxLength: "41" 20 | AllowedPattern: "[a-zA-Z0-9]*" 21 | ConstraintDescription: Must contain only alphanumeric characters 22 | DBPassword: 23 | NoEcho: "false" 24 | Description: Airflow database admin account password 25 | Type: String 26 | MinLength: "8" 27 | MaxLength: "41" 28 | AllowedPattern: "[a-zA-Z0-9]*" 29 | ConstraintDescription: Must contain only alphanumeric characters 30 | 31 | # Mapping to find the Amazon Linux AMI in each region. 
32 | Mappings: 33 | RegionMap: 34 | ap-northeast-1: 35 | AMI: "ami-09ebacdc178ae23b7" 36 | ap-northeast-2: 37 | AMI: "ami-0a0de518b1fc4524c" 38 | ap-northeast-3: 39 | AMI: "ami-0e787554e61105680" 40 | ap-south-1: 41 | AMI: "ami-04db49c0fb2215364" 42 | ap-southeast-1: 43 | AMI: "ami-0f511ead81ccde020" 44 | ap-southeast-2: 45 | AMI: "ami-0aab712d6363da7f9" 46 | ca-central-1: 47 | AMI: "ami-02f84cf47c23f1769" 48 | eu-central-1: 49 | AMI: "ami-0453cb7b5f2b7fca2" 50 | eu-west-1: 51 | AMI: "ami-02b4e72b17337d6c1" 52 | eu-west-2: 53 | AMI: "ami-0d26eb3972b7f8c96" 54 | eu-west-3: 55 | AMI: "ami-0d49cec198762b78c" 56 | sa-east-1: 57 | AMI: "ami-0f8243a5175208e08" 58 | us-east-1: 59 | AMI: "ami-0c2b8ca1dad447f8a" 60 | us-east-2: 61 | AMI: "ami-0443305dabd4be2bc" 62 | us-west-1: 63 | AMI: "ami-04b6c97b14c54de18" 64 | us-west-2: 65 | AMI: "ami-083ac7c7ecf9bb9b0" 66 | 67 | Resources: 68 | EC2Instance: 69 | Type: AWS::EC2::Instance 70 | CreationPolicy: 71 | ResourceSignal: 72 | Timeout: PT10M 73 | Properties: 74 | SecurityGroups: [!Ref "AirflowEC2SecurityGroup"] 75 | InstanceType: "m4.xlarge" 76 | IamInstanceProfile: 77 | Ref: EC2InstanceProfile 78 | Tags: 79 | - Key: Name 80 | Value: Airflow 81 | ImageId: !FindInMap 82 | - RegionMap 83 | - !Ref "AWS::Region" 84 | - AMI 85 | UserData: 86 | Fn::Base64: !Sub | 87 | #!/bin/bash 88 | set -x 89 | exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 90 | ln -s /root/user-data.log /var/log/user-data.log 91 | # Get right version of pip 92 | yum install aws-cfn-bootstrap -y 93 | python3 -m pip install pip==20.2.4 --user 94 | # Start cfn-init 95 | /opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region} 96 | yum remove python3-docutils -y 97 | echo "Installing s3fs" 98 | python3 -m pip install --upgrade s3fs==0.4.2 99 | python3 -m pip install psycopg2 wheel 100 | # Upgrade sqlite 101 | wget https://www.sqlite.org/src/tarball/sqlite.tar.gz 102 | tar xzf sqlite.tar.gz 103 | cd sqlite/ 104 | export CFLAGS="-DSQLITE_ENABLE_FTS3 \ 105 | -DSQLITE_ENABLE_FTS3_PARENTHESIS \ 106 | -DSQLITE_ENABLE_FTS4 \ 107 | -DSQLITE_ENABLE_FTS5 \ 108 | -DSQLITE_ENABLE_JSON1 \ 109 | -DSQLITE_ENABLE_LOAD_EXTENSION \ 110 | -DSQLITE_ENABLE_RTREE \ 111 | -DSQLITE_ENABLE_STAT4 \ 112 | -DSQLITE_ENABLE_UPDATE_DELETE_LIMIT \ 113 | -DSQLITE_SOUNDEX \ 114 | -DSQLITE_TEMP_STORE=3 \ 115 | -DSQLITE_USE_URI \ 116 | -O2 \ 117 | -fPIC" 118 | export PREFIX="/usr/local" 119 | LIBS="-lm" ./configure --disable-tcl --enable-shared --enable-tempstore=always --prefix="$PREFIX" 120 | make 121 | make install 122 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 123 | echo "Installing sagemaker sdk" 124 | python3 -m pip install sagemaker==v1.72 125 | # Install airflow using pip 126 | echo "Installing Apache Airflow" 127 | export AIRFLOW_GPL_UNIDECODE=yes 128 | python3 -m pip install apache-airflow[crypto,postgres,amazon]==2.0.2 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.0.2/constraints-3.7.txt" 129 | # Create Fernet Key 130 | export FERNET_KEY=`openssl rand -base64 32` 131 | sed -i 's|fernet_key =|fernet_key = '$FERNET'|g' ~/airflow/airflow.cfg 132 | # Postgres operators and hook, support as an Airflow backend 133 | echo 'export PATH=/usr/local/bin:~/.local/bin:$PATH' >> ~/.bash_profile 134 | source ~/.bash_profile 135 | # Initialize Airflow 136 | airflow db init 137 | # Update the RDS connection in the Airflow Config file 138 | sed -i '/sql_alchemy_conn/s/^/#/g' ~/airflow/airflow.cfg 139 | sed -i 
'/#sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${DBPassword}@${DBInstance.Endpoint.Address}:${DBInstance.Endpoint.Port}/airflowdb' ~/airflow/airflow.cfg 140 | # Update the type of executor in the Airflow Config file 141 | sed -i '/executor = SequentialExecutor/s/^/#/g' ~/airflow/airflow.cfg 142 | sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' ~/airflow/airflow.cfg 143 | sed -i 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 144 | airflow db init 145 | airflow users create -e admin@example.com -f admin -l airflow -p ${AirflowPassword} -r Admin -u ${AirflowUser} 146 | # create airflow connection to sagemaker 147 | cat >> /tmp/airflow_conn.py << EOF 148 | from airflow import settings 149 | from airflow.models import Connection 150 | #create a connection object 151 | extra = '{"region_name": "${AWS::Region}"}' 152 | conn_id = 'airflow-sagemaker' 153 | conn = Connection(conn_id=conn_id,conn_type='s3', extra=extra) 154 | # get the session 155 | session = settings.Session() 156 | session.add(conn) 157 | session.commit() 158 | EOF 159 | python3 /tmp/airflow_conn.py 160 | # create directories 161 | mkdir -p ~/airflow/dags/sm-ml-pipeline 162 | # clone the git repository 163 | cd ~ 164 | git clone https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow.git 165 | mv ~/sagemaker-ml-workflow-with-apache-airflow ~/sm-ml-pipeline 166 | cd ~/sm-ml-pipeline/src 167 | # prepare airflow dag definition for sagemaker blog post 168 | sed -i 's//${S3BucketName}/g' ./*.* 169 | sed -i 's//${AWS::Region}/g' ./*.* 170 | sed -i 's//${AWS::AccountId}/g' ~/sm-ml-pipeline/src/config.py 171 | sed -i 's/AirflowSageMakerExecutionRole/AirflowSageMakerExecutionRole-${AWS::StackName}/g' ~/sm-ml-pipeline/src/config.py 172 | sed -i "s/hook = AwsHook(aws_conn_id='airflow-sagemaker')/hook = AwsHook(aws_conn_id='airflow-sagemaker', client_type='s3')/g" ~/sm-ml-pipeline/src/dag_ml_pipeline_amazon_video_reviews.py 173 | sed -i '/provide_context=False/d' ~/sm-ml-pipeline/src/dag_ml_pipeline_amazon_video_reviews.py 174 | sed -i 's/enable_xcom_pickling = False/enable_xcom_pickling = True/g' ~/airflow/airflow.cfg 175 | zip -r dag.zip * 176 | cp dag.zip ~/airflow/dags/sm-ml-pipeline/dag.zip 177 | cd - 178 | # Run Airflow webserver and scheduler 179 | airflow dags list 180 | airflow webserver -D 181 | airflow scheduler -D 182 | yum install aws-cfn-bootstrap -y 183 | /opt/aws/bin/cfn-signal --exit-code 0 --resource EC2Instance --region ${AWS::Region} --stack ${AWS::StackName} 184 | Metadata: 185 | AWS::CloudFormation::Init: 186 | configSets: 187 | install: 188 | - installpackages 189 | installpackages: 190 | packages: 191 | yum: 192 | python3: [] 193 | python3-devel: [] 194 | gcc: [] 195 | gcc-c++: [] 196 | postgresql-devel: [] 197 | openssl-devel: [] 198 | git: [] 199 | DependsOn: 200 | - DBInstance 201 | - AirflowEC2SecurityGroup 202 | DBInstance: 203 | Type: AWS::RDS::DBInstance 204 | DeletionPolicy: Delete 205 | Properties: 206 | DBName: airflowdb 207 | Engine: postgres 208 | MasterUsername: airflow 209 | MasterUserPassword: !Ref "DBPassword" 210 | DBInstanceClass: db.t3.small 211 | AllocatedStorage: 5 212 | DBSecurityGroups: 213 | - Ref: DBSecurityGroup 214 | AirflowEC2SecurityGroup: 215 | Type: AWS::EC2::SecurityGroup 216 | Properties: 217 | GroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 218 | GroupDescription: Enable HTTP access via port 8080 219 | SecurityGroupIngress: 220 | - IpProtocol: tcp 221 | FromPort: 8080 222 | ToPort: 8080 223 | CidrIp: 0.0.0.0/0 
224 | DBSecurityGroup: 225 | Type: AWS::RDS::DBSecurityGroup 226 | Properties: 227 | GroupDescription: Frontend Access 228 | DBSecurityGroupIngress: 229 | EC2SecurityGroupName: 230 | Ref: AirflowEC2SecurityGroup 231 | EC2Role: 232 | Type: AWS::IAM::Role 233 | Properties: 234 | RoleName: !Sub 'AirflowInstanceRole-${AWS::StackName}' 235 | AssumeRolePolicyDocument: 236 | Version: "2012-10-17" 237 | Statement: 238 | - Effect: "Allow" 239 | Principal: 240 | Service: 241 | - "ec2.amazonaws.com" 242 | Action: 243 | - "sts:AssumeRole" 244 | Path: "/" 245 | ManagedPolicyArns: 246 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 247 | - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore 248 | Policies: 249 | - PolicyName: !Sub 'AirflowResourceAccess-${AWS::StackName}' 250 | PolicyDocument: 251 | Version: "2012-10-17" 252 | Statement: 253 | - Effect: Allow 254 | Action: 255 | - s3:* 256 | Resource: 257 | - !Sub "arn:aws:s3:::${S3BucketName}" 258 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 259 | - Effect: Allow 260 | Action: 261 | - iam:GetRole 262 | Resource: "*" 263 | EC2InstanceProfile: 264 | Type: AWS::IAM::InstanceProfile 265 | Properties: 266 | InstanceProfileName: !Sub 'AirflowInstanceProfile-${AWS::StackName}' 267 | Roles: 268 | - Ref: EC2Role 269 | S3BucketName: 270 | Type: AWS::S3::Bucket 271 | DeletionPolicy: Delete 272 | Properties: 273 | AccessControl: BucketOwnerFullControl 274 | BucketName: !Join 275 | - "-" 276 | - - "airflow-sagemaker" 277 | - !Select 278 | - 0 279 | - !Split 280 | - "-" 281 | - !Select 282 | - 2 283 | - !Split 284 | - "/" 285 | - !Ref "AWS::StackId" 286 | AirflowSageMakerExecutionRole: 287 | Type: AWS::IAM::Role 288 | Properties: 289 | RoleName: !Sub 'AirflowSageMakerExecutionRole-${AWS::StackName}' 290 | AssumeRolePolicyDocument: 291 | Version: "2012-10-17" 292 | Statement: 293 | - Effect: "Allow" 294 | Principal: 295 | Service: 296 | - "sagemaker.amazonaws.com" 297 | Action: 298 | - "sts:AssumeRole" 299 | ManagedPolicyArns: 300 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 301 | Path: "/service-role/" 302 | Policies: 303 | - PolicyName: !Sub 'SageMakerS3BucketAccess-${AWS::StackName}' 304 | PolicyDocument: 305 | Version: "2012-10-17" 306 | Statement: 307 | - Effect: Allow 308 | Action: 309 | - s3:* 310 | Resource: 311 | - !Sub "arn:aws:s3:::${S3BucketName}" 312 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 313 | Outputs: 314 | AirflowEC2PublicDNSName: 315 | Description: Public DNS Name of the Airflow EC2 instance 316 | Value: !Join ["", ["http://", !GetAtt EC2Instance.PublicDnsName, ":8080"]] -------------------------------------------------------------------------------- /cfn/airflow-ec2-2.0.2-Aurora-Serverless.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | 3 | Description: Airflow server v2.0.2 on EC2 Amazon Linux 2 backed by Postgres Aurora Serverless 4 | 5 | Parameters: 6 | AirflowUser: 7 | NoEcho: "false" 8 | Description: Airflow UI admin account username 9 | Type: String 10 | MinLength: "4" 11 | MaxLength: "41" 12 | AllowedPattern: "[a-zA-Z0-9]*" 13 | ConstraintDescription: Must contain only alphanumeric characters 14 | AirflowPassword: 15 | NoEcho: "false" 16 | Description: Airflow UI admin account password 17 | Type: String 18 | MinLength: "8" 19 | MaxLength: "41" 20 | AllowedPattern: "[a-zA-Z0-9]*" 21 | ConstraintDescription: Must contain only alphanumeric characters 22 | DBPassword: 23 | NoEcho: "false" 24 | Description: Airflow database admin account password 
25 | Type: String 26 | MinLength: "8" 27 | MaxLength: "41" 28 | AllowedPattern: "[a-zA-Z0-9]*" 29 | ConstraintDescription: Must contain only alphanumeric characters 30 | 31 | # Mapping to find the Amazon Linux AMI in each region. 32 | Mappings: 33 | RegionMap: 34 | ap-northeast-1: 35 | AMI: "ami-09ebacdc178ae23b7" 36 | ap-northeast-2: 37 | AMI: "ami-0a0de518b1fc4524c" 38 | ap-northeast-3: 39 | AMI: "ami-0e787554e61105680" 40 | ap-south-1: 41 | AMI: "ami-04db49c0fb2215364" 42 | ap-southeast-1: 43 | AMI: "ami-0f511ead81ccde020" 44 | ap-southeast-2: 45 | AMI: "ami-0aab712d6363da7f9" 46 | ca-central-1: 47 | AMI: "ami-02f84cf47c23f1769" 48 | eu-central-1: 49 | AMI: "ami-0453cb7b5f2b7fca2" 50 | eu-west-1: 51 | AMI: "ami-02b4e72b17337d6c1" 52 | eu-west-2: 53 | AMI: "ami-0d26eb3972b7f8c96" 54 | eu-west-3: 55 | AMI: "ami-0d49cec198762b78c" 56 | sa-east-1: 57 | AMI: "ami-0f8243a5175208e08" 58 | us-east-1: 59 | AMI: "ami-0c2b8ca1dad447f8a" 60 | us-east-2: 61 | AMI: "ami-0443305dabd4be2bc" 62 | us-west-1: 63 | AMI: "ami-04b6c97b14c54de18" 64 | us-west-2: 65 | AMI: "ami-083ac7c7ecf9bb9b0" 66 | 67 | Resources: 68 | EC2Instance: 69 | Type: AWS::EC2::Instance 70 | CreationPolicy: 71 | ResourceSignal: 72 | Timeout: PT10M 73 | Properties: 74 | SecurityGroups: [!Ref "AirflowEC2SecurityGroup"] 75 | InstanceType: "m4.xlarge" 76 | IamInstanceProfile: 77 | Ref: EC2InstanceProfile 78 | Tags: 79 | - Key: Name 80 | Value: Airflow 81 | ImageId: !FindInMap 82 | - RegionMap 83 | - !Ref "AWS::Region" 84 | - AMI 85 | UserData: 86 | Fn::Base64: !Sub | 87 | #!/bin/bash 88 | set -x 89 | exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 90 | ln -s /root/user-data.log /var/log/user-data.log 91 | # Get right version of pip 92 | yum install aws-cfn-bootstrap -y 93 | python3 -m pip install pip==20.2.4 --user 94 | # Start cfn-init 95 | /opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region} 96 | yum remove python3-docutils -y 97 | echo "Installing s3fs" 98 | python3 -m pip install --upgrade s3fs==0.4.2 99 | python3 -m pip install psycopg2 wheel 100 | # Upgrade sqlite 101 | wget https://www.sqlite.org/src/tarball/sqlite.tar.gz 102 | tar xzf sqlite.tar.gz 103 | cd sqlite/ 104 | export CFLAGS="-DSQLITE_ENABLE_FTS3 \ 105 | -DSQLITE_ENABLE_FTS3_PARENTHESIS \ 106 | -DSQLITE_ENABLE_FTS4 \ 107 | -DSQLITE_ENABLE_FTS5 \ 108 | -DSQLITE_ENABLE_JSON1 \ 109 | -DSQLITE_ENABLE_LOAD_EXTENSION \ 110 | -DSQLITE_ENABLE_RTREE \ 111 | -DSQLITE_ENABLE_STAT4 \ 112 | -DSQLITE_ENABLE_UPDATE_DELETE_LIMIT \ 113 | -DSQLITE_SOUNDEX \ 114 | -DSQLITE_TEMP_STORE=3 \ 115 | -DSQLITE_USE_URI \ 116 | -O2 \ 117 | -fPIC" 118 | export PREFIX="/usr/local" 119 | LIBS="-lm" ./configure --disable-tcl --enable-shared --enable-tempstore=always --prefix="$PREFIX" 120 | make 121 | make install 122 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 123 | echo "Installing sagemaker sdk" 124 | python3 -m pip install sagemaker==v1.72 125 | # Install airflow using pip 126 | echo "Installing Apache Airflow" 127 | export AIRFLOW_GPL_UNIDECODE=yes 128 | python3 -m pip install apache-airflow[crypto,postgres,amazon]==2.0.2 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.0.2/constraints-3.7.txt" 129 | # Create Fernet Key 130 | export FERNET_KEY=`openssl rand -base64 32` 131 | sed -i 's|fernet_key =|fernet_key = '$FERNET'|g' ~/airflow/airflow.cfg 132 | # Postgres operators and hook, support as an Airflow backend 133 | echo 'export PATH=/usr/local/bin:~/.local/bin:$PATH' >> ~/.bash_profile 
134 | source ~/.bash_profile 135 | # Initialize Airflow 136 | airflow db init 137 | # Update the RDS connection in the Airflow Config file 138 | sed -i '/sql_alchemy_conn/s/^/#/g' ~/airflow/airflow.cfg 139 | sed -i '/#sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${DBPassword}@${DBInstance.Endpoint.Address}:${DBInstance.Endpoint.Port}/airflowdb' ~/airflow/airflow.cfg 140 | # Update the type of executor in the Airflow Config file 141 | sed -i '/executor = SequentialExecutor/s/^/#/g' ~/airflow/airflow.cfg 142 | sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' ~/airflow/airflow.cfg 143 | sed -i 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 144 | airflow db init 145 | airflow users create -e admin@example.com -f admin -l airflow -p ${AirflowPassword} -r Admin -u ${AirflowUser} 146 | # create airflow connection to sagemaker 147 | cat >> /tmp/airflow_conn.py << EOF 148 | from airflow import settings 149 | from airflow.models import Connection 150 | #create a connection object 151 | extra = '{"region_name": "${AWS::Region}"}' 152 | conn_id = 'airflow-sagemaker' 153 | conn = Connection(conn_id=conn_id,conn_type='s3', extra=extra) 154 | # get the session 155 | session = settings.Session() 156 | session.add(conn) 157 | session.commit() 158 | EOF 159 | python3 /tmp/airflow_conn.py 160 | # create directories 161 | mkdir -p ~/airflow/dags/sm-ml-pipeline 162 | # clone the git repository 163 | cd ~ 164 | git clone https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow.git 165 | mv ~/sagemaker-ml-workflow-with-apache-airflow ~/sm-ml-pipeline 166 | cd ~/sm-ml-pipeline/src 167 | # prepare airflow dag definition for sagemaker blog post 168 | sed -i 's//${S3BucketName}/g' ./*.* 169 | sed -i 's//${AWS::Region}/g' ./*.* 170 | sed -i 's//${AWS::AccountId}/g' ~/sm-ml-pipeline/src/config.py 171 | sed -i 's/AirflowSageMakerExecutionRole/AirflowSageMakerExecutionRole-${AWS::StackName}/g' ~/sm-ml-pipeline/src/config.py 172 | sed -i "s/hook = AwsHook(aws_conn_id='airflow-sagemaker')/hook = AwsHook(aws_conn_id='airflow-sagemaker', client_type='s3')/g" ~/sm-ml-pipeline/src/dag_ml_pipeline_amazon_video_reviews.py 173 | sed -i '/provide_context=False/d' ~/sm-ml-pipeline/src/dag_ml_pipeline_amazon_video_reviews.py 174 | sed -i 's/enable_xcom_pickling = False/enable_xcom_pickling = True/g' ~/airflow/airflow.cfg 175 | zip -r dag.zip * 176 | cp dag.zip ~/airflow/dags/sm-ml-pipeline/dag.zip 177 | cd - 178 | # Run Airflow webserver and scheduler 179 | airflow dags list 180 | airflow webserver -D 181 | airflow scheduler -D 182 | yum install aws-cfn-bootstrap -y 183 | /opt/aws/bin/cfn-signal --exit-code 0 --resource EC2Instance --region ${AWS::Region} --stack ${AWS::StackName} 184 | Metadata: 185 | AWS::CloudFormation::Init: 186 | configSets: 187 | install: 188 | - installpackages 189 | installpackages: 190 | packages: 191 | yum: 192 | python3: [] 193 | python3-devel: [] 194 | gcc: [] 195 | gcc-c++: [] 196 | postgresql-devel: [] 197 | openssl-devel: [] 198 | git: [] 199 | DependsOn: 200 | - DBInstance 201 | - AirflowEC2SecurityGroup 202 | DBInstance: 203 | Type: AWS::RDS::DBCluster 204 | DeletionPolicy: Delete 205 | Properties: 206 | DatabaseName: airflowdb 207 | Engine: aurora-postgresql 208 | MasterUsername: airflow 209 | MasterUserPassword: !Ref "DBPassword" 210 | EngineMode: serverless 211 | ScalingConfiguration: 212 | AutoPause: true 213 | MaxCapacity: 16 214 | MinCapacity: 2 215 | SecondsUntilAutoPause: 300 216 | VpcSecurityGroupIds: 217 | - !GetAtt 
AirflowEC2SecurityGroup.GroupId 218 | AirflowEC2SecurityGroup: 219 | Type: AWS::EC2::SecurityGroup 220 | Properties: 221 | GroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 222 | GroupDescription: Enable HTTP access via port 80 223 | SecurityGroupIngress: 224 | - IpProtocol: tcp 225 | FromPort: 8080 226 | ToPort: 8080 227 | CidrIp: 0.0.0.0/0 228 | - IpProtocol: tcp 229 | FromPort: 5432 230 | ToPort: 5432 231 | SourceSecurityGroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 232 | EC2Role: 233 | Type: AWS::IAM::Role 234 | Properties: 235 | RoleName: !Sub 'AirflowInstanceRole-${AWS::StackName}' 236 | AssumeRolePolicyDocument: 237 | Version: "2012-10-17" 238 | Statement: 239 | - Effect: "Allow" 240 | Principal: 241 | Service: 242 | - "ec2.amazonaws.com" 243 | Action: 244 | - "sts:AssumeRole" 245 | Path: "/" 246 | ManagedPolicyArns: 247 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 248 | - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore 249 | Policies: 250 | - PolicyName: !Sub 'AirflowResourceAccess-${AWS::StackName}' 251 | PolicyDocument: 252 | Version: "2012-10-17" 253 | Statement: 254 | - Effect: Allow 255 | Action: 256 | - s3:* 257 | Resource: 258 | - !Sub "arn:aws:s3:::${S3BucketName}" 259 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 260 | - Effect: Allow 261 | Action: 262 | - iam:GetRole 263 | Resource: "*" 264 | EC2InstanceProfile: 265 | Type: AWS::IAM::InstanceProfile 266 | Properties: 267 | InstanceProfileName: !Sub 'AirflowInstanceProfile-${AWS::StackName}' 268 | Roles: 269 | - Ref: EC2Role 270 | S3BucketName: 271 | Type: AWS::S3::Bucket 272 | DeletionPolicy: Delete 273 | Properties: 274 | AccessControl: BucketOwnerFullControl 275 | BucketName: !Join 276 | - "-" 277 | - - "airflow-sagemaker" 278 | - !Select 279 | - 0 280 | - !Split 281 | - "-" 282 | - !Select 283 | - 2 284 | - !Split 285 | - "/" 286 | - !Ref "AWS::StackId" 287 | AirflowSageMakerExecutionRole: 288 | Type: AWS::IAM::Role 289 | Properties: 290 | RoleName: !Sub 'AirflowSageMakerExecutionRole-${AWS::StackName}' 291 | AssumeRolePolicyDocument: 292 | Version: "2012-10-17" 293 | Statement: 294 | - Effect: "Allow" 295 | Principal: 296 | Service: 297 | - "sagemaker.amazonaws.com" 298 | Action: 299 | - "sts:AssumeRole" 300 | ManagedPolicyArns: 301 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 302 | Path: "/service-role/" 303 | Policies: 304 | - PolicyName: !Sub 'SageMakerS3BucketAccess-${AWS::StackName}' 305 | PolicyDocument: 306 | Version: "2012-10-17" 307 | Statement: 308 | - Effect: Allow 309 | Action: 310 | - s3:* 311 | Resource: 312 | - !Sub "arn:aws:s3:::${S3BucketName}" 313 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 314 | Outputs: 315 | AirflowEC2PublicDNSName: 316 | Description: Public DNS Name of the Airflow EC2 instance 317 | Value: !Join ["", ["http://", !GetAtt EC2Instance.PublicDnsName, ":8080"]] -------------------------------------------------------------------------------- /notebooks/amazon-video-recommender_using_fm_algo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Implementing Recommender System with SageMaker Built-In Algorithm\n", 8 | "_**Making Product Recommendations Using Factorization Machines**_\n", 9 | "\n", 10 | "--- \n", 11 | "\n", 12 | "*This work is based on content from [Gluon based Recommender System 
notebook](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_applying_machine_learning/gluon_recommender_system/gluon_recommender_system.ipynb)*\n", 13 | "\n", 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Table of Contents\n", 22 | "\n", 23 | "1. [Background](#Background)\n", 24 | "1. [Setup](#Setup)\n", 25 | "1. [Data](#Data)\n", 26 | " 1. [Explore](#Explore)\n", 27 | " 1. [Clean](#Clean)\n", 28 | " 1. [Prepare](#Prepare)\n", 29 | "1. [Model Training](#Model-Training)\n", 30 | "1. [Model Inference](#Model-Inference)\n", 31 | " 1. [Real-Time Inference](#Real-Time-Inference)\n", 32 | " 1. [Batch Inference](#Batch-Inference)\n", 33 | "1. [Evaluate Model Performance](#Evaluate-Model-Performance)\n", 34 | "1. [Model Tuning](#Model-Tuning)\n", 35 | "1. [Wrap-up](#Wrap-up)\n", 36 | " 1. [Clean-Up](#Clean-up-(optional))" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "---" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Background\n", 51 | "\n", 52 | "In many ways, recommender systems were a catalyst for the current popularity of machine learning. One of Amazon's earliest successes was the \"Customers who bought this, also bought...\" feature, while the million dollar Netflix Prize spurred research, raised public awareness, and inspired numerous other data science competitions.\n", 53 | "\n", 54 | "Recommender systems can utilize a multitude of data sources and ML algorithms, and most combine various unsupervised, supervised, and reinforcement learning techniques into a holistic framework. However, the core component is almost always a model which predicts a user's rating (or purchase) for a certain item based on that user's historical ratings of similar items as well as the behavior of other similar users. The minimal required dataset for this is a history of user item ratings. In our case, we'll use 1 to 5 star ratings from over 2M Amazon customers on over 160K digital videos. More details on this dataset can be found at its [AWS Public Datasets page](https://s3.amazonaws.com/amazon-reviews-pds/readme.html).\n", 55 | "\n", 56 | "Matrix factorization has been the cornerstone of most user-item prediction models. This method starts with the large, sparse, user-item ratings in a single matrix, where users index the rows, and items index the columns. It then seeks to find two lower-dimensional, dense matrices which, when multiplied together, preserve the information and relationships in the larger matrix.\n", 57 | "\n", 58 | "![image](./factorization.png)\n", 59 | "\n", 60 | "Matrix factorization has been extended and generalized with deep learning and embeddings. These techniques allows us to introduce non-linearities for enhanced performance and flexibility. This notebook will fit a neural network-based model to generate recommendations for the Amazon video dataset. It will start by exploring our data in the notebook, training a model on the data and fit our model using a SageMaker managed training cluster. 
We'll then deploy to an endpoint and check our method.\n", 61 | "\n", 62 | "We will also see how the tasks in the machine learning pipeline can be orchestrated and automated through Apache Airflow integration with Sagemaker.\n", 63 | "\n", 64 | "---\n", 65 | "\n", 66 | "## Setup\n", 67 | "\n", 68 | "_This notebook was created and tested on an ml.t2.xlarge notebook instance._\n", 69 | "\n", 70 | "Let's start by specifying:\n", 71 | "\n", 72 | "- The S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the Notebook Instance, training, and hosting.\n", 73 | "- The IAM role arn used to give training and hosting access to your data. See the documentation for how to create these. Note, if more than one role is required for notebook instances, training, and/or hosting, please replace the `get_execution_role()` call with the appropriate full IAM role arn string(s)." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "isConfigCell": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "bucket = 'mybucket' #replace with your bucket\n", 85 | "prefix = 'sagemaker/fm-recsys'\n", 86 | "\n", 87 | "import sagemaker\n", 88 | "import boto3  # needed below for the SageMaker client\n", 89 | "from sagemaker.tuner import HyperparameterTuner, ContinuousParameter\n", 90 | "from sagemaker.analytics import HyperparameterTuningJobAnalytics, TrainingJobAnalytics\n", 91 | "\n", 92 | "role = sagemaker.get_execution_role()\n", 93 | "sess = sagemaker.Session()\n", 94 | "smclient = boto3.Session().client('sagemaker')" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "Now let's load the Python libraries we'll need for the remainder of this example notebook." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import os\n", 111 | "import io\n", 112 | "import sys\n", 113 | "import time\n", 114 | "\n", 115 | "import pandas as pd\n", 116 | "import numpy as np\n", 117 | "from scipy.sparse import lil_matrix\n", 118 | "\n", 119 | "import boto3\n", 120 | "import json\n", 121 | "\n", 122 | "import matplotlib.pyplot as plt\n", 123 | "import seaborn as sns\n", 124 | "\n", 125 | "import sagemaker.amazon.common as smac\n", 126 | "from sagemaker.predictor import json_deserializer" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# plot aesthetics\n", 136 | "sns.set(color_codes=True)\n", 137 | "sns.set_context('paper')\n", 138 | "five_thirty_eight = [\"#30a2da\", \"#fc4f30\", \"#e5ae38\", \"#6d904f\", \"#8b8b8b\",]\n", 139 | "sns.set_palette(five_thirty_eight)\n", 140 | "\n", 141 | "%matplotlib inline" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "---\n", 149 | "## Data\n", 150 | "\n", 151 | "### Explore\n", 152 | "\n", 153 | "Let's start by bringing in our dataset from an S3 public bucket. As mentioned above, this contains 1 to 5 star ratings from over 2M Amazon customers on over 160K digital videos. More details on this dataset can be found at its [AWS Public Datasets page](https://s3.amazonaws.com/amazon-reviews-pds/readme.html).\n", 154 | "\n", 155 | "_Note, because this dataset is over a half gigabyte, the load from S3 may take ~10 minutes. 
Also, since Amazon SageMaker Notebooks start with a 5GB persistent volume by default, and we don't need to keep this data on our instance for long, we'll bring it to the temporary volume (which has up to 20GB of storage)._" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "!mkdir /tmp/recsys/\n", 165 | "!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz /tmp/recsys/" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Let's read the data into a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) so that we can begin to understand it.\n", 173 | "\n", 174 | "*Note, we'll set `error_bad_lines=False` when reading the file in as there appear to be a very small number of records which would create a problem otherwise.*" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "reviews = pd.read_csv('/tmp/recsys/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz', delimiter='\\t',error_bad_lines=False)\n", 184 | "reviews.head()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "We can see this dataset includes information like:\n", 192 | "\n", 193 | "- `marketplace`: 2-letter country code (in this case all \"US\").\n", 194 | "- `customer_id`: Random identifier that can be used to aggregate reviews written by a single author.\n", 195 | "- `review_id`: A unique ID for the review.\n", 196 | "- `product_id`: The Amazon Standard Identification Number (ASIN). Appending the ASIN to `http://www.amazon.com/dp/` links to the product's detail page.\n", 197 | "- `product_parent`: The parent of that ASIN. Multiple ASINs (color or format variations of the same product) can roll up into a single parent.\n", 198 | "- `product_title`: Title description of the product.\n", 199 | "- `product_category`: Broad product category that can be used to group reviews (in this case digital videos).\n", 200 | "- `star_rating`: The review's rating (1 to 5 stars).\n", 201 | "- `helpful_votes`: Number of helpful votes for the review.\n", 202 | "- `total_votes`: Number of total votes the review received.\n", 203 | "- `vine`: Was the review written as part of the [Vine](https://www.amazon.com/gp/vine/help) program?\n", 204 | "- `verified_purchase`: Was the review from a verified purchase?\n", 205 | "- `review_headline`: The title of the review itself.\n", 206 | "- `review_body`: The text of the review.\n", 207 | "- `review_date`: The date the review was written.\n", 208 | "\n", 209 | "For this example, let's limit ourselves to `customer_id`, `product_id`, and `star_rating`. 
Including additional features in our recommendation system could be beneficial, but would require substantial processing (particularly the text data) which would take us beyond the scope of this notebook.\n", 210 | "\n", 211 | "*Note: we'll keep `product_title` on the dataset to help verify our recommendations later in the notebook, but it will not be used in algorithm training.*" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "reviews = reviews[['customer_id', 'product_id', 'star_rating', 'product_title']]" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "Because most people haven't seen most videos, and people rate fewer videos than we actually watch, we'd expect our data to be sparse. Our algorithm should work well with this sparse problem in general, but we may still want to clean out some of the long tail. Let's look at some basic percentiles to confirm." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "customers = reviews['customer_id'].value_counts()\n", 237 | "products = reviews['product_id'].value_counts()\n", 238 | "\n", 239 | "quantiles = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 1]\n", 240 | "product_q = pd.DataFrame(zip(quantiles, products.quantile(quantiles)), columns=[\"quantile\", \"products\"])\n", 241 | "customer_q = pd.DataFrame(zip(quantiles, customers.quantile(quantiles)), columns=[\"quantile\", \"customers\"])\n", 242 | "# product_q.tail(10)\n", 243 | "# customer_q.tail(10)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "axp = sns.barplot(x=\"quantile\", y=\"products\", data=product_q, palette=five_thirty_eight)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "axc = sns.barplot(x=\"quantile\", y=\"customers\", data=customer_q, palette=five_thirty_eight)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "As we can see, only about 5% of customers have rated 5 or more videos, and only 25% of videos have been rated by 9+ customers.\n", 269 | "\n", 270 | "### Clean\n", 271 | "\n", 272 | "Let's filter out this long tail and remove any duplicate reviews (same product and customer)." 
273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "customers = customers[customers >= 5]\n", 282 | "products = products[products >= 10]\n", 283 | "\n", 284 | "print(\"# of records before removing the long tail = {:10d}\".format(reviews.shape[0]))\n", 285 | "reduced_df = reviews.merge(pd.DataFrame({'customer_id': customers.index})).merge(pd.DataFrame({'product_id': products.index}))\n", 286 | "print(\"# of records after removing the long tail = {:10d}\".format(reduced_df.shape[0]))\n", 287 | "reduced_df = reduced_df.drop_duplicates(['customer_id', 'product_id'])\n", 288 | "print(\"# of records after removing duplicates = {:10d}\".format(reduced_df.shape[0]))" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "Now, we'll recreate our customer and product lists, since there are customers with 5 or more reviews whose reviews are all on products with fewer than 10 reviews (and vice versa)." 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "customers = reduced_df['customer_id'].value_counts()\n", 305 | "products = reduced_df['product_id'].value_counts()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Next, we'll number each user and item, giving them their own sequential index. This will allow us to hold the information in a sparse format where the sequential indices indicate the row and column in our ratings matrix." 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "customer_index = pd.DataFrame({'customer_id': customers.index, 'customer': np.arange(customers.shape[0])})\n", 322 | "product_index = pd.DataFrame({'product_id': products.index, \n", 323 | " 'product': np.arange(products.shape[0])})\n", 324 | "\n", 325 | "reduced_df = reduced_df.merge(customer_index).merge(product_index)\n", 326 | "reduced_df.head()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "Let's look at the feature dimension size, which will be required for preparing the training and test data sets." 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "scrolled": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "nb_customer = reduced_df['customer'].max() + 1\n", 345 | "nb_products = reduced_df['product'].max() + 1\n", 346 | "feature_dim = nb_customer + nb_products\n", 347 | "print(nb_customer, nb_products, feature_dim)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "Trim down the data set to include only `customer`, `product`, and `star_rating`, which is all the training algorithm needs to build the model." 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "product_df = reduced_df[['customer', 'product', 'star_rating']]\n", 364 | "product_df.head()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "### Prepare\n", 372 | "\n", 373 | "We will be using SageMaker's implementation of Factorization Machines (FM) for building a recommender system. 
The algorithm expects float32 tensors in protobuf format, whereas the data sets are pandas DataFrames on disk. Most of the conversion effort is handled by the Amazon SageMaker Python SDK.\n", 374 | "\n", 375 | "The FM algorithm works with sparse input, and since our data sets are dense matrices, they have to be converted to sparse matrices with one-hot encoded feature vectors for customers and products. Thus, each sample in the data set will be a wide boolean vector over a 178729-dimensional feature space (140344 customers + 38385 products), with only two values set to 1: one for the customer and one for the product.\n", 376 | "\n", 377 | "The next steps are as follows:\n", 378 | "\n", 379 | "1. Split the cleaned data set into train, validation, and test data sets.\n", 380 | "2. For each set, build a sparse matrix with one-hot encoded feature vectors (customers + products) and a label vector with star ratings.\n", 381 | "3. Convert the sets to protobuf encoded files.\n", 382 | "4. Copy these files to an Amazon S3 bucket.\n", 383 | "5. Configure and run a Factorization Machines training job on Amazon SageMaker.\n", 384 | "6. Deploy the corresponding model to an endpoint.\n", 385 | "7. Run predictions on the test data set and validate the results.\n", 386 | "\n", 387 | "#### Split into Training, Validation, and Test Data Sets\n", 388 | "\n", 389 | "Let's start by [splitting](https://docs.scipy.org/doc/numpy/reference/generated/numpy.split.html) into training, validation, and test sets. This will allow us to estimate the model's accuracy on videos our customers rated but that weren't included in our training. We will use the validation data set specifically for tuning model hyper-parameters." 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "train_df, validate_df, test_df = np.split(\n", 399 | " product_df.sample(frac=1), \n", 400 | " [int(.6*len(product_df)), int(.8*len(product_df))])" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "print(\"# of rows in the training data set = {:10d}\".format(train_df.shape[0]))\n", 410 | "print(\"# of rows in the validation data set = {:10d}\".format(validate_df.shape[0]))\n", 411 | "print(\"# of rows in the test data set = {:10d}\".format(test_df.shape[0]))" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "train_df.head()" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "Let's get the feature dimension by adding the total number of (unique) customers and products." 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "# get feature dimension\n", 437 | "all_df = pd.concat([train_df, validate_df, test_df])\n", 438 | "nb_customer = np.unique(all_df['customer'].values).shape[0]\n", 439 | "nb_products = np.unique(all_df['product'].values).shape[0]\n", 440 | "feature_dim = nb_customer + nb_products\n", 441 | "print(\"# of customers = {:10d}\".format(nb_customer))\n", 442 | "print(\"# of products = {:10d}\".format(nb_products))\n", 443 | "print(\"# of features = {:10d}\".format(feature_dim))" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "#### Building Sparse One-Hot Encoded Matrix\n", 451 | "\n", 452 | "Our training matrix is now even sparser: Of all 
183,833,321,511 values (1028559 rows * 178729 columns), only 2,057,118 are non-zero (1,028,559*2). In other words, the matrix is 99.99% sparse. Storing this as a dense matrix would be a massive waste of both storage and computing power. To avoid this, use a scipy.lil_matrix sparse matrix for features and a numpy array for ratings.\n", 453 | "\n", 454 | "Let's define a function that takes the data set and returns a sparse feature matrix and numpy array with ratings." 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "def convert_sparse_matrix(df, nb_rows, nb_customer, nb_products):\n", 464 | " # dataframe to array\n", 465 | " df_val = df.values\n", 466 | "\n", 467 | " # determine feature size\n", 468 | " nb_cols = nb_customer + nb_products\n", 469 | " print(\"# of rows = {}\".format(str(nb_rows)))\n", 470 | " print(\"# of cols = {}\".format(str(nb_cols)))\n", 471 | "\n", 472 | " # extract customers and ratings\n", 473 | " df_X = df_val[:, 0:2]\n", 474 | " # Features are one-hot encoded in a sparse matrix\n", 475 | " X = lil_matrix((nb_rows, nb_cols)).astype('float32')\n", 476 | " df_X[:, 1] = nb_customer + df_X[:, 1]\n", 477 | " coords = df_X[:, 0:2]\n", 478 | " X[np.arange(nb_rows), coords[:, 0]] = 1\n", 479 | " X[np.arange(nb_rows), coords[:, 1]] = 1\n", 480 | "\n", 481 | " # create label with ratings\n", 482 | " Y = df_val[:, 2].astype('float32')\n", 483 | "\n", 484 | " # validate size and shape\n", 485 | " print(X.shape)\n", 486 | " print(Y.shape)\n", 487 | " assert X.shape == (nb_rows, nb_cols)\n", 488 | " assert Y.shape == (nb_rows, )\n", 489 | "\n", 490 | " return X, Y" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "print(\"Convert training data set to one-hot encoded sparse matrix\")\n", 500 | "train_X, train_Y = convert_sparse_matrix(train_df, train_df.shape[0], nb_customer, nb_products)\n", 501 | "print(\"Convert validation data set to one-hot encoded sparse matrix\")\n", 502 | "validate_X, validate_Y = convert_sparse_matrix(validate_df, validate_df.shape[0], nb_customer, nb_products)\n", 503 | "print(\"Convert test data set to one-hot encoded sparse matrix\")\n", 504 | "test_X, test_Y = convert_sparse_matrix(test_df, test_df.shape[0], nb_customer, nb_products)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "#### Convert to Protobuf format and Upload to S3" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "We will use Sagemaker's utility function [`write_spmatrix_to_sparse_tensor`](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/amazon/common.py) to convert scipy sparse matrix to protobuf format." 
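To make the one-hot layout and the protobuf conversion concrete before running the full-size cells below, here is a minimal, self-contained sketch. The toy sizes (3 customers, 2 products) and the two sample ratings are assumptions for illustration only; the sketch mirrors the same `lil_matrix` and `write_spmatrix_to_sparse_tensor` pattern used in this notebook.

```python
# Minimal sketch (illustrative only, not part of the pipeline): two (customer, product, rating)
# samples become one-hot rows in a scipy sparse matrix, then a RecordIO-protobuf buffer.
import io

import numpy as np
from scipy.sparse import lil_matrix
import sagemaker.amazon.common as smac

nb_customer, nb_products = 3, 2          # assumed toy sizes
samples = [(0, 1, 5.0),                  # customer 0 rated product 1 with 5 stars
           (2, 0, 3.0)]                  # customer 2 rated product 0 with 3 stars

X = lil_matrix((len(samples), nb_customer + nb_products), dtype='float32')
Y = np.zeros(len(samples), dtype='float32')
for row, (customer, product, rating) in enumerate(samples):
    X[row, customer] = 1                 # one-hot slot for the customer
    X[row, nb_customer + product] = 1    # one-hot slot for the product, offset by nb_customer
    Y[row] = rating

buf = io.BytesIO()
smac.write_spmatrix_to_sparse_tensor(buf, X, Y)   # same utility the notebook uses
buf.seek(0)

print(X.toarray())                       # [[1. 0. 0. 0. 1.]
                                         #  [0. 0. 1. 1. 0.]]
print(len(buf.getvalue()), "bytes of RecordIO-protobuf")
```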
519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "def save_as_protobuf(X, Y, bucket, key):\n", 528 | " \"\"\"Converts features and predictions matrices to recordio protobuf and\n", 529 | " writes to S3\n", 530 | "\n", 531 | " Args:\n", 532 | " X:\n", 533 | " 2D numpy matrix with features\n", 534 | " Y:\n", 535 | " 1D numpy matrix with predictions\n", 536 | " bucket:\n", 537 | " s3 bucket where recordio protobuf file will be staged\n", 538 | " prefix:\n", 539 | " s3 url prefix to stage prepared data to use for training the model\n", 540 | " key:\n", 541 | " protobuf file name to be staged\n", 542 | "\n", 543 | " Returns:\n", 544 | " s3 url with key to the protobuf data\n", 545 | " \"\"\"\n", 546 | " buf = io.BytesIO()\n", 547 | " smac.write_spmatrix_to_sparse_tensor(buf, X, Y)\n", 548 | " buf.seek(0)\n", 549 | " obj = '{}'.format(key)\n", 550 | " boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)\n", 551 | " return 's3://{}/{}'.format(bucket, obj)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "s3_train_path = save_as_protobuf(train_X, train_Y, bucket, 'prepare/train/train.protobuf')\n", 561 | "print(\"Training data set in protobuf format uploaded at {}\".format(s3_train_path))\n", 562 | "s3_val_path = save_as_protobuf(validate_X, validate_Y, bucket, 'prepare/validate/validate.protobuf')\n", 563 | "print(\"Validation data set in protobuf format uploaded at {}\".format(s3_val_path))" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "We will chunk the test data to avoid the payload size issues when performing batch predictions." 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "def chunk(x, batch_size):\n", 580 | " \"\"\"split array into chunks of batch_size\n", 581 | " \"\"\"\n", 582 | " chunk_range = range(0, x.shape[0], batch_size)\n", 583 | " chunks = [x[p: p + batch_size] for p in chunk_range]\n", 584 | " return chunks" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "test_x_chunks = chunk(test_X, 10000)\n", 594 | "test_y_chunks = chunk(test_Y, 10000)\n", 595 | "N = len(test_x_chunks)\n", 596 | "for i in range(N):\n", 597 | " test_data = save_as_protobuf(\n", 598 | " test_x_chunks[i],\n", 599 | " test_y_chunks[i],\n", 600 | " bucket,\n", 601 | " \"prepare/test/test_\" + str(i) + \".protobuf\")\n", 602 | " print(test_data)" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "---" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | "source": [ 616 | "## Model Training\n", 617 | "\n", 618 | "Once we have the data preprocessed and available in the correct format for training, the next step is to actually train the model using the data. We'll use the Amazon SageMaker Python SDK to kick off training and monitor status until it is completed. In this example that takes between 4-7 minutes for 3-10 epochs. 
\n", 619 | "\n", 620 | "First, let's get the Sagemaker Factorization Machine container" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "from sagemaker.amazon.amazon_estimator import get_image_uri\n", 630 | "container = get_image_uri(boto3.Session().region_name, 'factorization-machines')" 631 | ] 632 | }, 633 | { 634 | "cell_type": "markdown", 635 | "metadata": {}, 636 | "source": [ 637 | "Next kick off the base estimator, making sure to pass in the necessary hyperparameters. Notice:\n", 638 | "\n", 639 | "- `feature_dim` is set to 178729, which is the number of customers + products in the training data set.\n", 640 | "- `predictor_type` is set to 'regressor' since we are trying to predict the rating\n", 641 | "- `mini_batch_size` is set to 200. This value can be tuned for relatively minor improvements in fit and speed, but selecting a reasonable value relative to the dataset is appropriate in most cases.\n", 642 | "- `num_factors` is set to 64. Factorization machines find a lower dimensional representation of the interactions for all features. Making this value smaller provides a more parsimonious model, closer to a linear model, but may sacrifice information about interactions. Making it larger provides a higher-dimensional representation of feature interactions, but adds computational complexity and can lead to overfitting. In a practical application, time should be invested to tune this parameter to the appropriate value." 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "%time\n", 652 | "\n", 653 | "output_location = 's3://{}/train/'.format(bucket)\n", 654 | "s3_train_path = 's3://{}/prepare/train/train.protobuf'.format(bucket)\n", 655 | "s3_val_path = 's3://{}/prepare/validate/validate.protobuf'.format(bucket)\n", 656 | "\n", 657 | "fm = sagemaker.estimator.Estimator(container,\n", 658 | " role, \n", 659 | " train_instance_count=1, \n", 660 | " train_instance_type='ml.c5.4xlarge',\n", 661 | " output_path=output_location,\n", 662 | " sagemaker_session=sess)\n", 663 | "\n", 664 | "fm.set_hyperparameters(feature_dim=feature_dim,\n", 665 | " predictor_type='regressor',\n", 666 | " mini_batch_size=200,\n", 667 | " num_factors=512,\n", 668 | " bias_lr=0.02,\n", 669 | " epochs=10)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "fm.fit({'train': s3_train_path,'test': s3_val_path}, wait=False)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": {}, 684 | "source": [ 685 | "Amazon SageMaker built-in algorithms automatically compute and emit a variety of model training, evaluation, and validation metrics that can be captured from Cloudwatch using Sagemaker SDK. 
Since we are using the FM built-in algorithm with the predictor type set to `regressor`, we can capture the model's RMSE (root-mean-square error), which measures the differences between the predicted values and the actual values.\n", 686 | "\n", 687 | "Let's capture the RMSE of the model." 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": null, 693 | "metadata": {}, 694 | "outputs": [], 695 | "source": [ 696 | "training_job_name = fm._current_job_name\n", 697 | "metric_name = 'train:rmse:epoch'" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [ 706 | "# run this cell to check the current status of the training job\n", 707 | "fm_training_job_result = smclient.describe_training_job(TrainingJobName=training_job_name)\n", 708 | "\n", 709 | "status = fm_training_job_result['TrainingJobStatus']\n", 710 | "if status != 'Completed':\n", 711 | " print('Reminder: the training job has not been completed.')\n", 712 | "else:\n", 713 | " print('The training job is completed')" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "# plug in the training job name and the metrics to be captured\n", 723 | "metrics_dataframe = TrainingJobAnalytics(training_job_name=training_job_name,metric_names=[metric_name]).dataframe()\n", 724 | "metrics_dataframe" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [ 733 | "ax = metrics_dataframe.plot(kind='line', figsize=(12,5), x='timestamp', y='value', style='b.', legend=False)  # use a distinct name so we don't shadow matplotlib's plt\n", 734 | "ax.set_ylabel(metric_name);" 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "As the number of epochs increases, the RMSE goes down, which is a good sign that the predicted values are getting closer to the actual ratings. We can increase the number of epochs or change the hyperparameters to tweak the model further. Let's deploy this model and make predictions to see how close they are. Then we can run a hyper-parameter tuning job to determine the best model." 
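As a pointer for that last step, the snippet below is a hedged sketch of how such a tuning job could be launched with the `HyperparameterTuner` and `ContinuousParameter` classes already imported at the top of this notebook. The tuned ranges, job counts, and job name are illustrative assumptions, not the notebook's actual tuning configuration.

```python
# Hedged sketch (assumed ranges and job sizes): tune the estimator `fm` defined above,
# minimizing the RMSE that the FM algorithm reports on the 'test' channel.
tuner = HyperparameterTuner(
    estimator=fm,
    objective_metric_name='test:rmse',
    objective_type='Minimize',
    hyperparameter_ranges={
        'factors_lr': ContinuousParameter(0.0001, 0.2),
        'factors_init_sigma': ContinuousParameter(0.0001, 1.0),
    },
    max_jobs=20,
    max_parallel_jobs=2,
    base_tuning_job_name='hpo-recommender',
)

# Reuse the same channels that were passed to fm.fit()
tuner.fit({'train': s3_train_path, 'test': s3_val_path}, wait=False)

# After the job completes, the per-trial results can be pulled into a dataframe:
# HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.job_name).dataframe()
```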
742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "metadata": {}, 747 | "source": [ 748 | "#### Utility Functions" 749 | ] 750 | }, 751 | { 752 | "cell_type": "markdown", 753 | "metadata": {}, 754 | "source": [ 755 | "We will define some common utility functions here that will be used during inference and evaluating results" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": null, 761 | "metadata": {}, 762 | "outputs": [], 763 | "source": [ 764 | "def convert_to_protobuf(X, Y=None):\n", 765 | " buf = io.BytesIO()\n", 766 | " smac.write_spmatrix_to_sparse_tensor(buf, X, Y)\n", 767 | " buf.seek(0)\n", 768 | " return buf" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": null, 774 | "metadata": {}, 775 | "outputs": [], 776 | "source": [ 777 | "def convert_sparse_matrix_X(df, nb_rows, nb_customer, nb_products):\n", 778 | " # dataframe to array\n", 779 | " df_val = df.values\n", 780 | "\n", 781 | " # determine feature size\n", 782 | " nb_cols = nb_customer + nb_products\n", 783 | " \n", 784 | " # extract customers and ratings\n", 785 | " df_X = df_val[:,0:2]\n", 786 | " # Features are one-hot encoded in a sparse matrix\n", 787 | " X = lil_matrix((nb_rows, nb_cols)).astype('float32')\n", 788 | " df_X[:,1] = nb_customer + df_X[:,1]\n", 789 | " coords = df_X[:,0:2]\n", 790 | " X[np.arange(nb_rows), coords[:, 0]] = 1\n", 791 | " X[np.arange(nb_rows), coords[:, 1]] = 1\n", 792 | "\n", 793 | " # validate size and shape\n", 794 | " assert X.shape == (nb_rows, nb_cols)\n", 795 | " \n", 796 | " return X" 797 | ] 798 | }, 799 | { 800 | "cell_type": "markdown", 801 | "metadata": {}, 802 | "source": [ 803 | "---" 804 | ] 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | "metadata": {}, 809 | "source": [ 810 | "## Inference " 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": {}, 816 | "source": [ 817 | "### Real-Time Inference " 818 | ] 819 | }, 820 | { 821 | "cell_type": "markdown", 822 | "metadata": {}, 823 | "source": [ 824 | "Since the model is trained, all it takes to deploy the model is a Sagemaker API call `deploy()` that creates the model package, sets up endpoint configuration and finally creates the endpoint." 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": null, 830 | "metadata": {}, 831 | "outputs": [], 832 | "source": [ 833 | "fm_predictor = fm.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1)" 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "metadata": {}, 839 | "source": [ 840 | "Predictions could be done by sending HTTP POST requests from a separate web service, but to keep things easy, we'll just use the `.predict()` method from the SageMaker Python SDK. The API expects JSON or RecordIO format for request and JSON for response data." 
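For completeness, the snippet below is a small sketch (an illustration, not part of the notebook's flow) of what that separate-service path could look like using the low-level `sagemaker-runtime` client; it reuses the `convert_to_protobuf` helper defined above and parses the same JSON response shape.

```python
# Hedged sketch: invoke the deployed endpoint directly via boto3 instead of the SDK predictor.
import json

import boto3

runtime = boto3.client('sagemaker-runtime')

payload = convert_to_protobuf(test_X[1000:1010]).getvalue()   # helper defined above
response = runtime.invoke_endpoint(
    EndpointName=fm_predictor.endpoint,                       # endpoint name created by deploy()
    ContentType='application/x-recordio-protobuf',
    Body=payload,
)
scores = [round(p['score'], 2) for p in json.loads(response['Body'].read())['predictions']]
print(scores)
```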
841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": null, 846 | "metadata": {}, 847 | "outputs": [], 848 | "source": [ 849 | "fm_predictor.content_type = 'application/x-recordio-protobuf'" 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "Let's test the model with sample ratings from test data set using `predict()` API call" 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": null, 862 | "metadata": {}, 863 | "outputs": [], 864 | "source": [ 865 | "test_pb = convert_to_protobuf(test_X[1000:1010]).getvalue()" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": null, 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [ 874 | "response = fm_predictor.predict(test_pb)\n", 875 | "response" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": null, 881 | "metadata": {}, 882 | "outputs": [], 883 | "source": [ 884 | "predicted = [round(r['score'], 2) for r in json.loads(response)['predictions']]\n", 885 | "predicted" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": null, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "results_df = pd.DataFrame(zip(test_Y[1000:1010], predicted), columns = [\"actual_rating\", \"predicted_rating\"])\n", 895 | "results_df" 896 | ] 897 | }, 898 | { 899 | "cell_type": "markdown", 900 | "metadata": {}, 901 | "source": [ 902 | "---" 903 | ] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "### Batch Inference" 910 | ] 911 | }, 912 | { 913 | "cell_type": "markdown", 914 | "metadata": {}, 915 | "source": [ 916 | "Here we will perform batch inference on the test data set prepared earlier (chunking into multiple protobuf files). 
To run batch transform, create a model package for the transform endpoint " 917 | ] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "metadata": {}, 922 | "source": [ 923 | "- Create the model from the training estimator" 924 | ] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "execution_count": null, 929 | "metadata": {}, 930 | "outputs": [], 931 | "source": [ 932 | "fm_model = fm.create_model()" 933 | ] 934 | }, 935 | { 936 | "cell_type": "markdown", 937 | "metadata": {}, 938 | "source": [ 939 | "- Perform batch inference on the test data set and save results to S3" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": null, 945 | "metadata": {}, 946 | "outputs": [], 947 | "source": [ 948 | "fm_transformer = fm_model.transformer(\n", 949 | " instance_type='ml.c4.xlarge', \n", 950 | " instance_count=1, \n", 951 | " strategy=\"MultiRecord\", \n", 952 | " output_path=\"s3://{}/transform/\".format(bucket)\n", 953 | ")" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": {}, 960 | "outputs": [], 961 | "source": [ 962 | "fm_transformer.transform(\n", 963 | " data=\"s3://{}/prepare/test/\".format(bucket), \n", 964 | " data_type='S3Prefix', \n", 965 | " content_type=\"application/x-recordio-protobuf\")" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": null, 971 | "metadata": {}, 972 | "outputs": [], 973 | "source": [ 974 | "print('Waiting for transform job: ' + fm_transformer.latest_transform_job.job_name)\n", 975 | "fm_transformer.wait()" 976 | ] 977 | }, 978 | { 979 | "cell_type": "markdown", 980 | "metadata": {}, 981 | "source": [ 982 | "- Inference results will be stored in a separate file for each test file chunk. Let's download the results from S3 and merge them" 983 | ] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": null, 988 | "metadata": {}, 989 | "outputs": [], 990 | "source": [ 991 | "def download_from_s3(bucket, key):\n", 992 | " s3 = boto3.resource('s3')\n", 993 | " obj = s3.Object( bucket, key)\n", 994 | " content = obj.get()['Body'].read()\n", 995 | " return content" 996 | ] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "execution_count": null, 1001 | "metadata": {}, 1002 | "outputs": [], 1003 | "source": [ 1004 | "test_preds = []\n", 1005 | "for i in range(N):\n", 1006 | " key = 'transform/test_' + str(i) + '.protobuf.out'\n", 1007 | " response = download_from_s3(bucket, key)\n", 1008 | " result = [json.loads(row)[\"score\"] for row in response.split(\"\\n\") if len(row) > 0]\n", 1009 | " test_preds.extend(result)" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": null, 1015 | "metadata": {}, 1016 | "outputs": [], 1017 | "source": [ 1018 | "test_preds = np.array(test_preds)" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": null, 1024 | "metadata": {}, 1025 | "outputs": [], 1026 | "source": [ 1027 | "test_preds.shape" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "markdown", 1032 | "metadata": {}, 1033 | "source": [ 1034 | "---" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "markdown", 1039 | "metadata": {}, 1040 | "source": [ 1041 | "## Evaluate Model Performance\n", 1042 | "\n", 1043 | "Let's start by calculating a naive baseline to approximate how well our model is doing. 
The simplest estimate would be to assume every user-item rating is just the average rating over all ratings.\n", 1044 | "\n", 1045 | "*Note: we could do better by using each individual video's average; however, in this case it doesn't really matter, as the same conclusions would hold.*" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": null, 1051 | "metadata": {}, 1052 | "outputs": [], 1053 | "source": [ 1054 | "print('Naive MSE:', np.mean((test_df['star_rating'] - np.mean(train_df['star_rating'])) ** 2))" 1055 | ] 1056 | }, 1057 | { 1058 | "cell_type": "markdown", 1059 | "metadata": {}, 1060 | "source": [ 1061 | "Now, let's calculate the mean squared error of the batch predictions on our test dataset." 1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "code", 1066 | "execution_count": null, 1067 | "metadata": {}, 1068 | "outputs": [], 1069 | "source": [ 1070 | "print('MSE:', np.mean((test_Y - test_preds) ** 2))" 1071 | ] 1072 | }, 1073 | { 1074 | "cell_type": "markdown", 1075 | "metadata": {}, 1076 | "source": [ 1077 | "We can see that our factorization machine model produces substantially better results than the naive baseline (~1.44 vs. ~1.13 mean squared error).\n", 1078 | "\n", 1079 | "For recommender systems, subjective accuracy also matters. Let's get some recommendations for a random user to see if they make intuitive sense." 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "code", 1084 | "execution_count": null, 1085 | "metadata": {}, 1086 | "outputs": [], 1087 | "source": [ 1088 | "df_customer_6 = reduced_df[reduced_df['customer'] == 6].sort_values(['star_rating', 'product'], ascending=[False, True])\n", 1089 | "pd.concat((df_customer_6.head(10), df_customer_6.tail(10)))" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "markdown", 1094 | "metadata": {}, 1095 | "source": [ 1096 | "As we can see, user #6 seems to like sprawling dramatic television series and sci-fi, but they dislike silly comedies.\n", 1097 | "\n", 1098 | "Now we'll loop through and predict user #6's ratings for every video in the reduced catalog, to see which ones we'd recommend and which ones we wouldn't."
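, "\n", "As a reminder of the feature layout produced by `convert_sparse_matrix_X` above, each (customer, product) pair becomes one row with `nb_customer + nb_products` columns and exactly two non-zero entries; the tiny sketch below uses made-up sizes purely for illustration:\n", "\n", "```python\n", "# illustrative only: 4 customers and 3 products -> 7 one-hot feature columns\n", "# a rating by customer 2 for product 1 maps to hot columns [2, 4 + 1] = [2, 5]:\n", "# [0, 0, 1, 0,  0, 1, 0]\n", "#  customers    products\n", "example_customer, example_product = 2, 1\n", "hot_columns = [example_customer, 4 + example_product]\n", "```"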
1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "code", 1103 | "execution_count": null, 1104 | "metadata": {}, 1105 | "outputs": [], 1106 | "source": [ 1107 | "def create_payload(cust_id, nb_customer, nb_products, product_index):\n", 1108 | " # prepare a payload covering every product for the given customer\n", 1109 | " c = [cust_id] * nb_products\n", 1110 | " p = product_index['product'].values\n", 1111 | " x = pd.DataFrame(zip(c,p))\n", 1112 | " p_x = convert_sparse_matrix_X(x, x.shape[0], nb_customer, nb_products)\n", 1113 | " x_pb = convert_to_protobuf(p_x)\n", 1114 | " return x_pb" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "code", 1119 | "execution_count": null, 1120 | "metadata": {}, 1121 | "outputs": [], 1122 | "source": [ 1123 | "x_pb = create_payload(6, nb_customer, nb_products, product_index)" 1124 | ] 1125 | }, 1126 | { 1127 | "cell_type": "code", 1128 | "execution_count": null, 1129 | "metadata": {}, 1130 | "outputs": [], 1131 | "source": [ 1132 | "# make predictions using the endpoint created in the Real-Time Inference section\n", 1133 | "response = fm_predictor.predict(x_pb)\n", 1134 | "predictions = [round(r['score'], 2) for r in json.loads(response)['predictions']]" 1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": null, 1140 | "metadata": {}, 1141 | "outputs": [], 1142 | "source": [ 1143 | "predictions_df = pd.DataFrame({'product': product_index['product'],\n", 1144 | " 'prediction': predictions})" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | "execution_count": null, 1150 | "metadata": {}, 1151 | "outputs": [], 1152 | "source": [ 1153 | "df_results_cust_6 = df_customer_6.merge(predictions_df, on=['product'])[['customer', 'customer_id', 'product', 'product_id', 'product_title', 'star_rating', 'prediction']]\n", 1154 | "df_results_cust_6.sort_values(['prediction', 'product'], ascending=[False, True])" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "markdown", 1159 | "metadata": {}, 1160 | "source": [ 1161 | "Indeed, our predicted highly rated shows include some well-reviewed TV dramas and some sci-fi. Meanwhile, our bottom-rated shows include goofball comedies.\n", 1162 | "\n", 1163 | "*Note: because of random initialization of the model parameters, results on subsequent runs may differ slightly.*\n", 1164 | "\n", 1165 | "Let's also confirm that the predictions for a different user (user #7) are not almost perfectly correlated with those for user #6, i.e. that the model is personalizing rather than scoring every customer the same way." 1166 | ] 1167 | }, 1168 | { 1169 | "cell_type": "code", 1170 | "execution_count": null, 1171 | "metadata": {}, 1172 | "outputs": [], 1173 | "source": [ 1174 | "x_pb = create_payload(7, nb_customer, nb_products, product_index)\n", 1175 | "response = fm_predictor.predict(x_pb)\n", 1176 | "predictions_user7 = [round(r['score'], 2) for r in json.loads(response)['predictions']]" 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "execution_count": null, 1182 | "metadata": {}, 1183 | "outputs": [], 1184 | "source": [ 1185 | "plt.scatter(predictions_df['prediction'], np.array(predictions_user7))\n", 1186 | "plt.show()" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "markdown", 1191 | "metadata": {}, 1192 | "source": [ 1193 | "---" 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "markdown", 1198 | "metadata": {}, 1199 | "source": [ 1200 | "## Model Tuning\n", 1201 | "\n", 1202 | "So far, we have developed a factorization machine model to predict customer ratings, but the model could be improved further by various techniques. In this section, let's see if tuning the hyperparameters of the Factorization Machines algorithm makes the model any better."
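, "\n", "As a reference point to beat, here is the RMSE of the default (untuned) model on the test set, computed from the batch-transform predictions gathered earlier (a small sketch that assumes `test_Y` and `test_preds` from the evaluation section are still in memory):\n", "\n", "```python\n", "import numpy as np\n", "\n", "# RMSE of the default model, for comparison with the tuning objective test:rmse\n", "print('Default-model RMSE:', np.sqrt(np.mean((test_Y - test_preds) ** 2)))\n", "```"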
1203 | ] 1204 | }, 1205 | { 1206 | "cell_type": "code", 1207 | "execution_count": null, 1208 | "metadata": {}, 1209 | "outputs": [], 1210 | "source": [ 1211 | "output_location = 's3://{}/train/'.format(bucket)\n", 1212 | "s3_train_path = 's3://{}/prepare/train/train.protobuf'.format(bucket)\n", 1213 | "s3_val_path = 's3://{}/prepare/validate/validate.protobuf'.format(bucket)" 1214 | ] 1215 | }, 1216 | { 1217 | "cell_type": "markdown", 1218 | "metadata": {}, 1219 | "source": [ 1220 | "- Let's create an estimator with the Factorization Machines container, similar to the one we defined when training the model. Also, set the initial hyperparameters that we know worked before." 1221 | ] 1222 | }, 1223 | { 1224 | "cell_type": "code", 1225 | "execution_count": null, 1226 | "metadata": {}, 1227 | "outputs": [], 1228 | "source": [ 1229 | "fm_estimator = sagemaker.estimator.Estimator(container,\n", 1230 | " role, \n", 1231 | " train_instance_count=1, \n", 1232 | " train_instance_type='ml.c5.4xlarge',\n", 1233 | " output_path=output_location,\n", 1234 | " sagemaker_session=sess)" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": null, 1240 | "metadata": {}, 1241 | "outputs": [], 1242 | "source": [ 1243 | "fm_estimator.set_hyperparameters(\n", 1244 | " feature_dim=feature_dim,\n", 1245 | " predictor_type='regressor',\n", 1246 | " mini_batch_size=200,\n", 1247 | " num_factors=512,\n", 1248 | " bias_lr=0.02,\n", 1249 | " epochs=20)" 1250 | ] 1251 | }, 1252 | { 1253 | "cell_type": "markdown", 1254 | "metadata": {}, 1255 | "source": [ 1256 | "- Find the best hyperparameters with SageMaker's Automatic Model Tuning. The following hyperparameters will be tuned:\n", 1257 | " - ***factors_lr:*** The learning rate for factorization terms.\n", 1258 | " - ***factors_init_sigma:*** The standard deviation for initialization of factorization terms. 
Takes effect if factors_init_method is set to normal.\n", 1259 | " \n", 1260 | "\n", 1261 | "- Define the hyperparameter tuning ranges to be searched and set the objective metric" 1262 | ] 1263 | }, 1264 | { 1265 | "cell_type": "code", 1266 | "execution_count": null, 1267 | "metadata": {}, 1268 | "outputs": [], 1269 | "source": [ 1270 | "hyperparameter_ranges= {\n", 1271 | " \"factors_lr\": ContinuousParameter(0.0001, 0.2),\n", 1272 | " \"factors_init_sigma\": ContinuousParameter(0.0001, 1)\n", 1273 | "}" 1274 | ] 1275 | }, 1276 | { 1277 | "cell_type": "markdown", 1278 | "metadata": {}, 1279 | "source": [ 1280 | "- Now that we have our ranges defined we want to define our success metric" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "code", 1285 | "execution_count": null, 1286 | "metadata": {}, 1287 | "outputs": [], 1288 | "source": [ 1289 | "objective_metric_name = \"test:rmse\"\n", 1290 | "objective_type = \"Minimize\"" 1291 | ] 1292 | }, 1293 | { 1294 | "cell_type": "markdown", 1295 | "metadata": {}, 1296 | "source": [ 1297 | "- Start hyperparameter tuning job with the ranges defined" 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "code", 1302 | "execution_count": null, 1303 | "metadata": {}, 1304 | "outputs": [], 1305 | "source": [ 1306 | "fm_tuner = HyperparameterTuner(\n", 1307 | " estimator=fm_estimator,\n", 1308 | " objective_metric_name=objective_metric_name, \n", 1309 | " hyperparameter_ranges=hyperparameter_ranges,\n", 1310 | " objective_type=objective_type,\n", 1311 | " max_jobs=10,\n", 1312 | " max_parallel_jobs=2\n", 1313 | ")" 1314 | ] 1315 | }, 1316 | { 1317 | "cell_type": "code", 1318 | "execution_count": null, 1319 | "metadata": {}, 1320 | "outputs": [], 1321 | "source": [ 1322 | "timestamp_prefix = time.strftime(\"%Y%m%d-%H%M%S\", time.gmtime())\n", 1323 | "fm_tuner_job_name = 'hpo-fm-' + timestamp_prefix" 1324 | ] 1325 | }, 1326 | { 1327 | "cell_type": "code", 1328 | "execution_count": null, 1329 | "metadata": {}, 1330 | "outputs": [], 1331 | "source": [ 1332 | "fm_tuner.fit({'train': s3_train_path, 'test': s3_val_path}, job_name=fm_tuner_job_name, wait=False)" 1333 | ] 1334 | }, 1335 | { 1336 | "cell_type": "markdown", 1337 | "metadata": {}, 1338 | "source": [ 1339 | "- Track hyperparameter tuning job progress" 1340 | ] 1341 | }, 1342 | { 1343 | "cell_type": "code", 1344 | "execution_count": null, 1345 | "metadata": {}, 1346 | "outputs": [], 1347 | "source": [ 1348 | "# run this cell to check current status of hyperparameter tuning job\n", 1349 | "tuning_job_result = smclient.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=fm_tuner_job_name)\n", 1350 | "\n", 1351 | "status = tuning_job_result['HyperParameterTuningJobStatus']\n", 1352 | "if status != 'Completed':\n", 1353 | " print('Reminder: the tuning job has not been completed.')\n", 1354 | " \n", 1355 | "job_count = tuning_job_result['TrainingJobStatusCounters']['Completed']\n", 1356 | "print(\"%d training jobs have completed\" % job_count)\n", 1357 | " \n", 1358 | "is_minimize = (tuning_job_result['HyperParameterTuningJobConfig']['HyperParameterTuningJobObjective']['Type'] != 'Maximize')\n", 1359 | "objective_name = tuning_job_result['HyperParameterTuningJobConfig']['HyperParameterTuningJobObjective']['MetricName']" 1360 | ] 1361 | }, 1362 | { 1363 | "cell_type": "markdown", 1364 | "metadata": {}, 1365 | "source": [ 1366 | "* Analyze Hyper-Parameter Tuning Job Results" 1367 | ] 1368 | }, 1369 | { 1370 | "cell_type": "code", 1371 | "execution_count": null, 1372 | "metadata": {}, 1373 | "outputs": 
[], 1374 | "source": [ 1375 | "# plug in the tuning job name and retrieve the per-training-job results\n", 1376 | "fm_tuner_analytics = HyperparameterTuningJobAnalytics(hyperparameter_tuning_job_name=fm_tuner_job_name)\n", 1377 | "df_fm_tuner_metrics = fm_tuner_analytics.dataframe()\n", 1378 | "df_fm_tuner_metrics" 1379 | ] 1380 | }, 1381 | { 1382 | "cell_type": "code", 1383 | "execution_count": null, 1384 | "metadata": {}, 1385 | "outputs": [], 1386 | "source": [ 1387 | "# plot the final objective value of each training job over time\n", 1388 | "ax = df_fm_tuner_metrics.plot(kind='line', figsize=(12,5), x='TrainingStartTime', \n", 1389 | " y='FinalObjectiveValue', \n", 1390 | " style='b.', legend=False)\n", 1391 | "ax.set_ylabel(objective_metric_name);" 1392 | ] 1393 | }, 1394 | { 1395 | "cell_type": "markdown", 1396 | "metadata": {}, 1397 | "source": [ 1398 | "- Best Factorization Machine Model after Hyper-Parameter Optimization" 1399 | ] 1400 | }, 1401 | { 1402 | "cell_type": "code", 1403 | "execution_count": null, 1404 | "metadata": {}, 1405 | "outputs": [], 1406 | "source": [ 1407 | "print(\"fm_tuner_job_name: \" + fm_tuner_job_name)\n", 1408 | "fm_tuner = HyperparameterTuner.attach(fm_tuner_job_name)\n", 1409 | "\n", 1410 | "fm_tuner_analytics = HyperparameterTuningJobAnalytics(hyperparameter_tuning_job_name=fm_tuner_job_name)\n", 1411 | "df_fm_tuner_metrics = fm_tuner_analytics.dataframe()\n", 1412 | "\n", 1413 | "fm_best_model_name = fm_tuner.best_training_job()\n", 1414 | "print(\"fm_best_model_name: \" + fm_best_model_name)\n", 1415 | "\n", 1416 | "fm_model_info = smclient.describe_training_job(TrainingJobName=fm_best_model_name)" 1417 | ] 1418 | }, 1419 | { 1420 | "cell_type": "code", 1421 | "execution_count": null, 1422 | "metadata": {}, 1423 | "outputs": [], 1424 | "source": [ 1425 | "df_fm_tuner_metrics[df_fm_tuner_metrics['TrainingJobName']==fm_best_model_name]" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "markdown", 1430 | "metadata": {}, 1431 | "source": [ 1432 | "- Let's evaluate the results with the best training job from the hyperparameter tuning job." 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "markdown", 1437 | "metadata": {}, 1438 | "source": [ 1439 | "We can deploy an endpoint from the best training job of the hyperparameter tuning job and test the predictions." 1440 | ] 1441 | }, 1442 | { 1443 | "cell_type": "code", 1444 | "execution_count": null, 1445 | "metadata": {}, 1446 | "outputs": [], 1447 | "source": [ 1448 | "fm = sagemaker.estimator.Estimator.attach(fm_best_model_name)" 1449 | ] 1450 | }, 1451 | { 1452 | "cell_type": "markdown", 1453 | "metadata": {}, 1454 | "source": [ 1455 | "We can re-run the cells in the Batch Inference and Evaluate Model Performance sections to evaluate the performance of the model with tuned hyperparameters.\n", 1456 | "\n", 1457 | "Assuming batch inference has been re-run, let's calculate the MSE on our test dataset and see if we do better than the training job with the default hyperparameters." 1458 | ] 1459 | }, 1460 | { 1461 | "cell_type": "code", 1462 | "execution_count": null, 1463 | "metadata": {}, 1464 | "outputs": [], 1465 | "source": [ 1466 | "print('MSE:', np.mean((test_Y - test_preds) ** 2))" 1467 | ] 1468 | }, 1469 | { 1470 | "cell_type": "markdown", 1471 | "metadata": {}, 1472 | "source": [ 1473 | "---\n", 1474 | "\n", 1475 | "## Wrap-up\n", 1476 | "\n", 1477 | "In this example, we developed a factorization machine model to predict customer ratings. This could serve as the foundation of a recommender system in a variety of use cases. However, there are many ways in which it could be improved. 
For example, we did very little with:\n", 1478 | "- hyperparameter tuning\n", 1479 | "- controlling for overfitting (early stopping, regularization, etc.)\n", 1480 | "- testing whether binarizing our target variable would improve results\n", 1481 | "- including other information sources (video genres, historical ratings, time of review)\n", 1482 | "- adjusting our threshold for user and item inclusion \n", 1483 | "\n", 1484 | "In addition to improving the model, we could improve the engineering by:\n", 1485 | "- Scaling training out to multiple instances (distributed training)\n", 1486 | "- Fine-tuning our data ingestion (e.g. mini-batch size and input format) to ensure we're fully utilizing the training instances\n", 1487 | "- Thinking about how pre-processing would need to change as datasets scale beyond a single machine\n", 1488 | "\n", 1489 | "Beyond that, recommenders are a very active area of research, and techniques from active learning, reinforcement learning, segmentation, ensembling, and more should be investigated to deliver well-rounded recommendations.\n", 1490 | "\n", 1491 | "### Clean-up (optional)\n", 1492 | "\n", 1493 | "Let's finish by deleting our endpoint to avoid stray hosting charges." 1494 | ] 1495 | }, 1496 | { 1497 | "cell_type": "code", 1498 | "execution_count": null, 1499 | "metadata": {}, 1500 | "outputs": [], 1501 | "source": [ 1502 | "endpoint_name_contains = ['-fm-', 'factorization-machines-']\n", 1503 | "for name in endpoint_name_contains:\n", 1504 | " endpoints = smclient.list_endpoints(NameContains=name, StatusEquals='InService')\n", 1505 | " endpoint_names = [r['EndpointName'] for r in endpoints['Endpoints']]\n", 1506 | " for endpoint_name in endpoint_names:\n", 1507 | " print(\"Deleting endpoint: \" + endpoint_name)\n", 1508 | " smclient.delete_endpoint(EndpointName=endpoint_name)" 1509 | ] 1510 | }, 1511 | { 1512 | "cell_type": "markdown", 1513 | "metadata": {}, 1514 | "source": [ 1515 | "---" 1516 | ] 1517 | } 1518 | ], 1519 | "metadata": { 1520 | "kernelspec": { 1521 | "display_name": "conda_python2", 1522 | "language": "python", 1523 | "name": "conda_python2" 1524 | }, 1525 | "language_info": { 1526 | "codemirror_mode": { 1527 | "name": "ipython", 1528 | "version": 2 1529 | }, 1530 | "file_extension": ".py", 1531 | "mimetype": "text/x-python", 1532 | "name": "python", 1533 | "nbconvert_exporter": "python", 1534 | "pygments_lexer": "ipython2", 1535 | "version": "2.7.15" 1536 | }, 1537 | "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." 1538 | }, 1539 | "nbformat": 4, 1540 | "nbformat_minor": 2 1541 | } 1542 | --------------------------------------------------------------------------------