├── src
│   ├── pipeline
│   │   ├── __init__.py
│   │   ├── preprocess.py
│   │   └── prepare.py
│   ├── config.py
│   └── dag_ml_pipeline_amazon_video_reviews.py
├── .gitignore
├── images
│   ├── LaunchStack.png
│   ├── cfn_output.png
│   ├── airflow-sagemaker-dag.png
│   ├── airflow-sagemaker-airflow-dag.png
│   ├── airflow-sagemaker-airflow_setup.png
│   └── airflow-sagemaker-ml-workflow.png
├── .github
│   └── PULL_REQUEST_TEMPLATE.md
├── CODE_OF_CONDUCT.md
├── LICENSE
├── CONTRIBUTING.md
├── README.md
├── cfn
│   ├── airflow-ec2-1.10.12-RDS.yaml
│   ├── airflow-ec2-1.10.12-Aurora-Serverless.yaml
│   ├── airflow-ec2-2.0.2-RDS.yaml
│   └── airflow-ec2-2.0.2-Aurora-Serverless.yaml
└── notebooks
    └── amazon-video-recommender_using_fm_algo.ipynb

/src/pipeline/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | src/pipeline/__pycache__/*
2 | *.pyc
3 | .vscode/
4 | notebooks/.ipynb_checkpoints/
--------------------------------------------------------------------------------
/images/LaunchStack.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/LaunchStack.png
--------------------------------------------------------------------------------
/images/cfn_output.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/cfn_output.png
--------------------------------------------------------------------------------
/images/airflow-sagemaker-dag.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/airflow-sagemaker-dag.png
--------------------------------------------------------------------------------
/images/airflow-sagemaker-airflow-dag.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/airflow-sagemaker-airflow-dag.png
--------------------------------------------------------------------------------
/images/airflow-sagemaker-airflow_setup.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/airflow-sagemaker-airflow_setup.png
--------------------------------------------------------------------------------
/images/airflow-sagemaker-ml-workflow.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/HEAD/images/airflow-sagemaker-ml-workflow.png
--------------------------------------------------------------------------------
/.github/PULL_REQUEST_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | *Issue #, if available:*
2 |
3 | *Description of changes:*
4 |
5 |
6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice.
7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | -------------------------------------------------------------------------------- /src/config.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from sagemaker.tuner import ContinuousParameter 3 | 4 | config = {} 5 | 6 | config["job_level"] = { 7 | "region_name": "", 8 | "run_hyperparameter_opt": "no" 9 | } 10 | 11 | config["preprocess_data"] = { 12 | "s3_in_url": "s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz", 13 | "s3_out_bucket": "", # replace 14 | "s3_out_prefix": "preprocess/", 15 | "delimiter": "\t" 16 | } 17 | 18 | config["prepare_data"] = { 19 | "s3_in_bucket": "", # replace 20 | "s3_in_prefix": "preprocess/", 21 | "s3_out_bucket": "", # replace 22 | "s3_out_prefix": "prepare/", 23 | "delimiter": "\t" 24 | } 25 | 26 | config["train_model"] = { 27 | "sagemaker_role": "AirflowSageMakerExecutionRole", 28 | "estimator_config": { 29 | "train_instance_count": 1, 30 | "train_instance_type": "ml.c5.4xlarge", 31 | "train_volume_size": 30, 32 | "train_max_run": 3600, 33 | "output_path": "s3:///train/", # replace 34 | "base_job_name": "trng-recommender", 35 | "hyperparameters": { 36 | "feature_dim": "178729", 37 | "epochs": "10", 38 | "mini_batch_size": "200", 39 | "num_factors": "64", 40 | "predictor_type": 'regressor' 41 | } 42 | }, 43 | "inputs": { 44 | "train": "s3:///prepare/train/train.protobuf", # replace 45 | } 46 | } 47 | 48 | config["tune_model"] = { 49 | "tuner_config": { 50 | "objective_metric_name": "test:rmse", 51 | "objective_type": "Minimize", 52 | "hyperparameter_ranges": { 53 | "factors_lr": ContinuousParameter(0.0001, 0.2), 54 | "factors_init_sigma": ContinuousParameter(0.0001, 1) 55 | }, 56 | "max_jobs": 20, 57 | "max_parallel_jobs": 2, 58 | "base_tuning_job_name": "hpo-recommender" 59 | }, 60 | "inputs": { 61 | 
"train": "s3:///prepare/train/train.protobuf", # replace 62 | "test": "s3:///prepare/validate/validate.protobuf" # replace 63 | } 64 | } 65 | 66 | config["batch_transform"] = { 67 | "transform_config": { 68 | "instance_count": 1, 69 | "instance_type": "ml.c4.xlarge", 70 | "data": "s3:///prepare/test/", 71 | "data_type": "S3Prefix", 72 | "content_type": "application/x-recordio-protobuf", 73 | "strategy": "MultiRecord", 74 | "output_path": "s3:///transform/" 75 | } 76 | } 77 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/issues), or [recently closed](https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. 
As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /src/pipeline/preprocess.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import numpy as np 3 | import s3fs 4 | 5 | 6 | def preprocess(s3_in_url, 7 | s3_out_bucket, 8 | s3_out_prefix, 9 | delimiter=","): 10 | """Preprocesses data based on business logic 11 | 12 | - Reads delimited file passed as s3_url and preprocess data by filtering 13 | long tail in the customer ratings data i.e. keep customers who have rated 5 14 | or more videos, and videos that have been rated by 9+ customers 15 | - Preprocessed data is then written to output 16 | 17 | Args: 18 | s3_in_url: 19 | s3 url to the delimited file to be processed 20 | e.g. s3://amazon-reviews-pds/tsv/reviews.tsv.gz 21 | s3_out_bucket: 22 | s3 bucket where preprocessed data will be staged 23 | e.g. mybucket 24 | s3_out_prefix: 25 | s3 url prefix to stage preprocessed data to use later in the pipeline 26 | e.g. amazon-reviews-pds/preprocess/ 27 | delimiter: 28 | delimiter to be used for parsing the file. 
Defaults to "," if none 29 | provided 30 | 31 | Returns: 32 | status of preprocessed data 33 | 34 | Raises: 35 | IOError: An error occurred accessing the s3 file 36 | """ 37 | try: 38 | print("preprocessing data from {}".format(s3_in_url)) 39 | # read s3 url into pandas dataframe 40 | # pandas internally uses s3fs to read s3 file directory 41 | df = pd.read_csv(s3_in_url, delimiter, error_bad_lines=False) 42 | 43 | # limit dataframe to customer_id, product_id, and star_rating 44 | # `product_title` will be useful validating recommendations 45 | df = df[['customer_id', 'product_id', 'star_rating', 'product_title']] 46 | 47 | # clean out the long tail because most people haven't seen most videos, 48 | # and people rate fewer videos than they actually watch 49 | customers = df['customer_id'].value_counts() 50 | products = df['product_id'].value_counts() 51 | 52 | # based on data exploration only about 5% of customers have rated 5 or 53 | # more videos, and only 25% of videos have been rated by 9+ customers 54 | customers = customers[customers >= 5] 55 | products = products[products >= 10] 56 | print("# of rows before the long tail = {:10d}".format(df.shape[0])) 57 | reduced_df = df \ 58 | .merge(pd.DataFrame({'customer_id': customers.index})) \ 59 | .merge(pd.DataFrame({'product_id': products.index})) 60 | print("# of rows after the long tail = {:10d}".format( 61 | reduced_df.shape[0])) 62 | reduced_df = reduced_df.drop_duplicates(['customer_id', 'product_id']) 63 | print("# of rows after removing duplicates = {:10d}".format( 64 | reduced_df.shape[0])) 65 | 66 | # recreate customer and product lists since there are customers with 67 | # more than 5 reviews, but all of their reviews are on products with 68 | # less than 5 reviews (and vice versa) 69 | customers = reduced_df['customer_id'].value_counts() 70 | products = reduced_df['product_id'].value_counts() 71 | 72 | # sequentially index each user and item to hold the sparse format where 73 | # the indices indicate the row and column in our ratings matrix 74 | customer_index = pd.DataFrame({ 75 | 'customer_id': customers.index, 76 | 'customer': np.arange(customers.shape[0])}) 77 | product_index = pd.DataFrame({ 78 | 'product_id': products.index, 79 | 'product': np.arange(products.shape[0])}) 80 | reduced_df = reduced_df \ 81 | .merge(customer_index) \ 82 | .merge(product_index) 83 | 84 | nb_customer = reduced_df['customer'].max() + 1 85 | nb_products = reduced_df['product'].max() + 1 86 | feature_dim = nb_customer + nb_products 87 | print(nb_customer, nb_products, feature_dim) 88 | 89 | product_df = reduced_df[['customer', 'product', 'star_rating']] 90 | 91 | # split into train, validation and test data sets 92 | train_df, validate_df, test_df = np.split( 93 | product_df.sample(frac=1), 94 | [int(.6*len(product_df)), int(.8*len(product_df))] 95 | ) 96 | 97 | print("# of rows train data set = {:10d}".format( 98 | train_df.shape[0])) 99 | print("# of rows validation data set = {:10d}".format( 100 | validate_df.shape[0])) 101 | print("# of rows test data set = {:10d}".format( 102 | test_df.shape[0])) 103 | 104 | # select columns required for training the model 105 | # excluding columns "customer_id", "product_id", "product_title" to 106 | # keep files small 107 | cols = ["customer", "product", "star_rating"] 108 | train_df = train_df[cols] 109 | validate_df = validate_df[cols] 110 | test_df = test_df[cols] 111 | 112 | # write output to s3 as delimited file 113 | fs = s3fs.S3FileSystem(anon=False) 114 | s3_out_prefix = s3_out_prefix[:-1] \ 115 | 
if s3_out_prefix[-1] == "/" else s3_out_prefix 116 | s3_out_train = "s3://{}/{}/{}".format( 117 | s3_out_bucket, s3_out_prefix, "train/train.csv") 118 | print("writing training data to {}".format(s3_out_train)) 119 | with fs.open(s3_out_train, "w") as f: 120 | train_df.to_csv(f, sep=str(','), index=False) 121 | 122 | s3_out_validate = "s3://{}/{}/{}".format( 123 | s3_out_bucket, s3_out_prefix, "validate/validate.csv") 124 | print("writing test data to {}".format(s3_out_validate)) 125 | with fs.open(s3_out_validate, "w") as f: 126 | validate_df.to_csv(f, sep=str(','), index=False) 127 | 128 | s3_out_test = "s3://{}/{}/{}".format( 129 | s3_out_bucket, s3_out_prefix, "test/test.csv") 130 | print("writing test data to {}".format(s3_out_test)) 131 | with fs.open(s3_out_test, "w") as f: 132 | test_df.to_csv(f, sep=str(','), index=False) 133 | 134 | print("preprocessing completed") 135 | return "SUCCESS" 136 | except Exception as e: 137 | raise e 138 | -------------------------------------------------------------------------------- /src/pipeline/prepare.py: -------------------------------------------------------------------------------- 1 | import sagemaker.amazon.common as smac 2 | import pandas as pd 3 | import numpy as np 4 | import boto3 5 | import s3fs 6 | import io 7 | 8 | from scipy.sparse import lil_matrix 9 | 10 | 11 | def convert_sparse_matrix(df, nb_rows, nb_customer, nb_products): 12 | # dataframe to array 13 | df_val = df.values 14 | 15 | # determine feature size 16 | nb_cols = nb_customer + nb_products 17 | print("# of rows = {}".format(str(nb_rows))) 18 | print("# of cols = {}".format(str(nb_cols))) 19 | 20 | # extract customers and ratings 21 | df_X = df_val[:, 0:2] 22 | # Features are one-hot encoded in a sparse matrix 23 | X = lil_matrix((nb_rows, nb_cols)).astype('float32') 24 | df_X[:, 1] = nb_customer + df_X[:, 1] 25 | coords = df_X[:, 0:2] 26 | X[np.arange(nb_rows), coords[:, 0]] = 1 27 | X[np.arange(nb_rows), coords[:, 1]] = 1 28 | 29 | # create label with ratings 30 | Y = df_val[:, 2].astype('float32') 31 | 32 | # validate size and shape 33 | print(X.shape) 34 | print(Y.shape) 35 | assert X.shape == (nb_rows, nb_cols) 36 | assert Y.shape == (nb_rows, ) 37 | 38 | return X, Y 39 | 40 | 41 | def save_as_protobuf(X, Y, bucket, key): 42 | """Converts features and predictions matrices to recordio protobuf and 43 | writes to S3 44 | 45 | Args: 46 | X: 47 | 2D numpy matrix with features 48 | Y: 49 | 1D numpy matrix with predictions 50 | bucket: 51 | s3 bucket where recordio protobuf file will be staged 52 | prefix: 53 | s3 url prefix to stage prepared data to use for training the model 54 | key: 55 | protobuf file name to be staged 56 | 57 | Returns: 58 | s3 url with key to the protobuf data 59 | """ 60 | buf = io.BytesIO() 61 | smac.write_spmatrix_to_sparse_tensor(buf, X, Y) 62 | buf.seek(0) 63 | obj = '{}'.format(key) 64 | boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf) 65 | return 's3://{}/{}'.format(bucket, obj) 66 | 67 | 68 | def chunk(x, batch_size): 69 | """split array into chunks of batch_size 70 | """ 71 | chunk_range = range(0, x.shape[0], batch_size) 72 | chunks = [x[p: p + batch_size] for p in chunk_range] 73 | return chunks 74 | 75 | 76 | def prepare(s3_in_bucket, 77 | s3_in_prefix, 78 | s3_out_bucket, 79 | s3_out_prefix, 80 | delimiter=","): 81 | """Prepare data for training with Sagemaker algorithms 82 | 83 | - Read preprocessed data and converts to ProtoBuf format to prepare for 84 | training with Sagemaker algorithms 85 | 86 | Args: 87 | 
s3_in_bucket: 88 | s3 bucket where preprocessed files are staged 89 | e.g. mybucket 90 | s3_in_prefix: 91 | s3 prefix to the files to be used for training 92 | e.g. amazon-reviews-pds/preprocess/ 93 | it's expected to have train and test folders in this prefix that will 94 | be staged by preprocessor 95 | s3_out_bucket: 96 | s3 bucket where training and test files will be staged 97 | e.g. mybucket 98 | s3_out_prefix: 99 | s3 url prefix to stage prepared data to use for training the model 100 | e.g. amazon-reviews-pds/prepare/ 101 | delimiter: 102 | delimiter to be used for parsing the file. Defaults to "," if none 103 | provided 104 | 105 | Returns: 106 | s3 url with key to the prepared data 107 | 108 | Raises: 109 | IOError: An error occurred accessing the s3 file 110 | """ 111 | try: 112 | print("preparing data from {}".format(s3_in_prefix)) 113 | 114 | # prepare training data set 115 | if s3_in_prefix[-1] == "/": 116 | s3_in_prefix = s3_in_prefix[:-1] 117 | s3_train_url = "s3://{}/{}/{}".format( 118 | s3_in_bucket, s3_in_prefix, 'train/train.csv') 119 | train_df = pd.read_csv(s3_train_url, 120 | sep=str(','), error_bad_lines=False) 121 | 122 | # prepare validateion dataset 123 | s3_validate_url = "s3://{}/{}/{}".format( 124 | s3_in_bucket, s3_in_prefix, 'validate/validate.csv') 125 | validate_df = pd.read_csv(s3_validate_url, 126 | sep=str(','), error_bad_lines=False) 127 | 128 | # prepare test dataset 129 | s3_test_url = "s3://{}/{}/{}".format( 130 | s3_in_bucket, s3_in_prefix, 'test/test.csv') 131 | test_df = pd.read_csv(s3_test_url, 132 | sep=str(','), error_bad_lines=False) 133 | 134 | # get feature dimension 135 | all_df = pd.concat([train_df, validate_df, test_df]) 136 | nb_customer = np.unique(all_df['customer'].values).shape[0] 137 | nb_products = np.unique(all_df['product'].values).shape[0] 138 | feature_dim = nb_customer + nb_products 139 | print(nb_customer, nb_products, feature_dim) 140 | 141 | train_X, train_Y = convert_sparse_matrix( 142 | train_df, train_df.shape[0], nb_customer, nb_products) 143 | validate_X, validate_Y = convert_sparse_matrix( 144 | validate_df, validate_df.shape[0], nb_customer, nb_products) 145 | test_X, test_Y = convert_sparse_matrix( 146 | test_df, test_df.shape[0], nb_customer, nb_products) 147 | 148 | # write train and test in protobuf format to s3 149 | if s3_out_prefix[-1] == "/": 150 | s3_out_prefix = s3_out_prefix[:-1] 151 | train_data = save_as_protobuf( 152 | train_X, train_Y, s3_out_bucket, 153 | s3_out_prefix + "/" + "train/train.protobuf") 154 | print(train_data) 155 | validate_data = save_as_protobuf( 156 | validate_X, validate_Y, s3_out_bucket, 157 | s3_out_prefix + "/" + "validate/validate.protobuf") 158 | print(validate_data) 159 | 160 | # chunk test data to avoid payload size issues when batch transforming 161 | test_x_chunks = chunk(test_X, 10000) 162 | test_y_chunks = chunk(test_Y, 10000) 163 | N = len(test_x_chunks) 164 | for i in range(N): 165 | test_data = save_as_protobuf( 166 | test_x_chunks[i], 167 | test_y_chunks[i], 168 | s3_out_bucket, 169 | s3_out_prefix + "/" + "test/test_" + str(i) + ".protobuf") 170 | print(test_data) 171 | 172 | return "SUCCESS" 173 | except Exception as e: 174 | raise e 175 | -------------------------------------------------------------------------------- /src/dag_ml_pipeline_amazon_video_reviews.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import json 3 | import requests 4 | from datetime import datetime 5 | 6 | # 
airflow operators 7 | import airflow 8 | from airflow.models import DAG 9 | from airflow.utils.trigger_rule import TriggerRule 10 | from airflow.operators.python_operator import BranchPythonOperator 11 | from airflow.operators.dummy_operator import DummyOperator 12 | from airflow.operators.python_operator import PythonOperator 13 | 14 | # airflow sagemaker operators 15 | from airflow.contrib.operators.sagemaker_training_operator \ 16 | import SageMakerTrainingOperator 17 | from airflow.contrib.operators.sagemaker_tuning_operator \ 18 | import SageMakerTuningOperator 19 | from airflow.contrib.operators.sagemaker_transform_operator \ 20 | import SageMakerTransformOperator 21 | from airflow.contrib.hooks.aws_hook import AwsHook 22 | 23 | # sagemaker sdk 24 | import boto3 25 | import sagemaker 26 | from sagemaker.amazon.amazon_estimator import get_image_uri 27 | from sagemaker.estimator import Estimator 28 | from sagemaker.tuner import HyperparameterTuner 29 | 30 | # airflow sagemaker configuration 31 | from sagemaker.workflow.airflow import training_config 32 | from sagemaker.workflow.airflow import tuning_config 33 | from sagemaker.workflow.airflow import transform_config_from_estimator 34 | 35 | # ml workflow specific 36 | from pipeline import prepare, preprocess 37 | import config as cfg 38 | 39 | # ============================================================================= 40 | # functions 41 | # ============================================================================= 42 | 43 | 44 | def is_hpo_enabled(): 45 | """check if hyper-parameter optimization is enabled in the config 46 | """ 47 | hpo_enabled = False 48 | if "job_level" in config and \ 49 | "run_hyperparameter_opt" in config["job_level"]: 50 | run_hpo_config = config["job_level"]["run_hyperparameter_opt"] 51 | if run_hpo_config.lower() == "yes": 52 | hpo_enabled = True 53 | return hpo_enabled 54 | 55 | 56 | def get_sagemaker_role_arn(role_name, region_name): 57 | iam = boto3.client('iam', region_name=region_name) 58 | response = iam.get_role(RoleName=role_name) 59 | return response["Role"]["Arn"] 60 | 61 | # ============================================================================= 62 | # setting up training, tuning and transform configuration 63 | # ============================================================================= 64 | 65 | 66 | # read config file 67 | config = cfg.config 68 | 69 | # set configuration for tasks 70 | hook = AwsHook(aws_conn_id='airflow-sagemaker') 71 | region = config["job_level"]["region_name"] 72 | sess = hook.get_session(region_name=region) 73 | role = get_sagemaker_role_arn( 74 | config["train_model"]["sagemaker_role"], 75 | sess.region_name) 76 | container = get_image_uri(sess.region_name, 'factorization-machines') 77 | hpo_enabled = is_hpo_enabled() 78 | 79 | # create estimator 80 | fm_estimator = Estimator( 81 | image_name=container, 82 | role=role, 83 | sagemaker_session=sagemaker.session.Session(sess), 84 | **config["train_model"]["estimator_config"] 85 | ) 86 | 87 | # train_config specifies SageMaker training configuration 88 | train_config = training_config( 89 | estimator=fm_estimator, 90 | inputs=config["train_model"]["inputs"]) 91 | 92 | # create tuner 93 | fm_tuner = HyperparameterTuner( 94 | estimator=fm_estimator, 95 | **config["tune_model"]["tuner_config"] 96 | ) 97 | 98 | # create tuning config 99 | tuner_config = tuning_config( 100 | tuner=fm_tuner, 101 | inputs=config["tune_model"]["inputs"]) 102 | 103 | # create transform config 104 | transform_config = 
transform_config_from_estimator( 105 | estimator=fm_estimator, 106 | task_id="model_tuning" if hpo_enabled else "model_training", 107 | task_type="tuning" if hpo_enabled else "training", 108 | **config["batch_transform"]["transform_config"] 109 | ) 110 | 111 | # ============================================================================= 112 | # define airflow DAG and tasks 113 | # ============================================================================= 114 | 115 | # define airflow DAG 116 | 117 | args = { 118 | 'owner': 'airflow', 119 | 'start_date': airflow.utils.dates.days_ago(2) 120 | } 121 | 122 | dag = DAG( 123 | dag_id='sagemaker-ml-pipeline', 124 | default_args=args, 125 | schedule_interval=None, 126 | concurrency=1, 127 | max_active_runs=1, 128 | user_defined_filters={'tojson': lambda s: json.JSONEncoder().encode(s)} 129 | ) 130 | 131 | # set the tasks in the DAG 132 | 133 | # dummy operator 134 | init = DummyOperator( 135 | task_id='start', 136 | dag=dag 137 | ) 138 | 139 | # preprocess the data 140 | preprocess_task = PythonOperator( 141 | task_id='preprocessing', 142 | dag=dag, 143 | provide_context=False, 144 | python_callable=preprocess.preprocess, 145 | op_kwargs=config["preprocess_data"]) 146 | 147 | # prepare the data for training 148 | prepare_task = PythonOperator( 149 | task_id='preparing', 150 | dag=dag, 151 | provide_context=False, 152 | python_callable=prepare.prepare, 153 | op_kwargs=config["prepare_data"] 154 | ) 155 | 156 | branching = BranchPythonOperator( 157 | task_id='branching', 158 | dag=dag, 159 | python_callable=lambda: "model_tuning" if hpo_enabled else "model_training") 160 | 161 | # launch sagemaker training job and wait until it completes 162 | train_model_task = SageMakerTrainingOperator( 163 | task_id='model_training', 164 | dag=dag, 165 | config=train_config, 166 | aws_conn_id='airflow-sagemaker', 167 | wait_for_completion=True, 168 | check_interval=30 169 | ) 170 | 171 | # launch sagemaker hyperparameter job and wait until it completes 172 | tune_model_task = SageMakerTuningOperator( 173 | task_id='model_tuning', 174 | dag=dag, 175 | config=tuner_config, 176 | aws_conn_id='airflow-sagemaker', 177 | wait_for_completion=True, 178 | check_interval=30 179 | ) 180 | 181 | # launch sagemaker batch transform job and wait until it completes 182 | batch_transform_task = SageMakerTransformOperator( 183 | task_id='predicting', 184 | dag=dag, 185 | config=transform_config, 186 | aws_conn_id='airflow-sagemaker', 187 | wait_for_completion=True, 188 | check_interval=30, 189 | trigger_rule=TriggerRule.ONE_SUCCESS 190 | ) 191 | 192 | cleanup_task = DummyOperator( 193 | task_id='cleaning_up', 194 | dag=dag) 195 | 196 | # set the dependencies between tasks 197 | 198 | init.set_downstream(preprocess_task) 199 | preprocess_task.set_downstream(prepare_task) 200 | prepare_task.set_downstream(branching) 201 | branching.set_downstream(tune_model_task) 202 | branching.set_downstream(train_model_task) 203 | tune_model_task.set_downstream(batch_transform_task) 204 | train_model_task.set_downstream(batch_transform_task) 205 | batch_transform_task.set_downstream(cleanup_task) 206 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Build End-to-End Machine Learning (ML) Workflows with Amazon SageMaker and Apache Airflow 2 | 3 | This repository contains the assets for the Amazon Sagemaker and Apache Airflow integration sample described in this 
[ML blog post](#TODO).
4 |
5 | ## Overview
6 |
7 | This repository shows a sample of how to build, manage, and orchestrate ML workflows using Amazon SageMaker and Apache Airflow. We will build a recommender system to predict a customer's rating for a certain video, based on the customer's historical ratings of similar videos as well as the behavior of other, similar customers. We'll use historical star ratings from over 2M Amazon customers on over 160K digital videos. More details on this dataset can be found at its [AWS Public Datasets page](https://s3.amazonaws.com/amazon-reviews-pds/readme.html).
8 |
9 | ### Repository Structure
10 |
11 | The repository contains:
12 |
13 | - [AWS CloudFormation Templates](./cfn/) to launch the AWS services required to create the components
14 | - [Airflow DAG Python Script](./src/dag_ml_pipeline_amazon_video_reviews.py) that integrates and orchestrates all the ML tasks in an ML workflow for building a recommender system.
15 | - A companion [Jupyter Notebook](./notebooks/amazon-video-recommender_using_fm_algo.ipynb) to understand the individual ML tasks in detail, such as data exploration, data preparation, model training/tuning and inference.
16 |
17 |
18 | ```text
19 | .
20 | ├── README.md                                     About the repository
21 | ├── cfn                                           AWS CloudFormation Templates
22 | │   └── airflow-ec2.yaml                          CloudFormation for installing Airflow instance backed by RDS
23 | ├── notebooks                                     Jupyter Notebooks
24 | │   └── amazon-video-recommender_using_fm_algo.ipynb
25 | └── src                                           Source code for Airflow DAG definition
26 |     ├── config.py                                 Config file to configure SageMaker jobs and other ML tasks
27 |     ├── dag_ml_pipeline_amazon_video_reviews.py   Airflow DAG definition for ML workflow
28 |     └── pipeline                                  Python module used in Airflow DAG for data preparation
29 |         ├── __init__.py
30 |         ├── prepare.py                            Data preparation script
31 |         └── preprocess.py                         Data pre-processing script
32 | ```
33 |
34 | ### High Level Solution
35 |
36 | Here is the high-level depiction of the ML workflow we will implement for building the recommender system:
37 |
38 | ![airflow_dag_workflow](./images/airflow-sagemaker-airflow-dag.png)
39 |
40 | The workflow performs the following tasks:
41 |
42 | 1. **Data Pre-processing:** Extract and pre-process data from S3 to prepare the training data.
43 | 2. **Prepare Training Data:** To build the recommender system, we will use SageMaker's built-in algorithm, Factorization Machines. The algorithm expects training data only in RecordIO Protobuf format with Float32 tensors. In this task, the pre-processed data will be transformed to RecordIO Protobuf format (see the sketch after this list).
44 | 3. **Training the Model:** Train SageMaker's built-in Factorization Machines model with the training data and generate model artifacts. The training job will be launched by the Airflow SageMaker operator `SageMakerTrainingOperator`.
45 | 4. **Tune the Model Hyper-parameters:** A conditional/optional task to tune the hyper-parameters of the Factorization Machines model to find the best model. The hyper-parameter tuning job will be launched by the Airflow SageMaker operator `SageMakerTuningOperator`.
46 | 5. **Batch inference:** Using the trained model, get inferences on the test dataset stored in Amazon S3 using the Airflow SageMaker operator `SageMakerTransformOperator`.
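Step 2 above boils down to serializing a one-hot encoded ratings matrix with the SageMaker SDK's sparse-tensor writer, which is what `src/pipeline/prepare.py` in this repository does. The snippet below is a minimal sketch of that conversion, using a tiny hypothetical ratings matrix and a placeholder bucket name rather than the data and bucket the pipeline actually uses:

```python
import io

import boto3
import numpy as np
import sagemaker.amazon.common as smac
from scipy.sparse import lil_matrix

# Hypothetical toy example: 3 ratings over 3 customers + 2 products = 5 one-hot columns.
# Each row marks one customer column and one product column, as prepare.py does.
X = lil_matrix((3, 5), dtype="float32")
X[0, 0] = X[0, 3] = 1.0
X[1, 1] = X[1, 4] = 1.0
X[2, 2] = X[2, 3] = 1.0
Y = np.array([5.0, 3.0, 4.0], dtype="float32")  # star ratings as float32 labels

# Serialize the sparse features and labels to RecordIO Protobuf in memory,
# then upload the buffer to S3 for the Factorization Machines training job.
buf = io.BytesIO()
smac.write_spmatrix_to_sparse_tensor(buf, X, Y)
buf.seek(0)
boto3.resource("s3").Bucket("my-airflow-sagemaker-bucket").Object(  # placeholder bucket
    "prepare/train/train.protobuf").upload_fileobj(buf)
```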
47 |
48 | ### CloudFormation Template Resources
49 |
50 | We will set up a simple Airflow architecture with the scheduler, worker, and web server running on the same instance. Typically, you would not use this setup for production workloads. We will use AWS CloudFormation to launch the AWS services required to create the components in the blog post. The stack includes the following:
51 |
52 | - Amazon EC2 instance to set up the Airflow components
53 | - Amazon Relational Database Service (RDS) Postgres or Aurora Serverless instance to host the Airflow metadata database.
54 | - Amazon S3 bucket to store the SageMaker model artifacts, outputs, and the Airflow DAG with the ML workflow. The template will prompt for the S3 bucket name
55 | - AWS IAM roles and EC2 security groups to allow the Airflow components to interact with the metadata database, S3 bucket, and Amazon SageMaker
56 |
57 | If you want to troubleshoot or add custom operators, you can connect directly to the instance through the Session Manager console. You can also launch different stable versions of Airflow (1.10.12 and 2.0.2).
58 | - Airflow 1.10.12 RDS: [![cfn-launch-stack](./images/LaunchStack.png)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=airflow-sagemaker&templateURL=https://s3.amazonaws.com/aws-machine-learning-blog/artifacts/sagemaker-ml-workflow-with-apache-airflow/v1/cfn/airflow-ec2-1.10.12-RDS.yaml)
59 | - Airflow 1.10.12 Aurora Serverless: [![cfn-launch-stack](./images/LaunchStack.png)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=airflow-sagemaker&templateURL=https://s3.amazonaws.com/aws-machine-learning-blog/artifacts/sagemaker-ml-workflow-with-apache-airflow/v1/cfn/airflow-ec2-1.10.12-Aurora-Serverless.yaml)
60 | - Airflow 2.0.2 RDS: [![cfn-launch-stack](./images/LaunchStack.png)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=airflow-sagemaker&templateURL=https://s3.amazonaws.com/aws-machine-learning-blog/artifacts/sagemaker-ml-workflow-with-apache-airflow/v1/cfn/airflow-ec2-2.0.2-RDS.yaml)
61 | - Airflow 2.0.2 Aurora Serverless: [![cfn-launch-stack](./images/LaunchStack.png)](https://console.aws.amazon.com/cloudformation/home?region=us-west-2#/stacks/new?stackName=airflow-sagemaker&templateURL=https://s3.amazonaws.com/aws-machine-learning-blog/artifacts/sagemaker-ml-workflow-with-apache-airflow/v1/cfn/airflow-ec2-2.0.2-Aurora-Serverless.yaml)
62 |
63 | It might take up to 10 minutes for the CloudFormation stack to create the resources. After resource creation completes, you should be able to log in to the Airflow web UI with the credentials specified in the parameters of the CloudFormation stack. The Airflow web server runs on port 8080 by default. To open the Airflow web UI, open any browser and enter http://ec2-public-dns-name:8080. The public DNS name of the EC2 instance can be found on the Outputs tab of the CloudFormation stack on the AWS CloudFormation console.
64 |
65 | ### Airflow DAG for ML Workflow
66 |
67 | The Airflow DAG integrates all the ML tasks into one ML workflow. An Airflow DAG is a Python script in which you express individual tasks as Airflow operators, set task dependencies, and associate the tasks with the DAG to run either on demand or at a scheduled interval. The Airflow DAG script is divided into the following sections:
68 |
69 | 1. Set up the DAG with parameters such as `schedule_interval` to run the workflow at a scheduled time
70 | 2. Set up the training, tuning, and inference configuration for each operator using the SageMaker Python SDK for Airflow operators
71 | 3. Create the individual tasks as Airflow operators, defining trigger rules and associating them with the DAG object. Refer to the previous section for how the individual tasks are defined
72 | 4. Specify the task dependencies (see the sketch below)
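The sketch below condenses that structure from the full DAG definition in `src/dag_ml_pipeline_amazon_video_reviews.py` shown earlier in this repository. It assumes the Airflow 1.10.x import paths and the `airflow-sagemaker` connection created by the CloudFormation stack, uses a placeholder IAM role ARN instead of the role lookup the real DAG performs, and shows only the training path; the tuning, branching, and batch-transform tasks follow the same pattern.

```python
import airflow
from airflow.models import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.contrib.operators.sagemaker_training_operator import SageMakerTrainingOperator

from sagemaker.estimator import Estimator
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.workflow.airflow import training_config

from pipeline import preprocess, prepare
import config as cfg

config = cfg.config

# 1. The DAG object: run on demand (no schedule_interval)
dag = DAG(
    dag_id="sagemaker-ml-pipeline",
    default_args={"owner": "airflow", "start_date": airflow.utils.dates.days_ago(2)},
    schedule_interval=None,
)

# 2. Training configuration generated by the SageMaker Python SDK for Airflow
fm_estimator = Estimator(
    image_name=get_image_uri(config["job_level"]["region_name"], "factorization-machines"),
    role="arn:aws:iam::123456789012:role/AirflowSageMakerExecutionRole",  # placeholder ARN
    **config["train_model"]["estimator_config"]
)
train_config = training_config(estimator=fm_estimator, inputs=config["train_model"]["inputs"])

# 3. Tasks as Airflow operators associated with the DAG
preprocess_task = PythonOperator(
    task_id="preprocessing", dag=dag,
    python_callable=preprocess.preprocess, op_kwargs=config["preprocess_data"],
)
prepare_task = PythonOperator(
    task_id="preparing", dag=dag,
    python_callable=prepare.prepare, op_kwargs=config["prepare_data"],
)
train_model_task = SageMakerTrainingOperator(
    task_id="model_training", dag=dag, config=train_config,
    aws_conn_id="airflow-sagemaker", wait_for_completion=True, check_interval=30,
)

# 4. Task dependencies
preprocess_task.set_downstream(prepare_task)
prepare_task.set_downstream(train_model_task)
```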
73 |
74 | ![airflow_dag](./images/airflow-sagemaker-dag.png)
75 |
76 | You can find the Airflow DAG code [here](./src/dag_ml_pipeline_amazon_video_reviews.py) in the repo.
77 |
78 | ### Cleaning Up the Stack Resources
79 |
80 | The final step is to clean up. To avoid unnecessary charges:
81 |
82 | 1. Destroy all of the resources created by the CloudFormation stack for the Airflow setup by deleting the stack after you're done experimenting with it. You can follow the steps here to [delete the stack](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/cfn-console-delete-stack.html).
83 | 2. Manually [delete the S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/user-guide/delete-bucket.html) that was created, because AWS CloudFormation cannot delete a non-empty S3 bucket.
84 |
85 | ## References
86 |
87 | - Refer to the [SageMaker SDK documentation](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/workflow/README.rst) and the [Airflow documentation](https://airflow.apache.org/integration.html?highlight=sagemaker#amazon-sagemaker) for additional details on the Airflow SageMaker operators.
88 | - Refer to the [SageMaker documentation](https://docs.aws.amazon.com/sagemaker/latest/dg/fact-machines.html) to learn more about the Factorization Machines algorithm used in the blog post.
89 |
90 | ## License Summary
91 |
92 | This sample code is made available under a modified MIT license. See the [LICENSE](./LICENSE) file.
93 |
--------------------------------------------------------------------------------
/cfn/airflow-ec2-1.10.12-RDS.yaml:
--------------------------------------------------------------------------------
1 | AWSTemplateFormatVersion: "2010-09-09"
2 |
3 | Description: Airflow server v1.10.12 on EC2 Amazon Linux 2 backed by Postgres RDS
4 |
5 | Parameters:
6 |   AirflowUser:
7 |     NoEcho: "false"
8 |     Description: Airflow UI admin account username
9 |     Type: String
10 |     MinLength: "4"
11 |     MaxLength: "41"
12 |     AllowedPattern: "[a-zA-Z0-9]*"
13 |     ConstraintDescription: Must contain only alphanumeric characters
14 |   AirflowPassword:
15 |     NoEcho: "false"
16 |     Description: Airflow UI admin account password
17 |     Type: String
18 |     MinLength: "8"
19 |     MaxLength: "41"
20 |     AllowedPattern: "[a-zA-Z0-9]*"
21 |     ConstraintDescription: Must contain only alphanumeric characters
22 |   DBPassword:
23 |     NoEcho: "false"
24 |     Description: Airflow database admin account password
25 |     Type: String
26 |     MinLength: "8"
27 |     MaxLength: "41"
28 |     AllowedPattern: "[a-zA-Z0-9]*"
29 |     ConstraintDescription: Must contain only alphanumeric characters
30 |
31 | # Mapping to find the Amazon Linux AMI in each region.
32 | Mappings: 33 | RegionMap: 34 | ap-northeast-1: 35 | AMI: "ami-09ebacdc178ae23b7" 36 | ap-northeast-2: 37 | AMI: "ami-0a0de518b1fc4524c" 38 | ap-northeast-3: 39 | AMI: "ami-0e787554e61105680" 40 | ap-south-1: 41 | AMI: "ami-04db49c0fb2215364" 42 | ap-southeast-1: 43 | AMI: "ami-0f511ead81ccde020" 44 | ap-southeast-2: 45 | AMI: "ami-0aab712d6363da7f9" 46 | ca-central-1: 47 | AMI: "ami-02f84cf47c23f1769" 48 | eu-central-1: 49 | AMI: "ami-0453cb7b5f2b7fca2" 50 | eu-west-1: 51 | AMI: "ami-02b4e72b17337d6c1" 52 | eu-west-2: 53 | AMI: "ami-0d26eb3972b7f8c96" 54 | eu-west-3: 55 | AMI: "ami-0d49cec198762b78c" 56 | sa-east-1: 57 | AMI: "ami-0f8243a5175208e08" 58 | us-east-1: 59 | AMI: "ami-0c2b8ca1dad447f8a" 60 | us-east-2: 61 | AMI: "ami-0443305dabd4be2bc" 62 | us-west-1: 63 | AMI: "ami-04b6c97b14c54de18" 64 | us-west-2: 65 | AMI: "ami-083ac7c7ecf9bb9b0" 66 | 67 | Resources: 68 | EC2Instance: 69 | Type: AWS::EC2::Instance 70 | CreationPolicy: 71 | ResourceSignal: 72 | Timeout: PT10M 73 | Properties: 74 | SecurityGroups: [!Ref "AirflowEC2SecurityGroup"] 75 | InstanceType: "m4.xlarge" 76 | IamInstanceProfile: 77 | Ref: EC2InstanceProfile 78 | Tags: 79 | - Key: Name 80 | Value: Airflow 81 | ImageId: !FindInMap 82 | - RegionMap 83 | - !Ref "AWS::Region" 84 | - AMI 85 | UserData: 86 | Fn::Base64: !Sub | 87 | #!/bin/bash 88 | set -x 89 | exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 90 | ln -s /root/user-data.log /var/log/user-data.log 91 | # Get right version of pip 92 | yum install aws-cfn-bootstrap -y 93 | python3 -m pip install pip==20.2.4 --user 94 | # Start cfn-init 95 | /opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region} 96 | yum remove python3-docutils -y 97 | echo "Installing s3fs" 98 | python3 -m pip install --upgrade s3fs==0.4.2 99 | python3 -m pip install psycopg2 wheel 100 | echo "Installing sagemaker sdk" 101 | python3 -m pip install sagemaker==v1.72 102 | # Install airflow using pip 103 | echo "Installing Apache Airflow" 104 | export AIRFLOW_GPL_UNIDECODE=yes 105 | python3 -m pip install apache-airflow[crypto,postgres]==1.10.12 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1.10.12/constraints-3.7.txt" 106 | # Create Fernet Key 107 | export FERNET_KEY=`openssl rand -base64 32` 108 | sed -i 's|fernet_key =|fernet_key = '$FERNET'|g' ~/airflow/airflow.cfg 109 | # Postgres operators and hook, support as an Airflow backend 110 | echo 'export PATH=/usr/local/bin:~/.local/bin:$PATH' >> ~/.bash_profile 111 | source ~/.bash_profile 112 | # Initialize Airflow 113 | airflow initdb 114 | # Update the RDS connection in the Airflow Config file 115 | sed -i '/sql_alchemy_conn/s/^/#/g' ~/airflow/airflow.cfg 116 | sed -i '/#sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${DBPassword}@${DBInstance.Endpoint.Address}:${DBInstance.Endpoint.Port}/airflowdb' ~/airflow/airflow.cfg 117 | # Update the type of executor in the Airflow Config file 118 | sed -i '/executor = SequentialExecutor/s/^/#/g' ~/airflow/airflow.cfg 119 | sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' ~/airflow/airflow.cfg 120 | sed -i 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 121 | sed -i 's/rbac = False/rbac = True/g' ~/airflow/airflow.cfg 122 | airflow initdb 123 | # create airflow connection to sagemaker 124 | cat >> /tmp/airflow_conn.py << EOF 125 | from airflow import settings 126 | from airflow.models import Connection 127 | #create a connection object 128 | extra = 
'{"region_name": "${AWS::Region}"}' 129 | conn_id = 'airflow-sagemaker' 130 | conn = Connection(conn_id=conn_id,conn_type='aws', extra=extra) 131 | # get the session 132 | session = settings.Session() 133 | session.add(conn) 134 | session.commit() 135 | EOF 136 | python3 /tmp/airflow_conn.py 137 | # create directories 138 | mkdir -p ~/airflow/dags/sm-ml-pipeline 139 | # clone the git repository 140 | cd ~ 141 | git clone https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow.git 142 | mv ~/sagemaker-ml-workflow-with-apache-airflow ~/sm-ml-pipeline 143 | cd ~/sm-ml-pipeline/src 144 | # prepare airflow dag definition for sagemaker blog post 145 | sed -i 's//${S3BucketName}/g' ./*.* 146 | sed -i 's//${AWS::Region}/g' ./*.* 147 | sed -i 's//${AWS::AccountId}/g' ~/sm-ml-pipeline/src/config.py 148 | sed -i 's/AirflowSageMakerExecutionRole/AirflowSageMakerExecutionRole-${AWS::StackName}/g' ~/sm-ml-pipeline/src/config.py 149 | zip -r dag.zip * 150 | cp dag.zip ~/airflow/dags/sm-ml-pipeline/dag.zip 151 | cd - 152 | # Run Airflow webserver and scheduler 153 | airflow create_user -e admin@example.com -f admin -l airflow -p ${AirflowPassword} -r Admin -u ${AirflowUser} 154 | airflow list_dags 155 | airflow webserver -D 156 | airflow scheduler -D 157 | yum install aws-cfn-bootstrap -y 158 | /opt/aws/bin/cfn-signal --exit-code 0 --resource EC2Instance --region ${AWS::Region} --stack ${AWS::StackName} 159 | Metadata: 160 | AWS::CloudFormation::Init: 161 | configSets: 162 | install: 163 | - installpackages 164 | installpackages: 165 | packages: 166 | yum: 167 | python3: [] 168 | python3-devel: [] 169 | gcc: [] 170 | gcc-c++: [] 171 | postgresql-devel: [] 172 | openssl-devel: [] 173 | git: [] 174 | DependsOn: 175 | - DBInstance 176 | - AirflowEC2SecurityGroup 177 | DBInstance: 178 | Type: AWS::RDS::DBInstance 179 | DeletionPolicy: Delete 180 | Properties: 181 | DBName: airflowdb 182 | Engine: postgres 183 | MasterUsername: airflow 184 | MasterUserPassword: !Ref "DBPassword" 185 | DBInstanceClass: db.t3.small 186 | AllocatedStorage: 5 187 | DBSecurityGroups: 188 | - Ref: DBSecurityGroup 189 | AirflowEC2SecurityGroup: 190 | Type: AWS::EC2::SecurityGroup 191 | Properties: 192 | GroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 193 | GroupDescription: Enable HTTP access via port 8080 194 | SecurityGroupIngress: 195 | - IpProtocol: tcp 196 | FromPort: 8080 197 | ToPort: 8080 198 | CidrIp: 0.0.0.0/0 199 | DBSecurityGroup: 200 | Type: AWS::RDS::DBSecurityGroup 201 | Properties: 202 | GroupDescription: Frontend Access 203 | DBSecurityGroupIngress: 204 | EC2SecurityGroupName: 205 | Ref: AirflowEC2SecurityGroup 206 | EC2Role: 207 | Type: AWS::IAM::Role 208 | Properties: 209 | RoleName: !Sub 'AirflowInstanceRole-${AWS::StackName}' 210 | AssumeRolePolicyDocument: 211 | Version: "2012-10-17" 212 | Statement: 213 | - Effect: "Allow" 214 | Principal: 215 | Service: 216 | - "ec2.amazonaws.com" 217 | Action: 218 | - "sts:AssumeRole" 219 | Path: "/" 220 | ManagedPolicyArns: 221 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 222 | - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore 223 | Policies: 224 | - PolicyName: !Sub 'AirflowResourceAccess-${AWS::StackName}' 225 | PolicyDocument: 226 | Version: "2012-10-17" 227 | Statement: 228 | - Effect: Allow 229 | Action: 230 | - s3:* 231 | Resource: 232 | - !Sub "arn:aws:s3:::${S3BucketName}" 233 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 234 | - Effect: Allow 235 | Action: 236 | - iam:GetRole 237 | Resource: "*" 238 | EC2InstanceProfile: 239 | Type: 
AWS::IAM::InstanceProfile 240 | Properties: 241 | InstanceProfileName: !Sub 'AirflowInstanceProfile-${AWS::StackName}' 242 | Roles: 243 | - Ref: EC2Role 244 | S3BucketName: 245 | Type: AWS::S3::Bucket 246 | DeletionPolicy: Delete 247 | Properties: 248 | AccessControl: BucketOwnerFullControl 249 | BucketName: !Join 250 | - "-" 251 | - - "airflow-sagemaker" 252 | - !Select 253 | - 0 254 | - !Split 255 | - "-" 256 | - !Select 257 | - 2 258 | - !Split 259 | - "/" 260 | - !Ref "AWS::StackId" 261 | AirflowSageMakerExecutionRole: 262 | Type: AWS::IAM::Role 263 | Properties: 264 | RoleName: !Sub 'AirflowSageMakerExecutionRole-${AWS::StackName}' 265 | AssumeRolePolicyDocument: 266 | Version: "2012-10-17" 267 | Statement: 268 | - Effect: "Allow" 269 | Principal: 270 | Service: 271 | - "sagemaker.amazonaws.com" 272 | Action: 273 | - "sts:AssumeRole" 274 | ManagedPolicyArns: 275 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 276 | Path: "/service-role/" 277 | Policies: 278 | - PolicyName: !Sub 'SageMakerS3BucketAccess-${AWS::StackName}' 279 | PolicyDocument: 280 | Version: "2012-10-17" 281 | Statement: 282 | - Effect: Allow 283 | Action: 284 | - s3:* 285 | Resource: 286 | - !Sub "arn:aws:s3:::${S3BucketName}" 287 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 288 | Outputs: 289 | AirflowEC2PublicDNSName: 290 | Description: Public DNS Name of the Airflow EC2 instance 291 | Value: !Join ["", ["http://", !GetAtt EC2Instance.PublicDnsName, ":8080"]] -------------------------------------------------------------------------------- /cfn/airflow-ec2-1.10.12-Aurora-Serverless.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | 3 | Description: Airflow server v1.10.12 on EC2 Amazon Linux 2 backed by Postgres Aurora Serverless 4 | 5 | Parameters: 6 | AirflowUser: 7 | NoEcho: "false" 8 | Description: Airflow UI admin account username 9 | Type: String 10 | MinLength: "4" 11 | MaxLength: "41" 12 | AllowedPattern: "[a-zA-Z0-9]*" 13 | ConstraintDescription: Must contain only alphanumeric characters 14 | AirflowPassword: 15 | NoEcho: "false" 16 | Description: Airflow UI admin account password 17 | Type: String 18 | MinLength: "8" 19 | MaxLength: "41" 20 | AllowedPattern: "[a-zA-Z0-9]*" 21 | ConstraintDescription: Must contain only alphanumeric characters 22 | DBPassword: 23 | NoEcho: "false" 24 | Description: Airflow database admin account password 25 | Type: String 26 | MinLength: "8" 27 | MaxLength: "41" 28 | AllowedPattern: "[a-zA-Z0-9]*" 29 | ConstraintDescription: Must contain only alphanumeric characters 30 | 31 | # Mapping to find the Amazon Linux AMI in each region. 
32 | Mappings: 33 | RegionMap: 34 | ap-northeast-1: 35 | AMI: "ami-09ebacdc178ae23b7" 36 | ap-northeast-2: 37 | AMI: "ami-0a0de518b1fc4524c" 38 | ap-northeast-3: 39 | AMI: "ami-0e787554e61105680" 40 | ap-south-1: 41 | AMI: "ami-04db49c0fb2215364" 42 | ap-southeast-1: 43 | AMI: "ami-0f511ead81ccde020" 44 | ap-southeast-2: 45 | AMI: "ami-0aab712d6363da7f9" 46 | ca-central-1: 47 | AMI: "ami-02f84cf47c23f1769" 48 | eu-central-1: 49 | AMI: "ami-0453cb7b5f2b7fca2" 50 | eu-west-1: 51 | AMI: "ami-02b4e72b17337d6c1" 52 | eu-west-2: 53 | AMI: "ami-0d26eb3972b7f8c96" 54 | eu-west-3: 55 | AMI: "ami-0d49cec198762b78c" 56 | sa-east-1: 57 | AMI: "ami-0f8243a5175208e08" 58 | us-east-1: 59 | AMI: "ami-0c2b8ca1dad447f8a" 60 | us-east-2: 61 | AMI: "ami-0443305dabd4be2bc" 62 | us-west-1: 63 | AMI: "ami-04b6c97b14c54de18" 64 | us-west-2: 65 | AMI: "ami-083ac7c7ecf9bb9b0" 66 | 67 | Resources: 68 | EC2Instance: 69 | Type: AWS::EC2::Instance 70 | CreationPolicy: 71 | ResourceSignal: 72 | Timeout: PT10M 73 | Properties: 74 | SecurityGroups: [!Ref "AirflowEC2SecurityGroup"] 75 | InstanceType: "m4.xlarge" 76 | IamInstanceProfile: 77 | Ref: EC2InstanceProfile 78 | Tags: 79 | - Key: Name 80 | Value: Airflow 81 | ImageId: !FindInMap 82 | - RegionMap 83 | - !Ref "AWS::Region" 84 | - AMI 85 | UserData: 86 | Fn::Base64: !Sub | 87 | #!/bin/bash 88 | set -x 89 | exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 90 | ln -s /root/user-data.log /var/log/user-data.log 91 | # Get right version of pip 92 | yum install aws-cfn-bootstrap -y 93 | python3 -m pip install pip==20.2.4 --user 94 | # Start cfn-init 95 | /opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region} 96 | yum remove python3-docutils -y 97 | echo "Installing s3fs" 98 | python3 -m pip install --upgrade s3fs==0.4.2 99 | python3 -m pip install psycopg2 wheel 100 | echo "Installing sagemaker sdk" 101 | python3 -m pip install sagemaker==v1.72 102 | # Install airflow using pip 103 | echo "Installing Apache Airflow" 104 | export AIRFLOW_GPL_UNIDECODE=yes 105 | python3 -m pip install apache-airflow[crypto,postgres]==1.10.12 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-1.10.12/constraints-3.7.txt" 106 | # Create Fernet Key 107 | export FERNET_KEY=`openssl rand -base64 32` 108 | sed -i 's|fernet_key =|fernet_key = '$FERNET'|g' ~/airflow/airflow.cfg 109 | # Postgres operators and hook, support as an Airflow backend 110 | echo 'export PATH=/usr/local/bin:~/.local/bin:$PATH' >> ~/.bash_profile 111 | source ~/.bash_profile 112 | # Initialize Airflow 113 | airflow initdb 114 | # Update the RDS connection in the Airflow Config file 115 | sed -i '/sql_alchemy_conn/s/^/#/g' ~/airflow/airflow.cfg 116 | sed -i '/#sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${DBPassword}@${DBInstance.Endpoint.Address}:${DBInstance.Endpoint.Port}/airflowdb' ~/airflow/airflow.cfg 117 | # Update the type of executor in the Airflow Config file 118 | sed -i '/executor = SequentialExecutor/s/^/#/g' ~/airflow/airflow.cfg 119 | sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' ~/airflow/airflow.cfg 120 | sed -i 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 121 | sed -i 's/rbac = False/rbac = True/g' ~/airflow/airflow.cfg 122 | airflow initdb 123 | # create airflow connection to sagemaker 124 | cat >> /tmp/airflow_conn.py << EOF 125 | from airflow import settings 126 | from airflow.models import Connection 127 | #create a connection object 128 | extra = 
'{"region_name": "${AWS::Region}"}' 129 | conn_id = 'airflow-sagemaker' 130 | conn = Connection(conn_id=conn_id,conn_type='aws', extra=extra) 131 | # get the session 132 | session = settings.Session() 133 | session.add(conn) 134 | session.commit() 135 | EOF 136 | python3 /tmp/airflow_conn.py 137 | # create directories 138 | mkdir -p ~/airflow/dags/sm-ml-pipeline 139 | # clone the git repository 140 | cd ~ 141 | git clone https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow.git 142 | mv ~/sagemaker-ml-workflow-with-apache-airflow ~/sm-ml-pipeline 143 | cd ~/sm-ml-pipeline/src 144 | # prepare airflow dag definition for sagemaker blog post 145 | sed -i 's//${S3BucketName}/g' ./*.* 146 | sed -i 's//${AWS::Region}/g' ./*.* 147 | sed -i 's//${AWS::AccountId}/g' ~/sm-ml-pipeline/src/config.py 148 | sed -i 's/AirflowSageMakerExecutionRole/AirflowSageMakerExecutionRole-${AWS::StackName}/g' ~/sm-ml-pipeline/src/config.py 149 | zip -r dag.zip * 150 | cp dag.zip ~/airflow/dags/sm-ml-pipeline/dag.zip 151 | cd - 152 | # Run Airflow webserver and scheduler 153 | airflow create_user -e admin@example.com -f admin -l airflow -p ${AirflowPassword} -r Admin -u ${AirflowUser} 154 | airflow list_dags 155 | airflow webserver -D 156 | airflow scheduler -D 157 | yum install aws-cfn-bootstrap -y 158 | /opt/aws/bin/cfn-signal --exit-code 0 --resource EC2Instance --region ${AWS::Region} --stack ${AWS::StackName} 159 | Metadata: 160 | AWS::CloudFormation::Init: 161 | configSets: 162 | install: 163 | - installpackages 164 | installpackages: 165 | packages: 166 | yum: 167 | python3: [] 168 | python3-devel: [] 169 | gcc: [] 170 | gcc-c++: [] 171 | postgresql-devel: [] 172 | openssl-devel: [] 173 | git: [] 174 | DependsOn: 175 | - DBInstance 176 | - AirflowEC2SecurityGroup 177 | DBInstance: 178 | Type: AWS::RDS::DBCluster 179 | DeletionPolicy: Delete 180 | Properties: 181 | DatabaseName: airflowdb 182 | Engine: aurora-postgresql 183 | MasterUsername: airflow 184 | MasterUserPassword: !Ref "DBPassword" 185 | EngineMode: serverless 186 | ScalingConfiguration: 187 | AutoPause: true 188 | MaxCapacity: 16 189 | MinCapacity: 2 190 | SecondsUntilAutoPause: 300 191 | VpcSecurityGroupIds: 192 | - !GetAtt AirflowEC2SecurityGroup.GroupId 193 | AirflowEC2SecurityGroup: 194 | Type: AWS::EC2::SecurityGroup 195 | Properties: 196 | GroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 197 | GroupDescription: Enable HTTP access via port 80 198 | SecurityGroupIngress: 199 | - IpProtocol: tcp 200 | FromPort: 8080 201 | ToPort: 8080 202 | CidrIp: 0.0.0.0/0 203 | - IpProtocol: tcp 204 | FromPort: 5432 205 | ToPort: 5432 206 | SourceSecurityGroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 207 | EC2Role: 208 | Type: AWS::IAM::Role 209 | Properties: 210 | RoleName: !Sub 'AirflowInstanceRole-${AWS::StackName}' 211 | AssumeRolePolicyDocument: 212 | Version: "2012-10-17" 213 | Statement: 214 | - Effect: "Allow" 215 | Principal: 216 | Service: 217 | - "ec2.amazonaws.com" 218 | Action: 219 | - "sts:AssumeRole" 220 | Path: "/" 221 | ManagedPolicyArns: 222 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 223 | - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore 224 | Policies: 225 | - PolicyName: !Sub 'AirflowResourceAccess-${AWS::StackName}' 226 | PolicyDocument: 227 | Version: "2012-10-17" 228 | Statement: 229 | - Effect: Allow 230 | Action: 231 | - s3:* 232 | Resource: 233 | - !Sub "arn:aws:s3:::${S3BucketName}" 234 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 235 | - Effect: Allow 236 | Action: 237 | - iam:GetRole 238 | 
Resource: "*" 239 | EC2InstanceProfile: 240 | Type: AWS::IAM::InstanceProfile 241 | Properties: 242 | InstanceProfileName: !Sub 'AirflowInstanceProfile-${AWS::StackName}' 243 | Roles: 244 | - Ref: EC2Role 245 | S3BucketName: 246 | Type: AWS::S3::Bucket 247 | DeletionPolicy: Delete 248 | Properties: 249 | AccessControl: BucketOwnerFullControl 250 | BucketName: !Join 251 | - "-" 252 | - - "airflow-sagemaker" 253 | - !Select 254 | - 0 255 | - !Split 256 | - "-" 257 | - !Select 258 | - 2 259 | - !Split 260 | - "/" 261 | - !Ref "AWS::StackId" 262 | AirflowSageMakerExecutionRole: 263 | Type: AWS::IAM::Role 264 | Properties: 265 | RoleName: !Sub 'AirflowSageMakerExecutionRole-${AWS::StackName}' 266 | AssumeRolePolicyDocument: 267 | Version: "2012-10-17" 268 | Statement: 269 | - Effect: "Allow" 270 | Principal: 271 | Service: 272 | - "sagemaker.amazonaws.com" 273 | Action: 274 | - "sts:AssumeRole" 275 | ManagedPolicyArns: 276 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 277 | Path: "/service-role/" 278 | Policies: 279 | - PolicyName: !Sub 'SageMakerS3BucketAccess-${AWS::StackName}' 280 | PolicyDocument: 281 | Version: "2012-10-17" 282 | Statement: 283 | - Effect: Allow 284 | Action: 285 | - s3:* 286 | Resource: 287 | - !Sub "arn:aws:s3:::${S3BucketName}" 288 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 289 | Outputs: 290 | AirflowEC2PublicDNSName: 291 | Description: Public DNS Name of the Airflow EC2 instance 292 | Value: !Join ["", ["http://", !GetAtt EC2Instance.PublicDnsName, ":8080"]] -------------------------------------------------------------------------------- /cfn/airflow-ec2-2.0.2-RDS.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | 3 | Description: Airflow server v2.0.2 on EC2 Amazon Linux 2 backed by Postgres RDS 4 | 5 | Parameters: 6 | AirflowUser: 7 | NoEcho: "false" 8 | Description: Airflow UI admin account username 9 | Type: String 10 | MinLength: "4" 11 | MaxLength: "41" 12 | AllowedPattern: "[a-zA-Z0-9]*" 13 | ConstraintDescription: Must contain only alphanumeric characters 14 | AirflowPassword: 15 | NoEcho: "false" 16 | Description: Airflow UI admin account password 17 | Type: String 18 | MinLength: "8" 19 | MaxLength: "41" 20 | AllowedPattern: "[a-zA-Z0-9]*" 21 | ConstraintDescription: Must contain only alphanumeric characters 22 | DBPassword: 23 | NoEcho: "false" 24 | Description: Airflow database admin account password 25 | Type: String 26 | MinLength: "8" 27 | MaxLength: "41" 28 | AllowedPattern: "[a-zA-Z0-9]*" 29 | ConstraintDescription: Must contain only alphanumeric characters 30 | 31 | # Mapping to find the Amazon Linux AMI in each region. 
32 | Mappings: 33 | RegionMap: 34 | ap-northeast-1: 35 | AMI: "ami-09ebacdc178ae23b7" 36 | ap-northeast-2: 37 | AMI: "ami-0a0de518b1fc4524c" 38 | ap-northeast-3: 39 | AMI: "ami-0e787554e61105680" 40 | ap-south-1: 41 | AMI: "ami-04db49c0fb2215364" 42 | ap-southeast-1: 43 | AMI: "ami-0f511ead81ccde020" 44 | ap-southeast-2: 45 | AMI: "ami-0aab712d6363da7f9" 46 | ca-central-1: 47 | AMI: "ami-02f84cf47c23f1769" 48 | eu-central-1: 49 | AMI: "ami-0453cb7b5f2b7fca2" 50 | eu-west-1: 51 | AMI: "ami-02b4e72b17337d6c1" 52 | eu-west-2: 53 | AMI: "ami-0d26eb3972b7f8c96" 54 | eu-west-3: 55 | AMI: "ami-0d49cec198762b78c" 56 | sa-east-1: 57 | AMI: "ami-0f8243a5175208e08" 58 | us-east-1: 59 | AMI: "ami-0c2b8ca1dad447f8a" 60 | us-east-2: 61 | AMI: "ami-0443305dabd4be2bc" 62 | us-west-1: 63 | AMI: "ami-04b6c97b14c54de18" 64 | us-west-2: 65 | AMI: "ami-083ac7c7ecf9bb9b0" 66 | 67 | Resources: 68 | EC2Instance: 69 | Type: AWS::EC2::Instance 70 | CreationPolicy: 71 | ResourceSignal: 72 | Timeout: PT10M 73 | Properties: 74 | SecurityGroups: [!Ref "AirflowEC2SecurityGroup"] 75 | InstanceType: "m4.xlarge" 76 | IamInstanceProfile: 77 | Ref: EC2InstanceProfile 78 | Tags: 79 | - Key: Name 80 | Value: Airflow 81 | ImageId: !FindInMap 82 | - RegionMap 83 | - !Ref "AWS::Region" 84 | - AMI 85 | UserData: 86 | Fn::Base64: !Sub | 87 | #!/bin/bash 88 | set -x 89 | exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 90 | ln -s /root/user-data.log /var/log/user-data.log 91 | # Get right version of pip 92 | yum install aws-cfn-bootstrap -y 93 | python3 -m pip install pip==20.2.4 --user 94 | # Start cfn-init 95 | /opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region} 96 | yum remove python3-docutils -y 97 | echo "Installing s3fs" 98 | python3 -m pip install --upgrade s3fs==0.4.2 99 | python3 -m pip install psycopg2 wheel 100 | # Upgrade sqlite 101 | wget https://www.sqlite.org/src/tarball/sqlite.tar.gz 102 | tar xzf sqlite.tar.gz 103 | cd sqlite/ 104 | export CFLAGS="-DSQLITE_ENABLE_FTS3 \ 105 | -DSQLITE_ENABLE_FTS3_PARENTHESIS \ 106 | -DSQLITE_ENABLE_FTS4 \ 107 | -DSQLITE_ENABLE_FTS5 \ 108 | -DSQLITE_ENABLE_JSON1 \ 109 | -DSQLITE_ENABLE_LOAD_EXTENSION \ 110 | -DSQLITE_ENABLE_RTREE \ 111 | -DSQLITE_ENABLE_STAT4 \ 112 | -DSQLITE_ENABLE_UPDATE_DELETE_LIMIT \ 113 | -DSQLITE_SOUNDEX \ 114 | -DSQLITE_TEMP_STORE=3 \ 115 | -DSQLITE_USE_URI \ 116 | -O2 \ 117 | -fPIC" 118 | export PREFIX="/usr/local" 119 | LIBS="-lm" ./configure --disable-tcl --enable-shared --enable-tempstore=always --prefix="$PREFIX" 120 | make 121 | make install 122 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 123 | echo "Installing sagemaker sdk" 124 | python3 -m pip install sagemaker==v1.72 125 | # Install airflow using pip 126 | echo "Installing Apache Airflow" 127 | export AIRFLOW_GPL_UNIDECODE=yes 128 | python3 -m pip install apache-airflow[crypto,postgres,amazon]==2.0.2 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.0.2/constraints-3.7.txt" 129 | # Create Fernet Key 130 | export FERNET_KEY=`openssl rand -base64 32` 131 | sed -i 's|fernet_key =|fernet_key = '$FERNET'|g' ~/airflow/airflow.cfg 132 | # Postgres operators and hook, support as an Airflow backend 133 | echo 'export PATH=/usr/local/bin:~/.local/bin:$PATH' >> ~/.bash_profile 134 | source ~/.bash_profile 135 | # Initialize Airflow 136 | airflow db init 137 | # Update the RDS connection in the Airflow Config file 138 | sed -i '/sql_alchemy_conn/s/^/#/g' ~/airflow/airflow.cfg 139 | sed -i 
'/#sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${DBPassword}@${DBInstance.Endpoint.Address}:${DBInstance.Endpoint.Port}/airflowdb' ~/airflow/airflow.cfg 140 | # Update the type of executor in the Airflow Config file 141 | sed -i '/executor = SequentialExecutor/s/^/#/g' ~/airflow/airflow.cfg 142 | sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' ~/airflow/airflow.cfg 143 | sed -i 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 144 | airflow db init 145 | airflow users create -e admin@example.com -f admin -l airflow -p ${AirflowPassword} -r Admin -u ${AirflowUser} 146 | # create airflow connection to sagemaker 147 | cat >> /tmp/airflow_conn.py << EOF 148 | from airflow import settings 149 | from airflow.models import Connection 150 | #create a connection object 151 | extra = '{"region_name": "${AWS::Region}"}' 152 | conn_id = 'airflow-sagemaker' 153 | conn = Connection(conn_id=conn_id,conn_type='s3', extra=extra) 154 | # get the session 155 | session = settings.Session() 156 | session.add(conn) 157 | session.commit() 158 | EOF 159 | python3 /tmp/airflow_conn.py 160 | # create directories 161 | mkdir -p ~/airflow/dags/sm-ml-pipeline 162 | # clone the git repository 163 | cd ~ 164 | git clone https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow.git 165 | mv ~/sagemaker-ml-workflow-with-apache-airflow ~/sm-ml-pipeline 166 | cd ~/sm-ml-pipeline/src 167 | # prepare airflow dag definition for sagemaker blog post 168 | sed -i 's//${S3BucketName}/g' ./*.* 169 | sed -i 's//${AWS::Region}/g' ./*.* 170 | sed -i 's//${AWS::AccountId}/g' ~/sm-ml-pipeline/src/config.py 171 | sed -i 's/AirflowSageMakerExecutionRole/AirflowSageMakerExecutionRole-${AWS::StackName}/g' ~/sm-ml-pipeline/src/config.py 172 | sed -i "s/hook = AwsHook(aws_conn_id='airflow-sagemaker')/hook = AwsHook(aws_conn_id='airflow-sagemaker', client_type='s3')/g" ~/sm-ml-pipeline/src/dag_ml_pipeline_amazon_video_reviews.py 173 | sed -i '/provide_context=False/d' ~/sm-ml-pipeline/src/dag_ml_pipeline_amazon_video_reviews.py 174 | sed -i 's/enable_xcom_pickling = False/enable_xcom_pickling = True/g' ~/airflow/airflow.cfg 175 | zip -r dag.zip * 176 | cp dag.zip ~/airflow/dags/sm-ml-pipeline/dag.zip 177 | cd - 178 | # Run Airflow webserver and scheduler 179 | airflow dags list 180 | airflow webserver -D 181 | airflow scheduler -D 182 | yum install aws-cfn-bootstrap -y 183 | /opt/aws/bin/cfn-signal --exit-code 0 --resource EC2Instance --region ${AWS::Region} --stack ${AWS::StackName} 184 | Metadata: 185 | AWS::CloudFormation::Init: 186 | configSets: 187 | install: 188 | - installpackages 189 | installpackages: 190 | packages: 191 | yum: 192 | python3: [] 193 | python3-devel: [] 194 | gcc: [] 195 | gcc-c++: [] 196 | postgresql-devel: [] 197 | openssl-devel: [] 198 | git: [] 199 | DependsOn: 200 | - DBInstance 201 | - AirflowEC2SecurityGroup 202 | DBInstance: 203 | Type: AWS::RDS::DBInstance 204 | DeletionPolicy: Delete 205 | Properties: 206 | DBName: airflowdb 207 | Engine: postgres 208 | MasterUsername: airflow 209 | MasterUserPassword: !Ref "DBPassword" 210 | DBInstanceClass: db.t3.small 211 | AllocatedStorage: 5 212 | DBSecurityGroups: 213 | - Ref: DBSecurityGroup 214 | AirflowEC2SecurityGroup: 215 | Type: AWS::EC2::SecurityGroup 216 | Properties: 217 | GroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 218 | GroupDescription: Enable HTTP access via port 8080 219 | SecurityGroupIngress: 220 | - IpProtocol: tcp 221 | FromPort: 8080 222 | ToPort: 8080 223 | CidrIp: 0.0.0.0/0 
224 | DBSecurityGroup: 225 | Type: AWS::RDS::DBSecurityGroup 226 | Properties: 227 | GroupDescription: Frontend Access 228 | DBSecurityGroupIngress: 229 | EC2SecurityGroupName: 230 | Ref: AirflowEC2SecurityGroup 231 | EC2Role: 232 | Type: AWS::IAM::Role 233 | Properties: 234 | RoleName: !Sub 'AirflowInstanceRole-${AWS::StackName}' 235 | AssumeRolePolicyDocument: 236 | Version: "2012-10-17" 237 | Statement: 238 | - Effect: "Allow" 239 | Principal: 240 | Service: 241 | - "ec2.amazonaws.com" 242 | Action: 243 | - "sts:AssumeRole" 244 | Path: "/" 245 | ManagedPolicyArns: 246 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 247 | - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore 248 | Policies: 249 | - PolicyName: !Sub 'AirflowResourceAccess-${AWS::StackName}' 250 | PolicyDocument: 251 | Version: "2012-10-17" 252 | Statement: 253 | - Effect: Allow 254 | Action: 255 | - s3:* 256 | Resource: 257 | - !Sub "arn:aws:s3:::${S3BucketName}" 258 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 259 | - Effect: Allow 260 | Action: 261 | - iam:GetRole 262 | Resource: "*" 263 | EC2InstanceProfile: 264 | Type: AWS::IAM::InstanceProfile 265 | Properties: 266 | InstanceProfileName: !Sub 'AirflowInstanceProfile-${AWS::StackName}' 267 | Roles: 268 | - Ref: EC2Role 269 | S3BucketName: 270 | Type: AWS::S3::Bucket 271 | DeletionPolicy: Delete 272 | Properties: 273 | AccessControl: BucketOwnerFullControl 274 | BucketName: !Join 275 | - "-" 276 | - - "airflow-sagemaker" 277 | - !Select 278 | - 0 279 | - !Split 280 | - "-" 281 | - !Select 282 | - 2 283 | - !Split 284 | - "/" 285 | - !Ref "AWS::StackId" 286 | AirflowSageMakerExecutionRole: 287 | Type: AWS::IAM::Role 288 | Properties: 289 | RoleName: !Sub 'AirflowSageMakerExecutionRole-${AWS::StackName}' 290 | AssumeRolePolicyDocument: 291 | Version: "2012-10-17" 292 | Statement: 293 | - Effect: "Allow" 294 | Principal: 295 | Service: 296 | - "sagemaker.amazonaws.com" 297 | Action: 298 | - "sts:AssumeRole" 299 | ManagedPolicyArns: 300 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 301 | Path: "/service-role/" 302 | Policies: 303 | - PolicyName: !Sub 'SageMakerS3BucketAccess-${AWS::StackName}' 304 | PolicyDocument: 305 | Version: "2012-10-17" 306 | Statement: 307 | - Effect: Allow 308 | Action: 309 | - s3:* 310 | Resource: 311 | - !Sub "arn:aws:s3:::${S3BucketName}" 312 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 313 | Outputs: 314 | AirflowEC2PublicDNSName: 315 | Description: Public DNS Name of the Airflow EC2 instance 316 | Value: !Join ["", ["http://", !GetAtt EC2Instance.PublicDnsName, ":8080"]] -------------------------------------------------------------------------------- /cfn/airflow-ec2-2.0.2-Aurora-Serverless.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | 3 | Description: Airflow server v2.0.2 on EC2 Amazon Linux 2 backed by Postgres Aurora Serverless 4 | 5 | Parameters: 6 | AirflowUser: 7 | NoEcho: "false" 8 | Description: Airflow UI admin account username 9 | Type: String 10 | MinLength: "4" 11 | MaxLength: "41" 12 | AllowedPattern: "[a-zA-Z0-9]*" 13 | ConstraintDescription: Must contain only alphanumeric characters 14 | AirflowPassword: 15 | NoEcho: "false" 16 | Description: Airflow UI admin account password 17 | Type: String 18 | MinLength: "8" 19 | MaxLength: "41" 20 | AllowedPattern: "[a-zA-Z0-9]*" 21 | ConstraintDescription: Must contain only alphanumeric characters 22 | DBPassword: 23 | NoEcho: "false" 24 | Description: Airflow database admin account password 
25 | Type: String 26 | MinLength: "8" 27 | MaxLength: "41" 28 | AllowedPattern: "[a-zA-Z0-9]*" 29 | ConstraintDescription: Must contain only alphanumeric characters 30 | 31 | # Mapping to find the Amazon Linux AMI in each region. 32 | Mappings: 33 | RegionMap: 34 | ap-northeast-1: 35 | AMI: "ami-09ebacdc178ae23b7" 36 | ap-northeast-2: 37 | AMI: "ami-0a0de518b1fc4524c" 38 | ap-northeast-3: 39 | AMI: "ami-0e787554e61105680" 40 | ap-south-1: 41 | AMI: "ami-04db49c0fb2215364" 42 | ap-southeast-1: 43 | AMI: "ami-0f511ead81ccde020" 44 | ap-southeast-2: 45 | AMI: "ami-0aab712d6363da7f9" 46 | ca-central-1: 47 | AMI: "ami-02f84cf47c23f1769" 48 | eu-central-1: 49 | AMI: "ami-0453cb7b5f2b7fca2" 50 | eu-west-1: 51 | AMI: "ami-02b4e72b17337d6c1" 52 | eu-west-2: 53 | AMI: "ami-0d26eb3972b7f8c96" 54 | eu-west-3: 55 | AMI: "ami-0d49cec198762b78c" 56 | sa-east-1: 57 | AMI: "ami-0f8243a5175208e08" 58 | us-east-1: 59 | AMI: "ami-0c2b8ca1dad447f8a" 60 | us-east-2: 61 | AMI: "ami-0443305dabd4be2bc" 62 | us-west-1: 63 | AMI: "ami-04b6c97b14c54de18" 64 | us-west-2: 65 | AMI: "ami-083ac7c7ecf9bb9b0" 66 | 67 | Resources: 68 | EC2Instance: 69 | Type: AWS::EC2::Instance 70 | CreationPolicy: 71 | ResourceSignal: 72 | Timeout: PT10M 73 | Properties: 74 | SecurityGroups: [!Ref "AirflowEC2SecurityGroup"] 75 | InstanceType: "m4.xlarge" 76 | IamInstanceProfile: 77 | Ref: EC2InstanceProfile 78 | Tags: 79 | - Key: Name 80 | Value: Airflow 81 | ImageId: !FindInMap 82 | - RegionMap 83 | - !Ref "AWS::Region" 84 | - AMI 85 | UserData: 86 | Fn::Base64: !Sub | 87 | #!/bin/bash 88 | set -x 89 | exec > >(tee /var/log/user-data.log|logger -t user-data ) 2>&1 90 | ln -s /root/user-data.log /var/log/user-data.log 91 | # Get right version of pip 92 | yum install aws-cfn-bootstrap -y 93 | python3 -m pip install pip==20.2.4 --user 94 | # Start cfn-init 95 | /opt/aws/bin/cfn-init -v -c install --stack ${AWS::StackId} --resource EC2Instance --region ${AWS::Region} 96 | yum remove python3-docutils -y 97 | echo "Installing s3fs" 98 | python3 -m pip install --upgrade s3fs==0.4.2 99 | python3 -m pip install psycopg2 wheel 100 | # Upgrade sqlite 101 | wget https://www.sqlite.org/src/tarball/sqlite.tar.gz 102 | tar xzf sqlite.tar.gz 103 | cd sqlite/ 104 | export CFLAGS="-DSQLITE_ENABLE_FTS3 \ 105 | -DSQLITE_ENABLE_FTS3_PARENTHESIS \ 106 | -DSQLITE_ENABLE_FTS4 \ 107 | -DSQLITE_ENABLE_FTS5 \ 108 | -DSQLITE_ENABLE_JSON1 \ 109 | -DSQLITE_ENABLE_LOAD_EXTENSION \ 110 | -DSQLITE_ENABLE_RTREE \ 111 | -DSQLITE_ENABLE_STAT4 \ 112 | -DSQLITE_ENABLE_UPDATE_DELETE_LIMIT \ 113 | -DSQLITE_SOUNDEX \ 114 | -DSQLITE_TEMP_STORE=3 \ 115 | -DSQLITE_USE_URI \ 116 | -O2 \ 117 | -fPIC" 118 | export PREFIX="/usr/local" 119 | LIBS="-lm" ./configure --disable-tcl --enable-shared --enable-tempstore=always --prefix="$PREFIX" 120 | make 121 | make install 122 | export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH 123 | echo "Installing sagemaker sdk" 124 | python3 -m pip install sagemaker==v1.72 125 | # Install airflow using pip 126 | echo "Installing Apache Airflow" 127 | export AIRFLOW_GPL_UNIDECODE=yes 128 | python3 -m pip install apache-airflow[crypto,postgres,amazon]==2.0.2 --constraint "https://raw.githubusercontent.com/apache/airflow/constraints-2.0.2/constraints-3.7.txt" 129 | # Create Fernet Key 130 | export FERNET_KEY=`openssl rand -base64 32` 131 | sed -i 's|fernet_key =|fernet_key = '$FERNET'|g' ~/airflow/airflow.cfg 132 | # Postgres operators and hook, support as an Airflow backend 133 | echo 'export PATH=/usr/local/bin:~/.local/bin:$PATH' >> ~/.bash_profile 
134 | source ~/.bash_profile 135 | # Initialize Airflow 136 | airflow db init 137 | # Update the RDS connection in the Airflow Config file 138 | sed -i '/sql_alchemy_conn/s/^/#/g' ~/airflow/airflow.cfg 139 | sed -i '/#sql_alchemy_conn/ a sql_alchemy_conn = postgresql://airflow:${DBPassword}@${DBInstance.Endpoint.Address}:${DBInstance.Endpoint.Port}/airflowdb' ~/airflow/airflow.cfg 140 | # Update the type of executor in the Airflow Config file 141 | sed -i '/executor = SequentialExecutor/s/^/#/g' ~/airflow/airflow.cfg 142 | sed -i '/executor = SequentialExecutor/ a executor = LocalExecutor' ~/airflow/airflow.cfg 143 | sed -i 's/load_examples = True/load_examples = False/g' ~/airflow/airflow.cfg 144 | airflow db init 145 | airflow users create -e admin@example.com -f admin -l airflow -p ${AirflowPassword} -r Admin -u ${AirflowUser} 146 | # create airflow connection to sagemaker 147 | cat >> /tmp/airflow_conn.py << EOF 148 | from airflow import settings 149 | from airflow.models import Connection 150 | #create a connection object 151 | extra = '{"region_name": "${AWS::Region}"}' 152 | conn_id = 'airflow-sagemaker' 153 | conn = Connection(conn_id=conn_id,conn_type='s3', extra=extra) 154 | # get the session 155 | session = settings.Session() 156 | session.add(conn) 157 | session.commit() 158 | EOF 159 | python3 /tmp/airflow_conn.py 160 | # create directories 161 | mkdir -p ~/airflow/dags/sm-ml-pipeline 162 | # clone the git repository 163 | cd ~ 164 | git clone https://github.com/aws-samples/sagemaker-ml-workflow-with-apache-airflow.git 165 | mv ~/sagemaker-ml-workflow-with-apache-airflow ~/sm-ml-pipeline 166 | cd ~/sm-ml-pipeline/src 167 | # prepare airflow dag definition for sagemaker blog post 168 | sed -i 's//${S3BucketName}/g' ./*.* 169 | sed -i 's//${AWS::Region}/g' ./*.* 170 | sed -i 's//${AWS::AccountId}/g' ~/sm-ml-pipeline/src/config.py 171 | sed -i 's/AirflowSageMakerExecutionRole/AirflowSageMakerExecutionRole-${AWS::StackName}/g' ~/sm-ml-pipeline/src/config.py 172 | sed -i "s/hook = AwsHook(aws_conn_id='airflow-sagemaker')/hook = AwsHook(aws_conn_id='airflow-sagemaker', client_type='s3')/g" ~/sm-ml-pipeline/src/dag_ml_pipeline_amazon_video_reviews.py 173 | sed -i '/provide_context=False/d' ~/sm-ml-pipeline/src/dag_ml_pipeline_amazon_video_reviews.py 174 | sed -i 's/enable_xcom_pickling = False/enable_xcom_pickling = True/g' ~/airflow/airflow.cfg 175 | zip -r dag.zip * 176 | cp dag.zip ~/airflow/dags/sm-ml-pipeline/dag.zip 177 | cd - 178 | # Run Airflow webserver and scheduler 179 | airflow dags list 180 | airflow webserver -D 181 | airflow scheduler -D 182 | yum install aws-cfn-bootstrap -y 183 | /opt/aws/bin/cfn-signal --exit-code 0 --resource EC2Instance --region ${AWS::Region} --stack ${AWS::StackName} 184 | Metadata: 185 | AWS::CloudFormation::Init: 186 | configSets: 187 | install: 188 | - installpackages 189 | installpackages: 190 | packages: 191 | yum: 192 | python3: [] 193 | python3-devel: [] 194 | gcc: [] 195 | gcc-c++: [] 196 | postgresql-devel: [] 197 | openssl-devel: [] 198 | git: [] 199 | DependsOn: 200 | - DBInstance 201 | - AirflowEC2SecurityGroup 202 | DBInstance: 203 | Type: AWS::RDS::DBCluster 204 | DeletionPolicy: Delete 205 | Properties: 206 | DatabaseName: airflowdb 207 | Engine: aurora-postgresql 208 | MasterUsername: airflow 209 | MasterUserPassword: !Ref "DBPassword" 210 | EngineMode: serverless 211 | ScalingConfiguration: 212 | AutoPause: true 213 | MaxCapacity: 16 214 | MinCapacity: 2 215 | SecondsUntilAutoPause: 300 216 | VpcSecurityGroupIds: 217 | - !GetAtt 
AirflowEC2SecurityGroup.GroupId 218 | AirflowEC2SecurityGroup: 219 | Type: AWS::EC2::SecurityGroup 220 | Properties: 221 | GroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 222 | GroupDescription: Enable HTTP access via port 80 223 | SecurityGroupIngress: 224 | - IpProtocol: tcp 225 | FromPort: 8080 226 | ToPort: 8080 227 | CidrIp: 0.0.0.0/0 228 | - IpProtocol: tcp 229 | FromPort: 5432 230 | ToPort: 5432 231 | SourceSecurityGroupName: !Sub 'AirflowEC2SG-${AWS::StackName}' 232 | EC2Role: 233 | Type: AWS::IAM::Role 234 | Properties: 235 | RoleName: !Sub 'AirflowInstanceRole-${AWS::StackName}' 236 | AssumeRolePolicyDocument: 237 | Version: "2012-10-17" 238 | Statement: 239 | - Effect: "Allow" 240 | Principal: 241 | Service: 242 | - "ec2.amazonaws.com" 243 | Action: 244 | - "sts:AssumeRole" 245 | Path: "/" 246 | ManagedPolicyArns: 247 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 248 | - arn:aws:iam::aws:policy/AmazonSSMManagedInstanceCore 249 | Policies: 250 | - PolicyName: !Sub 'AirflowResourceAccess-${AWS::StackName}' 251 | PolicyDocument: 252 | Version: "2012-10-17" 253 | Statement: 254 | - Effect: Allow 255 | Action: 256 | - s3:* 257 | Resource: 258 | - !Sub "arn:aws:s3:::${S3BucketName}" 259 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 260 | - Effect: Allow 261 | Action: 262 | - iam:GetRole 263 | Resource: "*" 264 | EC2InstanceProfile: 265 | Type: AWS::IAM::InstanceProfile 266 | Properties: 267 | InstanceProfileName: !Sub 'AirflowInstanceProfile-${AWS::StackName}' 268 | Roles: 269 | - Ref: EC2Role 270 | S3BucketName: 271 | Type: AWS::S3::Bucket 272 | DeletionPolicy: Delete 273 | Properties: 274 | AccessControl: BucketOwnerFullControl 275 | BucketName: !Join 276 | - "-" 277 | - - "airflow-sagemaker" 278 | - !Select 279 | - 0 280 | - !Split 281 | - "-" 282 | - !Select 283 | - 2 284 | - !Split 285 | - "/" 286 | - !Ref "AWS::StackId" 287 | AirflowSageMakerExecutionRole: 288 | Type: AWS::IAM::Role 289 | Properties: 290 | RoleName: !Sub 'AirflowSageMakerExecutionRole-${AWS::StackName}' 291 | AssumeRolePolicyDocument: 292 | Version: "2012-10-17" 293 | Statement: 294 | - Effect: "Allow" 295 | Principal: 296 | Service: 297 | - "sagemaker.amazonaws.com" 298 | Action: 299 | - "sts:AssumeRole" 300 | ManagedPolicyArns: 301 | - arn:aws:iam::aws:policy/AmazonSageMakerFullAccess 302 | Path: "/service-role/" 303 | Policies: 304 | - PolicyName: !Sub 'SageMakerS3BucketAccess-${AWS::StackName}' 305 | PolicyDocument: 306 | Version: "2012-10-17" 307 | Statement: 308 | - Effect: Allow 309 | Action: 310 | - s3:* 311 | Resource: 312 | - !Sub "arn:aws:s3:::${S3BucketName}" 313 | - !Sub "arn:aws:s3:::${S3BucketName}/*" 314 | Outputs: 315 | AirflowEC2PublicDNSName: 316 | Description: Public DNS Name of the Airflow EC2 instance 317 | Value: !Join ["", ["http://", !GetAtt EC2Instance.PublicDnsName, ":8080"]] -------------------------------------------------------------------------------- /notebooks/amazon-video-recommender_using_fm_algo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Implementing Recommender System with SageMaker Built-In Algorithm\n", 8 | "_**Making Product Recommendations Using Factorization Machines**_\n", 9 | "\n", 10 | "--- \n", 11 | "\n", 12 | "*This work is based on content from [Gluon based Recommender System 
notebook](https://github.com/awslabs/amazon-sagemaker-examples/blob/master/introduction_to_applying_machine_learning/gluon_recommender_system/gluon_recommender_system.ipynb)*\n", 13 | "\n", 14 | "---" 15 | ] 16 | }, 17 | { 18 | "cell_type": "markdown", 19 | "metadata": {}, 20 | "source": [ 21 | "## Table of Contents\n", 22 | "\n", 23 | "1. [Background](#Background)\n", 24 | "1. [Setup](#Setup)\n", 25 | "1. [Data](#Data)\n", 26 | " 1. [Explore](#Explore)\n", 27 | " 1. [Clean](#Clean)\n", 28 | " 1. [Prepare](#Prepare)\n", 29 | "1. [Model Training](#Model-Training)\n", 30 | "1. [Model Inference](#Model-Inference)\n", 31 | " 1. [Real-Time Inference](#Real-Time-Inference)\n", 32 | " 1. [Batch Inference](#Batch-Inference)\n", 33 | "1. [Evaluate Model Performance](#Evaluate-Model-Performance)\n", 34 | "1. [Model Tuning](#Model-Tuning)\n", 35 | "1. [Wrap-up](#Wrap-up)\n", 36 | " 1. [Clean-Up](#Clean-up-(optional))" 37 | ] 38 | }, 39 | { 40 | "cell_type": "markdown", 41 | "metadata": {}, 42 | "source": [ 43 | "---" 44 | ] 45 | }, 46 | { 47 | "cell_type": "markdown", 48 | "metadata": {}, 49 | "source": [ 50 | "## Background\n", 51 | "\n", 52 | "In many ways, recommender systems were a catalyst for the current popularity of machine learning. One of Amazon's earliest successes was the \"Customers who bought this, also bought...\" feature, while the million dollar Netflix Prize spurred research, raised public awareness, and inspired numerous other data science competitions.\n", 53 | "\n", 54 | "Recommender systems can utilize a multitude of data sources and ML algorithms, and most combine various unsupervised, supervised, and reinforcement learning techniques into a holistic framework. However, the core component is almost always a model which predicts a user's rating (or purchase) for a certain item based on that user's historical ratings of similar items as well as the behavior of other similar users. The minimal required dataset for this is a history of user item ratings. In our case, we'll use 1 to 5 star ratings from over 2M Amazon customers on over 160K digital videos. More details on this dataset can be found at its [AWS Public Datasets page](https://s3.amazonaws.com/amazon-reviews-pds/readme.html).\n", 55 | "\n", 56 | "Matrix factorization has been the cornerstone of most user-item prediction models. This method starts with the large, sparse, user-item ratings in a single matrix, where users index the rows, and items index the columns. It then seeks to find two lower-dimensional, dense matrices which, when multiplied together, preserve the information and relationships in the larger matrix.\n", 57 | "\n", 58 | "![image](./factorization.png)\n", 59 | "\n", 60 | "Matrix factorization has been extended and generalized with deep learning and embeddings. These techniques allows us to introduce non-linearities for enhanced performance and flexibility. This notebook will fit a neural network-based model to generate recommendations for the Amazon video dataset. It will start by exploring our data in the notebook, training a model on the data and fit our model using a SageMaker managed training cluster. 
We'll then deploy to an endpoint and check our method.\n", 61 | "\n", 62 | "We will also see how the tasks in the machine learning pipeline can be orchestrated and automated through Apache Airflow integration with Sagemaker.\n", 63 | "\n", 64 | "---\n", 65 | "\n", 66 | "## Setup\n", 67 | "\n", 68 | "_This notebook was created and tested on an ml.t2.xlarge notebook instance._\n", 69 | "\n", 70 | "Let's start by specifying:\n", 71 | "\n", 72 | "- The S3 bucket and prefix that you want to use for training and model data. This should be within the same region as the Notebook Instance, training, and hosting.\n", 73 | "- The IAM role arn used to give training and hosting access to your data. See the documentation for how to create these. Note, if more than one role is required for notebook instances, training, and/or hosting, please replace the `get_execution_role()` call with the appropriate full IAM role arn string(s)." 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "isConfigCell": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "bucket = 'mybucket' #replace with your bucket\n", 85 | "prefix = 'sagemaker/fm-recsys'\n", 86 | "\n", 87 | "import sagemaker\n", 88 | "import boto3  # needed below for the SageMaker client\n", 89 | "from sagemaker.tuner import HyperparameterTuner, ContinuousParameter\n", 90 | "from sagemaker.analytics import HyperparameterTuningJobAnalytics, TrainingJobAnalytics\n", 91 | "\n", 92 | "role = sagemaker.get_execution_role()\n", 93 | "sess = sagemaker.Session()\n", 94 | "smclient = boto3.Session().client('sagemaker')" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "Now let's load the Python libraries we'll need for the remainder of this example notebook." 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": null, 107 | "metadata": {}, 108 | "outputs": [], 109 | "source": [ 110 | "import os\n", 111 | "import io\n", 112 | "import sys\n", 113 | "import time\n", 114 | "\n", 115 | "import pandas as pd\n", 116 | "import numpy as np\n", 117 | "from scipy.sparse import lil_matrix\n", 118 | "\n", 119 | "import boto3\n", 120 | "import json\n", 121 | "\n", 122 | "import matplotlib.pyplot as plt\n", 123 | "import seaborn as sns\n", 124 | "\n", 125 | "import sagemaker.amazon.common as smac\n", 126 | "from sagemaker.predictor import json_deserializer" 127 | ] 128 | }, 129 | { 130 | "cell_type": "code", 131 | "execution_count": null, 132 | "metadata": {}, 133 | "outputs": [], 134 | "source": [ 135 | "# plot aesthetics\n", 136 | "sns.set(color_codes=True)\n", 137 | "sns.set_context('paper')\n", 138 | "five_thirty_eight = [\"#30a2da\", \"#fc4f30\", \"#e5ae38\", \"#6d904f\", \"#8b8b8b\",]\n", 139 | "sns.set_palette(five_thirty_eight)\n", 140 | "\n", 141 | "%matplotlib inline" 142 | ] 143 | }, 144 | { 145 | "cell_type": "markdown", 146 | "metadata": {}, 147 | "source": [ 148 | "---\n", 149 | "## Data\n", 150 | "\n", 151 | "### Explore\n", 152 | "\n", 153 | "Let's start by bringing in our dataset from an S3 public bucket. As mentioned above, this contains 1 to 5 star ratings from over 2M Amazon customers on over 160K digital videos. More details on this dataset can be found at its [AWS Public Datasets page](https://s3.amazonaws.com/amazon-reviews-pds/readme.html).\n", 154 | "\n", 155 | "_Note, because this dataset is over a half gigabyte, the load from S3 may take ~10 minutes. 
Also, since Amazon SageMaker Notebooks start with a 5GB persistent volume by default, and we don't need to keep this data on our instance for long, we'll bring it to the temporary volume (which has up to 20GB of storage)._" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "!mkdir /tmp/recsys/\n", 165 | "!aws s3 cp s3://amazon-reviews-pds/tsv/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz /tmp/recsys/" 166 | ] 167 | }, 168 | { 169 | "cell_type": "markdown", 170 | "metadata": {}, 171 | "source": [ 172 | "Let's read the data into a [Pandas DataFrame](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.html) so that we can begin to understand it.\n", 173 | "\n", 174 | "*Note, we'll set `error_bad_lines=False` when reading the file in as there appear to be a very small number of records which would create a problem otherwise.*" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": null, 180 | "metadata": {}, 181 | "outputs": [], 182 | "source": [ 183 | "reviews = pd.read_csv('/tmp/recsys/amazon_reviews_us_Digital_Video_Download_v1_00.tsv.gz', delimiter='\\t',error_bad_lines=False)\n", 184 | "reviews.head()" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "We can see this dataset includes information like:\n", 192 | "\n", 193 | "- `marketplace`: 2-letter country code (in this case all \"US\").\n", 194 | "- `customer_id`: Random identifier that can be used to aggregate reviews written by a single author.\n", 195 | "- `review_id`: A unique ID for the review.\n", 196 | "- `product_id`: The Amazon Standard Identification Number (ASIN). Appending the ASIN to `http://www.amazon.com/dp/` links to the product's detail page.\n", 197 | "- `product_parent`: The parent of that ASIN. Multiple ASINs (color or format variations of the same product) can roll up into a single parent.\n", 198 | "- `product_title`: Title description of the product.\n", 199 | "- `product_category`: Broad product category that can be used to group reviews (in this case digital videos).\n", 200 | "- `star_rating`: The review's rating (1 to 5 stars).\n", 201 | "- `helpful_votes`: Number of helpful votes for the review.\n", 202 | "- `total_votes`: Number of total votes the review received.\n", 203 | "- `vine`: Was the review written as part of the [Vine](https://www.amazon.com/gp/vine/help) program?\n", 204 | "- `verified_purchase`: Was the review from a verified purchase?\n", 205 | "- `review_headline`: The title of the review itself.\n", 206 | "- `review_body`: The text of the review.\n", 207 | "- `review_date`: The date the review was written.\n", 208 | "\n", 209 | "For this example, let's limit ourselves to `customer_id`, `product_id`, and `star_rating`. 
Including additional features in our recommendation system could be beneficial, but would require substantial processing (particularly the text data) which would take us beyond the scope of this notebook.\n", 210 | "\n", 211 | "*Note: we'll keep `product_title` on the dataset to help verify our recommendations later in the notebook, but it will not be used in algorithm training.*" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "reviews = reviews[['customer_id', 'product_id', 'star_rating', 'product_title']]" 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "Because most people haven't seen most videos, and people rate fewer videos than we actually watch, we'd expect our data to be sparse. Our algorithm should work well with this sparse problem in general, but we may still want to clean out some of the long tail. Let's look at some basic percentiles to confirm." 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": {}, 234 | "outputs": [], 235 | "source": [ 236 | "customers = reviews['customer_id'].value_counts()\n", 237 | "products = reviews['product_id'].value_counts()\n", 238 | "\n", 239 | "quantiles = [0, 0.01, 0.02, 0.03, 0.04, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.96, 0.97, 0.98, 0.99, 1]\n", 240 | "product_q = pd.DataFrame(zip(quantiles, products.quantile(quantiles)), columns=[\"quantile\", \"products\"])\n", 241 | "customer_q = pd.DataFrame(zip(quantiles, customers.quantile(quantiles)), columns=[\"quantile\", \"customers\"])\n", 242 | "# product_q.tail(10)\n", 243 | "# customer_q.tail(10)" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": {}, 250 | "outputs": [], 251 | "source": [ 252 | "axp = sns.barplot(x=\"quantile\", y=\"products\", data=product_q, palette=five_thirty_eight)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": null, 258 | "metadata": {}, 259 | "outputs": [], 260 | "source": [ 261 | "axc = sns.barplot(x=\"quantile\", y=\"customers\", data=customer_q, palette=five_thirty_eight)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "markdown", 266 | "metadata": {}, 267 | "source": [ 268 | "As we can see, only about 5% of customers have rated 5 or more videos, and only 25% of videos have been rated by 9+ customers.\n", 269 | "\n", 270 | "### Clean\n", 271 | "\n", 272 | "Let's filter out this long tail and remove any duplicate reviews (same product and customer)." 
273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": null, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "customers = customers[customers >= 5]\n", 282 | "products = products[products >= 10]\n", 283 | "\n", 284 | "print(\"# of records before removing the long tail = {:10d}\".format(reviews.shape[0]))\n", 285 | "reduced_df = reviews.merge(pd.DataFrame({'customer_id': customers.index})).merge(pd.DataFrame({'product_id': products.index}))\n", 286 | "print(\"# of records after removing the long tail = {:10d}\".format(reduced_df.shape[0]))\n", 287 | "reduced_df = reduced_df.drop_duplicates(['customer_id', 'product_id'])\n", 288 | "print(\"# of records after removing duplicates = {:10d}\".format(reduced_df.shape[0]))" 289 | ] 290 | }, 291 | { 292 | "cell_type": "markdown", 293 | "metadata": {}, 294 | "source": [ 295 | "Now, we'll recreate our customer and product lists, since there are customers with 5 or more reviews whose reviews are all on products with fewer than 10 reviews (and vice versa)." 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": null, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "customers = reduced_df['customer_id'].value_counts()\n", 305 | "products = reduced_df['product_id'].value_counts()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "Next, we'll number each user and item, giving them their own sequential index. This will allow us to hold the information in a sparse format where the sequential indices indicate the row and column in our ratings matrix." 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": null, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "customer_index = pd.DataFrame({'customer_id': customers.index, 'customer': np.arange(customers.shape[0])})\n", 322 | "product_index = pd.DataFrame({'product_id': products.index, \n", 323 | " 'product': np.arange(products.shape[0])})\n", 324 | "\n", 325 | "reduced_df = reduced_df.merge(customer_index).merge(product_index)\n", 326 | "reduced_df.head()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": {}, 332 | "source": [ 333 | "Let's look at the feature dimension size, which will be required for preparing the training and test data sets." 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "scrolled": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "nb_customer = reduced_df['customer'].max() + 1\n", 345 | "nb_products = reduced_df['product'].max() + 1\n", 346 | "feature_dim = nb_customer + nb_products\n", 347 | "print(nb_customer, nb_products, feature_dim)" 348 | ] 349 | }, 350 | { 351 | "cell_type": "markdown", 352 | "metadata": {}, 353 | "source": [ 354 | "Trim down the data set to include only `customer`, `product`, and `star_rating`, which is all the training algorithm needs to build the model." 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": null, 360 | "metadata": {}, 361 | "outputs": [], 362 | "source": [ 363 | "product_df = reduced_df[['customer', 'product', 'star_rating']]\n", 364 | "product_df.head()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "markdown", 369 | "metadata": {}, 370 | "source": [ 371 | "### Prepare\n", 372 | "\n", 373 | "We will be using SageMaker's implementation of Factorization Machines (FM) for building a recommender system. 
The algorithm expects float32 tensors in protobuf format, whereas the data sets are pandas DataFrames on disk. Most of the conversion effort is handled by the Amazon SageMaker Python SDK.\n", 374 | "\n", 375 | "The FM algorithm works with sparse input, and since our data sets are dense matrices, they have to be converted to sparse matrices with one-hot encoded feature vectors for customers and products. Thus, each sample in the data set will be a wide boolean vector over a 178729-dimensional feature space (140344 customers + 38385 products), with only two values set to 1: one for the customer and one for the product.\n", 376 | "\n", 377 | "The next steps are as follows:\n", 378 | "\n", 379 | "1. Split the cleaned data set into train, validation, and test data sets.\n", 380 | "2. For each set, build a sparse matrix with one-hot encoded feature vectors (customers + products) and a label vector with star ratings.\n", 381 | "3. Convert the sets to protobuf encoded files.\n", 382 | "4. Copy these files to an Amazon S3 bucket.\n", 383 | "5. Configure and run a Factorization Machines training job on Amazon SageMaker.\n", 384 | "6. Deploy the corresponding model to an endpoint.\n", 385 | "7. Run predictions on the test data set and validate the results.\n", 386 | "\n", 387 | "#### Split into Training, Validation, and Test Data Sets\n", 388 | "\n", 389 | "Let's start by [splitting](https://docs.scipy.org/doc/numpy/reference/generated/numpy.split.html) into training, validation, and test sets. This will allow us to estimate the model's accuracy on videos our customers rated but that weren't included in our training. We will use the validation data set specifically for tuning model hyper-parameters." 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": {}, 396 | "outputs": [], 397 | "source": [ 398 | "train_df, validate_df, test_df = np.split(\n", 399 | " product_df.sample(frac=1), \n", 400 | " [int(.6*len(product_df)), int(.8*len(product_df))])" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": {}, 407 | "outputs": [], 408 | "source": [ 409 | "print(\"# of rows in the training data set = {:10d}\".format(train_df.shape[0]))\n", 410 | "print(\"# of rows in the validation data set = {:10d}\".format(validate_df.shape[0]))\n", 411 | "print(\"# of rows in the test data set = {:10d}\".format(test_df.shape[0]))" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": {}, 418 | "outputs": [], 419 | "source": [ 420 | "train_df.head()" 421 | ] 422 | }, 423 | { 424 | "cell_type": "markdown", 425 | "metadata": {}, 426 | "source": [ 427 | "Let's get the feature dimension by adding the total number of (unique) customers and products." 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "metadata": {}, 434 | "outputs": [], 435 | "source": [ 436 | "# get feature dimension\n", 437 | "all_df = pd.concat([train_df, validate_df, test_df])\n", 438 | "nb_customer = np.unique(all_df['customer'].values).shape[0]\n", 439 | "nb_products = np.unique(all_df['product'].values).shape[0]\n", 440 | "feature_dim = nb_customer + nb_products\n", 441 | "print(\"# of customers = {:10d}\".format(nb_customer))\n", 442 | "print(\"# of products = {:10d}\".format(nb_products))\n", 443 | "print(\"# of features = {:10d}\".format(feature_dim))" 444 | ] 445 | }, 446 | { 447 | "cell_type": "markdown", 448 | "metadata": {}, 449 | "source": [ 450 | "#### Building Sparse One-Hot Encoded Matrix\n", 451 | "\n", 452 | "Our training matrix is now even sparser: Of all 
183,833,321,511 values (1028559 rows * 178729 columns), only 2,057,118 are non-zero (1,028,559*2). In other words, the matrix is 99.99% sparse. Storing this as a dense matrix would be a massive waste of both storage and computing power. To avoid this, use a scipy.lil_matrix sparse matrix for features and a numpy array for ratings.\n", 453 | "\n", 454 | "Let's define a function that takes the data set and returns a sparse feature matrix and numpy array with ratings." 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": null, 460 | "metadata": {}, 461 | "outputs": [], 462 | "source": [ 463 | "def convert_sparse_matrix(df, nb_rows, nb_customer, nb_products):\n", 464 | " # dataframe to array\n", 465 | " df_val = df.values\n", 466 | "\n", 467 | " # determine feature size\n", 468 | " nb_cols = nb_customer + nb_products\n", 469 | " print(\"# of rows = {}\".format(str(nb_rows)))\n", 470 | " print(\"# of cols = {}\".format(str(nb_cols)))\n", 471 | "\n", 472 | " # extract customers and ratings\n", 473 | " df_X = df_val[:, 0:2]\n", 474 | " # Features are one-hot encoded in a sparse matrix\n", 475 | " X = lil_matrix((nb_rows, nb_cols)).astype('float32')\n", 476 | " df_X[:, 1] = nb_customer + df_X[:, 1]\n", 477 | " coords = df_X[:, 0:2]\n", 478 | " X[np.arange(nb_rows), coords[:, 0]] = 1\n", 479 | " X[np.arange(nb_rows), coords[:, 1]] = 1\n", 480 | "\n", 481 | " # create label with ratings\n", 482 | " Y = df_val[:, 2].astype('float32')\n", 483 | "\n", 484 | " # validate size and shape\n", 485 | " print(X.shape)\n", 486 | " print(Y.shape)\n", 487 | " assert X.shape == (nb_rows, nb_cols)\n", 488 | " assert Y.shape == (nb_rows, )\n", 489 | "\n", 490 | " return X, Y" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": null, 496 | "metadata": {}, 497 | "outputs": [], 498 | "source": [ 499 | "print(\"Convert training data set to one-hot encoded sparse matrix\")\n", 500 | "train_X, train_Y = convert_sparse_matrix(train_df, train_df.shape[0], nb_customer, nb_products)\n", 501 | "print(\"Convert validation data set to one-hot encoded sparse matrix\")\n", 502 | "validate_X, validate_Y = convert_sparse_matrix(validate_df, validate_df.shape[0], nb_customer, nb_products)\n", 503 | "print(\"Convert test data set to one-hot encoded sparse matrix\")\n", 504 | "test_X, test_Y = convert_sparse_matrix(test_df, test_df.shape[0], nb_customer, nb_products)" 505 | ] 506 | }, 507 | { 508 | "cell_type": "markdown", 509 | "metadata": {}, 510 | "source": [ 511 | "#### Convert to Protobuf format and Upload to S3" 512 | ] 513 | }, 514 | { 515 | "cell_type": "markdown", 516 | "metadata": {}, 517 | "source": [ 518 | "We will use Sagemaker's utility function [`write_spmatrix_to_sparse_tensor`](https://github.com/aws/sagemaker-python-sdk/blob/master/src/sagemaker/amazon/common.py) to convert scipy sparse matrix to protobuf format." 
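To make the one-hot layout and the protobuf conversion concrete before running the full-size cells below, here is a minimal, self-contained sketch. The toy sizes (3 customers, 2 products) and the two sample ratings are assumptions for illustration only; the sketch mirrors the same `lil_matrix` and `write_spmatrix_to_sparse_tensor` pattern used in this notebook.

```python
# Minimal sketch (illustrative only, not part of the pipeline): two (customer, product, rating)
# samples become one-hot rows in a scipy sparse matrix, then a RecordIO-protobuf buffer.
import io

import numpy as np
from scipy.sparse import lil_matrix
import sagemaker.amazon.common as smac

nb_customer, nb_products = 3, 2          # assumed toy sizes
samples = [(0, 1, 5.0),                  # customer 0 rated product 1 with 5 stars
           (2, 0, 3.0)]                  # customer 2 rated product 0 with 3 stars

X = lil_matrix((len(samples), nb_customer + nb_products), dtype='float32')
Y = np.zeros(len(samples), dtype='float32')
for row, (customer, product, rating) in enumerate(samples):
    X[row, customer] = 1                 # one-hot slot for the customer
    X[row, nb_customer + product] = 1    # one-hot slot for the product, offset by nb_customer
    Y[row] = rating

buf = io.BytesIO()
smac.write_spmatrix_to_sparse_tensor(buf, X, Y)   # same utility the notebook uses
buf.seek(0)

print(X.toarray())                       # [[1. 0. 0. 0. 1.]
                                         #  [0. 0. 1. 1. 0.]]
print(len(buf.getvalue()), "bytes of RecordIO-protobuf")
```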
519 | ] 520 | }, 521 | { 522 | "cell_type": "code", 523 | "execution_count": null, 524 | "metadata": {}, 525 | "outputs": [], 526 | "source": [ 527 | "def save_as_protobuf(X, Y, bucket, key):\n", 528 | " \"\"\"Converts features and predictions matrices to recordio protobuf and\n", 529 | " writes to S3\n", 530 | "\n", 531 | " Args:\n", 532 | " X:\n", 533 | " 2D numpy matrix with features\n", 534 | " Y:\n", 535 | " 1D numpy matrix with predictions\n", 536 | " bucket:\n", 537 | " s3 bucket where recordio protobuf file will be staged\n", 538 | " prefix:\n", 539 | " s3 url prefix to stage prepared data to use for training the model\n", 540 | " key:\n", 541 | " protobuf file name to be staged\n", 542 | "\n", 543 | " Returns:\n", 544 | " s3 url with key to the protobuf data\n", 545 | " \"\"\"\n", 546 | " buf = io.BytesIO()\n", 547 | " smac.write_spmatrix_to_sparse_tensor(buf, X, Y)\n", 548 | " buf.seek(0)\n", 549 | " obj = '{}'.format(key)\n", 550 | " boto3.resource('s3').Bucket(bucket).Object(obj).upload_fileobj(buf)\n", 551 | " return 's3://{}/{}'.format(bucket, obj)" 552 | ] 553 | }, 554 | { 555 | "cell_type": "code", 556 | "execution_count": null, 557 | "metadata": {}, 558 | "outputs": [], 559 | "source": [ 560 | "s3_train_path = save_as_protobuf(train_X, train_Y, bucket, 'prepare/train/train.protobuf')\n", 561 | "print(\"Training data set in protobuf format uploaded at {}\".format(s3_train_path))\n", 562 | "s3_val_path = save_as_protobuf(validate_X, validate_Y, bucket, 'prepare/validate/validate.protobuf')\n", 563 | "print(\"Validation data set in protobuf format uploaded at {}\".format(s3_val_path))" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "We will chunk the test data to avoid the payload size issues when performing batch predictions." 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": null, 576 | "metadata": {}, 577 | "outputs": [], 578 | "source": [ 579 | "def chunk(x, batch_size):\n", 580 | " \"\"\"split array into chunks of batch_size\n", 581 | " \"\"\"\n", 582 | " chunk_range = range(0, x.shape[0], batch_size)\n", 583 | " chunks = [x[p: p + batch_size] for p in chunk_range]\n", 584 | " return chunks" 585 | ] 586 | }, 587 | { 588 | "cell_type": "code", 589 | "execution_count": null, 590 | "metadata": {}, 591 | "outputs": [], 592 | "source": [ 593 | "test_x_chunks = chunk(test_X, 10000)\n", 594 | "test_y_chunks = chunk(test_Y, 10000)\n", 595 | "N = len(test_x_chunks)\n", 596 | "for i in range(N):\n", 597 | " test_data = save_as_protobuf(\n", 598 | " test_x_chunks[i],\n", 599 | " test_y_chunks[i],\n", 600 | " bucket,\n", 601 | " \"prepare/test/test_\" + str(i) + \".protobuf\")\n", 602 | " print(test_data)" 603 | ] 604 | }, 605 | { 606 | "cell_type": "markdown", 607 | "metadata": {}, 608 | "source": [ 609 | "---" 610 | ] 611 | }, 612 | { 613 | "cell_type": "markdown", 614 | "metadata": {}, 615 | "source": [ 616 | "## Model Training\n", 617 | "\n", 618 | "Once we have the data preprocessed and available in the correct format for training, the next step is to actually train the model using the data. We'll use the Amazon SageMaker Python SDK to kick off training and monitor status until it is completed. In this example that takes between 4-7 minutes for 3-10 epochs. 
\n", 619 | "\n", 620 | "First, let's get the Sagemaker Factorization Machine container" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": null, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "from sagemaker.amazon.amazon_estimator import get_image_uri\n", 630 | "container = get_image_uri(boto3.Session().region_name, 'factorization-machines')" 631 | ] 632 | }, 633 | { 634 | "cell_type": "markdown", 635 | "metadata": {}, 636 | "source": [ 637 | "Next kick off the base estimator, making sure to pass in the necessary hyperparameters. Notice:\n", 638 | "\n", 639 | "- `feature_dim` is set to 178729, which is the number of customers + products in the training data set.\n", 640 | "- `predictor_type` is set to 'regressor' since we are trying to predict the rating\n", 641 | "- `mini_batch_size` is set to 200. This value can be tuned for relatively minor improvements in fit and speed, but selecting a reasonable value relative to the dataset is appropriate in most cases.\n", 642 | "- `num_factors` is set to 64. Factorization machines find a lower dimensional representation of the interactions for all features. Making this value smaller provides a more parsimonious model, closer to a linear model, but may sacrifice information about interactions. Making it larger provides a higher-dimensional representation of feature interactions, but adds computational complexity and can lead to overfitting. In a practical application, time should be invested to tune this parameter to the appropriate value." 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": null, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "%time\n", 652 | "\n", 653 | "output_location = 's3://{}/train/'.format(bucket)\n", 654 | "s3_train_path = 's3://{}/prepare/train/train.protobuf'.format(bucket)\n", 655 | "s3_val_path = 's3://{}/prepare/validate/validate.protobuf'.format(bucket)\n", 656 | "\n", 657 | "fm = sagemaker.estimator.Estimator(container,\n", 658 | " role, \n", 659 | " train_instance_count=1, \n", 660 | " train_instance_type='ml.c5.4xlarge',\n", 661 | " output_path=output_location,\n", 662 | " sagemaker_session=sess)\n", 663 | "\n", 664 | "fm.set_hyperparameters(feature_dim=feature_dim,\n", 665 | " predictor_type='regressor',\n", 666 | " mini_batch_size=200,\n", 667 | " num_factors=512,\n", 668 | " bias_lr=0.02,\n", 669 | " epochs=10)" 670 | ] 671 | }, 672 | { 673 | "cell_type": "code", 674 | "execution_count": null, 675 | "metadata": {}, 676 | "outputs": [], 677 | "source": [ 678 | "fm.fit({'train': s3_train_path,'test': s3_val_path}, wait=False)" 679 | ] 680 | }, 681 | { 682 | "cell_type": "markdown", 683 | "metadata": {}, 684 | "source": [ 685 | "Amazon SageMaker built-in algorithms automatically compute and emit a variety of model training, evaluation, and validation metrics that can be captured from Cloudwatch using Sagemaker SDK. 
Since we are using the FM built-in algorithm with the predictor type set to `regressor`, we can capture the model's RMSE (root-mean-square error), which measures the differences between the predicted values and the actual values.\n", 686 | "\n", 687 | "Let's capture the RMSE of the model." 688 | ] 689 | }, 690 | { 691 | "cell_type": "code", 692 | "execution_count": null, 693 | "metadata": {}, 694 | "outputs": [], 695 | "source": [ 696 | "training_job_name = fm._current_job_name\n", 697 | "metric_name = 'train:rmse:epoch'" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": null, 703 | "metadata": {}, 704 | "outputs": [], 705 | "source": [ 706 | "# run this cell to check the current status of the training job\n", 707 | "fm_training_job_result = smclient.describe_training_job(TrainingJobName=training_job_name)\n", 708 | "\n", 709 | "status = fm_training_job_result['TrainingJobStatus']\n", 710 | "if status != 'Completed':\n", 711 | " print('Reminder: the training job has not been completed.')\n", 712 | "else:\n", 713 | " print('The training job is completed')" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": null, 719 | "metadata": {}, 720 | "outputs": [], 721 | "source": [ 722 | "# plug in the training job name and the metrics to be captured\n", 723 | "metrics_dataframe = TrainingJobAnalytics(training_job_name=training_job_name,metric_names=[metric_name]).dataframe()\n", 724 | "metrics_dataframe" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": null, 730 | "metadata": {}, 731 | "outputs": [], 732 | "source": [ 733 | "ax = metrics_dataframe.plot(kind='line', figsize=(12,5), x='timestamp', y='value', style='b.', legend=False)  # use a distinct name so we don't shadow matplotlib's plt\n", 734 | "ax.set_ylabel(metric_name);" 735 | ] 736 | }, 737 | { 738 | "cell_type": "markdown", 739 | "metadata": {}, 740 | "source": [ 741 | "As the number of epochs increases, the RMSE goes down, which is a good sign that the predicted values are getting closer to the actual ratings. We can increase the number of epochs or change the hyperparameters to tweak the model further. Let's deploy this model and make predictions to see how close they are. Then we can run a hyper-parameter tuning job to determine the best model." 
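As a pointer for that last step, the snippet below is a hedged sketch of how such a tuning job could be launched with the `HyperparameterTuner` and `ContinuousParameter` classes already imported at the top of this notebook. The tuned ranges, job counts, and job name are illustrative assumptions, not the notebook's actual tuning configuration.

```python
# Hedged sketch (assumed ranges and job sizes): tune the estimator `fm` defined above,
# minimizing the RMSE that the FM algorithm reports on the 'test' channel.
tuner = HyperparameterTuner(
    estimator=fm,
    objective_metric_name='test:rmse',
    objective_type='Minimize',
    hyperparameter_ranges={
        'factors_lr': ContinuousParameter(0.0001, 0.2),
        'factors_init_sigma': ContinuousParameter(0.0001, 1.0),
    },
    max_jobs=20,
    max_parallel_jobs=2,
    base_tuning_job_name='hpo-recommender',
)

# Reuse the same channels that were passed to fm.fit()
tuner.fit({'train': s3_train_path, 'test': s3_val_path}, wait=False)

# After the job completes, the per-trial results can be pulled into a dataframe:
# HyperparameterTuningJobAnalytics(tuner.latest_tuning_job.job_name).dataframe()
```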
742 | ] 743 | }, 744 | { 745 | "cell_type": "markdown", 746 | "metadata": {}, 747 | "source": [ 748 | "#### Utility Functions" 749 | ] 750 | }, 751 | { 752 | "cell_type": "markdown", 753 | "metadata": {}, 754 | "source": [ 755 | "We will define some common utility functions here that will be used during inference and evaluating results" 756 | ] 757 | }, 758 | { 759 | "cell_type": "code", 760 | "execution_count": null, 761 | "metadata": {}, 762 | "outputs": [], 763 | "source": [ 764 | "def convert_to_protobuf(X, Y=None):\n", 765 | " buf = io.BytesIO()\n", 766 | " smac.write_spmatrix_to_sparse_tensor(buf, X, Y)\n", 767 | " buf.seek(0)\n", 768 | " return buf" 769 | ] 770 | }, 771 | { 772 | "cell_type": "code", 773 | "execution_count": null, 774 | "metadata": {}, 775 | "outputs": [], 776 | "source": [ 777 | "def convert_sparse_matrix_X(df, nb_rows, nb_customer, nb_products):\n", 778 | " # dataframe to array\n", 779 | " df_val = df.values\n", 780 | "\n", 781 | " # determine feature size\n", 782 | " nb_cols = nb_customer + nb_products\n", 783 | " \n", 784 | " # extract customers and ratings\n", 785 | " df_X = df_val[:,0:2]\n", 786 | " # Features are one-hot encoded in a sparse matrix\n", 787 | " X = lil_matrix((nb_rows, nb_cols)).astype('float32')\n", 788 | " df_X[:,1] = nb_customer + df_X[:,1]\n", 789 | " coords = df_X[:,0:2]\n", 790 | " X[np.arange(nb_rows), coords[:, 0]] = 1\n", 791 | " X[np.arange(nb_rows), coords[:, 1]] = 1\n", 792 | "\n", 793 | " # validate size and shape\n", 794 | " assert X.shape == (nb_rows, nb_cols)\n", 795 | " \n", 796 | " return X" 797 | ] 798 | }, 799 | { 800 | "cell_type": "markdown", 801 | "metadata": {}, 802 | "source": [ 803 | "---" 804 | ] 805 | }, 806 | { 807 | "cell_type": "markdown", 808 | "metadata": {}, 809 | "source": [ 810 | "## Inference " 811 | ] 812 | }, 813 | { 814 | "cell_type": "markdown", 815 | "metadata": {}, 816 | "source": [ 817 | "### Real-Time Inference " 818 | ] 819 | }, 820 | { 821 | "cell_type": "markdown", 822 | "metadata": {}, 823 | "source": [ 824 | "Since the model is trained, all it takes to deploy the model is a Sagemaker API call `deploy()` that creates the model package, sets up endpoint configuration and finally creates the endpoint." 825 | ] 826 | }, 827 | { 828 | "cell_type": "code", 829 | "execution_count": null, 830 | "metadata": {}, 831 | "outputs": [], 832 | "source": [ 833 | "fm_predictor = fm.deploy(instance_type='ml.c4.xlarge', initial_instance_count=1)" 834 | ] 835 | }, 836 | { 837 | "cell_type": "markdown", 838 | "metadata": {}, 839 | "source": [ 840 | "Predictions could be done by sending HTTP POST requests from a separate web service, but to keep things easy, we'll just use the `.predict()` method from the SageMaker Python SDK. The API expects JSON or RecordIO format for request and JSON for response data." 
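For completeness, the snippet below is a small sketch (an illustration, not part of the notebook's flow) of what that separate-service path could look like using the low-level `sagemaker-runtime` client; it reuses the `convert_to_protobuf` helper defined above and parses the same JSON response shape.

```python
# Hedged sketch: invoke the deployed endpoint directly via boto3 instead of the SDK predictor.
import json

import boto3

runtime = boto3.client('sagemaker-runtime')

payload = convert_to_protobuf(test_X[1000:1010]).getvalue()   # helper defined above
response = runtime.invoke_endpoint(
    EndpointName=fm_predictor.endpoint,                       # endpoint name created by deploy()
    ContentType='application/x-recordio-protobuf',
    Body=payload,
)
scores = [round(p['score'], 2) for p in json.loads(response['Body'].read())['predictions']]
print(scores)
```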
841 | ] 842 | }, 843 | { 844 | "cell_type": "code", 845 | "execution_count": null, 846 | "metadata": {}, 847 | "outputs": [], 848 | "source": [ 849 | "fm_predictor.content_type = 'application/x-recordio-protobuf'" 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "Let's test the model with sample ratings from test data set using `predict()` API call" 857 | ] 858 | }, 859 | { 860 | "cell_type": "code", 861 | "execution_count": null, 862 | "metadata": {}, 863 | "outputs": [], 864 | "source": [ 865 | "test_pb = convert_to_protobuf(test_X[1000:1010]).getvalue()" 866 | ] 867 | }, 868 | { 869 | "cell_type": "code", 870 | "execution_count": null, 871 | "metadata": {}, 872 | "outputs": [], 873 | "source": [ 874 | "response = fm_predictor.predict(test_pb)\n", 875 | "response" 876 | ] 877 | }, 878 | { 879 | "cell_type": "code", 880 | "execution_count": null, 881 | "metadata": {}, 882 | "outputs": [], 883 | "source": [ 884 | "predicted = [round(r['score'], 2) for r in json.loads(response)['predictions']]\n", 885 | "predicted" 886 | ] 887 | }, 888 | { 889 | "cell_type": "code", 890 | "execution_count": null, 891 | "metadata": {}, 892 | "outputs": [], 893 | "source": [ 894 | "results_df = pd.DataFrame(zip(test_Y[1000:1010], predicted), columns = [\"actual_rating\", \"predicted_rating\"])\n", 895 | "results_df" 896 | ] 897 | }, 898 | { 899 | "cell_type": "markdown", 900 | "metadata": {}, 901 | "source": [ 902 | "---" 903 | ] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "### Batch Inference" 910 | ] 911 | }, 912 | { 913 | "cell_type": "markdown", 914 | "metadata": {}, 915 | "source": [ 916 | "Here we will perform batch inference on the test data set prepared earlier (chunking into multiple protobuf files). 
To run batch transform, create a model package for the transform endpoint " 917 | ] 918 | }, 919 | { 920 | "cell_type": "markdown", 921 | "metadata": {}, 922 | "source": [ 923 | "- Create the model from the training estimator" 924 | ] 925 | }, 926 | { 927 | "cell_type": "code", 928 | "execution_count": null, 929 | "metadata": {}, 930 | "outputs": [], 931 | "source": [ 932 | "fm_model = fm.create_model()" 933 | ] 934 | }, 935 | { 936 | "cell_type": "markdown", 937 | "metadata": {}, 938 | "source": [ 939 | "- Perform batch inference on the test data set and save results to S3" 940 | ] 941 | }, 942 | { 943 | "cell_type": "code", 944 | "execution_count": null, 945 | "metadata": {}, 946 | "outputs": [], 947 | "source": [ 948 | "fm_transformer = fm_model.transformer(\n", 949 | " instance_type='ml.c4.xlarge', \n", 950 | " instance_count=1, \n", 951 | " strategy=\"MultiRecord\", \n", 952 | " output_path=\"s3://{}/transform/\".format(bucket)\n", 953 | ")" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": {}, 960 | "outputs": [], 961 | "source": [ 962 | "fm_transformer.transform(\n", 963 | " data=\"s3://{}/prepare/test/\".format(bucket), \n", 964 | " data_type='S3Prefix', \n", 965 | " content_type=\"application/x-recordio-protobuf\")" 966 | ] 967 | }, 968 | { 969 | "cell_type": "code", 970 | "execution_count": null, 971 | "metadata": {}, 972 | "outputs": [], 973 | "source": [ 974 | "print('Waiting for transform job: ' + fm_transformer.latest_transform_job.job_name)\n", 975 | "fm_transformer.wait()" 976 | ] 977 | }, 978 | { 979 | "cell_type": "markdown", 980 | "metadata": {}, 981 | "source": [ 982 | "- Inference results will be stored in a separate file for each test file chunk. Let's download the results from S3 and merge them" 983 | ] 984 | }, 985 | { 986 | "cell_type": "code", 987 | "execution_count": null, 988 | "metadata": {}, 989 | "outputs": [], 990 | "source": [ 991 | "def download_from_s3(bucket, key):\n", 992 | " s3 = boto3.resource('s3')\n", 993 | " obj = s3.Object( bucket, key)\n", 994 | " content = obj.get()['Body'].read()\n", 995 | " return content" 996 | ] 997 | }, 998 | { 999 | "cell_type": "code", 1000 | "execution_count": null, 1001 | "metadata": {}, 1002 | "outputs": [], 1003 | "source": [ 1004 | "test_preds = []\n", 1005 | "for i in range(N):\n", 1006 | " key = 'transform/test_' + str(i) + '.protobuf.out'\n", 1007 | " response = download_from_s3(bucket, key)\n", 1008 | " result = [json.loads(row)[\"score\"] for row in response.split(\"\\n\") if len(row) > 0]\n", 1009 | " test_preds.extend(result)" 1010 | ] 1011 | }, 1012 | { 1013 | "cell_type": "code", 1014 | "execution_count": null, 1015 | "metadata": {}, 1016 | "outputs": [], 1017 | "source": [ 1018 | "test_preds = np.array(test_preds)" 1019 | ] 1020 | }, 1021 | { 1022 | "cell_type": "code", 1023 | "execution_count": null, 1024 | "metadata": {}, 1025 | "outputs": [], 1026 | "source": [ 1027 | "test_preds.shape" 1028 | ] 1029 | }, 1030 | { 1031 | "cell_type": "markdown", 1032 | "metadata": {}, 1033 | "source": [ 1034 | "---" 1035 | ] 1036 | }, 1037 | { 1038 | "cell_type": "markdown", 1039 | "metadata": {}, 1040 | "source": [ 1041 | "## Evaluate Model Performance\n", 1042 | "\n", 1043 | "Let's start by calculating a naive baseline to approximate how well our model is doing. 
The simplest estimate would be to assume every user-item rating is just the average rating over all ratings.\n", 1044 | "\n", 1045 | "*Note: we could do better by using each individual video's average; however, in this case it doesn't really matter, as the same conclusions would hold.*" 1046 | ] 1047 | }, 1048 | { 1049 | "cell_type": "code", 1050 | "execution_count": null, 1051 | "metadata": {}, 1052 | "outputs": [], 1053 | "source": [ 1054 | "print('Naive MSE:', np.mean((test_df['star_rating'] - np.mean(train_df['star_rating'])) ** 2))" 1055 | ] 1056 | }, 1057 | { 1058 | "cell_type": "markdown", 1059 | "metadata": {}, 1060 | "source": [ 1061 | "Now, let's calculate the mean squared error of the batch predictions on our test dataset." 1062 | ] 1063 | }, 1064 | { 1065 | "cell_type": "code", 1066 | "execution_count": null, 1067 | "metadata": {}, 1068 | "outputs": [], 1069 | "source": [ 1070 | "print('MSE:', np.mean((test_Y - test_preds) ** 2))" 1071 | ] 1072 | }, 1073 | { 1074 | "cell_type": "markdown", 1075 | "metadata": {}, 1076 | "source": [ 1077 | "We can see that our factorization machine model produces substantially better results than the naive baseline (~1.44 vs. ~1.13 mean squared error).\n", 1078 | "\n", 1079 | "For recommender systems, subjective accuracy also matters. Let's get some recommendations for a random user to see if they make intuitive sense." 1080 | ] 1081 | }, 1082 | { 1083 | "cell_type": "code", 1084 | "execution_count": null, 1085 | "metadata": {}, 1086 | "outputs": [], 1087 | "source": [ 1088 | "df_customer_6 = reduced_df[reduced_df['customer'] == 6].sort_values(['star_rating', 'product'], ascending=[False, True])\n", 1089 | "pd.concat((df_customer_6.head(10), df_customer_6.tail(10)))" 1090 | ] 1091 | }, 1092 | { 1093 | "cell_type": "markdown", 1094 | "metadata": {}, 1095 | "source": [ 1096 | "As we can see, user #6 seems to like sprawling dramatic television series and sci-fi, but they dislike silly comedies.\n", 1097 | "\n", 1098 | "Now we'll loop through and predict user #6's ratings for every video in the reduced catalog, to see which ones we'd recommend and which ones we wouldn't."
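, "\n", "As a reminder of the feature layout produced by `convert_sparse_matrix_X` above, each (customer, product) pair becomes one row with `nb_customer + nb_products` columns and exactly two non-zero entries; the tiny sketch below uses made-up sizes purely for illustration:\n", "\n", "```python\n", "# illustrative only: 4 customers and 3 products -> 7 one-hot feature columns\n", "# a rating by customer 2 for product 1 maps to hot columns [2, 4 + 1] = [2, 5]:\n", "# [0, 0, 1, 0,  0, 1, 0]\n", "#  customers    products\n", "example_customer, example_product = 2, 1\n", "hot_columns = [example_customer, 4 + example_product]\n", "```"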
1099 | ] 1100 | }, 1101 | { 1102 | "cell_type": "code", 1103 | "execution_count": null, 1104 | "metadata": {}, 1105 | "outputs": [], 1106 | "source": [ 1107 | "def create_payload(cust_id, nb_customer, nb_products, product_index):\n", 1108 | " # prepare a payload covering every product for the given customer\n", 1109 | " c = [cust_id] * nb_products\n", 1110 | " p = product_index['product'].values\n", 1111 | " x = pd.DataFrame(zip(c,p))\n", 1112 | " p_x = convert_sparse_matrix_X(x, x.shape[0], nb_customer, nb_products)\n", 1113 | " x_pb = convert_to_protobuf(p_x)\n", 1114 | " return x_pb" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "code", 1119 | "execution_count": null, 1120 | "metadata": {}, 1121 | "outputs": [], 1122 | "source": [ 1123 | "x_pb = create_payload(6, nb_customer, nb_products, product_index)" 1124 | ] 1125 | }, 1126 | { 1127 | "cell_type": "code", 1128 | "execution_count": null, 1129 | "metadata": {}, 1130 | "outputs": [], 1131 | "source": [ 1132 | "# make predictions using the endpoint created in the Real-Time Inference section\n", 1133 | "response = fm_predictor.predict(x_pb)\n", 1134 | "predictions = [round(r['score'], 2) for r in json.loads(response)['predictions']]" 1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": null, 1140 | "metadata": {}, 1141 | "outputs": [], 1142 | "source": [ 1143 | "predictions_df = pd.DataFrame({'product': product_index['product'],\n", 1144 | " 'prediction': predictions})" 1145 | ] 1146 | }, 1147 | { 1148 | "cell_type": "code", 1149 | "execution_count": null, 1150 | "metadata": {}, 1151 | "outputs": [], 1152 | "source": [ 1153 | "df_results_cust_6 = df_customer_6.merge(predictions_df, on=['product'])[['customer', 'customer_id', 'product', 'product_id', 'product_title', 'star_rating', 'prediction']]\n", 1154 | "df_results_cust_6.sort_values(['prediction', 'product'], ascending=[False, True])" 1155 | ] 1156 | }, 1157 | { 1158 | "cell_type": "markdown", 1159 | "metadata": {}, 1160 | "source": [ 1161 | "Indeed, our predicted highly rated shows include some well-reviewed TV dramas and some sci-fi. Meanwhile, our bottom-rated shows include goofball comedies.\n", 1162 | "\n", 1163 | "*Note: because of random initialization of the model parameters, results on subsequent runs may differ slightly.*\n", 1164 | "\n", 1165 | "Let's also confirm that the predictions for a different user (user #7) are not almost perfectly correlated with those for user #6, i.e. that the model is personalizing rather than scoring every customer the same way." 1166 | ] 1167 | }, 1168 | { 1169 | "cell_type": "code", 1170 | "execution_count": null, 1171 | "metadata": {}, 1172 | "outputs": [], 1173 | "source": [ 1174 | "x_pb = create_payload(7, nb_customer, nb_products, product_index)\n", 1175 | "response = fm_predictor.predict(x_pb)\n", 1176 | "predictions_user7 = [round(r['score'], 2) for r in json.loads(response)['predictions']]" 1177 | ] 1178 | }, 1179 | { 1180 | "cell_type": "code", 1181 | "execution_count": null, 1182 | "metadata": {}, 1183 | "outputs": [], 1184 | "source": [ 1185 | "plt.scatter(predictions_df['prediction'], np.array(predictions_user7))\n", 1186 | "plt.show()" 1187 | ] 1188 | }, 1189 | { 1190 | "cell_type": "markdown", 1191 | "metadata": {}, 1192 | "source": [ 1193 | "---" 1194 | ] 1195 | }, 1196 | { 1197 | "cell_type": "markdown", 1198 | "metadata": {}, 1199 | "source": [ 1200 | "## Model Tuning\n", 1201 | "\n", 1202 | "So far, we have developed a factorization machine model to predict customer ratings, but the model could be improved further by various techniques. In this section, let's see if tuning the hyperparameters of the Factorization Machines algorithm makes the model any better."
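, "\n", "As a reference point to beat, here is the RMSE of the default (untuned) model on the test set, computed from the batch-transform predictions gathered earlier (a small sketch that assumes `test_Y` and `test_preds` from the evaluation section are still in memory):\n", "\n", "```python\n", "import numpy as np\n", "\n", "# RMSE of the default model, for comparison with the tuning objective test:rmse\n", "print('Default-model RMSE:', np.sqrt(np.mean((test_Y - test_preds) ** 2)))\n", "```"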
1203 | ] 1204 | }, 1205 | { 1206 | "cell_type": "code", 1207 | "execution_count": null, 1208 | "metadata": {}, 1209 | "outputs": [], 1210 | "source": [ 1211 | "output_location = 's3://{}/train/'.format(bucket)\n", 1212 | "s3_train_path = 's3://{}/prepare/train/train.protobuf'.format(bucket)\n", 1213 | "s3_val_path = 's3://{}/prepare/validate/validate.protobuf'.format(bucket)" 1214 | ] 1215 | }, 1216 | { 1217 | "cell_type": "markdown", 1218 | "metadata": {}, 1219 | "source": [ 1220 | "- Let's create an estimator with the Factorization Machines container, similar to the one we defined when training the model. Also, set the initial hyperparameters that we know worked before." 1221 | ] 1222 | }, 1223 | { 1224 | "cell_type": "code", 1225 | "execution_count": null, 1226 | "metadata": {}, 1227 | "outputs": [], 1228 | "source": [ 1229 | "fm_estimator = sagemaker.estimator.Estimator(container,\n", 1230 | " role, \n", 1231 | " train_instance_count=1, \n", 1232 | " train_instance_type='ml.c5.4xlarge',\n", 1233 | " output_path=output_location,\n", 1234 | " sagemaker_session=sess)" 1235 | ] 1236 | }, 1237 | { 1238 | "cell_type": "code", 1239 | "execution_count": null, 1240 | "metadata": {}, 1241 | "outputs": [], 1242 | "source": [ 1243 | "fm_estimator.set_hyperparameters(\n", 1244 | " feature_dim=feature_dim,\n", 1245 | " predictor_type='regressor',\n", 1246 | " mini_batch_size=200,\n", 1247 | " num_factors=512,\n", 1248 | " bias_lr=0.02,\n", 1249 | " epochs=20)" 1250 | ] 1251 | }, 1252 | { 1253 | "cell_type": "markdown", 1254 | "metadata": {}, 1255 | "source": [ 1256 | "- Find the best hyperparameters with SageMaker's Automatic Model Tuning. The following hyperparameters will be tuned:\n", 1257 | " - ***factors_lr:*** The learning rate for factorization terms.\n", 1258 | " - ***factors_init_sigma:*** The standard deviation for initialization of factorization terms. 
Takes effect if factors_init_method is set to normal.\n", 1259 | " \n", 1260 | "\n", 1261 | "- Define the hyperparameter tuning ranges to be searched and set the objective metric" 1262 | ] 1263 | }, 1264 | { 1265 | "cell_type": "code", 1266 | "execution_count": null, 1267 | "metadata": {}, 1268 | "outputs": [], 1269 | "source": [ 1270 | "hyperparameter_ranges= {\n", 1271 | " \"factors_lr\": ContinuousParameter(0.0001, 0.2),\n", 1272 | " \"factors_init_sigma\": ContinuousParameter(0.0001, 1)\n", 1273 | "}" 1274 | ] 1275 | }, 1276 | { 1277 | "cell_type": "markdown", 1278 | "metadata": {}, 1279 | "source": [ 1280 | "- Now that we have our ranges defined we want to define our success metric" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "code", 1285 | "execution_count": null, 1286 | "metadata": {}, 1287 | "outputs": [], 1288 | "source": [ 1289 | "objective_metric_name = \"test:rmse\"\n", 1290 | "objective_type = \"Minimize\"" 1291 | ] 1292 | }, 1293 | { 1294 | "cell_type": "markdown", 1295 | "metadata": {}, 1296 | "source": [ 1297 | "- Start hyperparameter tuning job with the ranges defined" 1298 | ] 1299 | }, 1300 | { 1301 | "cell_type": "code", 1302 | "execution_count": null, 1303 | "metadata": {}, 1304 | "outputs": [], 1305 | "source": [ 1306 | "fm_tuner = HyperparameterTuner(\n", 1307 | " estimator=fm_estimator,\n", 1308 | " objective_metric_name=objective_metric_name, \n", 1309 | " hyperparameter_ranges=hyperparameter_ranges,\n", 1310 | " objective_type=objective_type,\n", 1311 | " max_jobs=10,\n", 1312 | " max_parallel_jobs=2\n", 1313 | ")" 1314 | ] 1315 | }, 1316 | { 1317 | "cell_type": "code", 1318 | "execution_count": null, 1319 | "metadata": {}, 1320 | "outputs": [], 1321 | "source": [ 1322 | "timestamp_prefix = time.strftime(\"%Y%m%d-%H%M%S\", time.gmtime())\n", 1323 | "fm_tuner_job_name = 'hpo-fm-' + timestamp_prefix" 1324 | ] 1325 | }, 1326 | { 1327 | "cell_type": "code", 1328 | "execution_count": null, 1329 | "metadata": {}, 1330 | "outputs": [], 1331 | "source": [ 1332 | "fm_tuner.fit({'train': s3_train_path, 'test': s3_val_path}, job_name=fm_tuner_job_name, wait=False)" 1333 | ] 1334 | }, 1335 | { 1336 | "cell_type": "markdown", 1337 | "metadata": {}, 1338 | "source": [ 1339 | "- Track hyperparameter tuning job progress" 1340 | ] 1341 | }, 1342 | { 1343 | "cell_type": "code", 1344 | "execution_count": null, 1345 | "metadata": {}, 1346 | "outputs": [], 1347 | "source": [ 1348 | "# run this cell to check current status of hyperparameter tuning job\n", 1349 | "tuning_job_result = smclient.describe_hyper_parameter_tuning_job(HyperParameterTuningJobName=fm_tuner_job_name)\n", 1350 | "\n", 1351 | "status = tuning_job_result['HyperParameterTuningJobStatus']\n", 1352 | "if status != 'Completed':\n", 1353 | " print('Reminder: the tuning job has not been completed.')\n", 1354 | " \n", 1355 | "job_count = tuning_job_result['TrainingJobStatusCounters']['Completed']\n", 1356 | "print(\"%d training jobs have completed\" % job_count)\n", 1357 | " \n", 1358 | "is_minimize = (tuning_job_result['HyperParameterTuningJobConfig']['HyperParameterTuningJobObjective']['Type'] != 'Maximize')\n", 1359 | "objective_name = tuning_job_result['HyperParameterTuningJobConfig']['HyperParameterTuningJobObjective']['MetricName']" 1360 | ] 1361 | }, 1362 | { 1363 | "cell_type": "markdown", 1364 | "metadata": {}, 1365 | "source": [ 1366 | "* Analyze Hyper-Parameter Tuning Job Results" 1367 | ] 1368 | }, 1369 | { 1370 | "cell_type": "code", 1371 | "execution_count": null, 1372 | "metadata": {}, 1373 | "outputs": 
[], 1374 | "source": [ 1375 | "# plug in the tuning job name and retrieve the per-training-job results\n", 1376 | "fm_tuner_analytics = HyperparameterTuningJobAnalytics(hyperparameter_tuning_job_name=fm_tuner_job_name)\n", 1377 | "df_fm_tuner_metrics = fm_tuner_analytics.dataframe()\n", 1378 | "df_fm_tuner_metrics" 1379 | ] 1380 | }, 1381 | { 1382 | "cell_type": "code", 1383 | "execution_count": null, 1384 | "metadata": {}, 1385 | "outputs": [], 1386 | "source": [ 1387 | "# plot the final objective value of each training job over time\n", 1388 | "ax = df_fm_tuner_metrics.plot(kind='line', figsize=(12,5), x='TrainingStartTime', \n", 1389 | " y='FinalObjectiveValue', \n", 1390 | " style='b.', legend=False)\n", 1391 | "ax.set_ylabel(objective_metric_name);" 1392 | ] 1393 | }, 1394 | { 1395 | "cell_type": "markdown", 1396 | "metadata": {}, 1397 | "source": [ 1398 | "- Best Factorization Machine Model after Hyper-Parameter Optimization" 1399 | ] 1400 | }, 1401 | { 1402 | "cell_type": "code", 1403 | "execution_count": null, 1404 | "metadata": {}, 1405 | "outputs": [], 1406 | "source": [ 1407 | "print(\"fm_tuner_job_name: \" + fm_tuner_job_name)\n", 1408 | "fm_tuner = HyperparameterTuner.attach(fm_tuner_job_name)\n", 1409 | "\n", 1410 | "fm_tuner_analytics = HyperparameterTuningJobAnalytics(hyperparameter_tuning_job_name=fm_tuner_job_name)\n", 1411 | "df_fm_tuner_metrics = fm_tuner_analytics.dataframe()\n", 1412 | "\n", 1413 | "fm_best_model_name = fm_tuner.best_training_job()\n", 1414 | "print(\"fm_best_model_name: \" + fm_best_model_name)\n", 1415 | "\n", 1416 | "fm_model_info = smclient.describe_training_job(TrainingJobName=fm_best_model_name)" 1417 | ] 1418 | }, 1419 | { 1420 | "cell_type": "code", 1421 | "execution_count": null, 1422 | "metadata": {}, 1423 | "outputs": [], 1424 | "source": [ 1425 | "df_fm_tuner_metrics[df_fm_tuner_metrics['TrainingJobName']==fm_best_model_name]" 1426 | ] 1427 | }, 1428 | { 1429 | "cell_type": "markdown", 1430 | "metadata": {}, 1431 | "source": [ 1432 | "- Let's evaluate the results with the best training job from the hyperparameter tuning job." 1433 | ] 1434 | }, 1435 | { 1436 | "cell_type": "markdown", 1437 | "metadata": {}, 1438 | "source": [ 1439 | "We can deploy an endpoint from the best training job of the hyperparameter tuning job and test the predictions." 1440 | ] 1441 | }, 1442 | { 1443 | "cell_type": "code", 1444 | "execution_count": null, 1445 | "metadata": {}, 1446 | "outputs": [], 1447 | "source": [ 1448 | "fm = sagemaker.estimator.Estimator.attach(fm_best_model_name)" 1449 | ] 1450 | }, 1451 | { 1452 | "cell_type": "markdown", 1453 | "metadata": {}, 1454 | "source": [ 1455 | "We can re-run the cells in the Batch Inference and Evaluate Model Performance sections to evaluate the performance of the model with tuned hyperparameters.\n", 1456 | "\n", 1457 | "Assuming batch inference has been re-run, let's calculate the MSE on our test dataset and see if we do better than the training job with the default hyperparameters." 1458 | ] 1459 | }, 1460 | { 1461 | "cell_type": "code", 1462 | "execution_count": null, 1463 | "metadata": {}, 1464 | "outputs": [], 1465 | "source": [ 1466 | "print('MSE:', np.mean((test_Y - test_preds) ** 2))" 1467 | ] 1468 | }, 1469 | { 1470 | "cell_type": "markdown", 1471 | "metadata": {}, 1472 | "source": [ 1473 | "---\n", 1474 | "\n", 1475 | "## Wrap-up\n", 1476 | "\n", 1477 | "In this example, we developed a factorization machine model to predict customer ratings. This could serve as the foundation of a recommender system in a variety of use cases. However, there are many ways in which it could be improved. 
For example, we did very little with:\n", 1478 | "- hyperparameter tuning\n", 1479 | "- controlling for overfitting (early stopping, regularization, etc.)\n", 1480 | "- testing whether binarizing our target variable would improve results\n", 1481 | "- including other information sources (video genres, historical ratings, time of review)\n", 1482 | "- adjusting our threshold for user and item inclusion \n", 1483 | "\n", 1484 | "In addition to improving the model, we could improve the engineering by:\n", 1485 | "- Scaling training out to multiple instances (distributed training)\n", 1486 | "- Fine-tuning our data ingestion (e.g. mini-batch size and input format) to ensure we're fully utilizing the training instances\n", 1487 | "- Thinking about how pre-processing would need to change as datasets scale beyond a single machine\n", 1488 | "\n", 1489 | "Beyond that, recommenders are a very active area of research, and techniques from active learning, reinforcement learning, segmentation, ensembling, and more should be investigated to deliver well-rounded recommendations.\n", 1490 | "\n", 1491 | "### Clean-up (optional)\n", 1492 | "\n", 1493 | "Let's finish by deleting our endpoint to avoid stray hosting charges." 1494 | ] 1495 | }, 1496 | { 1497 | "cell_type": "code", 1498 | "execution_count": null, 1499 | "metadata": {}, 1500 | "outputs": [], 1501 | "source": [ 1502 | "endpoint_name_contains = ['-fm-', 'factorization-machines-']\n", 1503 | "for name in endpoint_name_contains:\n", 1504 | " endpoints = smclient.list_endpoints(NameContains=name, StatusEquals='InService')\n", 1505 | " endpoint_names = [r['EndpointName'] for r in endpoints['Endpoints']]\n", 1506 | " for endpoint_name in endpoint_names:\n", 1507 | " print(\"Deleting endpoint: \" + endpoint_name)\n", 1508 | " smclient.delete_endpoint(EndpointName=endpoint_name)" 1509 | ] 1510 | }, 1511 | { 1512 | "cell_type": "markdown", 1513 | "metadata": {}, 1514 | "source": [ 1515 | "---" 1516 | ] 1517 | } 1518 | ], 1519 | "metadata": { 1520 | "kernelspec": { 1521 | "display_name": "conda_python2", 1522 | "language": "python", 1523 | "name": "conda_python2" 1524 | }, 1525 | "language_info": { 1526 | "codemirror_mode": { 1527 | "name": "ipython", 1528 | "version": 2 1529 | }, 1530 | "file_extension": ".py", 1531 | "mimetype": "text/x-python", 1532 | "name": "python", 1533 | "nbconvert_exporter": "python", 1534 | "pygments_lexer": "ipython2", 1535 | "version": "2.7.15" 1536 | }, 1537 | "notice": "Copyright 2017 Amazon.com, Inc. or its affiliates. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the \"License\"). You may not use this file except in compliance with the License. A copy of the License is located at http://aws.amazon.com/apache2.0/ or in the \"license\" file accompanying this file. This file is distributed on an \"AS IS\" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License." 1538 | }, 1539 | "nbformat": 4, 1540 | "nbformat_minor": 2 1541 | } 1542 | --------------------------------------------------------------------------------