├── .github └── PULL_REQUEST_TEMPLATE.md ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── NOTICE ├── README.md ├── deployment ├── architecture.png ├── fraud-detection-sagemaker-demo-stack.yaml ├── fraud-detection-sagemaker-notebook-instance.yaml ├── fraud-detection-sagemaker-permissions-stack.yaml ├── fraud-detection-using-machine-learning.yaml └── solution-assistant │ ├── requirements.in │ ├── solution-assistant.yaml │ └── src │ └── lambda_function.py ├── source ├── env_setup.py ├── lambda │ └── model-invocation │ │ └── index.py ├── notebooks │ ├── endpoint_demo.ipynb │ ├── requirements.in │ ├── sagemaker_fraud_detection.ipynb │ ├── setup.py │ └── src │ │ └── package │ │ ├── __init__.py │ │ ├── config.py │ │ ├── generate_endpoint_traffic.py │ │ └── utils.py └── scripts │ └── set_kernelspec.py └── test ├── buildspec.yml ├── run_notebook.py ├── test_deployment.ipynb └── test_deployment_out.ipynb /.github/PULL_REQUEST_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | *Issue #, if available:* 2 | 3 | *Description of changes:* 4 | 5 | 6 | By submitting this pull request, I confirm that you can use, modify, copy, and redistribute this contribution, under the terms of your choice. 7 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | 3 | .ipynb_checkpoints/ 4 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [2.0.0] - 2020-05-05 8 | ### Added 9 | - unsupervised learning model with Amazon SageMaker Random Cut Forest 10 | - data upsampling techniques like SMOTE for addressing imbalanced data 11 | - More interpretation of results 12 | - supervised learning model with XGBoost 13 | - Amazon API Gateway entrypoint 14 | 15 | ### Removed 16 | - Supervised learning model with SageMaker linear learner 17 | 18 | 19 | ## [1.0.0] - 2019-05-16 20 | ### Added 21 | - initial checkin 22 | 23 | ### Changed 24 | 25 | ### Removed 26 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 
5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check [existing open](https://github.com/awslabs/fraud-detection-using-machine-learning/issues), or [recently closed](https://github.com/awslabs/fraud-detection-using-machine-learning/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/awslabs/fraud-detection-using-machine-learning/labels/help%20wanted) issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](https://github.com/awslabs/fraud-detection-using-machine-learning/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 
60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. 
For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Fraud Detection using Machine Learing 2 | Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | 4 | ********************** 5 | THIRD PARTY COMPONENTS 6 | ********************** 7 | This software includes third party software subject to the following copyrights: 8 | 9 | Boto 3 - The AWS SDK for Python under the Apache 2.0 License 10 | requests - A simple, yet elegant HTTP library under the Apache 2.0 License 11 | aws-requests-auth under the BSD 3-Clause License 12 | pandas under the BSD 3-Clause License 13 | numpy under the BSD 3-Clause License 14 | scipy under the BSD 3-Clause License 15 | seaborn under the BSD 3-Clause License 16 | scikit-learn under the BSD 3-Clause License 17 | imbalanced-learn under the MIT License 18 | torch under the BSD 3-Clause License 19 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Guidance for Fraud Detection using Machine Learning on AWS 2 | 3 | ## Table of Contents 4 | 5 | 1. [Overview](#overview) 6 | - [AWS services](#awservices) 7 | - [Cost](#cost) 8 | 3. [Prerequisites](#prerequisites) 9 | 4. [Architecture](#architecture) 10 | 5. [Deployment Steps](#deployment-steps) 11 | 6. [Deployment Validation](#deployment-validation) 12 | 7. [Running the Guidance](#running-the-guidance) 13 | 8. [Next Steps](#next-steps) 14 | 9. [Cleanup](#cleanup) 15 | 10. [Notices](#notices) 16 | 17 | ## Overview 18 | 19 | With businesses moving online, fraud and abuse in online systems is constantly increasing as well. 
Traditionally, rule-based fraud detection systems are used to combat online fraud, but these rely on a static set of rules created by human experts. This project uses machine learning to create models for fraud detection that are dynamic, self-improving, and maintainable. Importantly, they can scale with the online business. 20 | 21 | Specifically, we show how to use Amazon SageMaker to train supervised and unsupervised machine learning models on historical transactions, so that they can predict the likelihood of incoming transactions being fraudulent or not. We also show how to deploy the models, once trained, to a REST API that can be integrated into an existing business software infrastructure. This project includes a demonstration of this process using a public, anonymized credit card transactions [dataset provided by ULB](https://www.kaggle.com/mlg-ulb/creditcardfraud), but it can be easily modified to work with custom labelled or unlabelled data provided as a relational table in CSV format. 22 | 23 | ### AWS services 24 | 25 | - [Amazon S3](https://docs.aws.amazon.com/s3/?icmpid=docs_homepage_featuredsvcs) 26 | - [Amazon SageMaker](https://docs.aws.amazon.com/sagemaker/latest/dg/gs.html?icmpid=docs_sagemaker_lp/index.html) 27 | - [AWS Lambda](https://docs.aws.amazon.com/lambda/latest/dg/welcome.html) 28 | - [Amazon API Gateway](https://docs.aws.amazon.com/apigateway/latest/developerguide/welcome.html) 29 | - [Amazon Kinesis Data Firehose](https://aws.amazon.com/firehose/) 30 | - [Amazon QuickSight](https://docs.aws.amazon.com/quicksight/latest/user/welcome.html) 31 | 32 | ### Cost 33 | 34 | The following table provides a sample cost breakdown for deploying this 35 | Guidance with the default parameters in the US East (N. Virginia) Region 36 | for one month.
37 | 38 | | **AWS service** | Dimensions | Monthly cost \[USD\] | 39 | | ----------------- | ------------------------------------------------------ | ---------------------------------------------- | 40 | | Amazon S3 Standard | S3 Standard storage (10 GB per month), Data returned by S3 Select (10 GB per month), Data scanned by S3 Select (10 GB per month) | $0.26 | 41 | | Amazon S3 Data Transfer | DT Inbound: Not selected (0 TB per month), DT Outbound: Not selected (0 TB per month) | $0 | 42 | | Amazon SageMaker Studio Notebooks | Instance name (ml.c5.12xlarge), Number of data scientist(s) (5), Number of Studio Notebook instances per data scientist (2), Studio Notebook hour(s) per day (3), Studio Notebook day(s) per month (10) | $734.4 | 43 | | Amazon SageMaker Processing Storage | (General Purpose SSD (gp2)), Instance name (ml.c4.2xlarge), Number of processing jobs per month (20), Number of instances per job (2), Hour(s) per instance per job (2) | $39.24 | 44 | | Amazon SageMaker Training Storage | (General Purpose SSD (gp2)), Instance name (ml.c4.2xlarge), Number of training jobs per month (20), Number of instances per job (2), Hour(s) per instance per job (3) | $58.76 | 45 | | Amazon SageMaker Real-Time Inference Storage | (General Purpose SSD (gp2)), Instance name (ml.c4.2xlarge), Instance name (ml.c4.2xlarge), Number of models deployed (5), Number of models per endpoint (5), Number of instances per endpoint (2), Endpoint hour(s) per day (3), Endpoint day(s) per month (20), Data Processed IN (10 GB), Data Processed OUT (8 GB) | $59.05 | 46 | | AWS Lambda | Architecture (x86), Architecture (x86), Invoke Mode (Buffered), Amount of ephemeral storage allocated (512 MB), Number of requests (1 million per month) | $0 | 47 | | Amazon API Gateway | Cache memory size (GB) (None), WebSocket message units (thousands), HTTP API requests units (millions), Average size of each request (34 KB), REST API request units (millions), Average message size (32 KB), Requests (1 per month) | $1 | 48 | | Amazon Kinesis Data Firehose | Dynamic Partitioning (Add On) (Disabled), Source Type (Direct PUT or Kinesis Data Stream), Average ratio of data processed to VPC vs data ingested (1.3), Data records units (millions), Record size (100 KB), Data format conversion (optional) (Disabled), Number of records for data ingestion (1 per month), Data format conversion (optional) (Disabled), Data records units (thousands), Record size (5 KB) | $2.77 | 49 | | Amazon S3 Standard | S3 Standard storage (100 GB per month) | $2.3 | 50 | | Total | | $897.78 | 51 | ## Prerequisites 52 | 53 | You will need an AWS account to use this solution. Sign up for an account [here](https://aws.amazon.com/). 54 | 55 | To run this JumpStart 1P Solution and have the infrastructure deploy to your AWS account you will need to create an active SageMaker Studio instance (see [Onboard to Amazon SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-studio-onboard.html)). When your Studio instance is *Ready*, use the instructions in [SageMaker JumpStart](https://docs.aws.amazon.com/sagemaker/latest/dg/studio-jumpstart.html) to 1-Click Launch the solution. 56 | 57 | The solution artifacts are included in this GitHub repository for reference. 58 | 59 | *Note*: Solutions are available in most regions including us-west-2, and us-east-1. 60 | 61 | **Caution**: Cloning this GitHub repository and running the code manually could lead to unexpected issues! Use the AWS CloudFormation template. 
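If you prefer to launch that template from the command line rather than through SageMaker JumpStart, a minimal boto3 sketch could look like the following; the stack name, region, and solution prefix are placeholder values, and the template will still pull its nested stacks and artifacts from the public solutions bucket.

```python
import boto3

# Placeholder region, stack name, and prefix -- adjust for your account.
cloudformation = boto3.client("cloudformation", region_name="us-east-1")

with open("deployment/fraud-detection-using-machine-learning.yaml") as template:
    cloudformation.create_stack(
        StackName="sagemaker-soln-fdml-demo",
        TemplateBody=template.read(),
        Parameters=[
            {"ParameterKey": "SolutionPrefix", "ParameterValue": "sagemaker-soln-fdml-demo"},
        ],
        Capabilities=["CAPABILITY_NAMED_IAM"],  # the solution creates named IAM roles
    )
```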
You'll get an Amazon SageMaker Notebook instance that's been correctly set up and configured to access the other resources in the solution. 62 | 63 | ## Architecture 64 | 65 | The project architecture deployed by the CloudFormation template is shown here. 66 | 67 | ![](deployment/architecture.png) 68 | 69 | ## Deployment Steps 70 | The project uses Amazon SageMaker to train both a supervised and an unsupervised machine learning model, which are then deployed using Amazon SageMaker managed endpoints. 71 | 72 | If you have labels for your data, for example if some of the transactions have been annotated as fraudulent and some as legitimate, then you can train a supervised learning model to learn to discern the two classes. In this project, we provide a recipe to train a gradient boosted decision tree model using [XGBoost on Amazon SageMaker](https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html). The supervised model training process also handles the common issue of working with highly imbalanced data in fraud detection problems. The project addresses this issue in two ways: 1) by implementing data upsampling using the "imbalanced-learn" package, and 2) by using XGBoost's scale_pos_weight hyperparameter to control the balance of positive and negative weights. 73 | 74 | If you don't have labelled data, or if you want to augment your supervised model predictions with an anomaly score from an unsupervised model, then the project also trains a [RandomCutForest](https://docs.aws.amazon.com/sagemaker/latest/dg/randomcutforest.html) model using Amazon SageMaker. The RandomCutForest algorithm is trained on the entire dataset, without labels, and takes advantage of the highly imbalanced nature of fraud datasets to predict higher anomaly scores for the fraudulent transactions in the dataset. 75 | 76 | Both of the trained models are deployed to Amazon SageMaker managed real-time endpoints that host the models and can be invoked to provide model predictions for new transactions. 77 | 78 | The model training and endpoint deployment is orchestrated by running a [Jupyter notebook](source/notebooks/sagemaker_fraud_detection.ipynb) on a SageMaker Notebook instance. The notebook runs a demonstration of the project using the aforementioned anonymized credit card dataset, which is automatically downloaded to the Amazon S3 bucket created when you launch the solution. However, the notebook can be modified to run the project on a custom dataset in S3. The notebook instance also contains some example code that shows how to invoke the REST API for inference. 79 | 80 | In order to encapsulate the project as a stand-alone microservice, Amazon API Gateway is used to provide a REST API that is backed by an AWS Lambda function. The Lambda function runs the code necessary to preprocess incoming transactions, invoke the SageMaker endpoints, merge results from both endpoints if necessary, store the model inputs and model predictions in S3 via Kinesis Data Firehose, and provide a response to the client. 81 |
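For illustration, here is a minimal sketch of what calling that REST API could look like from Python, in the spirit of the `generate_endpoint_traffic.py` script included in the notebook source. The API id, region, feature values, and metadata below are placeholders, and the exact payload format is defined by the Lambda function in your deployment; because the POST method uses IAM authorization, the request has to be SigV4-signed (here via the `aws-requests-auth` package that the solution already lists as a dependency). An optional `model` query-string parameter is also forwarded to the Lambda function.

```python
import requests
from aws_requests_auth.boto_utils import BotoAWSRequestsAuth

# Placeholder values -- take the real ones from the CloudFormation stack outputs.
api_host = "<rest-api-id>.execute-api.us-east-1.amazonaws.com"
invoke_url = f"https://{api_host}/prod/invocations"

# SigV4-sign the request with the caller's AWS credentials (the POST method uses AWS_IAM auth).
auth = BotoAWSRequestsAuth(aws_host=api_host, aws_region="us-east-1", aws_service="execute-api")

payload = {
    "data": "0.12,-1.34,0.56",           # one transaction's features, as expected by the Lambda preprocessor (placeholder)
    "metadata": {"transaction_id": 42},   # passed through and stored alongside the prediction (placeholder)
}

response = requests.post(invoke_url, json=payload, auth=auth)
print(response.status_code, response.text)
```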
82 | ## Data 83 | 84 | 85 | The example dataset used in this solution was originally released as part of a research collaboration of Worldline and 86 | the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud 87 | detection. 88 | 89 | The dataset contains credit card transactions from European cardholders in 2013. As is common in fraud detection, 90 | it is highly unbalanced, with 492 fraudulent transactions out of the 284,807 total transactions. The dataset contains 91 | only numerical features, because the original features have been transformed for confidentiality using PCA. As a result, 92 | the dataset contains 28 PCA components, and two features that haven't been transformed, _Amount_ and _Time_. 93 | _Amount_ refers to the transaction amount, and _Time_ is the seconds elapsed between each transaction in the data 94 | and the first transaction. 95 | 96 | More details on current and past projects on related topics are available on 97 | https://www.researchgate.net/project/Fraud-detection-5 and the page of the 98 | [DefeatFraud](https://mlg.ulb.ac.be/wordpress/portfolio_page/defeatfraud-assessment-and-validation-of-deep-feature-engineering-and-learning-solutions-for-fraud-detection/) project. 99 | 100 | We cite the following works: 101 | * Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015 102 | * Dal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Aël; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert Systems with Applications, 41, 10, 4915-4928, 2014, Pergamon 103 | * Dal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE Transactions on Neural Networks and Learning Systems, 29, 8, 3784-3797, 2018, IEEE 104 | * Dal Pozzolo, Andrea. Adaptive Machine Learning for credit card fraud detection. ULB MLG PhD thesis (supervised by G. Bontempi) 105 | * Carcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information Fusion, 41, 182-194, 2018, Elsevier 106 | * Carcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5, 4, 285-300, 2018, Springer International Publishing 107 | 108 | 109 | ## Running the Guidance 110 | 111 | * `deployment/` 112 | * `fraud-detection-using-machine-learning.yaml`: Creates the AWS CloudFormation stack for the solution 113 | * `source/` 114 | * `lambda/` 115 | * `model-invocation/` 116 | * `index.py`: Lambda function script for invoking SageMaker endpoints for inference 117 | * `notebooks/` 118 | * `src/` 119 | * `package/` 120 | * `config.py`: Reads in the environment variables set during the AWS CloudFormation stack creation 121 | * `generate_endpoint_traffic.py`: Custom script to show how to send transaction traffic to the REST API for inference 122 | * `utils.py`: Helper functions and utilities 123 | * `sagemaker_fraud_detection.ipynb`: Orchestrates the solution. Trains the models and deploys the trained models 124 | * `endpoint_demo.ipynb`: A small notebook that demonstrates how one can use the solution's endpoint to make predictions. 125 | * `scripts/` 126 | * `set_kernelspec.py`: Used to update the kernelspec name at deployment. 127 | * `test/` 128 | * Files that are used to automatically test the solution 129 | 130 | 131 | ## License 132 | 133 | This project is licensed under the Apache-2.0 License. 134 | 135 | ## Notices 136 | 137 | _Customers are responsible for making their own independent assessment of the information in this Guidance.
This Guidance: (a) is for informational purposes only, (b) represents AWS current product offerings and practices, which are subject to change without notice, and (c) does not create any commitments or assurances from AWS and its affiliates, suppliers or licensors. AWS products or services are provided “as is” without warranties, representations, or conditions of any kind, whether express or implied. AWS responsibilities and liabilities to its customers are controlled by AWS agreements, and this Guidance is not part of, nor does it modify, any agreement between AWS and its customers._ 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /deployment/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/fraud-detection-using-machine-learning/c4fe32b5f04dedafaa4b4e6613515fd714031969/deployment/architecture.png -------------------------------------------------------------------------------- /deployment/fraud-detection-sagemaker-demo-stack.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: "((SO0056)) - fraud-detection-using-machine-learning demo stack" 3 | Parameters: 4 | SolutionPrefix: 5 | Description: The name of the prefix for the solution used for naming resources. 6 | Type: String 7 | SolutionsBucket: 8 | Description: The bucket that contains the solution files. 9 | Type: String 10 | SolutionName: 11 | Type: String 12 | ExecutionRoleArn: 13 | Description: The role used when invoking the enpoint. 14 | Type: String 15 | 16 | Mappings: 17 | RegionMap: 18 | "us-west-1": 19 | "XGBoost": "746614075791.dkr.ecr.us-west-1.amazonaws.com" 20 | "us-west-2": 21 | "XGBoost": "246618743249.dkr.ecr.us-west-2.amazonaws.com" 22 | "us-east-1": 23 | "XGBoost": "683313688378.dkr.ecr.us-east-1.amazonaws.com" 24 | "us-east-2": 25 | "XGBoost": "257758044811.dkr.ecr.us-east-2.amazonaws.com" 26 | "ap-northeast-1": 27 | "XGBoost": "354813040037.dkr.ecr.ap-northeast-1.amazonaws.com" 28 | "ap-northeast-2": 29 | "XGBoost": "366743142698.dkr.ecr.ap-northeast-2.amazonaws.com" 30 | "ap-southeast-1": 31 | "XGBoost": "121021644041.dkr.ecr.ap-southeast-1.amazonaws.com" 32 | "ap-southeast-2": 33 | "XGBoost": "783357654285.dkr.ecr.ap-southeast-2.amazonaws.com" 34 | "ap-south-1": 35 | "XGBoost": "720646828776.dkr.ecr.ap-south-1.amazonaws.com" 36 | "ap-east-1": 37 | "XGBoost": "651117190479.dkr.ecr.ap-east-1.amazonaws.com" 38 | "ca-central-1": 39 | "XGBoost": "341280168497.dkr.ecr.ca-central-1.amazonaws.com" 40 | "cn-north-1": 41 | "XGBoost": "450853457545.dkr.ecr.cn-north-1.amazonaws.com.cn" 42 | "cn-northwest-1": 43 | "XGBoost": "451049120500.dkr.ecr.cn-northwest-1.amazonaws.com.cn" 44 | "eu-central-1": 45 | "XGBoost": "492215442770.dkr.ecr.eu-central-1.amazonaws.com" 46 | "eu-north-1": 47 | "XGBoost": "662702820516.dkr.ecr.eu-north-1.amazonaws.com" 48 | "eu-south-1": 49 | "XGBoost": "048378556238.dkr.ecr.eu-north-1.amazonaws.com" 50 | "eu-west-1": 51 | "XGBoost": "141502667606.dkr.ecr.eu-west-1.amazonaws.com" 52 | "eu-west-2": 53 | "XGBoost": "764974769150.dkr.ecr.eu-west-2.amazonaws.com" 54 | "eu-west-3": 55 | "XGBoost": "659782779980.dkr.ecr.eu-west-3.amazonaws.com" 56 | "me-south-1": 57 | "XGBoost": "801668240914.dkr.ecr.me-south-1.amazonaws.com" 58 | "sa-east-1": 59 | "XGBoost": " 737474898029.dkr.ecr.sa-east-1.amazonaws.com" 60 | "us-gov-west-1": 61 | 
"XGBoost": "414596584902.dkr.ecr.us-gov-west-1.amazonaws.com" 62 | 63 | Resources: 64 | FraudClassificationModel: 65 | Type: "AWS::SageMaker::Model" 66 | Properties: 67 | ExecutionRoleArn: !Ref ExecutionRoleArn 68 | PrimaryContainer: 69 | Image: !Sub 70 | - "${ContainerLocation}/sagemaker-xgboost:0.90-2-cpu-py3" 71 | - ContainerLocation: 72 | Fn::FindInMap: [RegionMap, !Ref "AWS::Region", "XGBoost"] 73 | ModelDataUrl: !Sub "s3://${SolutionsBucket}/${SolutionName}/artifacts/xgboost-model.tar.gz" 74 | ModelName: !Sub "${SolutionPrefix}-demo" 75 | FraudClassificationEndpointConfig: 76 | Type: "AWS::SageMaker::EndpointConfig" 77 | Properties: 78 | ProductionVariants: 79 | - InitialInstanceCount: 1 80 | InitialVariantWeight: 1.0 81 | InstanceType: ml.m5.xlarge 82 | ModelName: !GetAtt FraudClassificationModel.ModelName 83 | VariantName: !GetAtt FraudClassificationModel.ModelName 84 | EndpointConfigName: !Sub "${SolutionPrefix}-demo" 85 | Metadata: 86 | cfn_nag: 87 | rules_to_suppress: 88 | - id: W1200 89 | reason: Demo endpoint not given a KmsID 90 | FraudClassificationEndpoint: 91 | Type: "AWS::SageMaker::Endpoint" 92 | Properties: 93 | EndpointName: !Sub "${SolutionPrefix}-demo" 94 | EndpointConfigName: !GetAtt FraudClassificationEndpointConfig.EndpointConfigName 95 | 96 | Outputs: 97 | EndpointName: 98 | Description: Name of the demo XGBoost fraud classification endpoint 99 | Value: !GetAtt FraudClassificationEndpoint.EndpointName -------------------------------------------------------------------------------- /deployment/fraud-detection-sagemaker-notebook-instance.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: >- 3 | (SO0056) - fraud-detection-using-machine-learning: Notebook instance stack 4 | Parameters: 5 | SolutionPrefix: 6 | Type: String 7 | ParentStackName: 8 | Type: String 9 | SolutionName: 10 | Type: String 11 | StackVersion: 12 | Type: String 13 | ModelDataBucket: 14 | Type: String 15 | NotebookInstanceExecutionRoleArn: 16 | Type: String 17 | RESTAPIGateway: 18 | Type: String 19 | TestOutputsS3Bucket: 20 | Type: String 21 | 22 | Mappings: 23 | SolutionsS3BucketName: 24 | development: 25 | Prefix: sagemaker-solutions-devo 26 | release: 27 | Prefix: sagemaker-solutions-prod 28 | NotebookInstanceType: 29 | "af-south-1": 30 | Type: ml.t3.medium 31 | "ap-east-1": 32 | Type: ml.t3.medium 33 | "ap-northeast-1": 34 | Type: ml.t3.medium 35 | "ap-northeast-2": 36 | Type: ml.t2.medium 37 | "ap-south-1": 38 | Type: ml.t2.medium 39 | "ap-southeast-1": 40 | Type: ml.t3.medium 41 | "ap-southeast-2": 42 | Type: ml.t3.medium 43 | "ca-central-1": 44 | Type: ml.t3.medium 45 | "eu-central-1": 46 | Type: ml.t3.medium 47 | "eu-north-1": 48 | Type: ml.t3.medium 49 | "eu-south-1": 50 | Type: ml.t3.medium 51 | "eu-west-1": 52 | Type: ml.t3.medium 53 | "eu-west-2": 54 | Type: ml.t3.medium 55 | "eu-west-3": 56 | Type: ml.t3.medium 57 | "me-south-1": 58 | Type: ml.t3.medium 59 | "sa-east-1": 60 | Type: ml.t3.medium 61 | "us-east-1": 62 | Type: ml.t3.medium 63 | "us-east-2": 64 | Type: ml.t3.medium 65 | "us-west-1": 66 | Type: ml.t3.medium 67 | "us-west-2": 68 | Type: ml.t3.medium 69 | 70 | Resources: 71 | BasicNotebookInstance: 72 | Type: 'AWS::SageMaker::NotebookInstance' 73 | Properties: 74 | InstanceType: !FindInMap [NotebookInstanceType, !Ref "AWS::Region", Type] 75 | NotebookInstanceName: !Sub "${SolutionPrefix}-notebook-instance" 76 | RoleArn: !Ref NotebookInstanceExecutionRoleArn 77 | LifecycleConfigName: 
!GetAtt 78 | - BasicNotebookInstanceLifecycleConfig 79 | - NotebookInstanceLifecycleConfigName 80 | Metadata: 81 | cfn_nag: 82 | rules_to_suppress: 83 | - id: W1201 84 | reason: Solution does not have KMS encryption enabled by default 85 | 86 | BasicNotebookInstanceLifecycleConfig: 87 | Type: 'AWS::SageMaker::NotebookInstanceLifecycleConfig' 88 | Properties: 89 | NotebookInstanceLifecycleConfigName: !Sub "${SolutionPrefix}-nb-lifecycle-config" 90 | OnCreate: 91 | - Content: 92 | Fn::Base64: !Sub 93 | - | 94 | set -e 95 | # perform following actions as ec2-user 96 | sudo -u ec2-user -i <> stack_outputs.json 105 | echo ' "FraudStackName": "${ParentStackName}",' >> stack_outputs.json 106 | echo ' "SolutionPrefix": "${SolutionPrefix}",' >> stack_outputs.json 107 | echo ' "AwsAccountId": "${AWS::AccountId}",' >> stack_outputs.json 108 | echo ' "AwsRegion": "${AWS::Region}",' >> stack_outputs.json 109 | echo ' "IamRole": "${NotebookInstanceExecutionRoleArn}",' >> stack_outputs.json 110 | echo ' "ModelDataBucket": "${ModelDataBucket}",' >> stack_outputs.json 111 | echo ' "SolutionsS3Bucket": "${SolutionsS3BucketNamePrefix}",' >> stack_outputs.json 112 | echo ' "RESTAPIGateway": "${RESTAPIGateway}",' >> stack_outputs.json 113 | echo ' "TestOutputsS3Bucket": "${TestOutputsS3Bucket}",' >> stack_outputs.json 114 | echo ' "SolutionName": "${SolutionName}",' >> stack_outputs.json 115 | echo ' "SagemakerMode": "NotebookInstance"' >> stack_outputs.json 116 | echo '}' >> stack_outputs.json 117 | echo "stack_outputs.json created:" 118 | cat stack_outputs.json 119 | # Replace placeholders 120 | cd /home/ec2-user/SageMaker/notebooks 121 | sed -s -i 's/HUB_1P_IMAGE/conda_python3/g' *.ipynb 122 | EOF 123 | - SolutionsS3BucketNamePrefix: 124 | Fn::FindInMap: [SolutionsS3BucketName, Ref: StackVersion, Prefix] 125 | OnStart: 126 | - Content: 127 | Fn::Base64: | 128 | #!/bin/bash 129 | set -e 130 | # perform following actions as ec2-user 131 | sudo -u ec2-user -i < ./test/run_notebook.log 2>&1 & 137 | echo "OnStart script completed!" 
138 | EOF 139 | Outputs: 140 | SageMakerNotebook: 141 | Description: "Opens the Jupyter notebook to get started with model training" 142 | Value: !Sub "https://${SolutionPrefix}-notebook-instance.notebook.${AWS::Region}.sagemaker.aws/notebooks/notebooks/sagemaker_fraud_detection.ipynb" 143 | -------------------------------------------------------------------------------- /deployment/fraud-detection-sagemaker-permissions-stack.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Description: "((SO0056)) - fraud-detection-using-machine-learning SageMaker permissions stack" 3 | Parameters: 4 | SolutionPrefix: 5 | Description: The name of the prefix for the solution used for naming resources 6 | Type: String 7 | ModelDataBucketName: 8 | Description: The name of the model and data S3 bucket for the solution 9 | Type: String 10 | OutputBucketName: 11 | Description: The name of the output S3 bucket for the solution 12 | Type: String 13 | RESTAPIGateway: 14 | Description: The name of the API Gateway being deployed 15 | Type: String 16 | StackVersion: 17 | Description: The name of the template stack version 18 | Type: String 19 | RootStackID: 20 | Description: The ID of the root stack 21 | Type: String 22 | TestOutputsS3Bucket: 23 | Description: The name of the test output bucket for the solution 24 | Type: String 25 | 26 | Mappings: 27 | SolutionsS3BucketName: 28 | release: 29 | Prefix: "sagemaker-solutions-build" 30 | development: 31 | Prefix: "sagemaker-solutions-devo" 32 | 33 | Conditions: 34 | AddTestBucketPermissions: !Not [!Equals [!Ref TestOutputsS3Bucket, ""]] 35 | 36 | Resources: 37 | NotebookInstanceExecutionRole: 38 | Type: 'AWS::IAM::Role' 39 | Properties: 40 | RoleName: !Sub "${SolutionPrefix}-${AWS::Region}-nb-role" 41 | ManagedPolicyArns: 42 | - >- 43 | arn:aws:iam::aws:policy/service-role/AmazonAPIGatewayPushToCloudWatchLogs 44 | AssumeRolePolicyDocument: 45 | Version: 2012-10-17 46 | Statement: 47 | - Effect: Allow 48 | Principal: 49 | Service: 50 | - sagemaker.amazonaws.com 51 | - lambda.amazonaws.com 52 | - apigateway.amazonaws.com 53 | - firehose.amazonaws.com 54 | Action: 55 | - 'sts:AssumeRole' 56 | Metadata: 57 | cfn_nag: 58 | rules_to_suppress: 59 | - id: W28 60 | reason: Needs to be explicitly named to tighten launch permissions policy 61 | NotebookInstanceIAMPolicy: 62 | Type: 'AWS::IAM::Policy' 63 | Properties: 64 | PolicyName: !Sub "${SolutionPrefix}-nb-instance-policy" 65 | PolicyDocument: 66 | Version: 2012-10-17 67 | Statement: 68 | # Can list the contents of the SageMaker Solutions Amazon S3 Bucket 69 | - Effect: Allow 70 | Action: 71 | - s3:ListBucket 72 | Resource: 73 | - !Sub 74 | - "arn:aws:s3:::${SolutionsS3BucketNamePrefix}-${AWS::Region}" 75 | - SolutionsS3BucketNamePrefix: 76 | Fn::FindInMap: [SolutionsS3BucketName, Ref: StackVersion, Prefix] 77 | # Can get objects from the SageMaker Solutions Amazon S3 Bucket 78 | - Effect: Allow 79 | Action: 80 | - s3:GetObject 81 | Resource: 82 | - !Sub 83 | - "arn:aws:s3:::${SolutionsS3BucketNamePrefix}-${AWS::Region}/*" 84 | - SolutionsS3BucketNamePrefix: 85 | Fn::FindInMap: [SolutionsS3BucketName, Ref: StackVersion, Prefix] 86 | # Can get, put and delete objects in the model-data bucket, and the buckets themselves (for SolutionAssistant) 87 | - Effect: Allow 88 | Action: 89 | - s3:AbortMultipartUpload 90 | - s3:GetObject 91 | - s3:PutObject 92 | - s3:DeleteObject 93 | Resource: 94 | - !Sub "arn:aws:s3:::${ModelDataBucketName}/*" 95 | - 
!Sub "arn:aws:s3:::${OutputBucketName}/*" 96 | - !If 97 | - AddTestBucketPermissions 98 | - !Sub "arn:aws:s3:::${TestOutputsS3Bucket}/*" 99 | - !Ref "AWS::NoValue" 100 | - Effect: Allow 101 | Action: 102 | - s3:ListBucket 103 | - s3:DeleteBucket 104 | - s3:GetBucketLocation 105 | - s3:ListBucketMultipartUploads 106 | Resource: 107 | - !Sub "arn:aws:s3:::${ModelDataBucketName}" 108 | - !Sub "arn:aws:s3:::${OutputBucketName}" 109 | - !If 110 | - AddTestBucketPermissions 111 | - !Sub "arn:aws:s3:::${TestOutputsS3Bucket}" 112 | - !Ref "AWS::NoValue" 113 | # Needed by the invocation Lambda function 114 | - Effect: Allow 115 | Action: 116 | - 'logs:CreateLogGroup' 117 | - 'logs:CreateLogStream' 118 | - 'logs:PutLogEvents' 119 | Resource: !Sub "arn:aws:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/lambda/*" 120 | - Effect: Allow 121 | Action: 122 | - 'sagemaker:InvokeEndpoint' 123 | Resource: 124 | - 'arn:aws:sagemaker:*:*:endpoint/*' 125 | - Effect: Allow 126 | Action: 127 | - 'firehose:PutRecord' 128 | - 'firehose:PutRecordBatch' 129 | Resource: !Sub "arn:aws:firehose:${AWS::Region}:${AWS::AccountId}:deliverystream/${SolutionPrefix}-firehose-stream" 130 | # SageMaker 131 | - Effect: Allow 132 | Action: 133 | - 'sagemaker:CreateTrainingJob' 134 | - 'sagemaker:DescribeTrainingJob' 135 | - 'sagemaker:CreateModel' 136 | - 'sagemaker:DescribeModel' 137 | - 'sagemaker:DeleteModel' 138 | - 'sagemaker:CreateEndpoint' 139 | - 'sagemaker:CreateEndpointConfig' 140 | - 'sagemaker:DescribeEndpoint' 141 | - 'sagemaker:DescribeEndpointConfig' 142 | - 'sagemaker:DeleteEndpoint' 143 | - 'sagemaker:DeleteEndpointConfig' 144 | - 'sagemaker:InvokeEndpoint' 145 | Resource: !Sub "arn:aws:sagemaker:${AWS::Region}:${AWS::AccountId}:*" 146 | - Effect: Allow 147 | Action: 148 | - 'ecr:GetDownloadUrlForLayer' 149 | - 'ecr:BatchGetImage' 150 | - 'ecr:BatchCheckLayerAvailability' 151 | Resource: !Sub "arn:aws:ecr:${AWS::Region}:${AWS::AccountId}:repository/*" 152 | - Effect: Allow 153 | Action: 154 | - 'ec2:CreateVpcEndpoint' 155 | - 'ec2:DescribeRouteTables' 156 | Resource: '*' 157 | - Effect: Allow 158 | Action: 159 | - 'cloudwatch:PutMetricData' 160 | - 'cloudwatch:GetMetricData' 161 | - 'cloudwatch:GetMetricStatistics' 162 | - 'cloudwatch:ListMetrics' 163 | Resource: !Sub "arn:aws:cloudwatch:${AWS::Region}:${AWS::AccountId}:*" 164 | - Effect: Allow 165 | Action: 166 | - 'logs:CreateLogGroup' 167 | - 'logs:CreateLogStream' 168 | - 'logs:DescribeLogStreams' 169 | - 'logs:GetLogEvents' 170 | - 'logs:PutLogEvents' 171 | Resource: !Sub "arn:aws:logs:${AWS::Region}:${AWS::AccountId}:log-group:/aws/sagemaker/*" 172 | - Effect: Allow 173 | Action: 174 | - 'iam:PassRole' 175 | Resource: 176 | - !GetAtt 177 | - NotebookInstanceExecutionRole 178 | - Arn 179 | Condition: 180 | StringEquals: 181 | 'iam:PassedToService': sagemaker.amazonaws.com 182 | - Effect: Allow 183 | Action: 184 | - 'iam:GetRole' 185 | Resource: 186 | - !GetAtt 187 | - NotebookInstanceExecutionRole 188 | - Arn 189 | - Effect: Allow 190 | Action: 191 | - 'execute-api:Invoke' 192 | Resource: !Sub "arn:aws:execute-api:${AWS::Region}:${AWS::AccountId}:${RESTAPIGateway}/*/POST/*" 193 | # Needed by run_notebook.py for CI 194 | - Effect: Allow 195 | Action: 196 | - cloudformation:DescribeStacks 197 | Resource: 198 | - !Ref RootStackID 199 | Roles: 200 | - !Ref NotebookInstanceExecutionRole 201 | Metadata: 202 | cfn_nag: 203 | rules_to_suppress: 204 | - id: W12 205 | reason: >- 206 | This policy needs to have * resource because some of the resources 207 | are 
created dynamically and some of its actions are * resource 208 | actions 209 | - id: W76 210 | reason: Need single policy doc for all permissions to better handle Studio. 211 | 212 | Outputs: 213 | SageMakerRoleArn: 214 | Description: "SageMaker Execution Role for the solution" 215 | Value: !GetAtt NotebookInstanceExecutionRole.Arn -------------------------------------------------------------------------------- /deployment/fraud-detection-using-machine-learning.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: >- 3 | (SO0056) - fraud-detection-using-machine-learning: Solution for predicting 4 | fraud events with ML using Amazon SageMaker. Version 3 5 | Parameters: 6 | SolutionPrefix: 7 | Type: String 8 | Description: | 9 | Used to name resources created as part of this stack (and inside nested stacks too). 10 | Can be the same as the stack name used by AWS CloudFormation, but this field has extra 11 | constraints because it's used to name resources with restrictions (e.g. Amazon S3 bucket 12 | names cannot contain capital letters). 13 | AllowedPattern: '^sagemaker-soln-fdml[a-z0-9\-]{1,19}$' 14 | ConstraintDescription: | 15 | Should start with 'sagemaker-soln-fdml-' for permission management, followed by up to 19 16 | characters. Only allowed to use lowercase letters, hyphens and/or numbers. 17 | Default: "sagemaker-soln-fdml-" 18 | CreateSageMakerNotebookInstance: 19 | Description: Create a SageMaker Notebook Instance as part of the stack? 20 | Type: String 21 | Default: 'true' 22 | AllowedValues: 23 | - 'true' 24 | - 'false' 25 | IamRole: 26 | Type: String 27 | Default: "" 28 | Description: | 29 | The ARN of the IAM Role that will be attached to the resources created by this stack to grant them permissions to 30 | perform their required functions. If left blank, the template will attempt to create a role for you. 31 | That could cause a stack creation error if you don't have privileges to create new roles. 32 | StackVersion: 33 | Description: | 34 | CloudFormation Stack version. 35 | Use 'release' version unless you are customizing the 36 | CloudFormation templates and solution artifacts. 37 | Type: String 38 | Default: release 39 | AllowedValues: 40 | - release 41 | - development 42 | SolutionName: 43 | Description: | 44 | Prefix for the solution name. Needs to be Fraud-detection-using-machine-learning 45 | or begin with 'Fraud-detection-using-machine-learning-' followed by a set of letters and hyphens. 46 | Used to specify a particular directory on S3, that can correspond to a development branch. 47 | Type: String 48 | Default: "Fraud-detection-using-machine-learning" 49 | AllowedPattern: '^Fraud-detection-using-machine-learning-?[a-z\-]*$' 50 | TestOutputsS3Bucket: 51 | Description: | 52 | This bucket is used to store output files when the solution is tested. Should be left blank, unless this is an automated launch. 53 | Type: String 54 | Default: "" 55 | AllowedPattern: '^$|^sagemaker-soln-[a-z0-9\-\.]{1,48}$' 56 | KinesisFirehosePrefix: 57 | Type: String 58 | Default: fraud-detection/firehose/ 59 | Description: Kinesis Firehose prefix for delivery of processed events. 
60 | Metadata: 61 | 'AWS::CloudFormation::Interface': 62 | ParameterGroups: 63 | - Label: 64 | default: Solution Configuration 65 | Parameters: 66 | - SolutionPrefix 67 | - CreateSageMakerNotebookInstance 68 | - StackVersion 69 | - Label: 70 | default: Amazon Kinesis Firehose Configuration 71 | Parameters: 72 | - KinesisFirehosePrefix 73 | ParameterLabels: 74 | KinesisFirehosePrefix: 75 | default: Kinesis Firehose S3 Prefix 76 | Mappings: 77 | Function: 78 | FraudDetection: 79 | S3Key: build/model_invocation.zip 80 | SolutionsS3BucketName: 81 | development: 82 | Prefix: sagemaker-solutions-build 83 | release: 84 | Prefix: sagemaker-solutions-prod 85 | Conditions: 86 | CreateSageMakerNotebookInstance: !Equals [ !Ref CreateSageMakerNotebookInstance, 'true' ] 87 | CreateDemoEndpoint: !Equals [ !Ref CreateSageMakerNotebookInstance, 'false' ] 88 | CreateCustomSolutionRole: !Equals [!Ref IamRole, ""] 89 | Resources: 90 | ModelDataBucket: 91 | Type: 'AWS::S3::Bucket' 92 | Properties: 93 | BucketName: !Sub ${SolutionPrefix}-${AWS::AccountId}-md 94 | PublicAccessBlockConfiguration: 95 | BlockPublicAcls: true 96 | BlockPublicPolicy: true 97 | IgnorePublicAcls: true 98 | RestrictPublicBuckets: true 99 | BucketEncryption: 100 | ServerSideEncryptionConfiguration: 101 | - ServerSideEncryptionByDefault: 102 | SSEAlgorithm: AES256 103 | Metadata: 104 | cfn_nag: 105 | rules_to_suppress: 106 | - id: W35 107 | reason: >- 108 | Configuring logging requires supplying an existing customer S3 109 | bucket to store logs 110 | - id: W51 111 | reason: Default policy works fine 112 | OutputBucket: 113 | Type: 'AWS::S3::Bucket' 114 | Properties: 115 | BucketName: !Sub ${SolutionPrefix}-${AWS::AccountId}-out 116 | PublicAccessBlockConfiguration: 117 | BlockPublicAcls: true 118 | BlockPublicPolicy: true 119 | IgnorePublicAcls: true 120 | RestrictPublicBuckets: true 121 | BucketEncryption: 122 | ServerSideEncryptionConfiguration: 123 | - ServerSideEncryptionByDefault: 124 | SSEAlgorithm: AES256 125 | Metadata: 126 | cfn_nag: 127 | rules_to_suppress: 128 | - id: W35 129 | reason: >- 130 | Configuring logging requires supplying an existing customer S3 131 | bucket to store logs 132 | - id: W51 133 | reason: Default policy works fine 134 | LambdaFunction: 135 | Type: 'AWS::Lambda::Function' 136 | Properties: 137 | Handler: index.lambda_handler 138 | FunctionName: !Sub "${SolutionPrefix}-event-processor" 139 | Role: !If [CreateCustomSolutionRole, !GetAtt SageMakerPermissionsStack.Outputs.SageMakerRoleArn, !Ref IamRole] 140 | Code: 141 | S3Bucket: !Sub 142 | - "${SolutionsS3BucketNamePrefix}-${AWS::Region}" 143 | - SolutionsS3BucketNamePrefix: 144 | Fn::FindInMap: [SolutionsS3BucketName, Ref: StackVersion, Prefix] 145 | S3Key: !Sub 146 | - "${SolutionName}/${LambdaS3Key}" 147 | - LambdaS3Key: 148 | Fn::FindInMap: [Function, FraudDetection, S3Key] 149 | Runtime: python3.8 150 | Environment: 151 | Variables: 152 | SolutionPrefix: !Sub ${SolutionPrefix} 153 | StreamName: !Ref KinesisFirehoseDeliveryStream 154 | Metadata: 155 | cfn_nag: 156 | rules_to_suppress: 157 | - id: W58 158 | reason: >- 159 | The required permissions are provided in the permissions stack. 
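  # Delivery stream used by the Lambda function above: model inputs and predictions
  # are buffered (up to 60 seconds / 100 MB) and written, GZIP-compressed, to the
  # output bucket under the KinesisFirehosePrefix prefix.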
160 | KinesisFirehoseDeliveryStream: 161 | Type: 'AWS::KinesisFirehose::DeliveryStream' 162 | Properties: 163 | DeliveryStreamName: !Sub "${SolutionPrefix}-firehose-stream" 164 | DeliveryStreamType: DirectPut 165 | S3DestinationConfiguration: 166 | BucketARN: !GetAtt 167 | - OutputBucket 168 | - Arn 169 | Prefix: !Ref KinesisFirehosePrefix 170 | BufferingHints: 171 | IntervalInSeconds: 60 172 | SizeInMBs: 100 173 | CompressionFormat: GZIP 174 | EncryptionConfiguration: 175 | NoEncryptionConfig: NoEncryption 176 | RoleARN: !If [CreateCustomSolutionRole, !GetAtt SageMakerPermissionsStack.Outputs.SageMakerRoleArn, !Ref IamRole] 177 | RESTAPIGateway: 178 | Type: 'AWS::ApiGateway::RestApi' 179 | Properties: 180 | Description: >- 181 | A REST API that can be used to invoke the Lambda function that triggers 182 | predictions. 183 | Name: !Sub "${SolutionPrefix}-model-invocation-api" 184 | EndpointConfiguration: 185 | Types: 186 | - REGIONAL 187 | APIGatewayCloudWatchLogGroup: 188 | Type: 'AWS::Logs::LogGroup' 189 | Properties: 190 | LogGroupName: !Join 191 | - / 192 | - - /aws/apigateway/AccessLogs 193 | - !Ref RESTAPIGateway 194 | - prod 195 | RetentionInDays: 365 196 | APIGatewayAccount: 197 | Type: 'AWS::ApiGateway::Account' 198 | Properties: 199 | CloudWatchRoleArn: !If [CreateCustomSolutionRole, !GetAtt SageMakerPermissionsStack.Outputs.SageMakerRoleArn, !Ref IamRole] 200 | DependsOn: 201 | - RESTAPIGateway 202 | LambdaAPIPermission: 203 | Type: 'AWS::Lambda::Permission' 204 | Properties: 205 | Action: 'lambda:InvokeFunction' 206 | FunctionName: !GetAtt 207 | - LambdaFunction 208 | - Arn 209 | Principal: apigateway.amazonaws.com 210 | SourceArn: !Join 211 | - '' 212 | - - 'arn:aws:execute-api:' 213 | - !Ref 'AWS::Region' 214 | - ':' 215 | - !Ref 'AWS::AccountId' 216 | - ':' 217 | - !Ref RESTAPIGateway 218 | - /*/POST/* 219 | RESTInvocationResource: 220 | Type: 'AWS::ApiGateway::Resource' 221 | Properties: 222 | ParentId: !GetAtt 223 | - RESTAPIGateway 224 | - RootResourceId 225 | PathPart: invocations 226 | RestApiId: !Ref RESTAPIGateway 227 | POSTMethod: 228 | Type: 'AWS::ApiGateway::Method' 229 | Properties: 230 | RestApiId: !Ref RESTAPIGateway 231 | ResourceId: !Ref RESTInvocationResource 232 | HttpMethod: POST 233 | AuthorizationType: AWS_IAM 234 | Integration: 235 | Type: AWS 236 | IntegrationHttpMethod: POST 237 | Uri: !Join 238 | - '' 239 | - - 'arn:aws:apigateway:' 240 | - !Ref 'AWS::Region' 241 | - ':lambda:path/2015-03-31/functions/' 242 | - !GetAtt 243 | - LambdaFunction 244 | - Arn 245 | - /invocations 246 | IntegrationResponses: 247 | - ResponseTemplates: 248 | application/json: '' 249 | StatusCode: "200" 250 | - SelectionPattern: ^not found.* 251 | ResponseTemplates: 252 | application/json: '{}' 253 | StatusCode: "404" 254 | PassthroughBehavior: WHEN_NO_TEMPLATES 255 | RequestTemplates: 256 | application/json: >- 257 | {"data": $input.json('$.data'),"metadata": 258 | $input.json('$.metadata'),"model": "$input.params('model')"} 259 | MethodResponses: 260 | - ResponseModels: 261 | application/json: Empty 262 | StatusCode: "200" 263 | - ResponseModels: 264 | application/json: Empty 265 | StatusCode: "404" 266 | RequestParameters: 267 | method.request.querystring.model: false 268 | RestApiDeployment: 269 | Type: 'AWS::ApiGateway::Deployment' 270 | Properties: 271 | StageDescription: 272 | AccessLogSetting: 273 | DestinationArn: !GetAtt 274 | - APIGatewayCloudWatchLogGroup 275 | - Arn 276 | Format: !Join 277 | - ',' 278 | - - '{"requestId":"$context.requestId"' 279 | - '"ip": 
"$context.identity.sourceIp"' 280 | - '"caller":"$context.identity.caller"' 281 | - '"user":"$context.identity.user"' 282 | - '"requestTime":"$context.requestTime"' 283 | - '"httpMethod":"$context.httpMethod"' 284 | - '"resourcePath":"$context.resourcePath"' 285 | - '"status":"$context.status"' 286 | - '"protocol":"$context.protocol"' 287 | - '"responseLength":"$context.responseLength"}' 288 | RestApiId: !Ref RESTAPIGateway 289 | StageName: prod 290 | DependsOn: 291 | - POSTMethod 292 | Metadata: 293 | cfn_nag: 294 | rules_to_suppress: 295 | - id: W68 296 | reason: Resource not associated with an AWS::ApiGateway::UsagePlan for now 297 | SolutionAssistantStack: 298 | Type: "AWS::CloudFormation::Stack" 299 | Properties: 300 | TemplateURL: !Sub 301 | - "https://${SolutionsS3BucketNamePrefix}-${AWS::Region}.s3.${AWS::Region}.amazonaws.com/${SolutionName}/deployment/solution-assistant/solution-assistant.yaml" 302 | - SolutionsS3BucketNamePrefix: 303 | Fn::FindInMap: [SolutionsS3BucketName, Ref: StackVersion, Prefix] 304 | Parameters: 305 | SolutionPrefix: !Ref SolutionPrefix 306 | SolutionName: !Ref SolutionName 307 | StackName: !Ref AWS::StackName 308 | ModelDataBucketName: !Ref ModelDataBucket 309 | OutputBucketName: !Ref OutputBucket 310 | SolutionsS3BucketName: !Sub 311 | - ${SolutionsS3BucketNamePrefix}-${AWS::Region} 312 | - SolutionsS3BucketNamePrefix: 313 | Fn::FindInMap: [SolutionsS3BucketName, Ref: StackVersion, Prefix] 314 | RoleArn: !If [CreateCustomSolutionRole, !GetAtt SageMakerPermissionsStack.Outputs.SageMakerRoleArn, !Ref IamRole] 315 | 316 | SageMakerStack: 317 | Type: "AWS::CloudFormation::Stack" 318 | Condition: CreateSageMakerNotebookInstance 319 | Properties: 320 | TemplateURL: !Sub 321 | - "https://${SolutionsS3BucketNamePrefix}-${AWS::Region}.s3.${AWS::Region}.amazonaws.com/${SolutionName}/deployment/fraud-detection-sagemaker-notebook-instance.yaml" 322 | - SolutionsS3BucketNamePrefix: 323 | Fn::FindInMap: [SolutionsS3BucketName, Ref: StackVersion, Prefix] 324 | Parameters: 325 | SolutionPrefix: !Ref SolutionPrefix 326 | ParentStackName: !Ref AWS::StackName 327 | SolutionName: !Ref SolutionName 328 | StackVersion: !Ref StackVersion 329 | ModelDataBucket: !Ref ModelDataBucket 330 | NotebookInstanceExecutionRoleArn: !If [CreateCustomSolutionRole, !GetAtt SageMakerPermissionsStack.Outputs.SageMakerRoleArn, !Ref IamRole] 331 | RESTAPIGateway: !Ref RESTAPIGateway 332 | TestOutputsS3Bucket: !Ref TestOutputsS3Bucket 333 | 334 | SageMakerPermissionsStack: 335 | Type: "AWS::CloudFormation::Stack" 336 | Condition: CreateCustomSolutionRole 337 | Properties: 338 | TemplateURL: !Sub 339 | - "https://${SolutionsS3BucketNamePrefix}-${AWS::Region}.s3.${AWS::Region}.amazonaws.com/${SolutionName}/deployment/fraud-detection-sagemaker-permissions-stack.yaml" 340 | - SolutionsS3BucketNamePrefix: 341 | !FindInMap [SolutionsS3BucketName, Ref: StackVersion, Prefix] 342 | Parameters: 343 | SolutionPrefix: !Ref SolutionPrefix 344 | ModelDataBucketName: !Ref ModelDataBucket 345 | OutputBucketName: !Ref OutputBucket 346 | RESTAPIGateway: !Ref RESTAPIGateway 347 | StackVersion: !Ref StackVersion 348 | RootStackID: !Ref AWS::StackId 349 | TestOutputsS3Bucket: !Ref TestOutputsS3Bucket 350 | 351 | DemoEndpointStack: 352 | Type: "AWS::CloudFormation::Stack" 353 | Condition: CreateDemoEndpoint 354 | Properties: 355 | TemplateURL: !Sub 356 | - "https://${SolutionsS3BucketNamePrefix}-${AWS::Region}.s3.${AWS::Region}.amazonaws.com/${SolutionName}/deployment/fraud-detection-sagemaker-demo-stack.yaml" 357 | - 
SolutionsS3BucketNamePrefix: 358 | !FindInMap [SolutionsS3BucketName, Ref: StackVersion, Prefix] 359 | Parameters: 360 | SolutionPrefix: !Ref SolutionPrefix 361 | SolutionsBucket: !Sub 362 | - "${SolutionsS3BucketNamePrefix}-${AWS::Region}" 363 | - SolutionsS3BucketNamePrefix: 364 | !FindInMap [SolutionsS3BucketName, Ref: StackVersion, Prefix] 365 | SolutionName: !Ref SolutionName 366 | ExecutionRoleArn: !If [CreateCustomSolutionRole, !GetAtt SageMakerPermissionsStack.Outputs.SageMakerRoleArn, !Ref IamRole] 367 | 368 | Outputs: 369 | SageMakerNotebook: 370 | Condition: CreateSageMakerNotebookInstance 371 | Description: "Opens the Jupyter notebook to get started with model training" 372 | Value: !GetAtt SageMakerStack.Outputs.SageMakerNotebook 373 | FirehoseDeliveryStreamArn: 374 | Description: Firehose Delivery Stream ARN 375 | Value: !GetAtt 376 | - KinesisFirehoseDeliveryStream 377 | - Arn 378 | FraudStackName: 379 | Value: !Ref AWS::StackName 380 | SolutionPrefix: 381 | Value: !Ref SolutionPrefix 382 | AwsAccountId: 383 | Value: !Ref AWS::AccountId 384 | AwsRegion: 385 | Value: !Ref AWS::Region 386 | IamRole: 387 | Value: !If [CreateCustomSolutionRole, !GetAtt SageMakerPermissionsStack.Outputs.SageMakerRoleArn, !Ref IamRole] 388 | ModelDataBucket: 389 | Value: !Ref ModelDataBucket 390 | SolutionsS3Bucket: 391 | Value: !FindInMap [SolutionsS3BucketName, Ref: StackVersion, Prefix] 392 | RESTAPIGateway: 393 | Value: !Ref RESTAPIGateway 394 | SagemakerMode: 395 | Value: !If 396 | - CreateSageMakerNotebookInstance 397 | - "NotebookInstance" 398 | - "Studio" 399 | DemoEndpointName: 400 | Condition: CreateDemoEndpoint 401 | Value: DemoEndpointStack.Outputs.EndpointName 402 | SolutionName: 403 | Value: !Ref SolutionName 404 | TestOutputsS3Bucket: 405 | Value: !Ref TestOutputsS3Bucket 406 | -------------------------------------------------------------------------------- /deployment/solution-assistant/requirements.in: -------------------------------------------------------------------------------- 1 | crhelper 2 | -------------------------------------------------------------------------------- /deployment/solution-assistant/solution-assistant.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Stack for Solution Helper resources. 3 | Parameters: 4 | SolutionPrefix: 5 | Description: Used as a prefix for resources. 6 | Type: String 7 | SolutionName: 8 | Description: The name of the solution, used as an S3 prefix 9 | Type: String 10 | StackName: 11 | Description: | 12 | Used as a prefix to name all stack resources. 13 | Should be root stack's name if this is a nested stack. 14 | Type: String 15 | ModelDataBucketName: 16 | Description: Amazon S3 Bucket used to store trained model and data. 17 | Type: String 18 | OutputBucketName: 19 | Description: Amazon S3 Bucket used to store prediction outputs. 20 | Type: String 21 | SolutionsS3BucketName: 22 | Description: Amazon S3 Bucket containing solutions. 23 | Type: String 24 | RoleArn: 25 | Description: The ARN of the role to be used to clean and delete the resources. 
26 | Type: String 27 | Mappings: 28 | Function: 29 | SolutionAssistant: 30 | S3Key: "build/solution_assistant.zip" 31 | Resources: 32 | SolutionAssistant: 33 | Type: "Custom::SolutionAssistant" 34 | Properties: 35 | SolutionPrefix: !Ref SolutionPrefix 36 | SolutionName: !Ref SolutionName 37 | ServiceToken: !GetAtt SolutionAssistantLambda.Arn 38 | StackName: !Ref StackName 39 | ModelDataBucketName: !Ref ModelDataBucketName 40 | OutputBucketName: !Ref OutputBucketName 41 | SolutionsS3BucketName: !Ref SolutionsS3BucketName 42 | RoleArn: !Ref RoleArn 43 | SolutionAssistantLambda: 44 | Type: AWS::Lambda::Function 45 | Properties: 46 | Handler: "lambda_function.handler" 47 | FunctionName: !Sub "${SolutionPrefix}-solution-assistant" 48 | Role: !Ref RoleArn 49 | Runtime: "python3.8" 50 | Code: 51 | S3Bucket: !Ref SolutionsS3BucketName 52 | S3Key: !Sub 53 | - "${SolutionName}/${LambdaS3Key}" 54 | - LambdaS3Key: 55 | Fn::FindInMap: [Function, SolutionAssistant, S3Key] 56 | Timeout : 60 57 | Metadata: 58 | cfn_nag: 59 | rules_to_suppress: 60 | - id: W58 61 | reason: >- 62 | The required permissions are provided in the permissions stack. 63 | -------------------------------------------------------------------------------- /deployment/solution-assistant/src/lambda_function.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import sys 3 | import time 4 | 5 | sys.path.append('./site-packages') 6 | from crhelper import CfnResource 7 | 8 | helper = CfnResource() 9 | 10 | 11 | @helper.create 12 | def on_create(_, __): 13 | pass 14 | 15 | @helper.update 16 | def on_update(_, __): 17 | pass 18 | 19 | 20 | def delete_sagemaker_endpoint(endpoint_name): 21 | sagemaker_client = boto3.client("sagemaker") 22 | try: 23 | sagemaker_client.delete_endpoint(EndpointName=endpoint_name) 24 | print( 25 | "Successfully deleted endpoint " 26 | "called '{}'.".format(endpoint_name) 27 | ) 28 | except sagemaker_client.exceptions.ClientError as e: 29 | if "Could not find endpoint" in str(e): 30 | print( 31 | "Could not find endpoint called '{}'. " 32 | "Skipping delete.".format(endpoint_name) 33 | ) 34 | else: 35 | raise e 36 | 37 | 38 | def delete_sagemaker_endpoint_config(endpoint_config_name): 39 | sagemaker_client = boto3.client("sagemaker") 40 | try: 41 | sagemaker_client.delete_endpoint_config( 42 | EndpointConfigName=endpoint_config_name 43 | ) 44 | print( 45 | "Successfully deleted endpoint configuration " 46 | "called '{}'.".format(endpoint_config_name) 47 | ) 48 | except sagemaker_client.exceptions.ClientError as e: 49 | if "Could not find endpoint configuration" in str(e): 50 | print( 51 | "Could not find endpoint configuration called '{}'. " 52 | "Skipping delete.".format(endpoint_config_name) 53 | ) 54 | else: 55 | raise e 56 | 57 | 58 | def delete_sagemaker_model(model_name): 59 | sagemaker_client = boto3.client("sagemaker") 60 | try: 61 | sagemaker_client.delete_model(ModelName=model_name) 62 | print("Successfully deleted model called '{}'.".format(model_name)) 63 | except sagemaker_client.exceptions.ClientError as e: 64 | if "Could not find model" in str(e): 65 | print( 66 | "Could not find model called '{}'. 
" 67 | "Skipping delete.".format(model_name) 68 | ) 69 | else: 70 | raise e 71 | 72 | 73 | def delete_s3_objects(bucket_name): 74 | s3_resource = boto3.resource("s3") 75 | try: 76 | s3_resource.Bucket(bucket_name).objects.all().delete() 77 | print( 78 | "Successfully deleted objects in bucket " 79 | "called '{}'.".format(bucket_name) 80 | ) 81 | except s3_resource.meta.client.exceptions.NoSuchBucket: 82 | print( 83 | "Could not find bucket called '{}'. " 84 | "Skipping delete.".format(bucket_name) 85 | ) 86 | 87 | 88 | def delete_s3_bucket(bucket_name): 89 | s3_resource = boto3.resource("s3") 90 | try: 91 | s3_resource.Bucket(bucket_name).delete() 92 | print( 93 | "Successfully deleted bucket " 94 | "called '{}'.".format(bucket_name) 95 | ) 96 | except s3_resource.meta.client.exceptions.NoSuchBucket: 97 | print( 98 | "Could not find bucket called '{}'. " 99 | "Skipping delete.".format(bucket_name) 100 | ) 101 | 102 | 103 | def bucket_delete_retry(bucket_name): 104 | # Try to empty the bucket then delete the model-data bucket 5 times 105 | # This is needed because the thread we open 106 | s3_client = boto3.client("s3") 107 | for _ in range(5): 108 | delete_s3_objects(bucket_name) 109 | delete_s3_bucket(bucket_name) 110 | 111 | # Give the delete op time to finish 112 | time.sleep(10) 113 | 114 | try: 115 | _ = s3_client.head_bucket(Bucket=bucket_name) 116 | except s3_client.exceptions.ClientError: 117 | break # This is good, the bucket was deleted, so we just exit the loop 118 | 119 | # Otherwise wait a minute and try again 120 | time.sleep(60) 121 | 122 | 123 | @helper.delete 124 | def on_delete(event, __): 125 | # remove sagemaker endpoints 126 | solution_prefix = event["ResourceProperties"]["SolutionPrefix"] 127 | endpoint_names = [ 128 | "{}-rcf".format(solution_prefix), 129 | "{}-xgb".format(solution_prefix), 130 | "{}-xgb-smote".format(solution_prefix) 131 | ] 132 | for endpoint_name in endpoint_names: 133 | delete_sagemaker_model(endpoint_name) 134 | delete_sagemaker_endpoint_config(endpoint_name) 135 | delete_sagemaker_endpoint(endpoint_name) 136 | 137 | # delete buckets 138 | model_data_bucket = event["ResourceProperties"]["ModelDataBucketName"] 139 | output_bucket = event["ResourceProperties"]["OutputBucketName"] 140 | bucket_delete_retry(model_data_bucket) 141 | bucket_delete_retry(output_bucket) 142 | 143 | 144 | def handler(event, context): 145 | helper(event, context) 146 | -------------------------------------------------------------------------------- /source/env_setup.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | from pathlib import Path 4 | import subprocess 5 | import logging 6 | import sys 7 | 8 | CURRENT_FILE = Path(__file__).resolve() 9 | CURRENT_FOLDER = CURRENT_FILE.parent 10 | LOGBOOK_FILE = Path(CURRENT_FOLDER, 'env_setup_logbook.json') 11 | ROOT_PATHS = {'NotebookInstance': Path('/home/ec2-user'), 'Studio': Path('/root')} 12 | BIN_PATHS = {'NotebookInstance': Path('/usr/bin'), 'Studio': Path('/opt/conda/bin')} 13 | 14 | 15 | # Common setup 16 | 17 | def get_sagemaker_mode() -> str: 18 | stack_outputs_file = Path(CURRENT_FOLDER, 'stack_outputs.json') 19 | with open(stack_outputs_file) as f: 20 | outputs = json.load(f) 21 | sagemaker_mode = outputs['SagemakerMode'] 22 | if sagemaker_mode not in set(['Studio', 'NotebookInstance']): 23 | raise ValueError('SagemakerMode should be Studio or NotebookInstance. 
Check stack_outputs.json.') 24 | return sagemaker_mode 25 | 26 | 27 | def get_executable() -> str: 28 | return sys.executable 29 | 30 | 31 | def get_hostname() -> str: 32 | hostname_file = Path('/etc/hostname') 33 | if hostname_file.is_file(): 34 | with open(hostname_file, 'r') as f: 35 | contents = f.readlines() 36 | assert len(contents) == 1 37 | hostname = contents[0].strip() 38 | else: 39 | logging.warning(f'Could not find {hostname_file}. Setting hostname to None.') 40 | hostname = None 41 | return hostname 42 | 43 | 44 | def parse_args() -> argparse.Namespace: 45 | parser = argparse.ArgumentParser(description='Setup environment for solution.') 46 | parser.add_argument('--force', action='store_true',) 47 | parser.add_argument('--log-level', type=str, default='INFO') 48 | args = parser.parse_args() 49 | return args 50 | 51 | 52 | def read_file(file: str) -> str: 53 | with open(file, 'r') as f: 54 | return f.read() 55 | 56 | 57 | def bash(cmd: str) -> subprocess.CompletedProcess: 58 | try: 59 | if logging.root.level > logging.DEBUG: 60 | stdout = subprocess.PIPE 61 | stderr = subprocess.PIPE 62 | else: 63 | stdout = sys.stdout 64 | stderr = sys.stderr 65 | process = subprocess.run( 66 | "set -e" + '\n' + cmd, 67 | shell=True, 68 | check=True, 69 | universal_newlines=True, # same as text=True but support py3.6 too 70 | stdout=stdout, 71 | stderr=stderr 72 | ) 73 | except subprocess.CalledProcessError as e: 74 | if logging.root.level > logging.DEBUG: 75 | logging.error('\n' + e.stderr) 76 | raise e 77 | return process 78 | 79 | 80 | def logging_setup(level: str) -> None: 81 | level = logging.getLevelName(level) 82 | logging.basicConfig(stream=sys.stdout, level=level) 83 | 84 | 85 | def env_setup() -> None: 86 | args = parse_args() 87 | logging_setup(args.log_level) 88 | sagemaker_mode = get_sagemaker_mode() 89 | if sagemaker_mode == 'Studio': 90 | hostname = get_hostname() 91 | logging.debug(f'hostname: {hostname}') 92 | executable = get_executable() 93 | logging.debug(f'executable: {executable}') 94 | if args.force or not in_logbook(hostname, executable): 95 | env_setup_studio() 96 | logging.info('Successfully setup environment.') 97 | add_to_logbook(hostname, executable) 98 | else: 99 | logging.info('Skipping. Already setup environment.') 100 | if sagemaker_mode == 'NotebookInstance': 101 | if args.force: 102 | env_setup_notebook_instance() 103 | logging.info('Successfully setup environment.') 104 | else: 105 | logging.info('Skipping. 
Already setup environment.') 106 | 107 | 108 | def in_logbook(hostname: str, executable: str) -> bool: 109 | if LOGBOOK_FILE.is_file(): 110 | with open(LOGBOOK_FILE, 'r') as f: 111 | logbook = json.load(f) 112 | for entry in logbook: 113 | if (entry['hostname'] == hostname) and (entry['executable'] == executable): 114 | return True 115 | logging.debug('Could not find a matching entry in logbook.') 116 | return False 117 | else: 118 | logging.debug(f'Could not find logbook at {LOGBOOK_FILE}.') 119 | return False 120 | 121 | 122 | def add_to_logbook(hostname: str, executable: str) -> None: 123 | if (hostname is None) or (executable is None): 124 | logging.warn('Could not add to logbook because either hostname or executable is empty.') 125 | else: 126 | new_entry = {'hostname': hostname, 'executable': executable} 127 | if LOGBOOK_FILE.is_file(): 128 | with open(LOGBOOK_FILE, 'r') as f: 129 | logbook = json.load(f) 130 | else: 131 | logbook = [] 132 | for entry in logbook: 133 | if (entry['hostname'] == hostname) and (entry['executable'] == executable): 134 | return # don't need to add since already in logbook 135 | logbook.append(new_entry) 136 | with open(LOGBOOK_FILE, 'w') as f: 137 | json.dump(logbook, f) 138 | 139 | 140 | # Solution specific setup 141 | 142 | def env_setup_notebook_instance() -> None: 143 | logging.info('Starting environment setup for Notebook Instance.') 144 | py_exec = get_executable() 145 | 146 | logging.info('Uninstalling Python packages installed with distutils.') 147 | bash(""" 148 | # fix to upgrade `docutils` that was installed with `distutils` (hence pip can't uninstall) 149 | rm -rf /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/docutils 150 | rm -rf /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/docutils-* 151 | """) 152 | 153 | # This is due to papermill->black causing an inconsistency 154 | logging.info("Removing incompatible package") 155 | bash(f""" 156 | export PIP_DISABLE_PIP_VERSION_CHECK=1 157 | {py_exec} -m pip uninstall -y enum34 158 | """) 159 | 160 | logging.info('Upgrading pip packages.') 161 | bash(f""" 162 | export PIP_DISABLE_PIP_VERSION_CHECK=1 163 | {py_exec} -m pip install --upgrade pyyaml --ignore-installed 164 | """) 165 | 166 | logging.info('Installing pip packages.') 167 | bash(f""" 168 | export PIP_DISABLE_PIP_VERSION_CHECK=1 169 | {py_exec} -m pip install -r {CURRENT_FOLDER}/notebooks/requirements.txt 170 | {py_exec} -m pip install -e {CURRENT_FOLDER}/notebooks/ 171 | """) 172 | 173 | # This required for us to run papermill on a different env from where it was installed 174 | # logging.info("Update nb_conda_kernels") 175 | # # nohup because conda env solving is slow 176 | # bash(""" 177 | # nohup conda update -n python3 nb_conda_kernels -y & 178 | # """) 179 | 180 | 181 | def env_setup_studio() -> None: 182 | logging.info('Starting environment setup for Studio.') 183 | py_exec = get_executable() 184 | 185 | logging.info('Installing local packages.') 186 | bash(f""" 187 | export PIP_DISABLE_PIP_VERSION_CHECK=1 188 | {py_exec} -m pip install -e {CURRENT_FOLDER}/notebooks/ 189 | """) 190 | 191 | logging.info('Completed environment setup for Studio.') 192 | 193 | 194 | if __name__ == "__main__": 195 | env_setup() 196 | -------------------------------------------------------------------------------- /source/lambda/model-invocation/index.py: -------------------------------------------------------------------------------- 1 | ############################################################################## 2 
| # Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. # 3 | # # 4 | # Licensed under the Amazon Software License (the "License"). You may not # 5 | # use this file except in compliance with the License. A copy of the # 6 | # License is located at # 7 | # # 8 | # http://aws.amazon.com/asl/ # 9 | # # 10 | # or in the "license" file accompanying this file. This file is distributed # 11 | # on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, # 12 | # express or implied. See the License for the specific language governing # 13 | # permissions and limitations under the License. # 14 | ############################################################################## 15 | import json 16 | import os 17 | import logging 18 | 19 | import boto3 20 | 21 | logger = logging.getLogger() 22 | logger.setLevel(logging.INFO) 23 | 24 | STREAM_NAME = os.environ['StreamName'] 25 | SOLUTION_PREFIX = os.environ['SolutionPrefix'] 26 | 27 | 28 | def lambda_handler(event, context): 29 | logger.info(event) 30 | metadata = event.get('metadata', None) 31 | assert metadata, "Request did not include metadata!" 32 | data_payload = event.get('data', None) 33 | assert data_payload, "Payload did not include a data field!" 34 | model_choice = event.get('model', None) 35 | valid_models = {'anomaly_detector', 'fraud_classifier'} 36 | if model_choice: 37 | assert model_choice in valid_models, "The requested model, {}, was not a valid model name {}".format(model_choice, valid_models) 38 | models = {model_choice} if model_choice else valid_models 39 | 40 | output = {} 41 | if 'anomaly_detector' in models: 42 | output["anomaly_detector"] = get_anomaly_prediction(data_payload) 43 | 44 | if 'fraud_classifier' in models: 45 | output["fraud_classifier"] = get_fraud_prediction(data_payload) 46 | 47 | store_data_prediction(output, metadata) 48 | return output 49 | 50 | 51 | def get_anomaly_prediction(data): 52 | sagemaker_endpoint_name = "{}-rcf".format(SOLUTION_PREFIX) 53 | sagemaker_runtime = boto3.client('sagemaker-runtime') 54 | response = sagemaker_runtime.invoke_endpoint( 55 | EndpointName=sagemaker_endpoint_name, ContentType='text/csv', Body=data) 56 | # Extract anomaly score from the endpoint response 57 | anomaly_score = json.loads(response['Body'].read().decode())["scores"][0]["score"] 58 | logger.info("anomaly score: {}".format(anomaly_score)) 59 | 60 | return {"score": anomaly_score} 61 | 62 | 63 | def get_fraud_prediction(data, threshold=0.5): 64 | sagemaker_endpoint_name = "{}-xgb".format(SOLUTION_PREFIX) 65 | sagemaker_runtime = boto3.client('sagemaker-runtime') 66 | response = sagemaker_runtime.invoke_endpoint( 67 | EndpointName=sagemaker_endpoint_name, ContentType='text/csv',Body=data) 68 | pred_proba = json.loads(response['Body'].read().decode()) 69 | prediction = 0 if pred_proba < threshold else 1 70 | 71 | logger.info("classification pred_proba: {}, prediction: {}".format(pred_proba, prediction)) 72 | 73 | return {"pred_proba": pred_proba, "prediction": prediction} 74 | 75 | 76 | def store_data_prediction(output_dict, metadata): 77 | firehose_delivery_stream = STREAM_NAME 78 | firehose = boto3.client('firehose', region_name=os.environ['AWS_REGION']) 79 | 80 | # Extract anomaly score and classifier prediction, if they exist 81 | fraud_pred = output_dict["fraud_classifier"]["prediction"] if 'fraud_classifier' in output_dict else "" 82 | anomaly_score = output_dict["anomaly_detector"]["score"] if 'anomaly_detector' in output_dict else "" 83 | 84 | record = ','.join(metadata + 
[str(fraud_pred), str(anomaly_score)]) + '\n' 85 | 86 | response = firehose.put_record( 87 | DeliveryStreamName=firehose_delivery_stream, Record={'Data': record}) 88 | if response.get('RecordId'): 89 | logger.info("Record logged: {}".format(record)) 90 | else: 91 | logger.warning("Record delivery failed for record: {}".format(record)) 92 | -------------------------------------------------------------------------------- /source/notebooks/endpoint_demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "source": [ 5 | "In this notebook you can get a quick preview of what the outcome will be when you complete the full notebook for this solution.\n", 6 | "\n", 7 | "Here we are using a pre-trained XGBoost model to make predictions for our test dataset, and evaluate its accuracy.\n", 8 | "\n", 9 | "You can select Run->Run All from the menu to run all cells in Studio (or Cell->Run All in a SageMaker Notebook Instance)." 10 | ], 11 | "cell_type": "markdown", 12 | "metadata": {} 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import sys\n", 21 | "sys.path.append('./src/')\n", 22 | "from package import config" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Read in the data" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import boto3\n", 39 | "from zipfile import ZipFile\n", 40 | "\n", 41 | "s3 = boto3.resource('s3')\n", 42 | "object = s3.Object(f\"{config.SOLUTIONS_S3_BUCKET}-{config.AWS_REGION}\",f\"{config.SOLUTION_NAME}/data/creditcardfraud.zip\")\n", 43 | "object.download_file(\"creditcardfraud.zip\")\n", 44 | "\n", 45 | "with ZipFile('creditcardfraud.zip', 'r') as zf:\n", 46 | " zf.extractall()" 47 | ] 48 | }, 49 | { 50 | "source": [ 51 | "## Split into train/test" 52 | ], 53 | "cell_type": "markdown", 54 | "metadata": {} 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": null, 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "import numpy as np \n", 63 | "import pandas as pd\n", 64 | "\n", 65 | "data = pd.read_csv('creditcard.csv', delimiter=',')\n", 66 | "\n", 67 | "feature_columns = data.columns[:-1]\n", 68 | "label_column = data.columns[-1]\n", 69 | "\n", 70 | "features = data[feature_columns].values.astype('float32')\n", 71 | "labels = (data[label_column].values).astype('float32')\n", 72 | "\n", 73 | "from sklearn.model_selection import train_test_split\n", 74 | "\n", 75 | "X_train, X_test, y_train, y_test = train_test_split(\n", 76 | " features, labels, test_size=0.1, random_state=42)" 77 | ] 78 | }, 79 | { 80 | "cell_type": "markdown", 81 | "metadata": {}, 82 | "source": [ 83 | "## Set up a predictor, using the demo endpoint, and a pre-trained model" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "from sagemaker.predictor import csv_serializer, RealTimePredictor\n", 93 | "\n", 94 | "xgb_predictor = RealTimePredictor(endpoint=\"{}-demo\".format(config.SOLUTION_PREFIX),\n", 95 | "                                  serializer=csv_serializer,\n", 96 | "                                  deserializer=None,\n", 97 | "                                  content_type='text/csv')" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "# Because we have a large test set, we call predict on smaller batches\n",
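"# Each batch is serialized to CSV by the predictor, the endpoint returns a comma-separated\n",
"# string of scores per batch, and the per-batch outputs are concatenated and parsed back\n",
"# into a single NumPy array.\n",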
"def predict(current_predictor, data, rows=500):\n", 108 | " split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))\n", 109 | " predictions = ''\n", 110 | " for array in split_array:\n", 111 | " predictions = ','.join([predictions, current_predictor.predict(array).decode('utf-8')])\n", 112 | "\n", 113 | " return np.fromstring(predictions[1:], sep=',')" 114 | ] 115 | }, 116 | { 117 | "cell_type": "markdown", 118 | "metadata": {}, 119 | "source": [ 120 | "## Make predictions and evaluate accuracy" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": null, 126 | "metadata": {}, 127 | "outputs": [], 128 | "source": [ 129 | "raw_preds = predict(xgb_predictor, X_test)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": null, 135 | "metadata": {}, 136 | "outputs": [], 137 | "source": [ 138 | "from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score\n", 139 | "\n", 140 | "# scikit-learn expects 0/1 predictions, so we threshold our raw predictions\n", 141 | "y_preds = np.where(raw_preds > 0.5, 1, 0)\n", 142 | "print(\"Balanced accuracy = {}\".format(balanced_accuracy_score(y_test, y_preds)))\n", 143 | "print(\"Cohen's Kappa = {}\".format(cohen_kappa_score(y_test, y_preds)))" 144 | ] 145 | } 146 | ], 147 | "metadata": { 148 | "kernelspec": { 149 | "display_name": "Python 3 (Data Science JumpStart)", 150 | "language": "python", 151 | "name": "HUB_1P_IMAGE" 152 | } 153 | }, 154 | "nbformat": 4, 155 | "nbformat_minor": 4 156 | } -------------------------------------------------------------------------------- /source/notebooks/requirements.in: -------------------------------------------------------------------------------- 1 | imbalanced-learn 2 | aws_requests_auth 3 | matplotlib 4 | scikit-learn 5 | pandas 6 | papermill 7 | sagemaker==1.72.0 8 | traitlets==4.3.3 9 | boto3 10 | seaborn 11 | awscli 12 | watchtower 13 | -------------------------------------------------------------------------------- /source/notebooks/sagemaker_fraud_detection.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Credit card fraud detector" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": {}, 13 | "source": [ 14 | "In this solution we will build the core of a credit card fraud detection system using SageMaker. We will start by training an anomaly detection algorithm, then proceed to train two XGBoost models for supervised training. To deal with the highly unbalanced data common in fraud detection, our first model will use re-weighting of the data, and the second will use re-sampling, using the popular SMOTE technique for oversampling the rare fraud data.\n", 15 | "\n", 16 | "Our solution includes an example of making calls to a REST API to simulate a real deployment, using AWS Lambda to trigger both the anomaly detection and XGBoost model.\n", 17 | "\n", 18 | "You can select Run->Run All from the menu to run all cells in Studio (or Cell->Run All in a SageMaker Notebook Instance)." 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "**Note**: When running this notebook on SageMaker Studio, you should make sure the 'SageMaker JumpStart Data Science 1.0' image/kernel is used." 
26 | ] 27 | }, 28 | { 29 | "cell_type": "markdown", 30 | "metadata": {}, 31 | "source": [ 32 | "### Set up environment" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": null, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "import sys\n", 42 | "sys.path.insert(0, './src/')" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## Investigate and process the data" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "Let's start by reading in the credit card fraud data set." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "import boto3\n", 66 | "from package import config\n", 67 | "\n", 68 | "instance_type = 'ml.m5.large'\n", 69 | "s3 = boto3.resource('s3', region_name=config.AWS_REGION)\n", 70 | "object = s3.Object(f\"{config.SOLUTIONS_S3_BUCKET}-{config.AWS_REGION}\",f\"{config.SOLUTION_NAME}/data/creditcardfraud.zip\")\n", 71 | "object.download_file(\"creditcardfraud.zip\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": null, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "from zipfile import ZipFile\n", 81 | "\n", 82 | "with ZipFile('creditcardfraud.zip', 'r') as zf:\n", 83 | " zf.extractall()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "import numpy as np \n", 93 | "import pandas as pd\n", 94 | "\n", 95 | "data = pd.read_csv('creditcard.csv', delimiter=',')" 96 | ] 97 | }, 98 | { 99 | "cell_type": "markdown", 100 | "metadata": {}, 101 | "source": [ 102 | "Let's take a peek at our data (we only show a subset of the columns in the table):" 103 | ] 104 | }, 105 | { 106 | "cell_type": "code", 107 | "execution_count": null, 108 | "metadata": {}, 109 | "outputs": [], 110 | "source": [ 111 | "print(data.columns)\n", 112 | "data[['Time', 'V1', 'V2', 'V27', 'V28', 'Amount', 'Class']].describe()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "markdown", 117 | "metadata": {}, 118 | "source": [ 119 | "The dataset contains\n", 120 | "only numerical features, because the original features have been transformed using PCA, to protect user privacy. As a result,\n", 121 | "the dataset contains 28 PCA components, V1-V28, and two features that haven't been transformed, _Amount_ and _Time_.\n", 122 | "_Amount_ refers to the transaction amount, and _Time_ is the seconds elapsed between any transaction in the data\n", 123 | "and the first transaction.\n", 124 | "\n", 125 | "The class column corresponds to whether or not a transaction is fraudulent. We see that the majority of data is non-fraudulent with only $492$ ($0.173\\%$) of the data corresponding to fraudulent examples, out of the total of 284,807 examples in the data." 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": null, 131 | "metadata": {}, 132 | "outputs": [], 133 | "source": [ 134 | "nonfrauds, frauds = data.groupby('Class').size()\n", 135 | "print('Number of frauds: ', frauds)\n", 136 | "print('Number of non-frauds: ', nonfrauds)\n", 137 | "print('Percentage of fradulent data:', 100.*frauds/(frauds + nonfrauds))" 138 | ] 139 | }, 140 | { 141 | "cell_type": "markdown", 142 | "metadata": {}, 143 | "source": [ 144 | "We already know that the columns $V_i$ have been normalized to have $0$ mean and unit standard deviation as the result of a PCA." 
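,
"\n",
"\n",
"As a quick, optional check of the above, you could inspect the per-column statistics of the transformed features, for example:\n",
"\n",
"```python\n",
"# summary statistics (mean and standard deviation) of the PCA components V1-V28\n",
"data[[f'V{i}' for i in range(1, 29)]].describe().loc[['mean', 'std']]\n",
"```"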
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [ 153 | "feature_columns = data.columns[:-1]\n", 154 | "label_column = data.columns[-1]\n", 155 | "\n", 156 | "features = data[feature_columns].values.astype('float32')\n", 157 | "labels = (data[label_column].values).astype('float32')" 158 | ] 159 | }, 160 | { 161 | "cell_type": "markdown", 162 | "metadata": {}, 163 | "source": [ 164 | "Next, we will prepare our data for loading and training." 165 | ] 166 | }, 167 | { 168 | "cell_type": "markdown", 169 | "metadata": {}, 170 | "source": [ 171 | "## Training" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "We will split our dataset into a train and test to evaluate the performance of our models. It's important to do so _before_ any techniques meant to alleviate the class imbalance are used. This ensures that we don't leak information from the test set into the train set." 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": null, 184 | "metadata": {}, 185 | "outputs": [], 186 | "source": [ 187 | "from sklearn.model_selection import train_test_split\n", 188 | "\n", 189 | "X_train, X_test, y_train, y_test = train_test_split(\n", 190 | " features, labels, test_size=0.1, random_state=42)" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": {}, 196 | "source": [ 197 | "> Note: If you are bringing your own data to this solution and they include categorical data, that have strings as values, you'd need to one-hot encode these values first using for example sklearn's [OneHotEncoder](https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-categorical-features), as XGBoost only supports numerical data." 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": {}, 203 | "source": [ 204 | "## Unsupervised Learning" 205 | ] 206 | }, 207 | { 208 | "cell_type": "markdown", 209 | "metadata": {}, 210 | "source": [ 211 | "In a fraud detection scenario, commonly we will have very few labeled examples, and it's possible that labeling fraud takes a very long time. We would like then to extract information from the unlabeled data we have at hand as well. _Anomaly detection_ is a form of unsupervised learning where we try to identify anomalous examples based solely on their feature characteristics. Random Cut Forest is a state-of-the-art anomaly detection algorithm that is both accurate and scalable. We will train such a model on our training data and evaluate its performance on our test set." 
212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": null, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "import os\n", 221 | "import sagemaker\n", 222 | "from package import config\n", 223 | "\n", 224 | "session = sagemaker.Session()\n", 225 | "bucket = config.MODEL_DATA_S3_BUCKET\n", 226 | "prefix = 'fraud-classifier'" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": {}, 233 | "outputs": [], 234 | "source": [ 235 | "from sagemaker import RandomCutForest\n", 236 | "\n", 237 | "# specify general training job information\n", 238 | "rcf = RandomCutForest(role=config.SAGEMAKER_IAM_ROLE,\n", 239 | " instance_count=1,\n", 240 | " instance_type=instance_type,\n", 241 | " data_location='s3://{}/{}/'.format(bucket, prefix),\n", 242 | " output_path='s3://{}/{}/output'.format(bucket, prefix),\n", 243 | " base_job_name=\"{}-rcf\".format(config.SOLUTION_PREFIX),\n", 244 | " num_samples_per_tree=512,\n", 245 | " num_trees=50)" 246 | ] 247 | }, 248 | { 249 | "cell_type": "markdown", 250 | "metadata": { 251 | "pycharm": { 252 | "name": "#%% md\n" 253 | } 254 | }, 255 | "source": [ 256 | "Now we are ready to fit the model. The below cell should take around 5 minutes to complete." 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "pycharm": { 264 | "name": "#%%\n" 265 | } 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "rcf.fit(rcf.record_set(X_train))" 270 | ] 271 | }, 272 | { 273 | "cell_type": "markdown", 274 | "metadata": {}, 275 | "source": [ 276 | "### Host Random Cut Forest" 277 | ] 278 | }, 279 | { 280 | "cell_type": "markdown", 281 | "metadata": { 282 | "pycharm": { 283 | "name": "#%% md\n" 284 | } 285 | }, 286 | "source": [ 287 | "Once we have a trained model we can deploy it and get some predictions for our test set. SageMaker will spin up an instance for us and deploy the model, the whole process should take around 10 minutes, you will see progress being made with each `-` and an exclamation point when the process is finished." 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": null, 293 | "metadata": {}, 294 | "outputs": [], 295 | "source": [ 296 | "rcf_predictor = rcf.deploy(\n", 297 | " model_name=\"{}-rcf\".format(config.SOLUTION_PREFIX),\n", 298 | " endpoint_name=\"{}-rcf\".format(config.SOLUTION_PREFIX),\n", 299 | " initial_instance_count=1,\n", 300 | " instance_type=instance_type)" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "pycharm": { 308 | "name": "#%%\n" 309 | } 310 | }, 311 | "outputs": [], 312 | "source": [ 313 | "from sagemaker.serializers import CSVSerializer\n", 314 | "from sagemaker.deserializers import JSONDeserializer\n", 315 | " \n", 316 | "rcf_predictor.content_type = 'text/csv'\n", 317 | "rcf_predictor.serializer = CSVSerializer()\n", 318 | "rcf_predictor.accept = 'application/json'\n", 319 | "rcf_predictor.deserializer = JSONDeserializer()" 320 | ] 321 | }, 322 | { 323 | "cell_type": "markdown", 324 | "metadata": {}, 325 | "source": [ 326 | "### Test Random Cut Forest" 327 | ] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "pycharm": { 333 | "name": "#%% md\n" 334 | } 335 | }, 336 | "source": [ 337 | "With the model deployed, let's see how it performs in terms of separating fraudulent from legitimate transactions." 
338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": null, 343 | "metadata": {}, 344 | "outputs": [], 345 | "source": [ 346 | "def predict_rcf(current_predictor, data, rows=500):\n", 347 | " split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))\n", 348 | " predictions = []\n", 349 | " for array in split_array:\n", 350 | " array_preds = [s['score'] for s in current_predictor.predict(array)['scores']]\n", 351 | " predictions.append(array_preds)\n", 352 | "\n", 353 | " return np.concatenate([np.array(batch) for batch in predictions])" 354 | ] 355 | }, 356 | { 357 | "cell_type": "code", 358 | "execution_count": null, 359 | "metadata": {}, 360 | "outputs": [], 361 | "source": [ 362 | "positives = X_test[y_test == 1]\n", 363 | "positives_scores = predict_rcf(rcf_predictor, positives)\n", 364 | "\n", 365 | "negatives = X_test[y_test == 0]\n", 366 | "negatives_scores = predict_rcf(rcf_predictor, negatives)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": {}, 373 | "outputs": [], 374 | "source": [ 375 | "import seaborn as sns\n", 376 | "import matplotlib.pyplot as plt\n", 377 | "sns.set(color_codes=True)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": { 384 | "pycharm": { 385 | "name": "#%%\n" 386 | } 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "import matplotlib.pyplot as plt\n", 391 | "import seaborn as sns\n", 392 | "\n", 393 | "fig, ax = plt.subplots(figsize=(8, 6))\n", 394 | "sns.histplot(positives_scores, label='fraud', bins=20, ax=ax)\n", 395 | "sns.histplot(negatives_scores, label='not-fraud', bins=20, ax=ax)\n", 396 | "ax.legend()" 397 | ] 398 | }, 399 | { 400 | "cell_type": "markdown", 401 | "metadata": {}, 402 | "source": [ 403 | "The unsupervised model already can achieve some separation between the classes, with higher anomaly scores being correlated to fraud." 404 | ] 405 | }, 406 | { 407 | "cell_type": "markdown", 408 | "metadata": {}, 409 | "source": [ 410 | "## Supervised Learning" 411 | ] 412 | }, 413 | { 414 | "cell_type": "markdown", 415 | "metadata": {}, 416 | "source": [ 417 | "Once we have gathered an adequate amount of labeled training data, we can use a supervised learning algorithm that discovers relationships between the features and the dependent class.\n", 418 | "\n", 419 | "We will use Gradient Boosted Trees as our model, as they have a proven track record, are highly scalable and can deal with missing data, reducing the need to pre-process datasets." 420 | ] 421 | }, 422 | { 423 | "cell_type": "markdown", 424 | "metadata": {}, 425 | "source": [ 426 | "### Prepare Data and Upload to S3" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "pycharm": { 433 | "name": "#%% md\n" 434 | } 435 | }, 436 | "source": [ 437 | "First we copy the data to an in-memory buffer." 
438 | ] 439 | }, 440 | { 441 | "cell_type": "code", 442 | "execution_count": null, 443 | "metadata": { 444 | "pycharm": { 445 | "name": "#%%\n" 446 | } 447 | }, 448 | "outputs": [], 449 | "source": [ 450 | "import io\n", 451 | "import sklearn\n", 452 | "from sklearn.datasets import dump_svmlight_file \n", 453 | "\n", 454 | "buf = io.BytesIO()\n", 455 | "\n", 456 | "sklearn.datasets.dump_svmlight_file(X_train, y_train, buf)\n", 457 | "buf.seek(0);" 458 | ] 459 | }, 460 | { 461 | "cell_type": "markdown", 462 | "metadata": { 463 | "pycharm": { 464 | "name": "#%% md\n" 465 | } 466 | }, 467 | "source": [ 468 | "Now we upload the data to S3 using boto3." 469 | ] 470 | }, 471 | { 472 | "cell_type": "code", 473 | "execution_count": null, 474 | "metadata": { 475 | "pycharm": { 476 | "name": "#%%\n" 477 | } 478 | }, 479 | "outputs": [], 480 | "source": [ 481 | "key = 'fraud-dataset'\n", 482 | "subdir = 'base'\n", 483 | "boto3.resource('s3', region_name=config.AWS_REGION).Bucket(bucket).Object(os.path.join(prefix, 'train', subdir, key)).upload_fileobj(buf)\n", 484 | "\n", 485 | "s3_train_data = 's3://{}/{}/train/{}/{}'.format(bucket, prefix, subdir, key)\n", 486 | "print('Uploaded training data location: {}'.format(s3_train_data))\n", 487 | "\n", 488 | "output_location = 's3://{}/{}/output'.format(bucket, prefix)\n", 489 | "print('Training artifacts will be uploaded to: {}'.format(output_location))" 490 | ] 491 | }, 492 | { 493 | "cell_type": "markdown", 494 | "metadata": { 495 | "pycharm": { 496 | "name": "#%% md\n" 497 | } 498 | }, 499 | "source": [ 500 | "We can now train using SageMaker's built-in XGBoost algorithm. To specify the XGBoost algorithm, we use a utility function to obtain its URI. A complete list of built-in algorithms is found here: https://docs.aws.amazon.com/sagemaker/latest/dg/algos.html" 501 | ] 502 | }, 503 | { 504 | "cell_type": "code", 505 | "execution_count": null, 506 | "metadata": { 507 | "pycharm": { 508 | "name": "#%%\n" 509 | } 510 | }, 511 | "outputs": [], 512 | "source": [ 513 | "import sagemaker\n", 514 | "\n", 515 | "# Get the XGBoost image URI\n", 516 | "xgboost_image_uri = sagemaker.image_uris.retrieve(\n", 517 | " framework=\"xgboost\",\n", 518 | " region=boto3.Session().region_name,\n", 519 | " version=\"0.90-2\",\n", 520 | " py_version=\"py3\",\n", 521 | ")\n" 522 | ] 523 | }, 524 | { 525 | "cell_type": "markdown", 526 | "metadata": { 527 | "pycharm": { 528 | "name": "#%% md\n" 529 | } 530 | }, 531 | "source": [ 532 | "SageMaker abstracts training via Estimators. We can pass the classifier and parameters along with hyperparameters to the estimator, and fit the estimator to the data in S3. An important parameter here is `scale_pos_weight` which scales the weights of the positive vs. negative class examples. This is crucial to do in an imbalanced dataset like the one we are using here, otherwise the majority class would dominate the learning." 
533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": null, 538 | "metadata": { 539 | "pycharm": { 540 | "name": "#%%\n" 541 | }, 542 | "scrolled": true 543 | }, 544 | "outputs": [], 545 | "source": [ 546 | "from math import sqrt\n", 547 | "\n", 548 | "# Because the data set is so highly skewed, we set the scale position weight conservatively,\n", 549 | "# as sqrt(num_nonfraud/num_fraud).\n", 550 | "# Other recommendations for the scale_pos_weight are setting it to (num_nonfraud/num_fraud).\n", 551 | "scale_pos_weight = sqrt(np.count_nonzero(y_train==0)/np.count_nonzero(y_train))\n", 552 | "hyperparams = {\n", 553 | " \"max_depth\":5,\n", 554 | " \"subsample\":0.8,\n", 555 | " \"num_round\":100,\n", 556 | " \"eta\":0.2,\n", 557 | " \"gamma\":4,\n", 558 | " \"min_child_weight\":6,\n", 559 | " \"silent\":0,\n", 560 | " \"objective\":'binary:logistic',\n", 561 | " \"eval_metric\":'auc',\n", 562 | " \"scale_pos_weight\": scale_pos_weight\n", 563 | "}" 564 | ] 565 | }, 566 | { 567 | "cell_type": "markdown", 568 | "metadata": {}, 569 | "source": [ 570 | "Let us explain the hyper-parameters used above. The one that's very relevant for learning from skewed data is `scale_pos_weight`. This is a ratio that weighs the examples of the positive class (fraud) against the negative class (legitimate). Commonly this is set to `(num_nonfraud/num_fraud)`, but our data is extremely skewed so we will set it to `sqrt(num_nonfraud/num_fraud)`. For the data in this example, this would be `sqrt(284,807/492)` which would give our fraud examples a weight of ~24.\n", 571 | "\n", 572 | "The rest of the hyper-parameters are as follows:\n", 573 | "\n", 574 | "* `max_depth`: This is the maximum depth of the trees that will be built for our ensemble. A max depth of 5 will give us trees with up to 32 leaves. Note that tree size grows exponentially when increasing this parameter (`num_leaves=2^max_depth`), so a max depth of 10 would give us trees with 1024 leaves, which are likely to overfit.\n", 575 | "* `subsample`: The subsample ratio that we use to select a subset of the complete data to train each tree in the ensemble. With a value of 0.8, each tree is trained on a random sample containing 80% of the complete data. This is used to prevent overfitting.\n", 576 | "* `num_round`: This is the size of the ensemble. We will train for 100 \"rounds\", each training round adding a new tree to the ensemble.\n", 577 | "* `eta`: This is the step size shrinkage applied at each update. This value will shrink the weights of new features to prevent overfitting.\n", 578 | "* `gamma`: This is the minimum loss reduction to reach before splitting a leaf. Splitting a leaf can sometimes have a small benefit, and splitting such leaves can lead to overfitting. By setting `gamma` to values larger than zero, we ensure that there should be at least some non-negligible amount of accuracy gain before splitting a leaf.\n", 579 | "* `min_child_weight`: This parameter has a similar effect to gamma, setting it to higher values means we'll wait until enough gain is possible before splitting a leaf.\n", 580 | "* `objective`: We are doing binary classification, so we use a logistic loss objective.\n", 581 | "* `eval_metric`: Having a good evaluation metric is crucial when dealing with imbalanced data (see discussion below). We use AUC here."
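,
"\n",
"\n",
"To make that last point concrete with the numbers quoted earlier: a classifier that labels every transaction as non-fraud would be correct on roughly 284,315 of the 284,807 examples (about 99.8% plain accuracy) while catching no fraud at all, which is why we rely on AUC during training and on class-aware metrics such as balanced accuracy and Cohen's Kappa during evaluation."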
582 | ] 583 | }, 584 | { 585 | "cell_type": "code", 586 | "execution_count": null, 587 | "metadata": {}, 588 | "outputs": [], 589 | "source": [ 590 | "from sagemaker.estimator import Estimator\n", 591 | "\n", 592 | "clf = Estimator(\n", 593 | " image_uri=xgboost_image_uri,\n", 594 | " role=config.SAGEMAKER_IAM_ROLE,\n", 595 | " instance_count=1,\n", 596 | " instance_type=instance_type,\n", 597 | " hyperparameters=hyperparams,\n", 598 | " output_path=output_location,\n", 599 | " sagemaker_session=session,\n", 600 | " base_job_name=\"{}-xgb\".format(config.SOLUTION_PREFIX)\n", 601 | ")" 602 | ] 603 | }, 604 | { 605 | "cell_type": "markdown", 606 | "metadata": {}, 607 | "source": [ 608 | "We can now fit our supervised training model, the call to fit below should take around 5 minutes to complete." 609 | ] 610 | }, 611 | { 612 | "cell_type": "code", 613 | "execution_count": null, 614 | "metadata": { 615 | "pycharm": { 616 | "name": "#%%\n" 617 | } 618 | }, 619 | "outputs": [], 620 | "source": [ 621 | "clf.fit({'train': s3_train_data})" 622 | ] 623 | }, 624 | { 625 | "cell_type": "markdown", 626 | "metadata": { 627 | "pycharm": { 628 | "name": "#%% md\n" 629 | } 630 | }, 631 | "source": [ 632 | "### Host Classifier" 633 | ] 634 | }, 635 | { 636 | "cell_type": "markdown", 637 | "metadata": { 638 | "pycharm": { 639 | "name": "#%% md\n" 640 | } 641 | }, 642 | "source": [ 643 | "Now we deploy the estimator to and endpoint. As before progress will be indicated by `-`, and the deployment should be done after 10 minutes." 644 | ] 645 | }, 646 | { 647 | "cell_type": "code", 648 | "execution_count": null, 649 | "metadata": { 650 | "pycharm": { 651 | "name": "#%%\n" 652 | } 653 | }, 654 | "outputs": [], 655 | "source": [ 656 | "from sagemaker.serializers import CSVSerializer\n", 657 | "\n", 658 | "predictor = clf.deploy(initial_instance_count=1,\n", 659 | " model_name=\"{}-xgb\".format(config.SOLUTION_PREFIX),\n", 660 | " endpoint_name=\"{}-xgb\".format(config.SOLUTION_PREFIX),\n", 661 | " instance_type=instance_type,\n", 662 | " serializer=CSVSerializer(),\n", 663 | " deserializer=None)" 664 | ] 665 | }, 666 | { 667 | "cell_type": "markdown", 668 | "metadata": { 669 | "pycharm": { 670 | "name": "#%% md\n" 671 | } 672 | }, 673 | "source": [ 674 | "## Evaluation" 675 | ] 676 | }, 677 | { 678 | "cell_type": "markdown", 679 | "metadata": {}, 680 | "source": [ 681 | "Once we have trained the model we can use it to make predictions for the test set." 
682 | ] 683 | }, 684 | { 685 | "cell_type": "code", 686 | "execution_count": null, 687 | "metadata": { 688 | "pycharm": { 689 | "name": "#%%\n" 690 | } 691 | }, 692 | "outputs": [], 693 | "source": [ 694 | "# Because we have a large test set, we call predict on smaller batches\n", 695 | "def predict(current_predictor, data, rows=500):\n", 696 | " split_array = np.array_split(data, int(data.shape[0] / float(rows) + 1))\n", 697 | " predictions = ''\n", 698 | " for array in split_array:\n", 699 | " predictions = ','.join([predictions, current_predictor.predict(array).decode('utf-8')])\n", 700 | "\n", 701 | " return np.fromstring(predictions[1:], sep=',')" 702 | ] 703 | }, 704 | { 705 | "cell_type": "code", 706 | "execution_count": null, 707 | "metadata": {}, 708 | "outputs": [], 709 | "source": [ 710 | "raw_preds = predict(predictor, X_test)" 711 | ] 712 | }, 713 | { 714 | "cell_type": "markdown", 715 | "metadata": { 716 | "pycharm": { 717 | "name": "#%% md\n" 718 | } 719 | }, 720 | "source": [ 721 | "We will use a few measures from the scikit-learn package to evaluate the performance of our model. When dealing with an imbalanced dataset, we need to choose metrics that take into account the frequency of each class in the data.\n", 722 | "\n", 723 | "Two such metrics are the [balanced accuracy score](https://scikit-learn.org/stable/modules/model_evaluation.html#balanced-accuracy-score), and [Cohen's Kappa](https://scikit-learn.org/stable/modules/model_evaluation.html#cohen-s-kappa)." 724 | ] 725 | }, 726 | { 727 | "cell_type": "code", 728 | "execution_count": null, 729 | "metadata": {}, 730 | "outputs": [], 731 | "source": [ 732 | "from sklearn.metrics import balanced_accuracy_score, cohen_kappa_score\n", 733 | "\n", 734 | "# scikit-learn expects 0/1 predictions, so we threshold our raw predictions\n", 735 | "y_preds = np.where(raw_preds > 0.5, 1, 0)\n", 736 | "print(\"Balanced accuracy = {}\".format(balanced_accuracy_score(y_test, y_preds)))\n", 737 | "print(\"Cohen's Kappa = {}\".format(cohen_kappa_score(y_test, y_preds)))" 738 | ] 739 | }, 740 | { 741 | "cell_type": "markdown", 742 | "metadata": {}, 743 | "source": [ 744 | "We can already see that our model performs very well in terms of both metrics, Cohen's Kappa scores above 0.8 are generally very favorable." 745 | ] 746 | }, 747 | { 748 | "cell_type": "markdown", 749 | "metadata": {}, 750 | "source": [ 751 | "Apart from single-value metrics, it's also useful to look at metrics that indicate performance per class. A confusion matrix, and per-class precision, recall and f1-score can also provide more information about the model's performance." 
752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": null, 757 | "metadata": {}, 758 | "outputs": [], 759 | "source": [ 760 | "import matplotlib.pyplot as plt\n", 761 | "import seaborn as sns\n", 762 | "from sklearn.metrics import confusion_matrix\n", 763 | "\n", 764 | "def plot_confusion_matrix(y_true, y_predicted):\n", 765 | "\n", 766 | " cm = confusion_matrix(y_true, y_predicted)\n", 767 | " # Get the per-class normalized value for each cell\n", 768 | " cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n", 769 | " \n", 770 | " # We color each cell according to its normalized value, annotate with exact counts.\n", 771 | " ax = sns.heatmap(cm_norm, annot=cm, fmt=\"d\")\n", 772 | " ax.set(xticklabels=[\"non-fraud\", \"fraud\"], yticklabels=[\"non-fraud\", \"fraud\"])\n", 773 | " ax.set_ylim([0,2])\n", 774 | " plt.title('Confusion Matrix')\n", 775 | " plt.ylabel('Real Classes')\n", 776 | " plt.xlabel('Predicted Classes')\n", 777 | " plt.show()" 778 | ] 779 | }, 780 | { 781 | "cell_type": "code", 782 | "execution_count": null, 783 | "metadata": { 784 | "pycharm": { 785 | "name": "#%%\n" 786 | } 787 | }, 788 | "outputs": [], 789 | "source": [ 790 | "plot_confusion_matrix(y_test, y_preds)" 791 | ] 792 | }, 793 | { 794 | "cell_type": "code", 795 | "execution_count": null, 796 | "metadata": { 797 | "pycharm": { 798 | "name": "#%%\n" 799 | } 800 | }, 801 | "outputs": [], 802 | "source": [ 803 | "from sklearn.metrics import classification_report\n", 804 | "\n", 805 | "print(classification_report(\n", 806 | " y_test, y_preds, target_names=['non-fraud', 'fraud']))" 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": { 812 | "pycharm": { 813 | "name": "#%% md\n" 814 | } 815 | }, 816 | "source": [ 817 | "### Keep sending test traffic to the endpoint via lambda" 818 | ] 819 | }, 820 | { 821 | "cell_type": "markdown", 822 | "metadata": {}, 823 | "source": [ 824 | "**NOTE:**\n", 825 | "If you're running the solution within Studio, uncomment the next cell to set `force_traffic_generation` to True, in order to run a simulation that sends traffic to the REST API deployed with this solution. 
**Before you do this you will need to attach the execute-api:Invoke action to the role that’s assumed by this notebook.** Here’s what the permission statement block you should add looks like: \n", 826 | "\n", 827 | "```json\n", 828 | "{\n", 829 | " \"Action\": [\n", 830 | " \"execute-api:Invoke\"\n", 831 | " ],\n", 832 | " \"Resource\": [\n", 833 | " \"arn:aws:execute-api:*:*:*/*/POST/*\"\n", 834 | " ],\n", 835 | " \"Effect\": \"Allow\"\n", 836 | "}\n", 837 | "```\n", 838 | "\n", 839 | "See the APIGateway IAM documentation page for more info: https://docs.aws.amazon.com/service-authorization/latest/reference/list_amazonapigateway.html on managing execute-api permissions.\n" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": null, 845 | "metadata": {}, 846 | "outputs": [], 847 | "source": [ 848 | "force_traffic_generation = False\n", 849 | "#force_traffic_generation = True # If in Studio, set to True after you've updated you permissions" 850 | ] 851 | }, 852 | { 853 | "cell_type": "markdown", 854 | "metadata": {}, 855 | "source": [ 856 | "We can now show how we could use both of these models in a production system, using HTTP requests to an AWS Lambda function that invokes both the unsupervised and the supervised SageMaker endpoints.\n", 857 | "\n", 858 | "We create a background thread that will constantly create HTTP requests to invoke the Lambda, using our test data as input. See the included `generate_endpoint_traffic.py` file to see how that is done.\n", 859 | "The output will be logged to an S3 bucket through Kinesis, and you can also observe it in the Lambda function's CloudWatch logs." 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": null, 865 | "metadata": {}, 866 | "outputs": [], 867 | "source": [ 868 | "pip install aws_requests_auth" 869 | ] 870 | }, 871 | { 872 | "cell_type": "code", 873 | "execution_count": null, 874 | "metadata": {}, 875 | "outputs": [], 876 | "source": [ 877 | "from threading import Thread\n", 878 | "from package.generate_endpoint_traffic import generate_traffic\n", 879 | "\n", 880 | "if config.SAGEMAKER_MODE == \"NotebookInstance\" or force_traffic_generation:\n", 881 | " thread = Thread(target = generate_traffic, args=[np.copy(X_test)])\n", 882 | " thread.start()\n" 883 | ] 884 | }, 885 | { 886 | "cell_type": "markdown", 887 | "metadata": {}, 888 | "source": [ 889 | "After a few minutes you can monitor the function invocations in the following URL:" 890 | ] 891 | }, 892 | { 893 | "cell_type": "code", 894 | "execution_count": null, 895 | "metadata": {}, 896 | "outputs": [], 897 | "source": [ 898 | "from IPython.display import display\n", 899 | "from IPython.display import Markdown as md\n", 900 | "if config.SAGEMAKER_MODE == \"NotebookInstance\" or force_traffic_generation:\n", 901 | " print(\"hello\")\n", 902 | " display(md(f\"[Link to Lambda Monitoring](https://{config.AWS_REGION}.console.aws.amazon.com/lambda/home?region={config.AWS_REGION}#/functions/{config.SOLUTION_PREFIX}-event-processor?tab=monitoring)\"))" 903 | ] 904 | }, 905 | { 906 | "cell_type": "markdown", 907 | "metadata": {}, 908 | "source": [ 909 | "### SMOTE" 910 | ] 911 | }, 912 | { 913 | "cell_type": "markdown", 914 | "metadata": {}, 915 | "source": [ 916 | "Now that we have a baseline model using XGBoost, we can try to see if sampling techniques that are designed specifically for imbalanced problems can improve the performance of the model.\n", 917 | "\n", 918 | "For that purpose we will be using the 
[imbalanced-learn](https://imbalanced-learn.readthedocs.io/en/stable/index.html) package that works well with scikit-learn. We have pre-installed the package for this kernel, but if you need it for a different Jupyter kernel you can install it by running `pip install --upgrade imbalanced-learn` within the conda environment you want to use.\n", 919 | "\n", 920 | "We will be using [Synthetic Minority Over-sampling](https://arxiv.org/abs/1106.1813) (SMOTE), which oversamples the minority class by interpolating new data points between existing ones." 921 | ] 922 | }, 923 | { 924 | "cell_type": "code", 925 | "execution_count": null, 926 | "metadata": {}, 927 | "outputs": [], 928 | "source": [ 929 | "import sys\n", 930 | "!{sys.executable} -m pip install imbalanced-learn\n", 931 | "\n", 932 | "from imblearn.over_sampling import SMOTE\n", 933 | "\n", 934 | "smote = SMOTE(random_state=42)\n", 935 | "X_smote, y_smote = smote.fit_resample(X_train, y_train)" 936 | ] 937 | }, 938 | { 939 | "cell_type": "markdown", 940 | "metadata": { 941 | "pycharm": { 942 | "name": "#%% md\n" 943 | } 944 | }, 945 | "source": [ 946 | "We can see that SMOTE has now balanced the two classes:" 947 | ] 948 | }, 949 | { 950 | "cell_type": "code", 951 | "execution_count": null, 952 | "metadata": {}, 953 | "outputs": [], 954 | "source": [ 955 | "from collections import Counter\n", 956 | "print(sorted(Counter(y_smote).items()))" 957 | ] 958 | }, 959 | { 960 | "cell_type": "markdown", 961 | "metadata": {}, 962 | "source": [ 963 | "We note that this is a case of extreme oversampling of the minority class: we went from ~0.17% to 50%. An alternative would be to use a smaller resampling ratio, such as having one minority class sample for every `sqrt(non_fraud/fraud)` majority samples, or using more advanced resampling techniques. See the [comparison](https://imbalanced-learn.readthedocs.io/en/stable/auto_examples/over-sampling/plot_comparison_over_sampling.html#sphx-glr-auto-examples-over-sampling-plot-comparison-over-sampling-py) provided by imbalanced-learn for more over-sampling options." 965 | ] 966 | }, 967 | { 968 | "cell_type": "markdown", 969 | "metadata": { 970 | "pycharm": { 971 | "name": "#%% md\n" 972 | } 973 | }, 974 | "source": [ 975 | "In our case we'll use the SMOTE dataset we just created and upload it to S3 for training."
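Should you want the milder resampling ratio mentioned above instead of fully balancing the classes, imbalanced-learn lets you pass a target minority-to-majority ratio through `sampling_strategy`. The following is a minimal sketch, not part of the solution's notebook, assuming the `X_train`/`y_train` arrays created in the earlier data-preparation cells:

```python
# Hedged sketch: partial SMOTE oversampling instead of a full 50/50 balance.
# Assumes X_train / y_train from the earlier training-data cells.
import numpy as np
from collections import Counter
from imblearn.over_sampling import SMOTE

counts = Counter(y_train)
n_non_fraud, n_fraud = counts[0], counts[1]

# One minority sample per sqrt(non_fraud/fraud) majority samples, i.e. a
# minority-to-majority ratio of 1 / sqrt(non_fraud / fraud) after resampling.
ratio = 1.0 / np.sqrt(n_non_fraud / n_fraud)

smote_partial = SMOTE(sampling_strategy=ratio, random_state=42)
X_partial, y_partial = smote_partial.fit_resample(X_train, y_train)
print(sorted(Counter(y_partial).items()))
```

Keeping the classes only partially balanced tends to introduce fewer synthetic points in regions where the two classes overlap, which is one way to limit the false positives discussed later in this notebook.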
976 | ] 977 | }, 978 | { 979 | "cell_type": "code", 980 | "execution_count": null, 981 | "metadata": {}, 982 | "outputs": [], 983 | "source": [ 984 | "smote_buf = io.BytesIO()\n", 985 | "\n", 986 | "# Dump the SMOTE data into a buffer\n", 987 | "sklearn.datasets.dump_svmlight_file(X_smote, y_smote, smote_buf)\n", 988 | "smote_buf.seek(0);\n", 989 | "\n", 990 | "# Upload from the buffer to S3\n", 991 | "key = 'fraud-dataset-smote'\n", 992 | "subdir = 'smote'\n", 993 | "boto3.resource('s3', region_name=config.AWS_REGION).Bucket(bucket).Object(os.path.join(prefix, 'train', subdir, key)).upload_fileobj(smote_buf)\n", 994 | "\n", 995 | "s3_smote_train_data = 's3://{}/{}/train/{}/{}'.format(bucket, prefix, subdir, key)\n", 996 | "print('Uploaded training data location: {}'.format(s3_smote_train_data))\n", 997 | "\n", 998 | "smote_output_location = 's3://{}/{}/smote-output'.format(bucket, prefix)\n", 999 | "print('Training artifacts will be uploaded to: {}'.format(smote_output_location))" 1000 | ] 1001 | }, 1002 | { 1003 | "cell_type": "code", 1004 | "execution_count": null, 1005 | "metadata": {}, 1006 | "outputs": [], 1007 | "source": [ 1008 | "# No need to scale weights after SMOTE resampling, so we remove that parameter\n", 1009 | "hyperparams.pop(\"scale_pos_weight\", None)\n", 1010 | "smote_xgb = sagemaker.estimator.Estimator(xgboost_image_uri,\n", 1011 | " role=config.SAGEMAKER_IAM_ROLE,\n", 1012 | " hyperparameters=hyperparams,\n", 1013 | " instance_count=1, \n", 1014 | " instance_type=instance_type,\n", 1015 | " output_path=smote_output_location,\n", 1016 | " sagemaker_session=session,\n", 1017 | " base_job_name=\"{}-xgb-smote\".format(config.SOLUTION_PREFIX))" 1018 | ] 1019 | }, 1020 | { 1021 | "cell_type": "markdown", 1022 | "metadata": { 1023 | "pycharm": { 1024 | "name": "#%% md\n" 1025 | } 1026 | }, 1027 | "source": [ 1028 | "We are now ready to fit the model, which should take around 5 minutes to complete." 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "code", 1033 | "execution_count": null, 1034 | "metadata": {}, 1035 | "outputs": [], 1036 | "source": [ 1037 | "smote_xgb.fit({'train': s3_smote_train_data})" 1038 | ] 1039 | }, 1040 | { 1041 | "cell_type": "markdown", 1042 | "metadata": {}, 1043 | "source": [ 1044 | "After fitting the model we can check its performance to compare it against the base XGBoost model. The deployment will take around 10 minutes." 
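The `deploy()` call in the cell that follows blocks until the endpoint is ready, but if you want to check the endpoint's status programmatically (for example from another session after the deployment has started), a minimal sketch using the boto3 SageMaker waiter is shown below. It assumes the `-xgb-smote` endpoint name used in the deployment cell.

```python
# Hedged sketch: wait until the SMOTE endpoint created below is InService.
# Assumes the "{SOLUTION_PREFIX}-xgb-smote" endpoint name used in the deploy cell.
import boto3
from package import config

sm_client = boto3.client('sagemaker', region_name=config.AWS_REGION)
endpoint_name = "{}-xgb-smote".format(config.SOLUTION_PREFIX)

waiter = sm_client.get_waiter('endpoint_in_service')
waiter.wait(EndpointName=endpoint_name)
print(sm_client.describe_endpoint(EndpointName=endpoint_name)['EndpointStatus'])
```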
1045 | ] 1046 | }, 1047 | { 1048 | "cell_type": "code", 1049 | "execution_count": null, 1050 | "metadata": { 1051 | "pycharm": { 1052 | "name": "#%%\n" 1053 | } 1054 | }, 1055 | "outputs": [], 1056 | "source": [ 1057 | "from sagemaker.serializers import CSVSerializer\n", 1058 | "from sagemaker.deserializers import CSVDeserializer\n", 1059 | "\n", 1060 | "smote_predictor = smote_xgb.deploy(initial_instance_count=1,\n", 1061 | " model_name=\"{}-xgb-smote\".format(config.SOLUTION_PREFIX),\n", 1062 | " endpoint_name=\"{}-xgb-smote\".format(config.SOLUTION_PREFIX),\n", 1063 | " instance_type=instance_type)\n", 1064 | "\n", 1065 | "# Specify input and output formats.\n", 1066 | "smote_predictor.content_type = 'text/csv'\n", 1067 | "csv_serializer = CSVSerializer()\n", 1068 | "smote_predictor.serializer = csv_serializer\n", 1069 | "\n", 1070 | "# Set the deserializer to handle the response from the inference endpoint\n", 1071 | "#csv_deserializer = CSVDeserializer()\n", 1072 | "#smote_predictor.deserializer = csv_deserializer" 1073 | ] 1074 | }, 1075 | { 1076 | "cell_type": "code", 1077 | "execution_count": null, 1078 | "metadata": {}, 1079 | "outputs": [], 1080 | "source": [ 1081 | "smote_raw_preds = predict(smote_predictor, X_test)\n", 1082 | "smote_preds = np.where(smote_raw_preds > 0.5, 1, 0)" 1083 | ] 1084 | }, 1085 | { 1086 | "cell_type": "code", 1087 | "execution_count": null, 1088 | "metadata": { 1089 | "pycharm": { 1090 | "name": "#%%\n" 1091 | } 1092 | }, 1093 | "outputs": [], 1094 | "source": [ 1095 | "print(\"Balanced accuracy = {}\".format(balanced_accuracy_score(y_test, smote_preds)))\n", 1096 | "print(\"Cohen's Kappa = {}\".format(cohen_kappa_score(y_test, smote_preds)))" 1097 | ] 1098 | }, 1099 | { 1100 | "cell_type": "code", 1101 | "execution_count": null, 1102 | "metadata": { 1103 | "pycharm": { 1104 | "name": "#%%\n" 1105 | } 1106 | }, 1107 | "outputs": [], 1108 | "source": [ 1109 | "plot_confusion_matrix(y_test, smote_preds)" 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "code", 1114 | "execution_count": null, 1115 | "metadata": {}, 1116 | "outputs": [], 1117 | "source": [ 1118 | "print(classification_report(\n", 1119 | " y_test, smote_preds, target_names=['non-fraud', 'fraud']))" 1120 | ] 1121 | }, 1122 | { 1123 | "cell_type": "markdown", 1124 | "metadata": {}, 1125 | "source": [ 1126 | "Due to the randomness of XGBoost your results may vary, but overall, you should see a large increase in non-fraud cases being classified as fraud (false positives). The reason this happens is because SMOTE has oversampled the fraud class so much that it's increased its overlap in feature space with the non-fraud cases.\n", 1127 | "Since Cohen's Kappa gives more weight to false positives than balanced accuracy does, the metric drops significantly, as does the precision and F1 score for fraud cases. However, we can bring a balance between the metrics again by adjusting our classification threshold." 1128 | ] 1129 | }, 1130 | { 1131 | "cell_type": "markdown", 1132 | "metadata": {}, 1133 | "source": [ 1134 | "So far we've been using 0.5 as the threshold between labeling a point as fraud or not. We can try different thresholds to see if they affect the result of the classification. To evaluate we'll use the balanced accuracy and Cohen's Kappa metrics." 
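The next cell sweeps a fixed grid of thresholds. As a complementary, hedged sketch (not part of the original notebook), you can also derive a threshold directly from the precision-recall curve, for example the one that maximizes F1 for the fraud class. This assumes `y_test` and the raw scores `smote_raw_preds` computed above.

```python
# Hedged sketch: pick a threshold from the precision-recall curve instead of a grid.
import numpy as np
from sklearn.metrics import precision_recall_curve

precision, recall, thresholds = precision_recall_curve(y_test, smote_raw_preds)
# precision/recall have one more entry than thresholds, so drop the last point.
f1 = 2 * precision[:-1] * recall[:-1] / np.maximum(precision[:-1] + recall[:-1], 1e-12)
best = np.argmax(f1)
print("Threshold maximizing F1: {:.3f} (precision={:.3f}, recall={:.3f})".format(
    thresholds[best], precision[best], recall[best]))
```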
1135 | ] 1136 | }, 1137 | { 1138 | "cell_type": "code", 1139 | "execution_count": null, 1140 | "metadata": { 1141 | "pycharm": { 1142 | "name": "#%%\n" 1143 | } 1144 | }, 1145 | "outputs": [], 1146 | "source": [ 1147 | "for thres in np.linspace(0.1, 0.9, num=9):\n", 1148 | " smote_thres_preds = np.where(smote_raw_preds > thres, 1, 0)\n", 1149 | " print(\"Threshold: {:.1f}\".format(thres))\n", 1150 | " print(\"Balanced accuracy = {:.3f}\".format(balanced_accuracy_score(y_test, smote_thres_preds)))\n", 1151 | " print(\"Cohen's Kappa = {:.3f}\\n\".format(cohen_kappa_score(y_test, smote_thres_preds)))" 1152 | ] 1153 | }, 1154 | { 1155 | "cell_type": "markdown", 1156 | "metadata": {}, 1157 | "source": [ 1158 | "We see that Cohen's Kappa keeps increasing along with the threshold, without a significant loss in balanced accuracy. This adds a useful knob to our model: We can keep a low threshold if we care more about not missing any fraudulent cases, or we can increase the threshold to try to minimize the number of false positives." 1159 | ] 1160 | }, 1161 | { 1162 | "cell_type": "markdown", 1163 | "metadata": {}, 1164 | "source": [ 1165 | "## Clean up\n", 1166 | "\n", 1167 | "We will leave the unsupervised and base XGBoost endpoints running at the end of this notebook so we can handle incoming event streams using the Lambda function. The solution will automatically clean up the endpoints when deleted, however, don't forget to ensure the prediction endpoints are deleted when you're done. You can do that at the Amazon SageMaker console in the Endpoints page. Or you can run `predictor_name.delete_endpoint()` here." 1168 | ] 1169 | }, 1170 | { 1171 | "cell_type": "code", 1172 | "execution_count": null, 1173 | "metadata": {}, 1174 | "outputs": [], 1175 | "source": [ 1176 | "# Uncomment to clean up endpoints\n", 1177 | "rcf_predictor.delete_model()\n", 1178 | "rcf_predictor.delete_endpoint()\n", 1179 | "predictor.delete_model()\n", 1180 | "predictor.delete_endpoint()\n", 1181 | "smote_predictor.delete_model()\n", 1182 | "smote_predictor.delete_endpoint()\n", 1183 | "sm_client = boto3.client('sagemaker', region_name=config.AWS_REGION)\n", 1184 | "waiter = sm_client.get_waiter('endpoint_deleted')\n", 1185 | "waiter.wait(EndpointName=\"{}-xgb-smote\".format(config.SOLUTION_PREFIX))\n", 1186 | "waiter.wait(EndpointName=\"{}-xgb\".format(config.SOLUTION_PREFIX))\n", 1187 | "waiter.wait(EndpointName=\"{}-rcf\".format(config.SOLUTION_PREFIX))\n" 1188 | ] 1189 | }, 1190 | { 1191 | "cell_type": "markdown", 1192 | "metadata": {}, 1193 | "source": [ 1194 | "\n", 1195 | "## Data Acknowledgements\n", 1196 | "\n", 1197 | "The dataset used to demonstrated the fraud detection solution has been collected and analysed during a research collaboration of Worldline and the Machine Learning Group (http://mlg.ulb.ac.be) of ULB (Université Libre de Bruxelles) on big data mining and fraud detection. More details on current and past projects on related topics are available on https://www.researchgate.net/project/Fraud-detection-5 and the page of the [DefeatFraud](https://mlg.ulb.ac.be/wordpress/portfolio_page/defeatfraud-assessment-and-validation-of-deep-feature-engineering-and-learning-solutions-for-fraud-detection/) project\n", 1198 | "We cite the following works:\n", 1199 | "* Andrea Dal Pozzolo, Olivier Caelen, Reid A. Johnson and Gianluca Bontempi. Calibrating Probability with Undersampling for Unbalanced Classification. 
In Symposium on Computational Intelligence and Data Mining (CIDM), IEEE, 2015\n", 1200 | "* Dal Pozzolo, Andrea; Caelen, Olivier; Le Borgne, Yann-Ael; Waterschoot, Serge; Bontempi, Gianluca. Learned lessons in credit card fraud detection from a practitioner perspective, Expert systems with applications,41,10,4915-4928,2014, Pergamon\n", 1201 | "* Dal Pozzolo, Andrea; Boracchi, Giacomo; Caelen, Olivier; Alippi, Cesare; Bontempi, Gianluca. Credit card fraud detection: a realistic modeling and a novel learning strategy, IEEE transactions on neural networks and learning systems,29,8,3784-3797,2018,IEEE\n", 1202 | "* Dal Pozzolo, Andrea Adaptive Machine learning for credit card fraud detection ULB MLG PhD thesis (supervised by G. Bontempi)\n", 1203 | "* Carcillo, Fabrizio; Dal Pozzolo, Andrea; Le Borgne, Yann-Aël; Caelen, Olivier; Mazzer, Yannis; Bontempi, Gianluca. Scarff: a scalable framework for streaming credit card fraud detection with Spark, Information fusion,41, 182-194,2018,Elsevier\n", 1204 | "* Carcillo, Fabrizio; Le Borgne, Yann-Aël; Caelen, Olivier; Bontempi, Gianluca. Streaming active learning strategies for real-life credit card fraud detection: assessment and visualization, International Journal of Data Science and Analytics, 5,4,285-300,2018,Springer International Publishing" 1205 | ] 1206 | } 1207 | ], 1208 | "metadata": { 1209 | "kernelspec": { 1210 | "display_name": "conda_python3", 1211 | "language": "python", 1212 | "name": "conda_python3" 1213 | }, 1214 | "language_info": { 1215 | "codemirror_mode": { 1216 | "name": "ipython", 1217 | "version": 3 1218 | }, 1219 | "file_extension": ".py", 1220 | "mimetype": "text/x-python", 1221 | "name": "python", 1222 | "nbconvert_exporter": "python", 1223 | "pygments_lexer": "ipython3", 1224 | "version": "3.10.14" 1225 | } 1226 | }, 1227 | "nbformat": 4, 1228 | "nbformat_minor": 2 1229 | } 1230 | -------------------------------------------------------------------------------- /source/notebooks/setup.py: -------------------------------------------------------------------------------- 1 | from distutils.core import setup 2 | 3 | 4 | setup( 5 | name='package', 6 | version='1.0', 7 | description="A package to organize the solution's code.", 8 | package_dir={'': 'src'}, 9 | packages=['package'], 10 | ) 11 | -------------------------------------------------------------------------------- /source/notebooks/src/package/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/fraud-detection-using-machine-learning/c4fe32b5f04dedafaa4b4e6613515fd714031969/source/notebooks/src/package/__init__.py -------------------------------------------------------------------------------- /source/notebooks/src/package/config.py: -------------------------------------------------------------------------------- 1 | import json 2 | from pathlib import Path 3 | 4 | from package import utils 5 | 6 | current_folder = utils.get_current_folder(globals()) 7 | cfn_stack_outputs_filepath = Path(current_folder, '../../../stack_outputs.json').resolve() 8 | assert cfn_stack_outputs_filepath.exists(), "Could not find stack_outputs.json file at {}".format( 9 | str(cfn_stack_outputs_filepath)) 10 | 11 | with open(cfn_stack_outputs_filepath) as f: 12 | cfn_stack_outputs = json.load(f) 13 | 14 | STACK_NAME = cfn_stack_outputs['FraudStackName'] 15 | SOLUTION_PREFIX = cfn_stack_outputs['SolutionPrefix'] 16 | AWS_ACCOUNT_ID = cfn_stack_outputs['AwsAccountId'] 17 | AWS_REGION = 
cfn_stack_outputs['AwsRegion'] 18 | SAGEMAKER_IAM_ROLE = cfn_stack_outputs['IamRole'] 19 | MODEL_DATA_S3_BUCKET = cfn_stack_outputs['ModelDataBucket'] 20 | SOLUTIONS_S3_BUCKET = cfn_stack_outputs['SolutionsS3Bucket'] 21 | REST_API_GATEWAY = cfn_stack_outputs['RESTAPIGateway'] 22 | SOLUTION_NAME = cfn_stack_outputs['SolutionName'] 23 | TEST_OUTPUTS_S3_BUCKET = cfn_stack_outputs.get('TestOutputsS3Bucket', "") 24 | SAGEMAKER_MODE = cfn_stack_outputs['SagemakerMode'] 25 | -------------------------------------------------------------------------------- /source/notebooks/src/package/generate_endpoint_traffic.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handles generating traffic and creating the ElasticSearch index and dashboard. 3 | """ 4 | import time 5 | import re 6 | import datetime 7 | import random 8 | 9 | import requests 10 | from aws_requests_auth.boto_utils import BotoAWSRequestsAuth 11 | import numpy as np 12 | from scipy.stats import poisson 13 | 14 | from package import config 15 | 16 | def generate_metadata(): 17 | """ 18 | Generates medatadata for the HTTP request: a randomized source and a timestamp. 19 | """ 20 | millisecond_regex = r'\.\d+' 21 | timestamp = re.sub(millisecond_regex, '', str(datetime.datetime.now())) 22 | source = random.choice(['Mobile', 'Web', 'Store']) 23 | result = [timestamp, 'random_id', source] 24 | 25 | return result 26 | 27 | 28 | def get_data_payload(test_array): 29 | return {'data':','.join(map(str, test_array)), 30 | 'metadata': generate_metadata()} 31 | 32 | 33 | def generate_traffic(X_test): 34 | """ 35 | Using a feature array as input 36 | """ 37 | while True: 38 | # NB: The shuffle will mutate the X_test array in-place, so ensure 39 | # you're working with a copy if you intend to use the calling argument 40 | # array elsewhere. 
41 | np.random.shuffle(X_test) 42 | for example in X_test: 43 | data_payload = get_data_payload(example) 44 | invoke_endpoint(data_payload) 45 | # We invoke the function according to a shifted Poisson distribution 46 | # to simulate data arriving at random intervals 47 | time.sleep(poisson.rvs(1, size=1)[0] + np.random.rand() / 100) 48 | 49 | 50 | def invoke_endpoint(payload): 51 | """ 52 | We get credentials from the IAM role of the notebook instance, 53 | then use them to create a signed request to the API Gateway 54 | """ 55 | auth = BotoAWSRequestsAuth(aws_host="{}.execute-api.{}.amazonaws.com".format( 56 | config.REST_API_GATEWAY, config.AWS_REGION), 57 | aws_region=config.AWS_REGION, 58 | aws_service='execute-api') 59 | 60 | invoke_url = "https://{}.execute-api.{}.amazonaws.com/prod/invocations".format( 61 | config.REST_API_GATEWAY, config.AWS_REGION) 62 | 63 | requests.post(invoke_url, json=payload, auth=auth) 64 | -------------------------------------------------------------------------------- /source/notebooks/src/package/utils.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | import os 3 | import json 4 | 5 | 6 | def get_notebook_name(): 7 | with open('/opt/ml/metadata/resource-metadata.json') as openfile: 8 | data = json.load(openfile) 9 | notebook_name = data['ResourceName'] 10 | return notebook_name 11 | 12 | 13 | def get_current_folder(global_variables): 14 | # if calling from a file 15 | if "__file__" in global_variables: 16 | current_file = Path(global_variables["__file__"]) 17 | current_folder = current_file.parent.resolve() 18 | # if calling from a notebook 19 | else: 20 | current_folder = Path(os.getcwd()) 21 | return current_folder 22 | -------------------------------------------------------------------------------- /source/scripts/set_kernelspec.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | 5 | def set_kernel_spec(notebook_filepath, display_name, kernel_name): 6 | with open(notebook_filepath, "r") as openfile: 7 | notebook = json.load(openfile) 8 | kernel_spec = {"display_name": display_name, "language": "python", "name": kernel_name} 9 | if "metadata" not in notebook: 10 | notebook["metadata"] = {} 11 | notebook["metadata"]["kernelspec"] = kernel_spec 12 | with open(notebook_filepath, "w") as openfile: 13 | json.dump(notebook, openfile) 14 | 15 | 16 | if __name__ == "__main__": 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--notebook") 19 | parser.add_argument("--display-name") 20 | parser.add_argument("--kernel") 21 | args = parser.parse_args() 22 | set_kernel_spec(args.notebook, args.display_name, args.kernel) 23 | -------------------------------------------------------------------------------- /test/buildspec.yml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | 3 | batch: 4 | fast-fail: false 5 | build-list: 6 | - identifier: us_west_2 7 | env: 8 | variables: 9 | REGION: us-west-2 10 | - identifier: me_south_1 11 | env: 12 | variables: 13 | REGION: me-south-1 14 | - identifier: ap_east_1 15 | env: 16 | variables: 17 | REGION: ap-east-1 18 | - identifier: ap_northeast_1 19 | env: 20 | variables: 21 | REGION: ap-northeast-1 22 | - identifier: ap_northeast_2 23 | env: 24 | variables: 25 | REGION: ap-northeast-2 26 | - identifier: ap_south_1 27 | env: 28 | variables: 29 | REGION: ap-south-1 30 | - identifier: ap_southeast_1 31 | env: 32 | 
variables: 33 | REGION: ap-southeast-1 34 | - identifier: ap_southeast_2 35 | env: 36 | variables: 37 | REGION: ap-southeast-2 38 | - identifier: ca_central_1 39 | env: 40 | variables: 41 | REGION: ca-central-1 42 | - identifier: eu_central_1 43 | env: 44 | variables: 45 | REGION: eu-central-1 46 | - identifier: eu_north_1 47 | env: 48 | variables: 49 | REGION: eu-north-1 50 | - identifier: eu_west_1 51 | env: 52 | variables: 53 | REGION: eu-west-1 54 | - identifier: eu_west_2 55 | env: 56 | variables: 57 | REGION: eu-west-2 58 | - identifier: eu_west_3 59 | env: 60 | variables: 61 | REGION: eu-west-3 62 | - identifier: sa_east_1 63 | env: 64 | variables: 65 | REGION: sa-east-1 66 | - identifier: us_east_1 67 | env: 68 | variables: 69 | REGION: us-east-1 70 | - identifier: us_east_2 71 | env: 72 | variables: 73 | REGION: us-east-2 74 | - identifier: us_west_1 75 | env: 76 | variables: 77 | REGION: us-west-1 78 | 79 | env: 80 | variables: 81 | STACK_PREFIX: "sagemaker-soln-fdml-ci" 82 | shell: 'bash' 83 | 84 | phases: 85 | install: 86 | runtime-versions: 87 | python: 3.x 88 | commands: 89 | - pip3 install --upgrade pip 90 | - pip3 install jupyter 91 | - pip3 install papermill 92 | pre_build: 93 | commands: 94 | - export CI_BUCKET="${STACK_PREFIX}-${CODEBUILD_BUILD_ID:(-8)}-$REGION" 95 | - export STACK_NAME="${STACK_PREFIX}-${BRANCH}-${CODEBUILD_BUILD_ID:(-8)}" 96 | - echo "Testing removal of CI bucket, in case it's left over from previous run" 97 | - aws s3 rb --force "s3://${CI_BUCKET}" --region $REGION || true 98 | build: 99 | commands: 100 | - echo "Starting build `date` in `pwd`" 101 | - ls . 102 | - jupyter kernelspec list 103 | - aws s3 mb "s3://${CI_BUCKET}" --region $REGION 104 | - aws s3api wait bucket-exists --bucket "${CI_BUCKET}" 105 | - > 106 | papermill ./test/test_deployment.ipynb ./test/test_deployment_out.ipynb 107 | -p REGION "${REGION}" -p STACK_NAME "${STACK_NAME}" 108 | -p BRANCH "${BRANCH}" -p CI_BUCKET "${CI_BUCKET}" 109 | -p SOLUTIONS_BUCKET $SOLUTIONS_BUCKET 110 | -k python3 111 | --log-output 112 | post_build: 113 | commands: 114 | - aws s3 rb --force "s3://${CI_BUCKET}" --region $REGION 115 | - aws cloudformation --region $REGION delete-stack --stack-name "${STACK_NAME}" 116 | - aws cloudformation --region $REGION wait stack-delete-complete --stack-name "${STACK_NAME}" 117 | - echo "Build completed `date`" 118 | 119 | artifacts: 120 | files: 121 | - "**/*" 122 | -------------------------------------------------------------------------------- /test/run_notebook.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import logging 4 | 5 | import boto3 6 | import papermill as pm 7 | import watchtower 8 | 9 | from package import config, utils 10 | 11 | 12 | if __name__ == "__main__": 13 | 14 | run_on_start = False if config.TEST_OUTPUTS_S3_BUCKET == "" else True 15 | 16 | if not run_on_start: 17 | exit() 18 | 19 | cfn_client = boto3.client('cloudformation', region_name=config.AWS_REGION) 20 | 21 | # Set up logging through watchtower 22 | logging.basicConfig(level=logging.INFO) 23 | logger = logging.getLogger(__name__) 24 | log_group = "/aws/sagemaker/NotebookInstances" 25 | stream_name = "{}/run-notebook.log".format(utils.get_notebook_name()) 26 | logger.addHandler( 27 | watchtower.CloudWatchLogHandler(log_group=log_group, stream_name=stream_name)) 28 | # Add papermill logging to CloudWatch as well 29 | pm_logger = logging.getLogger('papermill') 30 | pm_logger.addHandler( 31 | 
watchtower.CloudWatchLogHandler(log_group=log_group, stream_name=stream_name)) 32 | 33 | # Wait for the stack to finish launching 34 | logger.info("Waiting for stack to finish launching...") 35 | waiter = cfn_client.get_waiter('stack_create_complete') 36 | 37 | waiter.wait(StackName=config.STACK_NAME) 38 | 39 | logger.info("Starting notebook execution through papermill") 40 | 41 | # Run the notebook 42 | bucket = config.TEST_OUTPUTS_S3_BUCKET 43 | prefix = 'integration-test' 44 | output_notebook = "output.ipynb" 45 | 46 | start_time = time.time() 47 | test_prefix = "/home/ec2-user/SageMaker/test/" 48 | with open(os.path.join(test_prefix, "output_stdout.txt"), 'w') as stdoutfile, open(os.path.join(test_prefix, "output_stderr.txt"), 'w') as stderrfile: 49 | try: 50 | nb = pm.execute_notebook( 51 | '/home/ec2-user/SageMaker/notebooks/sagemaker_fraud_detection.ipynb', 52 | os.path.join(test_prefix, output_notebook), 53 | cwd='/home/ec2-user/SageMaker/notebooks/', 54 | kernel_name='python3', 55 | stdout_file=stdoutfile, stderr_file=stderrfile, log_output=True 56 | ) 57 | except pm.PapermillExecutionError as err: 58 | logger.warn("Notebook encountered execution error: {}".format(err)) 59 | finally: 60 | end_time = time.time() 61 | logger.info("Notebook execution time: {} sec.".format(end_time - start_time)) 62 | s3 = boto3.resource('s3') 63 | # Upload notebook output file to S3 64 | s3.meta.client.upload_file( 65 | os.path.join(test_prefix, output_notebook), bucket, os.path.join(prefix, output_notebook)) 66 | s3.meta.client.upload_file( 67 | os.path.join(test_prefix, "output_stdout.txt"), bucket, os.path.join(prefix, "output_stdout.txt")) 68 | s3.meta.client.upload_file( 69 | os.path.join(test_prefix, "output_stderr.txt"), bucket, os.path.join(prefix, "output_stderr.txt")) 70 | -------------------------------------------------------------------------------- /test/test_deployment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "This noteboook launches the solution, with a parameter that instructs the instance to run the solution's notebook using papermill, wait for that process to finish, then raise any errors encountered while running the notebook to the build.\n", 8 | "\n", 9 | "The _build instance_ will launch the solution using the following parameters, which can be overriden by providing them as enviroment variables in the build settings. Since the build instance is launching the solution, the build project needs to be provided with all the permissions that are necessary to launch the solution." 
10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": null, 15 | "metadata": { 16 | "tags": [ 17 | "parameters" 18 | ] 19 | }, 20 | "outputs": [], 21 | "source": [ 22 | "BRANCH=\"mainline\"\n", 23 | "REGION=\"us-west-2\"\n", 24 | "SOLUTIONS_BUCKET=\"sagemaker-solutions-devo\"\n", 25 | "SOLUTION_NAME=\"Fraud-detection-using-machine-learning\"\n", 26 | "STACK_NAME=\"sagemaker-soln-fdml-ci\"\n", 27 | "STACK_VERSION=\"development\"\n", 28 | "COMMIT_ID = \"\"\n", 29 | "CI_BUCKET = \"\"\n", 30 | "# TODO: Get timeout from build, and divide by 1 min to get number of attempts\n", 31 | "NOTEBOOK_POLL_ATTEMPTS=120 # Number of attempts while waiting for SM notebook to execute and produce output on S3\n", 32 | "NOTEBOOK_POLL_DELAY=60 # Delay between each attempt, in seconds" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "The next cell programmatically creates the URL for the solution's template, based on the parameters passed above. It's important to include the branch suffix to be able to support feature branches as well as the mainline release pipeline." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": null, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "branch_suffix = \"\" if BRANCH == \"mainline\" else f\"-{BRANCH}\"\n", 49 | "template_url = f\"https://{SOLUTIONS_BUCKET}-{REGION}.s3.{REGION}.amazonaws.com/{SOLUTION_NAME}{branch_suffix}/deployment/fraud-detection-using-machine-learning.yaml\"" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "In the next cell we create a unique prefix for our solution, and create an S3 bucket that will serve as the destination for the notebook files we run on the SM instance. It's important that its name starts with the solution prefix, as that will allow the solution itself to write to it (because the solution should have write access to all `sagemaker-soln-` buckets under the same account)." 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "import boto3\n", 66 | "import uuid\n", 67 | "import logging\n", 68 | "import os\n", 69 | "import uuid\n", 70 | "\n", 71 | "logging.basicConfig(level=os.environ.get(\"LOGLEVEL\", \"INFO\"))\n", 72 | "\n", 73 | "\n", 74 | "cfn_client = boto3.client('cloudformation', region_name=REGION)\n", 75 | "s3_client = boto3.client('s3', region_name=REGION)\n", 76 | "s3 = boto3.resource('s3', region_name=REGION)\n", 77 | "\n", 78 | "unique_id = uuid.uuid4().hex[:8]\n", 79 | "\n", 80 | "# Give the solution a unique prefix\n", 81 | "solution_prefix = \"sagemaker-soln-fdml-\" # TODO: Get from template directly\n", 82 | "unique_prefix = f\"{solution_prefix}{unique_id}\"" 83 | ] 84 | }, 85 | { 86 | "cell_type": "markdown", 87 | "metadata": {}, 88 | "source": [ 89 | "The `TestOutputsS3Bucket` CloudFormation parameter given in the next cell, is parsed by CloudFormation and taken in by the project's configuration package (see `source/notebooks/src/package/config.py`). When this parameter is set to something different than `\"\"`, the notebook instance will run the solution's notebook using papermill, through the instance's on-start script (see `deployment/fraud-detection-sagemaker-notebook-instance.yaml`)." 
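Before creating the stack in the next cell, it can be useful to fail fast if the computed `template_url` does not point to an existing object (for example because of a typo in the branch name). This is an optional, hedged sketch; it assumes the build role is allowed to read the regional solutions bucket.

```python
# Optional, hedged sanity check: verify the solution template exists before create_stack.
# Assumes the build role can read the solutions bucket for this region.
template_bucket = f"{SOLUTIONS_BUCKET}-{REGION}"
template_key = f"{SOLUTION_NAME}{branch_suffix}/deployment/fraud-detection-using-machine-learning.yaml"
s3_client.head_object(Bucket=template_bucket, Key=template_key)  # raises ClientError if the object is missing
```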
90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": null, 95 | "metadata": {}, 96 | "outputs": [], 97 | "source": [ 98 | "logging.info(f\"Creating stack using template located at {template_url}\")\n", 99 | "logging.info(f\"STACK_NAME: {STACK_NAME}\")\n", 100 | "logging.info(f\"REGION: {REGION}\")\n", 101 | "logging.info(f\"SOLUTIONS_BUCKET: {SOLUTIONS_BUCKET}\")\n", 102 | "logging.info(f\"CI_BUCKET: {CI_BUCKET}\")\n", 103 | "logging.info(f\"StackVersion: {STACK_VERSION}\")\n", 104 | "logging.info(f\"SolutionPrefix: {unique_prefix}\")\n", 105 | "\n", 106 | "cfn_client.create_stack(\n", 107 | " StackName=STACK_NAME,\n", 108 | " TemplateURL=template_url,\n", 109 | " Parameters=[\n", 110 | " {\n", 111 | " 'ParameterKey': 'SolutionPrefix',\n", 112 | " 'ParameterValue': unique_prefix\n", 113 | " },\n", 114 | " {\n", 115 | " 'ParameterKey': 'StackVersion',\n", 116 | " 'ParameterValue': STACK_VERSION\n", 117 | " },\n", 118 | " {\n", 119 | " 'ParameterKey': 'TestOutputsS3Bucket',\n", 120 | " 'ParameterValue': CI_BUCKET\n", 121 | " },\n", 122 | " {\n", 123 | " 'ParameterKey': 'SolutionName',\n", 124 | " 'ParameterValue': f\"{SOLUTION_NAME}{branch_suffix}\"\n", 125 | " }\n", 126 | " ],\n", 127 | " Capabilities=[\n", 128 | " 'CAPABILITY_IAM',\n", 129 | " 'CAPABILITY_NAMED_IAM'\n", 130 | " ]\n", 131 | ")" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "We then wait for the stack to finish launching." 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "logging.info(\"Waiting for stack creation to complete...\")\n", 148 | "waiter = cfn_client.get_waiter('stack_create_complete')\n", 149 | "\n", 150 | "waiter.wait(StackName=STACK_NAME)\n", 151 | "logging.info(\"Stack creation complete, notebook run has begun...\")\n", 152 | "\n", 153 | "logging.info(\"Notebook instance run logs will be available at:\")\n", 154 | "logging.info(f\"https://{REGION}.console.aws.amazon.com/cloudwatch/home?region=us-west-2#logsV2:log-groups/log-group/$252Faws$252Fsagemaker$252FNotebookInstances/log-events/{unique_prefix}-notebook-instance$252Frun-notebook.log\")" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "Once the stack has finished creating, the OnStart script will attempt to run the `sagemaker_fraud_detection.ipynb` notebook, through the `test/run_notebook.py` script. The notebook is run using papermill, and creates an output in the CI S3 bucket we created previously. The following cell will continuously poll the expected location until the output file appears, or errors out after `NOTEBOOK_POLL_DELAY * NOTEBOOK_POLL_ATTEMPTS` seconds. This also means that the CodeBuild project needs to be able to read files from the particular bucket.\n", 162 | "\n", 163 | "Note that if this is longer than the build stage's timeout, the build stage will fail. If your solution's notebooks take very long to run, make sure to [increase the build stage's time out](https://docs.aws.amazon.com/codebuild/latest/userguide/change-project-console.html) as well, can be set using a parameter in the CFT you used to create the pipeline." 
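As a quick sanity check of the timeout math described above, with the default `NOTEBOOK_POLL_DELAY` of 60 seconds and `NOTEBOOK_POLL_ATTEMPTS` of 120 from the parameters cell, the polling below gives up after two hours:

```python
# Worked example of the polling timeout with the default parameter values.
max_wait_seconds = NOTEBOOK_POLL_DELAY * NOTEBOOK_POLL_ATTEMPTS  # 60 * 120 = 7200
print(f"Polling gives up after {max_wait_seconds} seconds ({max_wait_seconds / 3600:.1f} hours)")
```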
164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": null, 169 | "metadata": {}, 170 | "outputs": [], 171 | "source": [ 172 | "# TODO: Ensure there's a single source for these filenames, either in the config, or passed as a papermill parameter?\n", 173 | "# Right now they're set here and in run_notebook.py\n", 174 | "import os\n", 175 | "prefix = 'integration-test' \n", 176 | "key = \"output.ipynb\"\n", 177 | "\n", 178 | "\n", 179 | "\n", 180 | "waiter = s3_client.get_waiter('object_exists')\n", 181 | "\n", 182 | "logging.info(f\"Waiting for output notebook to appear at {CI_BUCKET}/{os.path.join(prefix, key)}...\")\n", 183 | "logging.info(f\"Will attempt a total {NOTEBOOK_POLL_ATTEMPTS} polls every {NOTEBOOK_POLL_DELAY} seconds.\")\n", 184 | "waiter.wait(Bucket=CI_BUCKET, Key=os.path.join(prefix, key), WaiterConfig={'Delay': NOTEBOOK_POLL_DELAY,'MaxAttempts': NOTEBOOK_POLL_ATTEMPTS})" 185 | ] 186 | }, 187 | { 188 | "cell_type": "markdown", 189 | "metadata": {}, 190 | "source": [ 191 | "Once the notebook appears in the expected location in S3, we download it locally within the build instance, and the stdout and stderr output we got from running the notebook. This doesn't actually run the notebook, but will raise and surface any errors that we triggered during execution on the SM notebook instance. If your solution needs to run more than one notebook you would need to wait for each one to finish in the order you expect them to execute, download them, then dry-run them sequentially here." 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": null, 197 | "metadata": {}, 198 | "outputs": [], 199 | "source": [ 200 | "# Dry-run execute the notebook, raising errors if any existed\n", 201 | "import papermill as pm\n", 202 | "\n", 203 | "logging.info(\"Downloading notebook outputs locally...\")\n", 204 | "s3.meta.client.download_file(CI_BUCKET, os.path.join(prefix, key), key)\n", 205 | "try:\n", 206 | " s3.meta.client.download_file(CI_BUCKET, os.path.join(prefix, \"output_stdout.txt\"), \"output_stdout.txt\")\n", 207 | " s3.meta.client.download_file(CI_BUCKET, os.path.join(prefix, \"output_stderr.txt\"), \"output_stderr.txt\")\n", 208 | "except:\n", 209 | " pass\n", 210 | "\n", 211 | "# TODO: this notebook filename should also be a parameter\n", 212 | "logging.info(\"Performing dry-run of notebooks to surface any errors...\")\n", 213 | "nb = pm.iorw.load_notebook_node(key)\n", 214 | "pm.execute.raise_for_execution_errors(nb, key)\n", 215 | "\n", 216 | "print(\"Test deployment and notebook execution completed successfully!\")" 217 | ] 218 | }, 219 | { 220 | "cell_type": "markdown", 221 | "metadata": {}, 222 | "source": [ 223 | "The build project's artifacts will include all the files you download locally here, so they will end up on S3, where you can go and check out the output to debug any errors in this or the solution's notebook. You can find the build artifacts by browsing to the CI build stage in your pipeline." 
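For solutions with more than one notebook, the wait/download/dry-run steps above generalize to a simple loop. This is a hypothetical sketch; the extra notebook names in the comment are examples only, and every other name (`waiter`, `s3`, `CI_BUCKET`, `prefix`, `key`, the polling parameters) comes from the cells above.

```python
# Hypothetical sketch: wait for, download, and dry-run several output notebooks in order.
import os
import papermill as pm

notebook_keys = [key]  # e.g. ["output_part1.ipynb", "output_part2.ipynb"]
for nb_key in notebook_keys:
    waiter.wait(Bucket=CI_BUCKET, Key=os.path.join(prefix, nb_key),
                WaiterConfig={'Delay': NOTEBOOK_POLL_DELAY, 'MaxAttempts': NOTEBOOK_POLL_ATTEMPTS})
    s3.meta.client.download_file(CI_BUCKET, os.path.join(prefix, nb_key), nb_key)
    nb = pm.iorw.load_notebook_node(nb_key)
    pm.execute.raise_for_execution_errors(nb, nb_key)  # surfaces any cell errors in the build log
```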
224 | ] 225 | } 226 | ], 227 | "metadata": { 228 | "celltoolbar": "Tags", 229 | "kernelspec": { 230 | "display_name": "Python 3", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.6.10-final" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 4 249 | } -------------------------------------------------------------------------------- /test/test_deployment_out.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "papermill": { 7 | "duration": 0.012236, 8 | "end_time": "2020-11-25T19:35:41.483644", 9 | "exception": false, 10 | "start_time": "2020-11-25T19:35:41.471408", 11 | "status": "completed" 12 | }, 13 | "tags": [] 14 | }, 15 | "source": [ 16 | "This noteboook launches the solution, with a parameter that instructs the instance to run the solution's notebook using papermill, wait for that process to finish, then raise any errors encountered while running the notebook to the build.\n", 17 | "\n", 18 | "The _build instance_ will launch the solution using the following parameters, which can be overriden by providing them as enviroment variables in the build settings. Since the build instance is launching the solution, the build project needs to be provided with all the permissions that are necessary to launch the solution." 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 1, 24 | "metadata": { 25 | "execution": { 26 | "iopub.execute_input": "2020-11-25T19:35:41.502340Z", 27 | "iopub.status.busy": "2020-11-25T19:35:41.501888Z", 28 | "iopub.status.idle": "2020-11-25T19:35:41.503443Z", 29 | "shell.execute_reply": "2020-11-25T19:35:41.503829Z" 30 | }, 31 | "papermill": { 32 | "duration": 0.013273, 33 | "end_time": "2020-11-25T19:35:41.504077", 34 | "exception": false, 35 | "start_time": "2020-11-25T19:35:41.490804", 36 | "status": "completed" 37 | }, 38 | "tags": [ 39 | "parameters" 40 | ] 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "BRANCH=\"mainline\"\n", 45 | "REGION=\"us-west-2\"\n", 46 | "SOLUTIONS_BUCKET=\"sagemaker-solutions-devo\"\n", 47 | "SOLUTION_NAME=\"Fraud-detection-using-machine-learning\"\n", 48 | "STACK_NAME=\"sagemaker-soln-fdml-ci\"\n", 49 | "STACK_VERSION=\"development\"\n", 50 | "COMMIT_ID = \"\"\n", 51 | "CI_BUCKET = \"\"\n", 52 | "EXECUTION_ID = \"\"\n", 53 | "NOTEBOOK_POLL_ATTEMPTS=120 # Number of attempts while waiting for SM notebook to execute and produce output on S3\n", 54 | "NOTEBOOK_POLL_DELAY=60 # Delay between each attempt, in seconds" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 2, 60 | "metadata": { 61 | "execution": { 62 | "iopub.execute_input": "2020-11-25T19:35:41.519169Z", 63 | "iopub.status.busy": "2020-11-25T19:35:41.518776Z", 64 | "iopub.status.idle": "2020-11-25T19:35:41.520639Z", 65 | "shell.execute_reply": "2020-11-25T19:35:41.520252Z" 66 | }, 67 | "papermill": { 68 | "duration": 0.010379, 69 | "end_time": "2020-11-25T19:35:41.520724", 70 | "exception": false, 71 | "start_time": "2020-11-25T19:35:41.510345", 72 | "status": "completed" 73 | }, 74 | "tags": [ 75 | "injected-parameters" 76 | ] 77 | }, 78 | "outputs": [], 79 | "source": [ 80 | "# Parameters\n", 81 | "STACK_NAME = 
\"sagemaker-soln-fdml-725e04-me-south-1\"\n", 82 | "BRANCH = \"multi-region-ci\"\n", 83 | "EXECUTION_ID = \"589f83f6-3aad-487e-81d2-211a6a725e04\"\n", 84 | "CI_BUCKET = \"sagemaker-soln-fdml-725e04-me-south-1\"\n", 85 | "REGION = \"me-south-1\"\n", 86 | "SOLUTIONS_BUCKET = \"thvasilo-dev-test\"\n", 87 | "STACK_VERSION = \"development\"\n" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "papermill": { 94 | "duration": 0.005728, 95 | "end_time": "2020-11-25T19:35:41.532448", 96 | "exception": false, 97 | "start_time": "2020-11-25T19:35:41.526720", 98 | "status": "completed" 99 | }, 100 | "tags": [] 101 | }, 102 | "source": [ 103 | "The next cell programmatically creates the URL for the solution's template, based on the parameters passed above. It's important to include the branch suffix to be able to support feature branches as well as the mainline release pipeline." 104 | ] 105 | }, 106 | { 107 | "cell_type": "code", 108 | "execution_count": 3, 109 | "metadata": { 110 | "execution": { 111 | "iopub.execute_input": "2020-11-25T19:35:41.546759Z", 112 | "iopub.status.busy": "2020-11-25T19:35:41.546377Z", 113 | "iopub.status.idle": "2020-11-25T19:35:41.547881Z", 114 | "shell.execute_reply": "2020-11-25T19:35:41.548196Z" 115 | }, 116 | "papermill": { 117 | "duration": 0.00998, 118 | "end_time": "2020-11-25T19:35:41.548297", 119 | "exception": false, 120 | "start_time": "2020-11-25T19:35:41.538317", 121 | "status": "completed" 122 | }, 123 | "tags": [] 124 | }, 125 | "outputs": [], 126 | "source": [ 127 | "branch_suffix = \"\" if BRANCH == \"mainline\" else f\"-{BRANCH}\"\n", 128 | "template_url = f\"https://{SOLUTIONS_BUCKET}-{REGION}.s3.{REGION}.amazonaws.com/{SOLUTION_NAME}{branch_suffix}/deployment/fraud-detection-using-machine-learning.yaml\"" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": { 134 | "papermill": { 135 | "duration": 0.006514, 136 | "end_time": "2020-11-25T19:35:41.560519", 137 | "exception": false, 138 | "start_time": "2020-11-25T19:35:41.554005", 139 | "status": "completed" 140 | }, 141 | "tags": [] 142 | }, 143 | "source": [ 144 | "In the next cell we create a unique prefix for our solution, and create an S3 bucket that will serve as the destination for the notebook files we run on the SM instance. It's important that its name starts with the solution prefix, as that will allow the solution itself to write to it (because the solution should have write access to all `sagemaker-soln-` buckets under the same account)." 
145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 4, 150 | "metadata": { 151 | "execution": { 152 | "iopub.execute_input": "2020-11-25T19:35:41.575998Z", 153 | "iopub.status.busy": "2020-11-25T19:35:41.575526Z", 154 | "iopub.status.idle": "2020-11-25T19:35:41.775674Z", 155 | "shell.execute_reply": "2020-11-25T19:35:41.775996Z" 156 | }, 157 | "papermill": { 158 | "duration": 0.209284, 159 | "end_time": "2020-11-25T19:35:41.776114", 160 | "exception": false, 161 | "start_time": "2020-11-25T19:35:41.566830", 162 | "status": "completed" 163 | }, 164 | "tags": [] 165 | }, 166 | "outputs": [ 167 | { 168 | "name": "stderr", 169 | "output_type": "stream", 170 | "text": [ 171 | "INFO:botocore.credentials:Found credentials in shared credentials file: ~/.aws/credentials\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "import boto3\n", 177 | "import uuid\n", 178 | "import logging\n", 179 | "import os\n", 180 | "\n", 181 | "logging.basicConfig(level=os.environ.get(\"LOGLEVEL\", \"INFO\"))\n", 182 | "\n", 183 | "\n", 184 | "cfn_client = boto3.client('cloudformation', region_name=REGION)\n", 185 | "s3_client = boto3.client('s3', region_name=REGION)\n", 186 | "s3 = boto3.resource('s3', region_name=REGION)\n", 187 | "\n", 188 | "# Use the commit id to give the solution a unique prefix and name\n", 189 | "solution_prefix = \"sagemaker-soln-fdml-\" # TODO: Get from template directly\n", 190 | "unique_prefix = f\"{solution_prefix}{EXECUTION_ID[-6:]}-{REGION}\"" 191 | ] 192 | }, 193 | { 194 | "cell_type": "markdown", 195 | "metadata": { 196 | "papermill": { 197 | "duration": 0.00573, 198 | "end_time": "2020-11-25T19:35:41.788312", 199 | "exception": false, 200 | "start_time": "2020-11-25T19:35:41.782582", 201 | "status": "completed" 202 | }, 203 | "tags": [] 204 | }, 205 | "source": [ 206 | "The `TestOutputsS3Bucket` CloudFormation parameter given in the next cell, is parsed by CloudFormation and taken in by the project's configuration package (see `source/notebooks/src/package/config.py`). When this parameter is set to something different than `\"\"`, the notebook instance will run the solution's notebook using papermill, through the instance's on-start script (see `deployment/fraud-detection-sagemaker-notebook-instance.yaml`)." 
207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 5, 212 | "metadata": { 213 | "execution": { 214 | "iopub.execute_input": "2020-11-25T19:35:41.813269Z", 215 | "iopub.status.busy": "2020-11-25T19:35:41.812473Z", 216 | "iopub.status.idle": "2020-11-25T19:35:43.474452Z", 217 | "shell.execute_reply": "2020-11-25T19:35:43.474005Z" 218 | }, 219 | "papermill": { 220 | "duration": 1.677137, 221 | "end_time": "2020-11-25T19:35:43.474599", 222 | "exception": false, 223 | "start_time": "2020-11-25T19:35:41.797462", 224 | "status": "completed" 225 | }, 226 | "tags": [] 227 | }, 228 | "outputs": [ 229 | { 230 | "name": "stderr", 231 | "output_type": "stream", 232 | "text": [ 233 | "INFO:root:Creating stack using template located at https://thvasilo-dev-test-me-south-1.s3.me-south-1.amazonaws.com/Fraud-detection-using-machine-learning-multi-region-ci/deployment/fraud-detection-using-machine-learning.yaml\n" 234 | ] 235 | }, 236 | { 237 | "name": "stderr", 238 | "output_type": "stream", 239 | "text": [ 240 | "INFO:root:STACK_NAME: sagemaker-soln-fdml-725e04-me-south-1\n" 241 | ] 242 | }, 243 | { 244 | "name": "stderr", 245 | "output_type": "stream", 246 | "text": [ 247 | "INFO:root:REGION: me-south-1\n" 248 | ] 249 | }, 250 | { 251 | "name": "stderr", 252 | "output_type": "stream", 253 | "text": [ 254 | "INFO:root:SOLUTIONS_BUCKET: thvasilo-dev-test\n" 255 | ] 256 | }, 257 | { 258 | "name": "stderr", 259 | "output_type": "stream", 260 | "text": [ 261 | "INFO:root:CI_BUCKET: sagemaker-soln-fdml-725e04-me-south-1\n" 262 | ] 263 | }, 264 | { 265 | "name": "stderr", 266 | "output_type": "stream", 267 | "text": [ 268 | "INFO:root:StackVersion: development\n" 269 | ] 270 | }, 271 | { 272 | "data": { 273 | "text/plain": [ 274 | "{'StackId': 'arn:aws:cloudformation:me-south-1:412868550678:stack/sagemaker-soln-fdml-725e04-me-south-1/680ab570-2f55-11eb-8873-0686b2c3ec60',\n", 275 | " 'ResponseMetadata': {'RequestId': '1bfdc5e1-b6ef-4ec5-8a44-aedceb29aa8b',\n", 276 | " 'HTTPStatusCode': 200,\n", 277 | " 'HTTPHeaders': {'x-amzn-requestid': '1bfdc5e1-b6ef-4ec5-8a44-aedceb29aa8b',\n", 278 | " 'content-type': 'text/xml',\n", 279 | " 'content-length': '408',\n", 280 | " 'date': 'Wed, 25 Nov 2020 19:35:42 GMT'},\n", 281 | " 'RetryAttempts': 0}}" 282 | ] 283 | }, 284 | "execution_count": 5, 285 | "metadata": {}, 286 | "output_type": "execute_result" 287 | } 288 | ], 289 | "source": [ 290 | "logging.info(f\"Creating stack using template located at {template_url}\")\n", 291 | "logging.info(f\"STACK_NAME: {STACK_NAME}\")\n", 292 | "logging.info(f\"REGION: {REGION}\")\n", 293 | "logging.info(f\"SOLUTIONS_BUCKET: {SOLUTIONS_BUCKET}\")\n", 294 | "logging.info(f\"CI_BUCKET: {CI_BUCKET}\")\n", 295 | "logging.info(f\"StackVersion: {STACK_VERSION}\")\n", 296 | "\n", 297 | "cfn_client.create_stack(\n", 298 | " StackName=STACK_NAME,\n", 299 | " TemplateURL=template_url,\n", 300 | " Parameters=[\n", 301 | " {\n", 302 | " 'ParameterKey': 'SolutionPrefix',\n", 303 | " 'ParameterValue': unique_prefix\n", 304 | " },\n", 305 | " {\n", 306 | " 'ParameterKey': 'StackVersion',\n", 307 | " 'ParameterValue': STACK_VERSION\n", 308 | " },\n", 309 | " {\n", 310 | " 'ParameterKey': 'TestOutputsS3Bucket',\n", 311 | " 'ParameterValue': CI_BUCKET\n", 312 | " },\n", 313 | " {\n", 314 | " 'ParameterKey': 'SolutionName',\n", 315 | " 'ParameterValue': f\"{SOLUTION_NAME}{branch_suffix}\"\n", 316 | " }\n", 317 | " ],\n", 318 | " Capabilities=[\n", 319 | " 'CAPABILITY_IAM',\n", 320 | " 'CAPABILITY_NAMED_IAM'\n", 321 | " ]\n", 322 
| ")" 323 | ] 324 | }, 325 | { 326 | "cell_type": "markdown", 327 | "metadata": { 328 | "papermill": { 329 | "duration": 0.007602, 330 | "end_time": "2020-11-25T19:35:43.491854", 331 | "exception": false, 332 | "start_time": "2020-11-25T19:35:43.484252", 333 | "status": "completed" 334 | }, 335 | "tags": [] 336 | }, 337 | "source": [ 338 | "We then wait for the stack to finish launching." 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 6, 344 | "metadata": { 345 | "execution": { 346 | "iopub.execute_input": "2020-11-25T19:35:43.510407Z", 347 | "iopub.status.busy": "2020-11-25T19:35:43.509925Z", 348 | "iopub.status.idle": "2020-11-25T19:41:26.849062Z", 349 | "shell.execute_reply": "2020-11-25T19:41:26.849687Z" 350 | }, 351 | "papermill": { 352 | "duration": 343.349818, 353 | "end_time": "2020-11-25T19:41:26.850142", 354 | "exception": false, 355 | "start_time": "2020-11-25T19:35:43.500324", 356 | "status": "completed" 357 | }, 358 | "tags": [] 359 | }, 360 | "outputs": [ 361 | { 362 | "name": "stderr", 363 | "output_type": "stream", 364 | "text": [ 365 | "INFO:root:Waiting for stack creation to complete...\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "logging.info(\"Waiting for stack creation to complete...\")\n", 371 | "waiter = cfn_client.get_waiter('stack_create_complete')\n", 372 | "\n", 373 | "waiter.wait(StackName=STACK_NAME)" 374 | ] 375 | }, 376 | { 377 | "cell_type": "markdown", 378 | "metadata": { 379 | "papermill": { 380 | "duration": 0.007849, 381 | "end_time": "2020-11-25T19:41:26.870715", 382 | "exception": false, 383 | "start_time": "2020-11-25T19:41:26.862866", 384 | "status": "completed" 385 | }, 386 | "tags": [] 387 | }, 388 | "source": [ 389 | "Once the stack has finished creating, the OnStart script will attempt to run the `sagemaker_fraud_detection.ipynb` notebook, through the `test/run_notebook.py` script. The notebook is run using papermill, and creates an output in the CI S3 bucket we created previously. The following cell will continuously poll the expected location until the output file appears, or errors out after `NOTEBOOK_POLL_DELAY * NOTEBOOK_POLL_ATTEMPTS` seconds. This also means that the CodeBuild project needs to be able to read files from the particular bucket.\n", 390 | "\n", 391 | "Note that if this is longer than the build stage's timeout, the build stage will fail. If your solution's notebooks take very long to run, make sure to [increase the build stage's time out](https://docs.aws.amazon.com/codebuild/latest/userguide/change-project-console.html) as well, can be set using a parameter in the CFT you used to create the pipeline." 
392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "execution_count": 7, 397 | "metadata": { 398 | "execution": { 399 | "iopub.execute_input": "2020-11-25T19:41:26.892332Z", 400 | "iopub.status.busy": "2020-11-25T19:41:26.891930Z", 401 | "iopub.status.idle": "2020-11-25T20:19:13.673993Z", 402 | "shell.execute_reply": "2020-11-25T20:19:13.674802Z" 403 | }, 404 | "papermill": { 405 | "duration": 2266.796426, 406 | "end_time": "2020-11-25T20:19:13.675623", 407 | "exception": false, 408 | "start_time": "2020-11-25T19:41:26.879197", 409 | "status": "completed" 410 | }, 411 | "tags": [] 412 | }, 413 | "outputs": [ 414 | { 415 | "name": "stderr", 416 | "output_type": "stream", 417 | "text": [ 418 | "INFO:root:Waiting for output notebook to appear at sagemaker-soln-fdml-725e04-me-south-1/integration-test/output.ipynb...\n" 419 | ] 420 | }, 421 | { 422 | "name": "stderr", 423 | "output_type": "stream", 424 | "text": [ 425 | "INFO:root:Will attempt a total 120 every 60 seconds.\n" 426 | ] 427 | } 428 | ], 429 | "source": [ 430 | "# TODO: Ensure there's a single source for these filenames, either in the config, or passed as a papermill parameter?\n", 431 | "# Right now they're set here and in run_notebook.py\n", 432 | "import os\n", 433 | "prefix = 'integration-test' \n", 434 | "key = \"output.ipynb\"\n", 435 | "\n", 436 | "\n", 437 | "\n", 438 | "waiter = s3_client.get_waiter('object_exists')\n", 439 | "\n", 440 | "logging.info(f\"Waiting for output notebook to appear at {CI_BUCKET}/{os.path.join(prefix, key)}...\")\n", 441 | "logging.info(f\"Will attempt a total {NOTEBOOK_POLL_ATTEMPTS} every {NOTEBOOK_POLL_DELAY} seconds.\")\n", 442 | "waiter.wait(Bucket=CI_BUCKET, Key=os.path.join(prefix, key), WaiterConfig={'Delay': NOTEBOOK_POLL_DELAY,'MaxAttempts': NOTEBOOK_POLL_ATTEMPTS})" 443 | ] 444 | }, 445 | { 446 | "cell_type": "markdown", 447 | "metadata": { 448 | "papermill": { 449 | "duration": 0.009378, 450 | "end_time": "2020-11-25T20:19:13.700811", 451 | "exception": false, 452 | "start_time": "2020-11-25T20:19:13.691433", 453 | "status": "completed" 454 | }, 455 | "tags": [] 456 | }, 457 | "source": [ 458 | "Once the notebook appears in the expected location in S3, we download it locally within the build instance, and the stdout and stderr output we got from running the notebook. This doesn't actually run the notebook, but will raise and surface any errors that we triggered during execution on the SM notebook instance. If your solution needs to run more than one notebook you would need to wait for each one to finish in the order you expect them to execute, download them, then dry-run them sequentially here." 
459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 8, 464 | "metadata": { 465 | "execution": { 466 | "iopub.execute_input": "2020-11-25T20:19:13.728566Z", 467 | "iopub.status.busy": "2020-11-25T20:19:13.728056Z", 468 | "iopub.status.idle": "2020-11-25T20:19:17.873433Z", 469 | "shell.execute_reply": "2020-11-25T20:19:17.872855Z" 470 | }, 471 | "papermill": { 472 | "duration": 4.163517, 473 | "end_time": "2020-11-25T20:19:17.873583", 474 | "exception": false, 475 | "start_time": "2020-11-25T20:19:13.710066", 476 | "status": "completed" 477 | }, 478 | "tags": [] 479 | }, 480 | "outputs": [ 481 | { 482 | "name": "stderr", 483 | "output_type": "stream", 484 | "text": [ 485 | "INFO:root:Downloading notebook outputs locally...\n" 486 | ] 487 | }, 488 | { 489 | "name": "stderr", 490 | "output_type": "stream", 491 | "text": [ 492 | "INFO:root:Performing dry-run of notebooks to surface any errors...\n" 493 | ] 494 | }, 495 | { 496 | "name": "stdout", 497 | "output_type": "stream", 498 | "text": [ 499 | "Test deployment and notebook execution completed successfully!\n" 500 | ] 501 | } 502 | ], 503 | "source": [ 504 | "# Dry-run execute the notebook, raising errors if any existed\n", 505 | "import papermill as pm\n", 506 | "\n", 507 | "logging.info(\"Downloading notebook outputs locally...\")\n", 508 | "s3.meta.client.download_file(CI_BUCKET, os.path.join(prefix, key), key)\n", 509 | "try:\n", 510 | " s3.meta.client.download_file(CI_BUCKET, os.path.join(prefix, \"output_stdout.txt\"), \"output_stdout.txt\")\n", 511 | " s3.meta.client.download_file(CI_BUCKET, os.path.join(prefix, \"output_stderr.txt\"), \"output_stderr.txt\")\n", 512 | "except:\n", 513 | " pass\n", 514 | "\n", 515 | "# TODO: this notebook filename should also be a parameter\n", 516 | "logging.info(\"Performing dry-run of notebooks to surface any errors...\")\n", 517 | "nb = pm.iorw.load_notebook_node(key)\n", 518 | "pm.execute.raise_for_execution_errors(nb, key)\n", 519 | "\n", 520 | "print(\"Test deployment and notebook execution completed successfully!\")" 521 | ] 522 | }, 523 | { 524 | "cell_type": "markdown", 525 | "metadata": { 526 | "papermill": { 527 | "duration": 0.011738, 528 | "end_time": "2020-11-25T20:19:17.897345", 529 | "exception": false, 530 | "start_time": "2020-11-25T20:19:17.885607", 531 | "status": "completed" 532 | }, 533 | "tags": [] 534 | }, 535 | "source": [ 536 | "The build project's artifacts will include all the files you download locally here, so they will end up on S3, where you can go and check out the output to debug any errors in this or the solution's notebook. You can find the build artifacts by browsing to the CI build stage in your pipeline." 
537 | ] 538 | } 539 | ], 540 | "metadata": { 541 | "celltoolbar": "Tags", 542 | "kernelspec": { 543 | "display_name": "Python 3", 544 | "language": "python", 545 | "name": "python3" 546 | }, 547 | "language_info": { 548 | "codemirror_mode": { 549 | "name": "ipython", 550 | "version": 3 551 | }, 552 | "file_extension": ".py", 553 | "mimetype": "text/x-python", 554 | "name": "python", 555 | "nbconvert_exporter": "python", 556 | "pygments_lexer": "ipython3", 557 | "version": "3.7.4" 558 | }, 559 | "papermill": { 560 | "duration": 2617.877708, 561 | "end_time": "2020-11-25T20:19:18.223213", 562 | "environment_variables": {}, 563 | "exception": null, 564 | "input_path": "./test/test_deployment.ipynb", 565 | "output_path": "./test/test_deployment_out.ipynb", 566 | "parameters": { 567 | "BRANCH": "multi-region-ci", 568 | "CI_BUCKET": "sagemaker-soln-fdml-725e04-me-south-1", 569 | "EXECUTION_ID": "589f83f6-3aad-487e-81d2-211a6a725e04", 570 | "REGION": "me-south-1", 571 | "SOLUTIONS_BUCKET": "thvasilo-dev-test", 572 | "STACK_NAME": "sagemaker-soln-fdml-725e04-me-south-1", 573 | "STACK_VERSION": "development" 574 | }, 575 | "start_time": "2020-11-25T19:35:40.345505", 576 | "version": "2.1.2" 577 | } 578 | }, 579 | "nbformat": 4, 580 | "nbformat_minor": 4 581 | } --------------------------------------------------------------------------------