├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── feature-request.md │ └── general.md └── PULL_REQUEST_TEMPLATE │ └── pull_request_template.md ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── deployment ├── app_code │ ├── data │ │ ├── initial_contacts.csv │ │ └── update_contacts.csv │ ├── ecr_build_src.zip │ ├── job │ │ ├── delta_load.ipynb │ │ ├── initial_load.ipynb │ │ ├── scd2_merge.ipynb │ │ └── wordcount.py │ ├── meta │ │ └── contact_meta_0.json │ └── sql │ │ ├── add_calc_field_for_scd2.sql │ │ ├── create_table_contact.sql │ │ └── sqlvalidate_errors.sql ├── build-s3-dist.sh ├── cdk-solution-helper │ ├── README.md │ ├── index.js │ └── package.json ├── delete_all.sh └── post-deployment.sh └── source ├── app.py ├── app_resources ├── alb-iam-role.yaml ├── alb-values.yaml ├── argo-values.yaml ├── autoscaler-iam-role.yaml ├── autoscaler-values.yaml ├── etl-iam-role.yaml ├── etl-rbac.yaml ├── ex-secret-iam-role.yaml ├── ex-secret-values.yaml ├── jupyter-config.yaml ├── jupyter-values.yaml ├── native-spark-iam-role.yaml ├── native-spark-rbac.yaml ├── spark-operator-values.yaml └── spark-template.yaml ├── cdk.json ├── example ├── native-spark-job-scheduler.yaml ├── notebook │ ├── nyctaxi-job.ipynb │ └── scd2-job.ipynb ├── nyctaxi-job-scheduler.yaml └── scd2-job-scheduler.yaml ├── images ├── 00-deploy-to-aws.png ├── 3-argo-job-dependency.png ├── 3-argo-log.png ├── 4-auto-scaling.png ├── 4-k8s-retry.png ├── 4-spot-console.png ├── architecture.png ├── driver_interruption_test.gif ├── executor_interruption_test.png ├── fake_data.gif ├── run_jupyter.gif ├── sql-based-etl-spark-architecture-final.png ├── sql-based-etl-with-apache-spark-on-amazon-eks.preview.png └── submit_job_in_argo.gif ├── lib ├── cdk_infra │ ├── eks_base_app.py │ ├── eks_cluster.py │ ├── eks_service_account.py │ ├── iam_roles.py │ ├── network_sg.py │ ├── s3_app_code.py │ └── spark_permission.py ├── cloud_front_stack.py ├── ecr_build │ ├── Dockerfile │ ├── buildspec.yaml │ └── ecr_build_pipeline.py ├── solution_helper │ ├── lambda_function.py │ ├── requirements.txt │ └── solution_metrics.py ├── spark_on_eks_stack.py └── util │ ├── conditional_resources.py │ ├── get_aws_managed_prefix.py │ ├── manifest_reader.py │ └── override_rule.py ├── package.json ├── requirements.txt ├── run-all-tests.sh └── setup.py /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | labels: bug 5 | 6 | --- 7 | 8 | ### Describe the bug ### 9 | A clear and concise description of what the bug is. 10 | 11 | ### To Reproduce ### 12 | Steps to reproduce the behavior 13 | 14 | 1. Step One: 15 | 2. Step Two: 16 | 3. [...] 17 | 18 | ### Expected Result ### 19 | A clear and concise description of what you expected to happen. 20 | 21 | ### Actual Result ### 22 | A description of what is the result and/or error messages you got when you faced this issue. 23 | 24 | 25 | ### Other information: ### 26 | 1. Version of the Solution (e.g., v1.1.0): 27 | 28 | To get the version of the solution, you can look at the description of the created CloudFormation stack. For example, "_(SO0027) AWS Serverless Bot Framework v1.2.0 - This AWS CloudFormation template helps you provision the AWS Serverless Bot Framework stack without worrying about creating and configuring the underlying AWS infrastructure_". 
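If you prefer the command line, the stack description can also be retrieved with the AWS CLI, as in this minimal sketch (it assumes the default stack name `SparkOnEKS` and region `us-east-1` used elsewhere in this repository; substitute your own values):

```bash
# Print the CloudFormation stack description, which includes the solution ID and version
aws cloudformation describe-stacks \
  --stack-name SparkOnEKS \
  --region us-east-1 \
  --query "Stacks[0].Description" \
  --output text
```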
If the description does not contain the version information, you can look at the mappings section of the template: 29 | 30 | ```yaml 31 | Mappings: 32 | Solution: 33 | Data: 34 | ID: SO0027 35 | Version: 'v1.2.0' 36 | ``` 37 | 38 | 2. Region where CloudFormation template is deployed (e.g., us-east-1): 39 | 3. Did you make any change in the source code? If yes, what are the relevant changes (if publicly available)?: 40 | 4. Troubleshooting steps attempted: 41 | 5. Were there any errors in the Cloudwatch logs?: 42 | 6. Screenshots (please **DO NOT include sensitive information**): 43 | 7. Did you use the Sample Weather Service (please DO NOT include API KEY) ? Yes / No 44 | 45 | ### Stack Parameters ### 46 | Cloudformation Stack Parameters (please **DO NOT include sensitive information** like S3 bucket name, IP address, credentials, etc): 47 | 1. Bot Name: 48 | 2. Bot Language: 49 | 3. Bot Gender: 50 | 51 | 52 | ### Additional context ### 53 | Add any other context about the problem here. 54 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this solution 4 | labels: enhancement 5 | 6 | --- 7 | 8 | ### Is your feature request related to a problem? Please describe. ### 9 | A clear and concise description of what the problem is. E.g., I'm always frustrated when [...] 10 | 11 | ### Describe the feature you'd like ### 12 | A clear and concise description of what you want to happen. 13 | 14 | ### Additional context ### 15 | Add any other context or screenshots about the feature request here. 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General issue 3 | about: If your issue is not a bug or a feature request. Use this format. 4 | 5 | --- 6 | 7 | 8 | ### Describe the issue ### 9 | A clear and concise description of what the issue is. 10 | 11 | ### Additional context ### 12 | Add any other context or screenshots about the issue here. 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | ### Pull Request Description ### 3 | Describe the changes that have been made in this pull request. 4 | 5 | ### Link to Related Issue (if applicable) ### 6 | If this pull request fixes an open issue, provide a link to the issue here. 7 | 8 | ---- 9 | *Note: We will not be able to merge pull requests directly. 
However, if a pull request is accepted we will put the changes into the source code and publish the changes.* 10 | 11 | ---- 12 | 13 | *By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license* -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Git 2 | .git 3 | Config 4 | 5 | ### VisualStudioCode ### 6 | .vscode/* 7 | ### IntelliJ/ PyCharm ### 8 | .idea/* 9 | # System Files 10 | **/.DS_Store 11 | # CDK 12 | **/cdk.out 13 | **/cdk.context.json 14 | *.swp 15 | **/node_modules 16 | **/package-lock.json 17 | 18 | # compiled output 19 | **/global-s3-assets 20 | **/regional-s3-assets 21 | **/open-source 22 | 23 | ### Python ### 24 | # Byte-compiled / optimized / DLL files 25 | *__pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | # Python Distribution / packaging 29 | *.egg-info/ 30 | *.egg 31 | # Python Virtual Environments 32 | **/venv* 33 | **/.venv* 34 | **/.env 35 | ## Python Testing 36 | .pytest_cache 37 | **/.pytest_cache 38 | **/.coverage 39 | **/.coveragerc 40 | **/coverage-reports/ -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [2.0.1] - 2023-11-13 8 | ### Upgrade 9 | - Upgrade CDK v2.67.0 to v2.105.0 10 | - Add python version 3.10 & to 3.11 11 | - Upgrade EKS version to 1.27 12 | ### Change 13 | - To compatible with k8s 1.26+, disable userScheduler in Jupyterhub 14 | 15 | ## [2.0.0] - 2022-07-22 16 | ### Upgrade 17 | - Upgrade CDK v1 to v2 18 | - Upgrade python from 3.8 to 3.9 19 | 20 | ## [1.0.0] - 2021-07-29 21 | ### Added 22 | - All files, initial version 23 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 
13 | 14 | When filing an issue, please check [existing open](https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks/issues), or [recently closed](https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure all build processes execute successfully (see README.md for additional guidance). 35 | 4. Ensure all unit, integration, and/or snapshot tests pass, as applicable. 36 | 5. Commit to your fork using clear commit messages. 37 | 6. Send us a pull request, answering any default questions in the pull request interface. 38 | 7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 39 | 40 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 41 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 42 | 43 | 44 | ## Finding contributions to work on 45 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels ((enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks/labels/help%20wanted) issues is a great place to start. 46 | 47 | 48 | ## Code of Conduct 49 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 50 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 51 | opensource-codeofconduct@amazon.com with any additional questions or comments. 52 | 53 | 54 | ## Security issue notifications 55 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 56 | 57 | 58 | ## Licensing 59 | 60 | See the [LICENSE](https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 61 | 62 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 
63 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | sql-based-etl-with-apache-spark-on-amazon-eks 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 5 | software and associated documentation files (the "Software"), to deal in the Software 6 | without restriction, including without limitation the rights to use, copy, modify, 7 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 11 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 12 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 13 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 14 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 15 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
16 | 17 | ********************** 18 | THIRD PARTY COMPONENTS 19 | ********************** 20 | This software includes third party software subject to the following copyrights: 21 | 22 | Arc -- v3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 -- https://arc.tripl.ai/ -- MIT License 23 | Arc Jupyter - v3.14.2_scala_2.12_hadoop_3.2.0_1.1.0 -- https://github.com/tripl-ai/arc-jupyter -- MIT License 24 | argo-workflows -- v3.5.4 -- https://github.com/argoproj/argo-workflows -- Apache-2.0 25 | JupyterHub -- v1.5.0 -- https://hub.jupyter.org/helm-chart/ -- revised BSD license 26 | aws-cdk -- v2.105.0 -- https://github.com/aws/aws-cdk -- Apache-2.0 27 | k8s-cluster-autoscaler -- v1.27.2 -- https://github.com/kubernetes/autoscaler -- Apache-2.0 28 | # amazon-cloudwatch-container-insights -- latest version -- https://github.com/aws-samples/amazon-cloudwatch-container-insights -- MIT-0 29 | aws-load-balancer-controller -- v2.5.4 -- https://github.com/aws/eks-charts/ -- Apache-2.0 30 | kubernetes-external-secrets -- v8.5.5 -- https://github.com/external-secrets/kubernetes-external-secrets -- MIT License 31 | spark-on-k8s-operator -- v1beta2-1.2.3-3.1.1 -- https://github.com/GoogleCloudPlatform/spark-on-k8s-operator -- Apache-2.0 32 | -------------------------------------------------------------------------------- /deployment/app_code/ecr_build_src.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/deployment/app_code/ecr_build_src.zip -------------------------------------------------------------------------------- /deployment/app_code/job/delta_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf numRows=5 logger=true" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "ExecuteTime": { 16 | "end_time": "2020-03-18T22:38:05.895407Z", 17 | "start_time": "2020-03-18T22:37:48.160Z" 18 | } 19 | }, 20 | "source": [ 21 | "## 2. 
Ingest A New Incremental CSV File\n", 22 | "### Look at record 12, the `state` is changed in the file" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "{\n", 32 | " \"type\": \"DelimitedExtract\",\n", 33 | " \"name\": \"extract incremental data\",\n", 34 | " \"environments\": [\"dev\", \"test\"],\n", 35 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/update_contacts.csv\",\n", 36 | " \"outputView\": \"delta_raw\", \n", 37 | " \"delimiter\": \"Comma\",\n", 38 | " \"header\": false,\n", 39 | " \"authentication\": {\n", 40 | " \"method\": \"AmazonIAM\"\n", 41 | " }\n", 42 | "}" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## 2.2 Apply Data Type (reused schema file)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "{\n", 59 | " \"type\": \"TypingTransform\",\n", 60 | " \"name\": \"apply table schema 0 to incremental load\",\n", 61 | " \"environments\": [\"dev\", \"test\"],\n", 62 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 63 | " \"inputView\": \"delta_raw\", \n", 64 | " \"outputView\": \"delta_typed\",\n", 65 | " \"authentication\": {\n", 66 | " \"method\": \"AmazonIAM\"\n", 67 | " }\n", 68 | "}" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "ExecuteTime": { 75 | "end_time": "2020-06-07T15:02:50.155313Z", 76 | "start_time": "2020-06-07T15:02:50.125Z" 77 | } 78 | }, 79 | "source": [ 80 | "## 2.3 Data Quality Control (reused sql script)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "%sqlvaildate outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=delta_typed\n", 90 | "\n", 91 | "SELECT SUM(error) = 0 AS valid\n", 92 | " ,TO_JSON(\n", 93 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 94 | " ) AS message\n", 95 | "FROM \n", 96 | "(\n", 97 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 98 | " FROM ${inputView}\n", 99 | ") base" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "ExecuteTime": { 106 | "end_time": "2020-05-31T05:01:13.796275Z", 107 | "start_time": "2020-05-31T05:01:13.734Z" 108 | } 109 | }, 110 | "source": [ 111 | "## 2.4 Add Calculated Fields (reused sql script)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "%env \n", 121 | "ETL_CONF_CURRENT_TIMESTAMP=CURRENT_TIMESTAMP()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "%sql outputView=\"update_load\" name=\"add calc field for SCD\" environments=dev,test sqlParams=table_name=delta_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 131 | "\n", 132 | "SELECT id,name,email,state, CAST(${now} AS timestamp) AS valid_from, CAST(null AS timestamp) AS valid_to\n", 133 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 134 | "FROM ${table_name}" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## 2.5 Output Incremental data to Delta Lake\n", 142 | "### Delta Lake is an optimized data 
lake to support Time Travel, ACID transaction" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "{\n", 152 | " \"type\": \"DeltaLakeLoad\",\n", 153 | " \"name\": \"Initial load to Data Lake\",\n", 154 | " \"environments\": [\"dev\", \"test\"],\n", 155 | " \"inputView\": \"update_load\",\n", 156 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/delta_load/\",\n", 157 | " \"numPartitions\": 2\n", 158 | " \"saveMode\": \"Overwrite\",\n", 159 | " \"authentication\": {\n", 160 | " \"method\": \"AmazonIAM\"\n", 161 | " }\n", 162 | "}" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Arc", 176 | "language": "javascript", 177 | "name": "arc" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": "javascript", 181 | "file_extension": ".json", 182 | "mimetype": "javascript", 183 | "name": "arc", 184 | "nbconvert_exporter": "arcexport", 185 | "version": "3.8.0" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 4 190 | } -------------------------------------------------------------------------------- /deployment/app_code/job/initial_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf numRows=5 logger=true" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# 1. Initial Table Load" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "{\n", 26 | " \"type\": \"DelimitedExtract\",\n", 27 | " \"name\": \"extract initial table\",\n", 28 | " \"environments\": [\"dev\", \"test\"],\n", 29 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/initial_contacts.csv\",\n", 30 | " \"outputView\": \"initial_raw\", \n", 31 | " \"delimiter\": \"Comma\",\n", 32 | " \"header\": false,\n", 33 | " \"quote\": \"None\",\n", 34 | " \"authentication\": {\n", 35 | " \"method\": \"AmazonIAM\"\n", 36 | " }\n", 37 | "}" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Check Original Data Schema" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "%printschema \n", 54 | "initial_raw" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "ExecuteTime": { 61 | "start_time": "2020-03-03T08:30:30.028Z" 62 | } 63 | }, 64 | "source": [ 65 | "## 1.2 Apply Data Type" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "{\n", 75 | " \"type\": \"TypingTransform\",\n", 76 | " \"name\": \"apply table schema 0\",\n", 77 | " \"environments\": [\"dev\", \"test\"],\n", 78 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 79 | " \"inputView\": \"initial_raw\", \n", 80 | " \"outputView\": \"initial_typed\",\n", 81 | " \"authentication\": {\n", 82 | " \"method\": \"AmazonIAM\"\n", 83 | " }\n", 84 | "}" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Check Typed Data 
Schema & Stats" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "%printschema \n", 101 | "initial_typed" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## 1.3 Data Quality Control" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "%sqlvaildate outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=initial_typed\n", 118 | "\n", 119 | "SELECT SUM(error) = 0 AS valid\n", 120 | " ,TO_JSON(\n", 121 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 122 | " ) AS message\n", 123 | "FROM \n", 124 | "(\n", 125 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 126 | " FROM ${inputView}\n", 127 | ") base" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## 1.4 Add Calculated Fields for SCD Type 2\n", 135 | "### CURRENT_TIMESTAMP will be passed in automatically, when the ETL job is triggered" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "%env \n", 145 | "ETL_CONF_CURRENT_TIMESTAMP=current_timestamp()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "%sql outputView=\"initial_load\" name=\"add calc field for SCD\" environments=dev,test sqlParams=table_name=initial_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 155 | "\n", 156 | "SELECT id,name,email,state, CAST(${now} AS timestamp) AS valid_from, CAST(null AS timestamp) AS valid_to\n", 157 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 158 | "FROM ${table_name}" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## 1.5 Load to Delta Lake as the initial daily snaptshot table\n", 166 | "### Delta Lake is an optimized data lake to support Time Travel, ACID transaction" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "{\n", 176 | " \"type\": \"DeltaLakeLoad\",\n", 177 | " \"name\": \"Initial load to Data Lake\",\n", 178 | " \"environments\": [\"dev\", \"test\"],\n", 179 | " \"inputView\": \"initial_load\",\n", 180 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/\",\n", 181 | " \"numPartitions\": 2\n", 182 | " \"saveMode\": \"Overwrite\",\n", 183 | " \"authentication\": {\n", 184 | " \"method\": \"AmazonIAM\"\n", 185 | " }\n", 186 | "}" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Arc", 200 | "language": "javascript", 201 | "name": "arc" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": "javascript", 205 | "file_extension": ".json", 206 | "mimetype": "javascript", 207 | "name": "arc", 208 | "nbconvert_exporter": "arcexport", 209 | "version": "3.8.0" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 4 214 | } 215 | -------------------------------------------------------------------------------- 
/deployment/app_code/job/scd2_merge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 3. Read initial & incremental tables from Delta Lake" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "{\n", 17 | " \"type\": \"DeltaLakeExtract\",\n", 18 | " \"name\": \"read initial load table\",\n", 19 | " \"description\": \"read initial load table\",\n", 20 | " \"environments\": [\n", 21 | " \"dev\",\n", 22 | " \"test\"\n", 23 | " ],\n", 24 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/\",\n", 25 | " \"outputView\": \"current_snapshot\"\n", 26 | "}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "{\n", 36 | " \"type\": \"DeltaLakeExtract\",\n", 37 | " \"name\": \"read contact Delta Lake table\",\n", 38 | " \"description\": \"read contact table\",\n", 39 | " \"environments\": [\n", 40 | " \"dev\",\n", 41 | " \"test\"\n", 42 | " ],\n", 43 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/delta_load/\",\n", 44 | " \"outputView\": \"delta_data\"\n", 45 | "}" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "ExecuteTime": { 52 | "end_time": "2020-05-31T05:03:33.741024Z", 53 | "start_time": "2020-05-31T05:03:33.247Z" 54 | } 55 | }, 56 | "source": [ 57 | "## 3.2 Prepare Datasets for SCD Type2 Insert\n", 58 | "\n", 59 | "- Generate extra rows for changed records.\n", 60 | "- The 'null' merge_key means it will be inserted, not update existing records according to the rule in SCD type2" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "%sql outputView=\"staged_update\" name=\"generate extra rows for SCD\" environments=dev,test\n", 70 | "\n", 71 | "SELECT NULL AS mergeKey, new.*\n", 72 | "FROM current_snapshot old\n", 73 | "INNER JOIN delta_data new\n", 74 | "ON old.id = new.id\n", 75 | "WHERE old.iscurrent=1\n", 76 | "AND old.checksum<>new.checksum\n", 77 | "\n", 78 | "UNION\n", 79 | "\n", 80 | "SELECT id AS mergeKey, *\n", 81 | "FROM delta_data" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## 3.3 Implement the Type 2 SCD merge operation" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "%conf logger=true" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "%sql outputView=\"target_merged\" name=\"merge into existing contacts table\" environments=dev,test\n", 107 | "\n", 108 | "MERGE INTO current_snapshot tgt\n", 109 | "USING staged_update src\n", 110 | "ON tgt.id = src.mergeKey\n", 111 | "WHEN MATCHED AND src.checksum != tgt.checksum AND tgt.iscurrent = 1 THEN \n", 112 | " UPDATE SET \n", 113 | " valid_to = src.valid_from, \n", 114 | " iscurrent = 0\n", 115 | "WHEN NOT MATCHED THEN \n", 116 | " INSERT *" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## 3.4 Create a Delta Lake table in Athena\n", 124 | "### Build up a Glue Data Catalog via Athena. This step can be done by Glue Crawler. 
However, it makes sense if we refresh partitions, create/update data catalog at the end of each ETL process, which is provides the data lineage contro at a single place." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "{\n", 134 | " \"type\": \"JDBCExecute\",\n", 135 | " \"name\": \"Create glue data catalog\",\n", 136 | " \"environments\": [\n", 137 | " \"dev\",\n", 138 | " \"test\"\n", 139 | " ],\n", 140 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/sql/create_table_contact.sql\",\n", 141 | " \"jdbcURL\": \"jdbc:awsathena://AwsRegion=\"${AWS_DEFAULT_REGION}\";S3OutputLocation=s3://\"${ETL_CONF_DATALAKE_LOC}\"/athena-query-result;AwsCredentialsProviderClass=com.amazonaws.auth.WebIdentityTokenCredentialsProvider\",\n", 142 | " \"sqlParams\":{\n", 143 | " \"datalake_loc\": \"'s3://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot\\/'\",\n", 144 | " \"table_name\": \"default.contact_snapshot\"\n", 145 | " }\n", 146 | "}" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "# 4. Query Delta Lake (validation steps)\n", 154 | "### to stop executing the followings in a productionized ETL job, use a fake environment `uat`\n", 155 | "### the same queries can be run in Athena" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "{\n", 165 | " \"type\": \"DeltaLakeExtract\",\n", 166 | " \"name\": \"read contact Delta Lake table\",\n", 167 | " \"description\": \"read contact table\",\n", 168 | " \"environments\": [\n", 169 | " \"uat\"\n", 170 | " ],\n", 171 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot\",\n", 172 | " \"outputView\": \"contact_snapshot\"\n", 173 | "}" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Confirm 92 records are expired" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "%sql outputView=\"expired_count\" name=\"expired_count\" environments=uat\n", 190 | "SELECT count(*) FROM contact_snapshot WHERE valid_to is not null" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "%metadata \n", 200 | "contact_snapshot" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | " ## Confirm we now have 1192 records" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "%sql outputView=\"total_count\" name=\"total_count\" environments=uat\n", 217 | "SELECT count(*) FROM contact_snapshot" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "## View one of the changed records" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "%sql outputView=\"validate_type2\" name=\"validate_type2\" environments=uat\n", 234 | "SELECT * FROM contact_snapshot WHERE id=12" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [] 243 | } 244 | ], 245 | "metadata": { 246 
| "kernelspec": { 247 | "display_name": "Arc", 248 | "language": "javascript", 249 | "name": "arc" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": "javascript", 253 | "file_extension": ".json", 254 | "mimetype": "javascript", 255 | "name": "arc", 256 | "nbconvert_exporter": "arcexport", 257 | "version": "3.8.0" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 4 262 | } 263 | -------------------------------------------------------------------------------- /deployment/app_code/job/wordcount.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | spark = SparkSession.builder.appName('NYC taxi vendor count').getOrCreate() 4 | df = spark.read.option("header",True).csv(sys.argv[1]) 5 | df.filter(df["vendor_name"].isNotNull()).select("vendor_name").groupBy("vendor_name").count().write.mode("overwrite").parquet(sys.argv[2]) 6 | exit() -------------------------------------------------------------------------------- /deployment/app_code/meta/contact_meta_0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "id", 4 | "description": "contact id", 5 | "trim": true, 6 | "nullable": false, 7 | "primaryKey": true, 8 | "type": "integer" 9 | }, 10 | { 11 | "name": "name", 12 | "description": "contact name", 13 | "trim": true, 14 | "nullable": true, 15 | "primaryKey": false, 16 | "type": "string", 17 | "nullableValues": [ 18 | "", 19 | "null" 20 | ] 21 | }, 22 | { 23 | "name": "email", 24 | "description": "contact email", 25 | "trim": true, 26 | "nullable": true, 27 | "primaryKey": false, 28 | "type": "string", 29 | "nullableValues": [ 30 | "", 31 | "null" 32 | ] 33 | }, 34 | { 35 | "name": "state", 36 | "description": "state in the country of the contact", 37 | "trim": true, 38 | "nullable": true, 39 | "primaryKey": false, 40 | "type": "string", 41 | "nullableValues": [ 42 | "", 43 | "null" 44 | ] 45 | } 46 | ] -------------------------------------------------------------------------------- /deployment/app_code/sql/add_calc_field_for_scd2.sql: -------------------------------------------------------------------------------- 1 | SELECT id 2 | , name 3 | , email 4 | , state 5 | , ${CURRENT_TIMESTAMP} AS valid_from 6 | , CAST(null AS timestamp) AS valid_to 7 | , 1 AS iscurrent 8 | , md5(concat(name,email,state)) AS checksum 9 | FROM ${table_name} -------------------------------------------------------------------------------- /deployment/app_code/sql/create_table_contact.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE IF NOT EXISTS ${table_name} 2 | LOCATION ${datalake_loc} 3 | TBLPROPERTIES ( 4 | 'table_type' = 'DELTA' 5 | ) -------------------------------------------------------------------------------- /deployment/app_code/sql/sqlvalidate_errors.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | SUM(error) = 0 AS valid 3 | ,TO_JSON( 4 | NAMED_STRUCT( 5 | 'count', COUNT(error), 6 | 'errors', SUM(error) 7 | ) 8 | ) AS message 9 | FROM ( 10 | SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error 11 | FROM ${inputView} 12 | ) base -------------------------------------------------------------------------------- /deployment/build-s3-dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script packages your project into a solution distributable that can be 4 
| # used as an input to the solution builder validation pipeline. 5 | # 6 | # Important notes and prereq's: 7 | # 1. The initialize-repo.sh script must have been run in order for this script to 8 | # function properly. 9 | # 2. This script should be run from the repo's root folder. 10 | # 11 | # This script will perform the following tasks: 12 | # 1. Remove any old dist files from previous runs. 13 | # 2. Install dependencies for the cdk-solution-helper; responsible for 14 | # converting standard 'cdk synth' output into solution assets. 15 | # 3. Build and synthesize your CDK project. 16 | # 4. Run the cdk-solution-helper on template outputs and organize 17 | # those outputs into the /global-s3-assets folder. 18 | # 5. Organize source code artifacts into the /regional-s3-assets folder. 19 | # 6. Remove any temporary files used for staging. 20 | # 21 | # Parameters: 22 | # - source-bucket-base-name: Name for the S3 bucket location where the template will source the Lambda 23 | # code from. The template will append '-[region_name]' to this bucket name. 24 | # For example: ./build-s3-dist.sh solutions v1.0.0 25 | # The template will then expect the source code to be located in the solutions-[region_name] bucket 26 | # - solution-name: name of the solution for consistency 27 | # - version-code: version of the package 28 | 29 | # Important: CDK global version number 30 | cdk_version===2.105.0 31 | 32 | # Check to see if the required parameters have been provided: 33 | if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then 34 | echo "Please provide the base source bucket name, trademark approved solution name and version where the lambda code will eventually reside." 35 | echo "For example: ./build-s3-dist.sh solutions trademarked-solution-name v1.0.0 template-bucket-name" 36 | exit 1 37 | fi 38 | 39 | # Get reference for all important folders 40 | template_dir="$PWD" 41 | staging_dist_dir="$template_dir/staging" 42 | app_code_dir="$template_dir/deployment/app_code" 43 | template_dist_dir="$template_dir/deployment/global-s3-assets" 44 | build_dist_dir="$template_dir/deployment/regional-s3-assets" 45 | source_dir="$template_dir/source" 46 | 47 | echo "------------------------------------------------------------------------------" 48 | echo "[Init] Remove any old dist files from previous runs" 49 | echo "------------------------------------------------------------------------------" 50 | 51 | echo "rm -rf $template_dist_dir" 52 | rm -rf $template_dist_dir 53 | echo "mkdir -p $template_dist_dir" 54 | mkdir -p $template_dist_dir 55 | echo "rm -rf $build_dist_dir" 56 | rm -rf $build_dist_dir 57 | echo "mkdir -p $build_dist_dir" 58 | mkdir -p $build_dist_dir 59 | echo "rm -rf $staging_dist_dir" 60 | rm -rf $staging_dist_dir 61 | echo "mkdir -p $staging_dist_dir" 62 | mkdir -p $staging_dist_dir 63 | 64 | echo "------------------------------------------------------------------------------" 65 | echo "[Init] Install dependencies for the cdk-solution-helper" 66 | echo "------------------------------------------------------------------------------" 67 | 68 | echo "cd $template_dir/deployment/cdk-solution-helper" 69 | cd $template_dir/deployment/cdk-solution-helper 70 | echo "npm install" 71 | # npm i --package-lock-only 72 | # npm audit fix 73 | npm install 74 | 75 | cd $template_dir 76 | echo "pip3 install -q $source_dir" 77 | python3 -m venv .env 78 | source .env/bin/activate 79 | pip3 install --upgrade pip -q $source_dir 80 | 81 | echo 
"------------------------------------------------------------------------------" 82 | echo "[Packing] solution_helper lambda function" 83 | echo "------------------------------------------------------------------------------" 84 | 85 | # echo "cd $source_dir/lib/solution_helper" 86 | # cd $source_dir/lib/solution_helper 87 | # echo "pip install -r requirements.txt --target ../package" 88 | # pip install -r requirements.txt --target ../package 89 | # cd $source_dir/lib/package || exit 1 90 | # echo "zip -q -r9 $app_code_dir/solution_helper.zip ." 91 | # zip -q -r9 $app_code_dir/solution_helper.zip . 92 | # echo "cd $source_dir/lib/solution_helper" || exit 1 93 | # cd $source_dir/lib/solution_helper 94 | # echo "zip -g -r $app_code_dir/solution_helper.zip lambda_function.py" 95 | # zip -g -r $app_code_dir/solution_helper.zip lambda_function.py 96 | # echo "rm -rf $source_dir/lib/package" 97 | # rm -rf $source_dir/lib/package 98 | 99 | echo "------------------------------------------------------------------------------" 100 | echo "[Packing] ecr image build" 101 | echo "------------------------------------------------------------------------------" 102 | 103 | echo "cd $source_dir/lib/ecr_build" 104 | cd $source_dir/lib/ecr_build 105 | echo "zip -q -r9 $app_code_dir/ecr_build_src.zip ." 106 | zip -q -r9 $app_code_dir/ecr_build_src.zip . 107 | cd $source_dir 108 | 109 | echo "------------------------------------------------------------------------------" 110 | echo "[Synth] CDK Project" 111 | echo "------------------------------------------------------------------------------" 112 | 113 | # # Install the global aws-cdk package 114 | echo "npm install -g aws-cdk@$cdk_version" 115 | # npm i --package-lock-only 116 | # npm audit fix 117 | npm install aws-cdk@$cdk_version 118 | 119 | # Run 'cdk synth' to generate raw solution outputs 120 | echo "cdk synth --output=$staging_dist_dir" 121 | node_modules/aws-cdk/bin/cdk synth --output=$staging_dist_dir 122 | 123 | # Remove unnecessary output files 124 | echo "cd $staging_dist_dir" 125 | cd $staging_dist_dir 126 | echo "rm tree.json manifest.json cdk.out" 127 | rm tree.json manifest.json cdk.out 128 | 129 | echo "------------------------------------------------------------------------------" 130 | echo "[Packing] Template artifacts" 131 | echo "------------------------------------------------------------------------------" 132 | 133 | # Move outputs from staging to template_dist_dir 134 | echo "Move outputs from staging to template_dist_dir" 135 | mv $staging_dist_dir/*.json $template_dist_dir/ 136 | 137 | # Rename all *.template.json files to *.template 138 | echo "Rename all *.template.json to *.template" 139 | echo "copy templates and rename" 140 | for f in $template_dist_dir/*.template.json; do 141 | mv -- "$f" "${f%.template.json}.template" 142 | done 143 | 144 | # Run the helper to clean-up the templates and remove unnecessary CDK elements 145 | echo "Run the helper to clean-up the templates and remove unnecessary CDK elements" 146 | echo "node $template_dir/deployment/cdk-solution-helper/index" 147 | node $template_dir/deployment/cdk-solution-helper/index 148 | if [ "$?" = "1" ]; then 149 | echo "(cdk-solution-helper) ERROR: there is likely output above." 
1>&2 150 | exit 1 151 | fi 152 | 153 | # Find and replace bucket_name, solution_name, and version 154 | echo "Find and replace bucket_name, solution_name, and version" 155 | cd $template_dist_dir 156 | echo "Updating code source bucket in template with $1" 157 | replace="s/%%BUCKET_NAME%%/$1/g" 158 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 159 | sed -i '' -e $replace $template_dist_dir/*.template 160 | replace="s/%%SOLUTION_NAME%%/$2/g" 161 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 162 | sed -i '' -e $replace $template_dist_dir/*.template 163 | replace="s/%%VERSION%%/$3/g" 164 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 165 | sed -i '' -e $replace $template_dist_dir/*.template 166 | 167 | # Put Global and Regional code files in a single bucket if the 4th parameter doesn't exist 168 | if [ -z "$4" ]; then 169 | replace="s/%%TEMPLATE_OUTPUT_BUCKET%%/$1"-"${AWS_REGION}/g" 170 | else 171 | replace="s/%%TEMPLATE_OUTPUT_BUCKET%%/$4/g" 172 | fi 173 | 174 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 175 | sed -i '' -e $replace $template_dist_dir/*.template 176 | 177 | rm $template_dist_dir/*.json 178 | 179 | echo "------------------------------------------------------------------------------" 180 | echo "[Packing] Source code artifacts" 181 | echo "------------------------------------------------------------------------------" 182 | 183 | # General cleanup of node_modules and package-lock.json files 184 | echo "find $staging_dist_dir -iname "node_modules" -type d -exec rm -rf "{}" \; 2> /dev/null" 185 | find $staging_dist_dir -iname "node_modules" -type d -exec rm -rf "{}" \; 2> /dev/null 186 | echo "find $staging_dist_dir -iname "package-lock.json" -type f -exec rm -f "{}" \; 2> /dev/null" 187 | find $staging_dist_dir -iname "package-lock.json" -type f -exec rm -f "{}" \; 2> /dev/null 188 | 189 | # ... For each asset.* source code artifact in the temporary /staging folder... 190 | cd $staging_dist_dir 191 | for d in `find . -mindepth 1 -maxdepth 1 -type d`; do 192 | 193 | # Rename the artifact, removing the period for handler compatibility 194 | pfname="$(basename -- $d)" 195 | fname="$(echo $pfname | sed -e 's/\.//g')" 196 | echo "zip -r $fname.zip $fname" 197 | mv $d $fname 198 | cd $staging_dist_dir/$fname 199 | 200 | # Build the artifcats 201 | if ls *.py 1> /dev/null 2>&1; then 202 | echo "====================================" 203 | echo "This is Python runtime" 204 | echo "====================================" 205 | venv_folder=".venv-prod" 206 | rm -fr .venv-test 207 | rm -fr .venv-prod 208 | echo "Initiating virtual environment" 209 | python3 -m venv $venv_folder 210 | source $venv_folder/bin/activate 211 | pip3 install --upgrade pip -q $source_dir --target $venv_folder/lib/python3.*/site-packages 212 | echo "package python artifact" 213 | cd $venv_folder/lib/python3.*/site-packages 214 | zip -qr9 $staging_dist_dir/$fname.zip . -x "aws_cdk/*" 215 | echo "zip -r $staging_dist_dir/$fname" 216 | cd $staging_dist_dir/$fname 217 | rm -rf $venv_folder 218 | zip -grq $staging_dist_dir/$fname.zip . 219 | 220 | elif ls *.js 1> /dev/null 2>&1; then 221 | echo "====================================" 222 | echo "This is Node runtime" 223 | echo "====================================" 224 | echo "Clean and rebuild artifacts" 225 | echo "copy package.json and package-lock.json files" 226 | # npm audit fix --force 227 | cp -rf $template_dir/deployment/cdk-solution-helper/*.json . 228 | npm run 229 | npm ci 230 | if [ "$?" 
= "1" ]; then 231 | echo "ERROR: Seems like package-lock.json does not exists or is out of sync with package.josn. Trying npm install instead" 1>&2 232 | npm install --package-lock 233 | fi 234 | # Zip the artifact 235 | echo "zip -r $staging_dist_dir/$fname" 236 | zip -qr9 $staging_dist_dir/$fname.zip . 237 | else 238 | # Zip the artifact 239 | echo "zip -r $staging_dist_dir/$fname" 240 | zip -rq $staging_dist_dir/$fname.zip . 241 | fi 242 | 243 | cd $staging_dist_dir 244 | # Copy the zipped artifact from /staging to /regional-s3-assets 245 | echo "cp $fname.zip $build_dist_dir" 246 | mv $fname.zip $build_dist_dir 247 | 248 | # Remove the old, unzipped artifact from /staging 249 | echo "rm -rf $fname" 250 | rm -rf $fname 251 | 252 | # ... repeat until all source code artifacts are zipped and placed in the 253 | # ... /regional-s3-assets folder 254 | 255 | done 256 | 257 | echo "------------------------------------------------------------------------------" 258 | echo "[Move] the zip files from staging to regional-s3-assets folder" 259 | echo "------------------------------------------------------------------------------" 260 | for d in `find . -mindepth 1 -maxdepth 1`; do 261 | pfname="$(basename -- $d)" 262 | fname="$(echo $pfname | sed -e 's/asset./asset/g')" 263 | mv $d $build_dist_dir/$fname 264 | done 265 | 266 | echo "------------------------------------------------------------------------------" 267 | echo "[Cleanup] Remove temporary files" 268 | echo "------------------------------------------------------------------------------" 269 | 270 | # Delete the temporary /staging folder 271 | echo "rm -rf $staging_dist_dir" 272 | rm -rf $staging_dist_dir 273 | 274 | -------------------------------------------------------------------------------- /deployment/cdk-solution-helper/README.md: -------------------------------------------------------------------------------- 1 | # cdk-solution-helper 2 | 3 | A lightweight helper function that cleans-up synthesized templates from the AWS Cloud Development Kit (CDK) and prepares 4 | them for use with the AWS Solutions publishing pipeline. This function performs the following tasks: 5 | 6 | #### Lambda function preparation 7 | 8 | Replaces the AssetParameter-style properties that identify source code for Lambda functions with the common variables 9 | used by the AWS Solutions publishing pipeline. 10 | 11 | - `Code.S3Bucket` is assigned the `%%BUCKET_NAME%%` placeholder value. 12 | - `Code.S3Key` is assigned the `%%SOLUTION_NAME%%`/`%%VERSION%%` placeholder value. 13 | - `Handler` is given a prefix identical to the artifact hash, enabling the Lambda function to properly find the handler in the extracted source code package. 14 | 15 | These placeholders are then replaced with the appropriate values using the default find/replace operation run by the pipeline. 
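For context, this find/replace is performed with `sed` in this repository's `deployment/build-s3-dist.sh`, roughly as shown in the trimmed sketch below (`$1`, `$2` and `$3` are the bucket name, solution name and version arguments passed to that script, and `$template_dist_dir` points at `deployment/global-s3-assets`):

```
# Substitute the placeholders written by cdk-solution-helper into the synthesized templates
replace="s/%%BUCKET_NAME%%/$1/g"
sed -i '' -e $replace $template_dist_dir/*.template
replace="s/%%SOLUTION_NAME%%/$2/g"
sed -i '' -e $replace $template_dist_dir/*.template
replace="s/%%VERSION%%/$3/g"
sed -i '' -e $replace $template_dist_dir/*.template
```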
16 | 17 | Before: 18 | ``` 19 | "examplefunction67F55935": { 20 | "Type": "AWS::Lambda::Function", 21 | "Properties": { 22 | "Code": { 23 | "S3Bucket": { 24 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3Bucket54E71A95" 25 | }, 26 | "S3Key": { 27 | "Fn::Join": [ 28 | "", 29 | [ 30 | { 31 | "Fn::Select": [ 32 | 0, 33 | { 34 | "Fn::Split": [ 35 | "||", 36 | { 37 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1" 38 | } 39 | ] 40 | } 41 | ] 42 | }, 43 | { 44 | "Fn::Select": [ 45 | 1, 46 | { 47 | "Fn::Split": [ 48 | "||", 49 | { 50 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1" 51 | } 52 | ] 53 | } 54 | ] 55 | } 56 | ] 57 | ] 58 | } 59 | }, ... 60 | Handler: "index.handler", ... 61 | ``` 62 | 63 | After helper function run: 64 | ``` 65 | "examplefunction67F55935": { 66 | "Type": "AWS::Lambda::Function", 67 | "Properties": { 68 | "Code": { 69 | "S3Bucket": "%%BUCKET_NAME%%", 70 | "S3Key": "%%SOLUTION_NAME%%/%%VERSION%%/assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 71 | }, ... 72 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 73 | ``` 74 | 75 | After build script run: 76 | ``` 77 | "examplefunction67F55935": { 78 | "Type": "AWS::Lambda::Function", 79 | "Properties": { 80 | "Code": { 81 | "S3Bucket": "solutions", 82 | "S3Key": "trademarked-solution-name/v1.0.0/asset.d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 83 | }, ... 84 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 85 | ``` 86 | 87 | After CloudFormation deployment: 88 | ``` 89 | "examplefunction67F55935": { 90 | "Type": "AWS::Lambda::Function", 91 | "Properties": { 92 | "Code": { 93 | "S3Bucket": "solutions-us-east-1", 94 | "S3Key": "trademarked-solution-name/v1.0.0/asset.d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 95 | }, ... 96 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 97 | ``` 98 | 99 | #### Template cleanup 100 | 101 | Cleans-up the parameters section and improves readability by removing the AssetParameter-style fields that would have 102 | been used to specify Lambda source code properties. This allows solution-specific parameters to be highlighted and 103 | removes unnecessary clutter. 104 | 105 | Before: 106 | ``` 107 | "Parameters": { 108 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3Bucket54E71A95": { 109 | "Type": "String", 110 | "Description": "S3 bucket for asset \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 111 | }, 112 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1": { 113 | "Type": "String", 114 | "Description": "S3 key for asset version \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 115 | }, 116 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7ArtifactHash7AA751FE": { 117 | "Type": "String", 118 | "Description": "Artifact hash for asset \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 119 | }, 120 | "CorsEnabled" : { 121 | "Description" : "Would you like to enable Cross-Origin Resource Sharing (CORS) for the image handler API? 
Select 'Yes' if so.", 122 | "Default" : "No", 123 | "Type" : "String", 124 | "AllowedValues" : [ "Yes", "No" ] 125 | }, 126 | "CorsOrigin" : { 127 | "Description" : "If you selected 'Yes' above, please specify an origin value here. A wildcard (*) value will support any origin.", 128 | "Default" : "*", 129 | "Type" : "String" 130 | } 131 | } 132 | ``` 133 | 134 | After: 135 | ``` 136 | "Parameters": { 137 | "CorsEnabled" : { 138 | "Description" : "Would you like to enable Cross-Origin Resource Sharing (CORS) for the image handler API? Select 'Yes' if so.", 139 | "Default" : "No", 140 | "Type" : "String", 141 | "AllowedValues" : [ "Yes", "No" ] 142 | }, 143 | "CorsOrigin" : { 144 | "Description" : "If you selected 'Yes' above, please specify an origin value here. A wildcard (*) value will support any origin.", 145 | "Default" : "*", 146 | "Type" : "String" 147 | } 148 | } 149 | ``` 150 | 151 | *** 152 | © Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. -------------------------------------------------------------------------------- /deployment/cdk-solution-helper/index.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | // Imports 3 | const fs = require('fs'); 4 | 5 | // Paths 6 | var currentPath = process.cwd(); 7 | const global_s3_assets = currentPath+'/../deployment/global-s3-assets'; 8 | const solution_name='sql-based-etl-with-apache-spark-on-amazon-eks'; 9 | 10 | function setParameter(template) { 11 | const parameters = (template.Parameters) ? template.Parameters : {}; 12 | const assetParameters = Object.keys(parameters).filter(function(key) { 13 | return key.includes('BootstrapVersion'); 14 | }); 15 | assetParameters.forEach(function(a) { 16 | template.Parameters[a] = undefined; 17 | }); 18 | const rules = (template.Rules) ? template.Rules : {}; 19 | const rule = Object.keys(rules).filter(function(key) { 20 | return key.includes('CheckBootstrapVersion'); 21 | }); 22 | rule.forEach(function(a) { 23 | template.Rules[a] = undefined; 24 | }) 25 | } 26 | function assetRef(s3BucketRef) { 27 | // Get S3 bucket key references from assets file 28 | const raw_meta = fs.readFileSync(`${global_s3_assets}/${solution_name}.assets.json`); 29 | let template = JSON.parse(raw_meta); 30 | const metadata = (template.files[s3BucketRef]) ? template.files[s3BucketRef] : {}; 31 | var assetPath = metadata.source.path.replace('.json',''); 32 | return assetPath; 33 | } 34 | 35 | // For each template in global_s3_assets ... 36 | fs.readdirSync(global_s3_assets).forEach(file => { 37 | if ( file != `${solution_name}.assets.json`) { 38 | // Import and parse template file 39 | const raw_template = fs.readFileSync(`${global_s3_assets}/${file}`); 40 | let template = JSON.parse(raw_template); 41 | 42 | //1. Clean-up parameters section 43 | setParameter(template); 44 | 45 | const resources = (template.Resources) ? template.Resources : {}; 46 | //3. 
Clean-up Account ID and region to enable cross account deployment 47 | const rsrctype=[ 48 | "AWS::Lambda::Function", 49 | "AWS::Lambda::LayerVersion", 50 | "Custom::CDKBucketDeployment", 51 | "AWS::CloudFormation::Stack", 52 | "AWS::CloudFront::Distribution" 53 | ] 54 | const focusTemplate = Object.keys(resources).filter(function(key) { 55 | return (resources[key].Type.indexOf(rsrctype) < 0) 56 | }); 57 | focusTemplate.forEach(function(f) { 58 | const fn = template.Resources[f]; 59 | if (fn.Properties.hasOwnProperty('Code') && fn.Properties.Code.hasOwnProperty('S3Bucket')) { 60 | // Set Lambda::Function S3 reference to regional folder 61 | if (! String(fn.Properties.Code.S3Bucket.Ref).startsWith('appcode')){ 62 | fn.Properties.Code.S3Key = `%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.Code.S3Key; 63 | fn.Properties.Code.S3Bucket = {'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}; 64 | } 65 | } 66 | else if (fn.Properties.hasOwnProperty('Content') && fn.Properties.Content.hasOwnProperty('S3Bucket')) { 67 | // Set Lambda::LayerVersion S3 bucket reference 68 | fn.Properties.Content.S3Key = `%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.Content.S3Key; 69 | fn.Properties.Content.S3Bucket = {'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}; 70 | } 71 | else if (fn.Properties.hasOwnProperty('SourceBucketNames')) { 72 | // Set CDKBucketDeployment S3 bucket reference 73 | fn.Properties.SourceObjectKeys = [`%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.SourceObjectKeys[0]]; 74 | fn.Properties.SourceBucketNames = [{'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}]; 75 | } 76 | else if (fn.Properties.hasOwnProperty('PolicyName') && fn.Properties.PolicyName.includes('CustomCDKBucketDeployment')) { 77 | // Set CDKBucketDeployment S3 bucket Policy reference 78 | fn.Properties.PolicyDocument.Statement.forEach(function(sub,i) { 79 | if (typeof(sub.Resource[i]) === 'object') { 80 | sub.Resource.forEach(function(resource){ 81 | var arrayKey = Object.keys(resource); 82 | if (typeof(resource[arrayKey][1]) === 'object') { 83 | resource[arrayKey][1].filter(function(s){ 84 | if (s.hasOwnProperty('Ref')) { 85 | fn.Properties.PolicyDocument.Statement[i].Resource = [ 86 | {"Fn::Join": ["",["arn:",{"Ref": "AWS::Partition"},":s3:::%%BUCKET_NAME%%-",{"Ref": "AWS::Region"}]]}, 87 | {"Fn::Join": ["",["arn:",{"Ref": "AWS::Partition"},":s3:::%%BUCKET_NAME%%-",{"Ref": "AWS::Region"},"/*"]]}]}})}})}}); 88 | } 89 | // Set NestedStack S3 bucket reference 90 | else if (fn.Properties.hasOwnProperty('TemplateURL')) { 91 | var key=fn.Properties.TemplateURL['Fn::Join'][1][6].replace('.json','').replace('/',''); 92 | var assetPath = assetRef(key); 93 | fn.Properties.TemplateURL = { 94 | "Fn::Join": ["", 95 | [ 96 | "https://s3.", 97 | { 98 | "Ref": "AWS::URLSuffix" 99 | }, 100 | "/", 101 | `%%TEMPLATE_OUTPUT_BUCKET%%/%%SOLUTION_NAME%%/%%VERSION%%/${assetPath}` 102 | ]] 103 | }; 104 | } 105 | // Set CloudFront logging bucket 106 | else if (fn.Properties.hasOwnProperty('DistributionConfig')){ 107 | fn.Properties.DistributionConfig.Logging.Bucket= { 108 | "Fn::Join": ["",[fn.Properties.DistributionConfig.Logging.Bucket['Fn::Join'][1][0], 109 | ".s3.",{"Ref": "AWS::Region"},".",{"Ref": "AWS::URLSuffix"}]] 110 | } 111 | } 112 | }); 113 | 114 | //6. 
Output modified template file 115 | const output_template = JSON.stringify(template, null, 2); 116 | fs.writeFileSync(`${global_s3_assets}/${file}`, output_template); 117 | } 118 | }); -------------------------------------------------------------------------------- /deployment/cdk-solution-helper/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cdk-solution-helper", 3 | "version": "0.1.0", 4 | "devDependencies": { 5 | "fs": "0.0.1-security" 6 | }, 7 | "dependencies": { 8 | "fs": "0.0.1-security" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /deployment/delete_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export stack_name="${1:-SparkOnEKS}" 4 | export region="${2:-us-east-1}" 5 | lower_stack_name=$(echo $stack_name | tr '[:upper:]' '[:lower:]') 6 | 7 | echo "=================================================================================================" 8 | echo " Make sure your CloudFormation stack name $stack_name is correct and exists in region: $region " 9 | echo " If you use a different name, rerun the script with the parameters:" 10 | echo " ./deployment/delete_all.sh " 11 | echo "=================================================================================================" 12 | 13 | # delete s3 14 | code_bucket=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region --query "Stacks[0].Outputs[?OutputKey=='CODEBUCKET'].OutputValue" --output text) 15 | if ! [ -z "$code_bucket" ] 16 | then 17 | if ! [ -z $(aws s3api list-buckets --region $region --query 'Buckets[?Name==`'$code_bucket'`].Name' --output text) ]; then 18 | echo "Delete logs from S3" 19 | aws s3 rm s3://${code_bucket}/vpcRejectlog/ 20 | echo "Delete athena query result from S3" 21 | aws s3 rm s3://${code_bucket}/athena-query-result/ 22 | fi 23 | fi 24 | # delete ecr 25 | repo_name=$(aws ecr describe-repositories --region $region --query 'repositories[?starts_with(repositoryName,`'$lower_stack_name'`)==`true`]'.repositoryName --output text) 26 | if ! [ -z "${repo_name}" ]; then 27 | echo "Delete Arc docker image from ECR" 28 | aws ecr delete-repository --region $region --repository-name $repo_name --force 29 | fi 30 | # delete glue tables 31 | tbl1=$(aws glue get-tables --region $region --database-name 'default' --query 'TableList[?starts_with(Name,`contact_snapshot`)==`true`]'.Name --output text) 32 | tbl2=$(aws glue get-tables --region $region --database-name 'default' --query 'TableList[?starts_with(Name,`deltalake_contact_jhub`)==`true`]'.Name --output text) 33 | if ! [ -z "$tbl1" ] 34 | then 35 | echo "Drop a Delta Lake table default.contact_snapshot" 36 | aws athena start-query-execution --region $region --query-string "DROP TABLE default.contact_snapshot" --result-configuration OutputLocation=s3://$code_bucket/athena-query-result 37 | fi 38 | if ! 
[ -z "$tbl2" ] 39 | then 40 | echo "Drop a Delta Lake table default.deltalake_contact_jhub" 41 | aws athena start-query-execution --region $region --query-string "DROP TABLE default.deltalake_contact_jhub" --result-configuration OutputLocation=s3://$code_bucket/athena-query-result 42 | fi 43 | 44 | # delete ALB 45 | argoALB=$(aws elbv2 describe-load-balancers --region $region --query 'LoadBalancers[?starts_with(DNSName,`k8s-argo`)==`true`].LoadBalancerArn' --output text) 46 | jhubALB=$(aws elbv2 describe-load-balancers --region $region --query 'LoadBalancers[?starts_with(DNSName,`k8s-jupyter`)==`true`].LoadBalancerArn' --output text) 47 | if ! [ -z "$argoALB" ] 48 | then 49 | echo "Delete Argo ALB" 50 | aws elbv2 delete-load-balancer --load-balancer-arn $argoALB --region $region 51 | sleep 5 52 | fi 53 | if ! [ -z "$jhubALB" ] 54 | then 55 | echo "Delete Jupyter ALB" 56 | aws elbv2 delete-load-balancer --load-balancer-arn $jhubALB --region $region 57 | sleep 5 58 | fi 59 | 60 | argoTG=$(aws elbv2 describe-target-groups --region $region --query 'TargetGroups[?starts_with(TargetGroupName,`k8s-argo`)==`true`].TargetGroupArn' --output text) 61 | jhubTG=$(aws elbv2 describe-target-groups --region $region --query 'TargetGroups[?starts_with(TargetGroupName,`k8s-jupyter`)==`true`].TargetGroupArn' --output text) 62 | if ! [ -z "$argoTG" ] 63 | then 64 | sleep 5 65 | echo "Delete Argo Target groups" 66 | aws elbv2 delete-target-group --region $region --target-group-arn $argoTG 67 | fi 68 | if ! [ -z "$jhubTG" ] 69 | then 70 | sleep 5 71 | echo "Delete Jupyter Target groups" 72 | aws elbv2 delete-target-group --region $region --target-group-arn $jhubTG 73 | fi 74 | 75 | # delete the rest from CF 76 | echo "Delete the rest of the resources via the CloudFormation delete command" 77 | aws cloudformation delete-stack --region $region --stack-name $stack_name -------------------------------------------------------------------------------- /deployment/post-deployment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export stack_name="${1:-SparkOnEKS}" 4 | export region="${2:-us-east-1}" 5 | 6 | echo "=================================================================================================" 7 | echo " Make sure your CloudFormation stack name $stack_name is correct and exists in region: $region " 8 | echo " If you use a different name, rerun the script with the parameters:" 9 | echo " ./deployment/post-deployment.sh <stack_name> <region>" 10 | echo "=================================================================================================" 11 | 12 | # 1. update ECR endpoint in example jobs 13 | export ECR_IMAGE_URI=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region \ 14 | --query "Stacks[0].Outputs[?OutputKey=='IMAGEURI'].OutputValue" --output text) 15 | echo "Updated ECR endpoint in sample job files in source/example/" 16 | sed -i.bak "s|{{ECR_URL}}|${ECR_IMAGE_URI}|g" source/example/*.yaml 17 | 18 | find . -type f -name "*.bak" -delete 19 | 20 | # 2. install k8s command tools 21 | echo "================================================================================" 22 | echo " Installing kubectl tool on Linux ..." 
23 | echo " For other operating systems, install kubectl > 1.27 here:" 24 | echo " https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html" 25 | echo "================================================================================" 26 | curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" 27 | chmod +x kubectl 28 | sudo mkdir -p /usr/local/bin && sudo mv kubectl /usr/local/bin/kubectl && export PATH=$PATH:/usr/local/bin/ 29 | echo "Installed kubectl version: " 30 | kubectl version --client 31 | echo "================================================================================================" 32 | echo " Installing argoCLI tool on Linux ..." 33 | echo " Check out https://github.com/argoproj/argo-workflows/releases for installation on other operating systems." 34 | echo "================================================================================================" 35 | VERSION=v3.5.4 36 | sudo curl -sLO https://github.com/argoproj/argo-workflows/releases/download/${VERSION}/argo-linux-amd64.gz && gunzip argo-linux-amd64.gz 37 | chmod +x argo-linux-amd64 && sudo mv ./argo-linux-amd64 /usr/local/bin/argo 38 | echo "Installed argoCLI version: " 39 | argo version --short 40 | 41 | # 3. connect to the newly created EKS cluster 42 | echo `aws cloudformation describe-stacks --stack-name $stack_name --region $region --query "Stacks[0].Outputs[?starts_with(OutputKey,'eksclusterEKSConfig')].OutputValue" --output text` | bash 43 | echo "Testing EKS connection..." 44 | kubectl get svc 45 | 46 | # 4. get Jupyter Hub login 47 | LOGIN_URI=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region \ 48 | --query "Stacks[0].Outputs[?OutputKey=='JUPYTERURL'].OutputValue" --output text) 49 | SEC_ID=$(aws secretsmanager list-secrets --region $region --query "SecretList[?not_null(Tags[?Value=='$stack_name'])].Name" --output text) 50 | LOGIN=$(aws secretsmanager get-secret-value --region $region --secret-id $SEC_ID --query SecretString --output text) 51 | echo -e "\n=============================== JupyterHub Login ==============================================" 52 | echo -e "\nJUPYTER_URL: $LOGIN_URI" 53 | echo "LOGIN: $LOGIN" 54 | echo "================================================================================================" 55 | 56 | # 5. 
Get ArgoWorkflows login 57 | ARGO_LOGIN_URI=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region \ 58 | --query "Stacks[0].Outputs[?OutputKey=='ARGOURL'].OutputValue" --output text) 59 | 60 | echo -e "\n=============================== ARGO Workflows Login ==============================================" 61 | echo -e "\nARGO_URL: $ARGO_LOGIN_URI" 62 | echo "================================================================================================" 63 | -------------------------------------------------------------------------------- /source/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from aws_cdk import (App,Tags,CfnOutput) 3 | from lib.spark_on_eks_stack import SparkOnEksStack 4 | from lib.cloud_front_stack import NestedStack 5 | 6 | app = App() 7 | 8 | eks_name = app.node.try_get_context('cluster_name') 9 | solution_id = app.node.try_get_context('solution_id') 10 | solution_version= app.node.try_get_context('version') 11 | 12 | # main stack 13 | eks_stack = SparkOnEksStack(app, 'sql-based-etl-with-apache-spark-on-amazon-eks', eks_name, solution_id, solution_version) 14 | # Recommend to remove the CloudFront nested stack. Setup your own SSL certificate and add it to ALB. 15 | cf_nested_stack = NestedStack(eks_stack,'CreateCloudFront', eks_stack.code_bucket, eks_stack.argo_url, eks_stack.jhub_url) 16 | Tags.of(eks_stack).add('project', 'sqlbasedetl') 17 | Tags.of(cf_nested_stack).add('project', 'sqlbasedetl') 18 | # Deployment Output 19 | CfnOutput(eks_stack,'CODE_BUCKET', value=eks_stack.code_bucket) 20 | CfnOutput(eks_stack,'ARGO_URL', value='https://'+ cf_nested_stack.argo_cf) 21 | CfnOutput(eks_stack,'JUPYTER_URL', value='https://'+ cf_nested_stack.jhub_cf) 22 | 23 | app.synth() -------------------------------------------------------------------------------- /source/app_resources/alb-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - iam:CreateServiceLinkedRole 4 | Resource: "*" 5 | Condition: 6 | StringEquals: 7 | iam:AWSServiceName: elasticloadbalancing.amazonaws.com 8 | - Effect: Allow 9 | Action: 10 | - ec2:DescribeAccountAttributes 11 | - ec2:DescribeAddresses 12 | - ec2:DescribeAvailabilityZones 13 | - ec2:DescribeInternetGateways 14 | - ec2:DescribeVpcs 15 | - ec2:DescribeVpcPeeringConnections 16 | - ec2:DescribeSubnets 17 | - ec2:DescribeSecurityGroups 18 | - ec2:DescribeInstances 19 | - ec2:DescribeNetworkInterfaces 20 | - ec2:DescribeTags 21 | - ec2:GetCoipPoolUsage 22 | - ec2:DescribeCoipPools 23 | - elasticloadbalancing:DescribeLoadBalancers 24 | - elasticloadbalancing:DescribeLoadBalancerAttributes 25 | - elasticloadbalancing:DescribeListeners 26 | - elasticloadbalancing:DescribeListenerCertificates 27 | - elasticloadbalancing:DescribeSSLPolicies 28 | - elasticloadbalancing:DescribeRules 29 | - elasticloadbalancing:DescribeTargetGroups 30 | - elasticloadbalancing:DescribeTargetGroupAttributes 31 | - elasticloadbalancing:DescribeTargetHealth 32 | - elasticloadbalancing:DescribeTags 33 | Resource: "*" 34 | - Effect: Allow 35 | Action: 36 | - cognito-idp:DescribeUserPoolClient 37 | - acm:ListCertificates 38 | - acm:DescribeCertificate 39 | - iam:ListServerCertificates 40 | - iam:GetServerCertificate 41 | - waf-regional:GetWebACL 42 | - waf-regional:GetWebACLForResource 43 | - waf-regional:AssociateWebACL 44 | - waf-regional:DisassociateWebACL 45 | - wafv2:GetWebACL 46 | - 
wafv2:GetWebACLForResource 47 | - wafv2:AssociateWebACL 48 | - wafv2:DisassociateWebACL 49 | - shield:GetSubscriptionState 50 | - shield:DescribeProtection 51 | - shield:CreateProtection 52 | - shield:DeleteProtection 53 | Resource: "*" 54 | - Effect: Allow 55 | Action: 56 | - ec2:AuthorizeSecurityGroupIngress 57 | - ec2:RevokeSecurityGroupIngress 58 | Resource: "*" 59 | - Effect: Allow 60 | Action: 61 | - ec2:CreateSecurityGroup 62 | Resource: "*" 63 | - Effect: Allow 64 | Action: 65 | - ec2:CreateTags 66 | Resource: arn:aws:ec2:*:*:security-group/* 67 | Condition: 68 | StringEquals: 69 | ec2:CreateAction: CreateSecurityGroup 70 | 'Null': 71 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 72 | - Effect: Allow 73 | Action: 74 | - ec2:CreateTags 75 | - ec2:DeleteTags 76 | Resource: arn:aws:ec2:*:*:security-group/* 77 | Condition: 78 | 'Null': 79 | aws:RequestTag/elbv2.k8s.aws/cluster: 'true' 80 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 81 | - Effect: Allow 82 | Action: 83 | - ec2:AuthorizeSecurityGroupIngress 84 | - ec2:RevokeSecurityGroupIngress 85 | - ec2:DeleteSecurityGroup 86 | Resource: "*" 87 | Condition: 88 | 'Null': 89 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 90 | - Effect: Allow 91 | Action: 92 | - elasticloadbalancing:CreateLoadBalancer 93 | - elasticloadbalancing:CreateTargetGroup 94 | Resource: "*" 95 | Condition: 96 | 'Null': 97 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 98 | - Effect: Allow 99 | Action: 100 | - elasticloadbalancing:CreateListener 101 | - elasticloadbalancing:DeleteListener 102 | - elasticloadbalancing:CreateRule 103 | - elasticloadbalancing:DeleteRule 104 | Resource: "*" 105 | - Effect: Allow 106 | Action: 107 | - elasticloadbalancing:AddTags 108 | - elasticloadbalancing:RemoveTags 109 | Resource: 110 | - arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 111 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/* 112 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/* 113 | Condition: 114 | 'Null': 115 | aws:RequestTag/elbv2.k8s.aws/cluster: 'true' 116 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 117 | - Effect: Allow 118 | Action: 119 | - elasticloadbalancing:AddTags 120 | - elasticloadbalancing:RemoveTags 121 | Resource: 122 | - arn:aws:elasticloadbalancing:*:*:listener/net/*/*/* 123 | - arn:aws:elasticloadbalancing:*:*:listener/app/*/*/* 124 | - arn:aws:elasticloadbalancing:*:*:listener-rule/net/*/*/* 125 | - arn:aws:elasticloadbalancing:*:*:listener-rule/app/*/*/* 126 | - Effect: Allow 127 | Action: 128 | - elasticloadbalancing:ModifyLoadBalancerAttributes 129 | - elasticloadbalancing:SetIpAddressType 130 | - elasticloadbalancing:SetSecurityGroups 131 | - elasticloadbalancing:SetSubnets 132 | - elasticloadbalancing:DeleteLoadBalancer 133 | - elasticloadbalancing:ModifyTargetGroup 134 | - elasticloadbalancing:ModifyTargetGroupAttributes 135 | - elasticloadbalancing:DeleteTargetGroup 136 | Resource: "*" 137 | Condition: 138 | 'Null': 139 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 140 | - Effect: Allow 141 | Action: 142 | - elasticloadbalancing:AddTags 143 | Resource: 144 | - arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 145 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/* 146 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/* 147 | Condition: 148 | StringEquals: 149 | elasticloadbalancing:CreateAction: 150 | - CreateTargetGroup 151 | - CreateLoadBalancer 152 | 'Null': 153 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 154 | - Effect: Allow 155 | Action: 156 | - elasticloadbalancing:RegisterTargets 157 
| - elasticloadbalancing:DeregisterTargets 158 | Resource: arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 159 | - Effect: Allow 160 | Action: 161 | - elasticloadbalancing:SetWebAcl 162 | - elasticloadbalancing:ModifyListener 163 | - elasticloadbalancing:AddListenerCertificates 164 | - elasticloadbalancing:RemoveListenerCertificates 165 | - elasticloadbalancing:ModifyRule 166 | Resource: "*" -------------------------------------------------------------------------------- /source/app_resources/alb-values.yaml: -------------------------------------------------------------------------------- 1 | # image: 2 | # tag: v2.2.0 3 | region: {{region_name}} 4 | vpcId: {{vpc_id}} 5 | clusterName: {{cluster_name}} 6 | serviceAccount: 7 | create: false 8 | name: alb-aws-load-balancer-controller 9 | nodeSelector: 10 | eks.amazonaws.com/capacityType: ON_DEMAND -------------------------------------------------------------------------------- /source/app_resources/argo-values.yaml: -------------------------------------------------------------------------------- 1 | controller: 2 | workflowNamespaces: 3 | - argo 4 | nodeSelector: 5 | eks.amazonaws.com/capacityType: ON_DEMAND 6 | init: 7 | serviceAccount: arcjob 8 | workflow: 9 | namespace: spark 10 | serviceAccount: 11 | create: false 12 | name: arcjob 13 | server: 14 | nodeSelector: 15 | eks.amazonaws.com/capacityType: ON_DEMAND 16 | extraArgs: 17 | - --auth-mode 18 | - client 19 | ingress: 20 | enabled: true 21 | annotations: 22 | kubernetes.io/ingress.class: alb 23 | alb.ingress.kubernetes.io/scheme: internet-facing 24 | alb.ingress.kubernetes.io/target-type: ip 25 | alb.ingress.kubernetes.io/success-codes: 200,301,302 26 | alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 2746}]' 27 | alb.ingress.kubernetes.io/manage-backend-security-group-rules: "true" 28 | alb.ingress.kubernetes.io/security-groups: {{INBOUND_SG}} -------------------------------------------------------------------------------- /source/app_resources/autoscaler-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - autoscaling:DescribeAutoScalingGroups 4 | - autoscaling:DescribeAutoScalingInstances 5 | - autoscaling:DescribeLaunchConfigurations 6 | - autoscaling:DescribeTags 7 | - autoscaling:SetDesiredCapacity 8 | - autoscaling:TerminateInstanceInAutoScalingGroup 9 | - ec2:DescribeLaunchTemplateVersions 10 | Resource: 11 | - "*" 12 | -------------------------------------------------------------------------------- /source/app_resources/autoscaler-values.yaml: -------------------------------------------------------------------------------- 1 | autoDiscovery: 2 | clusterName: {{cluster_name}} 3 | awsRegion: {{region_name}} 4 | image: 5 | tag: v1.27.3 6 | nodeSelector: 7 | eks.amazonaws.com/capacityType: ON_DEMAND 8 | podAnnotations: 9 | cluster-autoscaler.kubernetes.io/safe-to-evict: 'false' 10 | extraArgs: 11 | skip-nodes-with-system-pods: false 12 | scale-down-unneeded-time: 2m 13 | scale-down-unready-time: 5m 14 | rbac: 15 | serviceAccount: 16 | create: false 17 | name: cluster-autoscaler 18 | 19 | -------------------------------------------------------------------------------- /source/app_resources/etl-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - s3:ListBucket 4 | - s3:GetBucketLocation 5 | Resource: 6 | - arn:aws:s3:::{{codeBucket}} 7 | - arn:aws:s3:::{{datalakeBucket}} 8 | - arn:aws:s3:::nyc-tlc 9 | - Effect: Allow 10 
| Action: 11 | - s3:PutObject 12 | - s3:GetObject 13 | Resource: 14 | - arn:aws:s3:::{{codeBucket}}/* 15 | - arn:aws:s3:::{{datalakeBucket}}/* 16 | - arn:aws:s3:::nyc-tlc/* 17 | - Effect: Allow 18 | Action: 19 | - s3:DeleteObject 20 | Resource: 21 | - arn:aws:s3:::{{codeBucket}}/* 22 | - arn:aws:s3:::{{datalakeBucket}}/* 23 | - Effect: Allow 24 | Action: 25 | - kms:Decrypt 26 | - kms:Encrypt 27 | - kms:GenerateDataKey* 28 | - athena:StartQueryExecution 29 | - athena:GetQueryExecution 30 | - athena:GetQueryResults 31 | - athena:GetQueryResultsStream 32 | - athena:GetWorkGroup 33 | - athena:ListDataCatalogs 34 | - glue:CreateTable 35 | - glue:CreateDatabase 36 | - glue:CreatePartition 37 | - glue:UpdatePartition 38 | - glue:UpdateTable 39 | - glue:GetTable 40 | - glue:GetDatabases 41 | - glue:GetCatalogImportStatus 42 | Resource: 43 | - '*' -------------------------------------------------------------------------------- /source/app_resources/etl-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: Role 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: etl-workflow-role 5 | namespace: spark 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods","pods/exec","configmaps","services"] 9 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 10 | - apiGroups: ["batch", "extensions"] 11 | resources: ["jobs"] 12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 13 | - apiGroups: [""] 14 | resources: ["events","pods/log","serviceaccounts", "secrets","endpoints"] 15 | verbs: ["list", "get", "watch"] 16 | - apiGroups: [""] 17 | resources: ["persistentvolumeclaims"] 18 | verbs: ["create", "delete", "get", "list"] 19 | - apiGroups: ["argoproj.io"] 20 | resources: ["workflows","workflows/finalizers"] 21 | verbs: ["*"] 22 | - apiGroups: ["argoproj.io"] 23 | resources: ["workflowtemplates","workflowtemplates/finalizers"] 24 | verbs: ["get", "list", "watch"] 25 | 26 | 27 | --- 28 | kind: RoleBinding 29 | apiVersion: rbac.authorization.k8s.io/v1 30 | metadata: 31 | name: {{MY_SA}}-role-binding 32 | namespace: spark 33 | subjects: 34 | - kind: ServiceAccount 35 | name: {{MY_SA}} 36 | namespace: spark 37 | roleRef: 38 | kind: Role 39 | name: etl-workflow-role 40 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /source/app_resources/ex-secret-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - secretsmanager:GetResourcePolicy 4 | - secretsmanager:GetSecretValue 5 | - secretsmanager:DescribeSecret 6 | - secretsmanager:ListSecretVersionIds 7 | Resource: {{secretsmanager}} 8 | - Effect: Allow 9 | Action: 10 | - secretsmanager:GetRandomPassword 11 | - secretsmanager:ListSecrets 12 | - kms:Decrypt 13 | - kms:Encrypt 14 | Resource: 15 | - "*" -------------------------------------------------------------------------------- /source/app_resources/ex-secret-values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | tag: 7.2.0 3 | env: 4 | AWS_REGION: {{region_name}} 5 | AWS_DEFAULT_REGION: {{region_name}} 6 | serviceAccount: 7 | create: false 8 | name: external-secrets-controller 9 | securityContext: 10 | fsGroup: 65534 11 | -------------------------------------------------------------------------------- /source/app_resources/jupyter-config.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: {{MY_SA}}-role-binding 5 | namespace: jupyter 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{MY_SA}} 9 | namespace: jupyter 10 | roleRef: 11 | kind: Role 12 | name: hub 13 | apiGroup: rbac.authorization.k8s.io 14 | 15 | --- 16 | apiVersion: networking.k8s.io/v1 17 | kind: Ingress 18 | metadata: 19 | name: jupyterhub 20 | namespace: jupyter 21 | annotations: 22 | kubernetes.io/ingress.class: alb 23 | alb.ingress.kubernetes.io/scheme: internet-facing 24 | alb.ingress.kubernetes.io/target-type: ip 25 | alb.ingress.kubernetes.io/success-codes: 200,301,302 26 | alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]' 27 | alb.ingress.kubernetes.io/manage-backend-security-group-rules: "true" 28 | alb.ingress.kubernetes.io/security-groups: {{INBOUND_SG}} 29 | labels: 30 | app: jupyterhub 31 | spec: 32 | rules: 33 | - host: "" 34 | http: 35 | paths: 36 | - path: / 37 | pathType: Prefix 38 | backend: 39 | service: 40 | name: proxy-public 41 | port: 42 | number: 80 43 | 44 | --- 45 | apiVersion: kubernetes-client.io/v1 46 | kind: ExternalSecret 47 | metadata: 48 | name: jupyter-external-secret 49 | namespace: jupyter 50 | spec: 51 | backendType: secretsManager 52 | region: {{REGION}} 53 | data: 54 | - key: {{SECRET_NAME}} 55 | name: password 56 | property: password -------------------------------------------------------------------------------- /source/app_resources/jupyter-values.yaml: -------------------------------------------------------------------------------- 1 | hub: 2 | db: 3 | type: sqlite-memory 4 | extraConfig: 5 | overrideServiceAccount: | 6 | import os, sys 7 | 8 | c.JupyterHub.authenticator_class = 'jupyterhub.auth.DummyAuthenticator' 9 | c.DummyAuthenticator.password = os.environ['LOGIN'] 10 | c.Authenticator.admin_users = {"service-admin"} 11 | c.JupyterHub.service_tokens = { 12 | "secret-token": "service-admin", 13 | } 14 | # this script allows serviceAccountName to use dynamic naming based on {unescaped_username}" 15 | async def override_service_account_hook(kube_spawner): 16 | if kube_spawner.service_account is not None: 17 | kube_spawner.service_account = kube_spawner._expand_user_properties(kube_spawner.service_account) 18 | kube_spawner.env['USER_NAME'] = kube_spawner._expand_user_properties("{unescaped_username}") 19 | print("kube_spawner.service_account = " + kube_spawner.service_account) 20 | c.KubeSpawner.pre_spawn_hook = override_service_account_hook 21 | 22 | # setup timeout 23 | c.JupyterHub.cookie_max_age_days = 0.0105 24 | c.Authenticator.refresh_pre_spawn = True 25 | 26 | extraEnv: 27 | - name: LOGIN 28 | valueFrom: 29 | secretKeyRef: 30 | name: jupyter-external-secret 31 | key: password 32 | nodeSelector: 33 | lifecycle: OnDemand 34 | readinessProbe: 35 | initialDelaySeconds: 30 36 | periodSeconds: 10 37 | 38 | proxy: 39 | secretToken: "*****" 40 | service: 41 | type: ClusterIP 42 | chp: 43 | nodeSelector: 44 | lifecycle: OnDemand 45 | 46 | singleuser: 47 | defaultUrl: "/lab" 48 | nodeSelector: 49 | lifecycle: OnDemand 50 | image: 51 | name: ghcr.io/tripl-ai/arc-jupyter 52 | tag: arc-jupyter_3.16.0_scala_2.12_hadoop_3.3.2_3.16.0_slim 53 | pullPolicy: Always 54 | lifecycleHooks: 55 | postStart: 56 | exec: 57 | command: 58 | - "bash" 59 | - "-c" 60 | - > 61 | cp -r /opt/.jupyter $HOME/.jupyter; 62 | echo "git clone https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks"; 63 | 
git clone https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks; 64 | 65 | serviceAccountName: "{username}" 66 | cpu: 67 | guarantee: 0.25 68 | limit: 0.5 69 | memory: 70 | guarantee: 4G 71 | limit: 4G 72 | extraEnv: 73 | CONF_ALLOW_EXPORT: "true" 74 | JAVA_OPTS: -Xmx4G 75 | ETL_CONF_DATALAKE_LOC: {{codeBucket}} 76 | ETL_CONF_AWS_REGION: {{region}} 77 | conf_spark_hadoop_fs_s3a_aws_credentials_provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider 78 | storage: 79 | type: none 80 | # storage: 81 | # type: dynamic 82 | # capacity: 10G 83 | # homeMountPath: '/home/{username}/data' 84 | # # mount to EBS 85 | # dynamic: 86 | # storageClass: gp2 87 | profileList: 88 | - default: True 89 | display_name: "Small (default): Arc-Jupyter Development Environment" 90 | description: "4GB Memory & 1vCPUs" 91 | kubespawner_override: 92 | cpu_guarantee: 0.5 93 | cpu_limit: 1 94 | mem_guarantee: 4G 95 | mem_limit: 10G 96 | - display_name: "Big Arc-Jupyter Development Environment" 97 | description: "15GB Memory & 2vCPUs" 98 | kubespawner_override: 99 | cpu_guarantee: 0.5 100 | cpu_limit: 2 101 | mem_guarantee: 10G 102 | mem_limit: 15G 103 | 104 | prePuller: 105 | hook: 106 | enabled: false 107 | 108 | # autoscacling setting 109 | scheduling: 110 | userScheduler: 111 | enabled: false 112 | cull: 113 | timeout: 1800 114 | # debug: 115 | # enabled: true -------------------------------------------------------------------------------- /source/app_resources/native-spark-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: s3:ListBucket 3 | Resource: 4 | - arn:aws:s3:::{{codeBucket}} 5 | - arn:aws:s3:::{{datalakeBucket}} 6 | - arn:aws:s3:::nyc-tlc 7 | - Effect: Allow 8 | Action: 9 | - s3:PutObject 10 | - s3:GetObject 11 | Resource: 12 | - arn:aws:s3:::{{codeBucket}}/* 13 | - arn:aws:s3:::{{datalakeBucket}}/* 14 | - arn:aws:s3:::nyc-tlc/* 15 | - Effect: Allow 16 | Action: 17 | - s3:DeleteObject 18 | Resource: 19 | - arn:aws:s3:::{{codeBucket}}/* 20 | - Effect: Allow 21 | Action: 22 | - kms:Encrypt 23 | - kms:Decrypt 24 | - kms:GenerateDataKey* 25 | - kms:DescribeKey 26 | Resource: 27 | - '*' -------------------------------------------------------------------------------- /source/app_resources/native-spark-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: RoleBinding 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: {{MY_SA}}-role-binding 5 | namespace: spark 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{MY_SA}} 9 | namespace: spark 10 | roleRef: 11 | kind: Role 12 | name: etl-workflow-role 13 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /source/app_resources/spark-operator-values.yaml: -------------------------------------------------------------------------------- 1 | nodeSelector: 2 | # spark operator only works with non-graviton CPU 3 | kubernetes.io/arch: amd64 4 | serviceAccounts: 5 | spark: 6 | create: false 7 | sparkoperator: 8 | create: true 9 | metrics: 10 | # -- Disable prometheus metric scraping 11 | enable: false 12 | webhook: 13 | enable: true 14 | port: 443 -------------------------------------------------------------------------------- /source/app_resources/spark-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: WorkflowTemplate 3 | metadata: 4 | name: 
spark-template 5 | namespace: spark 6 | spec: 7 | templates: 8 | - name: smalljob 9 | retryStrategy: 10 | limit: 3 11 | retryPolicy: "Always" 12 | inputs: 13 | # override defaults here 14 | parameters: 15 | - name: jobId 16 | - name: configUri 17 | - name: image 18 | value: ghcr.io/tripl-ai/arc:latest 19 | - name: pullPolicy 20 | value: "Always" 21 | - name: executorInstances 22 | value: "1" 23 | - name: executorCores 24 | value: "1" 25 | - name: executorMemory 26 | value: "1" 27 | - name: sparkConf 28 | value: "" 29 | - name: tags 30 | value: "" 31 | - name: parameters 32 | value: "" 33 | # to exec each stages at a jupyter notebook, we can controle it by matching the environment. Some stages may not required in prod env. 34 | - name: environment 35 | value: test 36 | metadata: 37 | labels: 38 | app: spark 39 | workflowId: "{{workflow.uid}}" 40 | script: 41 | resources: 42 | limits: 43 | cpu: "1" 44 | memory: "1Gi" 45 | image: "{{inputs.parameters.image}}" 46 | command: ["/bin/sh"] 47 | source: | 48 | # verbose logging 49 | set -ex 50 | 51 | # print current hostname and ip 52 | hostname 53 | hostname -I 54 | 55 | # submit job 56 | /opt/spark/bin/spark-submit \ 57 | --master k8s://kubernetes.default.svc:443 \ 58 | --deploy-mode client \ 59 | --class ai.tripl.arc.ARC \ 60 | --name arc \ 61 | --conf spark.authenticate=true \ 62 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 63 | --conf spark.driver.host=$(hostname -I) \ 64 | --conf spark.driver.memory=921m \ 65 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 66 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 67 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 68 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 69 | --conf spark.io.encryption.enabled=true \ 70 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 71 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 72 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 73 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 74 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 75 | --conf spark.kubernetes.driver.limit.cores=1 \ 76 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 77 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 78 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 79 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 80 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 81 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 82 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 83 | --conf spark.network.crypto.enabled=true \ 84 | --conf spark.sql.ansi.enabled=true \ 85 | --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider \ 86 | {{inputs.parameters.sparkConf}} \ 87 | local:///opt/spark/jars/arc.jar \ 88 | --etl.config.uri={{inputs.parameters.configUri}} \ 89 | --etl.config.job.id={{inputs.parameters.jobId}} \ 90 | --etl.config.environment={{inputs.parameters.environment}} \ 91 | --etl.config.ignoreEnvironments=false \ 92 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 93 | 
--ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 94 | {{inputs.parameters.parameters}} 95 | 96 | - name: mediumjob 97 | retryStrategy: 98 | limit: 3 99 | retryPolicy: "Always" 100 | inputs: 101 | # override defaults here 102 | parameters: 103 | - name: jobId 104 | - name: configUri 105 | - name: image 106 | value: ghcr.io/tripl-ai/arc:latest 107 | - name: pullPolicy 108 | value: "Always" 109 | - name: executorInstances 110 | value: "2" 111 | - name: executorCores 112 | value: "2" 113 | - name: executorMemory 114 | value: "10" 115 | - name: sparkConf 116 | value: "" 117 | - name: tags 118 | value: "" 119 | - name: parameters 120 | value: "" 121 | # to exec each stages at a jupyter notebook, we can controle it by matching the environment. Some stages may not required in prod env. 122 | - name: environment 123 | value: test 124 | metadata: 125 | labels: 126 | app: spark 127 | workflowId: "{{workflow.uid}}" 128 | script: 129 | resources: 130 | limits: 131 | cpu: "2" 132 | memory: "13Gi" 133 | image: "{{inputs.parameters.image}}" 134 | command: ["/bin/sh"] 135 | source: | 136 | # verbose logging 137 | set -ex 138 | 139 | # print current hostname and ip 140 | hostname 141 | hostname -I 142 | 143 | # submit job 144 | /opt/spark/bin/spark-submit \ 145 | --master k8s://kubernetes.default.svc:443 \ 146 | --deploy-mode client \ 147 | --class ai.tripl.arc.ARC \ 148 | --name arc \ 149 | --conf spark.authenticate=true \ 150 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 151 | --conf spark.driver.host=$(hostname -I) \ 152 | --conf spark.driver.memory=2g \ 153 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 154 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 155 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 156 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 157 | --conf spark.io.encryption.enabled=true \ 158 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 159 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 160 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 161 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 162 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 163 | --conf spark.kubernetes.driver.limit.cores=1 \ 164 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 165 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 166 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 167 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 168 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 169 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 170 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 171 | --conf spark.network.crypto.enabled=true \ 172 | --conf spark.sql.ansi.enabled=true \ 173 | --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider \ 174 | {{inputs.parameters.sparkConf}} \ 175 | local:///opt/spark/jars/arc.jar \ 176 | --etl.config.uri={{inputs.parameters.configUri}} \ 177 | --etl.config.job.id={{inputs.parameters.jobId}} \ 178 | --etl.config.environment={{inputs.parameters.environment}} \ 179 | --etl.config.ignoreEnvironments=false \ 180 | 
--etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 181 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 182 | {{inputs.parameters.parameters}} 183 | 184 | - name: largejob 185 | retryStrategy: 186 | limit: 3 187 | retryPolicy: "Always" 188 | inputs: 189 | # override defaults here 190 | parameters: 191 | - name: jobId 192 | - name: configUri 193 | - name: image 194 | value: ghcr.io/tripl-ai/arc:latest 195 | - name: pullPolicy 196 | value: "Always" 197 | - name: executorInstances 198 | value: "3" 199 | - name: executorCores 200 | value: "2" 201 | - name: executorMemory 202 | value: "12" 203 | - name: sparkConf 204 | value: "" 205 | - name: tags 206 | value: "" 207 | - name: parameters 208 | value: "" 209 | # to exec each stages at a jupyter notebook, we can controle it by matching the environment. Some stages may not required in prod env. 210 | - name: environment 211 | value: test 212 | metadata: 213 | labels: 214 | app: spark 215 | workflowId: "{{workflow.uid}}" 216 | script: 217 | resources: 218 | limits: 219 | cpu: "3" 220 | memory: "13Gi" 221 | image: "{{inputs.parameters.image}}" 222 | command: ["/bin/sh"] 223 | source: | 224 | # verbose logging 225 | set -ex 226 | 227 | # print current hostname and ip 228 | hostname 229 | hostname -I 230 | 231 | # submit job 232 | /opt/spark/bin/spark-submit \ 233 | --master k8s://kubernetes.default.svc:443 \ 234 | --deploy-mode client \ 235 | --class ai.tripl.arc.ARC \ 236 | --name arc \ 237 | --conf spark.authenticate=true \ 238 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 239 | --conf spark.driver.host=$(hostname -I) \ 240 | --conf spark.driver.memory=4g \ 241 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 242 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 243 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 244 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 245 | --conf spark.io.encryption.enabled=true \ 246 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 247 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 248 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 249 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 250 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 251 | --conf spark.kubernetes.driver.limit.cores=1 \ 252 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 253 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 254 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 255 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 256 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 257 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 258 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 259 | --conf spark.network.crypto.enabled=true \ 260 | --conf spark.sql.ansi.enabled=true \ 261 | --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider \ 262 | {{inputs.parameters.sparkConf}} \ 263 | local:///opt/spark/jars/arc.jar \ 264 | --etl.config.uri={{inputs.parameters.configUri}} \ 265 | 
--etl.config.job.id={{inputs.parameters.jobId}} \ 266 | --etl.config.environment={{inputs.parameters.environment}} \ 267 | --etl.config.ignoreEnvironments=false \ 268 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 269 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 270 | {{inputs.parameters.parameters}} 271 | 272 | - name: sparklocal 273 | retryStrategy: 274 | limit: 3 275 | retryPolicy: "Always" 276 | inputs: 277 | # override defaults here 278 | parameters: 279 | - name: jobId 280 | - name: configUri 281 | - name: image 282 | value: ghcr.io/tripl-ai/arc:latest 283 | - name: executorInstances 284 | value: "1" 285 | - name: executorCores 286 | value: "1" 287 | - name: executorMemory 288 | value: "1" 289 | - name: sparkConf 290 | value: "" 291 | - name: tags 292 | value: "" 293 | - name: parameters 294 | value: "" 295 | - name: pullPolicy 296 | value: IfNotPresent 297 | - name: environment 298 | value: test 299 | metadata: 300 | labels: 301 | app: spark 302 | workflowId: "{{workflow.uid}}" 303 | podSpecPatch: | 304 | containers: 305 | - name: main 306 | resources: 307 | requests: 308 | cpu: "{{inputs.parameters.executorCores}}" 309 | memory: "{{inputs.parameters.executorMemory}}Gi" 310 | script: 311 | image: "{{inputs.parameters.image}}" 312 | command: ["/bin/sh"] 313 | source: | 314 | # verbose logging 315 | set -ex 316 | 317 | # print current hostname and ip 318 | hostname 319 | hostname -I 320 | 321 | # submit job 322 | # driver memory is set at 90% of executorMemory 323 | /opt/spark/bin/spark-submit \ 324 | --master local[{{inputs.parameters.executorCores}}] \ 325 | --driver-memory $(({{inputs.parameters.executorMemory}} * 1024 * 90/100))m \ 326 | --driver-java-options "-XX:+UseG1GC" \ 327 | --class ai.tripl.arc.ARC \ 328 | --name arc \ 329 | --conf spark.driver.host=$(hostname -I) \ 330 | --conf spark.driver.pod.name=$(hostname)-driver \ 331 | --conf spark.io.encryption.enabled=true \ 332 | --conf spark.sql.adaptive.enabled=true \ 333 | --conf spark.network.crypto.enabled=true \ 334 | --conf spark.ui.enabled=true \ 335 | --conf spark.sql.ansi.enabled=true \ 336 | --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider \ 337 | {{inputs.parameters.sparkConf}} \ 338 | local:///opt/spark/jars/arc.jar \ 339 | --etl.config.uri={{inputs.parameters.configUri}} \ 340 | --etl.config.job.id={{inputs.parameters.jobId}} \ 341 | --etl.config.environment={{inputs.parameters.environment}} \ 342 | --etl.config.ignoreEnvironments=false \ 343 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 344 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 345 | {{inputs.parameters.parameters}} -------------------------------------------------------------------------------- /source/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | "cluster_name": "spark-on-eks", 5 | "solution_id": "SO0141", 6 | "solution_name": "sql-based-etl-with-apache-spark-on-amazon-eks", 7 | "version": "v2.0.0", 8 | "@aws-cdk/core:stackRelativeExports": true, 9 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false 10 
| } 11 | } 12 | -------------------------------------------------------------------------------- /source/example/native-spark-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: word-count 5 | namespace: spark 6 | spec: 7 | type: Python 8 | pythonVersion: "3" 9 | mode: cluster 10 | image: {{ECR_URL}} 11 | imagePullPolicy: Always 12 | mainApplicationFile: "s3a://$(BUCKET_PARAM)/app_code/job/wordcount.py" 13 | arguments: ["s3a://nyc-tlc/csv_backup/yellow_tripdata*.csv","s3a://$(BUCKET_PARAM)/app_code/output/native"] 14 | sparkVersion: "3.3.4" 15 | sparkConf: 16 | # By design, the graviton EKS nodegroup is in a single AZ 17 | # use the nodegroup label to trigger the scaling of Graviton instances within a single AZ 18 | # "spark.kubernetes.node.selector.nodegroup": "single-az-graviton" 19 | "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" 20 | "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" 21 | "spark.kubernetes.allocation.batch.size": "15" 22 | "spark.io.encryption.enabled": "true" 23 | "spark.kubernetes.local.dirs.tmpfs": "true" 24 | volumes: 25 | - name: spark-local-dir-1 26 | hostPath: 27 | path: "/tmp" 28 | type: Directory 29 | dynamicAllocation: 30 | enabled: true 31 | initialExecutors: 1 32 | minExecutors: 1 33 | maxExecutors: 20 34 | restartPolicy: 35 | type: OnFailure 36 | onFailureRetries: 3 37 | onFailureRetryInterval: 10 38 | onSubmissionFailureRetries: 5 39 | onSubmissionFailureRetryInterval: 5 40 | driver: 41 | # schedule on spot to test the driver restart 42 | affinity: 43 | nodeAffinity: 44 | requiredDuringSchedulingIgnoredDuringExecution: 45 | nodeSelectorTerms: 46 | - matchExpressions: 47 | - key: lifecycle 48 | operator: In 49 | values: 50 | - Ec2Spot 51 | env: 52 | - name: BUCKET_PARAM 53 | valueFrom: 54 | configMapKeyRef: 55 | name: special-config 56 | key: codeBucket 57 | cores: 1 58 | memory: "1G" 59 | labels: 60 | role: driver 61 | serviceAccount: nativejob 62 | volumeMounts: 63 | - name: spark-local-dir-1 64 | mountPath: "/tmp" 65 | executor: 66 | # start executors on Spot 67 | affinity: 68 | nodeAffinity: 69 | requiredDuringSchedulingIgnoredDuringExecution: 70 | nodeSelectorTerms: 71 | - matchExpressions: 72 | - key: lifecycle 73 | operator: In 74 | values: 75 | - Ec2Spot 76 | cores: 1 77 | memory: "4G" 78 | labels: 79 | role: executor 80 | volumeMounts: 81 | - name: spark-local-dir-1 82 | mountPath: "/tmp" 83 | -------------------------------------------------------------------------------- /source/example/notebook/nyctaxi-job.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf \n", 10 | "numRows=5" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "%env \n", 20 | "ETL_CONF_DATA_URL=s3a://nyc-tlc/csv_backup\n", 21 | "ETL_CONF_JOB_URL=https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "{\n", 31 | " \"type\": \"DelimitedExtract\",\n", 32 | " \"name\": \"extract data from green_tripdata schema 0\",\n", 33 | " 
\"environments\": [\"production\", \"test\"],\n", 34 | " \"inputURI\": ${ETL_CONF_DATA_URL}\"/green_tripdata_2013-08.csv\",\n", 35 | " \"outputView\": \"green_tripdata0_raw\", \n", 36 | " \"delimiter\": \"Comma\",\n", 37 | " \"quote\" : \"DoubleQuote\",\n", 38 | " \"header\": true,\n", 39 | " \"persist\": true\n", 40 | "}" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "{\n", 50 | " \"type\": \"TypingTransform\",\n", 51 | " \"name\": \"apply green_tripdata schema 0 data types\",\n", 52 | " \"environments\": [\"production\", \"test\"],\n", 53 | " \"schemaURI\": ${ETL_CONF_JOB_URL}\"/green_tripdata0.json\",\n", 54 | " \"inputView\": \"green_tripdata0_raw\", \n", 55 | " \"outputView\": \"green_tripdata0\"\n", 56 | "}" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "%sqlvalidate name=\"ensure no errors exist after data typing\" environments=production,test\n", 66 | "SELECT\n", 67 | " SUM(error) = 0 AS valid\n", 68 | " ,TO_JSON(\n", 69 | " NAMED_STRUCT(\n", 70 | " 'count', COUNT(error), \n", 71 | " 'errors', SUM(error)\n", 72 | " )\n", 73 | " ) AS message\n", 74 | "FROM (\n", 75 | " SELECT \n", 76 | " CASE \n", 77 | " WHEN SIZE(_errors) > 0 THEN 1 \n", 78 | " ELSE 0 \n", 79 | " END AS error \n", 80 | " FROM green_tripdata0\n", 81 | ") input_table" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "%sql name=\"ensure a query can be executed\" environments=production,test persist=true outputView=green_trip_filtered\n", 91 | "SELECT * \n", 92 | "FROM green_tripdata0\n", 93 | "WHERE store_and_fwd_flag = TRUE" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Arc", 107 | "language": "javascript", 108 | "name": "arc" 109 | }, 110 | "language_info": { 111 | "file_extension": "arc", 112 | "mimetype": "text/arc", 113 | "name": "arc", 114 | "nbconvert_exporter": "text", 115 | "version": "2.2.0" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 2 120 | } 121 | -------------------------------------------------------------------------------- /source/example/notebook/scd2-job.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf \n", 10 | "numRows=12" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# 1. 
Initial Table Load" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "{\n", 27 | " \"type\": \"DelimitedExtract\",\n", 28 | " \"name\": \"extract initial table\",\n", 29 | " \"environments\": [\"dev\", \"test\"],\n", 30 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/initial_contacts.csv\",\n", 31 | " \"outputView\": \"initial_raw\", \n", 32 | " \"delimiter\": \"Comma\",\n", 33 | " \"header\": false,\n", 34 | " \"quote\": \"None\",\n", 35 | " \"authentication\": {\n", 36 | " \"method\": \"AmazonIAM\"\n", 37 | " }\n", 38 | "}" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## 1.2 Check Original Data Schema" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "%printschema \n", 55 | "initial_raw" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "ExecuteTime": { 62 | "start_time": "2020-03-03T08:30:30.028Z" 63 | } 64 | }, 65 | "source": [ 66 | "## 1.3 Apply Data Type" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "{\n", 76 | " \"type\": \"TypingTransform\",\n", 77 | " \"name\": \"apply table schema 0\",\n", 78 | " \"environments\": [\"dev\", \"test\"],\n", 79 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 80 | " \"inputView\": \"initial_raw\", \n", 81 | " \"outputView\": \"initial_typed\",\n", 82 | " \"authentication\": {\n", 83 | " \"method\": \"AmazonIAM\"\n", 84 | " }\n", 85 | "}" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## 1.4 Check Typed Data Schema & Stats" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "%printschema \n", 102 | "initial_typed" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## 1.5 Data Quality Control" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "%sqlvaildate outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=initial_typed\n", 119 | "\n", 120 | "SELECT SUM(error) = 0 AS valid\n", 121 | " ,TO_JSON(\n", 122 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 123 | " ) AS message\n", 124 | "FROM \n", 125 | "(\n", 126 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 127 | " FROM ${inputView}\n", 128 | ") base" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## 1.6 Add Calculated Fields for SCD Type 2\n", 136 | "### CURRENT_TIMESTAMP will be passed in automatically, when the ETL job is triggered" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "%env\n", 146 | "ETL_CONF_CURRENT_TIMESTAMP=current_timestamp()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "%sql outputView=\"initial_load\" name=\"add calc field for SCD\" environments=dev,test 
sqlParams=table_name=initial_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 156 | "\n", 157 | "SELECT id,name,email,state, ${now} AS valid_from, CAST(null AS timestamp) AS valid_to\n", 158 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 159 | "FROM ${table_name}" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## 1.7 Initial load to Delta Lake\n", 167 | "### Delta Lake is an optimized data lake to support Time Travel, ACID transaction" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "{\n", 177 | " \"type\": \"DeltaLakeLoad\",\n", 178 | " \"name\": \"Initial load to Data Lake\",\n", 179 | " \"environments\": [\"dev\", \"test\"],\n", 180 | " \"inputView\": \"initial_load\",\n", 181 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact/\",\n", 182 | " \"numPartitions\": 2\n", 183 | " \"saveMode\": \"Overwrite\",\n", 184 | " \"authentication\": {\n", 185 | " \"method\": \"AmazonIAM\"\n", 186 | " }\n", 187 | "}" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "ExecuteTime": { 194 | "end_time": "2020-05-31T04:55:34.761654Z", 195 | "start_time": "2020-05-31T04:55:34.738Z" 196 | } 197 | }, 198 | "source": [ 199 | "# SCD Type2 Implementation" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "ExecuteTime": { 206 | "end_time": "2020-03-18T22:38:05.895407Z", 207 | "start_time": "2020-03-18T22:37:48.160Z" 208 | } 209 | }, 210 | "source": [ 211 | "## 2. Ingest A New Incremental CSV File\n", 212 | "### Look at record 12, the `state` is changed in the file" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "{\n", 222 | " \"type\": \"DelimitedExtract\",\n", 223 | " \"name\": \"extract incremental data\",\n", 224 | " \"environments\": [\"dev\", \"test\"],\n", 225 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/update_contacts.csv\",\n", 226 | " \"outputView\": \"delta_raw\", \n", 227 | " \"delimiter\": \"Comma\",\n", 228 | " \"header\": false,\n", 229 | " \"authentication\": {\n", 230 | " \"method\": \"AmazonIAM\"\n", 231 | " }\n", 232 | "}" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "## 2.1 Apply Data Type (reused schema file)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "{\n", 249 | " \"type\": \"TypingTransform\",\n", 250 | " \"name\": \"apply table schema 0 to incremental load\",\n", 251 | " \"environments\": [\"dev\", \"test\"],\n", 252 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 253 | " \"inputView\": \"delta_raw\", \n", 254 | " \"outputView\": \"delta_typed\",\n", 255 | " \"authentication\": {\n", 256 | " \"method\": \"AmazonIAM\"\n", 257 | " }\n", 258 | "}" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": { 264 | "ExecuteTime": { 265 | "end_time": "2020-06-07T15:02:50.155313Z", 266 | "start_time": "2020-06-07T15:02:50.125Z" 267 | } 268 | }, 269 | "source": [ 270 | "## 2.2 Data Quality Control (reused sql script)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "%sqlvaildate 
outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=delta_typed\n", 280 | "\n", 281 | "SELECT SUM(error) = 0 AS valid\n", 282 | " ,TO_JSON(\n", 283 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 284 | " ) AS message\n", 285 | "FROM \n", 286 | "(\n", 287 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 288 | " FROM ${inputView}\n", 289 | ") base" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": { 295 | "ExecuteTime": { 296 | "end_time": "2020-05-31T05:01:13.796275Z", 297 | "start_time": "2020-05-31T05:01:13.734Z" 298 | } 299 | }, 300 | "source": [ 301 | "## 2.3 Add Calculated Fields (reused sql script)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "%env\n", 311 | "ETL_CONF_CURRENT_TIMESTAMP=current_timestamp()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "%sql outputView=\"update_load\" name=\"add calc field for SCD\" environments=dev,test sqlParams=table_name=delta_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 321 | "\n", 322 | "SELECT id,name,email,state, ${now} AS valid_from, CAST(null AS timestamp) AS valid_to\n", 323 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 324 | "FROM ${table_name}" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": { 330 | "ExecuteTime": { 331 | "end_time": "2020-05-31T05:03:33.741024Z", 332 | "start_time": "2020-05-31T05:03:33.247Z" 333 | } 334 | }, 335 | "source": [ 336 | "## 2.4 Prepare Datasets for SCD Type2 Insert\n", 337 | "\n", 338 | "- Generate extra rows for changed records.\n", 339 | "- The 'null' merge_key means it will be inserted, not update existing records according to the rule in SCD type2" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "%sql outputView=\"staged_update\" name=\"generate extra rows for SCD\" environments=dev,test\n", 349 | "\n", 350 | "SELECT NULL AS mergeKey, new.*\n", 351 | "FROM initial_load old\n", 352 | "INNER JOIN update_load new\n", 353 | "ON old.id = new.id\n", 354 | "WHERE old.iscurrent=1\n", 355 | "AND old.checksum<>new.checksum\n", 356 | "\n", 357 | "UNION\n", 358 | "\n", 359 | "SELECT id AS mergeKey, *\n", 360 | "FROM update_load" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## 2.5 Perform the Type 2 SCD" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "{\n", 377 | " \"type\": \"DeltaLakeExtract\",\n", 378 | " \"name\": \"read initial Delta table\",\n", 379 | " \"description\": \"read initial Delta table\",\n", 380 | " \"environments\": [\n", 381 | " \"dev\",\n", 382 | " \"test\"\n", 383 | " ],\n", 384 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact/\",\n", 385 | " \"outputView\": \"current_snapshot\"\n", 386 | "}" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "vscode": { 394 | "languageId": "plaintext" 395 | } 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "%sql name=\"merge into existing contacts table\" environments=dev,test\n", 400 | "\n", 401 | "MERGE 
INTO current_snapshot tgt\n", 402 | "USING staged_update src\n", 403 | "ON tgt.id = src.mergeKey\n", 404 | "WHEN MATCHED AND src.checksum != tgt.checksum AND tgt.iscurrent = 1 THEN \n", 405 | " UPDATE SET \n", 406 | " valid_to = src.valid_from, \n", 407 | " iscurrent = 0\n", 408 | "WHEN NOT MATCHED THEN \n", 409 | " INSERT *" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "# 3. Create a Delta Lake table in Athena\n", 417 | "### Build up a Glue data catalog from Athena.We are using token based authentication to access Athena, no more long live credentials is required from secrets manager. " 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "{\n", 427 | " \"type\": \"JDBCExecute\",\n", 428 | " \"name\": \"Create glue data catalog\",\n", 429 | " \"environments\": [\n", 430 | " \"dev\",\n", 431 | " \"test\"\n", 432 | " ],\n", 433 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/sql/create_table_contact.sql\",\n", 434 | " \"jdbcURL\": \"jdbc:awsathena://AwsRegion=\"${AWS_DEFAULT_REGION}\";S3OutputLocation=s3://\"${ETL_CONF_DATALAKE_LOC}\"/athena-query-result;AwsCredentialsProviderClass=com.amazonaws.auth.WebIdentityTokenCredentialsProvider\",\n", 435 | " \"sqlParams\":{\n", 436 | " \"datalake_loc\": \"'s3://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact\\/'\",\n", 437 | " \"table_name\": \"default.deltalake_contact_jhub\"\n", 438 | " }\n", 439 | "}" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "## 3. Query Delta Lake (optional)\n", 447 | "### to skip in a productionized ETL job, use a fake environment `uat`" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "{\n", 457 | " \"type\": \"DeltaLakeExtract\",\n", 458 | " \"name\": \"read contact Delta Lake table\",\n", 459 | " \"description\": \"read contact table\",\n", 460 | " \"environments\": [\n", 461 | " \"uat\"\n", 462 | " ],\n", 463 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact/\",\n", 464 | " \"outputView\": \"contact\"\n", 465 | "}" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "## Confirm 92 records are expired" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "%sql outputView=\"expired_count\" name=\"expired_count\" environments=uat\n", 482 | "SELECT count(*) FROM contact WHERE valid_to is not null" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "%metadata \n", 492 | "contact" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | " ## Confirm we now have 1192 records" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "%sql outputView=\"total_count\" name=\"total_count\" environments=uat\n", 509 | "SELECT count(*) FROM contact" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "## View one of the changed records" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | 
"outputs": [], 524 | "source": [ 525 | "%sql outputView=\"validate_type2\" name=\"validate_type2\" environments=uat\n", 526 | "SELECT * FROM contact WHERE id=12" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [] 535 | } 536 | ], 537 | "metadata": { 538 | "kernelspec": { 539 | "display_name": "Arc", 540 | "language": "javascript", 541 | "name": "arc" 542 | }, 543 | "language_info": { 544 | "codemirror_mode": "javascript", 545 | "file_extension": ".json", 546 | "mimetype": "javascript", 547 | "name": "arc", 548 | "nbconvert_exporter": "arcexport", 549 | "version": "3.13.1" 550 | } 551 | }, 552 | "nbformat": 4, 553 | "nbformat_minor": 4 554 | } 555 | -------------------------------------------------------------------------------- /source/example/nyctaxi-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: nyctaxi-job- 5 | namespace: spark 6 | spec: 7 | serviceAccountName: arcjob 8 | # keep workflows history for 30m 9 | ttlStrategy: 10 | secondsAfterCompletion: 1800 11 | entrypoint: nyctaxi 12 | nodeselector: 13 | kubernetes.io/arch: amd64 14 | templates: 15 | - name: nyctaxi 16 | dag: 17 | tasks: 18 | - name: step1-query 19 | templateRef: 20 | name: spark-template 21 | template: sparklocal 22 | arguments: 23 | parameters: 24 | - name: jobId 25 | value: nyctaxi 26 | - name: tags 27 | value: "project=sqlbasedetl owner=myowner costcenter=66666" 28 | - name: configUri 29 | value: https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes/nyctaxi.ipynb 30 | - name: image 31 | value: ghcr.io/tripl-ai/arc:arc_4.2.0_spark_3.3.4_scala_2.12_hadoop_3.3.2_4.2.1_slim 32 | - name: parameters 33 | value: "--ETL_CONF_DATA_URL=s3a://nyc-tlc/csv_backup \ 34 | --ETL_CONF_JOB_URL=https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes" 35 | -------------------------------------------------------------------------------- /source/example/scd2-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: scd2-job- 5 | namespace: spark 6 | spec: 7 | serviceAccountName: arcjob 8 | entrypoint: scd2-process 9 | nodeselector: 10 | kubernetes.io/arch: amd64 11 | arguments: 12 | parameters: 13 | - name: codeBucket 14 | value: cfn_value 15 | templates: 16 | - name: scd2-process 17 | dag: 18 | tasks: 19 | - name: initial-load 20 | templateRef: 21 | name: spark-template 22 | template: smalljob 23 | arguments: 24 | parameters: 25 | - name: jobId 26 | value: initial-load 27 | - name: image 28 | value: {{ECR_URL}} 29 | - name: configUri 30 | value: "s3a://{{workflow.parameters.codeBucket}}/app_code/job/initial_load.ipynb" 31 | - name: parameters 32 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 33 | - name: delta-load 34 | templateRef: 35 | name: spark-template 36 | template: smalljob 37 | arguments: 38 | parameters: 39 | - name: jobId 40 | value: delta-load 41 | - name: image 42 | value: {{ECR_URL}} 43 | - name: configUri 44 | value: "s3a://{{workflow.parameters.codeBucket}}/app_code/job/delta_load.ipynb" 45 | - name: parameters 46 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 47 | - name: SCD2-merge 48 | dependencies: [initial-load, delta-load] 49 | templateRef: 50 | name: 
spark-template 51 | template: smalljob 52 | arguments: 53 | parameters: 54 | - name: jobId 55 | value: SCD2-merge 56 | - name: image 57 | value: {{ECR_URL}} 58 | - name: configUri 59 | value: "s3a://{{workflow.parameters.codeBucket}}/app_code/job/scd2_merge.ipynb" 60 | - name: parameters 61 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 62 | - name: sparkConf 63 | value: "--conf spark.databricks.delta.merge.repartitionBeforeWrite.enabled=true" 64 | -------------------------------------------------------------------------------- /source/images/00-deploy-to-aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/00-deploy-to-aws.png -------------------------------------------------------------------------------- /source/images/3-argo-job-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/3-argo-job-dependency.png -------------------------------------------------------------------------------- /source/images/3-argo-log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/3-argo-log.png -------------------------------------------------------------------------------- /source/images/4-auto-scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/4-auto-scaling.png -------------------------------------------------------------------------------- /source/images/4-k8s-retry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/4-k8s-retry.png -------------------------------------------------------------------------------- /source/images/4-spot-console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/4-spot-console.png -------------------------------------------------------------------------------- /source/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/architecture.png -------------------------------------------------------------------------------- /source/images/driver_interruption_test.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/driver_interruption_test.gif -------------------------------------------------------------------------------- /source/images/executor_interruption_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/executor_interruption_test.png -------------------------------------------------------------------------------- /source/images/fake_data.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/fake_data.gif -------------------------------------------------------------------------------- /source/images/run_jupyter.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/run_jupyter.gif -------------------------------------------------------------------------------- /source/images/sql-based-etl-spark-architecture-final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/sql-based-etl-spark-architecture-final.png -------------------------------------------------------------------------------- /source/images/sql-based-etl-with-apache-spark-on-amazon-eks.preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/sql-based-etl-with-apache-spark-on-amazon-eks.preview.png -------------------------------------------------------------------------------- /source/images/submit_job_in_argo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/submit_job_in_argo.gif -------------------------------------------------------------------------------- /source/lib/cdk_infra/eks_base_app.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Aws 2 | from constructs import Construct 3 | from aws_cdk.aws_eks import ICluster, KubernetesManifest 4 | from lib.util.manifest_reader import * 5 | import os 6 | 7 | class EksBaseAppConst(Construct): 8 | @property 9 | def secret_created(self): 10 | return self._ext_secret 11 | 12 | def __init__(self,scope: Construct,id: str,eks_cluster: ICluster, **kwargs,) -> None: 13 | super().__init__(scope, id, **kwargs) 14 | 15 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 16 | 17 | # Add ALB ingress controller to EKS 18 | self._alb = eks_cluster.add_helm_chart('ALBChart', 19 | 
chart='aws-load-balancer-controller', 20 | repository='https://aws.github.io/eks-charts', 21 | release='alb', 22 | version='1.5.5', 23 | create_namespace=False, 24 | namespace='kube-system', 25 | values=load_yaml_replace_var_local(source_dir+'/app_resources/alb-values.yaml', 26 | fields={ 27 | "{{region_name}}": Aws.REGION, 28 | "{{cluster_name}}": eks_cluster.cluster_name, 29 | "{{vpc_id}}": eks_cluster.vpc.vpc_id 30 | } 31 | ) 32 | ) 33 | # Add external secrets controller to EKS 34 | self._ext_secret = eks_cluster.add_helm_chart('SecretContrChart', 35 | chart='kubernetes-external-secrets', 36 | repository='https://external-secrets.github.io/kubernetes-external-secrets/', 37 | release='external-secrets', 38 | create_namespace=False, 39 | namespace='kube-system', 40 | values=load_yaml_replace_var_local(source_dir+'/app_resources/ex-secret-values.yaml', 41 | fields={ 42 | '{{region_name}}': Aws.REGION 43 | } 44 | ) 45 | ) 46 | self._ext_secret.node.add_dependency(self._alb) 47 | # Add Cluster Autoscaler to EKS 48 | _var_mapping = { 49 | "{{region_name}}": Aws.REGION, 50 | "{{cluster_name}}": eks_cluster.cluster_name, 51 | } 52 | eks_cluster.add_helm_chart('ClusterAutoScaler', 53 | chart='cluster-autoscaler', 54 | repository='https://kubernetes.github.io/autoscaler', 55 | release='nodescaler', 56 | create_namespace=False, 57 | namespace='kube-system', 58 | values=load_yaml_replace_var_local(source_dir+'/app_resources/autoscaler-values.yaml',_var_mapping) 59 | ) 60 | # # Add container insight (CloudWatch Log) to EKS 61 | # KubernetesManifest(self,'ContainerInsight', 62 | # cluster=eks_cluster, 63 | # manifest=load_yaml_replace_var_remotely('https://raw.githubusercontent.com/aws-samples/amazon-cloudwatch-container-insights/latest/k8s-deployment-manifest-templates/deployment-mode/daemonset/container-insights-monitoring/quickstart/cwagent-fluentd-quickstart.yaml', 64 | # fields=_var_mapping, 65 | # multi_resource=True 66 | # ) 67 | # ) 68 | # Add Spark Operator to EKS 69 | eks_cluster.add_helm_chart('SparkOperatorChart', 70 | chart='spark-operator', 71 | repository='https://kubeflow.github.io/spark-operator', 72 | release='spark-operator', 73 | version='1.1.27', 74 | create_namespace=True, 75 | values=load_yaml_replace_var_local(source_dir+'/app_resources/spark-operator-values.yaml',fields={'':''}) 76 | ) -------------------------------------------------------------------------------- /source/lib/cdk_infra/eks_cluster.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (aws_eks as eks,aws_ec2 as ec2, RemovalPolicy) 2 | from aws_cdk.aws_iam import IRole 3 | from constructs import Construct 4 | from aws_cdk.lambda_layer_kubectl_v27 import KubectlV27Layer 5 | 6 | class EksConst(Construct): 7 | 8 | @property 9 | def my_cluster(self): 10 | return self._my_cluster 11 | 12 | def __init__(self, scope: Construct, id:str, eksname: str, eksvpc: ec2.IVpc, noderole: IRole, eks_adminrole: IRole, **kwargs) -> None: 13 | super().__init__(scope, id, **kwargs) 14 | 15 | # 1.Create EKS cluster without node group 16 | self._my_cluster = eks.Cluster(self,'EKS', 17 | vpc= eksvpc, 18 | cluster_name=eksname, 19 | masters_role=eks_adminrole, 20 | output_cluster_name=True, 21 | version= eks.KubernetesVersion.V1_27, 22 | endpoint_access= eks.EndpointAccess.PUBLIC_AND_PRIVATE, 23 | default_capacity=0, 24 | kubectl_layer=KubectlV27Layer(self, 'kubectlV27Layer') 25 | ) 26 | 27 | # 2.Add Managed NodeGroup to EKS, compute resource to run Spark jobs 28 | 
self._my_cluster.add_nodegroup_capacity('onDemand-mn', 29 | nodegroup_name = 'etl-ondemand', 30 | node_role = noderole, 31 | desired_size = 1, 32 | max_size = 5, 33 | disk_size = 50, 34 | instance_types = [ec2.InstanceType('m7g.xlarge')], 35 | labels = {'lifecycle':'OnDemand'}, 36 | # create one nodegroup per AZ, as cluster autoscaler has no control over what AZ ASG will launch instance in. 37 | # if using Karpenter, this is not needed. 38 | subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,one_per_az=True), 39 | tags = {'Name':'OnDemand-'+eksname,'k8s.io/cluster-autoscaler/enabled': 'true', 'k8s.io/cluster-autoscaler/'+eksname: 'owned'} 40 | ) 41 | 42 | # 3. Add Spot managed NodeGroup to EKS (Run Spark exectutor on spot) 43 | self._my_cluster.add_nodegroup_capacity('spot-mn', 44 | nodegroup_name = 'etl-spot', 45 | node_role = noderole, 46 | capacity_type=eks.CapacityType.SPOT, 47 | desired_size = 1, 48 | max_size = 30, 49 | disk_size = 50, 50 | instance_types=[ec2.InstanceType('r5.xlarge'),ec2.InstanceType('r4.xlarge'),ec2.InstanceType('r5a.xlarge')], 51 | subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,one_per_az=True), 52 | labels = {'lifecycle':'Ec2Spot'}, 53 | tags = {'Name':'Spot-'+eksname, 'k8s.io/cluster-autoscaler/enabled': 'true', 'k8s.io/cluster-autoscaler/'+eksname: 'owned'} 54 | ) 55 | self._my_cluster.add_nodegroup_capacity('spot-arm64', 56 | nodegroup_name = 'single-az-graviton', 57 | node_role = noderole, 58 | capacity_type=eks.CapacityType.SPOT, 59 | desired_size = 1, 60 | max_size = 30, 61 | disk_size = 50, 62 | instance_types = [ec2.InstanceType('r7g.xlarge'),ec2.InstanceType('r6g.xlarge'),ec2.InstanceType('r6gd.xlarge')], 63 | # create one nodegroup per AZ, as cluster autoscaler has no control over what AZ ASG will launch instance in. 64 | # if using Karpenter, this is not needed. 65 | subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,availability_zones=sorted(eksvpc.availability_zones)[1:2]), 66 | labels = {'nodegroup':'single-az-graviton', 'lifecycle':'Ec2Spot'}, 67 | tags = {'Name':'single-az-graviton','k8s.io/cluster-autoscaler/enabled': 'true', 'k8s.io/cluster-autoscaler/'+eksname: 'owned'} 68 | ) 69 | 70 | # # 4. 
Add Fargate NodeGroup to EKS, without setup cluster-autoscaler 71 | # self._my_cluster.add_fargate_profile('FargateEnabled', 72 | # selectors =[{ 73 | # "namespace": "spark" 74 | # }], 75 | # fargate_profile_name='sparkETL' 76 | # ) -------------------------------------------------------------------------------- /source/lib/cdk_infra/eks_service_account.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import aws_iam as iam 2 | from constructs import Construct 3 | from aws_cdk.aws_secretsmanager import ISecret 4 | from aws_cdk.aws_eks import ICluster 5 | from lib.util.manifest_reader import * 6 | # import lib.util.override_rule as scan 7 | import os 8 | 9 | class EksSAConst(Construct): 10 | 11 | def __init__(self, scope: Construct, id:str, eks_cluster: ICluster, secret: ISecret, **kwargs,) -> None: 12 | super().__init__(scope, id, **kwargs) 13 | 14 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 15 | 16 | # //************************************v*************************************************************// 17 | # //***************************** SERVICE ACCOUNT, RBAC and IAM ROLES *******************************// 18 | # //****** Associating IAM role to K8s Service Account to provide fine-grain security control ******// 19 | # //***********************************************************************************************// 20 | # Cluster Auto-scaler 21 | self._scaler_sa = eks_cluster.add_service_account('AutoScalerSa', 22 | name='cluster-autoscaler', 23 | namespace='kube-system' 24 | ) 25 | _scaler_role = load_yaml_local(source_dir+'/app_resources/autoscaler-iam-role.yaml') 26 | for statmt in _scaler_role: 27 | self._scaler_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) 28 | 29 | # ALB Ingress 30 | self._alb_sa = eks_cluster.add_service_account('ALBServiceAcct', 31 | name='alb-aws-load-balancer-controller', 32 | namespace='kube-system' 33 | ) 34 | _alb_role = load_yaml_local(source_dir+'/app_resources/alb-iam-role.yaml') 35 | for statmt in _alb_role: 36 | self._alb_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) 37 | 38 | # External secret controller 39 | self._secrets_sa = eks_cluster.add_service_account('ExSecretController', 40 | name='external-secrets-controller', 41 | namespace="kube-system" 42 | ) 43 | self._secrets_sa.node.add_dependency(secret) 44 | _secrets_role = load_yaml_replace_var_local(source_dir+'/app_resources/ex-secret-iam-role.yaml', 45 | fields={"{{secretsmanager}}": secret.secret_arn+"*"} 46 | ) 47 | for statmt in _secrets_role: 48 | self._secrets_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) 49 | 50 | # //************************************v*************************************************************// 51 | # //*********************** Override cfn Nag scan rules for deployment *******************************// 52 | # //***********************************************************************************************// 53 | 54 | # Override Cfn Nag warning W12: IAM policy should not allow * resource 55 | # scan.suppress_cfnnag_rule('W12', 'by default the role scaler_sa has * resource', self._scaler_sa.role.node.find_child('DefaultPolicy').node.default_child) 56 | # scan.suppress_cfnnag_rule('W12', 'by default the role secrets_sa has * resource', self._secrets_sa.role.node.find_child('DefaultPolicy').node.default_child) 57 | # scan.suppress_iam_cfnnag_rule(self._alb_sa.role.node.find_child('DefaultPolicy').node.default_child) 
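The service-account wiring in eks_service_account.py repeats one pattern: parse a YAML file of IAM statements, convert each entry with iam.PolicyStatement.from_json, and attach it to the EKS service account so the pod receives the permissions through IAM Roles for Service Accounts (IRSA). A minimal sketch of that pattern is shown below; the statement is a hypothetical placeholder, not the contents of autoscaler-iam-role.yaml.

from aws_cdk import aws_iam as iam
from aws_cdk.aws_eks import ServiceAccount

# Hypothetical statement for illustration only; the real policies live in
# source/app_resources/*-iam-role.yaml and may differ.
_EXAMPLE_STATEMENT = {
    "Effect": "Allow",
    "Action": ["autoscaling:DescribeAutoScalingGroups"],
    "Resource": "*",
}

def attach_parsed_statement(sa: ServiceAccount) -> None:
    """Attach one parsed IAM statement to an EKS service account (IRSA)."""
    sa.add_to_principal_policy(iam.PolicyStatement.from_json(_EXAMPLE_STATEMENT))

CDK annotates the generated Kubernetes service account with the IAM role ARN, so pods that reference it assume the role through the cluster's OIDC provider rather than through node instance credentials.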
-------------------------------------------------------------------------------- /source/lib/cdk_infra/iam_roles.py: -------------------------------------------------------------------------------- 1 | import typing 2 | from aws_cdk import (Tags, aws_iam as iam) 3 | from constructs import Construct 4 | # import lib.util.override_rule as scan 5 | 6 | class IamConst(Construct): 7 | 8 | @property 9 | def managed_node_role(self): 10 | return self._managed_node_role 11 | 12 | @property 13 | def admin_role(self): 14 | return self._clusterAdminRole 15 | 16 | def __init__(self,scope: Construct, id:str, cluster_name:str, **kwargs,) -> None: 17 | super().__init__(scope, id, **kwargs) 18 | 19 | # EKS admin role 20 | self._clusterAdminRole = iam.Role(self, 'clusterAdmin', 21 | assumed_by= iam.AccountRootPrincipal() 22 | ) 23 | self._clusterAdminRole.add_to_policy(iam.PolicyStatement( 24 | resources=["*"], 25 | actions=[ 26 | "eks:Describe*", 27 | "eks:List*", 28 | "eks:AccessKubernetesApi", 29 | "ssm:GetParameter", 30 | "iam:ListRoles", 31 | "emr-containers:CreateVirtualCluster" 32 | ], 33 | )) 34 | Tags.of(self._clusterAdminRole).add( 35 | key='eks/%s/type' % cluster_name, 36 | value='admin-role' 37 | ) 38 | 39 | # Managed Node Group Instance Role 40 | _managed_node_managed_policies = ( 41 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEKSWorkerNodePolicy'), 42 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEKS_CNI_Policy'), 43 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEC2ContainerRegistryReadOnly'), 44 | iam.ManagedPolicy.from_aws_managed_policy_name('CloudWatchAgentServerPolicy'), 45 | ) 46 | self._managed_node_role = iam.Role(self,'NodeInstance-Role', 47 | path='/', 48 | assumed_by=iam.ServicePrincipal('ec2.amazonaws.com'), 49 | managed_policies=list(_managed_node_managed_policies), 50 | ) 51 | 52 | 53 | # Override Cfn Nag rule 54 | # scan.suppress_cfnnag_rule('W12', 'by default the role has * resource', self._clusterAdminRole.node.find_child('DefaultPolicy').node.default_child) 55 | -------------------------------------------------------------------------------- /source/lib/cdk_infra/network_sg.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (Tags, aws_ec2 as ec2, aws_s3 as s3) 2 | from constructs import Construct 3 | # import lib.util.override_rule as scan 4 | import lib.util.get_aws_managed_prefix as custom 5 | 6 | class NetworkSgConst(Construct): 7 | 8 | @property 9 | def vpc(self): 10 | return self._vpc 11 | @property 12 | def alb_jhub_sg(self): 13 | return self._alb_jhub_sg 14 | @property 15 | def alb_argo_sg(self): 16 | return self._alb_argo_sg 17 | 18 | def __init__(self,scope: Construct, id:str, eksname:str, codebucket: str, **kwargs) -> None: 19 | super().__init__(scope, id, **kwargs) 20 | 21 | # //*************************************************// 22 | # //******************* NETWORK ********************// 23 | # //************************************************// 24 | # create VPC 25 | self._vpc = ec2.Vpc(self, 'eksVpc',max_azs=2, nat_gateways=1) 26 | Tags.of(self._vpc).add('Name', eksname + 'EksVpc') 27 | 28 | # self._log_bucket=s3.Bucket.from_bucket_name(self,'vpc_logbucket', codebucket) 29 | # self._vpc.add_flow_log("FlowLogCloudWatch", 30 | # destination=ec2.FlowLogDestination.to_s3(self._log_bucket,'vpcRejectlog/'), 31 | # traffic_type=ec2.FlowLogTrafficType.REJECT 32 | # ) 33 | 34 | # ALB security group for Jupyter & Argo 35 | prefixlist_peer=ec2.Peer.prefix_list( 36 | 
custom.AwsManagedPrefixList(self,'cr-getprefixId', 37 | custom.AwsManagedPrefixListProps(name='com.amazonaws.global.cloudfront.origin-facing') 38 | ).prefixlist_id 39 | ) 40 | self._alb_jhub_sg=ec2.SecurityGroup(self,'JupyterALBInboundSG', vpc=self._vpc,description='Security Group for Jupyter ALB') 41 | self._alb_argo_sg=ec2.SecurityGroup(self,'ArgoALBInboundSG', vpc=self._vpc,description='Security Group for Argo ALB') 42 | self._alb_jhub_sg.add_ingress_rule(prefixlist_peer,ec2.Port.tcp(port=80)) 43 | self._alb_argo_sg.add_ingress_rule(prefixlist_peer,ec2.Port.tcp(port=2746)) 44 | Tags.of(self._alb_jhub_sg).add('Name','SparkOnEKS-JhubSg') 45 | Tags.of(self._alb_argo_sg).add('Name','SparkOnEKS-ArgoSg') 46 | 47 | # VPC endpoint security group 48 | self._vpc_endpoint_sg = ec2.SecurityGroup(self,'EndpointSg',vpc=self._vpc,description='Security Group for Endpoint') 49 | self._vpc_endpoint_sg.add_ingress_rule(ec2.Peer.ipv4(self._vpc.vpc_cidr_block),ec2.Port.tcp(port=443)) 50 | self._vpc_endpoint_sg.add_ingress_rule(ec2.Peer.ipv4(self._vpc.vpc_cidr_block),ec2.Port.tcp(port=444)) 51 | Tags.of(self._vpc_endpoint_sg).add('Name','SparkOnEKS-VPCEndpointSg') 52 | 53 | # Add VPC endpoint 54 | self._vpc.add_gateway_endpoint("S3GatewayEndpoint", 55 | service=ec2.GatewayVpcEndpointAwsService.S3, 56 | subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC), 57 | ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)]) 58 | 59 | self._vpc.add_interface_endpoint("EcrDockerEndpoint",service=ec2.InterfaceVpcEndpointAwsService.ECR_DOCKER, security_groups=[self._vpc_endpoint_sg]) 60 | self._vpc.add_interface_endpoint("CWLogsEndpoint", service=ec2.InterfaceVpcEndpointAwsService.CLOUDWATCH_LOGS,security_groups=[self._vpc_endpoint_sg]) 61 | self._vpc.add_interface_endpoint("AthenaEndpoint", service=ec2.InterfaceVpcEndpointAwsService.ATHENA,security_groups=[self._vpc_endpoint_sg]) 62 | self._vpc.add_interface_endpoint("KMSEndpoint", service=ec2.InterfaceVpcEndpointAwsService.KMS,security_groups=[self._vpc_endpoint_sg]) 63 | 64 | 65 | # Override Cfn_Nag rule for AWS Solution CICD validation 66 | # for subnet in self._vpc.public_subnets: 67 | # scan.suppress_cfnnag_rule('W33','a public facing ALB is required and ingress from the internet should be permitted.',subnet.node.default_child) 68 | 69 | # self._vpc_endpoint_sg.node.default_child.add_metadata('cfn_nag',{ 70 | # "rules_to_suppress": [ 71 | # { 72 | # "id": "W40", 73 | # "reason": "Egress IP Protocol of -1 is default and generally considered OK" 74 | # }, 75 | # { 76 | # "id": "W5", 77 | # "reason": "Security Groups with cidr open considered OK" 78 | # } 79 | # ] 80 | # }) 81 | -------------------------------------------------------------------------------- /source/lib/cdk_infra/s3_app_code.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (RemovalPolicy, aws_s3 as s3, aws_s3_deployment as s3deploy, aws_kms as kms) 2 | from constructs import Construct 3 | import os 4 | 5 | class S3AppCodeConst(Construct): 6 | 7 | @property 8 | def code_bucket(self): 9 | return self.bucket_name 10 | 11 | @property 12 | def artifact_bucket(self): 13 | return self._artifact_bucket 14 | 15 | # @property 16 | # def s3_deploy_contrust(self): 17 | # return self.deploy 18 | 19 | def __init__(self,scope: Construct, id: str, **kwargs,) -> None: 20 | super().__init__(scope, id, **kwargs) 21 | 22 | # Upload application code to S3 bucket 23 | self._artifact_bucket=s3.Bucket(self, id, 24 | 
block_public_access=s3.BlockPublicAccess.BLOCK_ALL, 25 | encryption=s3.BucketEncryption.KMS_MANAGED, 26 | removal_policy=RemovalPolicy.RETAIN, 27 | access_control = s3.BucketAccessControl.LOG_DELIVERY_WRITE, 28 | object_ownership=s3.ObjectOwnership.OBJECT_WRITER, 29 | versioned=True #required by codebuild 30 | ) 31 | 32 | proj_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0] 33 | self.deploy=s3deploy.BucketDeployment(self, "DeployCode", 34 | sources=[s3deploy.Source.asset(proj_dir+'/deployment/app_code')], 35 | destination_bucket= self.artifact_bucket, 36 | destination_key_prefix="app_code" 37 | ) 38 | self.bucket_name = self.artifact_bucket.bucket_name 39 | 40 | # # Override Cfn_Nag rule for S3 access logging 41 | # self.artifact_bucket.node.default_child.add_metadata('cfn_nag',{ 42 | # "rules_to_suppress": [ 43 | # { 44 | # "id": "W35", 45 | # "reason": "bucket access log stops bucket removal, disable for now" 46 | # }, 47 | # { 48 | # "id": "W51", 49 | # "reason": "bucket access is controled by IAM level" 50 | # } 51 | # ] 52 | # }) 53 | -------------------------------------------------------------------------------- /source/lib/cdk_infra/spark_permission.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (aws_iam as iam) 2 | from constructs import Construct 3 | from aws_cdk.aws_eks import ICluster, KubernetesManifest 4 | from lib.util.manifest_reader import load_yaml_replace_var_local 5 | # import lib.util.override_rule as scan 6 | import os 7 | 8 | class SparkOnEksSAConst(Construct): 9 | 10 | @property 11 | def jupyter_sa(self): 12 | return self._jupyter_sa.service_account_name 13 | 14 | def __init__(self,scope: Construct, id: str, 15 | eks_cluster: ICluster, 16 | login_name: str, 17 | code_bucket: str, 18 | datalake_bucket: str, 19 | **kwargs) -> None: 20 | super().__init__(scope, id, **kwargs) 21 | 22 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 23 | # //******************************************************************************************// 24 | # //************************ SETUP PERMISSION FOR ARC SPARK JOBS ****************************// 25 | # //******* create k8s namespace, service account, and IAM role for service account ********// 26 | # //***************************************************************************************// 27 | 28 | # create k8s namespace 29 | etl_ns = eks_cluster.add_manifest('SparkNamespace',{ 30 | "apiVersion": "v1", 31 | "kind": "Namespace", 32 | "metadata": { 33 | "name": "spark", 34 | "labels": {"name":"spark"} 35 | } 36 | } 37 | ) 38 | jupyter_ns = eks_cluster.add_manifest('jhubNamespace',{ 39 | "apiVersion": "v1", 40 | "kind": "Namespace", 41 | "metadata": { 42 | "name": "jupyter", 43 | "labels": {"name":"spark"} 44 | } 45 | } 46 | ) 47 | 48 | # create k8s service account 49 | self._etl_sa = eks_cluster.add_service_account('ETLSa', 50 | name='arcjob', 51 | namespace='spark' 52 | ) 53 | self._etl_sa.node.add_dependency(etl_ns) 54 | 55 | _etl_rb = KubernetesManifest(self,'ETLRoleBinding', 56 | cluster=eks_cluster, 57 | manifest=load_yaml_replace_var_local(source_dir+'/app_resources/etl-rbac.yaml', 58 | fields= { 59 | "{{MY_SA}}": self._etl_sa.service_account_name 60 | }, 61 | multi_resource=True) 62 | ) 63 | _etl_rb.node.add_dependency(self._etl_sa) 64 | 65 | self._jupyter_sa = eks_cluster.add_service_account('jhubServiceAcct', 66 | # name=login_name, 67 | name='sparkoneks', 68 | namespace='jupyter' 69 | ) 70 | self._jupyter_sa.node.add_dependency(jupyter_ns) 71 | 
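        # The statements loaded below come from a single policy template
        # (app_resources/etl-iam-role.yaml) and are attached to both the Arc ETL
        # and the Jupyter service accounts, so notebooks and scheduled jobs get
        # the same S3 access. When no external data lake bucket is supplied, the
        # solution's code bucket is reused, meaning {{codeBucket}} and
        # {{datalakeBucket}} can resolve to the same bucket.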
72 | # Associate AWS IAM role to K8s Service Account 73 | datalake_bucket=code_bucket if not datalake_bucket.strip() else datalake_bucket 74 | _bucket_setting={ 75 | "{{codeBucket}}": code_bucket, 76 | "{{datalakeBucket}}": datalake_bucket 77 | } 78 | _etl_iam = load_yaml_replace_var_local(source_dir+'/app_resources/etl-iam-role.yaml',fields=_bucket_setting) 79 | for statmnt in _etl_iam: 80 | self._etl_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 81 | self._jupyter_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 82 | 83 | # # //*************************************************************************************// 84 | # # //******************** SETUP PERMISSION FOR NATIVE SPARK JOBS **********************// 85 | # # //***********************************************************************************// 86 | self._spark_sa = eks_cluster.add_service_account('NativeSparkSa', 87 | name='nativejob', 88 | namespace='spark' 89 | ) 90 | self._spark_sa.node.add_dependency(etl_ns) 91 | 92 | _spark_rb = eks_cluster.add_manifest('sparkRoleBinding', 93 | load_yaml_replace_var_local(source_dir+'/app_resources/native-spark-rbac.yaml', 94 | fields= { 95 | "{{MY_SA}}": self._spark_sa.service_account_name 96 | }) 97 | ) 98 | _spark_rb.node.add_dependency(self._spark_sa) 99 | 100 | _native_spark_iam = load_yaml_replace_var_local(source_dir+'/app_resources/native-spark-iam-role.yaml',fields=_bucket_setting) 101 | for statmnt in _native_spark_iam: 102 | self._spark_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 103 | 104 | 105 | # Override Cfn Nag warning W12: IAM policy should not allow * resource 106 | # scan.suppress_cfnnag_rule('W12', 'by default the etl_sa role has * resource', self._etl_sa.role.node.find_child('DefaultPolicy').node.default_child) 107 | # scan.suppress_cfnnag_rule('W12', 'by default the role spark_sa has * resource', self._spark_sa.role.node.find_child('DefaultPolicy').node.default_child) 108 | # scan.suppress_cfnnag_rule('W12', 'by default the role jupyter_sa has * resource', self._jupyter_sa.role.node.find_child('DefaultPolicy').node.default_child) -------------------------------------------------------------------------------- /source/lib/cloud_front_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import ( 2 | NestedStack,Fn, 3 | aws_cloudfront_origins as origins, 4 | aws_cloudfront as cf, 5 | aws_elasticloadbalancingv2 as alb, 6 | aws_s3 as s3 7 | ) 8 | from constructs import Construct 9 | # import lib.util.override_rule as scan 10 | 11 | class NestedStack(NestedStack): 12 | 13 | @property 14 | def jhub_cf(self): 15 | return self._jhub_cf 16 | 17 | @property 18 | def argo_cf(self): 19 | return self._argo_cf 20 | 21 | def __init__(self,scope: Construct, id: str,logbucket: str,argo_alb_dns_name: str, jhub_alb_dns_name: str, **kwargs) -> None: 22 | super().__init__(scope, id, **kwargs) 23 | 24 | # //**********************************************************************************************************// 25 | # //*************************** Add CloudFront to enable HTTPS Endpoint (OPTIONAL) **************************// 26 | # //***** recommended way is to generate your own SSL certificate via AWS Certificate Manager ***************// 27 | # //****************************** add it to the application load balancer *********************************// 28 | # //*******************************************************************************************************// 29 | 
self._bucket=s3.Bucket.from_bucket_name(self,'cf_logbucket', logbucket) 30 | self._jhub_cf = add_distribution(self, 'jhub_dist', jhub_alb_dns_name, 80, self._bucket) 31 | self._argo_cf = add_distribution(self, 'argo_dist', argo_alb_dns_name, 2746, self._bucket) 32 | 33 | def add_distribution(scope: Construct, id: str, alb_dns_name: str, port: int, logbucket: s3.IBucket 34 | ) -> cf.IDistribution: 35 | 36 | load_balancer_arn=Fn.get_att(alb_dns_name,"DNSName") 37 | security_group_id=Fn.get_att(alb_dns_name,"SecurityGroups") 38 | 39 | alb2 = alb.ApplicationLoadBalancer.from_application_load_balancer_attributes(scope, id, 40 | load_balancer_arn=load_balancer_arn.to_string(), 41 | security_group_id=security_group_id.to_string(), 42 | load_balancer_dns_name=alb_dns_name 43 | ) 44 | _origin = origins.LoadBalancerV2Origin(alb2, 45 | http_port=port, 46 | protocol_policy=cf.OriginProtocolPolicy.HTTP_ONLY 47 | ) 48 | dist = cf.Distribution(scope, "CF-"+id, 49 | default_behavior={ 50 | "origin": _origin, 51 | "allowed_methods": cf.AllowedMethods.ALLOW_ALL, 52 | "cache_policy": cf.CachePolicy.CACHING_DISABLED, 53 | "origin_request_policy": cf.OriginRequestPolicy.ALL_VIEWER, 54 | "viewer_protocol_policy": cf.ViewerProtocolPolicy.REDIRECT_TO_HTTPS 55 | }, 56 | minimum_protocol_version=cf.SecurityPolicyProtocol.TLS_V1_2_2019, 57 | enable_logging=True, 58 | log_bucket=logbucket 59 | ) 60 | # Override Cfn_Nag rule for Cloudfront TLS-1.2 (https://github.com/stelligent/cfn_nag/issues/384) 61 | # scan.suppress_cfnnag_rule('W70','the distribution uses CloudFront domain name and automatically sets the policy to TLSv1',dist.node.default_child) 62 | 63 | return dist.distribution_domain_name 64 | 65 | -------------------------------------------------------------------------------- /source/lib/ecr_build/Dockerfile: -------------------------------------------------------------------------------- 1 | #FROM ghcr.io/tripl-ai/arc::latest 2 | FROM ghcr.io/tripl-ai/arc:arc_4.2.0_spark_3.3.4_scala_2.12_hadoop_3.3.2_4.2.1_slim 3 | ENV SPARK_HOME /opt/spark 4 | RUN mkdir -p $SPARK_HOME/work-dir 5 | WORKDIR $SPARK_HOME/work-dir -------------------------------------------------------------------------------- /source/lib/ecr_build/buildspec.yaml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | phases: 3 | install: 4 | commands: 5 | - export BUILDX_VERSION=$(curl --silent "https://api.github.com/repos/docker/buildx/releases/latest" |jq -r .tag_name) 6 | - curl -JLO "https://github.com/docker/buildx/releases/download/$BUILDX_VERSION/buildx-$BUILDX_VERSION.linux-amd64" 7 | - mkdir -p ~/.docker/cli-plugins 8 | - mv "buildx-$BUILDX_VERSION.linux-amd64" ~/.docker/cli-plugins/docker-buildx 9 | - chmod +x ~/.docker/cli-plugins/docker-buildx 10 | # - docker run --privileged --rm tonistiigi/binfmt --install arm64 11 | # To install all the supported platforms: 12 | - docker run --privileged --rm tonistiigi/binfmt --install all 13 | pre_build: 14 | commands: 15 | - echo Logging in to Amazon ECR... 16 | - aws --version 17 | - $(aws ecr get-login --region $AWS_DEFAULT_REGION --no-include-email) 18 | - REPOSITORY_URI=${REPO_ECR} 19 | - COMMIT_HASH=$(echo $CODEBUILD_RESOLVED_SOURCE_VERSION | cut -c 1-7) 20 | - IMAGE_TAG=${COMMIT_HASH:=latest} 21 | build: 22 | commands: 23 | - echo Build started on `date` 24 | - echo Building the Docker image... 
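      # The buildx commands below create a dedicated builder and publish one
      # multi-architecture image (linux/amd64 and linux/arm64), so the same ECR
      # tag can run on both the x86 and Graviton node groups. The binfmt step in
      # the install phase registers the QEMU emulators that make the
      # cross-platform build possible.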
25 | - docker buildx create --use --name multiarch 26 | - docker buildx build --push --platform=linux/amd64,linux/arm64 -t $REPOSITORY_URI:$IMAGE_TAG -t $REPOSITORY_URI:latest . 27 | # - docker build -t $REPOSITORY_URI:latest . 28 | # - docker tag $REPOSITORY_URI:latest $REPOSITORY_URI:$IMAGE_TAG 29 | # post_build: 30 | # commands: 31 | # - echo Build completed on `date` 32 | # - echo Pushing the Docker images... 33 | # - docker push $REPOSITORY_URI:latest 34 | # - docker push $REPOSITORY_URI:$IMAGE_TAG -------------------------------------------------------------------------------- /source/lib/ecr_build/ecr_build_pipeline.py: -------------------------------------------------------------------------------- 1 | ###################################################################################################################### 2 | # Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. # 3 | # # 4 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance # 5 | # with the License. A copy of the License is located at # 6 | # # 7 | # http://www.apache.org/licenses/LICENSE-2.0 # 8 | # # 9 | # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES # 10 | # OR CONDITIONS OF ANY KIND, express o#implied. See the License for the specific language governing permissions # 11 | # and limitations under the License. # # 12 | ###################################################################################################################### 13 | # 14 | from aws_cdk import ( 15 | RemovalPolicy, 16 | Duration, 17 | aws_s3 as s3, 18 | aws_codepipeline as codepipeline, 19 | aws_codebuild as codebuild, 20 | aws_codepipeline_actions as codepipeline_actions, 21 | aws_ecr as ecr, 22 | ) 23 | from constructs import Construct 24 | # import lib.util.override_rule as scan 25 | 26 | class DockerPipelineConstruct(Construct): 27 | 28 | @property 29 | def image_uri(self): 30 | return self.ecr_repo.repository_uri 31 | 32 | def __init__(self,scope: Construct, id: str, codebucket: s3.IBucket, **kwargs,) -> None: 33 | super().__init__(scope, id, **kwargs) 34 | 35 | # 1. Create ECR repositories 36 | self.ecr_repo=ecr.Repository(self,'ECRRepo', 37 | image_scan_on_push=True, 38 | removal_policy=RemovalPolicy.DESTROY 39 | ) 40 | # 2. Setup deployment CI/CD to deploy docker image to ECR 41 | pipeline = codepipeline.Pipeline(self, "Pipeline", 42 | pipeline_name='BuildArcDockerImage', 43 | artifact_bucket=codebucket 44 | ) 45 | image_builder = codebuild.PipelineProject(self,'DockerBuild', 46 | project_name='BuildArcDockerImage', 47 | build_spec=codebuild.BuildSpec.from_source_filename('buildspec.yaml'), 48 | environment=dict( 49 | build_image=codebuild.LinuxBuildImage.AMAZON_LINUX_2_3, 50 | privileged=True 51 | ), 52 | environment_variables={ 53 | 'REPO_ECR': codebuild.BuildEnvironmentVariable(value=self.ecr_repo.repository_uri), 54 | }, 55 | description='Pipeline for docker build', 56 | timeout=Duration.minutes(60) 57 | ) 58 | image_builder.apply_removal_policy(RemovalPolicy.DESTROY) 59 | 60 | # 3. 
grant permissions for the CI/CD 61 | codebucket.grant_read_write(pipeline.role) 62 | codebucket.grant_read_write(image_builder) 63 | self.ecr_repo.grant_pull_push(image_builder) 64 | 65 | source_output=codepipeline.Artifact('src') 66 | pipeline.add_stage( 67 | stage_name='Source', 68 | actions=[ 69 | codepipeline_actions.S3SourceAction( 70 | action_name='S3Trigger', 71 | bucket=codebucket, 72 | bucket_key='app_code/ecr_build_src.zip', 73 | output=source_output, 74 | trigger=codepipeline_actions.S3Trigger.POLL), 75 | ] 76 | ) 77 | pipeline.add_stage( 78 | stage_name='Build', 79 | actions=[ 80 | codepipeline_actions.CodeBuildAction( 81 | action_name='DockerImageBuild', 82 | input=source_output, 83 | project=image_builder 84 | ) 85 | ] 86 | ) 87 | 88 | # Override Cfn Nag warning W12: IAM policy should not allow * resource 89 | # scan.suppress_cfnnag_rule('W12', 'the role for action of ecr:GetAuthorizationToken requires * resource', image_builder.role.node.find_child('DefaultPolicy').node.default_child) 90 | 91 | # image_builder.role.node.find_child('DefaultPolicy').node.default_child.add_metadata('cfn_nag',{ 92 | # "rules_to_suppress": [ 93 | # { 94 | # "id": "W12", 95 | # "reason": "the role for action of ecr:GetAuthorizationToken requires * resource" 96 | # }, 97 | # { 98 | # "id": "W76", 99 | # "reason": "the IAM policy is complex, need to be higher than 25" 100 | # } 101 | # ] 102 | # }) -------------------------------------------------------------------------------- /source/lib/solution_helper/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import uuid 3 | import requests 4 | from copy import copy 5 | from crhelper import CfnResource 6 | from datetime import datetime 7 | 8 | logger = logging.getLogger(__name__) 9 | helper = CfnResource(json_logging=True, log_level='INFO') 10 | METRICS_ENDPOINT = "https://metrics.awssolutionsbuilder.com/generic" 11 | 12 | def _sanitize_data(resource_properties): 13 | resource_properties.pop("ServiceToken", None) 14 | resource_properties.pop("Resource", None) 15 | 16 | # Solution ID and unique ID are sent separately 17 | resource_properties["Data"].pop("Solution", None) 18 | resource_properties["Data"].pop("UUID", None) 19 | 20 | return resource_properties 21 | 22 | @helper.create 23 | @helper.update 24 | @helper.delete 25 | def custom_resource(event, _): 26 | # print("Received event: " + json.dumps(event, indent=2)) 27 | request_type = event["RequestType"] 28 | resource_properties = event["ResourceProperties"] 29 | resource = resource_properties["Resource"] 30 | 31 | # One UUID per CFN deployment 32 | if resource == "UUID" and request_type == "Create": 33 | random_id = str(uuid.uuid4()) 34 | helper.Data.update({"UUID":random_id}) 35 | elif resource == "AnonymousMetric": 36 | try: 37 | metrics_data = _sanitize_data(copy(resource_properties)) 38 | metrics_data["CFTemplate"]= request_type + "d" 39 | headers = {"Content-Type": "application/json"} 40 | payload = { 41 | "Solution": resource_properties["Solution"], 42 | "UUID": resource_properties["UUID"], 43 | "TimeStamp": datetime.utcnow().isoformat(), 44 | **metrics_data 45 | } 46 | logger.info(f'Sending payload: {payload}') 47 | response = requests.post(METRICS_ENDPOINT, json=payload, headers=headers) 48 | logger.info(f'Response from metrics endpoint: {response.status_code} {response.reason}') 49 | except requests.exceptions.RequestException: 50 | logger.exception("Could not send usage data") 51 | except Exception: 52 | 
logger.exception("Unknown error when trying to send usage data") 53 | 54 | def handler(event, context): 55 | try: 56 | helper(event, context) 57 | return {"Data": helper.Data} 58 | except Exception as error: 59 | logger.exception("[handler] failed: {error}") 60 | 61 | -------------------------------------------------------------------------------- /source/lib/solution_helper/requirements.txt: -------------------------------------------------------------------------------- 1 | #requests==2.31.0 2 | #crhelper==2.0.11 3 | #urllib3==1.26.15 -------------------------------------------------------------------------------- /source/lib/solution_helper/solution_metrics.py: -------------------------------------------------------------------------------- 1 | from constructs import Construct 2 | from aws_cdk.aws_s3 import IBucket 3 | from lib.util.conditional_resources import Condition 4 | from aws_cdk import ( 5 | aws_lambda as _lambda, 6 | custom_resources as _custom_resources, 7 | aws_ec2 as _ec2 8 | ) 9 | from aws_cdk import ( 10 | Aspects, 11 | Fn, 12 | Duration, 13 | CfnMapping, 14 | CfnCondition, 15 | CustomResource, 16 | Duration, 17 | RemovalPolicy 18 | ) 19 | import lib.util.override_rule as scan 20 | 21 | class SendAnonymousData(Construct): 22 | 23 | @property 24 | def UUID(self): 25 | return self._uuid 26 | 27 | def __init__(self,scope: Construct, id: str, vpc: _ec2.IVpc, codebucket: IBucket, s3_deploy, metrics) -> None: 28 | super().__init__(scope, id) 29 | 30 | self._metrics_mapping = CfnMapping(self, 'AnonymousData',mapping={'SendAnonymousData': {'Data': 'Yes'}}) 31 | self._metrics_condition = CfnCondition(self, 'AnonymousDatatoAWS', 32 | expression=Fn.condition_equals(self._metrics_mapping.find_in_map('SendAnonymousData','Data'),'Yes') 33 | ) 34 | 35 | self._helper_func = _lambda.SingletonFunction(self, 'SolutionHelper', 36 | uuid='75248a81-9138-468c-9ba1-bca6c7137599', 37 | runtime= _lambda.Runtime.PYTHON_3_8, 38 | handler= 'lambda_function.handler', 39 | description= 'This function generates UUID for each deployment and sends anonymous data to the AWS Solutions team', 40 | code= _lambda.Code.from_bucket(bucket=codebucket,key='app_code/solution_helper.zip'), 41 | vpc=vpc, 42 | timeout=Duration.seconds(30) 43 | ) 44 | self._helper_func.add_dependency(s3_deploy) 45 | 46 | self._lambda_provider = _custom_resources.Provider( 47 | self, 'LambdaProvider', 48 | on_event_handler=self._helper_func, 49 | vpc=vpc 50 | ) 51 | 52 | self._uuid = CustomResource(self, 'UUIDCustomResource', 53 | service_token=self._lambda_provider.service_token, 54 | properties={ 55 | "Resource": "UUID" 56 | }, 57 | resource_type="Custom::CreateUUID", 58 | removal_policy=RemovalPolicy.DESTROY 59 | ) 60 | 61 | self._send_data = CustomResource(self, 'SendDataCustomResource', 62 | service_token=self._lambda_provider.service_token, 63 | properties={ 64 | "Resource": "AnonymousMetric", 65 | "UUID": self._uuid.get_att_string("UUID"), 66 | "Solution": metrics["Solution"], 67 | "Data": metrics 68 | }, 69 | resource_type= 'Custom::AnonymousData', 70 | removal_policy=RemovalPolicy.DESTROY 71 | ) 72 | self._send_data.node.add_dependency(self._uuid) 73 | 74 | Aspects.of(self._helper_func).add(Condition(self._metrics_condition)) 75 | Aspects.of(self._uuid).add(Condition(self._metrics_condition)) 76 | Aspects.of(self._send_data).add(Condition(self._metrics_condition)) -------------------------------------------------------------------------------- /source/lib/spark_on_eks_stack.py: 
-------------------------------------------------------------------------------- 1 | from aws_cdk import (Stack, CfnOutput, Duration, RemovalPolicy, Aws, Fn, CfnParameter, aws_eks as eks,aws_secretsmanager as secmger,aws_kms as kms) 2 | from constructs import Construct 3 | from lib.cdk_infra.network_sg import NetworkSgConst 4 | from lib.cdk_infra.iam_roles import IamConst 5 | from lib.cdk_infra.eks_cluster import EksConst 6 | from lib.cdk_infra.eks_service_account import EksSAConst 7 | from lib.cdk_infra.eks_base_app import EksBaseAppConst 8 | from lib.cdk_infra.s3_app_code import S3AppCodeConst 9 | from lib.cdk_infra.spark_permission import SparkOnEksSAConst 10 | from lib.ecr_build.ecr_build_pipeline import DockerPipelineConstruct 11 | from lib.cloud_front_stack import NestedStack 12 | from lib.util.manifest_reader import * 13 | # from lib.util import override_rule as scan 14 | # from lib.solution_helper import solution_metrics 15 | import json, os 16 | 17 | class SparkOnEksStack(Stack): 18 | 19 | @property 20 | def code_bucket(self): 21 | return self.app_s3.code_bucket 22 | 23 | @property 24 | def argo_url(self): 25 | return self._argo_alb.value 26 | 27 | @property 28 | def jhub_url(self): 29 | return self._jhub_alb.value 30 | 31 | def __init__(self, scope: Construct, id: str, eksname: str, solution_id: str, version: str, **kwargs) -> None: 32 | super().__init__(scope, id, **kwargs) 33 | 34 | self.template_options.description = "(SO0141) SQL based ETL with Apache Spark on Amazon EKS. This solution provides a SQL based ETL option with a open-source declarative framework powered by Apache Spark." 35 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 36 | 37 | # Cloudformation input params 38 | datalake_bucket = CfnParameter(self, "datalakebucket", type="String", 39 | description="Your existing S3 bucket to be accessed by Jupyter Notebook and ETL job. Default: blank", 40 | default="" 41 | ) 42 | login_name = "sparkoneks" 43 | # login_name = CfnParameter(self, "jhubuser", type="String", 44 | # description="Your username login to jupyter hub. Only alphanumeric characters are allowed", 45 | # default="sparkoneks" 46 | # ) 47 | 48 | # Auto-generate a user login in secrets manager 49 | key = kms.Key(self, 'KMSKey',removal_policy=RemovalPolicy.DESTROY,enable_key_rotation=True) 50 | key.add_alias("alias/secretsManager") 51 | jhub_secret = secmger.Secret(self, 'jHubPwd', 52 | generate_secret_string=secmger.SecretStringGenerator( 53 | exclude_punctuation=True, 54 | secret_string_template=json.dumps({'username': login_name}), 55 | generate_string_key="password"), 56 | removal_policy=RemovalPolicy.DESTROY, 57 | encryption_key=key 58 | ) 59 | 60 | # 1. a new bucket to store app code and logs 61 | self.app_s3 = S3AppCodeConst(self,'appcode') 62 | 63 | # 2. push docker image to ECR via AWS CICD pipeline 64 | ecr_image = DockerPipelineConstruct(self,'image', self.app_s3.artifact_bucket) 65 | ecr_image.node.add_dependency(self.app_s3) 66 | CfnOutput(self,'IMAGE_URI', value=ecr_image.image_uri) 67 | 68 | # 3. EKS base infrastructure 69 | network_sg = NetworkSgConst(self,'network-sg', eksname, self.app_s3.code_bucket) 70 | iam = IamConst(self,'iam_roles', eksname) 71 | eks_cluster = EksConst(self,'eks_cluster', eksname, network_sg.vpc, iam.managed_node_role, iam.admin_role) 72 | EksSAConst(self, 'eks_sa', eks_cluster.my_cluster, jhub_secret) 73 | base_app=EksBaseAppConst(self, 'eks_base_app', eks_cluster.my_cluster) 74 | 75 | # 4. 
Spark app access control 76 | app_security = SparkOnEksSAConst(self,'spark_service_account', 77 | eks_cluster.my_cluster, 78 | login_name, 79 | self.app_s3.code_bucket, 80 | datalake_bucket.value_as_string 81 | ) 82 | app_security.node.add_dependency(base_app.secret_created) 83 | # 5. Install Arc Jupyter notebook in EKS 84 | jhub_install= eks_cluster.my_cluster.add_helm_chart('JHubChart', 85 | chart='jupyterhub', 86 | repository='https://jupyterhub.github.io/helm-chart', 87 | release='jhub', 88 | version='1.2.0', 89 | namespace='jupyter', 90 | create_namespace=False, 91 | values=load_yaml_replace_var_local(source_dir+'/app_resources/jupyter-values.yaml', 92 | fields={ 93 | "{{codeBucket}}": self.app_s3.code_bucket, 94 | "{{region}}": Aws.REGION 95 | }) 96 | ) 97 | jhub_install.node.add_dependency(app_security) 98 | # EKS get Jupyter login dynamically from secrets manager 99 | name_parts=Fn.split('-',jhub_secret.secret_name) 100 | name_no_suffix=Fn.join('-',[Fn.select(0, name_parts), Fn.select(1, name_parts)]) 101 | 102 | config_hub = eks.KubernetesManifest(self,'JHubConfig', 103 | cluster=eks_cluster.my_cluster, 104 | manifest=load_yaml_replace_var_local(source_dir+'/app_resources/jupyter-config.yaml', 105 | fields= { 106 | "{{MY_SA}}": app_security.jupyter_sa, 107 | "{{REGION}}": Aws.REGION, 108 | "{{SECRET_NAME}}": name_no_suffix, 109 | "{{INBOUND_SG}}": network_sg.alb_jhub_sg.security_group_id 110 | }, 111 | multi_resource=True) 112 | ) 113 | config_hub.node.add_dependency(jhub_install) 114 | 115 | # 6. Install ETL orchestrator - Argo in EKS 116 | # can be replaced by other workflow tool, eg. Airflow 117 | argo_install = eks_cluster.my_cluster.add_helm_chart('ARGOChart', 118 | chart='argo-workflows', 119 | repository='https://argoproj.github.io/argo-helm', 120 | release='argo', 121 | version='0.40.10', 122 | namespace='argo', 123 | create_namespace=True, 124 | values=load_yaml_replace_var_local(source_dir+'/app_resources/argo-values.yaml', 125 | fields= { 126 | "{{INBOUND_SG}}": network_sg.alb_argo_sg.security_group_id 127 | }) 128 | ) 129 | argo_install.node.add_dependency(config_hub) 130 | # Create argo workflow template for Spark with T-shirt size 131 | submit_tmpl = eks_cluster.my_cluster.add_manifest('SubmitSparkWrktmpl', 132 | load_yaml_local(source_dir+'/app_resources/spark-template.yaml') 133 | ) 134 | submit_tmpl.node.add_dependency(argo_install) 135 | 136 | # 7. (OPTIONAL) retrieve ALB DNS Name to enable CloudFront in the nested stack. 137 | # It is used to serve HTTPS requests with its default domain name. 138 | # Recommend to issue your own TLS certificate, and delete the CF components. 139 | self._jhub_alb=eks.KubernetesObjectValue(self, 'jhubALB', 140 | cluster=eks_cluster.my_cluster, 141 | json_path='..status.loadBalancer.ingress[0].hostname', 142 | object_type='ingress.networking', 143 | object_name='jupyterhub', 144 | object_namespace='jupyter', 145 | timeout=Duration.minutes(10) 146 | ) 147 | self._jhub_alb.node.add_dependency(config_hub) 148 | 149 | self._argo_alb = eks.KubernetesObjectValue(self, 'argoALB', 150 | cluster=eks_cluster.my_cluster, 151 | json_path='..status.loadBalancer.ingress[0].hostname', 152 | object_type='ingress.networking', 153 | object_name='argo-argo-workflows-server', 154 | object_namespace='argo', 155 | timeout=Duration.minutes(10) 156 | ) 157 | self._argo_alb.node.add_dependency(argo_install) 158 | 159 | # 8. (OPTIONAL) Send solution metrics to AWS 160 | # turn it off from the CloudFormation mapping section if prefer. 
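# A sketch of the opt-out mentioned above (assumed, not part of the original file): solution_metrics.py
# gates the helper Lambda and both custom resources on the 'AnonymousDatatoAWS' condition, which simply
# checks the 'AnonymousData' mapping value. Metrics can therefore be disabled either by editing the
# synthesized template's Mappings section from 'Yes' to 'No', or by changing the mapping default at source, e.g.
# CfnMapping(self, 'AnonymousData', mapping={'SendAnonymousData': {'Data': 'No'}})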
161 | # send_metrics=solution_metrics.SendAnonymousData(self,"SendMetrics", network_sg.vpc, self.app_s3.artifact_bucket,self.app_s3.s3_deploy_contrust, 162 | # metrics={ 163 | # "Solution": solution_id, 164 | # "Region": Aws.REGION, 165 | # "SolutionVersion": version, 166 | # "UUID": "MY_UUID", 167 | # "UseDataLakeBucket": "True" if not datalake_bucket.value_as_string else "False", 168 | # "UseAWSCICD": "True" if ecr_image.image_uri else "False", 169 | # "NoAZs": len(network_sg.vpc.availability_zones) 170 | # } 171 | # ) 172 | # send_metrics.node.add_dependency(self.app_s3.s3_deploy_contrust) 173 | 174 | # 9. (OPTIONAL) Override the cfn Nag rules for AWS Solution CICD deployment 175 | # remove the section if your CI/CD pipeline doesn't use the cfn_nag utility to validate the CFN. 176 | # k8s_ctl_node=self.node.find_child('@aws-cdk--aws-eks.KubectlProvider') 177 | # cluster_resrc_node=self.node.find_child('@aws-cdk--aws-eks.ClusterResourceProvider') 178 | # scan.suppress_cfnnag_rule('W12', 'by default the role has * resource', self.node.find_child('eks_cluster').node.find_child('EKS').node.default_child.node.find_child('CreationRole').node.find_child('DefaultPolicy').node.default_child) 179 | # scan.suppress_cfnnag_rule('W11', 'by default the role has * resource', self.node.find_child('Custom::AWSCDKOpenIdConnectProviderCustomResourceProvider').node.find_child('Role')) 180 | # scan.suppress_lambda_cfnnag_rule(k8s_ctl_node.node.find_child('Handler').node.default_child) 181 | # scan.suppress_lambda_cfnnag_rule(k8s_ctl_node.node.find_child('Provider').node.find_child('framework-onEvent').node.default_child) 182 | # scan.suppress_lambda_cfnnag_rule(self.node.find_child('Custom::CDKBucketDeployment8693BB64968944B69AAFB0CC9EB8756C').node.default_child) 183 | # # scan.suppress_lambda_cfnnag_rule(self.node.find_child('Custom::S3AutoDeleteObjectsCustomResourceProvider').node.find_child('Handler')) 184 | # scan.suppress_lambda_cfnnag_rule(self.node.find_child('Custom::AWSCDKOpenIdConnectProviderCustomResourceProvider').node.find_child('Handler')) 185 | # scan.suppress_lambda_cfnnag_rule(self.node.find_child('AWSCDKCfnUtilsProviderCustomResourceProvider').node.find_child('Handler')) 186 | # scan.suppress_lambda_cfnnag_rule(cluster_resrc_node.node.find_child('OnEventHandler').node.default_child) 187 | # scan.suppress_lambda_cfnnag_rule(cluster_resrc_node.node.find_child('IsCompleteHandler').node.default_child) 188 | # scan.suppress_lambda_cfnnag_rule(cluster_resrc_node.node.find_child('Provider').node.find_child('framework-isComplete').node.default_child) 189 | # scan.suppress_lambda_cfnnag_rule(cluster_resrc_node.node.find_child('Provider').node.find_child('framework-onTimeout').node.default_child) 190 | # scan.suppress_lambda_cfnnag_rule(cluster_resrc_node.node.find_child('Provider').node.find_child('framework-onEvent').node.default_child) 191 | # scan.suppress_network_cfnnag_rule(self.node.find_child('eks_cluster').node.find_child('EKS').node.find_child('ControlPlaneSecurityGroup').node.default_child) 192 | # scan.suppress_lambda_cfnnag_rule(self.node.find_child('SendMetrics').node.find_child('LambdaProvider').node.find_child('framework-onEvent').node.default_child) 193 | # scan.suppress_network_cfnnag_rule(self.node.find_child('SendMetrics').node.find_child('LambdaProvider').node.find_child('framework-onEvent').node.find_child('SecurityGroup').node.default_child) 194 | # scan.suppress_lambda_cfnnag_rule(self.node.find_child('SingletonLambda75248a819138468c9ba1bca6c7137599').node.default_child) 195 
| # scan.suppress_network_cfnnag_rule(self.node.find_child('SingletonLambda75248a819138468c9ba1bca6c7137599').node.find_child('SecurityGroup').node.default_child) 196 | -------------------------------------------------------------------------------- /source/lib/util/conditional_resources.py: -------------------------------------------------------------------------------- 1 | import jsii 2 | from constructs import IConstruct 3 | from aws_cdk import CfnCondition, CfnResource, IAspect 4 | 5 | # This aspect lets `Aspects.of(...).add(...)` apply a CloudFormation condition to a resource, 6 | # so that some resources are provisioned only when the condition is true. 7 | # For example, if the PROVISIONTYPE parameter is 'Git', we provision CodePipeline 8 | # with its source stage being CodeCommit or GitHub. 9 | # https://docs.aws.amazon.com/cdk/latest/guide/aspects.html 10 | 11 | 12 | @jsii.implements(IAspect) 13 | class Condition: 14 | def __init__(self, condition: CfnCondition): 15 | self._condition = condition 16 | 17 | def visit(self, node: IConstruct): 18 | child = node.node.default_child # type: CfnResource 19 | if child: 20 | child.cfn_options.condition = self._condition 21 | -------------------------------------------------------------------------------- /source/lib/util/get_aws_managed_prefix.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (Aws, aws_ec2 as ec2,aws_iam as iam, Fn) 2 | from aws_cdk.custom_resources import ( 3 | AwsCustomResource, 4 | AwsCustomResourcePolicy, 5 | PhysicalResourceId, 6 | AwsSdkCall 7 | ) 8 | from constructs import Construct 9 | 10 | class AwsManagedPrefixListProps: 11 | def __init__(self, name: str): 12 | """ 13 | Name of the AWS managed prefix list. 14 | See: https://docs.aws.amazon.com/vpc/latest/userguide/working-with-aws-managed-prefix-lists.html#available-aws-managed-prefix-lists 15 | eg.
com.amazonaws.global.cloudfront.origin-facing 16 | """ 17 | self.name = name 18 | 19 | class AwsManagedPrefixList(Construct): 20 | def __init__(self, scope: Construct, id: str, props: AwsManagedPrefixListProps): 21 | super().__init__(scope, id) 22 | res = AwsCustomResource( 23 | self, 'AWSCustomResource', 24 | on_create=self.create(props), 25 | policy=AwsCustomResourcePolicy.from_statements([ 26 | iam.PolicyStatement( 27 | effect=iam.Effect.ALLOW, 28 | actions=['ec2:DescribeManagedPrefixLists'], 29 | resources=['*'], 30 | ), 31 | ]) 32 | ) 33 | self.prefixlist_id=res.get_response_field("PrefixLists.0.PrefixListId") 34 | 35 | def create(self, props): 36 | custom_params = { 37 | 'Filters': [ 38 | { 39 | 'Name': 'prefix-list-name', 40 | 'Values': [props.name], 41 | }, 42 | ] 43 | } 44 | 45 | return AwsSdkCall( 46 | service='EC2', 47 | action='describeManagedPrefixLists', 48 | parameters=custom_params, 49 | physical_resource_id=PhysicalResourceId.of(f"{id}-{Fn.select(0, Fn.split(':', self.node.addr))}"), 50 | region=Aws.REGION 51 | ) 52 | 53 | -------------------------------------------------------------------------------- /source/lib/util/manifest_reader.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import urllib.request as request 3 | import os.path as path 4 | import sys 5 | 6 | def load_yaml_remotely(url, multi_resource=False): 7 | try: 8 | file_to_parse = request.urlopen(url) 9 | if multi_resource: 10 | yaml_data = list(yaml.full_load_all(file_to_parse)) 11 | else: 12 | yaml_data = yaml.full_load(file_to_parse) 13 | # print(yaml_data) 14 | except: 15 | print("Cannot read yaml config file {}, check formatting." 16 | "".format(file_to_parse)) 17 | sys.exit(1) 18 | 19 | return yaml_data 20 | 21 | def load_yaml_local(yaml_file, multi_resource=False): 22 | 23 | file_to_parse=path.join(path.dirname(__file__), yaml_file) 24 | if not path.exists(file_to_parse): 25 | print("The file {} does not exist" 26 | "".format(file_to_parse)) 27 | sys.exit(1) 28 | 29 | try: 30 | with open(file_to_parse, 'r') as yaml_stream: 31 | if multi_resource: 32 | yaml_data = list(yaml.full_load_all(yaml_stream)) 33 | else: 34 | yaml_data = yaml.full_load(yaml_stream) 35 | # print(yaml_data) 36 | except: 37 | print("Cannot read yaml config file {}, check formatting." 
38 | "".format(file_to_parse)) 39 | sys.exit(1) 40 | 41 | return yaml_data 42 | 43 | def load_yaml_replace_var_remotely(url, fields, multi_resource=False): 44 | try: 45 | with request.urlopen(url) as f: 46 | file_to_replace = f.read().decode('utf-8') 47 | for searchwrd,replwrd in fields.items(): 48 | file_to_replace = file_to_replace.replace(searchwrd, replwrd) 49 | 50 | if multi_resource: 51 | yaml_data = list(yaml.full_load_all(file_to_replace)) 52 | else: 53 | yaml_data = yaml.full_load(file_to_replace) 54 | # print(yaml_data) 55 | except request.URLError as e: 56 | print(e.reason) 57 | sys.exit(1) 58 | 59 | return yaml_data 60 | 61 | 62 | def load_yaml_replace_var_local(yaml_file, fields, multi_resource=False, write_output=False): 63 | 64 | file_to_replace=path.join(path.dirname(__file__), yaml_file) 65 | if not path.exists(file_to_replace): 66 | print("The file {} does not exist" 67 | "".format(file_to_replace)) 68 | sys.exit(1) 69 | 70 | try: 71 | with open(file_to_replace, 'r') as f: 72 | filedata = f.read() 73 | 74 | for searchwrd, replwrd in fields.items(): 75 | filedata = filedata.replace(searchwrd, replwrd) 76 | if multi_resource: 77 | yaml_data = list(yaml.full_load_all(filedata)) 78 | else: 79 | yaml_data = yaml.full_load(filedata) 80 | if write_output: 81 | with open(file_to_replace, "w") as f: 82 | yaml.dump(yaml_data, f, default_flow_style=False, allow_unicode = True, sort_keys=False) 83 | 84 | # print(yaml_data) 85 | except request.URLError as e: 86 | print(e.reason) 87 | sys.exit(1) 88 | 89 | return yaml_data 90 | -------------------------------------------------------------------------------- /source/lib/util/override_rule.py: -------------------------------------------------------------------------------- 1 | from constructs import IConstruct 2 | def suppress_cfnnag_rule(rule_id: str, reason: str, cnstrt: IConstruct): 3 | cnstrt.add_metadata('cfn_nag',{ 4 | "rules_to_suppress": [{ 5 | "id": rule_id, 6 | "reason": reason 7 | }] 8 | }) 9 | 10 | def suppress_lambda_cfnnag_rule(cnstrt: IConstruct): 11 | cnstrt.add_metadata('cfn_nag',{ 12 | "rules_to_suppress": [ 13 | { 14 | "id": "W58", 15 | "reason": "service role has permission to write logs to CloudWatch" 16 | }, 17 | { 18 | "id": "W89", 19 | "reason": "interal function does not need to associate to VPC" 20 | }, 21 | { 22 | "id": "W92", 23 | "reason": "Setting up ReservedConcurrentExecutions is out of reach with the internal function created by CDK" 24 | } 25 | ] 26 | }) 27 | 28 | def suppress_network_cfnnag_rule(cnstrt: IConstruct): 29 | cnstrt.add_metadata('cfn_nag',{ 30 | "rules_to_suppress": [ 31 | { 32 | "id": "W40", 33 | "reason": "Egress IP Protocol of -1 is default and generally considered OK" 34 | }, 35 | { 36 | "id": "W5", 37 | "reason": "The Security Group with cidr open considered OK" 38 | } 39 | ] 40 | }) 41 | 42 | def suppress_iam_cfnnag_rule(cnstrt: IConstruct): 43 | cnstrt.add_metadata('cfn_nag',{ 44 | "rules_to_suppress": [ 45 | { 46 | "id": "W12", 47 | "reason": "by default the role scaler_sa has * resource" 48 | }, 49 | { 50 | "id": "W76", 51 | "reason": "standard IAM role offered by ALB ingress controller" 52 | } 53 | ] 54 | }) -------------------------------------------------------------------------------- /source/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "aws-cdk": "^2.105.0", 4 | "vm2": "^3.9.10" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- 
/source/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | pytest -------------------------------------------------------------------------------- /source/run-all-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script runs all tests for the root CDK project, as well as any microservices, Lambda functions, or dependency 4 | # source code packages. These include unit tests, integration tests, and snapshot tests. 5 | # 6 | # It is important that this script be tested and validated to ensure that all available test fixtures are run. 7 | # 8 | 9 | [ "$DEBUG" == 'true' ] && set -x 10 | set -e 11 | 12 | setup_python_env() { 13 | if [ -d "./.venv-test" ]; then 14 | echo "Reusing already setup python venv in ./.venv-test. Delete ./.venv-test if you want a fresh one created." 15 | return 16 | fi 17 | echo "Setting up python venv-test" 18 | python3 -m venv .venv-test 19 | echo "Initiating virtual environment" 20 | source .venv-test/bin/activate 21 | echo "Installing python packages" 22 | pip3 install -e source 23 | echo "deactivate virtual environment" 24 | deactivate 25 | } 26 | 27 | setup_and_activate_python_env() { 28 | # module_path=$1 29 | # cd $module_path 30 | 31 | [ "${CLEAN:-true}" = "true" ] && rm -fr .venv-test 32 | 33 | setup_python_env 34 | 35 | echo "Initiating virtual environment" 36 | source .venv-test/bin/activate 37 | } 38 | 39 | 40 | run_python_test() { 41 | module_path=$(pwd) 42 | module_name=${1} 43 | echo $1 44 | echo "------------------------------------------------------------------------------" 45 | echo "[Test] Python path=$module_path module=$module_name" 46 | echo "------------------------------------------------------------------------------" 47 | 48 | 49 | # setup coverage report path 50 | mkdir -p $source_dir/test/coverage-reports 51 | coverage_report_path=$source_dir/test/coverage-reports/$module_name.coverage.xml 52 | 53 | echo "coverage report path set to $coverage_report_path" 54 | 55 | # Use -vv for debugging 56 | python3 -m pytest --cov --cov-report=term-missing --cov-report "xml:$coverage_report_path" 57 | 58 | if [ "$?" = "1" ]; then 59 | echo "(source/run-all-tests.sh) ERROR: there is likely output above." 1>&2 60 | exit 1 61 | fi 62 | sed -i -e "s,$source_dir,source,g" $coverage_report_path 63 | } 64 | 65 | run_cdk_project_test() { 66 | component_description=$1 67 | echo "------------------------------------------------------------------------------" 68 | echo "[Test] $component_description" 69 | echo "------------------------------------------------------------------------------" 70 | [ "${CLEAN:-true}" = "true" ] && npm run clean 71 | npm install 72 | npm run build 73 | npm run test -- -u 74 | if [ "$?" = "1" ]; then 75 | echo "(source/run-all-tests.sh) ERROR: there is likely output above." 
1>&2 76 | exit 1 77 | fi 78 | [ "${CLEAN:-true}" = "true" ] && rm -fr coverage 79 | } 80 | 81 | run_source_unit_test() { 82 | echo "------------------------------------------------------------------------------" 83 | echo "[Test] Run source unit tests" 84 | echo "------------------------------------------------------------------------------" 85 | 86 | # Test the functions 87 | cd $source_dir 88 | for folder in */; do 89 | if [ "$folder" = "test/" ]; then 90 | echo "------------------------------------------------------------------------------" 91 | echo "[Test] Run tests against $folder" 92 | echo "------------------------------------------------------------------------------" 93 | pip3 install -r $folder/requirement-test.txt 94 | run_python_test $folder 95 | rm -rf *.egg-info 96 | fi 97 | cd $source_dir 98 | done 99 | } 100 | 101 | # Clean the test environment before running tests and after finished running tests 102 | # The variable is option with default of 'true'. It can be overwritten by caller 103 | # setting the CLEAN environment variable. For example 104 | # $ CLEAN=true ./run-all-tests.sh 105 | # or 106 | # $ CLEAN=false ./run-all-tests.sh 107 | # 108 | CLEAN="${CLEAN:-true}" 109 | 110 | setup_and_activate_python_env 111 | source_dir=$PWD/source 112 | cd $source_dir 113 | 114 | python --version 115 | run_source_unit_test 116 | 117 | # Return to the root/ level where we started 118 | cd $source_dir -------------------------------------------------------------------------------- /source/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | import setuptools 4 | 5 | try: 6 | with open("../README.md") as fp: 7 | long_description = fp.read() 8 | except IOError as e: 9 | long_description = '' 10 | 11 | setuptools.setup( 12 | name="sql-based-etl", 13 | version="2.0.0", 14 | 15 | description="A CDK v2 Python app for SQL-based ETL", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | 19 | author="meloyang", 20 | 21 | package_dir={"": "./"}, 22 | packages=setuptools.find_packages(where="./"), 23 | 24 | install_requires=[ 25 | "aws-cdk-lib==2.105.0", 26 | "aws-cdk.lambda-layer-kubectl-v27==2.1.0", 27 | "constructs>=10.0.0,<11.0.0", 28 | "pyyaml==6.0.1" 29 | ], 30 | 31 | python_requires=">=3.7", 32 | 33 | classifiers=[ 34 | "Development Status :: 4 - Beta", 35 | 36 | "Intended Audience :: Developers", 37 | 38 | "License :: OSI Approved :: MIT License", 39 | 40 | "Programming Language :: JavaScript", 41 | "Programming Language :: Python :: 3 :: Only", 42 | "Programming Language :: Python :: 3.7", 43 | "Programming Language :: Python :: 3.8", 44 | "Programming Language :: Python :: 3.9", 45 | "Programming Language :: Python :: 3.10", 46 | "Programming Language :: Python :: 3.11", 47 | 48 | "Topic :: Software Development :: Code Generators", 49 | "Topic :: Utilities", 50 | 51 | "Typing :: Typed", 52 | ], 53 | ) 54 | --------------------------------------------------------------------------------
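setup.py above packages the CDK app; the stack class listed earlier in spark_on_eks_stack.py is wired up by source/app.py, which is not reproduced here. As rough orientation only, a reduced sketch of that wiring based solely on the constructor signature shown above (construct IDs and the cluster name are placeholders, and the real app.py also attaches the CloudFront nested stack):

```python
# Reduced sketch of a CDK v2 entry point for SparkOnEksStack (placeholders only; the
# repository's source/app.py is the authoritative version and does more wiring).
import aws_cdk as cdk
from lib.spark_on_eks_stack import SparkOnEksStack

app = cdk.App()
SparkOnEksStack(app, "SparkOnEKS",
                eksname="spark-on-eks",   # placeholder cluster name
                solution_id="SO0141",     # matches the template description above
                version="2.0.0")          # matches setup.py
app.synth()
```

Note that spark_on_eks_stack.py derives source_dir from os.environ['VIRTUAL_ENV'], so synthesis is expected to run inside the project's virtual environment (see source/requirements.txt and setup.py).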