├── .github ├── ISSUE_TEMPLATE │ ├── bug.md │ ├── feature-request.md │ └── general.md └── PULL_REQUEST_TEMPLATE │ └── pull_request_template.md ├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── NOTICE.txt ├── README.md ├── deployment ├── app_code │ ├── data │ │ ├── initial_contacts.csv │ │ └── update_contacts.csv │ ├── ecr_build_src.zip │ ├── job │ │ ├── delta_load.ipynb │ │ ├── initial_load.ipynb │ │ ├── scd2_merge.ipynb │ │ └── wordcount.py │ ├── meta │ │ └── contact_meta_0.json │ └── sql │ │ ├── add_calc_field_for_scd2.sql │ │ ├── create_table_contact.sql │ │ └── sqlvalidate_errors.sql ├── build-s3-dist.sh ├── cdk-solution-helper │ ├── README.md │ ├── index.js │ └── package.json ├── delete_all.sh └── post-deployment.sh └── source ├── app.py ├── app_resources ├── alb-iam-role.yaml ├── alb-values.yaml ├── argo-values.yaml ├── autoscaler-iam-role.yaml ├── autoscaler-values.yaml ├── etl-iam-role.yaml ├── etl-rbac.yaml ├── ex-secret-iam-role.yaml ├── ex-secret-values.yaml ├── jupyter-config.yaml ├── jupyter-values.yaml ├── native-spark-iam-role.yaml ├── native-spark-rbac.yaml ├── spark-operator-values.yaml └── spark-template.yaml ├── cdk.json ├── example ├── native-spark-job-scheduler.yaml ├── notebook │ ├── nyctaxi-job.ipynb │ └── scd2-job.ipynb ├── nyctaxi-job-scheduler.yaml └── scd2-job-scheduler.yaml ├── images ├── 00-deploy-to-aws.png ├── 3-argo-job-dependency.png ├── 3-argo-log.png ├── 4-auto-scaling.png ├── 4-k8s-retry.png ├── 4-spot-console.png ├── architecture.png ├── driver_interruption_test.gif ├── executor_interruption_test.png ├── fake_data.gif ├── run_jupyter.gif ├── sql-based-etl-spark-architecture-final.png ├── sql-based-etl-with-apache-spark-on-amazon-eks.preview.png └── submit_job_in_argo.gif ├── lib ├── cdk_infra │ ├── eks_base_app.py │ ├── eks_cluster.py │ ├── eks_service_account.py │ ├── iam_roles.py │ ├── network_sg.py │ ├── s3_app_code.py │ └── spark_permission.py ├── cloud_front_stack.py ├── ecr_build │ ├── Dockerfile │ ├── buildspec.yaml │ └── ecr_build_pipeline.py ├── solution_helper │ ├── lambda_function.py │ ├── requirements.txt │ └── solution_metrics.py ├── spark_on_eks_stack.py └── util │ ├── conditional_resources.py │ ├── get_aws_managed_prefix.py │ ├── manifest_reader.py │ └── override_rule.py ├── package.json ├── requirements.txt ├── run-all-tests.sh └── setup.py /.github/ISSUE_TEMPLATE/bug.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | labels: bug 5 | 6 | --- 7 | 8 | ### Describe the bug ### 9 | A clear and concise description of what the bug is. 10 | 11 | ### To Reproduce ### 12 | Steps to reproduce the behavior 13 | 14 | 1. Step One: 15 | 2. Step Two: 16 | 3. [...] 17 | 18 | ### Expected Result ### 19 | A clear and concise description of what you expected to happen. 20 | 21 | ### Actual Result ### 22 | A description of what is the result and/or error messages you got when you faced this issue. 23 | 24 | 25 | ### Other information: ### 26 | 1. Version of the Solution (e.g., v1.1.0): 27 | 28 | To get the version of the solution, you can look at the description of the created CloudFormation stack. For example, "_(SO0027) AWS Serverless Bot Framework v1.2.0 - This AWS CloudFormation template helps you provision the AWS Serverless Bot Framework stack without worrying about creating and configuring the underlying AWS infrastructure_". 
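If you prefer the command line, the stack description can also be retrieved with the AWS CLI, as in this minimal sketch (it assumes the default stack name `SparkOnEKS` and region `us-east-1` used elsewhere in this repository; substitute your own values):

```bash
# Print the CloudFormation stack description, which includes the solution ID and version
aws cloudformation describe-stacks \
  --stack-name SparkOnEKS \
  --region us-east-1 \
  --query "Stacks[0].Description" \
  --output text
```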
If the description does not contain the version information, you can look at the mappings section of the template: 29 | 30 | ```yaml 31 | Mappings: 32 | Solution: 33 | Data: 34 | ID: SO0027 35 | Version: 'v1.2.0' 36 | ``` 37 | 38 | 2. Region where CloudFormation template is deployed (e.g., us-east-1): 39 | 3. Did you make any change in the source code? If yes, what are the relevant changes (if publicly available)?: 40 | 4. Troubleshooting steps attempted: 41 | 5. Were there any errors in the Cloudwatch logs?: 42 | 6. Screenshots (please **DO NOT include sensitive information**): 43 | 7. Did you use the Sample Weather Service (please DO NOT include API KEY) ? Yes / No 44 | 45 | ### Stack Parameters ### 46 | Cloudformation Stack Parameters (please **DO NOT include sensitive information** like S3 bucket name, IP address, credentials, etc): 47 | 1. Bot Name: 48 | 2. Bot Language: 49 | 3. Bot Gender: 50 | 51 | 52 | ### Additional context ### 53 | Add any other context about the problem here. 54 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this solution 4 | labels: enhancement 5 | 6 | --- 7 | 8 | ### Is your feature request related to a problem? Please describe. ### 9 | A clear and concise description of what the problem is. E.g., I'm always frustrated when [...] 10 | 11 | ### Describe the feature you'd like ### 12 | A clear and concise description of what you want to happen. 13 | 14 | ### Additional context ### 15 | Add any other context or screenshots about the feature request here. 16 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/general.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: General issue 3 | about: If your issue is not a bug or a feature request. Use this format. 4 | 5 | --- 6 | 7 | 8 | ### Describe the issue ### 9 | A clear and concise description of what the issue is. 10 | 11 | ### Additional context ### 12 | Add any other context or screenshots about the issue here. 13 | -------------------------------------------------------------------------------- /.github/PULL_REQUEST_TEMPLATE/pull_request_template.md: -------------------------------------------------------------------------------- 1 | 2 | ### Pull Request Description ### 3 | Describe the changes that have been made in this pull request. 4 | 5 | ### Link to Related Issue (if applicable) ### 6 | If this pull request fixes an open issue, provide a link to the issue here. 7 | 8 | ---- 9 | *Note: We will not be able to merge pull requests directly. 
However, if a pull request is accepted we will put the changes into the source code and publish the changes.* 10 | 11 | ---- 12 | 13 | *By submitting this pull request, I confirm that my contribution is made under the terms of the Apache-2.0 license* -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Git 2 | .git 3 | Config 4 | 5 | ### VisualStudioCode ### 6 | .vscode/* 7 | ### IntelliJ/ PyCharm ### 8 | .idea/* 9 | # System Files 10 | **/.DS_Store 11 | # CDK 12 | **/cdk.out 13 | **/cdk.context.json 14 | *.swp 15 | **/node_modules 16 | **/package-lock.json 17 | 18 | # compiled output 19 | **/global-s3-assets 20 | **/regional-s3-assets 21 | **/open-source 22 | 23 | ### Python ### 24 | # Byte-compiled / optimized / DLL files 25 | *__pycache__/ 26 | *.py[cod] 27 | *$py.class 28 | # Python Distribution / packaging 29 | *.egg-info/ 30 | *.egg 31 | # Python Virtual Environments 32 | **/venv* 33 | **/.venv* 34 | **/.env 35 | ## Python Testing 36 | .pytest_cache 37 | **/.pytest_cache 38 | **/.coverage 39 | **/.coveragerc 40 | **/coverage-reports/ -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [2.0.1] - 2023-11-13 8 | ### Upgrade 9 | - Upgrade CDK v2.67.0 to v2.105.0 10 | - Add python version 3.10 & to 3.11 11 | - Upgrade EKS version to 1.27 12 | ### Change 13 | - To compatible with k8s 1.26+, disable userScheduler in Jupyterhub 14 | 15 | ## [2.0.0] - 2022-07-22 16 | ### Upgrade 17 | - Upgrade CDK v1 to v2 18 | - Upgrade python from 3.8 to 3.9 19 | 20 | ## [1.0.0] - 2021-07-29 21 | ### Added 22 | - All files, initial version 23 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 
13 | 14 | When filing an issue, please check [existing open](https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks/issues), or [recently closed](https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks/issues?utf8=%E2%9C%93&q=is%3Aissue%20is%3Aclosed%20), issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure all build processes execute successfully (see README.md for additional guidance). 35 | 4. Ensure all unit, integration, and/or snapshot tests pass, as applicable. 36 | 5. Commit to your fork using clear commit messages. 37 | 6. Send us a pull request, answering any default questions in the pull request interface. 38 | 7. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 39 | 40 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 41 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 42 | 43 | 44 | ## Finding contributions to work on 45 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels ((enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any ['help wanted'](https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks/labels/help%20wanted) issues is a great place to start. 46 | 47 | 48 | ## Code of Conduct 49 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 50 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 51 | opensource-codeofconduct@amazon.com with any additional questions or comments. 52 | 53 | 54 | ## Security issue notifications 55 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 56 | 57 | 58 | ## Licensing 59 | 60 | See the [LICENSE](https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks/blob/master/LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 61 | 62 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 
63 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT No Attribution 2 | 3 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy of 6 | this software and associated documentation files (the "Software"), to deal in 7 | the Software without restriction, including without limitation the rights to 8 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 9 | the Software, and to permit persons to whom the Software is furnished to do so. 10 | 11 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 12 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 13 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 14 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 15 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 16 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | sql-based-etl-with-apache-spark-on-amazon-eks 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 5 | software and associated documentation files (the "Software"), to deal in the Software 6 | without restriction, including without limitation the rights to use, copy, modify, 7 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so. 9 | 10 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 11 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 12 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 13 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 14 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 15 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
16 | 17 | ********************** 18 | THIRD PARTY COMPONENTS 19 | ********************** 20 | This software includes third party software subject to the following copyrights: 21 | 22 | Arc -- v3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 -- https://arc.tripl.ai/ -- MIT License 23 | Arc Jupyter - v3.14.2_scala_2.12_hadoop_3.2.0_1.1.0 -- https://github.com/tripl-ai/arc-jupyter -- MIT License 24 | argo-workflows -- v3.5.4 -- https://github.com/argoproj/argo-workflows -- Apache-2.0 25 | JupyterHub -- v1.5.0 -- https://hub.jupyter.org/helm-chart/ -- revised BSD license 26 | aws-cdk -- v2.105.0 -- https://github.com/aws/aws-cdk -- Apache-2.0 27 | k8s-cluster-autoscaler -- v1.27.2 -- https://github.com/kubernetes/autoscaler -- Apache-2.0 28 | # amazon-cloudwatch-container-insights -- latest version -- https://github.com/aws-samples/amazon-cloudwatch-container-insights -- MIT-0 29 | aws-load-balancer-controller -- v2.5.4 -- https://github.com/aws/eks-charts/ -- Apache-2.0 30 | kubernetes-external-secrets -- v8.5.5 -- https://github.com/external-secrets/kubernetes-external-secrets -- MIT License 31 | spark-on-k8s-operator -- v1beta2-1.2.3-3.1.1 -- https://github.com/GoogleCloudPlatform/spark-on-k8s-operator -- Apache-2.0 32 | -------------------------------------------------------------------------------- /deployment/app_code/ecr_build_src.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/deployment/app_code/ecr_build_src.zip -------------------------------------------------------------------------------- /deployment/app_code/job/delta_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf numRows=5 logger=true" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "ExecuteTime": { 16 | "end_time": "2020-03-18T22:38:05.895407Z", 17 | "start_time": "2020-03-18T22:37:48.160Z" 18 | } 19 | }, 20 | "source": [ 21 | "## 2. 
Ingest A New Incremental CSV File\n", 22 | "### Look at record 12, the `state` is changed in the file" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "{\n", 32 | " \"type\": \"DelimitedExtract\",\n", 33 | " \"name\": \"extract incremental data\",\n", 34 | " \"environments\": [\"dev\", \"test\"],\n", 35 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/update_contacts.csv\",\n", 36 | " \"outputView\": \"delta_raw\", \n", 37 | " \"delimiter\": \"Comma\",\n", 38 | " \"header\": false,\n", 39 | " \"authentication\": {\n", 40 | " \"method\": \"AmazonIAM\"\n", 41 | " }\n", 42 | "}" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## 2.2 Apply Data Type (reused schema file)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "{\n", 59 | " \"type\": \"TypingTransform\",\n", 60 | " \"name\": \"apply table schema 0 to incremental load\",\n", 61 | " \"environments\": [\"dev\", \"test\"],\n", 62 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 63 | " \"inputView\": \"delta_raw\", \n", 64 | " \"outputView\": \"delta_typed\",\n", 65 | " \"authentication\": {\n", 66 | " \"method\": \"AmazonIAM\"\n", 67 | " }\n", 68 | "}" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "ExecuteTime": { 75 | "end_time": "2020-06-07T15:02:50.155313Z", 76 | "start_time": "2020-06-07T15:02:50.125Z" 77 | } 78 | }, 79 | "source": [ 80 | "## 2.3 Data Quality Control (reused sql script)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | "outputs": [], 88 | "source": [ 89 | "%sqlvaildate outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=delta_typed\n", 90 | "\n", 91 | "SELECT SUM(error) = 0 AS valid\n", 92 | " ,TO_JSON(\n", 93 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 94 | " ) AS message\n", 95 | "FROM \n", 96 | "(\n", 97 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 98 | " FROM ${inputView}\n", 99 | ") base" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "ExecuteTime": { 106 | "end_time": "2020-05-31T05:01:13.796275Z", 107 | "start_time": "2020-05-31T05:01:13.734Z" 108 | } 109 | }, 110 | "source": [ 111 | "## 2.4 Add Calculated Fields (reused sql script)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "%env \n", 121 | "ETL_CONF_CURRENT_TIMESTAMP=CURRENT_TIMESTAMP()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "%sql outputView=\"update_load\" name=\"add calc field for SCD\" environments=dev,test sqlParams=table_name=delta_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 131 | "\n", 132 | "SELECT id,name,email,state, CAST(${now} AS timestamp) AS valid_from, CAST(null AS timestamp) AS valid_to\n", 133 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 134 | "FROM ${table_name}" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## 2.5 Output Incremental data to Delta Lake\n", 142 | "### Delta Lake is an optimized data 
lake to support Time Travel, ACID transaction" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "{\n", 152 | " \"type\": \"DeltaLakeLoad\",\n", 153 | " \"name\": \"Initial load to Data Lake\",\n", 154 | " \"environments\": [\"dev\", \"test\"],\n", 155 | " \"inputView\": \"update_load\",\n", 156 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/delta_load/\",\n", 157 | " \"numPartitions\": 2\n", 158 | " \"saveMode\": \"Overwrite\",\n", 159 | " \"authentication\": {\n", 160 | " \"method\": \"AmazonIAM\"\n", 161 | " }\n", 162 | "}" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Arc", 176 | "language": "javascript", 177 | "name": "arc" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": "javascript", 181 | "file_extension": ".json", 182 | "mimetype": "javascript", 183 | "name": "arc", 184 | "nbconvert_exporter": "arcexport", 185 | "version": "3.8.0" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 4 190 | } -------------------------------------------------------------------------------- /deployment/app_code/job/initial_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf numRows=5 logger=true" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# 1. Initial Table Load" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "{\n", 26 | " \"type\": \"DelimitedExtract\",\n", 27 | " \"name\": \"extract initial table\",\n", 28 | " \"environments\": [\"dev\", \"test\"],\n", 29 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/initial_contacts.csv\",\n", 30 | " \"outputView\": \"initial_raw\", \n", 31 | " \"delimiter\": \"Comma\",\n", 32 | " \"header\": false,\n", 33 | " \"quote\": \"None\",\n", 34 | " \"authentication\": {\n", 35 | " \"method\": \"AmazonIAM\"\n", 36 | " }\n", 37 | "}" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Check Original Data Schema" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "%printschema \n", 54 | "initial_raw" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "ExecuteTime": { 61 | "start_time": "2020-03-03T08:30:30.028Z" 62 | } 63 | }, 64 | "source": [ 65 | "## 1.2 Apply Data Type" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "{\n", 75 | " \"type\": \"TypingTransform\",\n", 76 | " \"name\": \"apply table schema 0\",\n", 77 | " \"environments\": [\"dev\", \"test\"],\n", 78 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 79 | " \"inputView\": \"initial_raw\", \n", 80 | " \"outputView\": \"initial_typed\",\n", 81 | " \"authentication\": {\n", 82 | " \"method\": \"AmazonIAM\"\n", 83 | " }\n", 84 | "}" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Check Typed Data 
Schema & Stats" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "%printschema \n", 101 | "initial_typed" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## 1.3 Data Quality Control" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "%sqlvaildate outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=initial_typed\n", 118 | "\n", 119 | "SELECT SUM(error) = 0 AS valid\n", 120 | " ,TO_JSON(\n", 121 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 122 | " ) AS message\n", 123 | "FROM \n", 124 | "(\n", 125 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 126 | " FROM ${inputView}\n", 127 | ") base" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 132 | "metadata": {}, 133 | "source": [ 134 | "## 1.4 Add Calculated Fields for SCD Type 2\n", 135 | "### CURRENT_TIMESTAMP will be passed in automatically, when the ETL job is triggered" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "%env \n", 145 | "ETL_CONF_CURRENT_TIMESTAMP=current_timestamp()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "%sql outputView=\"initial_load\" name=\"add calc field for SCD\" environments=dev,test sqlParams=table_name=initial_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 155 | "\n", 156 | "SELECT id,name,email,state, CAST(${now} AS timestamp) AS valid_from, CAST(null AS timestamp) AS valid_to\n", 157 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 158 | "FROM ${table_name}" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## 1.5 Load to Delta Lake as the initial daily snaptshot table\n", 166 | "### Delta Lake is an optimized data lake to support Time Travel, ACID transaction" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "{\n", 176 | " \"type\": \"DeltaLakeLoad\",\n", 177 | " \"name\": \"Initial load to Data Lake\",\n", 178 | " \"environments\": [\"dev\", \"test\"],\n", 179 | " \"inputView\": \"initial_load\",\n", 180 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/\",\n", 181 | " \"numPartitions\": 2\n", 182 | " \"saveMode\": \"Overwrite\",\n", 183 | " \"authentication\": {\n", 184 | " \"method\": \"AmazonIAM\"\n", 185 | " }\n", 186 | "}" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Arc", 200 | "language": "javascript", 201 | "name": "arc" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": "javascript", 205 | "file_extension": ".json", 206 | "mimetype": "javascript", 207 | "name": "arc", 208 | "nbconvert_exporter": "arcexport", 209 | "version": "3.8.0" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 4 214 | } 215 | -------------------------------------------------------------------------------- 
/deployment/app_code/job/scd2_merge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 3. Read initial & incremental tables from Delta Lake" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "{\n", 17 | " \"type\": \"DeltaLakeExtract\",\n", 18 | " \"name\": \"read initial load table\",\n", 19 | " \"description\": \"read initial load table\",\n", 20 | " \"environments\": [\n", 21 | " \"dev\",\n", 22 | " \"test\"\n", 23 | " ],\n", 24 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/\",\n", 25 | " \"outputView\": \"current_snapshot\"\n", 26 | "}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "{\n", 36 | " \"type\": \"DeltaLakeExtract\",\n", 37 | " \"name\": \"read contact Delta Lake table\",\n", 38 | " \"description\": \"read contact table\",\n", 39 | " \"environments\": [\n", 40 | " \"dev\",\n", 41 | " \"test\"\n", 42 | " ],\n", 43 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/delta_load/\",\n", 44 | " \"outputView\": \"delta_data\"\n", 45 | "}" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "ExecuteTime": { 52 | "end_time": "2020-05-31T05:03:33.741024Z", 53 | "start_time": "2020-05-31T05:03:33.247Z" 54 | } 55 | }, 56 | "source": [ 57 | "## 3.2 Prepare Datasets for SCD Type2 Insert\n", 58 | "\n", 59 | "- Generate extra rows for changed records.\n", 60 | "- The 'null' merge_key means it will be inserted, not update existing records according to the rule in SCD type2" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "%sql outputView=\"staged_update\" name=\"generate extra rows for SCD\" environments=dev,test\n", 70 | "\n", 71 | "SELECT NULL AS mergeKey, new.*\n", 72 | "FROM current_snapshot old\n", 73 | "INNER JOIN delta_data new\n", 74 | "ON old.id = new.id\n", 75 | "WHERE old.iscurrent=1\n", 76 | "AND old.checksum<>new.checksum\n", 77 | "\n", 78 | "UNION\n", 79 | "\n", 80 | "SELECT id AS mergeKey, *\n", 81 | "FROM delta_data" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## 3.3 Implement the Type 2 SCD merge operation" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "%conf logger=true" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "%sql outputView=\"target_merged\" name=\"merge into existing contacts table\" environments=dev,test\n", 107 | "\n", 108 | "MERGE INTO current_snapshot tgt\n", 109 | "USING staged_update src\n", 110 | "ON tgt.id = src.mergeKey\n", 111 | "WHEN MATCHED AND src.checksum != tgt.checksum AND tgt.iscurrent = 1 THEN \n", 112 | " UPDATE SET \n", 113 | " valid_to = src.valid_from, \n", 114 | " iscurrent = 0\n", 115 | "WHEN NOT MATCHED THEN \n", 116 | " INSERT *" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": {}, 122 | "source": [ 123 | "## 3.4 Create a Delta Lake table in Athena\n", 124 | "### Build up a Glue Data Catalog via Athena. This step can be done by Glue Crawler. 
However, it makes sense if we refresh partitions, create/update data catalog at the end of each ETL process, which is provides the data lineage contro at a single place." 125 | ] 126 | }, 127 | { 128 | "cell_type": "code", 129 | "execution_count": null, 130 | "metadata": {}, 131 | "outputs": [], 132 | "source": [ 133 | "{\n", 134 | " \"type\": \"JDBCExecute\",\n", 135 | " \"name\": \"Create glue data catalog\",\n", 136 | " \"environments\": [\n", 137 | " \"dev\",\n", 138 | " \"test\"\n", 139 | " ],\n", 140 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/sql/create_table_contact.sql\",\n", 141 | " \"jdbcURL\": \"jdbc:awsathena://AwsRegion=\"${AWS_DEFAULT_REGION}\";S3OutputLocation=s3://\"${ETL_CONF_DATALAKE_LOC}\"/athena-query-result;AwsCredentialsProviderClass=com.amazonaws.auth.WebIdentityTokenCredentialsProvider\",\n", 142 | " \"sqlParams\":{\n", 143 | " \"datalake_loc\": \"'s3://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot\\/'\",\n", 144 | " \"table_name\": \"default.contact_snapshot\"\n", 145 | " }\n", 146 | "}" 147 | ] 148 | }, 149 | { 150 | "cell_type": "markdown", 151 | "metadata": {}, 152 | "source": [ 153 | "# 4. Query Delta Lake (validation steps)\n", 154 | "### to stop executing the followings in a productionized ETL job, use a fake environment `uat`\n", 155 | "### the same queries can be run in Athena" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": null, 161 | "metadata": {}, 162 | "outputs": [], 163 | "source": [ 164 | "{\n", 165 | " \"type\": \"DeltaLakeExtract\",\n", 166 | " \"name\": \"read contact Delta Lake table\",\n", 167 | " \"description\": \"read contact table\",\n", 168 | " \"environments\": [\n", 169 | " \"uat\"\n", 170 | " ],\n", 171 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot\",\n", 172 | " \"outputView\": \"contact_snapshot\"\n", 173 | "}" 174 | ] 175 | }, 176 | { 177 | "cell_type": "markdown", 178 | "metadata": {}, 179 | "source": [ 180 | "## Confirm 92 records are expired" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": null, 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "%sql outputView=\"expired_count\" name=\"expired_count\" environments=uat\n", 190 | "SELECT count(*) FROM contact_snapshot WHERE valid_to is not null" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "%metadata \n", 200 | "contact_snapshot" 201 | ] 202 | }, 203 | { 204 | "cell_type": "markdown", 205 | "metadata": {}, 206 | "source": [ 207 | " ## Confirm we now have 1192 records" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": null, 213 | "metadata": {}, 214 | "outputs": [], 215 | "source": [ 216 | "%sql outputView=\"total_count\" name=\"total_count\" environments=uat\n", 217 | "SELECT count(*) FROM contact_snapshot" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": {}, 223 | "source": [ 224 | "## View one of the changed records" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": null, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "%sql outputView=\"validate_type2\" name=\"validate_type2\" environments=uat\n", 234 | "SELECT * FROM contact_snapshot WHERE id=12" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": null, 240 | "metadata": {}, 241 | "outputs": [], 242 | "source": [] 243 | } 244 | ], 245 | "metadata": { 246 
| "kernelspec": { 247 | "display_name": "Arc", 248 | "language": "javascript", 249 | "name": "arc" 250 | }, 251 | "language_info": { 252 | "codemirror_mode": "javascript", 253 | "file_extension": ".json", 254 | "mimetype": "javascript", 255 | "name": "arc", 256 | "nbconvert_exporter": "arcexport", 257 | "version": "3.8.0" 258 | } 259 | }, 260 | "nbformat": 4, 261 | "nbformat_minor": 4 262 | } 263 | -------------------------------------------------------------------------------- /deployment/app_code/job/wordcount.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | spark = SparkSession.builder.appName('NYC taxi vendor count').getOrCreate() 4 | df = spark.read.option("header",True).csv(sys.argv[1]) 5 | df.filter(df["vendor_name"].isNotNull()).select("vendor_name").groupBy("vendor_name").count().write.mode("overwrite").parquet(sys.argv[2]) 6 | exit() -------------------------------------------------------------------------------- /deployment/app_code/meta/contact_meta_0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "id", 4 | "description": "contact id", 5 | "trim": true, 6 | "nullable": false, 7 | "primaryKey": true, 8 | "type": "integer" 9 | }, 10 | { 11 | "name": "name", 12 | "description": "contact name", 13 | "trim": true, 14 | "nullable": true, 15 | "primaryKey": false, 16 | "type": "string", 17 | "nullableValues": [ 18 | "", 19 | "null" 20 | ] 21 | }, 22 | { 23 | "name": "email", 24 | "description": "contact email", 25 | "trim": true, 26 | "nullable": true, 27 | "primaryKey": false, 28 | "type": "string", 29 | "nullableValues": [ 30 | "", 31 | "null" 32 | ] 33 | }, 34 | { 35 | "name": "state", 36 | "description": "state in the country of the contact", 37 | "trim": true, 38 | "nullable": true, 39 | "primaryKey": false, 40 | "type": "string", 41 | "nullableValues": [ 42 | "", 43 | "null" 44 | ] 45 | } 46 | ] -------------------------------------------------------------------------------- /deployment/app_code/sql/add_calc_field_for_scd2.sql: -------------------------------------------------------------------------------- 1 | SELECT id 2 | , name 3 | , email 4 | , state 5 | , ${CURRENT_TIMESTAMP} AS valid_from 6 | , CAST(null AS timestamp) AS valid_to 7 | , 1 AS iscurrent 8 | , md5(concat(name,email,state)) AS checksum 9 | FROM ${table_name} -------------------------------------------------------------------------------- /deployment/app_code/sql/create_table_contact.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE IF NOT EXISTS ${table_name} 2 | LOCATION ${datalake_loc} 3 | TBLPROPERTIES ( 4 | 'table_type' = 'DELTA' 5 | ) -------------------------------------------------------------------------------- /deployment/app_code/sql/sqlvalidate_errors.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | SUM(error) = 0 AS valid 3 | ,TO_JSON( 4 | NAMED_STRUCT( 5 | 'count', COUNT(error), 6 | 'errors', SUM(error) 7 | ) 8 | ) AS message 9 | FROM ( 10 | SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error 11 | FROM ${inputView} 12 | ) base -------------------------------------------------------------------------------- /deployment/build-s3-dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script packages your project into a solution distributable that can be 4 
| # used as an input to the solution builder validation pipeline. 5 | # 6 | # Important notes and prereq's: 7 | # 1. The initialize-repo.sh script must have been run in order for this script to 8 | # function properly. 9 | # 2. This script should be run from the repo's root folder. 10 | # 11 | # This script will perform the following tasks: 12 | # 1. Remove any old dist files from previous runs. 13 | # 2. Install dependencies for the cdk-solution-helper; responsible for 14 | # converting standard 'cdk synth' output into solution assets. 15 | # 3. Build and synthesize your CDK project. 16 | # 4. Run the cdk-solution-helper on template outputs and organize 17 | # those outputs into the /global-s3-assets folder. 18 | # 5. Organize source code artifacts into the /regional-s3-assets folder. 19 | # 6. Remove any temporary files used for staging. 20 | # 21 | # Parameters: 22 | # - source-bucket-base-name: Name for the S3 bucket location where the template will source the Lambda 23 | # code from. The template will append '-[region_name]' to this bucket name. 24 | # For example: ./build-s3-dist.sh solutions v1.0.0 25 | # The template will then expect the source code to be located in the solutions-[region_name] bucket 26 | # - solution-name: name of the solution for consistency 27 | # - version-code: version of the package 28 | 29 | # Important: CDK global version number 30 | cdk_version===2.105.0 31 | 32 | # Check to see if the required parameters have been provided: 33 | if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then 34 | echo "Please provide the base source bucket name, trademark approved solution name and version where the lambda code will eventually reside." 35 | echo "For example: ./build-s3-dist.sh solutions trademarked-solution-name v1.0.0 template-bucket-name" 36 | exit 1 37 | fi 38 | 39 | # Get reference for all important folders 40 | template_dir="$PWD" 41 | staging_dist_dir="$template_dir/staging" 42 | app_code_dir="$template_dir/deployment/app_code" 43 | template_dist_dir="$template_dir/deployment/global-s3-assets" 44 | build_dist_dir="$template_dir/deployment/regional-s3-assets" 45 | source_dir="$template_dir/source" 46 | 47 | echo "------------------------------------------------------------------------------" 48 | echo "[Init] Remove any old dist files from previous runs" 49 | echo "------------------------------------------------------------------------------" 50 | 51 | echo "rm -rf $template_dist_dir" 52 | rm -rf $template_dist_dir 53 | echo "mkdir -p $template_dist_dir" 54 | mkdir -p $template_dist_dir 55 | echo "rm -rf $build_dist_dir" 56 | rm -rf $build_dist_dir 57 | echo "mkdir -p $build_dist_dir" 58 | mkdir -p $build_dist_dir 59 | echo "rm -rf $staging_dist_dir" 60 | rm -rf $staging_dist_dir 61 | echo "mkdir -p $staging_dist_dir" 62 | mkdir -p $staging_dist_dir 63 | 64 | echo "------------------------------------------------------------------------------" 65 | echo "[Init] Install dependencies for the cdk-solution-helper" 66 | echo "------------------------------------------------------------------------------" 67 | 68 | echo "cd $template_dir/deployment/cdk-solution-helper" 69 | cd $template_dir/deployment/cdk-solution-helper 70 | echo "npm install" 71 | # npm i --package-lock-only 72 | # npm audit fix 73 | npm install 74 | 75 | cd $template_dir 76 | echo "pip3 install -q $source_dir" 77 | python3 -m venv .env 78 | source .env/bin/activate 79 | pip3 install --upgrade pip -q $source_dir 80 | 81 | echo 
"------------------------------------------------------------------------------" 82 | echo "[Packing] solution_helper lambda function" 83 | echo "------------------------------------------------------------------------------" 84 | 85 | # echo "cd $source_dir/lib/solution_helper" 86 | # cd $source_dir/lib/solution_helper 87 | # echo "pip install -r requirements.txt --target ../package" 88 | # pip install -r requirements.txt --target ../package 89 | # cd $source_dir/lib/package || exit 1 90 | # echo "zip -q -r9 $app_code_dir/solution_helper.zip ." 91 | # zip -q -r9 $app_code_dir/solution_helper.zip . 92 | # echo "cd $source_dir/lib/solution_helper" || exit 1 93 | # cd $source_dir/lib/solution_helper 94 | # echo "zip -g -r $app_code_dir/solution_helper.zip lambda_function.py" 95 | # zip -g -r $app_code_dir/solution_helper.zip lambda_function.py 96 | # echo "rm -rf $source_dir/lib/package" 97 | # rm -rf $source_dir/lib/package 98 | 99 | echo "------------------------------------------------------------------------------" 100 | echo "[Packing] ecr image build" 101 | echo "------------------------------------------------------------------------------" 102 | 103 | echo "cd $source_dir/lib/ecr_build" 104 | cd $source_dir/lib/ecr_build 105 | echo "zip -q -r9 $app_code_dir/ecr_build_src.zip ." 106 | zip -q -r9 $app_code_dir/ecr_build_src.zip . 107 | cd $source_dir 108 | 109 | echo "------------------------------------------------------------------------------" 110 | echo "[Synth] CDK Project" 111 | echo "------------------------------------------------------------------------------" 112 | 113 | # # Install the global aws-cdk package 114 | echo "npm install -g aws-cdk@$cdk_version" 115 | # npm i --package-lock-only 116 | # npm audit fix 117 | npm install aws-cdk@$cdk_version 118 | 119 | # Run 'cdk synth' to generate raw solution outputs 120 | echo "cdk synth --output=$staging_dist_dir" 121 | node_modules/aws-cdk/bin/cdk synth --output=$staging_dist_dir 122 | 123 | # Remove unnecessary output files 124 | echo "cd $staging_dist_dir" 125 | cd $staging_dist_dir 126 | echo "rm tree.json manifest.json cdk.out" 127 | rm tree.json manifest.json cdk.out 128 | 129 | echo "------------------------------------------------------------------------------" 130 | echo "[Packing] Template artifacts" 131 | echo "------------------------------------------------------------------------------" 132 | 133 | # Move outputs from staging to template_dist_dir 134 | echo "Move outputs from staging to template_dist_dir" 135 | mv $staging_dist_dir/*.json $template_dist_dir/ 136 | 137 | # Rename all *.template.json files to *.template 138 | echo "Rename all *.template.json to *.template" 139 | echo "copy templates and rename" 140 | for f in $template_dist_dir/*.template.json; do 141 | mv -- "$f" "${f%.template.json}.template" 142 | done 143 | 144 | # Run the helper to clean-up the templates and remove unnecessary CDK elements 145 | echo "Run the helper to clean-up the templates and remove unnecessary CDK elements" 146 | echo "node $template_dir/deployment/cdk-solution-helper/index" 147 | node $template_dir/deployment/cdk-solution-helper/index 148 | if [ "$?" = "1" ]; then 149 | echo "(cdk-solution-helper) ERROR: there is likely output above." 
1>&2 150 | exit 1 151 | fi 152 | 153 | # Find and replace bucket_name, solution_name, and version 154 | echo "Find and replace bucket_name, solution_name, and version" 155 | cd $template_dist_dir 156 | echo "Updating code source bucket in template with $1" 157 | replace="s/%%BUCKET_NAME%%/$1/g" 158 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 159 | sed -i '' -e $replace $template_dist_dir/*.template 160 | replace="s/%%SOLUTION_NAME%%/$2/g" 161 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 162 | sed -i '' -e $replace $template_dist_dir/*.template 163 | replace="s/%%VERSION%%/$3/g" 164 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 165 | sed -i '' -e $replace $template_dist_dir/*.template 166 | 167 | # Put Global and Regional code files in a single bucket if the 4th parameter doesn't exist 168 | if [ -z "$4" ]; then 169 | replace="s/%%TEMPLATE_OUTPUT_BUCKET%%/$1"-"${AWS_REGION}/g" 170 | else 171 | replace="s/%%TEMPLATE_OUTPUT_BUCKET%%/$4/g" 172 | fi 173 | 174 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 175 | sed -i '' -e $replace $template_dist_dir/*.template 176 | 177 | rm $template_dist_dir/*.json 178 | 179 | echo "------------------------------------------------------------------------------" 180 | echo "[Packing] Source code artifacts" 181 | echo "------------------------------------------------------------------------------" 182 | 183 | # General cleanup of node_modules and package-lock.json files 184 | echo "find $staging_dist_dir -iname "node_modules" -type d -exec rm -rf "{}" \; 2> /dev/null" 185 | find $staging_dist_dir -iname "node_modules" -type d -exec rm -rf "{}" \; 2> /dev/null 186 | echo "find $staging_dist_dir -iname "package-lock.json" -type f -exec rm -f "{}" \; 2> /dev/null" 187 | find $staging_dist_dir -iname "package-lock.json" -type f -exec rm -f "{}" \; 2> /dev/null 188 | 189 | # ... For each asset.* source code artifact in the temporary /staging folder... 190 | cd $staging_dist_dir 191 | for d in `find . -mindepth 1 -maxdepth 1 -type d`; do 192 | 193 | # Rename the artifact, removing the period for handler compatibility 194 | pfname="$(basename -- $d)" 195 | fname="$(echo $pfname | sed -e 's/\.//g')" 196 | echo "zip -r $fname.zip $fname" 197 | mv $d $fname 198 | cd $staging_dist_dir/$fname 199 | 200 | # Build the artifcats 201 | if ls *.py 1> /dev/null 2>&1; then 202 | echo "====================================" 203 | echo "This is Python runtime" 204 | echo "====================================" 205 | venv_folder=".venv-prod" 206 | rm -fr .venv-test 207 | rm -fr .venv-prod 208 | echo "Initiating virtual environment" 209 | python3 -m venv $venv_folder 210 | source $venv_folder/bin/activate 211 | pip3 install --upgrade pip -q $source_dir --target $venv_folder/lib/python3.*/site-packages 212 | echo "package python artifact" 213 | cd $venv_folder/lib/python3.*/site-packages 214 | zip -qr9 $staging_dist_dir/$fname.zip . -x "aws_cdk/*" 215 | echo "zip -r $staging_dist_dir/$fname" 216 | cd $staging_dist_dir/$fname 217 | rm -rf $venv_folder 218 | zip -grq $staging_dist_dir/$fname.zip . 219 | 220 | elif ls *.js 1> /dev/null 2>&1; then 221 | echo "====================================" 222 | echo "This is Node runtime" 223 | echo "====================================" 224 | echo "Clean and rebuild artifacts" 225 | echo "copy package.json and package-lock.json files" 226 | # npm audit fix --force 227 | cp -rf $template_dir/deployment/cdk-solution-helper/*.json . 228 | npm run 229 | npm ci 230 | if [ "$?" 
= "1" ]; then 231 | echo "ERROR: Seems like package-lock.json does not exists or is out of sync with package.josn. Trying npm install instead" 1>&2 232 | npm install --package-lock 233 | fi 234 | # Zip the artifact 235 | echo "zip -r $staging_dist_dir/$fname" 236 | zip -qr9 $staging_dist_dir/$fname.zip . 237 | else 238 | # Zip the artifact 239 | echo "zip -r $staging_dist_dir/$fname" 240 | zip -rq $staging_dist_dir/$fname.zip . 241 | fi 242 | 243 | cd $staging_dist_dir 244 | # Copy the zipped artifact from /staging to /regional-s3-assets 245 | echo "cp $fname.zip $build_dist_dir" 246 | mv $fname.zip $build_dist_dir 247 | 248 | # Remove the old, unzipped artifact from /staging 249 | echo "rm -rf $fname" 250 | rm -rf $fname 251 | 252 | # ... repeat until all source code artifacts are zipped and placed in the 253 | # ... /regional-s3-assets folder 254 | 255 | done 256 | 257 | echo "------------------------------------------------------------------------------" 258 | echo "[Move] the zip files from staging to regional-s3-assets folder" 259 | echo "------------------------------------------------------------------------------" 260 | for d in `find . -mindepth 1 -maxdepth 1`; do 261 | pfname="$(basename -- $d)" 262 | fname="$(echo $pfname | sed -e 's/asset./asset/g')" 263 | mv $d $build_dist_dir/$fname 264 | done 265 | 266 | echo "------------------------------------------------------------------------------" 267 | echo "[Cleanup] Remove temporary files" 268 | echo "------------------------------------------------------------------------------" 269 | 270 | # Delete the temporary /staging folder 271 | echo "rm -rf $staging_dist_dir" 272 | rm -rf $staging_dist_dir 273 | 274 | -------------------------------------------------------------------------------- /deployment/cdk-solution-helper/README.md: -------------------------------------------------------------------------------- 1 | # cdk-solution-helper 2 | 3 | A lightweight helper function that cleans-up synthesized templates from the AWS Cloud Development Kit (CDK) and prepares 4 | them for use with the AWS Solutions publishing pipeline. This function performs the following tasks: 5 | 6 | #### Lambda function preparation 7 | 8 | Replaces the AssetParameter-style properties that identify source code for Lambda functions with the common variables 9 | used by the AWS Solutions publishing pipeline. 10 | 11 | - `Code.S3Bucket` is assigned the `%%BUCKET_NAME%%` placeholder value. 12 | - `Code.S3Key` is assigned the `%%SOLUTION_NAME%%`/`%%VERSION%%` placeholder value. 13 | - `Handler` is given a prefix identical to the artifact hash, enabling the Lambda function to properly find the handler in the extracted source code package. 14 | 15 | These placeholders are then replaced with the appropriate values using the default find/replace operation run by the pipeline. 
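For context, this find/replace is performed with `sed` in this repository's `deployment/build-s3-dist.sh`, roughly as shown in the trimmed sketch below (`$1`, `$2` and `$3` are the bucket name, solution name and version arguments passed to that script, and `$template_dist_dir` points at `deployment/global-s3-assets`):

```
# Substitute the placeholders written by cdk-solution-helper into the synthesized templates
replace="s/%%BUCKET_NAME%%/$1/g"
sed -i '' -e $replace $template_dist_dir/*.template
replace="s/%%SOLUTION_NAME%%/$2/g"
sed -i '' -e $replace $template_dist_dir/*.template
replace="s/%%VERSION%%/$3/g"
sed -i '' -e $replace $template_dist_dir/*.template
```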
16 | 17 | Before: 18 | ``` 19 | "examplefunction67F55935": { 20 | "Type": "AWS::Lambda::Function", 21 | "Properties": { 22 | "Code": { 23 | "S3Bucket": { 24 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3Bucket54E71A95" 25 | }, 26 | "S3Key": { 27 | "Fn::Join": [ 28 | "", 29 | [ 30 | { 31 | "Fn::Select": [ 32 | 0, 33 | { 34 | "Fn::Split": [ 35 | "||", 36 | { 37 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1" 38 | } 39 | ] 40 | } 41 | ] 42 | }, 43 | { 44 | "Fn::Select": [ 45 | 1, 46 | { 47 | "Fn::Split": [ 48 | "||", 49 | { 50 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1" 51 | } 52 | ] 53 | } 54 | ] 55 | } 56 | ] 57 | ] 58 | } 59 | }, ... 60 | Handler: "index.handler", ... 61 | ``` 62 | 63 | After helper function run: 64 | ``` 65 | "examplefunction67F55935": { 66 | "Type": "AWS::Lambda::Function", 67 | "Properties": { 68 | "Code": { 69 | "S3Bucket": "%%BUCKET_NAME%%", 70 | "S3Key": "%%SOLUTION_NAME%%/%%VERSION%%/assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 71 | }, ... 72 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 73 | ``` 74 | 75 | After build script run: 76 | ``` 77 | "examplefunction67F55935": { 78 | "Type": "AWS::Lambda::Function", 79 | "Properties": { 80 | "Code": { 81 | "S3Bucket": "solutions", 82 | "S3Key": "trademarked-solution-name/v1.0.0/asset.d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 83 | }, ... 84 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 85 | ``` 86 | 87 | After CloudFormation deployment: 88 | ``` 89 | "examplefunction67F55935": { 90 | "Type": "AWS::Lambda::Function", 91 | "Properties": { 92 | "Code": { 93 | "S3Bucket": "solutions-us-east-1", 94 | "S3Key": "trademarked-solution-name/v1.0.0/asset.d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 95 | }, ... 96 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 97 | ``` 98 | 99 | #### Template cleanup 100 | 101 | Cleans-up the parameters section and improves readability by removing the AssetParameter-style fields that would have 102 | been used to specify Lambda source code properties. This allows solution-specific parameters to be highlighted and 103 | removes unnecessary clutter. 104 | 105 | Before: 106 | ``` 107 | "Parameters": { 108 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3Bucket54E71A95": { 109 | "Type": "String", 110 | "Description": "S3 bucket for asset \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 111 | }, 112 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1": { 113 | "Type": "String", 114 | "Description": "S3 key for asset version \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 115 | }, 116 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7ArtifactHash7AA751FE": { 117 | "Type": "String", 118 | "Description": "Artifact hash for asset \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 119 | }, 120 | "CorsEnabled" : { 121 | "Description" : "Would you like to enable Cross-Origin Resource Sharing (CORS) for the image handler API? 
Select 'Yes' if so.", 122 | "Default" : "No", 123 | "Type" : "String", 124 | "AllowedValues" : [ "Yes", "No" ] 125 | }, 126 | "CorsOrigin" : { 127 | "Description" : "If you selected 'Yes' above, please specify an origin value here. A wildcard (*) value will support any origin.", 128 | "Default" : "*", 129 | "Type" : "String" 130 | } 131 | } 132 | ``` 133 | 134 | After: 135 | ``` 136 | "Parameters": { 137 | "CorsEnabled" : { 138 | "Description" : "Would you like to enable Cross-Origin Resource Sharing (CORS) for the image handler API? Select 'Yes' if so.", 139 | "Default" : "No", 140 | "Type" : "String", 141 | "AllowedValues" : [ "Yes", "No" ] 142 | }, 143 | "CorsOrigin" : { 144 | "Description" : "If you selected 'Yes' above, please specify an origin value here. A wildcard (*) value will support any origin.", 145 | "Default" : "*", 146 | "Type" : "String" 147 | } 148 | } 149 | ``` 150 | 151 | *** 152 | © Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. -------------------------------------------------------------------------------- /deployment/cdk-solution-helper/index.js: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | // Imports 3 | const fs = require('fs'); 4 | 5 | // Paths 6 | var currentPath = process.cwd(); 7 | const global_s3_assets = currentPath+'/../deployment/global-s3-assets'; 8 | const solution_name='sql-based-etl-with-apache-spark-on-amazon-eks'; 9 | 10 | function setParameter(template) { 11 | const parameters = (template.Parameters) ? template.Parameters : {}; 12 | const assetParameters = Object.keys(parameters).filter(function(key) { 13 | return key.includes('BootstrapVersion'); 14 | }); 15 | assetParameters.forEach(function(a) { 16 | template.Parameters[a] = undefined; 17 | }); 18 | const rules = (template.Rules) ? template.Rules : {}; 19 | const rule = Object.keys(rules).filter(function(key) { 20 | return key.includes('CheckBootstrapVersion'); 21 | }); 22 | rule.forEach(function(a) { 23 | template.Rules[a] = undefined; 24 | }) 25 | } 26 | function assetRef(s3BucketRef) { 27 | // Get S3 bucket key references from assets file 28 | const raw_meta = fs.readFileSync(`${global_s3_assets}/${solution_name}.assets.json`); 29 | let template = JSON.parse(raw_meta); 30 | const metadata = (template.files[s3BucketRef]) ? template.files[s3BucketRef] : {}; 31 | var assetPath = metadata.source.path.replace('.json',''); 32 | return assetPath; 33 | } 34 | 35 | // For each template in global_s3_assets ... 36 | fs.readdirSync(global_s3_assets).forEach(file => { 37 | if ( file != `${solution_name}.assets.json`) { 38 | // Import and parse template file 39 | const raw_template = fs.readFileSync(`${global_s3_assets}/${file}`); 40 | let template = JSON.parse(raw_template); 41 | 42 | //1. Clean-up parameters section 43 | setParameter(template); 44 | 45 | const resources = (template.Resources) ? template.Resources : {}; 46 | //3. 
Clean-up Account ID and region to enable cross account deployment 47 | const rsrctype=[ 48 | "AWS::Lambda::Function", 49 | "AWS::Lambda::LayerVersion", 50 | "Custom::CDKBucketDeployment", 51 | "AWS::CloudFormation::Stack", 52 | "AWS::CloudFront::Distribution" 53 | ] 54 | const focusTemplate = Object.keys(resources).filter(function(key) { 55 | return (resources[key].Type.indexOf(rsrctype) < 0) 56 | }); 57 | focusTemplate.forEach(function(f) { 58 | const fn = template.Resources[f]; 59 | if (fn.Properties.hasOwnProperty('Code') && fn.Properties.Code.hasOwnProperty('S3Bucket')) { 60 | // Set Lambda::Function S3 reference to regional folder 61 | if (! String(fn.Properties.Code.S3Bucket.Ref).startsWith('appcode')){ 62 | fn.Properties.Code.S3Key = `%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.Code.S3Key; 63 | fn.Properties.Code.S3Bucket = {'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}; 64 | } 65 | } 66 | else if (fn.Properties.hasOwnProperty('Content') && fn.Properties.Content.hasOwnProperty('S3Bucket')) { 67 | // Set Lambda::LayerVersion S3 bucket reference 68 | fn.Properties.Content.S3Key = `%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.Content.S3Key; 69 | fn.Properties.Content.S3Bucket = {'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}; 70 | } 71 | else if (fn.Properties.hasOwnProperty('SourceBucketNames')) { 72 | // Set CDKBucketDeployment S3 bucket reference 73 | fn.Properties.SourceObjectKeys = [`%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.SourceObjectKeys[0]]; 74 | fn.Properties.SourceBucketNames = [{'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}]; 75 | } 76 | else if (fn.Properties.hasOwnProperty('PolicyName') && fn.Properties.PolicyName.includes('CustomCDKBucketDeployment')) { 77 | // Set CDKBucketDeployment S3 bucket Policy reference 78 | fn.Properties.PolicyDocument.Statement.forEach(function(sub,i) { 79 | if (typeof(sub.Resource[i]) === 'object') { 80 | sub.Resource.forEach(function(resource){ 81 | var arrayKey = Object.keys(resource); 82 | if (typeof(resource[arrayKey][1]) === 'object') { 83 | resource[arrayKey][1].filter(function(s){ 84 | if (s.hasOwnProperty('Ref')) { 85 | fn.Properties.PolicyDocument.Statement[i].Resource = [ 86 | {"Fn::Join": ["",["arn:",{"Ref": "AWS::Partition"},":s3:::%%BUCKET_NAME%%-",{"Ref": "AWS::Region"}]]}, 87 | {"Fn::Join": ["",["arn:",{"Ref": "AWS::Partition"},":s3:::%%BUCKET_NAME%%-",{"Ref": "AWS::Region"},"/*"]]}]}})}})}}); 88 | } 89 | // Set NestedStack S3 bucket reference 90 | else if (fn.Properties.hasOwnProperty('TemplateURL')) { 91 | var key=fn.Properties.TemplateURL['Fn::Join'][1][6].replace('.json','').replace('/',''); 92 | var assetPath = assetRef(key); 93 | fn.Properties.TemplateURL = { 94 | "Fn::Join": ["", 95 | [ 96 | "https://s3.", 97 | { 98 | "Ref": "AWS::URLSuffix" 99 | }, 100 | "/", 101 | `%%TEMPLATE_OUTPUT_BUCKET%%/%%SOLUTION_NAME%%/%%VERSION%%/${assetPath}` 102 | ]] 103 | }; 104 | } 105 | // Set CloudFront logging bucket 106 | else if (fn.Properties.hasOwnProperty('DistributionConfig')){ 107 | fn.Properties.DistributionConfig.Logging.Bucket= { 108 | "Fn::Join": ["",[fn.Properties.DistributionConfig.Logging.Bucket['Fn::Join'][1][0], 109 | ".s3.",{"Ref": "AWS::Region"},".",{"Ref": "AWS::URLSuffix"}]] 110 | } 111 | } 112 | }); 113 | 114 | //6. 
Output modified template file 115 | const output_template = JSON.stringify(template, null, 2); 116 | fs.writeFileSync(`${global_s3_assets}/${file}`, output_template); 117 | } 118 | }); -------------------------------------------------------------------------------- /deployment/cdk-solution-helper/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cdk-solution-helper", 3 | "version": "0.1.0", 4 | "devDependencies": { 5 | "fs": "0.0.1-security" 6 | }, 7 | "dependencies": { 8 | "fs": "0.0.1-security" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /deployment/delete_all.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export stack_name="${1:-SparkOnEKS}" 4 | export region="${2:-us-east-1}" 5 | lower_stack_name=$(echo $stack_name | tr '[:upper:]' '[:lower:]') 6 | 7 | echo "=================================================================================================" 8 | echo " Make sure your CloudFormation stack name $stack_name is correct and exists in region: $region " 9 | echo " If you use a different name, rerun the script with the parameters:" 10 | echo " ./deployment/delete_all.sh " 11 | echo "=================================================================================================" 12 | 13 | # delete s3 14 | code_bucket=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region --query "Stacks[0].Outputs[?OutputKey=='CODEBUCKET'].OutputValue" --output text) 15 | if ! [ -z "$code_bucket" ] 16 | then 17 | if ! [ -z $(aws s3api list-buckets --region $region --query 'Buckets[?Name==`'$code_bucket'`].Name' --output text) ]; then 18 | echo "Delete logs from S3" 19 | aws s3 rm s3://${code_bucket}/vpcRejectlog/ 20 | echo "Delete athena query result from S3" 21 | aws s3 rm s3://${code_bucket}/athena-query-result/ 22 | fi 23 | fi 24 | # delete ecr 25 | repo_name=$(aws ecr describe-repositories --region $region --query 'repositories[?starts_with(repositoryName,`'$lower_stack_name'`)==`true`]'.repositoryName --output text) 26 | if ! [ -z "${repo_name}" ]; then 27 | echo "Delete Arc docker image from ECR" 28 | aws ecr delete-repository --region $region --repository-name $repo_name --force 29 | fi 30 | # delete glue tables 31 | tbl1=$(aws glue get-tables --region $region --database-name 'default' --query 'TableList[?starts_with(Name,`contact_snapshot`)==`true`]'.Name --output text) 32 | tbl2=$(aws glue get-tables --region $region --database-name 'default' --query 'TableList[?starts_with(Name,`deltalake_contact_jhub`)==`true`]'.Name --output text) 33 | if ! [ -z "$tbl1" ] 34 | then 35 | echo "Drop a Delta Lake table default.contact_snapshot" 36 | aws athena start-query-execution --region $region --query-string "DROP TABLE default.contact_snapshot" --result-configuration OutputLocation=s3://$code_bucket/athena-query-result 37 | fi 38 | if ! 
[ -z "$tbl2" ] 39 | then 40 | echo "Drop a Delta Lake table default.deltalake_contact_jhub" 41 | aws athena start-query-execution --region $region --query-string "DROP TABLE default.deltalake_contact_jhub" --result-configuration OutputLocation=s3://$code_bucket/athena-query-result 42 | fi 43 | 44 | # delete ALB 45 | argoALB=$(aws elbv2 describe-load-balancers --region $region --query 'LoadBalancers[?starts_with(DNSName,`k8s-argo`)==`true`].LoadBalancerArn' --output text) 46 | jhubALB=$(aws elbv2 describe-load-balancers --region $region --query 'LoadBalancers[?starts_with(DNSName,`k8s-jupyter`)==`true`].LoadBalancerArn' --output text) 47 | if ! [ -z "$argoALB" ] 48 | then 49 | echo "Delete Argo ALB" 50 | aws elbv2 delete-load-balancer --load-balancer-arn $argoALB --region $region 51 | sleep 5 52 | fi 53 | if ! [ -z "$jhubALB" ] 54 | then 55 | echo "Delete Jupyter ALB" 56 | aws elbv2 delete-load-balancer --load-balancer-arn $jhubALB --region $region 57 | sleep 5 58 | fi 59 | 60 | argoTG=$(aws elbv2 describe-target-groups --region $region --query 'TargetGroups[?starts_with(TargetGroupName,`k8s-argo`)==`true`].TargetGroupArn' --output text) 61 | jhubTG=$(aws elbv2 describe-target-groups --region $region --query 'TargetGroups[?starts_with(TargetGroupName,`k8s-jupyter`)==`true`].TargetGroupArn' --output text) 62 | if ! [ -z "$argoTG" ] 63 | then 64 | sleep 5 65 | echo "Delete Argo Target groups" 66 | aws elbv2 delete-target-group --region $region --target-group-arn $argoTG 67 | fi 68 | if ! [ -z "$jhubTG" ] 69 | then 70 | sleep 5 71 | echo "Delete Jupyter Target groups" 72 | aws elbv2 delete-target-group --region $region --target-group-arn $jhubTG 73 | fi 74 | 75 | # delete the rest from CF 76 | echo "Delete the rest of the resources via the CloudFormation delete command" 77 | aws cloudformation delete-stack --region $region --stack-name $stack_name -------------------------------------------------------------------------------- /deployment/post-deployment.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | export stack_name="${1:-SparkOnEKS}" 4 | export region="${2:-us-east-1}" 5 | 6 | echo "=================================================================================================" 7 | echo " Make sure your CloudFormation stack name $stack_name is correct and exists in region: $region " 8 | echo " If you use a different name, rerun the script with the parameters:" 9 | echo " ./deployment/post-deployment.sh <stack_name> <region>" 10 | echo "=================================================================================================" 11 | 12 | # 1. update ECR endpoint in example jobs 13 | export ECR_IMAGE_URI=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region \ 14 | --query "Stacks[0].Outputs[?OutputKey=='IMAGEURI'].OutputValue" --output text) 15 | echo "Updated ECR endpoint in sample job files in source/example/" 16 | sed -i.bak "s|{{ECR_URL}}|${ECR_IMAGE_URI}|g" source/example/*.yaml 17 | 18 | find . -type f -name "*.bak" -delete 19 | 20 | # 2. install k8s command tools 21 | echo "================================================================================" 22 | echo " Installing kubectl tool on Linux ..." 
23 | echo " For other operating systems, install kubectl > 1.27 here:" 24 | echo " https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html" 25 | echo "================================================================================" 26 | curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" 27 | chmod +x kubectl 28 | sudo mkdir -p /usr/local/bin && sudo mv kubectl /usr/local/bin/kubectl && export PATH=$PATH:/usr/local/bin/ 29 | echo "Installed kubectl version: " 30 | kubectl version --client 31 | echo "================================================================================================" 32 | echo " Installing argoCLI tool on Linux ..." 33 | echo " Check out https://github.com/argoproj/argo-workflows/releases for installation on other operating systems." 34 | echo "================================================================================================" 35 | VERSION=v3.5.4 36 | sudo curl -sLO https://github.com/argoproj/argo-workflows/releases/download/${VERSION}/argo-linux-amd64.gz && gunzip argo-linux-amd64.gz 37 | chmod +x argo-linux-amd64 && sudo mv ./argo-linux-amd64 /usr/local/bin/argo 38 | echo "Installed argoCLI version: " 39 | argo version --short 40 | 41 | # 3. connect to the newly created EKS cluster 42 | echo `aws cloudformation describe-stacks --stack-name $stack_name --region $region --query "Stacks[0].Outputs[?starts_with(OutputKey,'eksclusterEKSConfig')].OutputValue" --output text` | bash 43 | echo "Testing EKS connection..." 44 | kubectl get svc 45 | 46 | # 4. get Jupyter Hub login 47 | LOGIN_URI=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region \ 48 | --query "Stacks[0].Outputs[?OutputKey=='JUPYTERURL'].OutputValue" --output text) 49 | SEC_ID=$(aws secretsmanager list-secrets --region $region --query "SecretList[?not_null(Tags[?Value=='$stack_name'])].Name" --output text) 50 | LOGIN=$(aws secretsmanager get-secret-value --region $region --secret-id $SEC_ID --query SecretString --output text) 51 | echo -e "\n=============================== JupyterHub Login ==============================================" 52 | echo -e "\nJUPYTER_URL: $LOGIN_URI" 53 | echo "LOGIN: $LOGIN" 54 | echo "================================================================================================" 55 | 56 | # 5. 
Get ArgoWorkflows login 57 | ARGO_LOGIN_URI=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region \ 58 | --query "Stacks[0].Outputs[?OutputKey=='ARGOURL'].OutputValue" --output text) 59 | 60 | echo -e "\n=============================== ARGO Workflows Login ==============================================" 61 | echo -e "\nARGO_URL: $ARGO_LOGIN_URI" 62 | echo "================================================================================================" 63 | -------------------------------------------------------------------------------- /source/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | from aws_cdk import (App,Tags,CfnOutput) 3 | from lib.spark_on_eks_stack import SparkOnEksStack 4 | from lib.cloud_front_stack import NestedStack 5 | 6 | app = App() 7 | 8 | eks_name = app.node.try_get_context('cluster_name') 9 | solution_id = app.node.try_get_context('solution_id') 10 | solution_version= app.node.try_get_context('version') 11 | 12 | # main stack 13 | eks_stack = SparkOnEksStack(app, 'sql-based-etl-with-apache-spark-on-amazon-eks', eks_name, solution_id, solution_version) 14 | # Recommend to remove the CloudFront nested stack. Setup your own SSL certificate and add it to ALB. 15 | cf_nested_stack = NestedStack(eks_stack,'CreateCloudFront', eks_stack.code_bucket, eks_stack.argo_url, eks_stack.jhub_url) 16 | Tags.of(eks_stack).add('project', 'sqlbasedetl') 17 | Tags.of(cf_nested_stack).add('project', 'sqlbasedetl') 18 | # Deployment Output 19 | CfnOutput(eks_stack,'CODE_BUCKET', value=eks_stack.code_bucket) 20 | CfnOutput(eks_stack,'ARGO_URL', value='https://'+ cf_nested_stack.argo_cf) 21 | CfnOutput(eks_stack,'JUPYTER_URL', value='https://'+ cf_nested_stack.jhub_cf) 22 | 23 | app.synth() -------------------------------------------------------------------------------- /source/app_resources/alb-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - iam:CreateServiceLinkedRole 4 | Resource: "*" 5 | Condition: 6 | StringEquals: 7 | iam:AWSServiceName: elasticloadbalancing.amazonaws.com 8 | - Effect: Allow 9 | Action: 10 | - ec2:DescribeAccountAttributes 11 | - ec2:DescribeAddresses 12 | - ec2:DescribeAvailabilityZones 13 | - ec2:DescribeInternetGateways 14 | - ec2:DescribeVpcs 15 | - ec2:DescribeVpcPeeringConnections 16 | - ec2:DescribeSubnets 17 | - ec2:DescribeSecurityGroups 18 | - ec2:DescribeInstances 19 | - ec2:DescribeNetworkInterfaces 20 | - ec2:DescribeTags 21 | - ec2:GetCoipPoolUsage 22 | - ec2:DescribeCoipPools 23 | - elasticloadbalancing:DescribeLoadBalancers 24 | - elasticloadbalancing:DescribeLoadBalancerAttributes 25 | - elasticloadbalancing:DescribeListeners 26 | - elasticloadbalancing:DescribeListenerCertificates 27 | - elasticloadbalancing:DescribeSSLPolicies 28 | - elasticloadbalancing:DescribeRules 29 | - elasticloadbalancing:DescribeTargetGroups 30 | - elasticloadbalancing:DescribeTargetGroupAttributes 31 | - elasticloadbalancing:DescribeTargetHealth 32 | - elasticloadbalancing:DescribeTags 33 | Resource: "*" 34 | - Effect: Allow 35 | Action: 36 | - cognito-idp:DescribeUserPoolClient 37 | - acm:ListCertificates 38 | - acm:DescribeCertificate 39 | - iam:ListServerCertificates 40 | - iam:GetServerCertificate 41 | - waf-regional:GetWebACL 42 | - waf-regional:GetWebACLForResource 43 | - waf-regional:AssociateWebACL 44 | - waf-regional:DisassociateWebACL 45 | - wafv2:GetWebACL 46 | - 
wafv2:GetWebACLForResource 47 | - wafv2:AssociateWebACL 48 | - wafv2:DisassociateWebACL 49 | - shield:GetSubscriptionState 50 | - shield:DescribeProtection 51 | - shield:CreateProtection 52 | - shield:DeleteProtection 53 | Resource: "*" 54 | - Effect: Allow 55 | Action: 56 | - ec2:AuthorizeSecurityGroupIngress 57 | - ec2:RevokeSecurityGroupIngress 58 | Resource: "*" 59 | - Effect: Allow 60 | Action: 61 | - ec2:CreateSecurityGroup 62 | Resource: "*" 63 | - Effect: Allow 64 | Action: 65 | - ec2:CreateTags 66 | Resource: arn:aws:ec2:*:*:security-group/* 67 | Condition: 68 | StringEquals: 69 | ec2:CreateAction: CreateSecurityGroup 70 | 'Null': 71 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 72 | - Effect: Allow 73 | Action: 74 | - ec2:CreateTags 75 | - ec2:DeleteTags 76 | Resource: arn:aws:ec2:*:*:security-group/* 77 | Condition: 78 | 'Null': 79 | aws:RequestTag/elbv2.k8s.aws/cluster: 'true' 80 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 81 | - Effect: Allow 82 | Action: 83 | - ec2:AuthorizeSecurityGroupIngress 84 | - ec2:RevokeSecurityGroupIngress 85 | - ec2:DeleteSecurityGroup 86 | Resource: "*" 87 | Condition: 88 | 'Null': 89 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 90 | - Effect: Allow 91 | Action: 92 | - elasticloadbalancing:CreateLoadBalancer 93 | - elasticloadbalancing:CreateTargetGroup 94 | Resource: "*" 95 | Condition: 96 | 'Null': 97 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 98 | - Effect: Allow 99 | Action: 100 | - elasticloadbalancing:CreateListener 101 | - elasticloadbalancing:DeleteListener 102 | - elasticloadbalancing:CreateRule 103 | - elasticloadbalancing:DeleteRule 104 | Resource: "*" 105 | - Effect: Allow 106 | Action: 107 | - elasticloadbalancing:AddTags 108 | - elasticloadbalancing:RemoveTags 109 | Resource: 110 | - arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 111 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/* 112 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/* 113 | Condition: 114 | 'Null': 115 | aws:RequestTag/elbv2.k8s.aws/cluster: 'true' 116 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 117 | - Effect: Allow 118 | Action: 119 | - elasticloadbalancing:AddTags 120 | - elasticloadbalancing:RemoveTags 121 | Resource: 122 | - arn:aws:elasticloadbalancing:*:*:listener/net/*/*/* 123 | - arn:aws:elasticloadbalancing:*:*:listener/app/*/*/* 124 | - arn:aws:elasticloadbalancing:*:*:listener-rule/net/*/*/* 125 | - arn:aws:elasticloadbalancing:*:*:listener-rule/app/*/*/* 126 | - Effect: Allow 127 | Action: 128 | - elasticloadbalancing:ModifyLoadBalancerAttributes 129 | - elasticloadbalancing:SetIpAddressType 130 | - elasticloadbalancing:SetSecurityGroups 131 | - elasticloadbalancing:SetSubnets 132 | - elasticloadbalancing:DeleteLoadBalancer 133 | - elasticloadbalancing:ModifyTargetGroup 134 | - elasticloadbalancing:ModifyTargetGroupAttributes 135 | - elasticloadbalancing:DeleteTargetGroup 136 | Resource: "*" 137 | Condition: 138 | 'Null': 139 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 140 | - Effect: Allow 141 | Action: 142 | - elasticloadbalancing:AddTags 143 | Resource: 144 | - arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 145 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/* 146 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/* 147 | Condition: 148 | StringEquals: 149 | elasticloadbalancing:CreateAction: 150 | - CreateTargetGroup 151 | - CreateLoadBalancer 152 | 'Null': 153 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 154 | - Effect: Allow 155 | Action: 156 | - elasticloadbalancing:RegisterTargets 157 
| - elasticloadbalancing:DeregisterTargets 158 | Resource: arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 159 | - Effect: Allow 160 | Action: 161 | - elasticloadbalancing:SetWebAcl 162 | - elasticloadbalancing:ModifyListener 163 | - elasticloadbalancing:AddListenerCertificates 164 | - elasticloadbalancing:RemoveListenerCertificates 165 | - elasticloadbalancing:ModifyRule 166 | Resource: "*" -------------------------------------------------------------------------------- /source/app_resources/alb-values.yaml: -------------------------------------------------------------------------------- 1 | # image: 2 | # tag: v2.2.0 3 | region: {{region_name}} 4 | vpcId: {{vpc_id}} 5 | clusterName: {{cluster_name}} 6 | serviceAccount: 7 | create: false 8 | name: alb-aws-load-balancer-controller 9 | nodeSelector: 10 | eks.amazonaws.com/capacityType: ON_DEMAND -------------------------------------------------------------------------------- /source/app_resources/argo-values.yaml: -------------------------------------------------------------------------------- 1 | controller: 2 | workflowNamespaces: 3 | - argo 4 | nodeSelector: 5 | eks.amazonaws.com/capacityType: ON_DEMAND 6 | init: 7 | serviceAccount: arcjob 8 | workflow: 9 | namespace: spark 10 | serviceAccount: 11 | create: false 12 | name: arcjob 13 | server: 14 | nodeSelector: 15 | eks.amazonaws.com/capacityType: ON_DEMAND 16 | extraArgs: 17 | - --auth-mode 18 | - client 19 | ingress: 20 | enabled: true 21 | annotations: 22 | kubernetes.io/ingress.class: alb 23 | alb.ingress.kubernetes.io/scheme: internet-facing 24 | alb.ingress.kubernetes.io/target-type: ip 25 | alb.ingress.kubernetes.io/success-codes: 200,301,302 26 | alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 2746}]' 27 | alb.ingress.kubernetes.io/manage-backend-security-group-rules: "true" 28 | alb.ingress.kubernetes.io/security-groups: {{INBOUND_SG}} -------------------------------------------------------------------------------- /source/app_resources/autoscaler-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - autoscaling:DescribeAutoScalingGroups 4 | - autoscaling:DescribeAutoScalingInstances 5 | - autoscaling:DescribeLaunchConfigurations 6 | - autoscaling:DescribeTags 7 | - autoscaling:SetDesiredCapacity 8 | - autoscaling:TerminateInstanceInAutoScalingGroup 9 | - ec2:DescribeLaunchTemplateVersions 10 | Resource: 11 | - "*" 12 | -------------------------------------------------------------------------------- /source/app_resources/autoscaler-values.yaml: -------------------------------------------------------------------------------- 1 | autoDiscovery: 2 | clusterName: {{cluster_name}} 3 | awsRegion: {{region_name}} 4 | image: 5 | tag: v1.27.3 6 | nodeSelector: 7 | eks.amazonaws.com/capacityType: ON_DEMAND 8 | podAnnotations: 9 | cluster-autoscaler.kubernetes.io/safe-to-evict: 'false' 10 | extraArgs: 11 | skip-nodes-with-system-pods: false 12 | scale-down-unneeded-time: 2m 13 | scale-down-unready-time: 5m 14 | rbac: 15 | serviceAccount: 16 | create: false 17 | name: cluster-autoscaler 18 | 19 | -------------------------------------------------------------------------------- /source/app_resources/etl-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - s3:ListBucket 4 | - s3:GetBucketLocation 5 | Resource: 6 | - arn:aws:s3:::{{codeBucket}} 7 | - arn:aws:s3:::{{datalakeBucket}} 8 | - arn:aws:s3:::nyc-tlc 9 | - Effect: Allow 10 
| Action: 11 | - s3:PutObject 12 | - s3:GetObject 13 | Resource: 14 | - arn:aws:s3:::{{codeBucket}}/* 15 | - arn:aws:s3:::{{datalakeBucket}}/* 16 | - arn:aws:s3:::nyc-tlc/* 17 | - Effect: Allow 18 | Action: 19 | - s3:DeleteObject 20 | Resource: 21 | - arn:aws:s3:::{{codeBucket}}/* 22 | - arn:aws:s3:::{{datalakeBucket}}/* 23 | - Effect: Allow 24 | Action: 25 | - kms:Decrypt 26 | - kms:Encrypt 27 | - kms:GenerateDataKey* 28 | - athena:StartQueryExecution 29 | - athena:GetQueryExecution 30 | - athena:GetQueryResults 31 | - athena:GetQueryResultsStream 32 | - athena:GetWorkGroup 33 | - athena:ListDataCatalogs 34 | - glue:CreateTable 35 | - glue:CreateDatabase 36 | - glue:CreatePartition 37 | - glue:UpdatePartition 38 | - glue:UpdateTable 39 | - glue:GetTable 40 | - glue:GetDatabases 41 | - glue:GetCatalogImportStatus 42 | Resource: 43 | - '*' -------------------------------------------------------------------------------- /source/app_resources/etl-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: Role 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: etl-workflow-role 5 | namespace: spark 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods","pods/exec","configmaps","services"] 9 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 10 | - apiGroups: ["batch", "extensions"] 11 | resources: ["jobs"] 12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 13 | - apiGroups: [""] 14 | resources: ["events","pods/log","serviceaccounts", "secrets","endpoints"] 15 | verbs: ["list", "get", "watch"] 16 | - apiGroups: [""] 17 | resources: ["persistentvolumeclaims"] 18 | verbs: ["create", "delete", "get", "list"] 19 | - apiGroups: ["argoproj.io"] 20 | resources: ["workflows","workflows/finalizers"] 21 | verbs: ["*"] 22 | - apiGroups: ["argoproj.io"] 23 | resources: ["workflowtemplates","workflowtemplates/finalizers"] 24 | verbs: ["get", "list", "watch"] 25 | 26 | 27 | --- 28 | kind: RoleBinding 29 | apiVersion: rbac.authorization.k8s.io/v1 30 | metadata: 31 | name: {{MY_SA}}-role-binding 32 | namespace: spark 33 | subjects: 34 | - kind: ServiceAccount 35 | name: {{MY_SA}} 36 | namespace: spark 37 | roleRef: 38 | kind: Role 39 | name: etl-workflow-role 40 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /source/app_resources/ex-secret-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - secretsmanager:GetResourcePolicy 4 | - secretsmanager:GetSecretValue 5 | - secretsmanager:DescribeSecret 6 | - secretsmanager:ListSecretVersionIds 7 | Resource: {{secretsmanager}} 8 | - Effect: Allow 9 | Action: 10 | - secretsmanager:GetRandomPassword 11 | - secretsmanager:ListSecrets 12 | - kms:Decrypt 13 | - kms:Encrypt 14 | Resource: 15 | - "*" -------------------------------------------------------------------------------- /source/app_resources/ex-secret-values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | tag: 7.2.0 3 | env: 4 | AWS_REGION: {{region_name}} 5 | AWS_DEFAULT_REGION: {{region_name}} 6 | serviceAccount: 7 | create: false 8 | name: external-secrets-controller 9 | securityContext: 10 | fsGroup: 65534 11 | -------------------------------------------------------------------------------- /source/app_resources/jupyter-config.yaml: 
-------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: {{MY_SA}}-role-binding 5 | namespace: jupyter 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{MY_SA}} 9 | namespace: jupyter 10 | roleRef: 11 | kind: Role 12 | name: hub 13 | apiGroup: rbac.authorization.k8s.io 14 | 15 | --- 16 | apiVersion: networking.k8s.io/v1 17 | kind: Ingress 18 | metadata: 19 | name: jupyterhub 20 | namespace: jupyter 21 | annotations: 22 | kubernetes.io/ingress.class: alb 23 | alb.ingress.kubernetes.io/scheme: internet-facing 24 | alb.ingress.kubernetes.io/target-type: ip 25 | alb.ingress.kubernetes.io/success-codes: 200,301,302 26 | alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]' 27 | alb.ingress.kubernetes.io/manage-backend-security-group-rules: "true" 28 | alb.ingress.kubernetes.io/security-groups: {{INBOUND_SG}} 29 | labels: 30 | app: jupyterhub 31 | spec: 32 | rules: 33 | - host: "" 34 | http: 35 | paths: 36 | - path: / 37 | pathType: Prefix 38 | backend: 39 | service: 40 | name: proxy-public 41 | port: 42 | number: 80 43 | 44 | --- 45 | apiVersion: kubernetes-client.io/v1 46 | kind: ExternalSecret 47 | metadata: 48 | name: jupyter-external-secret 49 | namespace: jupyter 50 | spec: 51 | backendType: secretsManager 52 | region: {{REGION}} 53 | data: 54 | - key: {{SECRET_NAME}} 55 | name: password 56 | property: password -------------------------------------------------------------------------------- /source/app_resources/jupyter-values.yaml: -------------------------------------------------------------------------------- 1 | hub: 2 | db: 3 | type: sqlite-memory 4 | extraConfig: 5 | overrideServiceAccount: | 6 | import os, sys 7 | 8 | c.JupyterHub.authenticator_class = 'jupyterhub.auth.DummyAuthenticator' 9 | c.DummyAuthenticator.password = os.environ['LOGIN'] 10 | c.Authenticator.admin_users = {"service-admin"} 11 | c.JupyterHub.service_tokens = { 12 | "secret-token": "service-admin", 13 | } 14 | # this script allows serviceAccountName to use dynamic naming based on {unescaped_username}" 15 | async def override_service_account_hook(kube_spawner): 16 | if kube_spawner.service_account is not None: 17 | kube_spawner.service_account = kube_spawner._expand_user_properties(kube_spawner.service_account) 18 | kube_spawner.env['USER_NAME'] = kube_spawner._expand_user_properties("{unescaped_username}") 19 | print("kube_spawner.service_account = " + kube_spawner.service_account) 20 | c.KubeSpawner.pre_spawn_hook = override_service_account_hook 21 | 22 | # setup timeout 23 | c.JupyterHub.cookie_max_age_days = 0.0105 24 | c.Authenticator.refresh_pre_spawn = True 25 | 26 | extraEnv: 27 | - name: LOGIN 28 | valueFrom: 29 | secretKeyRef: 30 | name: jupyter-external-secret 31 | key: password 32 | nodeSelector: 33 | lifecycle: OnDemand 34 | readinessProbe: 35 | initialDelaySeconds: 30 36 | periodSeconds: 10 37 | 38 | proxy: 39 | secretToken: "*****" 40 | service: 41 | type: ClusterIP 42 | chp: 43 | nodeSelector: 44 | lifecycle: OnDemand 45 | 46 | singleuser: 47 | defaultUrl: "/lab" 48 | nodeSelector: 49 | lifecycle: OnDemand 50 | image: 51 | name: ghcr.io/tripl-ai/arc-jupyter 52 | tag: arc-jupyter_3.16.0_scala_2.12_hadoop_3.3.2_3.16.0_slim 53 | pullPolicy: Always 54 | lifecycleHooks: 55 | postStart: 56 | exec: 57 | command: 58 | - "bash" 59 | - "-c" 60 | - > 61 | cp -r /opt/.jupyter $HOME/.jupyter; 62 | echo "git clone https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks"; 63 | 
git clone https://github.com/awslabs/sql-based-etl-with-apache-spark-on-amazon-eks; 64 | 65 | serviceAccountName: "{username}" 66 | cpu: 67 | guarantee: 0.25 68 | limit: 0.5 69 | memory: 70 | guarantee: 4G 71 | limit: 4G 72 | extraEnv: 73 | CONF_ALLOW_EXPORT: "true" 74 | JAVA_OPTS: -Xmx4G 75 | ETL_CONF_DATALAKE_LOC: {{codeBucket}} 76 | ETL_CONF_AWS_REGION: {{region}} 77 | conf_spark_hadoop_fs_s3a_aws_credentials_provider: com.amazonaws.auth.WebIdentityTokenCredentialsProvider 78 | storage: 79 | type: none 80 | # storage: 81 | # type: dynamic 82 | # capacity: 10G 83 | # homeMountPath: '/home/{username}/data' 84 | # # mount to EBS 85 | # dynamic: 86 | # storageClass: gp2 87 | profileList: 88 | - default: True 89 | display_name: "Small (default): Arc-Jupyter Development Environment" 90 | description: "4GB Memory & 1vCPUs" 91 | kubespawner_override: 92 | cpu_guarantee: 0.5 93 | cpu_limit: 1 94 | mem_guarantee: 4G 95 | mem_limit: 10G 96 | - display_name: "Big Arc-Jupyter Development Environment" 97 | description: "15GB Memory & 2vCPUs" 98 | kubespawner_override: 99 | cpu_guarantee: 0.5 100 | cpu_limit: 2 101 | mem_guarantee: 10G 102 | mem_limit: 15G 103 | 104 | prePuller: 105 | hook: 106 | enabled: false 107 | 108 | # autoscacling setting 109 | scheduling: 110 | userScheduler: 111 | enabled: false 112 | cull: 113 | timeout: 1800 114 | # debug: 115 | # enabled: true -------------------------------------------------------------------------------- /source/app_resources/native-spark-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: s3:ListBucket 3 | Resource: 4 | - arn:aws:s3:::{{codeBucket}} 5 | - arn:aws:s3:::{{datalakeBucket}} 6 | - arn:aws:s3:::nyc-tlc 7 | - Effect: Allow 8 | Action: 9 | - s3:PutObject 10 | - s3:GetObject 11 | Resource: 12 | - arn:aws:s3:::{{codeBucket}}/* 13 | - arn:aws:s3:::{{datalakeBucket}}/* 14 | - arn:aws:s3:::nyc-tlc/* 15 | - Effect: Allow 16 | Action: 17 | - s3:DeleteObject 18 | Resource: 19 | - arn:aws:s3:::{{codeBucket}}/* 20 | - Effect: Allow 21 | Action: 22 | - kms:Encrypt 23 | - kms:Decrypt 24 | - kms:GenerateDataKey* 25 | - kms:DescribeKey 26 | Resource: 27 | - '*' -------------------------------------------------------------------------------- /source/app_resources/native-spark-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: RoleBinding 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: {{MY_SA}}-role-binding 5 | namespace: spark 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{MY_SA}} 9 | namespace: spark 10 | roleRef: 11 | kind: Role 12 | name: etl-workflow-role 13 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /source/app_resources/spark-operator-values.yaml: -------------------------------------------------------------------------------- 1 | nodeSelector: 2 | # spark operator only works with non-graviton CPU 3 | kubernetes.io/arch: amd64 4 | serviceAccounts: 5 | spark: 6 | create: false 7 | sparkoperator: 8 | create: true 9 | metrics: 10 | # -- Disable prometheus metric scraping 11 | enable: false 12 | webhook: 13 | enable: true 14 | port: 443 -------------------------------------------------------------------------------- /source/app_resources/spark-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: WorkflowTemplate 3 | metadata: 4 | name: 
spark-template 5 | namespace: spark 6 | spec: 7 | templates: 8 | - name: smalljob 9 | retryStrategy: 10 | limit: 3 11 | retryPolicy: "Always" 12 | inputs: 13 | # override defaults here 14 | parameters: 15 | - name: jobId 16 | - name: configUri 17 | - name: image 18 | value: ghcr.io/tripl-ai/arc:latest 19 | - name: pullPolicy 20 | value: "Always" 21 | - name: executorInstances 22 | value: "1" 23 | - name: executorCores 24 | value: "1" 25 | - name: executorMemory 26 | value: "1" 27 | - name: sparkConf 28 | value: "" 29 | - name: tags 30 | value: "" 31 | - name: parameters 32 | value: "" 33 | # to exec each stages at a jupyter notebook, we can controle it by matching the environment. Some stages may not required in prod env. 34 | - name: environment 35 | value: test 36 | metadata: 37 | labels: 38 | app: spark 39 | workflowId: "{{workflow.uid}}" 40 | script: 41 | resources: 42 | limits: 43 | cpu: "1" 44 | memory: "1Gi" 45 | image: "{{inputs.parameters.image}}" 46 | command: ["/bin/sh"] 47 | source: | 48 | # verbose logging 49 | set -ex 50 | 51 | # print current hostname and ip 52 | hostname 53 | hostname -I 54 | 55 | # submit job 56 | /opt/spark/bin/spark-submit \ 57 | --master k8s://kubernetes.default.svc:443 \ 58 | --deploy-mode client \ 59 | --class ai.tripl.arc.ARC \ 60 | --name arc \ 61 | --conf spark.authenticate=true \ 62 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 63 | --conf spark.driver.host=$(hostname -I) \ 64 | --conf spark.driver.memory=921m \ 65 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 66 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 67 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 68 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 69 | --conf spark.io.encryption.enabled=true \ 70 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 71 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 72 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 73 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 74 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 75 | --conf spark.kubernetes.driver.limit.cores=1 \ 76 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 77 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 78 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 79 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 80 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 81 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 82 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 83 | --conf spark.network.crypto.enabled=true \ 84 | --conf spark.sql.ansi.enabled=true \ 85 | --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider \ 86 | {{inputs.parameters.sparkConf}} \ 87 | local:///opt/spark/jars/arc.jar \ 88 | --etl.config.uri={{inputs.parameters.configUri}} \ 89 | --etl.config.job.id={{inputs.parameters.jobId}} \ 90 | --etl.config.environment={{inputs.parameters.environment}} \ 91 | --etl.config.ignoreEnvironments=false \ 92 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 93 | 
--ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 94 | {{inputs.parameters.parameters}} 95 | 96 | - name: mediumjob 97 | retryStrategy: 98 | limit: 3 99 | retryPolicy: "Always" 100 | inputs: 101 | # override defaults here 102 | parameters: 103 | - name: jobId 104 | - name: configUri 105 | - name: image 106 | value: ghcr.io/tripl-ai/arc:latest 107 | - name: pullPolicy 108 | value: "Always" 109 | - name: executorInstances 110 | value: "2" 111 | - name: executorCores 112 | value: "2" 113 | - name: executorMemory 114 | value: "10" 115 | - name: sparkConf 116 | value: "" 117 | - name: tags 118 | value: "" 119 | - name: parameters 120 | value: "" 121 | # to exec each stages at a jupyter notebook, we can controle it by matching the environment. Some stages may not required in prod env. 122 | - name: environment 123 | value: test 124 | metadata: 125 | labels: 126 | app: spark 127 | workflowId: "{{workflow.uid}}" 128 | script: 129 | resources: 130 | limits: 131 | cpu: "2" 132 | memory: "13Gi" 133 | image: "{{inputs.parameters.image}}" 134 | command: ["/bin/sh"] 135 | source: | 136 | # verbose logging 137 | set -ex 138 | 139 | # print current hostname and ip 140 | hostname 141 | hostname -I 142 | 143 | # submit job 144 | /opt/spark/bin/spark-submit \ 145 | --master k8s://kubernetes.default.svc:443 \ 146 | --deploy-mode client \ 147 | --class ai.tripl.arc.ARC \ 148 | --name arc \ 149 | --conf spark.authenticate=true \ 150 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 151 | --conf spark.driver.host=$(hostname -I) \ 152 | --conf spark.driver.memory=2g \ 153 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 154 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 155 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 156 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 157 | --conf spark.io.encryption.enabled=true \ 158 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 159 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 160 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 161 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 162 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 163 | --conf spark.kubernetes.driver.limit.cores=1 \ 164 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 165 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 166 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 167 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 168 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 169 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 170 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 171 | --conf spark.network.crypto.enabled=true \ 172 | --conf spark.sql.ansi.enabled=true \ 173 | --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider \ 174 | {{inputs.parameters.sparkConf}} \ 175 | local:///opt/spark/jars/arc.jar \ 176 | --etl.config.uri={{inputs.parameters.configUri}} \ 177 | --etl.config.job.id={{inputs.parameters.jobId}} \ 178 | --etl.config.environment={{inputs.parameters.environment}} \ 179 | --etl.config.ignoreEnvironments=false \ 180 | 
--etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 181 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 182 | {{inputs.parameters.parameters}} 183 | 184 | - name: largejob 185 | retryStrategy: 186 | limit: 3 187 | retryPolicy: "Always" 188 | inputs: 189 | # override defaults here 190 | parameters: 191 | - name: jobId 192 | - name: configUri 193 | - name: image 194 | value: ghcr.io/tripl-ai/arc:latest 195 | - name: pullPolicy 196 | value: "Always" 197 | - name: executorInstances 198 | value: "3" 199 | - name: executorCores 200 | value: "2" 201 | - name: executorMemory 202 | value: "12" 203 | - name: sparkConf 204 | value: "" 205 | - name: tags 206 | value: "" 207 | - name: parameters 208 | value: "" 209 | # to exec each stages at a jupyter notebook, we can controle it by matching the environment. Some stages may not required in prod env. 210 | - name: environment 211 | value: test 212 | metadata: 213 | labels: 214 | app: spark 215 | workflowId: "{{workflow.uid}}" 216 | script: 217 | resources: 218 | limits: 219 | cpu: "3" 220 | memory: "13Gi" 221 | image: "{{inputs.parameters.image}}" 222 | command: ["/bin/sh"] 223 | source: | 224 | # verbose logging 225 | set -ex 226 | 227 | # print current hostname and ip 228 | hostname 229 | hostname -I 230 | 231 | # submit job 232 | /opt/spark/bin/spark-submit \ 233 | --master k8s://kubernetes.default.svc:443 \ 234 | --deploy-mode client \ 235 | --class ai.tripl.arc.ARC \ 236 | --name arc \ 237 | --conf spark.authenticate=true \ 238 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 239 | --conf spark.driver.host=$(hostname -I) \ 240 | --conf spark.driver.memory=4g \ 241 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 242 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 243 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 244 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 245 | --conf spark.io.encryption.enabled=true \ 246 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 247 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 248 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 249 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 250 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 251 | --conf spark.kubernetes.driver.limit.cores=1 \ 252 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 253 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 254 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 255 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 256 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 257 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 258 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 259 | --conf spark.network.crypto.enabled=true \ 260 | --conf spark.sql.ansi.enabled=true \ 261 | --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider \ 262 | {{inputs.parameters.sparkConf}} \ 263 | local:///opt/spark/jars/arc.jar \ 264 | --etl.config.uri={{inputs.parameters.configUri}} \ 265 | 
--etl.config.job.id={{inputs.parameters.jobId}} \ 266 | --etl.config.environment={{inputs.parameters.environment}} \ 267 | --etl.config.ignoreEnvironments=false \ 268 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 269 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 270 | {{inputs.parameters.parameters}} 271 | 272 | - name: sparklocal 273 | retryStrategy: 274 | limit: 3 275 | retryPolicy: "Always" 276 | inputs: 277 | # override defaults here 278 | parameters: 279 | - name: jobId 280 | - name: configUri 281 | - name: image 282 | value: ghcr.io/tripl-ai/arc:latest 283 | - name: executorInstances 284 | value: "1" 285 | - name: executorCores 286 | value: "1" 287 | - name: executorMemory 288 | value: "1" 289 | - name: sparkConf 290 | value: "" 291 | - name: tags 292 | value: "" 293 | - name: parameters 294 | value: "" 295 | - name: pullPolicy 296 | value: IfNotPresent 297 | - name: environment 298 | value: test 299 | metadata: 300 | labels: 301 | app: spark 302 | workflowId: "{{workflow.uid}}" 303 | podSpecPatch: | 304 | containers: 305 | - name: main 306 | resources: 307 | requests: 308 | cpu: "{{inputs.parameters.executorCores}}" 309 | memory: "{{inputs.parameters.executorMemory}}Gi" 310 | script: 311 | image: "{{inputs.parameters.image}}" 312 | command: ["/bin/sh"] 313 | source: | 314 | # verbose logging 315 | set -ex 316 | 317 | # print current hostname and ip 318 | hostname 319 | hostname -I 320 | 321 | # submit job 322 | # driver memory is set at 90% of executorMemory 323 | /opt/spark/bin/spark-submit \ 324 | --master local[{{inputs.parameters.executorCores}}] \ 325 | --driver-memory $(({{inputs.parameters.executorMemory}} * 1024 * 90/100))m \ 326 | --driver-java-options "-XX:+UseG1GC" \ 327 | --class ai.tripl.arc.ARC \ 328 | --name arc \ 329 | --conf spark.driver.host=$(hostname -I) \ 330 | --conf spark.driver.pod.name=$(hostname)-driver \ 331 | --conf spark.io.encryption.enabled=true \ 332 | --conf spark.sql.adaptive.enabled=true \ 333 | --conf spark.network.crypto.enabled=true \ 334 | --conf spark.ui.enabled=true \ 335 | --conf spark.sql.ansi.enabled=true \ 336 | --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider \ 337 | {{inputs.parameters.sparkConf}} \ 338 | local:///opt/spark/jars/arc.jar \ 339 | --etl.config.uri={{inputs.parameters.configUri}} \ 340 | --etl.config.job.id={{inputs.parameters.jobId}} \ 341 | --etl.config.environment={{inputs.parameters.environment}} \ 342 | --etl.config.ignoreEnvironments=false \ 343 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 344 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 345 | {{inputs.parameters.parameters}} -------------------------------------------------------------------------------- /source/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | "cluster_name": "spark-on-eks", 5 | "solution_id": "SO0141", 6 | "solution_name": "sql-based-etl-with-apache-spark-on-amazon-eks", 7 | "version": "v2.0.0", 8 | "@aws-cdk/core:stackRelativeExports": true, 9 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false 10 
| } 11 | } 12 | -------------------------------------------------------------------------------- /source/example/native-spark-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: word-count 5 | namespace: spark 6 | spec: 7 | type: Python 8 | pythonVersion: "3" 9 | mode: cluster 10 | image: {{ECR_URL}} 11 | imagePullPolicy: Always 12 | mainApplicationFile: "s3a://$(BUCKET_PARAM)/app_code/job/wordcount.py" 13 | arguments: ["s3a://nyc-tlc/csv_backup/yellow_tripdata*.csv","s3a://$(BUCKET_PARAM)/app_code/output/native"] 14 | sparkVersion: "3.3.4" 15 | sparkConf: 16 | # By design, the graviton EKS nodegroup is in a single AZ 17 | # use the nodegroup label to trigger the scaling of Graviton instances within a single AZ 18 | # "spark.kubernetes.node.selector.nodegroup": "single-az-graviton" 19 | "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" 20 | "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" 21 | "spark.kubernetes.allocation.batch.size": "15" 22 | "spark.io.encryption.enabled": "true" 23 | "spark.kubernetes.local.dirs.tmpfs": "true" 24 | volumes: 25 | - name: spark-local-dir-1 26 | hostPath: 27 | path: "/tmp" 28 | type: Directory 29 | dynamicAllocation: 30 | enabled: true 31 | initialExecutors: 1 32 | minExecutors: 1 33 | maxExecutors: 20 34 | restartPolicy: 35 | type: OnFailure 36 | onFailureRetries: 3 37 | onFailureRetryInterval: 10 38 | onSubmissionFailureRetries: 5 39 | onSubmissionFailureRetryInterval: 5 40 | driver: 41 | # schedule on spot to test the driver restart 42 | affinity: 43 | nodeAffinity: 44 | requiredDuringSchedulingIgnoredDuringExecution: 45 | nodeSelectorTerms: 46 | - matchExpressions: 47 | - key: lifecycle 48 | operator: In 49 | values: 50 | - Ec2Spot 51 | env: 52 | - name: BUCKET_PARAM 53 | valueFrom: 54 | configMapKeyRef: 55 | name: special-config 56 | key: codeBucket 57 | cores: 1 58 | memory: "1G" 59 | labels: 60 | role: driver 61 | serviceAccount: nativejob 62 | volumeMounts: 63 | - name: spark-local-dir-1 64 | mountPath: "/tmp" 65 | executor: 66 | # start executors on Spot 67 | affinity: 68 | nodeAffinity: 69 | requiredDuringSchedulingIgnoredDuringExecution: 70 | nodeSelectorTerms: 71 | - matchExpressions: 72 | - key: lifecycle 73 | operator: In 74 | values: 75 | - Ec2Spot 76 | cores: 1 77 | memory: "4G" 78 | labels: 79 | role: executor 80 | volumeMounts: 81 | - name: spark-local-dir-1 82 | mountPath: "/tmp" 83 | -------------------------------------------------------------------------------- /source/example/notebook/nyctaxi-job.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf \n", 10 | "numRows=5" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": null, 16 | "metadata": {}, 17 | "outputs": [], 18 | "source": [ 19 | "%env \n", 20 | "ETL_CONF_DATA_URL=s3a://nyc-tlc/csv_backup\n", 21 | "ETL_CONF_JOB_URL=https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": null, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "{\n", 31 | " \"type\": \"DelimitedExtract\",\n", 32 | " \"name\": \"extract data from green_tripdata schema 0\",\n", 33 | " 
\"environments\": [\"production\", \"test\"],\n", 34 | " \"inputURI\": ${ETL_CONF_DATA_URL}\"/green_tripdata_2013-08.csv\",\n", 35 | " \"outputView\": \"green_tripdata0_raw\", \n", 36 | " \"delimiter\": \"Comma\",\n", 37 | " \"quote\" : \"DoubleQuote\",\n", 38 | " \"header\": true,\n", 39 | " \"persist\": true\n", 40 | "}" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "{\n", 50 | " \"type\": \"TypingTransform\",\n", 51 | " \"name\": \"apply green_tripdata schema 0 data types\",\n", 52 | " \"environments\": [\"production\", \"test\"],\n", 53 | " \"schemaURI\": ${ETL_CONF_JOB_URL}\"/green_tripdata0.json\",\n", 54 | " \"inputView\": \"green_tripdata0_raw\", \n", 55 | " \"outputView\": \"green_tripdata0\"\n", 56 | "}" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": null, 62 | "metadata": {}, 63 | "outputs": [], 64 | "source": [ 65 | "%sqlvalidate name=\"ensure no errors exist after data typing\" environments=production,test\n", 66 | "SELECT\n", 67 | " SUM(error) = 0 AS valid\n", 68 | " ,TO_JSON(\n", 69 | " NAMED_STRUCT(\n", 70 | " 'count', COUNT(error), \n", 71 | " 'errors', SUM(error)\n", 72 | " )\n", 73 | " ) AS message\n", 74 | "FROM (\n", 75 | " SELECT \n", 76 | " CASE \n", 77 | " WHEN SIZE(_errors) > 0 THEN 1 \n", 78 | " ELSE 0 \n", 79 | " END AS error \n", 80 | " FROM green_tripdata0\n", 81 | ") input_table" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "%sql name=\"ensure a query can be executed\" environments=production,test persist=true outputView=green_trip_filtered\n", 91 | "SELECT * \n", 92 | "FROM green_tripdata0\n", 93 | "WHERE store_and_fwd_flag = TRUE" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": null, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [] 102 | } 103 | ], 104 | "metadata": { 105 | "kernelspec": { 106 | "display_name": "Arc", 107 | "language": "javascript", 108 | "name": "arc" 109 | }, 110 | "language_info": { 111 | "file_extension": "arc", 112 | "mimetype": "text/arc", 113 | "name": "arc", 114 | "nbconvert_exporter": "text", 115 | "version": "2.2.0" 116 | } 117 | }, 118 | "nbformat": 4, 119 | "nbformat_minor": 2 120 | } 121 | -------------------------------------------------------------------------------- /source/example/notebook/scd2-job.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf \n", 10 | "numRows=12" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": {}, 16 | "source": [ 17 | "# 1. 
Initial Table Load" 18 | ] 19 | }, 20 | { 21 | "cell_type": "code", 22 | "execution_count": null, 23 | "metadata": {}, 24 | "outputs": [], 25 | "source": [ 26 | "{\n", 27 | " \"type\": \"DelimitedExtract\",\n", 28 | " \"name\": \"extract initial table\",\n", 29 | " \"environments\": [\"dev\", \"test\"],\n", 30 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/initial_contacts.csv\",\n", 31 | " \"outputView\": \"initial_raw\", \n", 32 | " \"delimiter\": \"Comma\",\n", 33 | " \"header\": false,\n", 34 | " \"quote\": \"None\",\n", 35 | " \"authentication\": {\n", 36 | " \"method\": \"AmazonIAM\"\n", 37 | " }\n", 38 | "}" 39 | ] 40 | }, 41 | { 42 | "cell_type": "markdown", 43 | "metadata": {}, 44 | "source": [ 45 | "## 1.2 Check Original Data Schema" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": null, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "%printschema \n", 55 | "initial_raw" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": { 61 | "ExecuteTime": { 62 | "start_time": "2020-03-03T08:30:30.028Z" 63 | } 64 | }, 65 | "source": [ 66 | "## 1.3 Apply Data Type" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": null, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "{\n", 76 | " \"type\": \"TypingTransform\",\n", 77 | " \"name\": \"apply table schema 0\",\n", 78 | " \"environments\": [\"dev\", \"test\"],\n", 79 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 80 | " \"inputView\": \"initial_raw\", \n", 81 | " \"outputView\": \"initial_typed\",\n", 82 | " \"authentication\": {\n", 83 | " \"method\": \"AmazonIAM\"\n", 84 | " }\n", 85 | "}" 86 | ] 87 | }, 88 | { 89 | "cell_type": "markdown", 90 | "metadata": {}, 91 | "source": [ 92 | "## 1.4 Check Typed Data Schema & Stats" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": null, 98 | "metadata": {}, 99 | "outputs": [], 100 | "source": [ 101 | "%printschema \n", 102 | "initial_typed" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## 1.5 Data Quality Control" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": {}, 116 | "outputs": [], 117 | "source": [ 118 | "%sqlvaildate outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=initial_typed\n", 119 | "\n", 120 | "SELECT SUM(error) = 0 AS valid\n", 121 | " ,TO_JSON(\n", 122 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 123 | " ) AS message\n", 124 | "FROM \n", 125 | "(\n", 126 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 127 | " FROM ${inputView}\n", 128 | ") base" 129 | ] 130 | }, 131 | { 132 | "cell_type": "markdown", 133 | "metadata": {}, 134 | "source": [ 135 | "## 1.6 Add Calculated Fields for SCD Type 2\n", 136 | "### CURRENT_TIMESTAMP will be passed in automatically, when the ETL job is triggered" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": null, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "%env\n", 146 | "ETL_CONF_CURRENT_TIMESTAMP=current_timestamp()" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": null, 152 | "metadata": {}, 153 | "outputs": [], 154 | "source": [ 155 | "%sql outputView=\"initial_load\" name=\"add calc field for SCD\" environments=dev,test 
sqlParams=table_name=initial_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 156 | "\n", 157 | "SELECT id,name,email,state, ${now} AS valid_from, CAST(null AS timestamp) AS valid_to\n", 158 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 159 | "FROM ${table_name}" 160 | ] 161 | }, 162 | { 163 | "cell_type": "markdown", 164 | "metadata": {}, 165 | "source": [ 166 | "## 1.7 Initial load to Delta Lake\n", 167 | "### Delta Lake is an optimized data lake to support Time Travel, ACID transaction" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "{\n", 177 | " \"type\": \"DeltaLakeLoad\",\n", 178 | " \"name\": \"Initial load to Data Lake\",\n", 179 | " \"environments\": [\"dev\", \"test\"],\n", 180 | " \"inputView\": \"initial_load\",\n", 181 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact/\",\n", 182 | " \"numPartitions\": 2\n", 183 | " \"saveMode\": \"Overwrite\",\n", 184 | " \"authentication\": {\n", 185 | " \"method\": \"AmazonIAM\"\n", 186 | " }\n", 187 | "}" 188 | ] 189 | }, 190 | { 191 | "cell_type": "markdown", 192 | "metadata": { 193 | "ExecuteTime": { 194 | "end_time": "2020-05-31T04:55:34.761654Z", 195 | "start_time": "2020-05-31T04:55:34.738Z" 196 | } 197 | }, 198 | "source": [ 199 | "# SCD Type2 Implementation" 200 | ] 201 | }, 202 | { 203 | "cell_type": "markdown", 204 | "metadata": { 205 | "ExecuteTime": { 206 | "end_time": "2020-03-18T22:38:05.895407Z", 207 | "start_time": "2020-03-18T22:37:48.160Z" 208 | } 209 | }, 210 | "source": [ 211 | "## 2. Ingest A New Incremental CSV File\n", 212 | "### Look at record 12, the `state` is changed in the file" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "{\n", 222 | " \"type\": \"DelimitedExtract\",\n", 223 | " \"name\": \"extract incremental data\",\n", 224 | " \"environments\": [\"dev\", \"test\"],\n", 225 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/update_contacts.csv\",\n", 226 | " \"outputView\": \"delta_raw\", \n", 227 | " \"delimiter\": \"Comma\",\n", 228 | " \"header\": false,\n", 229 | " \"authentication\": {\n", 230 | " \"method\": \"AmazonIAM\"\n", 231 | " }\n", 232 | "}" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "metadata": {}, 238 | "source": [ 239 | "## 2.1 Apply Data Type (reused schema file)" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": null, 245 | "metadata": {}, 246 | "outputs": [], 247 | "source": [ 248 | "{\n", 249 | " \"type\": \"TypingTransform\",\n", 250 | " \"name\": \"apply table schema 0 to incremental load\",\n", 251 | " \"environments\": [\"dev\", \"test\"],\n", 252 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 253 | " \"inputView\": \"delta_raw\", \n", 254 | " \"outputView\": \"delta_typed\",\n", 255 | " \"authentication\": {\n", 256 | " \"method\": \"AmazonIAM\"\n", 257 | " }\n", 258 | "}" 259 | ] 260 | }, 261 | { 262 | "cell_type": "markdown", 263 | "metadata": { 264 | "ExecuteTime": { 265 | "end_time": "2020-06-07T15:02:50.155313Z", 266 | "start_time": "2020-06-07T15:02:50.125Z" 267 | } 268 | }, 269 | "source": [ 270 | "## 2.2 Data Quality Control (reused sql script)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": null, 276 | "metadata": {}, 277 | "outputs": [], 278 | "source": [ 279 | "%sqlvaildate 
outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=delta_typed\n", 280 | "\n", 281 | "SELECT SUM(error) = 0 AS valid\n", 282 | " ,TO_JSON(\n", 283 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 284 | " ) AS message\n", 285 | "FROM \n", 286 | "(\n", 287 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 288 | " FROM ${inputView}\n", 289 | ") base" 290 | ] 291 | }, 292 | { 293 | "cell_type": "markdown", 294 | "metadata": { 295 | "ExecuteTime": { 296 | "end_time": "2020-05-31T05:01:13.796275Z", 297 | "start_time": "2020-05-31T05:01:13.734Z" 298 | } 299 | }, 300 | "source": [ 301 | "## 2.3 Add Calculated Fields (reused sql script)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": null, 307 | "metadata": {}, 308 | "outputs": [], 309 | "source": [ 310 | "%env\n", 311 | "ETL_CONF_CURRENT_TIMESTAMP=current_timestamp()" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": {}, 318 | "outputs": [], 319 | "source": [ 320 | "%sql outputView=\"update_load\" name=\"add calc field for SCD\" environments=dev,test sqlParams=table_name=delta_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 321 | "\n", 322 | "SELECT id,name,email,state, ${now} AS valid_from, CAST(null AS timestamp) AS valid_to\n", 323 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 324 | "FROM ${table_name}" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": { 330 | "ExecuteTime": { 331 | "end_time": "2020-05-31T05:03:33.741024Z", 332 | "start_time": "2020-05-31T05:03:33.247Z" 333 | } 334 | }, 335 | "source": [ 336 | "## 2.4 Prepare Datasets for SCD Type2 Insert\n", 337 | "\n", 338 | "- Generate extra rows for changed records.\n", 339 | "- The 'null' merge_key means it will be inserted, not update existing records according to the rule in SCD type2" 340 | ] 341 | }, 342 | { 343 | "cell_type": "code", 344 | "execution_count": null, 345 | "metadata": {}, 346 | "outputs": [], 347 | "source": [ 348 | "%sql outputView=\"staged_update\" name=\"generate extra rows for SCD\" environments=dev,test\n", 349 | "\n", 350 | "SELECT NULL AS mergeKey, new.*\n", 351 | "FROM initial_load old\n", 352 | "INNER JOIN update_load new\n", 353 | "ON old.id = new.id\n", 354 | "WHERE old.iscurrent=1\n", 355 | "AND old.checksum<>new.checksum\n", 356 | "\n", 357 | "UNION\n", 358 | "\n", 359 | "SELECT id AS mergeKey, *\n", 360 | "FROM update_load" 361 | ] 362 | }, 363 | { 364 | "cell_type": "markdown", 365 | "metadata": {}, 366 | "source": [ 367 | "## 2.5 Perform the Type 2 SCD" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": null, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "{\n", 377 | " \"type\": \"DeltaLakeExtract\",\n", 378 | " \"name\": \"read initial Delta table\",\n", 379 | " \"description\": \"read initial Delta table\",\n", 380 | " \"environments\": [\n", 381 | " \"dev\",\n", 382 | " \"test\"\n", 383 | " ],\n", 384 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact/\",\n", 385 | " \"outputView\": \"current_snapshot\"\n", 386 | "}" 387 | ] 388 | }, 389 | { 390 | "cell_type": "code", 391 | "execution_count": null, 392 | "metadata": { 393 | "vscode": { 394 | "languageId": "plaintext" 395 | } 396 | }, 397 | "outputs": [], 398 | "source": [ 399 | "%sql name=\"merge into existing contacts table\" environments=dev,test\n", 400 | "\n", 401 | "MERGE 
INTO current_snapshot tgt\n", 402 | "USING staged_update src\n", 403 | "ON tgt.id = src.mergeKey\n", 404 | "WHEN MATCHED AND src.checksum != tgt.checksum AND tgt.iscurrent = 1 THEN \n", 405 | " UPDATE SET \n", 406 | " valid_to = src.valid_from, \n", 407 | " iscurrent = 0\n", 408 | "WHEN NOT MATCHED THEN \n", 409 | " INSERT *" 410 | ] 411 | }, 412 | { 413 | "cell_type": "markdown", 414 | "metadata": {}, 415 | "source": [ 416 | "# 3. Create a Delta Lake table in Athena\n", 417 | "### Build up a Glue data catalog from Athena.We are using token based authentication to access Athena, no more long live credentials is required from secrets manager. " 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": null, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "{\n", 427 | " \"type\": \"JDBCExecute\",\n", 428 | " \"name\": \"Create glue data catalog\",\n", 429 | " \"environments\": [\n", 430 | " \"dev\",\n", 431 | " \"test\"\n", 432 | " ],\n", 433 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/sql/create_table_contact.sql\",\n", 434 | " \"jdbcURL\": \"jdbc:awsathena://AwsRegion=\"${AWS_DEFAULT_REGION}\";S3OutputLocation=s3://\"${ETL_CONF_DATALAKE_LOC}\"/athena-query-result;AwsCredentialsProviderClass=com.amazonaws.auth.WebIdentityTokenCredentialsProvider\",\n", 435 | " \"sqlParams\":{\n", 436 | " \"datalake_loc\": \"'s3://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact\\/'\",\n", 437 | " \"table_name\": \"default.deltalake_contact_jhub\"\n", 438 | " }\n", 439 | "}" 440 | ] 441 | }, 442 | { 443 | "cell_type": "markdown", 444 | "metadata": {}, 445 | "source": [ 446 | "## 3. Query Delta Lake (optional)\n", 447 | "### to skip in a productionized ETL job, use a fake environment `uat`" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": null, 453 | "metadata": {}, 454 | "outputs": [], 455 | "source": [ 456 | "{\n", 457 | " \"type\": \"DeltaLakeExtract\",\n", 458 | " \"name\": \"read contact Delta Lake table\",\n", 459 | " \"description\": \"read contact table\",\n", 460 | " \"environments\": [\n", 461 | " \"uat\"\n", 462 | " ],\n", 463 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact/\",\n", 464 | " \"outputView\": \"contact\"\n", 465 | "}" 466 | ] 467 | }, 468 | { 469 | "cell_type": "markdown", 470 | "metadata": {}, 471 | "source": [ 472 | "## Confirm 92 records are expired" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": null, 478 | "metadata": {}, 479 | "outputs": [], 480 | "source": [ 481 | "%sql outputView=\"expired_count\" name=\"expired_count\" environments=uat\n", 482 | "SELECT count(*) FROM contact WHERE valid_to is not null" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": null, 488 | "metadata": {}, 489 | "outputs": [], 490 | "source": [ 491 | "%metadata \n", 492 | "contact" 493 | ] 494 | }, 495 | { 496 | "cell_type": "markdown", 497 | "metadata": {}, 498 | "source": [ 499 | " ## Confirm we now have 1192 records" 500 | ] 501 | }, 502 | { 503 | "cell_type": "code", 504 | "execution_count": null, 505 | "metadata": {}, 506 | "outputs": [], 507 | "source": [ 508 | "%sql outputView=\"total_count\" name=\"total_count\" environments=uat\n", 509 | "SELECT count(*) FROM contact" 510 | ] 511 | }, 512 | { 513 | "cell_type": "markdown", 514 | "metadata": {}, 515 | "source": [ 516 | "## View one of the changed records" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": null, 522 | "metadata": {}, 523 | 
"outputs": [], 524 | "source": [ 525 | "%sql outputView=\"validate_type2\" name=\"validate_type2\" environments=uat\n", 526 | "SELECT * FROM contact WHERE id=12" 527 | ] 528 | }, 529 | { 530 | "cell_type": "code", 531 | "execution_count": null, 532 | "metadata": {}, 533 | "outputs": [], 534 | "source": [] 535 | } 536 | ], 537 | "metadata": { 538 | "kernelspec": { 539 | "display_name": "Arc", 540 | "language": "javascript", 541 | "name": "arc" 542 | }, 543 | "language_info": { 544 | "codemirror_mode": "javascript", 545 | "file_extension": ".json", 546 | "mimetype": "javascript", 547 | "name": "arc", 548 | "nbconvert_exporter": "arcexport", 549 | "version": "3.13.1" 550 | } 551 | }, 552 | "nbformat": 4, 553 | "nbformat_minor": 4 554 | } 555 | -------------------------------------------------------------------------------- /source/example/nyctaxi-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: nyctaxi-job- 5 | namespace: spark 6 | spec: 7 | serviceAccountName: arcjob 8 | # keep workflows history for 30m 9 | ttlStrategy: 10 | secondsAfterCompletion: 1800 11 | entrypoint: nyctaxi 12 | nodeselector: 13 | kubernetes.io/arch: amd64 14 | templates: 15 | - name: nyctaxi 16 | dag: 17 | tasks: 18 | - name: step1-query 19 | templateRef: 20 | name: spark-template 21 | template: sparklocal 22 | arguments: 23 | parameters: 24 | - name: jobId 25 | value: nyctaxi 26 | - name: tags 27 | value: "project=sqlbasedetl owner=myowner costcenter=66666" 28 | - name: configUri 29 | value: https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes/nyctaxi.ipynb 30 | - name: image 31 | value: ghcr.io/tripl-ai/arc:arc_4.2.0_spark_3.3.4_scala_2.12_hadoop_3.3.2_4.2.1_slim 32 | - name: parameters 33 | value: "--ETL_CONF_DATA_URL=s3a://nyc-tlc/csv_backup \ 34 | --ETL_CONF_JOB_URL=https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes" 35 | -------------------------------------------------------------------------------- /source/example/scd2-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: scd2-job- 5 | namespace: spark 6 | spec: 7 | serviceAccountName: arcjob 8 | entrypoint: scd2-process 9 | nodeselector: 10 | kubernetes.io/arch: amd64 11 | arguments: 12 | parameters: 13 | - name: codeBucket 14 | value: cfn_value 15 | templates: 16 | - name: scd2-process 17 | dag: 18 | tasks: 19 | - name: initial-load 20 | templateRef: 21 | name: spark-template 22 | template: smalljob 23 | arguments: 24 | parameters: 25 | - name: jobId 26 | value: initial-load 27 | - name: image 28 | value: {{ECR_URL}} 29 | - name: configUri 30 | value: "s3a://{{workflow.parameters.codeBucket}}/app_code/job/initial_load.ipynb" 31 | - name: parameters 32 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 33 | - name: delta-load 34 | templateRef: 35 | name: spark-template 36 | template: smalljob 37 | arguments: 38 | parameters: 39 | - name: jobId 40 | value: delta-load 41 | - name: image 42 | value: {{ECR_URL}} 43 | - name: configUri 44 | value: "s3a://{{workflow.parameters.codeBucket}}/app_code/job/delta_load.ipynb" 45 | - name: parameters 46 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 47 | - name: SCD2-merge 48 | dependencies: [initial-load, delta-load] 49 | templateRef: 50 | name: 
spark-template 51 | template: smalljob 52 | arguments: 53 | parameters: 54 | - name: jobId 55 | value: SCD2-merge 56 | - name: image 57 | value: {{ECR_URL}} 58 | - name: configUri 59 | value: "s3a://{{workflow.parameters.codeBucket}}/app_code/job/scd2_merge.ipynb" 60 | - name: parameters 61 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 62 | - name: sparkConf 63 | value: "--conf spark.databricks.delta.merge.repartitionBeforeWrite.enabled=true" 64 | -------------------------------------------------------------------------------- /source/images/00-deploy-to-aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/00-deploy-to-aws.png -------------------------------------------------------------------------------- /source/images/3-argo-job-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/3-argo-job-dependency.png -------------------------------------------------------------------------------- /source/images/3-argo-log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/3-argo-log.png -------------------------------------------------------------------------------- /source/images/4-auto-scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/4-auto-scaling.png -------------------------------------------------------------------------------- /source/images/4-k8s-retry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/4-k8s-retry.png -------------------------------------------------------------------------------- /source/images/4-spot-console.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/4-spot-console.png -------------------------------------------------------------------------------- /source/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/architecture.png -------------------------------------------------------------------------------- /source/images/driver_interruption_test.gif: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/driver_interruption_test.gif -------------------------------------------------------------------------------- /source/images/executor_interruption_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/executor_interruption_test.png -------------------------------------------------------------------------------- /source/images/fake_data.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/fake_data.gif -------------------------------------------------------------------------------- /source/images/run_jupyter.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/run_jupyter.gif -------------------------------------------------------------------------------- /source/images/sql-based-etl-spark-architecture-final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/sql-based-etl-spark-architecture-final.png -------------------------------------------------------------------------------- /source/images/sql-based-etl-with-apache-spark-on-amazon-eks.preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/sql-based-etl-with-apache-spark-on-amazon-eks.preview.png -------------------------------------------------------------------------------- /source/images/submit_job_in_argo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-solutions-library-samples/guidance-for-sql-based-etl-with-apache-spark-on-amazon-eks/06f7302c12ad69851031043068fd34abdb75a7cb/source/images/submit_job_in_argo.gif -------------------------------------------------------------------------------- /source/lib/cdk_infra/eks_base_app.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Aws 2 | from constructs import Construct 3 | from aws_cdk.aws_eks import ICluster, KubernetesManifest 4 | from lib.util.manifest_reader import * 5 | import os 6 | 7 | class EksBaseAppConst(Construct): 8 | @property 9 | def secret_created(self): 10 | return self._ext_secret 11 | 12 | def __init__(self,scope: Construct,id: str,eks_cluster: ICluster, **kwargs,) -> None: 13 | super().__init__(scope, id, **kwargs) 14 | 15 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 16 | 17 | # Add ALB ingress controller to EKS 18 | self._alb = eks_cluster.add_helm_chart('ALBChart', 19 | 
chart='aws-load-balancer-controller', 20 | repository='https://aws.github.io/eks-charts', 21 | release='alb', 22 | version='1.5.5', 23 | create_namespace=False, 24 | namespace='kube-system', 25 | values=load_yaml_replace_var_local(source_dir+'/app_resources/alb-values.yaml', 26 | fields={ 27 | "{{region_name}}": Aws.REGION, 28 | "{{cluster_name}}": eks_cluster.cluster_name, 29 | "{{vpc_id}}": eks_cluster.vpc.vpc_id 30 | } 31 | ) 32 | ) 33 | # Add external secrets controller to EKS 34 | self._ext_secret = eks_cluster.add_helm_chart('SecretContrChart', 35 | chart='kubernetes-external-secrets', 36 | repository='https://external-secrets.github.io/kubernetes-external-secrets/', 37 | release='external-secrets', 38 | create_namespace=False, 39 | namespace='kube-system', 40 | values=load_yaml_replace_var_local(source_dir+'/app_resources/ex-secret-values.yaml', 41 | fields={ 42 | '{{region_name}}': Aws.REGION 43 | } 44 | ) 45 | ) 46 | self._ext_secret.node.add_dependency(self._alb) 47 | # Add Cluster Autoscaler to EKS 48 | _var_mapping = { 49 | "{{region_name}}": Aws.REGION, 50 | "{{cluster_name}}": eks_cluster.cluster_name, 51 | } 52 | eks_cluster.add_helm_chart('ClusterAutoScaler', 53 | chart='cluster-autoscaler', 54 | repository='https://kubernetes.github.io/autoscaler', 55 | release='nodescaler', 56 | create_namespace=False, 57 | namespace='kube-system', 58 | values=load_yaml_replace_var_local(source_dir+'/app_resources/autoscaler-values.yaml',_var_mapping) 59 | ) 60 | # # Add container insight (CloudWatch Log) to EKS 61 | # KubernetesManifest(self,'ContainerInsight', 62 | # cluster=eks_cluster, 63 | # manifest=load_yaml_replace_var_remotely('https://raw.githubusercontent.com/aws-samples/amazon-cloudwatch-container-insights/latest/k8s-deployment-manifest-templates/deployment-mode/daemonset/container-insights-monitoring/quickstart/cwagent-fluentd-quickstart.yaml', 64 | # fields=_var_mapping, 65 | # multi_resource=True 66 | # ) 67 | # ) 68 | # Add Spark Operator to EKS 69 | eks_cluster.add_helm_chart('SparkOperatorChart', 70 | chart='spark-operator', 71 | repository='https://kubeflow.github.io/spark-operator', 72 | release='spark-operator', 73 | version='1.1.27', 74 | create_namespace=True, 75 | values=load_yaml_replace_var_local(source_dir+'/app_resources/spark-operator-values.yaml',fields={'':''}) 76 | ) -------------------------------------------------------------------------------- /source/lib/cdk_infra/eks_cluster.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (aws_eks as eks,aws_ec2 as ec2, RemovalPolicy) 2 | from aws_cdk.aws_iam import IRole 3 | from constructs import Construct 4 | from aws_cdk.lambda_layer_kubectl_v27 import KubectlV27Layer 5 | 6 | class EksConst(Construct): 7 | 8 | @property 9 | def my_cluster(self): 10 | return self._my_cluster 11 | 12 | def __init__(self, scope: Construct, id:str, eksname: str, eksvpc: ec2.IVpc, noderole: IRole, eks_adminrole: IRole, **kwargs) -> None: 13 | super().__init__(scope, id, **kwargs) 14 | 15 | # 1.Create EKS cluster without node group 16 | self._my_cluster = eks.Cluster(self,'EKS', 17 | vpc= eksvpc, 18 | cluster_name=eksname, 19 | masters_role=eks_adminrole, 20 | output_cluster_name=True, 21 | version= eks.KubernetesVersion.V1_27, 22 | endpoint_access= eks.EndpointAccess.PUBLIC_AND_PRIVATE, 23 | default_capacity=0, 24 | kubectl_layer=KubectlV27Layer(self, 'kubectlV27Layer') 25 | ) 26 | 27 | # 2.Add Managed NodeGroup to EKS, compute resource to run Spark jobs 28 | 
self._my_cluster.add_nodegroup_capacity('onDemand-mn', 29 | nodegroup_name = 'etl-ondemand', 30 | node_role = noderole, 31 | desired_size = 1, 32 | max_size = 5, 33 | disk_size = 50, 34 | instance_types = [ec2.InstanceType('m7g.xlarge')], 35 | labels = {'lifecycle':'OnDemand'}, 36 | # create one nodegroup per AZ, as cluster autoscaler has no control over what AZ ASG will launch instance in. 37 | # if using Karpenter, this is not needed. 38 | subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,one_per_az=True), 39 | tags = {'Name':'OnDemand-'+eksname,'k8s.io/cluster-autoscaler/enabled': 'true', 'k8s.io/cluster-autoscaler/'+eksname: 'owned'} 40 | ) 41 | 42 | # 3. Add Spot managed NodeGroup to EKS (Run Spark exectutor on spot) 43 | self._my_cluster.add_nodegroup_capacity('spot-mn', 44 | nodegroup_name = 'etl-spot', 45 | node_role = noderole, 46 | capacity_type=eks.CapacityType.SPOT, 47 | desired_size = 1, 48 | max_size = 30, 49 | disk_size = 50, 50 | instance_types=[ec2.InstanceType('r5.xlarge'),ec2.InstanceType('r4.xlarge'),ec2.InstanceType('r5a.xlarge')], 51 | subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,one_per_az=True), 52 | labels = {'lifecycle':'Ec2Spot'}, 53 | tags = {'Name':'Spot-'+eksname, 'k8s.io/cluster-autoscaler/enabled': 'true', 'k8s.io/cluster-autoscaler/'+eksname: 'owned'} 54 | ) 55 | self._my_cluster.add_nodegroup_capacity('spot-arm64', 56 | nodegroup_name = 'single-az-graviton', 57 | node_role = noderole, 58 | capacity_type=eks.CapacityType.SPOT, 59 | desired_size = 1, 60 | max_size = 30, 61 | disk_size = 50, 62 | instance_types = [ec2.InstanceType('r7g.xlarge'),ec2.InstanceType('r6g.xlarge'),ec2.InstanceType('r6gd.xlarge')], 63 | # create one nodegroup per AZ, as cluster autoscaler has no control over what AZ ASG will launch instance in. 64 | # if using Karpenter, this is not needed. 65 | subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,availability_zones=sorted(eksvpc.availability_zones)[1:2]), 66 | labels = {'nodegroup':'single-az-graviton', 'lifecycle':'Ec2Spot'}, 67 | tags = {'Name':'single-az-graviton','k8s.io/cluster-autoscaler/enabled': 'true', 'k8s.io/cluster-autoscaler/'+eksname: 'owned'} 68 | ) 69 | 70 | # # 4. 
Add Fargate NodeGroup to EKS, without setup cluster-autoscaler 71 | # self._my_cluster.add_fargate_profile('FargateEnabled', 72 | # selectors =[{ 73 | # "namespace": "spark" 74 | # }], 75 | # fargate_profile_name='sparkETL' 76 | # ) -------------------------------------------------------------------------------- /source/lib/cdk_infra/eks_service_account.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import aws_iam as iam 2 | from constructs import Construct 3 | from aws_cdk.aws_secretsmanager import ISecret 4 | from aws_cdk.aws_eks import ICluster 5 | from lib.util.manifest_reader import * 6 | # import lib.util.override_rule as scan 7 | import os 8 | 9 | class EksSAConst(Construct): 10 | 11 | def __init__(self, scope: Construct, id:str, eks_cluster: ICluster, secret: ISecret, **kwargs,) -> None: 12 | super().__init__(scope, id, **kwargs) 13 | 14 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 15 | 16 | # //************************************v*************************************************************// 17 | # //***************************** SERVICE ACCOUNT, RBAC and IAM ROLES *******************************// 18 | # //****** Associating IAM role to K8s Service Account to provide fine-grain security control ******// 19 | # //***********************************************************************************************// 20 | # Cluster Auto-scaler 21 | self._scaler_sa = eks_cluster.add_service_account('AutoScalerSa', 22 | name='cluster-autoscaler', 23 | namespace='kube-system' 24 | ) 25 | _scaler_role = load_yaml_local(source_dir+'/app_resources/autoscaler-iam-role.yaml') 26 | for statmt in _scaler_role: 27 | self._scaler_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) 28 | 29 | # ALB Ingress 30 | self._alb_sa = eks_cluster.add_service_account('ALBServiceAcct', 31 | name='alb-aws-load-balancer-controller', 32 | namespace='kube-system' 33 | ) 34 | _alb_role = load_yaml_local(source_dir+'/app_resources/alb-iam-role.yaml') 35 | for statmt in _alb_role: 36 | self._alb_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) 37 | 38 | # External secret controller 39 | self._secrets_sa = eks_cluster.add_service_account('ExSecretController', 40 | name='external-secrets-controller', 41 | namespace="kube-system" 42 | ) 43 | self._secrets_sa.node.add_dependency(secret) 44 | _secrets_role = load_yaml_replace_var_local(source_dir+'/app_resources/ex-secret-iam-role.yaml', 45 | fields={"{{secretsmanager}}": secret.secret_arn+"*"} 46 | ) 47 | for statmt in _secrets_role: 48 | self._secrets_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) 49 | 50 | # //************************************v*************************************************************// 51 | # //*********************** Override cfn Nag scan rules for deployment *******************************// 52 | # //***********************************************************************************************// 53 | 54 | # Override Cfn Nag warning W12: IAM policy should not allow * resource 55 | # scan.suppress_cfnnag_rule('W12', 'by default the role scaler_sa has * resource', self._scaler_sa.role.node.find_child('DefaultPolicy').node.default_child) 56 | # scan.suppress_cfnnag_rule('W12', 'by default the role secrets_sa has * resource', self._secrets_sa.role.node.find_child('DefaultPolicy').node.default_child) 57 | # scan.suppress_iam_cfnnag_rule(self._alb_sa.role.node.find_child('DefaultPolicy').node.default_child) 
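The service-account wiring in eks_service_account.py repeats one pattern: parse a YAML file of IAM statements, convert each entry with iam.PolicyStatement.from_json, and attach it to the EKS service account so the pod receives the permissions through IAM Roles for Service Accounts (IRSA). A minimal sketch of that pattern is shown below; the statement is a hypothetical placeholder, not the contents of autoscaler-iam-role.yaml.

from aws_cdk import aws_iam as iam
from aws_cdk.aws_eks import ServiceAccount

# Hypothetical statement for illustration only; the real policies live in
# source/app_resources/*-iam-role.yaml and may differ.
_EXAMPLE_STATEMENT = {
    "Effect": "Allow",
    "Action": ["autoscaling:DescribeAutoScalingGroups"],
    "Resource": "*",
}

def attach_parsed_statement(sa: ServiceAccount) -> None:
    """Attach one parsed IAM statement to an EKS service account (IRSA)."""
    sa.add_to_principal_policy(iam.PolicyStatement.from_json(_EXAMPLE_STATEMENT))

CDK annotates the generated Kubernetes service account with the IAM role ARN, so pods that reference it assume the role through the cluster's OIDC provider rather than through node instance credentials.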
-------------------------------------------------------------------------------- /source/lib/cdk_infra/iam_roles.py: -------------------------------------------------------------------------------- 1 | import typing 2 | from aws_cdk import (Tags, aws_iam as iam) 3 | from constructs import Construct 4 | # import lib.util.override_rule as scan 5 | 6 | class IamConst(Construct): 7 | 8 | @property 9 | def managed_node_role(self): 10 | return self._managed_node_role 11 | 12 | @property 13 | def admin_role(self): 14 | return self._clusterAdminRole 15 | 16 | def __init__(self,scope: Construct, id:str, cluster_name:str, **kwargs,) -> None: 17 | super().__init__(scope, id, **kwargs) 18 | 19 | # EKS admin role 20 | self._clusterAdminRole = iam.Role(self, 'clusterAdmin', 21 | assumed_by= iam.AccountRootPrincipal() 22 | ) 23 | self._clusterAdminRole.add_to_policy(iam.PolicyStatement( 24 | resources=["*"], 25 | actions=[ 26 | "eks:Describe*", 27 | "eks:List*", 28 | "eks:AccessKubernetesApi", 29 | "ssm:GetParameter", 30 | "iam:ListRoles", 31 | "emr-containers:CreateVirtualCluster" 32 | ], 33 | )) 34 | Tags.of(self._clusterAdminRole).add( 35 | key='eks/%s/type' % cluster_name, 36 | value='admin-role' 37 | ) 38 | 39 | # Managed Node Group Instance Role 40 | _managed_node_managed_policies = ( 41 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEKSWorkerNodePolicy'), 42 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEKS_CNI_Policy'), 43 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEC2ContainerRegistryReadOnly'), 44 | iam.ManagedPolicy.from_aws_managed_policy_name('CloudWatchAgentServerPolicy'), 45 | ) 46 | self._managed_node_role = iam.Role(self,'NodeInstance-Role', 47 | path='/', 48 | assumed_by=iam.ServicePrincipal('ec2.amazonaws.com'), 49 | managed_policies=list(_managed_node_managed_policies), 50 | ) 51 | 52 | 53 | # Override Cfn Nag rule 54 | # scan.suppress_cfnnag_rule('W12', 'by default the role has * resource', self._clusterAdminRole.node.find_child('DefaultPolicy').node.default_child) 55 | -------------------------------------------------------------------------------- /source/lib/cdk_infra/network_sg.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (Tags, aws_ec2 as ec2, aws_s3 as s3) 2 | from constructs import Construct 3 | # import lib.util.override_rule as scan 4 | import lib.util.get_aws_managed_prefix as custom 5 | 6 | class NetworkSgConst(Construct): 7 | 8 | @property 9 | def vpc(self): 10 | return self._vpc 11 | @property 12 | def alb_jhub_sg(self): 13 | return self._alb_jhub_sg 14 | @property 15 | def alb_argo_sg(self): 16 | return self._alb_argo_sg 17 | 18 | def __init__(self,scope: Construct, id:str, eksname:str, codebucket: str, **kwargs) -> None: 19 | super().__init__(scope, id, **kwargs) 20 | 21 | # //*************************************************// 22 | # //******************* NETWORK ********************// 23 | # //************************************************// 24 | # create VPC 25 | self._vpc = ec2.Vpc(self, 'eksVpc',max_azs=2, nat_gateways=1) 26 | Tags.of(self._vpc).add('Name', eksname + 'EksVpc') 27 | 28 | # self._log_bucket=s3.Bucket.from_bucket_name(self,'vpc_logbucket', codebucket) 29 | # self._vpc.add_flow_log("FlowLogCloudWatch", 30 | # destination=ec2.FlowLogDestination.to_s3(self._log_bucket,'vpcRejectlog/'), 31 | # traffic_type=ec2.FlowLogTrafficType.REJECT 32 | # ) 33 | 34 | # ALB security group for Jupyter & Argo 35 | prefixlist_peer=ec2.Peer.prefix_list( 36 | 
custom.AwsManagedPrefixList(self,'cr-getprefixId', 37 | custom.AwsManagedPrefixListProps(name='com.amazonaws.global.cloudfront.origin-facing') 38 | ).prefixlist_id 39 | ) 40 | self._alb_jhub_sg=ec2.SecurityGroup(self,'JupyterALBInboundSG', vpc=self._vpc,description='Security Group for Jupyter ALB') 41 | self._alb_argo_sg=ec2.SecurityGroup(self,'ArgoALBInboundSG', vpc=self._vpc,description='Security Group for Argo ALB') 42 | self._alb_jhub_sg.add_ingress_rule(prefixlist_peer,ec2.Port.tcp(port=80)) 43 | self._alb_argo_sg.add_ingress_rule(prefixlist_peer,ec2.Port.tcp(port=2746)) 44 | Tags.of(self._alb_jhub_sg).add('Name','SparkOnEKS-JhubSg') 45 | Tags.of(self._alb_argo_sg).add('Name','SparkOnEKS-ArgoSg') 46 | 47 | # VPC endpoint security group 48 | self._vpc_endpoint_sg = ec2.SecurityGroup(self,'EndpointSg',vpc=self._vpc,description='Security Group for Endpoint') 49 | self._vpc_endpoint_sg.add_ingress_rule(ec2.Peer.ipv4(self._vpc.vpc_cidr_block),ec2.Port.tcp(port=443)) 50 | self._vpc_endpoint_sg.add_ingress_rule(ec2.Peer.ipv4(self._vpc.vpc_cidr_block),ec2.Port.tcp(port=444)) 51 | Tags.of(self._vpc_endpoint_sg).add('Name','SparkOnEKS-VPCEndpointSg') 52 | 53 | # Add VPC endpoint 54 | self._vpc.add_gateway_endpoint("S3GatewayEndpoint", 55 | service=ec2.GatewayVpcEndpointAwsService.S3, 56 | subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC), 57 | ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)]) 58 | 59 | self._vpc.add_interface_endpoint("EcrDockerEndpoint",service=ec2.InterfaceVpcEndpointAwsService.ECR_DOCKER, security_groups=[self._vpc_endpoint_sg]) 60 | self._vpc.add_interface_endpoint("CWLogsEndpoint", service=ec2.InterfaceVpcEndpointAwsService.CLOUDWATCH_LOGS,security_groups=[self._vpc_endpoint_sg]) 61 | self._vpc.add_interface_endpoint("AthenaEndpoint", service=ec2.InterfaceVpcEndpointAwsService.ATHENA,security_groups=[self._vpc_endpoint_sg]) 62 | self._vpc.add_interface_endpoint("KMSEndpoint", service=ec2.InterfaceVpcEndpointAwsService.KMS,security_groups=[self._vpc_endpoint_sg]) 63 | 64 | 65 | # Override Cfn_Nag rule for AWS Solution CICD validation 66 | # for subnet in self._vpc.public_subnets: 67 | # scan.suppress_cfnnag_rule('W33','a public facing ALB is required and ingress from the internet should be permitted.',subnet.node.default_child) 68 | 69 | # self._vpc_endpoint_sg.node.default_child.add_metadata('cfn_nag',{ 70 | # "rules_to_suppress": [ 71 | # { 72 | # "id": "W40", 73 | # "reason": "Egress IP Protocol of -1 is default and generally considered OK" 74 | # }, 75 | # { 76 | # "id": "W5", 77 | # "reason": "Security Groups with cidr open considered OK" 78 | # } 79 | # ] 80 | # }) 81 | -------------------------------------------------------------------------------- /source/lib/cdk_infra/s3_app_code.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (RemovalPolicy, aws_s3 as s3, aws_s3_deployment as s3deploy, aws_kms as kms) 2 | from constructs import Construct 3 | import os 4 | 5 | class S3AppCodeConst(Construct): 6 | 7 | @property 8 | def code_bucket(self): 9 | return self.bucket_name 10 | 11 | @property 12 | def artifact_bucket(self): 13 | return self._artifact_bucket 14 | 15 | # @property 16 | # def s3_deploy_contrust(self): 17 | # return self.deploy 18 | 19 | def __init__(self,scope: Construct, id: str, **kwargs,) -> None: 20 | super().__init__(scope, id, **kwargs) 21 | 22 | # Upload application code to S3 bucket 23 | self._artifact_bucket=s3.Bucket(self, id, 24 | 
block_public_access=s3.BlockPublicAccess.BLOCK_ALL, 25 | encryption=s3.BucketEncryption.KMS_MANAGED, 26 | removal_policy=RemovalPolicy.RETAIN, 27 | access_control = s3.BucketAccessControl.LOG_DELIVERY_WRITE, 28 | object_ownership=s3.ObjectOwnership.OBJECT_WRITER, 29 | versioned=True #required by codebuild 30 | ) 31 | 32 | proj_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0] 33 | self.deploy=s3deploy.BucketDeployment(self, "DeployCode", 34 | sources=[s3deploy.Source.asset(proj_dir+'/deployment/app_code')], 35 | destination_bucket= self.artifact_bucket, 36 | destination_key_prefix="app_code" 37 | ) 38 | self.bucket_name = self.artifact_bucket.bucket_name 39 | 40 | # # Override Cfn_Nag rule for S3 access logging 41 | # self.artifact_bucket.node.default_child.add_metadata('cfn_nag',{ 42 | # "rules_to_suppress": [ 43 | # { 44 | # "id": "W35", 45 | # "reason": "bucket access log stops bucket removal, disable for now" 46 | # }, 47 | # { 48 | # "id": "W51", 49 | # "reason": "bucket access is controled by IAM level" 50 | # } 51 | # ] 52 | # }) 53 | -------------------------------------------------------------------------------- /source/lib/cdk_infra/spark_permission.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (aws_iam as iam) 2 | from constructs import Construct 3 | from aws_cdk.aws_eks import ICluster, KubernetesManifest 4 | from lib.util.manifest_reader import load_yaml_replace_var_local 5 | # import lib.util.override_rule as scan 6 | import os 7 | 8 | class SparkOnEksSAConst(Construct): 9 | 10 | @property 11 | def jupyter_sa(self): 12 | return self._jupyter_sa.service_account_name 13 | 14 | def __init__(self,scope: Construct, id: str, 15 | eks_cluster: ICluster, 16 | login_name: str, 17 | code_bucket: str, 18 | datalake_bucket: str, 19 | **kwargs) -> None: 20 | super().__init__(scope, id, **kwargs) 21 | 22 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 23 | # //******************************************************************************************// 24 | # //************************ SETUP PERMISSION FOR ARC SPARK JOBS ****************************// 25 | # //******* create k8s namespace, service account, and IAM role for service account ********// 26 | # //***************************************************************************************// 27 | 28 | # create k8s namespace 29 | etl_ns = eks_cluster.add_manifest('SparkNamespace',{ 30 | "apiVersion": "v1", 31 | "kind": "Namespace", 32 | "metadata": { 33 | "name": "spark", 34 | "labels": {"name":"spark"} 35 | } 36 | } 37 | ) 38 | jupyter_ns = eks_cluster.add_manifest('jhubNamespace',{ 39 | "apiVersion": "v1", 40 | "kind": "Namespace", 41 | "metadata": { 42 | "name": "jupyter", 43 | "labels": {"name":"spark"} 44 | } 45 | } 46 | ) 47 | 48 | # create k8s service account 49 | self._etl_sa = eks_cluster.add_service_account('ETLSa', 50 | name='arcjob', 51 | namespace='spark' 52 | ) 53 | self._etl_sa.node.add_dependency(etl_ns) 54 | 55 | _etl_rb = KubernetesManifest(self,'ETLRoleBinding', 56 | cluster=eks_cluster, 57 | manifest=load_yaml_replace_var_local(source_dir+'/app_resources/etl-rbac.yaml', 58 | fields= { 59 | "{{MY_SA}}": self._etl_sa.service_account_name 60 | }, 61 | multi_resource=True) 62 | ) 63 | _etl_rb.node.add_dependency(self._etl_sa) 64 | 65 | self._jupyter_sa = eks_cluster.add_service_account('jhubServiceAcct', 66 | # name=login_name, 67 | name='sparkoneks', 68 | namespace='jupyter' 69 | ) 70 | self._jupyter_sa.node.add_dependency(jupyter_ns) 71 | 
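        # The statements loaded below come from a single policy template
        # (app_resources/etl-iam-role.yaml) and are attached to both the Arc ETL
        # and the Jupyter service accounts, so notebooks and scheduled jobs get
        # the same S3 access. When no external data lake bucket is supplied, the
        # solution's code bucket is reused, meaning {{codeBucket}} and
        # {{datalakeBucket}} can resolve to the same bucket.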
72 | # Associate AWS IAM role to K8s Service Account 73 | datalake_bucket=code_bucket if not datalake_bucket.strip() else datalake_bucket 74 | _bucket_setting={ 75 | "{{codeBucket}}": code_bucket, 76 | "{{datalakeBucket}}": datalake_bucket 77 | } 78 | _etl_iam = load_yaml_replace_var_local(source_dir+'/app_resources/etl-iam-role.yaml',fields=_bucket_setting) 79 | for statmnt in _etl_iam: 80 | self._etl_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 81 | self._jupyter_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 82 | 83 | # # //*************************************************************************************// 84 | # # //******************** SETUP PERMISSION FOR NATIVE SPARK JOBS **********************// 85 | # # //***********************************************************************************// 86 | self._spark_sa = eks_cluster.add_service_account('NativeSparkSa', 87 | name='nativejob', 88 | namespace='spark' 89 | ) 90 | self._spark_sa.node.add_dependency(etl_ns) 91 | 92 | _spark_rb = eks_cluster.add_manifest('sparkRoleBinding', 93 | load_yaml_replace_var_local(source_dir+'/app_resources/native-spark-rbac.yaml', 94 | fields= { 95 | "{{MY_SA}}": self._spark_sa.service_account_name 96 | }) 97 | ) 98 | _spark_rb.node.add_dependency(self._spark_sa) 99 | 100 | _native_spark_iam = load_yaml_replace_var_local(source_dir+'/app_resources/native-spark-iam-role.yaml',fields=_bucket_setting) 101 | for statmnt in _native_spark_iam: 102 | self._spark_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 103 | 104 | 105 | # Override Cfn Nag warning W12: IAM policy should not allow * resource 106 | # scan.suppress_cfnnag_rule('W12', 'by default the etl_sa role has * resource', self._etl_sa.role.node.find_child('DefaultPolicy').node.default_child) 107 | # scan.suppress_cfnnag_rule('W12', 'by default the role spark_sa has * resource', self._spark_sa.role.node.find_child('DefaultPolicy').node.default_child) 108 | # scan.suppress_cfnnag_rule('W12', 'by default the role jupyter_sa has * resource', self._jupyter_sa.role.node.find_child('DefaultPolicy').node.default_child) -------------------------------------------------------------------------------- /source/lib/cloud_front_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import ( 2 | NestedStack,Fn, 3 | aws_cloudfront_origins as origins, 4 | aws_cloudfront as cf, 5 | aws_elasticloadbalancingv2 as alb, 6 | aws_s3 as s3 7 | ) 8 | from constructs import Construct 9 | # import lib.util.override_rule as scan 10 | 11 | class NestedStack(NestedStack): 12 | 13 | @property 14 | def jhub_cf(self): 15 | return self._jhub_cf 16 | 17 | @property 18 | def argo_cf(self): 19 | return self._argo_cf 20 | 21 | def __init__(self,scope: Construct, id: str,logbucket: str,argo_alb_dns_name: str, jhub_alb_dns_name: str, **kwargs) -> None: 22 | super().__init__(scope, id, **kwargs) 23 | 24 | # //**********************************************************************************************************// 25 | # //*************************** Add CloudFront to enable HTTPS Endpoint (OPTIONAL) **************************// 26 | # //***** recommended way is to generate your own SSL certificate via AWS Certificate Manager ***************// 27 | # //****************************** add it to the application load balancer *********************************// 28 | # //*******************************************************************************************************// 29 | 
self._bucket=s3.Bucket.from_bucket_name(self,'cf_logbucket', logbucket) 30 | self._jhub_cf = add_distribution(self, 'jhub_dist', jhub_alb_dns_name, 80, self._bucket) 31 | self._argo_cf = add_distribution(self, 'argo_dist', argo_alb_dns_name, 2746, self._bucket) 32 | 33 | def add_distribution(scope: Construct, id: str, alb_dns_name: str, port: int, logbucket: s3.IBucket 34 | ) -> cf.IDistribution: 35 | 36 | load_balancer_arn=Fn.get_att(alb_dns_name,"DNSName") 37 | security_group_id=Fn.get_att(alb_dns_name,"SecurityGroups") 38 | 39 | alb2 = alb.ApplicationLoadBalancer.from_application_load_balancer_attributes(scope, id, 40 | load_balancer_arn=load_balancer_arn.to_string(), 41 | security_group_id=security_group_id.to_string(), 42 | load_balancer_dns_name=alb_dns_name 43 | ) 44 | _origin = origins.LoadBalancerV2Origin(alb2, 45 | http_port=port, 46 | protocol_policy=cf.OriginProtocolPolicy.HTTP_ONLY 47 | ) 48 | dist = cf.Distribution(scope, "CF-"+id, 49 | default_behavior={ 50 | "origin": _origin, 51 | "allowed_methods": cf.AllowedMethods.ALLOW_ALL, 52 | "cache_policy": cf.CachePolicy.CACHING_DISABLED, 53 | "origin_request_policy": cf.OriginRequestPolicy.ALL_VIEWER, 54 | "viewer_protocol_policy": cf.ViewerProtocolPolicy.REDIRECT_TO_HTTPS 55 | }, 56 | minimum_protocol_version=cf.SecurityPolicyProtocol.TLS_V1_2_2019, 57 | enable_logging=True, 58 | log_bucket=logbucket 59 | ) 60 | # Override Cfn_Nag rule for Cloudfront TLS-1.2 (https://github.com/stelligent/cfn_nag/issues/384) 61 | # scan.suppress_cfnnag_rule('W70','the distribution uses CloudFront domain name and automatically sets the policy to TLSv1',dist.node.default_child) 62 | 63 | return dist.distribution_domain_name 64 | 65 | -------------------------------------------------------------------------------- /source/lib/ecr_build/Dockerfile: -------------------------------------------------------------------------------- 1 | #FROM ghcr.io/tripl-ai/arc::latest 2 | FROM ghcr.io/tripl-ai/arc:arc_4.2.0_spark_3.3.4_scala_2.12_hadoop_3.3.2_4.2.1_slim 3 | ENV SPARK_HOME /opt/spark 4 | RUN mkdir -p $SPARK_HOME/work-dir 5 | WORKDIR $SPARK_HOME/work-dir -------------------------------------------------------------------------------- /source/lib/ecr_build/buildspec.yaml: -------------------------------------------------------------------------------- 1 | version: 0.2 2 | phases: 3 | install: 4 | commands: 5 | - export BUILDX_VERSION=$(curl --silent "https://api.github.com/repos/docker/buildx/releases/latest" |jq -r .tag_name) 6 | - curl -JLO "https://github.com/docker/buildx/releases/download/$BUILDX_VERSION/buildx-$BUILDX_VERSION.linux-amd64" 7 | - mkdir -p ~/.docker/cli-plugins 8 | - mv "buildx-$BUILDX_VERSION.linux-amd64" ~/.docker/cli-plugins/docker-buildx 9 | - chmod +x ~/.docker/cli-plugins/docker-buildx 10 | # - docker run --privileged --rm tonistiigi/binfmt --install arm64 11 | # To install all the supported platforms: 12 | - docker run --privileged --rm tonistiigi/binfmt --install all 13 | pre_build: 14 | commands: 15 | - echo Logging in to Amazon ECR... 16 | - aws --version 17 | - $(aws ecr get-login --region $AWS_DEFAULT_REGION --no-include-email) 18 | - REPOSITORY_URI=${REPO_ECR} 19 | - COMMIT_HASH=$(echo $CODEBUILD_RESOLVED_SOURCE_VERSION | cut -c 1-7) 20 | - IMAGE_TAG=${COMMIT_HASH:=latest} 21 | build: 22 | commands: 23 | - echo Build started on `date` 24 | - echo Building the Docker image... 
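      # The buildx commands below create a dedicated builder and publish one
      # multi-architecture image (linux/amd64 and linux/arm64), so the same ECR
      # tag can run on both the x86 and Graviton node groups. The binfmt step in
      # the install phase registers the QEMU emulators that make the
      # cross-platform build possible.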
25 | - docker buildx create --use --name multiarch 26 | - docker buildx build --push --platform=linux/amd64,linux/arm64 -t $REPOSITORY_URI:$IMAGE_TAG -t $REPOSITORY_URI:latest . 27 | # - docker build -t $REPOSITORY_URI:latest . 28 | # - docker tag $REPOSITORY_URI:latest $REPOSITORY_URI:$IMAGE_TAG 29 | # post_build: 30 | # commands: 31 | # - echo Build completed on `date` 32 | # - echo Pushing the Docker images... 33 | # - docker push $REPOSITORY_URI:latest 34 | # - docker push $REPOSITORY_URI:$IMAGE_TAG -------------------------------------------------------------------------------- /source/lib/ecr_build/ecr_build_pipeline.py: -------------------------------------------------------------------------------- 1 | ###################################################################################################################### 2 | # Copyright 2020-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved. # 3 | # # 4 | # Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance # 5 | # with the License. A copy of the License is located at # 6 | # # 7 | # http://www.apache.org/licenses/LICENSE-2.0 # 8 | # # 9 | # or in the 'license' file accompanying this file. This file is distributed on an 'AS IS' BASIS, WITHOUT WARRANTIES # 10 | # OR CONDITIONS OF ANY KIND, express o#implied. See the License for the specific language governing permissions # 11 | # and limitations under the License. # # 12 | ###################################################################################################################### 13 | # 14 | from aws_cdk import ( 15 | RemovalPolicy, 16 | Duration, 17 | aws_s3 as s3, 18 | aws_codepipeline as codepipeline, 19 | aws_codebuild as codebuild, 20 | aws_codepipeline_actions as codepipeline_actions, 21 | aws_ecr as ecr, 22 | ) 23 | from constructs import Construct 24 | # import lib.util.override_rule as scan 25 | 26 | class DockerPipelineConstruct(Construct): 27 | 28 | @property 29 | def image_uri(self): 30 | return self.ecr_repo.repository_uri 31 | 32 | def __init__(self,scope: Construct, id: str, codebucket: s3.IBucket, **kwargs,) -> None: 33 | super().__init__(scope, id, **kwargs) 34 | 35 | # 1. Create ECR repositories 36 | self.ecr_repo=ecr.Repository(self,'ECRRepo', 37 | image_scan_on_push=True, 38 | removal_policy=RemovalPolicy.DESTROY 39 | ) 40 | # 2. Setup deployment CI/CD to deploy docker image to ECR 41 | pipeline = codepipeline.Pipeline(self, "Pipeline", 42 | pipeline_name='BuildArcDockerImage', 43 | artifact_bucket=codebucket 44 | ) 45 | image_builder = codebuild.PipelineProject(self,'DockerBuild', 46 | project_name='BuildArcDockerImage', 47 | build_spec=codebuild.BuildSpec.from_source_filename('buildspec.yaml'), 48 | environment=dict( 49 | build_image=codebuild.LinuxBuildImage.AMAZON_LINUX_2_3, 50 | privileged=True 51 | ), 52 | environment_variables={ 53 | 'REPO_ECR': codebuild.BuildEnvironmentVariable(value=self.ecr_repo.repository_uri), 54 | }, 55 | description='Pipeline for docker build', 56 | timeout=Duration.minutes(60) 57 | ) 58 | image_builder.apply_removal_policy(RemovalPolicy.DESTROY) 59 | 60 | # 3. 
grant permissions for the CI/CD 61 | codebucket.grant_read_write(pipeline.role) 62 | codebucket.grant_read_write(image_builder) 63 | self.ecr_repo.grant_pull_push(image_builder) 64 | 65 | source_output=codepipeline.Artifact('src') 66 | pipeline.add_stage( 67 | stage_name='Source', 68 | actions=[ 69 | codepipeline_actions.S3SourceAction( 70 | action_name='S3Trigger', 71 | bucket=codebucket, 72 | bucket_key='app_code/ecr_build_src.zip', 73 | output=source_output, 74 | trigger=codepipeline_actions.S3Trigger.POLL), 75 | ] 76 | ) 77 | pipeline.add_stage( 78 | stage_name='Build', 79 | actions=[ 80 | codepipeline_actions.CodeBuildAction( 81 | action_name='DockerImageBuild', 82 | input=source_output, 83 | project=image_builder 84 | ) 85 | ] 86 | ) 87 | 88 | # Override Cfn Nag warning W12: IAM policy should not allow * resource 89 | # scan.suppress_cfnnag_rule('W12', 'the role for action of ecr:GetAuthorizationToken requires * resource', image_builder.role.node.find_child('DefaultPolicy').node.default_child) 90 | 91 | # image_builder.role.node.find_child('DefaultPolicy').node.default_child.add_metadata('cfn_nag',{ 92 | # "rules_to_suppress": [ 93 | # { 94 | # "id": "W12", 95 | # "reason": "the role for action of ecr:GetAuthorizationToken requires * resource" 96 | # }, 97 | # { 98 | # "id": "W76", 99 | # "reason": "the IAM policy is complex, need to be higher than 25" 100 | # } 101 | # ] 102 | # }) -------------------------------------------------------------------------------- /source/lib/solution_helper/lambda_function.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import uuid 3 | import requests 4 | from copy import copy 5 | from crhelper import CfnResource 6 | from datetime import datetime 7 | 8 | logger = logging.getLogger(__name__) 9 | helper = CfnResource(json_logging=True, log_level='INFO') 10 | METRICS_ENDPOINT = "https://metrics.awssolutionsbuilder.com/generic" 11 | 12 | def _sanitize_data(resource_properties): 13 | resource_properties.pop("ServiceToken", None) 14 | resource_properties.pop("Resource", None) 15 | 16 | # Solution ID and unique ID are sent separately 17 | resource_properties["Data"].pop("Solution", None) 18 | resource_properties["Data"].pop("UUID", None) 19 | 20 | return resource_properties 21 | 22 | @helper.create 23 | @helper.update 24 | @helper.delete 25 | def custom_resource(event, _): 26 | # print("Received event: " + json.dumps(event, indent=2)) 27 | request_type = event["RequestType"] 28 | resource_properties = event["ResourceProperties"] 29 | resource = resource_properties["Resource"] 30 | 31 | # One UUID per CFN deployment 32 | if resource == "UUID" and request_type == "Create": 33 | random_id = str(uuid.uuid4()) 34 | helper.Data.update({"UUID":random_id}) 35 | elif resource == "AnonymousMetric": 36 | try: 37 | metrics_data = _sanitize_data(copy(resource_properties)) 38 | metrics_data["CFTemplate"]= request_type + "d" 39 | headers = {"Content-Type": "application/json"} 40 | payload = { 41 | "Solution": resource_properties["Solution"], 42 | "UUID": resource_properties["UUID"], 43 | "TimeStamp": datetime.utcnow().isoformat(), 44 | **metrics_data 45 | } 46 | logger.info(f'Sending payload: {payload}') 47 | response = requests.post(METRICS_ENDPOINT, json=payload, headers=headers) 48 | logger.info(f'Response from metrics endpoint: {response.status_code} {response.reason}') 49 | except requests.exceptions.RequestException: 50 | logger.exception("Could not send usage data") 51 | except Exception: 52 | 
logger.exception("Unknown error when trying to send usage data") 53 | 54 | def handler(event, context): 55 | try: 56 | helper(event, context) 57 | return {"Data": helper.Data} 58 | except Exception as error: 59 | logger.exception("[handler] failed: {error}") 60 | 61 | -------------------------------------------------------------------------------- /source/lib/solution_helper/requirements.txt: -------------------------------------------------------------------------------- 1 | #requests==2.31.0 2 | #crhelper==2.0.11 3 | #urllib3==1.26.15 -------------------------------------------------------------------------------- /source/lib/solution_helper/solution_metrics.py: -------------------------------------------------------------------------------- 1 | from constructs import Construct 2 | from aws_cdk.aws_s3 import IBucket 3 | from lib.util.conditional_resources import Condition 4 | from aws_cdk import ( 5 | aws_lambda as _lambda, 6 | custom_resources as _custom_resources, 7 | aws_ec2 as _ec2 8 | ) 9 | from aws_cdk import ( 10 | Aspects, 11 | Fn, 12 | Duration, 13 | CfnMapping, 14 | CfnCondition, 15 | CustomResource, 16 | Duration, 17 | RemovalPolicy 18 | ) 19 | import lib.util.override_rule as scan 20 | 21 | class SendAnonymousData(Construct): 22 | 23 | @property 24 | def UUID(self): 25 | return self._uuid 26 | 27 | def __init__(self,scope: Construct, id: str, vpc: _ec2.IVpc, codebucket: IBucket, s3_deploy, metrics) -> None: 28 | super().__init__(scope, id) 29 | 30 | self._metrics_mapping = CfnMapping(self, 'AnonymousData',mapping={'SendAnonymousData': {'Data': 'Yes'}}) 31 | self._metrics_condition = CfnCondition(self, 'AnonymousDatatoAWS', 32 | expression=Fn.condition_equals(self._metrics_mapping.find_in_map('SendAnonymousData','Data'),'Yes') 33 | ) 34 | 35 | self._helper_func = _lambda.SingletonFunction(self, 'SolutionHelper', 36 | uuid='75248a81-9138-468c-9ba1-bca6c7137599', 37 | runtime= _lambda.Runtime.PYTHON_3_8, 38 | handler= 'lambda_function.handler', 39 | description= 'This function generates UUID for each deployment and sends anonymous data to the AWS Solutions team', 40 | code= _lambda.Code.from_bucket(bucket=codebucket,key='app_code/solution_helper.zip'), 41 | vpc=vpc, 42 | timeout=Duration.seconds(30) 43 | ) 44 | self._helper_func.add_dependency(s3_deploy) 45 | 46 | self._lambda_provider = _custom_resources.Provider( 47 | self, 'LambdaProvider', 48 | on_event_handler=self._helper_func, 49 | vpc=vpc 50 | ) 51 | 52 | self._uuid = CustomResource(self, 'UUIDCustomResource', 53 | service_token=self._lambda_provider.service_token, 54 | properties={ 55 | "Resource": "UUID" 56 | }, 57 | resource_type="Custom::CreateUUID", 58 | removal_policy=RemovalPolicy.DESTROY 59 | ) 60 | 61 | self._send_data = CustomResource(self, 'SendDataCustomResource', 62 | service_token=self._lambda_provider.service_token, 63 | properties={ 64 | "Resource": "AnonymousMetric", 65 | "UUID": self._uuid.get_att_string("UUID"), 66 | "Solution": metrics["Solution"], 67 | "Data": metrics 68 | }, 69 | resource_type= 'Custom::AnonymousData', 70 | removal_policy=RemovalPolicy.DESTROY 71 | ) 72 | self._send_data.node.add_dependency(self._uuid) 73 | 74 | Aspects.of(self._helper_func).add(Condition(self._metrics_condition)) 75 | Aspects.of(self._uuid).add(Condition(self._metrics_condition)) 76 | Aspects.of(self._send_data).add(Condition(self._metrics_condition)) -------------------------------------------------------------------------------- /source/lib/spark_on_eks_stack.py: 
-------------------------------------------------------------------------------- 1 | from aws_cdk import (Stack, CfnOutput, Duration, RemovalPolicy, Aws, Fn, CfnParameter, aws_eks as eks,aws_secretsmanager as secmger,aws_kms as kms) 2 | from constructs import Construct 3 | from lib.cdk_infra.network_sg import NetworkSgConst 4 | from lib.cdk_infra.iam_roles import IamConst 5 | from lib.cdk_infra.eks_cluster import EksConst 6 | from lib.cdk_infra.eks_service_account import EksSAConst 7 | from lib.cdk_infra.eks_base_app import EksBaseAppConst 8 | from lib.cdk_infra.s3_app_code import S3AppCodeConst 9 | from lib.cdk_infra.spark_permission import SparkOnEksSAConst 10 | from lib.ecr_build.ecr_build_pipeline import DockerPipelineConstruct 11 | from lib.cloud_front_stack import NestedStack 12 | from lib.util.manifest_reader import * 13 | # from lib.util import override_rule as scan 14 | # from lib.solution_helper import solution_metrics 15 | import json, os 16 | 17 | class SparkOnEksStack(Stack): 18 | 19 | @property 20 | def code_bucket(self): 21 | return self.app_s3.code_bucket 22 | 23 | @property 24 | def argo_url(self): 25 | return self._argo_alb.value 26 | 27 | @property 28 | def jhub_url(self): 29 | return self._jhub_alb.value 30 | 31 | def __init__(self, scope: Construct, id: str, eksname: str, solution_id: str, version: str, **kwargs) -> None: 32 | super().__init__(scope, id, **kwargs) 33 | 34 | self.template_options.description = "(SO0141) SQL based ETL with Apache Spark on Amazon EKS. This solution provides a SQL based ETL option with a open-source declarative framework powered by Apache Spark." 35 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 36 | 37 | # Cloudformation input params 38 | datalake_bucket = CfnParameter(self, "datalakebucket", type="String", 39 | description="Your existing S3 bucket to be accessed by Jupyter Notebook and ETL job. Default: blank", 40 | default="" 41 | ) 42 | login_name = "sparkoneks" 43 | # login_name = CfnParameter(self, "jhubuser", type="String", 44 | # description="Your username login to jupyter hub. Only alphanumeric characters are allowed", 45 | # default="sparkoneks" 46 | # ) 47 | 48 | # Auto-generate a user login in secrets manager 49 | key = kms.Key(self, 'KMSKey',removal_policy=RemovalPolicy.DESTROY,enable_key_rotation=True) 50 | key.add_alias("alias/secretsManager") 51 | jhub_secret = secmger.Secret(self, 'jHubPwd', 52 | generate_secret_string=secmger.SecretStringGenerator( 53 | exclude_punctuation=True, 54 | secret_string_template=json.dumps({'username': login_name}), 55 | generate_string_key="password"), 56 | removal_policy=RemovalPolicy.DESTROY, 57 | encryption_key=key 58 | ) 59 | 60 | # 1. a new bucket to store app code and logs 61 | self.app_s3 = S3AppCodeConst(self,'appcode') 62 | 63 | # 2. push docker image to ECR via AWS CICD pipeline 64 | ecr_image = DockerPipelineConstruct(self,'image', self.app_s3.artifact_bucket) 65 | ecr_image.node.add_dependency(self.app_s3) 66 | CfnOutput(self,'IMAGE_URI', value=ecr_image.image_uri) 67 | 68 | # 3. EKS base infrastructure 69 | network_sg = NetworkSgConst(self,'network-sg', eksname, self.app_s3.code_bucket) 70 | iam = IamConst(self,'iam_roles', eksname) 71 | eks_cluster = EksConst(self,'eks_cluster', eksname, network_sg.vpc, iam.managed_node_role, iam.admin_role) 72 | EksSAConst(self, 'eks_sa', eks_cluster.my_cluster, jhub_secret) 73 | base_app=EksBaseAppConst(self, 'eks_base_app', eks_cluster.my_cluster) 74 | 75 | # 4. 
Spark app access control 76 | app_security = SparkOnEksSAConst(self,'spark_service_account', 77 | eks_cluster.my_cluster, 78 | login_name, 79 | self.app_s3.code_bucket, 80 | datalake_bucket.value_as_string 81 | ) 82 | app_security.node.add_dependency(base_app.secret_created) 83 | # 5. Install Arc Jupyter notebook in EKS 84 | jhub_install= eks_cluster.my_cluster.add_helm_chart('JHubChart', 85 | chart='jupyterhub', 86 | repository='https://jupyterhub.github.io/helm-chart', 87 | release='jhub', 88 | version='1.2.0', 89 | namespace='jupyter', 90 | create_namespace=False, 91 | values=load_yaml_replace_var_local(source_dir+'/app_resources/jupyter-values.yaml', 92 | fields={ 93 | "{{codeBucket}}": self.app_s3.code_bucket, 94 | "{{region}}": Aws.REGION 95 | }) 96 | ) 97 | jhub_install.node.add_dependency(app_security) 98 | # EKS get Jupyter login dynamically from secrets manager 99 | name_parts=Fn.split('-',jhub_secret.secret_name) 100 | name_no_suffix=Fn.join('-',[Fn.select(0, name_parts), Fn.select(1, name_parts)]) 101 | 102 | config_hub = eks.KubernetesManifest(self,'JHubConfig', 103 | cluster=eks_cluster.my_cluster, 104 | manifest=load_yaml_replace_var_local(source_dir+'/app_resources/jupyter-config.yaml', 105 | fields= { 106 | "{{MY_SA}}": app_security.jupyter_sa, 107 | "{{REGION}}": Aws.REGION, 108 | "{{SECRET_NAME}}": name_no_suffix, 109 | "{{INBOUND_SG}}": network_sg.alb_jhub_sg.security_group_id 110 | }, 111 | multi_resource=True) 112 | ) 113 | config_hub.node.add_dependency(jhub_install) 114 | 115 | # 6. Install ETL orchestrator - Argo in EKS 116 | # can be replaced by other workflow tool, eg. Airflow 117 | argo_install = eks_cluster.my_cluster.add_helm_chart('ARGOChart', 118 | chart='argo-workflows', 119 | repository='https://argoproj.github.io/argo-helm', 120 | release='argo', 121 | version='0.40.10', 122 | namespace='argo', 123 | create_namespace=True, 124 | values=load_yaml_replace_var_local(source_dir+'/app_resources/argo-values.yaml', 125 | fields= { 126 | "{{INBOUND_SG}}": network_sg.alb_argo_sg.security_group_id 127 | }) 128 | ) 129 | argo_install.node.add_dependency(config_hub) 130 | # Create argo workflow template for Spark with T-shirt size 131 | submit_tmpl = eks_cluster.my_cluster.add_manifest('SubmitSparkWrktmpl', 132 | load_yaml_local(source_dir+'/app_resources/spark-template.yaml') 133 | ) 134 | submit_tmpl.node.add_dependency(argo_install) 135 | 136 | # 7. (OPTIONAL) retrieve ALB DNS Name to enable CloudFront in the nested stack. 137 | # It is used to serve HTTPS requests with its default domain name. 138 | # Recommend to issue your own TLS certificate, and delete the CF components. 139 | self._jhub_alb=eks.KubernetesObjectValue(self, 'jhubALB', 140 | cluster=eks_cluster.my_cluster, 141 | json_path='..status.loadBalancer.ingress[0].hostname', 142 | object_type='ingress.networking', 143 | object_name='jupyterhub', 144 | object_namespace='jupyter', 145 | timeout=Duration.minutes(10) 146 | ) 147 | self._jhub_alb.node.add_dependency(config_hub) 148 | 149 | self._argo_alb = eks.KubernetesObjectValue(self, 'argoALB', 150 | cluster=eks_cluster.my_cluster, 151 | json_path='..status.loadBalancer.ingress[0].hostname', 152 | object_type='ingress.networking', 153 | object_name='argo-argo-workflows-server', 154 | object_namespace='argo', 155 | timeout=Duration.minutes(10) 156 | ) 157 | self._argo_alb.node.add_dependency(argo_install) 158 | 159 | # 8. (OPTIONAL) Send solution metrics to AWS 160 | # turn it off from the CloudFormation mapping section if prefer. 
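# A sketch of the opt-out mentioned above (assumed, not part of the original file): solution_metrics.py
# gates the helper Lambda and both custom resources on the 'AnonymousDatatoAWS' condition, which simply
# checks the 'AnonymousData' mapping value. Metrics can therefore be disabled either by editing the
# synthesized template's Mappings section from 'Yes' to 'No', or by changing the mapping default at source, e.g.
# CfnMapping(self, 'AnonymousData', mapping={'SendAnonymousData': {'Data': 'No'}})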
161 | # send_metrics=solution_metrics.SendAnonymousData(self,"SendMetrics", network_sg.vpc, self.app_s3.artifact_bucket,self.app_s3.s3_deploy_contrust, 162 | # metrics={ 163 | # "Solution": solution_id, 164 | # "Region": Aws.REGION, 165 | # "SolutionVersion": version, 166 | # "UUID": "MY_UUID", 167 | # "UseDataLakeBucket": "True" if not datalake_bucket.value_as_string else "False", 168 | # "UseAWSCICD": "True" if ecr_image.image_uri else "False", 169 | # "NoAZs": len(network_sg.vpc.availability_zones) 170 | # } 171 | # ) 172 | # send_metrics.node.add_dependency(self.app_s3.s3_deploy_contrust) 173 | 174 | # 9. (OPTIONAL) Override the cfn Nag rules for AWS Solution CICD deployment 175 | # remove the section if your CI/CD pipeline doesn't use the cfn_nag utility to validate the CFN. 176 | # k8s_ctl_node=self.node.find_child('@aws-cdk--aws-eks.KubectlProvider') 177 | # cluster_resrc_node=self.node.find_child('@aws-cdk--aws-eks.ClusterResourceProvider') 178 | # scan.suppress_cfnnag_rule('W12', 'by default the role has * resource', self.node.find_child('eks_cluster').node.find_child('EKS').node.default_child.node.find_child('CreationRole').node.find_child('DefaultPolicy').node.default_child) 179 | # scan.suppress_cfnnag_rule('W11', 'by default the role has * resource', self.node.find_child('Custom::AWSCDKOpenIdConnectProviderCustomResourceProvider').node.find_child('Role')) 180 | # scan.suppress_lambda_cfnnag_rule(k8s_ctl_node.node.find_child('Handler').node.default_child) 181 | # scan.suppress_lambda_cfnnag_rule(k8s_ctl_node.node.find_child('Provider').node.find_child('framework-onEvent').node.default_child) 182 | # scan.suppress_lambda_cfnnag_rule(self.node.find_child('Custom::CDKBucketDeployment8693BB64968944B69AAFB0CC9EB8756C').node.default_child) 183 | # # scan.suppress_lambda_cfnnag_rule(self.node.find_child('Custom::S3AutoDeleteObjectsCustomResourceProvider').node.find_child('Handler')) 184 | # scan.suppress_lambda_cfnnag_rule(self.node.find_child('Custom::AWSCDKOpenIdConnectProviderCustomResourceProvider').node.find_child('Handler')) 185 | # scan.suppress_lambda_cfnnag_rule(self.node.find_child('AWSCDKCfnUtilsProviderCustomResourceProvider').node.find_child('Handler')) 186 | # scan.suppress_lambda_cfnnag_rule(cluster_resrc_node.node.find_child('OnEventHandler').node.default_child) 187 | # scan.suppress_lambda_cfnnag_rule(cluster_resrc_node.node.find_child('IsCompleteHandler').node.default_child) 188 | # scan.suppress_lambda_cfnnag_rule(cluster_resrc_node.node.find_child('Provider').node.find_child('framework-isComplete').node.default_child) 189 | # scan.suppress_lambda_cfnnag_rule(cluster_resrc_node.node.find_child('Provider').node.find_child('framework-onTimeout').node.default_child) 190 | # scan.suppress_lambda_cfnnag_rule(cluster_resrc_node.node.find_child('Provider').node.find_child('framework-onEvent').node.default_child) 191 | # scan.suppress_network_cfnnag_rule(self.node.find_child('eks_cluster').node.find_child('EKS').node.find_child('ControlPlaneSecurityGroup').node.default_child) 192 | # scan.suppress_lambda_cfnnag_rule(self.node.find_child('SendMetrics').node.find_child('LambdaProvider').node.find_child('framework-onEvent').node.default_child) 193 | # scan.suppress_network_cfnnag_rule(self.node.find_child('SendMetrics').node.find_child('LambdaProvider').node.find_child('framework-onEvent').node.find_child('SecurityGroup').node.default_child) 194 | # scan.suppress_lambda_cfnnag_rule(self.node.find_child('SingletonLambda75248a819138468c9ba1bca6c7137599').node.default_child) 195 
| # scan.suppress_network_cfnnag_rule(self.node.find_child('SingletonLambda75248a819138468c9ba1bca6c7137599').node.find_child('SecurityGroup').node.default_child) 196 | -------------------------------------------------------------------------------- /source/lib/util/conditional_resources.py: -------------------------------------------------------------------------------- 1 | import jsii 2 | from constructs import IConstruct 3 | from aws_cdk import CfnCondition, CfnResource, IAspect 4 | 5 | # This aspect lets `Aspects.of(...).add(...)` apply a CloudFormation condition to a resource, 6 | # so that some resources are provisioned only when the condition is true. 7 | # For example, if the PROVISIONTYPE parameter is 'Git', we provision CodePipeline 8 | # with its source stage being CodeCommit or GitHub. 9 | # https://docs.aws.amazon.com/cdk/latest/guide/aspects.html 10 | 11 | 12 | @jsii.implements(IAspect) 13 | class Condition: 14 | def __init__(self, condition: CfnCondition): 15 | self._condition = condition 16 | 17 | def visit(self, node: IConstruct): 18 | child = node.node.default_child # type: CfnResource 19 | if child: 20 | child.cfn_options.condition = self._condition 21 | -------------------------------------------------------------------------------- /source/lib/util/get_aws_managed_prefix.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (Aws, aws_ec2 as ec2,aws_iam as iam, Fn) 2 | from aws_cdk.custom_resources import ( 3 | AwsCustomResource, 4 | AwsCustomResourcePolicy, 5 | PhysicalResourceId, 6 | AwsSdkCall 7 | ) 8 | from constructs import Construct 9 | 10 | class AwsManagedPrefixListProps: 11 | def __init__(self, name: str): 12 | """ 13 | Name of the AWS managed prefix list. 14 | See: https://docs.aws.amazon.com/vpc/latest/userguide/working-with-aws-managed-prefix-lists.html#available-aws-managed-prefix-lists 15 | eg.
com.amazonaws.global.cloudfront.origin-facing 16 | """ 17 | self.name = name 18 | 19 | class AwsManagedPrefixList(Construct): 20 | def __init__(self, scope: Construct, id: str, props: AwsManagedPrefixListProps): 21 | super().__init__(scope, id) 22 | res = AwsCustomResource( 23 | self, 'AWSCustomResource', 24 | on_create=self.create(props), 25 | policy=AwsCustomResourcePolicy.from_statements([ 26 | iam.PolicyStatement( 27 | effect=iam.Effect.ALLOW, 28 | actions=['ec2:DescribeManagedPrefixLists'], 29 | resources=['*'], 30 | ), 31 | ]) 32 | ) 33 | self.prefixlist_id=res.get_response_field("PrefixLists.0.PrefixListId") 34 | 35 | def create(self, props): 36 | custom_params = { 37 | 'Filters': [ 38 | { 39 | 'Name': 'prefix-list-name', 40 | 'Values': [props.name], 41 | }, 42 | ] 43 | } 44 | 45 | return AwsSdkCall( 46 | service='EC2', 47 | action='describeManagedPrefixLists', 48 | parameters=custom_params, 49 | physical_resource_id=PhysicalResourceId.of(f"{id}-{Fn.select(0, Fn.split(':', self.node.addr))}"), 50 | region=Aws.REGION 51 | ) 52 | 53 | -------------------------------------------------------------------------------- /source/lib/util/manifest_reader.py: -------------------------------------------------------------------------------- 1 | import yaml 2 | import urllib.request as request 3 | import os.path as path 4 | import sys 5 | 6 | def load_yaml_remotely(url, multi_resource=False): 7 | try: 8 | file_to_parse = request.urlopen(url) 9 | if multi_resource: 10 | yaml_data = list(yaml.full_load_all(file_to_parse)) 11 | else: 12 | yaml_data = yaml.full_load(file_to_parse) 13 | # print(yaml_data) 14 | except: 15 | print("Cannot read yaml config file {}, check formatting." 16 | "".format(file_to_parse)) 17 | sys.exit(1) 18 | 19 | return yaml_data 20 | 21 | def load_yaml_local(yaml_file, multi_resource=False): 22 | 23 | file_to_parse=path.join(path.dirname(__file__), yaml_file) 24 | if not path.exists(file_to_parse): 25 | print("The file {} does not exist" 26 | "".format(file_to_parse)) 27 | sys.exit(1) 28 | 29 | try: 30 | with open(file_to_parse, 'r') as yaml_stream: 31 | if multi_resource: 32 | yaml_data = list(yaml.full_load_all(yaml_stream)) 33 | else: 34 | yaml_data = yaml.full_load(yaml_stream) 35 | # print(yaml_data) 36 | except: 37 | print("Cannot read yaml config file {}, check formatting." 
38 | "".format(file_to_parse)) 39 | sys.exit(1) 40 | 41 | return yaml_data 42 | 43 | def load_yaml_replace_var_remotely(url, fields, multi_resource=False): 44 | try: 45 | with request.urlopen(url) as f: 46 | file_to_replace = f.read().decode('utf-8') 47 | for searchwrd,replwrd in fields.items(): 48 | file_to_replace = file_to_replace.replace(searchwrd, replwrd) 49 | 50 | if multi_resource: 51 | yaml_data = list(yaml.full_load_all(file_to_replace)) 52 | else: 53 | yaml_data = yaml.full_load(file_to_replace) 54 | # print(yaml_data) 55 | except request.URLError as e: 56 | print(e.reason) 57 | sys.exit(1) 58 | 59 | return yaml_data 60 | 61 | 62 | def load_yaml_replace_var_local(yaml_file, fields, multi_resource=False, write_output=False): 63 | 64 | file_to_replace=path.join(path.dirname(__file__), yaml_file) 65 | if not path.exists(file_to_replace): 66 | print("The file {} does not exist" 67 | "".format(file_to_replace)) 68 | sys.exit(1) 69 | 70 | try: 71 | with open(file_to_replace, 'r') as f: 72 | filedata = f.read() 73 | 74 | for searchwrd, replwrd in fields.items(): 75 | filedata = filedata.replace(searchwrd, replwrd) 76 | if multi_resource: 77 | yaml_data = list(yaml.full_load_all(filedata)) 78 | else: 79 | yaml_data = yaml.full_load(filedata) 80 | if write_output: 81 | with open(file_to_replace, "w") as f: 82 | yaml.dump(yaml_data, f, default_flow_style=False, allow_unicode = True, sort_keys=False) 83 | 84 | # print(yaml_data) 85 | except request.URLError as e: 86 | print(e.reason) 87 | sys.exit(1) 88 | 89 | return yaml_data 90 | -------------------------------------------------------------------------------- /source/lib/util/override_rule.py: -------------------------------------------------------------------------------- 1 | from constructs import IConstruct 2 | def suppress_cfnnag_rule(rule_id: str, reason: str, cnstrt: IConstruct): 3 | cnstrt.add_metadata('cfn_nag',{ 4 | "rules_to_suppress": [{ 5 | "id": rule_id, 6 | "reason": reason 7 | }] 8 | }) 9 | 10 | def suppress_lambda_cfnnag_rule(cnstrt: IConstruct): 11 | cnstrt.add_metadata('cfn_nag',{ 12 | "rules_to_suppress": [ 13 | { 14 | "id": "W58", 15 | "reason": "service role has permission to write logs to CloudWatch" 16 | }, 17 | { 18 | "id": "W89", 19 | "reason": "interal function does not need to associate to VPC" 20 | }, 21 | { 22 | "id": "W92", 23 | "reason": "Setting up ReservedConcurrentExecutions is out of reach with the internal function created by CDK" 24 | } 25 | ] 26 | }) 27 | 28 | def suppress_network_cfnnag_rule(cnstrt: IConstruct): 29 | cnstrt.add_metadata('cfn_nag',{ 30 | "rules_to_suppress": [ 31 | { 32 | "id": "W40", 33 | "reason": "Egress IP Protocol of -1 is default and generally considered OK" 34 | }, 35 | { 36 | "id": "W5", 37 | "reason": "The Security Group with cidr open considered OK" 38 | } 39 | ] 40 | }) 41 | 42 | def suppress_iam_cfnnag_rule(cnstrt: IConstruct): 43 | cnstrt.add_metadata('cfn_nag',{ 44 | "rules_to_suppress": [ 45 | { 46 | "id": "W12", 47 | "reason": "by default the role scaler_sa has * resource" 48 | }, 49 | { 50 | "id": "W76", 51 | "reason": "standard IAM role offered by ALB ingress controller" 52 | } 53 | ] 54 | }) -------------------------------------------------------------------------------- /source/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "aws-cdk": "^2.105.0", 4 | "vm2": "^3.9.10" 5 | } 6 | } 7 | -------------------------------------------------------------------------------- 
/source/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | pytest -------------------------------------------------------------------------------- /source/run-all-tests.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script runs all tests for the root CDK project, as well as any microservices, Lambda functions, or dependency 4 | # source code packages. These include unit tests, integration tests, and snapshot tests. 5 | # 6 | # It is important that this script be tested and validated to ensure that all available test fixtures are run. 7 | # 8 | 9 | [ "$DEBUG" == 'true' ] && set -x 10 | set -e 11 | 12 | setup_python_env() { 13 | if [ -d "./.venv-test" ]; then 14 | echo "Reusing already setup python venv in ./.venv-test. Delete ./.venv-test if you want a fresh one created." 15 | return 16 | fi 17 | echo "Setting up python venv-test" 18 | python3 -m venv .venv-test 19 | echo "Initiating virtual environment" 20 | source .venv-test/bin/activate 21 | echo "Installing python packages" 22 | pip3 install -e source 23 | echo "deactivate virtual environment" 24 | deactivate 25 | } 26 | 27 | setup_and_activate_python_env() { 28 | # module_path=$1 29 | # cd $module_path 30 | 31 | [ "${CLEAN:-true}" = "true" ] && rm -fr .venv-test 32 | 33 | setup_python_env 34 | 35 | echo "Initiating virtual environment" 36 | source .venv-test/bin/activate 37 | } 38 | 39 | 40 | run_python_test() { 41 | module_path=$(pwd) 42 | module_name=${1} 43 | echo $1 44 | echo "------------------------------------------------------------------------------" 45 | echo "[Test] Python path=$module_path module=$module_name" 46 | echo "------------------------------------------------------------------------------" 47 | 48 | 49 | # setup coverage report path 50 | mkdir -p $source_dir/test/coverage-reports 51 | coverage_report_path=$source_dir/test/coverage-reports/$module_name.coverage.xml 52 | 53 | echo "coverage report path set to $coverage_report_path" 54 | 55 | # Use -vv for debugging 56 | python3 -m pytest --cov --cov-report=term-missing --cov-report "xml:$coverage_report_path" 57 | 58 | if [ "$?" = "1" ]; then 59 | echo "(source/run-all-tests.sh) ERROR: there is likely output above." 1>&2 60 | exit 1 61 | fi 62 | sed -i -e "s,$source_dir,source,g" $coverage_report_path 63 | } 64 | 65 | run_cdk_project_test() { 66 | component_description=$1 67 | echo "------------------------------------------------------------------------------" 68 | echo "[Test] $component_description" 69 | echo "------------------------------------------------------------------------------" 70 | [ "${CLEAN:-true}" = "true" ] && npm run clean 71 | npm install 72 | npm run build 73 | npm run test -- -u 74 | if [ "$?" = "1" ]; then 75 | echo "(source/run-all-tests.sh) ERROR: there is likely output above." 
1>&2 76 | exit 1 77 | fi 78 | [ "${CLEAN:-true}" = "true" ] && rm -fr coverage 79 | } 80 | 81 | run_source_unit_test() { 82 | echo "------------------------------------------------------------------------------" 83 | echo "[Test] Run source unit tests" 84 | echo "------------------------------------------------------------------------------" 85 | 86 | # Test the functions 87 | cd $source_dir 88 | for folder in */; do 89 | if [ "$folder" = "test/" ]; then 90 | echo "------------------------------------------------------------------------------" 91 | echo "[Test] Run tests against $folder" 92 | echo "------------------------------------------------------------------------------" 93 | pip3 install -r $folder/requirement-test.txt 94 | run_python_test $folder 95 | rm -rf *.egg-info 96 | fi 97 | cd $source_dir 98 | done 99 | } 100 | 101 | # Clean the test environment before running tests and after finished running tests 102 | # The variable is option with default of 'true'. It can be overwritten by caller 103 | # setting the CLEAN environment variable. For example 104 | # $ CLEAN=true ./run-all-tests.sh 105 | # or 106 | # $ CLEAN=false ./run-all-tests.sh 107 | # 108 | CLEAN="${CLEAN:-true}" 109 | 110 | setup_and_activate_python_env 111 | source_dir=$PWD/source 112 | cd $source_dir 113 | 114 | python --version 115 | run_source_unit_test 116 | 117 | # Return to the root/ level where we started 118 | cd $source_dir -------------------------------------------------------------------------------- /source/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | import setuptools 4 | 5 | try: 6 | with open("../README.md") as fp: 7 | long_description = fp.read() 8 | except IOError as e: 9 | long_description = '' 10 | 11 | setuptools.setup( 12 | name="sql-based-etl", 13 | version="2.0.0", 14 | 15 | description="A CDK v2 Python app for SQL-based ETL", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | 19 | author="meloyang", 20 | 21 | package_dir={"": "./"}, 22 | packages=setuptools.find_packages(where="./"), 23 | 24 | install_requires=[ 25 | "aws-cdk-lib==2.105.0", 26 | "aws-cdk.lambda-layer-kubectl-v27==2.1.0", 27 | "constructs>=10.0.0,<11.0.0", 28 | "pyyaml==6.0.1" 29 | ], 30 | 31 | python_requires=">=3.7", 32 | 33 | classifiers=[ 34 | "Development Status :: 4 - Beta", 35 | 36 | "Intended Audience :: Developers", 37 | 38 | "License :: OSI Approved :: MIT License", 39 | 40 | "Programming Language :: JavaScript", 41 | "Programming Language :: Python :: 3 :: Only", 42 | "Programming Language :: Python :: 3.7", 43 | "Programming Language :: Python :: 3.8", 44 | "Programming Language :: Python :: 3.9", 45 | "Programming Language :: Python :: 3.10", 46 | "Programming Language :: Python :: 3.11", 47 | 48 | "Topic :: Software Development :: Code Generators", 49 | "Topic :: Utilities", 50 | 51 | "Typing :: Typed", 52 | ], 53 | ) 54 | --------------------------------------------------------------------------------
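setup.py above packages the CDK app; the stack class listed earlier in spark_on_eks_stack.py is wired up by source/app.py, which is not reproduced here. As rough orientation only, a reduced sketch of that wiring based solely on the constructor signature shown above (construct IDs and the cluster name are placeholders, and the real app.py also attaches the CloudFront nested stack):

```python
# Reduced sketch of a CDK v2 entry point for SparkOnEksStack (placeholders only; the
# repository's source/app.py is the authoritative version and does more wiring).
import aws_cdk as cdk
from lib.spark_on_eks_stack import SparkOnEksStack

app = cdk.App()
SparkOnEksStack(app, "SparkOnEKS",
                eksname="spark-on-eks",   # placeholder cluster name
                solution_id="SO0141",     # matches the template description above
                version="2.0.0")          # matches setup.py
app.synth()
```

Note that spark_on_eks_stack.py derives source_dir from os.environ['VIRTUAL_ENV'], so synthesis is expected to run inside the project's virtual environment (see source/requirements.txt and setup.py).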