├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── THIRD-PARTY-NOTICES.txt ├── emr-on-eks ├── README.md ├── deprovision.sh ├── green_taxi_load.ipynb ├── green_taxi_load.ipynb.license ├── green_taxi_schema.json ├── green_taxi_schema.json.license ├── provision.sh └── submit_arc_job.sh └── spark-on-eks ├── README.md ├── deployment ├── app_code │ ├── data │ │ ├── initial_contacts.csv │ │ └── update_contacts.csv │ ├── job │ │ ├── delta_load.ipynb │ │ ├── driver-pod-template.yaml │ │ ├── executor-pod-template.yaml │ │ ├── green_taxi_load.ipynb │ │ ├── initial_load.ipynb │ │ ├── msk_consumer.py │ │ ├── scd2_merge.ipynb │ │ └── wordcount.py │ ├── meta │ │ ├── contact_meta_0.json │ │ └── green_taxi_schema.json │ └── sql │ │ ├── add_calc_field_for_scd2.sql │ │ ├── create_table_contact.sql │ │ └── sqlvalidate_errors.sql ├── build-s3-dist.sh ├── cdk-solution-helper │ ├── README.md │ ├── index.js │ └── package.json ├── delete_all.sh └── post-deployment.sh ├── images ├── 00-deploy-to-aws.png ├── 3-argo-job-dependency.png ├── 3-argo-log.png ├── 3-argo-sidemenu.png ├── 4-auto-scaling.png ├── 4-k8s-retry.png ├── 4-spot-console.png ├── architecture.png ├── driver_interruption_test.gif ├── executor_interruption_test.png ├── fake_data.gif ├── run_jupyter.gif ├── submit_job_in_argo.gif ├── submit_native_spark.gif └── two_architecture.png └── source ├── app.py ├── app_resources ├── alb-iam-role.yaml ├── alb-values.yaml ├── argo-values.yaml ├── autoscaler-iam-role.yaml ├── autoscaler-values.yaml ├── etl-iam-role.yaml ├── etl-rbac.yaml ├── ex-secret-iam-role.yaml ├── ex-secret-values.yaml ├── jupyter-config.yaml ├── jupyter-values.yaml ├── native-spark-iam-role.yaml ├── native-spark-rbac.yaml ├── spark-operator-values.yaml └── spark-template.yaml ├── cdk.json ├── example ├── native-spark-job-scheduler.yaml ├── notebook │ ├── Spark_streaming_job.ipynb │ ├── nyctaxi-job.ipynb │ └── scd2-job.ipynb ├── nyctaxi-job-scheduler.yaml ├── scd2-job-scheduler.yaml └── test │ ├── TEST-arc-jupyter.yaml │ └── TEST-cron-job-scheduler.yaml ├── lib ├── cdk_infra │ ├── eks_base_app.py │ ├── eks_cluster.py │ ├── eks_service_account.py │ ├── iam_roles.py │ ├── network_sg.py │ ├── s3_app_code.py │ └── spark_permission.py ├── cloud_front_stack.py ├── spark_on_eks_stack.py └── util │ ├── get_aws_managed_prefix.py │ └── manifest_reader.py ├── package.json ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2021 Amazon.com, Inc. or its affiliates. 
2 | # 3 | # SPDX-License-Identifier: MIT-0 4 | # Git 5 | .git 6 | 7 | ### VisualStudioCode ### 8 | .vscode/* 9 | ### IntelliJ/ PyCharm ### 10 | .idea/* 11 | # System Files 12 | **/.DS_Store 13 | # CDK 14 | **/cdk.out 15 | **/cdk.context.json 16 | *.swp 17 | **/node_modules 18 | **/package-lock.json 19 | 20 | # compiled output 21 | **/global-s3-assets 22 | **/regional-s3-assets 23 | **/open-source 24 | 25 | ### Python ### 26 | # Byte-compiled / optimized / DLL files 27 | __pycache__/ 28 | *.py[cod] 29 | *$py.class 30 | # Python Distribution / packaging 31 | *.egg-info/ 32 | *.egg 33 | # Python Virtual Environments 34 | **/venv* 35 | **/.venv* 36 | **/.env 37 | ## Python Testing 38 | .pytest_cache 39 | **/.pytest_cache 40 | **/.coverage 41 | **/coverage-reports/ -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [3.0.0] - 2023-12-14 8 | ### Added 9 | - Added a compulsory CFN input parameter to restrict inbound CIDRs for ALB security group 10 | ## [2.0.0] - 2021-11-19 11 | ### Upgrade 12 | - upgrade the entire cdk code from version 1 to version 2 13 | ## [1.0.0] - 2020-12-04 14 | ### Added 15 | - All files, initial version 16 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. 
You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SQL-based ETL with Spark on EKS 2 | 3 | We introduce a quality-aware design to increase data processing productivity, by leveraging the open-source [Arc data framework](https://arc.tripl.ai/) for a user-centered, declarative ETL solution. We take into account the needs and expected skills of customers in data analytics, and accelerate their interaction with ETL practice in order to foster simplicity while maximizing efficiency. 4 | 5 | The sample provides two ways of running the solution shown in the architecture diagram: 6 | 1. Spark on EKS with the Argo Workflows tool 7 | 2. [EMR on EKS](https://aws.amazon.com/emr/features/eks/) 8 | 9 | ![](/spark-on-eks/images/two_architecture.png) 10 | 11 | ### Test job in Jupyter 12 | ![](/spark-on-eks/images/run_jupyter.gif) 13 | 14 | 15 | ### Test Spark driver self-recovery (100% Spot) 16 | ![](/spark-on-eks/images/driver_interruption_test.gif) 17 | 18 | ### Submit a Spark job with the Argo tool 19 | ![](/spark-on-eks/images/submit_job_in_argo.gif) 20 | 21 | 22 | ## Prerequisites 23 | To run the sample solution on a local machine, you should have the following prerequisites: 24 | 1. Python 3.6 or later. Download Python [here](https://www.python.org/downloads/). 25 | 2. AWS CLI version 1. 26 | Windows: [MSI installer](https://docs.aws.amazon.com/cli/latest/userguide/install-windows.html#install-msi-on-windows) 27 | Linux, macOS or Unix: [Bundled installer](https://docs.aws.amazon.com/cli/latest/userguide/install-macos.html#install-macosos-bundled) 28 | 3. The AWS CLI is configured to communicate with services in your deployment account. Otherwise, either set your profile via `export AWS_PROFILE=`, or run the following command to set up your AWS account access. 29 | ```bash 30 | aws configure 31 | ``` 32 | If you don’t want to install anything on your computer, use [AWS CloudShell](https://aws.amazon.com/cloudshell/), a browser-based shell that makes it easy to run scripts with the AWS Command Line Interface (AWS CLI). 33 | 34 | ## Clone the project 35 | Download the sample code either to your computer or to your [AWS CloudShell Console](https://console.aws.amazon.com/cloudshell/home?region=us-east-1). 36 | 37 | ```bash 38 | git clone https://github.com/aws-samples/sql-based-etl-on-amazon-eks.git 39 | cd sql-based-etl-on-amazon-eks 40 | ``` 41 | 42 | ## Deploy Infrastructure 43 | 44 | The provisioning takes about 30 minutes to complete. See the `Troubleshooting` section if you have any problems during the deployment. 45 | 46 | The example solution provides two options to submit ETL jobs. See the detailed deployment instructions: 47 | 48 | 1. [Spark on EKS](/spark-on-eks/README.md) 49 | 2. [EMR on EKS](/emr-on-eks/README.md) 50 | 51 | 52 | ## Troubleshooting 53 | 1. If you see the error `Credentials were refreshed, but the refreshed credentials are still expired` in AWS CloudShell, click the **Actions** button and create a `New tab`. 54 | 55 | 2. If you see the issue `[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1123)`, it most likely means there is no default certificate authority for your Python installation on macOS. Refer to this [answer](https://stackoverflow.com/questions/52805115/0nd); running `Install Certificates.command` should fix your local environment. 
Otherwise, use [Cloud9](https://aws.amazon.com/cloud9/details/) to deploy the CDK instead. 56 | 57 | 58 | ## Security 59 | 60 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 61 | 62 | ## License 63 | 64 | This library is licensed under the MIT-0 License. See the [LICENSE](LICENSE.txt) file. -------------------------------------------------------------------------------- /THIRD-PARTY-NOTICES.txt: -------------------------------------------------------------------------------- 1 | ** aws-cdk -- v1.96.0 -- https://github.com/aws/aws-cdk -- Apache-2.0 2 | ** cdk-solution-init-pkg; version 1.0.0 -- https://aws.amazon.com/solutions/ -- Apahe-2.0 3 | ** Arc -- v3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 -- https://arc.tripl.ai/ -- MIT License 4 | ** Arc Jupyter - v3.14.2_scala_2.12_hadoop_3.2.0_1.1.0 -- https://github.com/tripl-ai/arc-jupyter -- MIT License 5 | ** argo-workflows -- v3.5.4 -- https://github.com/argoproj/argo-helm -- Apache-2.0 6 | ** JupyterHub -- v1.5.0 -- https://jupyterhub.github.io/helm-chart/ -- revised BSD license 7 | ** k8s-cluster-autoscaler -- v1.27.2 -- https://github.com/kubernetes/autoscaler -- Apache-2.0 8 | ** amazon-cloudwatch-container-insights -- latest version -- https://github.com/aws-samples/amazon-cloudwatch-container-insights -- MIT-0 9 | ** aws-load-balancer-controller -- v2.5.4 -- https://github.com/aws/eks-charts/ -- Apache-2.0 10 | ** kubernetes-external-secrets -- v8.5.5 -- https://github.com/external-secrets/kubernetes-external-secrets -- MIT License 11 | ** spark-on-k8s-operator -- v1beta2-1.2.3-3.1.1 -- https://github.com/GoogleCloudPlatform/spark-on-k8s-operator -- Apache-2.0 12 | 13 | Apache License 14 | 15 | Version 2.0, January 2004 16 | 17 | http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND 18 | DISTRIBUTION 19 | 20 | 1. Definitions. 21 | 22 | "License" shall mean the terms and conditions for use, reproduction, and 23 | distribution as defined by Sections 1 through 9 of this document. 24 | 25 | "Licensor" shall mean the copyright owner or entity authorized by the 26 | copyright owner that is granting the License. 27 | 28 | "Legal Entity" shall mean the union of the acting entity and all other 29 | entities that control, are controlled by, or are under common control 30 | with that entity. For the purposes of this definition, "control" means 31 | (i) the power, direct or indirect, to cause the direction or management 32 | of such entity, whether by contract or otherwise, or (ii) ownership of 33 | fifty percent (50%) or more of the outstanding shares, or (iii) 34 | beneficial ownership of such entity. 35 | 36 | "You" (or "Your") shall mean an individual or Legal Entity exercising 37 | permissions granted by this License. 38 | 39 | "Source" form shall mean the preferred form for making modifications, 40 | including but not limited to software source code, documentation source, 41 | and configuration files. 42 | 43 | "Object" form shall mean any form resulting from mechanical 44 | transformation or translation of a Source form, including but not limited 45 | to compiled object code, generated documentation, and conversions to 46 | other media types. 47 | 48 | "Work" shall mean the work of authorship, whether in Source or Object 49 | form, made available under the License, as indicated by a copyright 50 | notice that is included in or attached to the work (an example is 51 | provided in the Appendix below). 
52 | 53 | "Derivative Works" shall mean any work, whether in Source or Object form, 54 | that is based on (or derived from) the Work and for which the editorial 55 | revisions, annotations, elaborations, or other modifications represent, 56 | as a whole, an original work of authorship. For the purposes of this 57 | License, Derivative Works shall not include works that remain separable 58 | from, or merely link (or bind by name) to the interfaces of, the Work and 59 | Derivative Works thereof. 60 | 61 | "Contribution" shall mean any work of authorship, including the original 62 | version of the Work and any modifications or additions to that Work or 63 | Derivative Works thereof, that is intentionally submitted to Licensor for 64 | inclusion in the Work by the copyright owner or by an individual or Legal 65 | Entity authorized to submit on behalf of the copyright owner. For the 66 | purposes of this definition, "submitted" means any form of electronic, 67 | verbal, or written communication sent to the Licensor or its 68 | representatives, including but not limited to communication on electronic 69 | mailing lists, source code control systems, and issue tracking systems 70 | that are managed by, or on behalf of, the Licensor for the purpose of 71 | discussing and improving the Work, but excluding communication that is 72 | conspicuously marked or otherwise designated in writing by the copyright 73 | owner as "Not a Contribution." 74 | 75 | "Contributor" shall mean Licensor and any individual or Legal Entity on 76 | behalf of whom a Contribution has been received by Licensor and 77 | subsequently incorporated within the Work. 78 | 79 | 2. Grant of Copyright License. Subject to the terms and conditions of this 80 | License, each Contributor hereby grants to You a perpetual, worldwide, 81 | non-exclusive, no-charge, royalty-free, irrevocable copyright license to 82 | reproduce, prepare Derivative Works of, publicly display, publicly perform, 83 | sublicense, and distribute the Work and such Derivative Works in Source or 84 | Object form. 85 | 86 | 3. Grant of Patent License. Subject to the terms and conditions of this 87 | License, each Contributor hereby grants to You a perpetual, worldwide, 88 | non-exclusive, no-charge, royalty-free, irrevocable (except as stated in 89 | this section) patent license to make, have made, use, offer to sell, sell, 90 | import, and otherwise transfer the Work, where such license applies only to 91 | those patent claims licensable by such Contributor that are necessarily 92 | infringed by their Contribution(s) alone or by combination of their 93 | Contribution(s) with the Work to which such Contribution(s) was submitted. 94 | If You institute patent litigation against any entity (including a 95 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 96 | Contribution incorporated within the Work constitutes direct or contributory 97 | patent infringement, then any patent licenses granted to You under this 98 | License for that Work shall terminate as of the date such litigation is 99 | filed. 100 | 101 | 4. Redistribution. 
You may reproduce and distribute copies of the Work or 102 | Derivative Works thereof in any medium, with or without modifications, and 103 | in Source or Object form, provided that You meet the following conditions: 104 | 105 | (a) You must give any other recipients of the Work or Derivative Works a 106 | copy of this License; and 107 | 108 | (b) You must cause any modified files to carry prominent notices stating 109 | that You changed the files; and 110 | 111 | (c) You must retain, in the Source form of any Derivative Works that You 112 | distribute, all copyright, patent, trademark, and attribution notices 113 | from the Source form of the Work, excluding those notices that do not 114 | pertain to any part of the Derivative Works; and 115 | 116 | (d) If the Work includes a "NOTICE" text file as part of its 117 | distribution, then any Derivative Works that You distribute must include 118 | a readable copy of the attribution notices contained within such NOTICE 119 | file, excluding those notices that do not pertain to any part of the 120 | Derivative Works, in at least one of the following places: within a 121 | NOTICE text file distributed as part of the Derivative Works; within the 122 | Source form or documentation, if provided along with the Derivative 123 | Works; or, within a display generated by the Derivative Works, if and 124 | wherever such third-party notices normally appear. The contents of the 125 | NOTICE file are for informational purposes only and do not modify the 126 | License. You may add Your own attribution notices within Derivative Works 127 | that You distribute, alongside or as an addendum to the NOTICE text from 128 | the Work, provided that such additional attribution notices cannot be 129 | construed as modifying the License. 130 | 131 | You may add Your own copyright statement to Your modifications and may 132 | provide additional or different license terms and conditions for use, 133 | reproduction, or distribution of Your modifications, or for any such 134 | Derivative Works as a whole, provided Your use, reproduction, and 135 | distribution of the Work otherwise complies with the conditions stated in 136 | this License. 137 | 138 | 5. Submission of Contributions. Unless You explicitly state otherwise, any 139 | Contribution intentionally submitted for inclusion in the Work by You to the 140 | Licensor shall be under the terms and conditions of this License, without 141 | any additional terms or conditions. Notwithstanding the above, nothing 142 | herein shall supersede or modify the terms of any separate license agreement 143 | you may have executed with Licensor regarding such Contributions. 144 | 145 | 6. Trademarks. This License does not grant permission to use the trade 146 | names, trademarks, service marks, or product names of the Licensor, except 147 | as required for reasonable and customary use in describing the origin of the 148 | Work and reproducing the content of the NOTICE file. 149 | 150 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in 151 | writing, Licensor provides the Work (and each Contributor provides its 152 | Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 153 | KIND, either express or implied, including, without limitation, any 154 | warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or 155 | FITNESS FOR A PARTICULAR PURPOSE. 
You are solely responsible for determining 156 | the appropriateness of using or redistributing the Work and assume any risks 157 | associated with Your exercise of permissions under this License. 158 | 159 | 8. Limitation of Liability. In no event and under no legal theory, whether 160 | in tort (including negligence), contract, or otherwise, unless required by 161 | applicable law (such as deliberate and grossly negligent acts) or agreed to 162 | in writing, shall any Contributor be liable to You for damages, including 163 | any direct, indirect, special, incidental, or consequential damages of any 164 | character arising as a result of this License or out of the use or inability 165 | to use the Work (including but not limited to damages for loss of goodwill, 166 | work stoppage, computer failure or malfunction, or any and all other 167 | commercial damages or losses), even if such Contributor has been advised of 168 | the possibility of such damages. 169 | 170 | 9. Accepting Warranty or Additional Liability. While redistributing the Work 171 | or Derivative Works thereof, You may choose to offer, and charge a fee for, 172 | acceptance of support, warranty, indemnity, or other liability obligations 173 | and/or rights consistent with this License. However, in accepting such 174 | obligations, You may act only on Your own behalf and on Your sole 175 | responsibility, not on behalf of any other Contributor, and only if You 176 | agree to indemnify, defend, and hold each Contributor harmless for any 177 | liability incurred by, or claims asserted against, such Contributor by 178 | reason of your accepting any such warranty or additional liability. END OF 179 | TERMS AND CONDITIONS 180 | 181 | APPENDIX: How to apply the Apache License to your work. 182 | 183 | To apply the Apache License to your work, attach the following boilerplate 184 | notice, with the fields enclosed by brackets "[]" replaced with your own 185 | identifying information. (Don't include the brackets!) The text should be 186 | enclosed in the appropriate comment syntax for the file format. We also 187 | recommend that a file or class name and description of purpose be included on 188 | the same "printed page" as the copyright notice for easier identification 189 | within third-party archives. 190 | 191 | Copyright [yyyy] [name of copyright owner] 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | 195 | you may not use this file except in compliance with the License. 196 | 197 | You may obtain a copy of the License at 198 | 199 | http://www.apache.org/licenses/LICENSE-2.0 200 | 201 | Unless required by applicable law or agreed to in writing, software 202 | 203 | distributed under the License is distributed on an "AS IS" BASIS, 204 | 205 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 206 | 207 | See the License for the specific language governing permissions and 208 | 209 | limitations under the License. 210 | 211 | * For cdk-solution-init-pkg see also this required NOTICE: 212 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 213 | Licensed under the Apache License Version 2.0 (the "License"). You may not 214 | use this file except 215 | in compliance with the License. A copy of the License is located at 216 | http://www.apache.org/licenses/ 217 | or in the "license" file accompanying this file. This file is distributed 218 | on an "AS IS" BASIS, 219 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. 
See the 220 | License for the 221 | specific language governing permissions and limitations under the License. -------------------------------------------------------------------------------- /emr-on-eks/README.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | # Arc ETL framework on EMR on EKS 8 | AWS launched [EMR on EKS](https://aws.amazon.com/emr/features/eks/), and this sample demonstrates an end-to-end process to provision an EKS cluster and execute a Spark ETL job defined as a [Jupyter notebook](green_taxi_load.ipynb) using the [Arc framework](https://arc.tripl.ai/getting-started/). 9 | 10 | # Provisioning 11 | 1. Open AWS CloudShell in us-east-1: [link to AWS CloudShell](https://console.aws.amazon.com/cloudshell/home?region=us-east-1) 12 | 2. Run the following command to provision a new EKS cluster `eks-cluster` backed by Fargate and build a virtual EMR cluster `emr-on-eks-cluster`: 13 | ```bash 14 | curl https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/main/emr-on-eks/provision.sh | bash 15 | ``` 16 | 3. Once provisioning is complete (~20 min), run the following command to submit a new Spark job on the virtual EMR cluster: 17 | ```bash 18 | curl https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/main/emr-on-eks/submit_arc_job.sh | bash 19 | ``` 20 | The sample job will create an output S3 bucket, load the [TLC green taxi trip records](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) from the public `s3://nyc-tlc/csv_backup/green_tripdata*.csv` location, apply the schema, convert the data into Parquet, and store it in the output S3 bucket. 21 | 22 | The job is defined as a [Jupyter notebook, green_taxi_load.ipynb](green_taxi_load.ipynb), using the [Arc framework](https://arc.tripl.ai/getting-started/), and the applied schema is defined in [green_taxi_schema.json](green_taxi_schema.json). 23 | 24 | 25 | ## AWS Resources 26 | * EKS cluster: [link to AWS Console](https://console.aws.amazon.com/eks/home?region=us-east-1#/clusters/eks-cluster) 27 | * Virtual EMR clusters and jobs: [link to AWS Console](https://console.aws.amazon.com/elasticmapreduce/home?region=us-east-1#virtual-cluster-list:) 28 | * CloudWatch EMR job logs: [link to AWS Console](https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/$252Faws$252Feks$252Feks-cluster$252Fjobs) 29 | * S3 buckets - navigate to the output S3 bucket: [link to AWS Console](https://s3.console.aws.amazon.com/s3/home?region=us-east-1) 30 | 31 | ## EKS Resources 32 | To review the execution process, run: 33 | ``` 34 | kubectl get po -n emr 35 | ``` 36 | 37 | # Cleanup 38 | To clean up resources, run: 39 | ```bash 40 | curl https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/main/emr-on-eks/deprovision.sh | bash 41 | ``` 42 | 43 | 44 | 45 | That's it! 46 | -------------------------------------------------------------------------------- /emr-on-eks/deprovision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-FileCopyrightText: Copyright 2021 Amazon.com, Inc. or its affiliates. 
4 | # SPDX-License-Identifier: MIT-0 5 | 6 | # Define params 7 | export AWS_DEFAULT_REGION=us-east-1 8 | export EKSCLUSTERNAME=eks-cluster 9 | export EMRCLUSTERNAME=emr-on-$EKSCLUSTERNAME 10 | export ROLENAME=${EMRCLUSTERNAME}-execution-role 11 | 12 | #submit test job 13 | export EMRCLUSTERID=$(aws emr-containers list-virtual-clusters --query "virtualClusters[?name == '${EMRCLUSTERNAME}' && state == 'RUNNING'].id" --output text) 14 | export ACCOUNTID=$(aws sts get-caller-identity --query Account --output text) 15 | export ROLEARN=arn:aws:iam::$ACCOUNTID:role/$ROLENAME 16 | export OUTPUTS3BUCKET=${EMRCLUSTERNAME}-${ACCOUNTID} 17 | export POLICYARN=arn:aws:iam::$ACCOUNTID:policy/${ROLENAME}-policy 18 | 19 | # install eksctl (https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up-eksctl.html) 20 | curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp 21 | sudo mv /tmp/eksctl /usr/local/bin 22 | 23 | # update aws CLI to the latest version (we will require aws cli version >= 2.1.14) 24 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip" 25 | unzip -q -o /tmp/awscliv2.zip -d /tmp 26 | sudo /tmp/aws/install --update 27 | 28 | # clean up resources 29 | aws emr-containers delete-virtual-cluster --id $EMRCLUSTERID 30 | eksctl delete cluster --name=$EKSCLUSTERNAME 31 | aws iam detach-role-policy --role-name $ROLENAME --policy-arn $POLICYARN 32 | aws iam delete-role --role-name $ROLENAME 33 | aws iam delete-policy --policy-arn $POLICYARN 34 | aws s3 rm s3://$OUTPUTS3BUCKET --recursive 35 | aws s3api delete-bucket --bucket $OUTPUTS3BUCKET 36 | 37 | -------------------------------------------------------------------------------- /emr-on-eks/green_taxi_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%env\n", 10 | "SCHEMA=https://\n", 11 | "OUTPUT=s3://\n", 12 | "ETL_CONF_ENV=production" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "{\n", 22 | " \"type\": \"DelimitedExtract\",\n", 23 | " \"name\": \"extract csv data from nyc_tripdata\",\n", 24 | " \"environments\": [\"production\", \"test\"],\n", 25 | " \"inputURI\": \"s3a://nyc-tlc/csv_backup/green_tripdata_*.csv\",\n", 26 | " \"outputView\": \"green_tripdata0_raw\", \n", 27 | " \"delimiter\": \"Comma\",\n", 28 | " \"quote\" : \"DoubleQuote\",\n", 29 | " \"header\": true,\n", 30 | " \"persist\": true\n", 31 | "}" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "{\n", 41 | " \"type\": \"TypingTransform\",\n", 42 | " \"name\": \"apply green_tripdata schema 0 data types\",\n", 43 | " \"environments\": [\"production\", \"test\"],\n", 44 | " \"schemaURI\": ${SCHEMA},\n", 45 | " \"inputView\": \"green_tripdata0_raw\", \n", 46 | " \"outputView\": \"green_tripdata0\"\n", 47 | "}" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "%sql name=\"aggregate the result by month and year\" outputView=green_trip_summery environments=production,test persist=true\n", 57 | "\n", 58 | "SELECT \n", 59 | " year(lpep_pickup_datetime) AS trip_year\n", 60 | " 
,month(lpep_pickup_datetime) AS trip_month\n", 61 | " ,vendor_id\n", 62 | " ,sum(coalesce(trip_distance,0)) AS total_distance\n", 63 | " ,sum(coalesce(total_amount,0)) AS total_fee\n", 64 | "FROM green_tripdata0\n", 65 | "GROUP BY trip_year, trip_month, vendor_id" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "{\n", 75 | " \"type\": \"ParquetLoad\",\n", 76 | " \"name\": \"write out green_tripdata0 dataset as Parquet\",\n", 77 | " \"environments\": [\"production\", \"test\"],\n", 78 | " \"inputView\": \"green_trip_summery\",\n", 79 | " \"outputURI\": ${OUTPUT},\n", 80 | " \"saveMode\": \"Overwrite\"\n", 81 | "}" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Arc", 95 | "language": "javascript", 96 | "name": "arc" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": "javascript", 100 | "file_extension": ".json", 101 | "mimetype": "javascript", 102 | "name": "arc", 103 | "nbconvert_exporter": "arcexport", 104 | "version": "3.12.1" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 4 109 | } 110 | -------------------------------------------------------------------------------- /emr-on-eks/green_taxi_load.ipynb.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2021 Amazon.com, Inc. or its affiliates. 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /emr-on-eks/green_taxi_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "f457e562-5c7a-4215-a754-ab749509f3fb", 4 | "name": "vendor_id", 5 | "description": "A code indicating the TPEP provider that provided the record.", 6 | "trim": true, 7 | "nullable": true, 8 | "type": "integer", 9 | "nullableValues": [ 10 | "", 11 | "null" 12 | ] 13 | }, 14 | { 15 | "id": "d61934ed-e32e-406b-bd18-8d6b7296a8c0", 16 | "name": "lpep_pickup_datetime", 17 | "description": "The date and time when the meter was engaged.", 18 | "trim": true, 19 | "nullable": true, 20 | "type": "timestamp", 21 | "formatters": [ 22 | "uuuu-MM-dd HH:mm:ss" 23 | ], 24 | "timezoneId": "America/New_York", 25 | "nullableValues": [ 26 | "", 27 | "null" 28 | ] 29 | }, 30 | { 31 | "id": "d61934ed-e32e-406b-bd18-8d6b7296a8c0", 32 | "name": "lpep_dropoff_datetime", 33 | "description": "The date and time when the meter was disengaged.", 34 | "trim": true, 35 | "nullable": true, 36 | "type": "timestamp", 37 | "formatters": [ 38 | "uuuu-MM-dd HH:mm:ss" 39 | ], 40 | "timezoneId": "America/New_York", 41 | "nullableValues": [ 42 | "", 43 | "null" 44 | ] 45 | }, 46 | { 47 | "id": "aa315986-9fa9-4aa2-a72e-411196648351", 48 | "name": "store_and_fwd_flag", 49 | "description": "This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka 'store and forward', because the vehicle did not have a connection to the server.", 50 | "trim": true, 51 | "nullable": true, 52 | "type": "boolean", 53 | "nullableValues": [ 54 | "", 55 | "null" 56 | ], 57 | "trueValues": [ 58 | "Y" 59 | ], 60 | "falseValues": [ 61 | "N" 62 | ] 63 | }, 64 | { 65 | "id": "ce66288c-65c1-45b7-83b4-5de3f38f89b7", 66 | "name": "rate_code_id", 67 | "description": "The final rate code in effect at the end of the trip.", 68 | "trim": 
true, 69 | "nullable": true, 70 | "type": "integer", 71 | "nullableValues": [ 72 | "", 73 | "null" 74 | ] 75 | }, 76 | { 77 | "id": "2d7b4a53-5203-4273-bd4a-3bbc742539ec", 78 | "name": "pickup_longitude", 79 | "description": "Longitude where the meter was engaged.", 80 | "trim": true, 81 | "nullable": true, 82 | "type": "decimal", 83 | "nullableValues": [ 84 | "0" 85 | ], 86 | "precision": 18, 87 | "scale": 14 88 | }, 89 | { 90 | "id": "a183ecd0-6169-429c-8bc0-0df4f08526e8", 91 | "name": "pickup_latitude", 92 | "description": "Latitude where the meter was engaged.", 93 | "trim": true, 94 | "nullable": true, 95 | "type": "decimal", 96 | "nullableValues": [ 97 | "0" 98 | ], 99 | "precision": 18, 100 | "scale": 14 101 | }, 102 | { 103 | "id": "a3d6135c-202f-4ba6-ab25-93fa6c28bc97", 104 | "name": "dropoff_longitude", 105 | "description": "Longitude where the meter was disengaged.", 106 | "trim": true, 107 | "nullable": true, 108 | "type": "decimal", 109 | "nullableValues": [ 110 | "0" 111 | ], 112 | "precision": 18, 113 | "scale": 14 114 | }, 115 | { 116 | "id": "77160ee6-5040-4444-a731-45902b32911f", 117 | "name": "dropoff_latitude", 118 | "description": "Latitude where the meter was disengaged.", 119 | "trim": true, 120 | "nullable": true, 121 | "type": "decimal", 122 | "nullableValues": [ 123 | "0" 124 | ], 125 | "precision": 18, 126 | "scale": 14 127 | }, 128 | { 129 | "id": "ef1fe668-7850-4ef5-966b-0813d2024c32", 130 | "name": "passenger_count", 131 | "description": "The number of passengers in the vehicle. This is a driver-entered value.", 132 | "trim": true, 133 | "nullable": true, 134 | "type": "integer", 135 | "nullableValues": [ 136 | "", 137 | "null" 138 | ] 139 | }, 140 | { 141 | "id": "77160ee6-5040-4444-a731-45902b32911f", 142 | "name": "trip_distance", 143 | "description": "The elapsed trip distance in miles reported by the taximeter.", 144 | "trim": true, 145 | "nullable": true, 146 | "type": "decimal", 147 | "nullableValues": [ 148 | "0", 149 | "null" 150 | ], 151 | "precision": 18, 152 | "scale": 15 153 | }, 154 | { 155 | "id": "e71597c1-67ae-4176-9ae3-ae4dbe0886b9", 156 | "name": "fare_amount", 157 | "description": "The time-and-distance fare calculated by the meter.", 158 | "trim": true, 159 | "nullable": true, 160 | "type": "decimal", 161 | "nullableValues": [ 162 | "", 163 | "null" 164 | ], 165 | "precision": 10, 166 | "scale": 2 167 | }, 168 | { 169 | "id": "77d91cb6-22e4-4dba-883a-eee0c8690f31", 170 | "name": "extra", 171 | "description": "Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges.", 172 | "trim": true, 173 | "nullable": true, 174 | "type": "decimal", 175 | "nullableValues": [ 176 | "", 177 | "null" 178 | ], 179 | "precision": 10, 180 | "scale": 2 181 | }, 182 | { 183 | "id": "aebe7970-91dc-4155-b9a9-78dbcf836ac8", 184 | "name": "mta_tax", 185 | "description": "$0.50 MTA tax that is automatically triggered based on the metered rate in use.", 186 | "trim": true, 187 | "nullable": true, 188 | "type": "decimal", 189 | "nullableValues": [ 190 | "", 191 | "null" 192 | ], 193 | "precision": 10, 194 | "scale": 2 195 | }, 196 | { 197 | "id": "3630c209-a88c-4dd7-ab43-276234f04252", 198 | "name": "tip_amount", 199 | "description": "Tip amount – This field is automatically populated for credit card tips. 
Cash tips are not included.", 200 | "trim": true, 201 | "nullable": true, 202 | "type": "decimal", 203 | "nullableValues": [ 204 | "", 205 | "null" 206 | ], 207 | "precision": 10, 208 | "scale": 2 209 | }, 210 | { 211 | "id": "9d10371c-c08c-461a-a1a9-e5cd0c46655c", 212 | "name": "tolls_amount", 213 | "description": "Total amount of all tolls paid in trip.", 214 | "trim": true, 215 | "nullable": true, 216 | "type": "decimal", 217 | "nullableValues": [ 218 | "", 219 | "null" 220 | ], 221 | "precision": 10, 222 | "scale": 2 223 | }, 224 | { 225 | "id": "f59aba58-2a8c-40f9-830b-f1abafe80b7f", 226 | "name": "ehail_fee", 227 | "description": "Fee for allowing passengers to 'e-hail' a New York City taxicab via downloadable smartphone applications.", 228 | "trim": true, 229 | "nullable": true, 230 | "type": "decimal", 231 | "nullableValues": [ 232 | "", 233 | "null" 234 | ], 235 | "precision": 10, 236 | "scale": 2 237 | }, 238 | { 239 | "id": "1414fd4b-32ed-430c-a4b0-a569e7144bbb", 240 | "name": "total_amount", 241 | "description": "The total amount charged to passengers. Does not include cash tips.", 242 | "trim": true, 243 | "nullable": true, 244 | "type": "decimal", 245 | "nullableValues": [ 246 | "", 247 | "null" 248 | ], 249 | "precision": 10, 250 | "scale": 2 251 | }, 252 | { 253 | "id": "5b43ec13-dc16-40bd-8af5-4e2f85285e15", 254 | "name": "payment_type", 255 | "description": "A numeric code signifying how the passenger paid for the trip.", 256 | "trim": true, 257 | "nullable": true, 258 | "type": "integer", 259 | "nullableValues": [ 260 | "", 261 | "null" 262 | ] 263 | }, 264 | { 265 | "id": "bccf357f-6671-4168-998a-c991fdcf7fe0", 266 | "name": "trip_type", 267 | "description": "A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver.", 268 | "trim": true, 269 | "nullable": true, 270 | "type": "integer", 271 | "nullableValues": [ 272 | "", 273 | "null" 274 | ] 275 | } 276 | ] -------------------------------------------------------------------------------- /emr-on-eks/green_taxi_schema.json.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2021 Amazon.com, Inc. or its affiliates. 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /emr-on-eks/provision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-FileCopyrightText: Copyright 2021 Amazon.com, Inc. or its affiliates. 
4 | # SPDX-License-Identifier: MIT-0 5 | 6 | # Define params 7 | export AWS_DEFAULT_REGION=us-east-1 8 | export EKSCLUSTERNAME=eks-cluster 9 | export EMRCLUSTERNAME=emr-on-$EKSCLUSTERNAME 10 | export ROLENAME=${EMRCLUSTERNAME}-execution-role 11 | 12 | # Using EKS Fargate mode, uncomment to use EKS EC2 mode 13 | EKSCTL_PARAM="--fargate" 14 | # EKSCTL_PARAM="--nodes 6 --node-type t3.xlarge" 15 | 16 | # install eksctl (https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up-eksctl.html) 17 | curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp 18 | sudo mv /tmp/eksctl /usr/local/bin 19 | 20 | # update aws CLI to the latest version (we will require aws cli version >= 2.1.14) 21 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip" 22 | unzip -q -o /tmp/awscliv2.zip -d /tmp 23 | sudo /tmp/aws/install --update 24 | 25 | # install kubectl 26 | curl -L "https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl" \ 27 | -o "/tmp/kubectl" 28 | chmod +x /tmp/kubectl 29 | sudo mv /tmp/kubectl /usr/local/bin 30 | 31 | # Provision eks cluster called “eks-fargate” backed by fargate 32 | eksctl create cluster --name $EKSCLUSTERNAME --with-oidc --zones ${AWS_DEFAULT_REGION}a,${AWS_DEFAULT_REGION}b $EKSCTL_PARAM 33 | aws eks update-kubeconfig --name $EKSCLUSTERNAME 34 | 35 | # Create kubernetes namespace 'emr' for EMR 36 | kubectl create namespace emr 37 | 38 | # Create fargate profile 'fp-emr' for namespace 'emr' 39 | eksctl create fargateprofile --cluster $EKSCLUSTERNAME --name fp-emr --namespace emr 40 | 41 | # Wait for EKS cluster to finish provisioning, enable all logging 42 | # Enable cluster access for Amazon EMR on EKS (https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up-cluster-access.html) in the 'emr' kubernetes namespace by running: 43 | eksctl create iamidentitymapping --cluster $EKSCLUSTERNAME --namespace "emr" --service-name "emr-containers" 44 | eksctl utils update-cluster-logging --cluster $EKSCLUSTERNAME --enable-types all --approve 45 | 46 | # create S3 bucket for output 47 | export ACCOUNTID=$(aws sts get-caller-identity --query Account --output text) 48 | export OUTPUTS3BUCKET=${EMRCLUSTERNAME}-${ACCOUNTID} 49 | aws s3api create-bucket --bucket $OUTPUTS3BUCKET 50 | 51 | # Create a job execution role (https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/creating-job-execution-role.html) 52 | cat > /tmp/job-execution-policy.json < /tmp/trust-policy.json <= 2.1.14) 19 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip" 20 | unzip -q -o /tmp/awscliv2.zip -d /tmp 21 | sudo /tmp/aws/install --update 22 | 23 | # install kubectl 24 | curl -L "https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl" \ 25 | -o "/tmp/kubectl" 26 | chmod +x /tmp/kubectl 27 | sudo mv /tmp/kubectl /usr/local/bin 28 | 29 | # sumbit job 30 | 31 | aws emr-containers start-job-run --virtual-cluster-id $EMRCLUSTERID \ 32 | --name arc-job --execution-role-arn $ROLEARN --release-label emr-6.2.0-latest \ 33 | --job-driver '{"sparkSubmitJobDriver": {"entryPoint": "https://repo1.maven.org/maven2/ai/tripl/arc_2.12/3.6.2/arc_2.12-3.6.2.jar", 
"entryPointArguments":["--etl.config.uri=https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/main/emr-on-eks/green_taxi_load.ipynb"], "sparkSubmitParameters": "--packages com.typesafe:config:1.4.0 --class ai.tripl.arc.ARC --conf spark.executor.instances=10 --conf spark.executor.memory=4G --conf spark.driver.memory=2G --conf spark.executor.cores=2 --conf spark.kubernetes.driverEnv.ETL_CONF_ENV=production --conf spark.kubernetes.driverEnv.OUTPUT=s3://'$OUTPUTS3BUCKET'/output/ --conf spark.kubernetes.driverEnv.SCHEMA=https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/main/emr-on-eks/green_taxi_schema.json"}}' \ 34 | --configuration-overrides '{"monitoringConfiguration": {"cloudWatchMonitoringConfiguration": {"logGroupName": "/aws/eks/'$EKSCLUSTERNAME'/jobs", "logStreamNamePrefix": "arc-job"}}}' 35 | 36 | echo "Job submitted" 37 | echo "Navigate to https://console.aws.amazon.com/emr/home?#/eks/clusters/"${EMRCLUSTERID}" to view job status" 38 | 39 | echo "Navigate to the output S3 bucket here https://s3.console.aws.amazon.com/s3/buckets/"${OUTPUTS3BUCKET}" to view outputs" 40 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/delta_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf numRows=5 logger=true" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "ExecuteTime": { 16 | "end_time": "2020-03-18T22:38:05.895407Z", 17 | "start_time": "2020-03-18T22:37:48.160Z" 18 | } 19 | }, 20 | "source": [ 21 | "## 2. Ingest A New Incremental CSV File\n", 22 | "### Look at record 12, the `state` is changed in the file" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "{\n", 32 | " \"type\": \"DelimitedExtract\",\n", 33 | " \"name\": \"extract incremental data\",\n", 34 | " \"environments\": [\"dev\", \"test\"],\n", 35 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/update_contacts.csv\",\n", 36 | " \"outputView\": \"delta_raw\", \n", 37 | " \"delimiter\": \"Comma\",\n", 38 | " \"header\": false,\n", 39 | " \"authentication\": {\n", 40 | " \"method\": \"AmazonIAM\"\n", 41 | " }\n", 42 | "}" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## 2.2 Apply Data Type (reused schema file)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "{\n", 59 | " \"type\": \"TypingTransform\",\n", 60 | " \"name\": \"apply table schema 0 to incremental load\",\n", 61 | " \"environments\": [\"dev\", \"test\"],\n", 62 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 63 | " \"inputView\": \"delta_raw\", \n", 64 | " \"outputView\": \"delta_typed\",\n", 65 | " \"authentication\": {\n", 66 | " \"method\": \"AmazonIAM\"\n", 67 | " }\n", 68 | "}" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "ExecuteTime": { 75 | "end_time": "2020-06-07T15:02:50.155313Z", 76 | "start_time": "2020-06-07T15:02:50.125Z" 77 | } 78 | }, 79 | "source": [ 80 | "## 2.3 Data Quality Control (reused sql script)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | 
"outputs": [], 88 | "source": [ 89 | "%sqlvaildate outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=delta_typed\n", 90 | "\n", 91 | "SELECT SUM(error) = 0 AS valid\n", 92 | " ,TO_JSON(\n", 93 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 94 | " ) AS message\n", 95 | "FROM \n", 96 | "(\n", 97 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 98 | " FROM ${inputView}\n", 99 | ") base" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "ExecuteTime": { 106 | "end_time": "2020-05-31T05:01:13.796275Z", 107 | "start_time": "2020-05-31T05:01:13.734Z" 108 | } 109 | }, 110 | "source": [ 111 | "## 2.4 Add Calculated Fields (reused sql script)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "%env \n", 121 | "ETL_CONF_CURRENT_TIMESTAMP=CURRENT_TIMESTAMP()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "%sql outputView=\"update_load\" name=\"add calc field for SCD\" environments=dev,test sqlParams=table_name=delta_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 131 | "\n", 132 | "SELECT id,name,email,state, CAST(${now} AS timestamp) AS valid_from, CAST(null AS timestamp) AS valid_to\n", 133 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 134 | "FROM ${table_name}" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## 2.5 Output Incremental data to Delta Lake\n", 142 | "### Delta Lake is an optimized data lake to support Time Travel, ACID transaction" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "{\n", 152 | " \"type\": \"DeltaLakeLoad\",\n", 153 | " \"name\": \"Initial load to Data Lake\",\n", 154 | " \"environments\": [\"dev\", \"test\"],\n", 155 | " \"inputView\": \"update_load\",\n", 156 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/delta_load/\",\n", 157 | " \"numPartitions\": 2\n", 158 | " \"saveMode\": \"Overwrite\",\n", 159 | " \"authentication\": {\n", 160 | " \"method\": \"AmazonIAM\"\n", 161 | " }\n", 162 | "}" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Arc", 176 | "language": "javascript", 177 | "name": "arc" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": "javascript", 181 | "file_extension": ".json", 182 | "mimetype": "javascript", 183 | "name": "arc", 184 | "nbconvert_exporter": "arcexport", 185 | "version": "3.8.0" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 4 190 | } -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/driver-pod-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | labels: 5 | spark-role: driver 6 | namespace: spark 7 | spec: 8 | serviceAccountName: nativejob 9 | affinity: 10 | nodeAffinity: 11 | requiredDuringSchedulingIgnoredDuringExecution: 12 | nodeSelectorTerms: 13 | - matchExpressions: 14 | - key: lifecycle 15 | operator: In 16 | values: 
17 | - OnDemand 18 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/executor-pod-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | labels: 5 | spark-role: executor 6 | namespace: spark 7 | spec: 8 | serviceAccountName: nativejob 9 | affinity: 10 | nodeAffinity: 11 | requiredDuringSchedulingIgnoredDuringExecution: 12 | nodeSelectorTerms: 13 | - matchExpressions: 14 | - key: lifecycle 15 | operator: In 16 | values: 17 | - Ec2Spot -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/green_taxi_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%env\n", 10 | "SCHEMA=https://\n", 11 | "OUTPUT=s3://\n", 12 | "ETL_CONF_ENV=production" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "{\n", 22 | " \"type\": \"DelimitedExtract\",\n", 23 | " \"name\": \"extract csv data from nyc_tripdata\",\n", 24 | " \"environments\": [\"production\", \"test\"],\n", 25 | " \"inputURI\": \"s3a://nyc-tlc/trip*data/green_tripdata_*.csv\",\n", 26 | " \"outputView\": \"green_tripdata0_raw\", \n", 27 | " \"delimiter\": \"Comma\",\n", 28 | " \"quote\" : \"DoubleQuote\",\n", 29 | " \"header\": true,\n", 30 | " \"persist\": true\n", 31 | "}" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "{\n", 41 | " \"type\": \"TypingTransform\",\n", 42 | " \"name\": \"apply green_tripdata schema 0 data types\",\n", 43 | " \"environments\": [\"production\", \"test\"],\n", 44 | " \"schemaURI\": ${SCHEMA},\n", 45 | " \"inputView\": \"green_tripdata0_raw\", \n", 46 | " \"outputView\": \"green_tripdata0\"\n", 47 | "}" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "%sql name=\"aggregate the result by month and year\" outputView=green_trip_summery environments=production,test persist=true\n", 57 | "\n", 58 | "SELECT \n", 59 | " year(lpep_pickup_datetime) AS trip_year\n", 60 | " ,month(lpep_pickup_datetime) AS trip_month\n", 61 | " ,vendor_id\n", 62 | " ,sum(coalesce(trip_distance,0)) AS total_distance\n", 63 | " ,sum(coalesce(total_amount,0)) AS total_fee\n", 64 | "FROM green_tripdata0\n", 65 | "GROUP BY trip_year, trip_month, vendor_id" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "{\n", 75 | " \"type\": \"ParquetLoad\",\n", 76 | " \"name\": \"write out green_tripdata0 dataset as Parquet\",\n", 77 | " \"environments\": [\"production\", \"test\"],\n", 78 | " \"inputView\": \"green_trip_summery\",\n", 79 | " \"outputURI\": ${OUTPUT},\n", 80 | " \"saveMode\": \"Overwrite\"\n", 81 | "}" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Arc", 95 | "language": "javascript", 96 | "name": "arc" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": "javascript", 100 | "file_extension": ".json", 101 | 
"mimetype": "javascript", 102 | "name": "arc", 103 | "nbconvert_exporter": "arcexport", 104 | "version": "3.12.1" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 4 109 | } 110 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/initial_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf numRows=5 logger=true" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# 1. Initial Table Load" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "{\n", 26 | " \"type\": \"DelimitedExtract\",\n", 27 | " \"name\": \"extract initial table\",\n", 28 | " \"environments\": [\"dev\", \"test\"],\n", 29 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/initial_contacts.csv\",\n", 30 | " \"outputView\": \"initial_raw\", \n", 31 | " \"delimiter\": \"Comma\",\n", 32 | " \"header\": false,\n", 33 | " \"quote\": \"None\",\n", 34 | " \"authentication\": {\n", 35 | " \"method\": \"AmazonIAM\"\n", 36 | " }\n", 37 | "}" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Check Original Data Schema" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "%printschema \n", 54 | "initial_raw" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "ExecuteTime": { 61 | "start_time": "2020-03-03T08:30:30.028Z" 62 | } 63 | }, 64 | "source": [ 65 | "## 1.2 Apply Data Type" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "{\n", 75 | " \"type\": \"TypingTransform\",\n", 76 | " \"name\": \"apply table schema 0\",\n", 77 | " \"environments\": [\"dev\", \"test\"],\n", 78 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 79 | " \"inputView\": \"initial_raw\", \n", 80 | " \"outputView\": \"initial_typed\",\n", 81 | " \"authentication\": {\n", 82 | " \"method\": \"AmazonIAM\"\n", 83 | " }\n", 84 | "}" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Check Typed Data Schema & Stats" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "%printschema \n", 101 | "initial_typed" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## 1.3 Data Quality Control" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "%sqlvaildate outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=initial_typed\n", 118 | "\n", 119 | "SELECT SUM(error) = 0 AS valid\n", 120 | " ,TO_JSON(\n", 121 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 122 | " ) AS message\n", 123 | "FROM \n", 124 | "(\n", 125 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 126 | " FROM ${inputView}\n", 127 | ") base" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 
132 | "metadata": {}, 133 | "source": [ 134 | "## 1.4 Add Calculated Fields for SCD Type 2\n", 135 | "### CURRENT_TIMESTAMP will be passed in automatically, when the ETL job is triggered" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "%env \n", 145 | "ETL_CONF_CURRENT_TIMESTAMP=current_timestamp()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "%sql outputView=\"initial_load\" name=\"add calc field for SCD\" environments=dev,test sqlParams=table_name=initial_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 155 | "\n", 156 | "SELECT id,name,email,state, CAST(${now} AS timestamp) AS valid_from, CAST(null AS timestamp) AS valid_to\n", 157 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 158 | "FROM ${table_name}" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## 1.5 Load to Delta Lake as the initial daily snaptshot table\n", 166 | "### Delta Lake is an optimized data lake to support Time Travel, ACID transaction" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "{\n", 176 | " \"type\": \"DeltaLakeLoad\",\n", 177 | " \"name\": \"Initial load to Data Lake\",\n", 178 | " \"environments\": [\"dev\", \"test\"],\n", 179 | " \"inputView\": \"initial_load\",\n", 180 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/\",\n", 181 | " \"numPartitions\": 2\n", 182 | " \"saveMode\": \"Overwrite\",\n", 183 | " \"authentication\": {\n", 184 | " \"method\": \"AmazonIAM\"\n", 185 | " }\n", 186 | "}" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Arc", 200 | "language": "javascript", 201 | "name": "arc" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": "javascript", 205 | "file_extension": ".json", 206 | "mimetype": "javascript", 207 | "name": "arc", 208 | "nbconvert_exporter": "arcexport", 209 | "version": "3.8.0" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 4 214 | } -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/msk_consumer.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import * 3 | from pyspark.sql.functions import * 4 | import pyspark 5 | import sys 6 | 7 | spark = SparkSession.builder \ 8 | .appName("Spark Structured Streaming from Kafka") \ 9 | .getOrCreate() 10 | 11 | sdfRides = spark \ 12 | .readStream \ 13 | .format("kafka") \ 14 | .option("kafka.bootstrap.servers", sys.argv[1]) \ 15 | .option("subscribe", "taxirides") \ 16 | .option("startingOffsets", "latest") \ 17 | .option("auto.offset.reset", "latest") \ 18 | .load() \ 19 | .selectExpr("decode(CAST(value AS STRING),'utf-8') as value") 20 | 21 | # sdfFares = spark \ 22 | # .readStream \ 23 | # .format("kafka") \ 24 | # .option("kafka.bootstrap.servers", "b-1.emr-eks-msk.wz7wsg.c4.kafka.ap-southeast-2.amazonaws.com:9092") \ 25 | # .option("subscribe", "taxifares") \ 26 | # .option("startingOffsets", "latest") \ 27 | # .load() \ 28 | # 
.selectExpr("decode(CAST(value AS STRING),'utf-8') as value") 29 | 30 | # taxiFaresSchema = StructType([ \ 31 | # StructField("rideId", LongType()), StructField("taxiId", LongType()), \ 32 | # StructField("driverId", LongType()), StructField("startTime", TimestampType()), \ 33 | # StructField("paymentType", StringType()), StructField("tip", FloatType()), \ 34 | # StructField("tolls", FloatType()), StructField("totalFare", FloatType())]) 35 | 36 | taxiRidesSchema = StructType([ \ 37 | StructField("rideId", LongType()), StructField("isStart", StringType()), \ 38 | StructField("endTime", TimestampType()), StructField("startTime", TimestampType()), \ 39 | StructField("startLon", FloatType()), StructField("startLat", FloatType()), \ 40 | StructField("endLon", FloatType()), StructField("endLat", FloatType()), \ 41 | StructField("passengerCnt", ShortType()), StructField("taxiId", LongType()), \ 42 | StructField("driverId", LongType()),StructField("timestamp", TimestampType())]) 43 | 44 | def parse_data_from_kafka_message(sdf, schema): 45 | assert sdf.isStreaming == True, "DataFrame doesn't receive streaming data" 46 | col = split(sdf['value'], ',') #split attributes to nested array in one Column 47 | #now expand col to multiple top-level columns 48 | for idx, field in enumerate(schema): 49 | sdf = sdf.withColumn(field.name, col.getItem(idx).cast(field.dataType)) 50 | if field.name=="timestamp": 51 | sdf = sdf.withColumn(field.name, current_timestamp()) 52 | return sdf.select([field.name for field in schema]) 53 | 54 | sdfRides = parse_data_from_kafka_message(sdfRides, taxiRidesSchema) 55 | # sdfFares = parse_data_from_kafka_message(sdfFares, taxiFaresSchema) 56 | 57 | query = sdfRides.withWatermark("timestamp", "10 seconds") \ 58 | .groupBy("driverId", window("timestamp", "10 seconds", "5 seconds")).count() 59 | 60 | # query.writeStream \ 61 | # .outputMode("append") \ 62 | # .format("console") \ 63 | # .option("checkpointLocation", "s3://testtestmelody/stream/checkpoint/consumer_taxi2") \ 64 | # .option("truncate", False) \ 65 | # .start() \ 66 | # .awaitTermination() 67 | 68 | output=query.select(to_json(struct("*")).alias("value")) \ 69 | .selectExpr("CAST(value AS STRING)") \ 70 | .writeStream \ 71 | .outputMode("append") \ 72 | .format("kafka") \ 73 | .option("kafka.bootstrap.servers", sys.argv[1]) \ 74 | .option("topic", sys.argv[3]) \ 75 | .option("checkpointLocation", sys.argv[2]) \ 76 | .start() 77 | 78 | output.awaitTermination() 79 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/scd2_merge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 3. 
Read initial & incremental tables from Delta Lake" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "{\n", 17 | " \"type\": \"DeltaLakeExtract\",\n", 18 | " \"name\": \"read initial load table\",\n", 19 | " \"description\": \"read initial load table\",\n", 20 | " \"environments\": [\n", 21 | " \"dev\",\n", 22 | " \"test\"\n", 23 | " ],\n", 24 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/\",\n", 25 | " \"outputView\": \"current_snapshot\"\n", 26 | "}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "{\n", 36 | " \"type\": \"DeltaLakeExtract\",\n", 37 | " \"name\": \"read contact Delta Lake table\",\n", 38 | " \"description\": \"read contact table\",\n", 39 | " \"environments\": [\n", 40 | " \"dev\",\n", 41 | " \"test\"\n", 42 | " ],\n", 43 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/delta_load/\",\n", 44 | " \"outputView\": \"delta_data\"\n", 45 | "}" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "ExecuteTime": { 52 | "end_time": "2020-05-31T05:03:33.741024Z", 53 | "start_time": "2020-05-31T05:03:33.247Z" 54 | } 55 | }, 56 | "source": [ 57 | "## 3.2 Prepare Datasets for SCD Type2 Insert\n", 58 | "\n", 59 | "- Generate extra rows for changed records.\n", 60 | "- A 'null' mergeKey means the record will be inserted rather than updating an existing record, per the SCD Type 2 rule" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "%sql outputView=\"staged_update\" name=\"generate extra rows for SCD\" environments=dev,test\n", 70 | "\n", 71 | "SELECT NULL AS mergeKey, new.*\n", 72 | "FROM current_snapshot old\n", 73 | "INNER JOIN delta_data new\n", 74 | "ON old.id = new.id\n", 75 | "WHERE old.iscurrent=true\n", 76 | "AND old.checksum<>new.checksum\n", 77 | "\n", 78 | "UNION\n", 79 | "\n", 80 | "SELECT id AS mergeKey, *\n", 81 | "FROM delta_data" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## 3.3 Implement the Type 2 SCD merge operation" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "%conf logger=true" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "{\n", 107 | " \"type\": \"DeltaLakeMergeLoad\",\n", 108 | " \"name\": \"merge with existing contacts data\",\n", 109 | " \"environments\": [\n", 110 | " \"dev\",\n", 111 | " \"test\"\n", 112 | " ],\n", 113 | " \"inputView\": \"staged_update\",\n", 114 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/\",\n", 115 | " \"condition\": \"source.mergeKey = target.id\",\n", 116 | " \"whenMatchedUpdate\": {\n", 117 | " \"condition\": \"target.iscurrent = true AND source.checksum <> target.checksum\",\n", 118 | " \"values\": {\n", 119 | " \"valid_to\": ${ETL_CONF_CURRENT_TIMESTAMP},\n", 120 | " \"iscurrent\": false\n", 121 | " }\n", 122 | " },\n", 123 | " \"whenNotMatchedByTargetInsert\": {},\n", 124 | " \"numPartitions\": 1\n", 125 | "}" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## 3.4 Create a Delta Lake table in Athena\n", 133 | "### Build up a Glue Data 
Catalog via Athena. This step could be done by a Glue Crawler; however, refreshing partitions and creating/updating the data catalog at the end of each ETL process keeps data lineage control in a single place." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "{\n", 143 | " \"type\": \"JDBCExecute\",\n", 144 | " \"name\": \"Create glue data catalog\",\n", 145 | " \"environments\": [\n", 146 | " \"dev\",\n", 147 | " \"test\"\n", 148 | " ],\n", 149 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/sql/create_table_contact.sql\",\n", 150 | " \"jdbcURL\": \"jdbc:awsathena://AwsRegion=\"${AWS_DEFAULT_REGION}\";S3OutputLocation=s3://\"${ETL_CONF_DATALAKE_LOC}\"/athena-query-result;AwsCredentialsProviderClass=com.amazonaws.auth.WebIdentityTokenCredentialsProvider\",\n", 151 | " \"sqlParams\":{\n", 152 | " \"datalake_loc\": \"'s3://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/_symlink_format_manifest/'\",\n", 153 | " \"table_name\": \"default.contact_snapshot\"\n", 154 | " }\n", 155 | "}" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "# 4. Query Delta Lake (validation steps)\n", 163 | "### To stop the following validation steps from executing in a production ETL job, use a fake environment `uat`\n", 164 | "### The same queries can be run in Athena" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "{\n", 174 | " \"type\": \"DeltaLakeExtract\",\n", 175 | " \"name\": \"read contact Delta Lake table\",\n", 176 | " \"description\": \"read contact table\",\n", 177 | " \"environments\": [\n", 178 | " \"uat\"\n", 179 | " ],\n", 180 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot\",\n", 181 | " \"outputView\": \"contact_snapshot\"\n", 182 | "}" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## Confirm 92 records are expired" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "%sql outputView=\"expired_count\" name=\"expired_count\" environments=uat\n", 199 | "SELECT count(*) FROM contact_snapshot WHERE valid_to is not null" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "%metadata \n", 209 | "contact_snapshot" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "## Confirm we now have 1192 records" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "%sql outputView=\"total_count\" name=\"total_count\" environments=uat\n", 226 | "SELECT count(*) FROM contact_snapshot" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## View one of the changed records" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "%sql outputView=\"validate_type2\" name=\"validate_type2\" environments=uat\n", 243 | "SELECT * FROM contact_snapshot WHERE id=12" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": 
{}, 250 | "outputs": [], 251 | "source": [] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Arc", 257 | "language": "javascript", 258 | "name": "arc" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": "javascript", 262 | "file_extension": ".json", 263 | "mimetype": "javascript", 264 | "name": "arc", 265 | "nbconvert_exporter": "arcexport", 266 | "version": "3.8.0" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 4 271 | } -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/wordcount.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | spark = SparkSession.builder.appName('NYC taxi vendor count').getOrCreate() 4 | df = spark.read.option("header",True).csv(sys.argv[1]) 5 | df.filter(df["vendor_name"].isNotNull()).select("vendor_name").groupBy("vendor_name").count().write.mode("overwrite").parquet(sys.argv[2]) 6 | exit() -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/meta/contact_meta_0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "id", 4 | "description": "contact id", 5 | "trim": true, 6 | "nullable": false, 7 | "primaryKey": true, 8 | "type": "integer" 9 | }, 10 | { 11 | "name": "name", 12 | "description": "contact name", 13 | "trim": true, 14 | "nullable": true, 15 | "primaryKey": false, 16 | "type": "string", 17 | "nullableValues": [ 18 | "", 19 | "null" 20 | ] 21 | }, 22 | { 23 | "name": "email", 24 | "description": "contact email", 25 | "trim": true, 26 | "nullable": true, 27 | "primaryKey": false, 28 | "type": "string", 29 | "nullableValues": [ 30 | "", 31 | "null" 32 | ] 33 | }, 34 | { 35 | "name": "state", 36 | "description": "state in the country of the contact", 37 | "trim": true, 38 | "nullable": true, 39 | "primaryKey": false, 40 | "type": "string", 41 | "nullableValues": [ 42 | "", 43 | "null" 44 | ] 45 | } 46 | ] -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/meta/green_taxi_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "f457e562-5c7a-4215-a754-ab749509f3fb", 4 | "name": "vendor_id", 5 | "description": "A code indicating the TPEP provider that provided the record.", 6 | "trim": true, 7 | "nullable": true, 8 | "type": "integer", 9 | "nullableValues": [ 10 | "", 11 | "null" 12 | ] 13 | }, 14 | { 15 | "id": "d61934ed-e32e-406b-bd18-8d6b7296a8c0", 16 | "name": "lpep_pickup_datetime", 17 | "description": "The date and time when the meter was engaged.", 18 | "trim": true, 19 | "nullable": true, 20 | "type": "timestamp", 21 | "formatters": [ 22 | "uuuu-MM-dd HH:mm:ss" 23 | ], 24 | "timezoneId": "America/New_York", 25 | "nullableValues": [ 26 | "", 27 | "null" 28 | ] 29 | }, 30 | { 31 | "id": "d61934ed-e32e-406b-bd18-8d6b7296a8c0", 32 | "name": "lpep_dropoff_datetime", 33 | "description": "The date and time when the meter was disengaged.", 34 | "trim": true, 35 | "nullable": true, 36 | "type": "timestamp", 37 | "formatters": [ 38 | "uuuu-MM-dd HH:mm:ss" 39 | ], 40 | "timezoneId": "America/New_York", 41 | "nullableValues": [ 42 | "", 43 | "null" 44 | ] 45 | }, 46 | { 47 | "id": "aa315986-9fa9-4aa2-a72e-411196648351", 48 | "name": "store_and_fwd_flag", 49 | "description": "This flag indicates whether the trip 
record was held in vehicle memory before sending to the vendor, aka 'store and forward', because the vehicle did not have a connection to the server.", 50 | "trim": true, 51 | "nullable": true, 52 | "type": "boolean", 53 | "nullableValues": [ 54 | "", 55 | "null" 56 | ], 57 | "trueValues": [ 58 | "Y" 59 | ], 60 | "falseValues": [ 61 | "N" 62 | ] 63 | }, 64 | { 65 | "id": "ce66288c-65c1-45b7-83b4-5de3f38f89b7", 66 | "name": "rate_code_id", 67 | "description": "The final rate code in effect at the end of the trip.", 68 | "trim": true, 69 | "nullable": true, 70 | "type": "integer", 71 | "nullableValues": [ 72 | "", 73 | "null" 74 | ] 75 | }, 76 | { 77 | "id": "2d7b4a53-5203-4273-bd4a-3bbc742539ec", 78 | "name": "pickup_longitude", 79 | "description": "Longitude where the meter was engaged.", 80 | "trim": true, 81 | "nullable": true, 82 | "type": "decimal", 83 | "nullableValues": [ 84 | "0" 85 | ], 86 | "precision": 18, 87 | "scale": 14 88 | }, 89 | { 90 | "id": "a183ecd0-6169-429c-8bc0-0df4f08526e8", 91 | "name": "pickup_latitude", 92 | "description": "Latitude where the meter was engaged.", 93 | "trim": true, 94 | "nullable": true, 95 | "type": "decimal", 96 | "nullableValues": [ 97 | "0" 98 | ], 99 | "precision": 18, 100 | "scale": 14 101 | }, 102 | { 103 | "id": "a3d6135c-202f-4ba6-ab25-93fa6c28bc97", 104 | "name": "dropoff_longitude", 105 | "description": "Longitude where the meter was disengaged.", 106 | "trim": true, 107 | "nullable": true, 108 | "type": "decimal", 109 | "nullableValues": [ 110 | "0" 111 | ], 112 | "precision": 18, 113 | "scale": 14 114 | }, 115 | { 116 | "id": "77160ee6-5040-4444-a731-45902b32911f", 117 | "name": "dropoff_latitude", 118 | "description": "Latitude where the meter was disengaged.", 119 | "trim": true, 120 | "nullable": true, 121 | "type": "decimal", 122 | "nullableValues": [ 123 | "0" 124 | ], 125 | "precision": 18, 126 | "scale": 14 127 | }, 128 | { 129 | "id": "ef1fe668-7850-4ef5-966b-0813d2024c32", 130 | "name": "passenger_count", 131 | "description": "The number of passengers in the vehicle. This is a driver-entered value.", 132 | "trim": true, 133 | "nullable": true, 134 | "type": "integer", 135 | "nullableValues": [ 136 | "", 137 | "null" 138 | ] 139 | }, 140 | { 141 | "id": "77160ee6-5040-4444-a731-45902b32911f", 142 | "name": "trip_distance", 143 | "description": "The elapsed trip distance in miles reported by the taximeter.", 144 | "trim": true, 145 | "nullable": true, 146 | "type": "decimal", 147 | "nullableValues": [ 148 | "0", 149 | "null" 150 | ], 151 | "precision": 18, 152 | "scale": 15 153 | }, 154 | { 155 | "id": "e71597c1-67ae-4176-9ae3-ae4dbe0886b9", 156 | "name": "fare_amount", 157 | "description": "The time-and-distance fare calculated by the meter.", 158 | "trim": true, 159 | "nullable": true, 160 | "type": "decimal", 161 | "nullableValues": [ 162 | "", 163 | "null" 164 | ], 165 | "precision": 10, 166 | "scale": 2 167 | }, 168 | { 169 | "id": "77d91cb6-22e4-4dba-883a-eee0c8690f31", 170 | "name": "extra", 171 | "description": "Miscellaneous extras and surcharges. 
Currently, this only includes the $0.50 and $1 rush hour and overnight charges.", 172 | "trim": true, 173 | "nullable": true, 174 | "type": "decimal", 175 | "nullableValues": [ 176 | "", 177 | "null" 178 | ], 179 | "precision": 10, 180 | "scale": 2 181 | }, 182 | { 183 | "id": "aebe7970-91dc-4155-b9a9-78dbcf836ac8", 184 | "name": "mta_tax", 185 | "description": "$0.50 MTA tax that is automatically triggered based on the metered rate in use.", 186 | "trim": true, 187 | "nullable": true, 188 | "type": "decimal", 189 | "nullableValues": [ 190 | "", 191 | "null" 192 | ], 193 | "precision": 10, 194 | "scale": 2 195 | }, 196 | { 197 | "id": "3630c209-a88c-4dd7-ab43-276234f04252", 198 | "name": "tip_amount", 199 | "description": "Tip amount – This field is automatically populated for credit card tips. Cash tips are not included.", 200 | "trim": true, 201 | "nullable": true, 202 | "type": "decimal", 203 | "nullableValues": [ 204 | "", 205 | "null" 206 | ], 207 | "precision": 10, 208 | "scale": 2 209 | }, 210 | { 211 | "id": "9d10371c-c08c-461a-a1a9-e5cd0c46655c", 212 | "name": "tolls_amount", 213 | "description": "Total amount of all tolls paid in trip.", 214 | "trim": true, 215 | "nullable": true, 216 | "type": "decimal", 217 | "nullableValues": [ 218 | "", 219 | "null" 220 | ], 221 | "precision": 10, 222 | "scale": 2 223 | }, 224 | { 225 | "id": "f59aba58-2a8c-40f9-830b-f1abafe80b7f", 226 | "name": "ehail_fee", 227 | "description": "Fee for allowing passengers to 'e-hail' a New York City taxicab via downloadable smartphone applications.", 228 | "trim": true, 229 | "nullable": true, 230 | "type": "decimal", 231 | "nullableValues": [ 232 | "", 233 | "null" 234 | ], 235 | "precision": 10, 236 | "scale": 2 237 | }, 238 | { 239 | "id": "1414fd4b-32ed-430c-a4b0-a569e7144bbb", 240 | "name": "total_amount", 241 | "description": "The total amount charged to passengers. 
Does not include cash tips.", 242 | "trim": true, 243 | "nullable": true, 244 | "type": "decimal", 245 | "nullableValues": [ 246 | "", 247 | "null" 248 | ], 249 | "precision": 10, 250 | "scale": 2 251 | }, 252 | { 253 | "id": "5b43ec13-dc16-40bd-8af5-4e2f85285e15", 254 | "name": "payment_type", 255 | "description": "A numeric code signifying how the passenger paid for the trip.", 256 | "trim": true, 257 | "nullable": true, 258 | "type": "integer", 259 | "nullableValues": [ 260 | "", 261 | "null" 262 | ] 263 | }, 264 | { 265 | "id": "bccf357f-6671-4168-998a-c991fdcf7fe0", 266 | "name": "trip_type", 267 | "description": "A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver.", 268 | "trim": true, 269 | "nullable": true, 270 | "type": "integer", 271 | "nullableValues": [ 272 | "", 273 | "null" 274 | ] 275 | } 276 | ] -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/sql/add_calc_field_for_scd2.sql: -------------------------------------------------------------------------------- 1 | SELECT id 2 | , name 3 | , email 4 | , state 5 | , ${CURRENT_TIMESTAMP} AS valid_from 6 | , CAST(null AS timestamp) AS valid_to 7 | , 1 AS iscurrent 8 | , md5(concat(name,email,state)) AS checksum 9 | FROM ${table_name} -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/sql/create_table_contact.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE IF NOT EXISTS ${table_name}( 2 | `id` int 3 | ,`name` string 4 | ,`email` string 5 | ,`state` string 6 | ,`valid_from` timestamp 7 | ,`valid_to` timestamp 8 | ,`iscurrent` tinyint 9 | ,`checksum` string 10 | ) 11 | ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 12 | STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' 13 | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' 14 | LOCATION ${datalake_loc} 15 | TBLPROPERTIES ( 16 | 'classification'='parquet', 17 | 'parquet.compress'='SNAPPY' 18 | ) -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/sql/sqlvalidate_errors.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | SUM(error) = 0 AS valid 3 | ,TO_JSON( 4 | NAMED_STRUCT( 5 | 'count', COUNT(error), 6 | 'errors', SUM(error) 7 | ) 8 | ) AS message 9 | FROM ( 10 | SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error 11 | FROM ${inputView} 12 | ) base -------------------------------------------------------------------------------- /spark-on-eks/deployment/build-s3-dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script packages your project into a solution distributable that can be 4 | # used as an input to the solution builder validation pipeline. 5 | # 6 | # Important notes and prereq's: 7 | # 1. The initialize-repo.sh script must have been run in order for this script to 8 | # function properly. 9 | # 2. This script should be run from the repo's root folder. 10 | # 11 | # This script will perform the following tasks: 12 | # 1. Remove any old dist files from previous runs. 13 | # 2. 
Install dependencies for the cdk-solution-helper; responsible for 14 | # converting standard 'cdk synth' output into solution assets. 15 | # 3. Build and synthesize your CDK project. 16 | # 4. Run the cdk-solution-helper on template outputs and organize 17 | # those outputs into the /global-s3-assets folder. 18 | # 5. Organize source code artifacts into the /regional-s3-assets folder. 19 | # 6. Remove any temporary files used for staging. 20 | # 21 | # Parameters: 22 | # - source-bucket-base-name: Name for the S3 bucket location where the template will source the Lambda 23 | # code from. The template will append '-[region_name]' to this bucket name. 24 | # For example: ./build-s3-dist.sh solutions trademarked-solution-name v1.0.0 25 | # The template will then expect the source code to be located in the solutions-[region_name] bucket 26 | # - solution-name: name of the solution for consistency 27 | # - version-code: version of the package 28 | 29 | # Important: CDK global version number 30 | cdk_version=2.105.0 31 | 32 | # Check to see if the required parameters have been provided: 33 | if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then 34 | echo "Please provide the base source bucket name, trademark approved solution name and version where the lambda code will eventually reside." 35 | echo "For example: ./build-s3-dist.sh solutions trademarked-solution-name v1.0.0 template-bucket-name" 36 | exit 1 37 | fi 38 | 39 | # Get reference for all important folders 40 | template_dir="$PWD" 41 | staging_dist_dir="$template_dir/staging" 42 | template_dist_dir="$template_dir/deployment/global-s3-assets" 43 | build_dist_dir="$template_dir/deployment/regional-s3-assets" 44 | source_dir="$template_dir/source" 45 | 46 | echo "------------------------------------------------------------------------------" 47 | echo "[Init] Remove any old dist files from previous runs" 48 | echo "------------------------------------------------------------------------------" 49 | 50 | echo "rm -rf $template_dist_dir" 51 | rm -rf $template_dist_dir 52 | echo "mkdir -p $template_dist_dir" 53 | mkdir -p $template_dist_dir 54 | echo "rm -rf $build_dist_dir" 55 | rm -rf $build_dist_dir 56 | echo "mkdir -p $build_dist_dir" 57 | mkdir -p $build_dist_dir 58 | echo "rm -rf $staging_dist_dir" 59 | rm -rf $staging_dist_dir 60 | echo "mkdir -p $staging_dist_dir" 61 | mkdir -p $staging_dist_dir 62 | 63 | echo "------------------------------------------------------------------------------" 64 | echo "[Init] Install dependencies for the cdk-solution-helper" 65 | echo "------------------------------------------------------------------------------" 66 | 67 | echo "cd $template_dir/deployment/cdk-solution-helper" 68 | cd $template_dir/deployment/cdk-solution-helper 69 | echo "npm install" 70 | # npm audit fix --force 71 | npm install 72 | 73 | cd $template_dir 74 | echo "pip3 install -q $source_dir" 75 | python3 -m venv .env 76 | source .env/bin/activate 77 | pip3 install --upgrade pip -q $source_dir 78 | echo "cd $source_dir" 79 | cd $source_dir 80 | 81 | echo "------------------------------------------------------------------------------" 82 | echo "[Synth] CDK Project" 83 | echo "------------------------------------------------------------------------------" 84 | 85 | # Install the aws-cdk package locally (the local binary is used for synth) 86 | echo "npm install aws-cdk@$cdk_version" 87 | # npm audit fix --force 88 | npm install aws-cdk@$cdk_version 89 | 90 | # Run 'cdk synth' to generate raw solution outputs 91 | echo "cdk synth --output=$staging_dist_dir" 92 | node_modules/aws-cdk/bin/cdk 
synth --output=$staging_dist_dir 93 | 94 | # Remove unnecessary output files 95 | echo "cd $staging_dist_dir" 96 | cd $staging_dist_dir 97 | echo "rm tree.json manifest.json cdk.out" 98 | rm tree.json manifest.json cdk.out 99 | 100 | echo "------------------------------------------------------------------------------" 101 | echo "[Packing] Template artifacts" 102 | echo "------------------------------------------------------------------------------" 103 | 104 | # Move outputs from staging to template_dist_dir 105 | echo "Move outputs from staging to template_dist_dir" 106 | mv $staging_dist_dir/*.json $template_dist_dir/ 107 | 108 | # Rename all *.template.json files to *.template 109 | echo "Rename all *.template.json to *.template" 110 | echo "copy templates and rename" 111 | for f in $template_dist_dir/*.template.json; do 112 | mv -- "$f" "${f%.template.json}.template" 113 | done 114 | 115 | # Run the helper to clean-up the templates and remove unnecessary CDK elements 116 | echo "Run the helper to clean-up the templates and remove unnecessary CDK elements" 117 | echo "node $template_dir/deployment/cdk-solution-helper/index" 118 | node $template_dir/deployment/cdk-solution-helper/index 119 | if [ "$?" = "1" ]; then 120 | echo "(cdk-solution-helper) ERROR: there is likely output above." 1>&2 121 | exit 1 122 | fi 123 | 124 | # Find and replace bucket_name, solution_name, and version 125 | echo "Find and replace bucket_name, solution_name, and version" 126 | cd $template_dist_dir 127 | echo "Updating code source bucket in template with $1" 128 | replace="s/%%BUCKET_NAME%%/$1/g" 129 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 130 | sed -i '' -e $replace $template_dist_dir/*.template 131 | replace="s/%%SOLUTION_NAME%%/$2/g" 132 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 133 | sed -i '' -e $replace $template_dist_dir/*.template 134 | replace="s/%%VERSION%%/$3/g" 135 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 136 | sed -i '' -e $replace $template_dist_dir/*.template 137 | 138 | # Generate CFN template and zip code assets in a user's single bucket 139 | if [ -z "$4" ]; then 140 | replace="s/%%TEMPLATE_OUTPUT_BUCKET%%/$1"-"$AWS_REGION/g" 141 | echo "User's template bucket is: $replace" 142 | else 143 | replace="s/%%TEMPLATE_OUTPUT_BUCKET%%/$4/g" 144 | fi 145 | 146 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 147 | sed -i '' -e $replace $template_dist_dir/*.template 148 | 149 | rm $template_dist_dir/*.json 150 | 151 | echo "------------------------------------------------------------------------------" 152 | echo "[Packing] Source code artifacts" 153 | echo "------------------------------------------------------------------------------" 154 | 155 | # General cleanup of node_modules and package-lock.json files 156 | echo "find $staging_dist_dir -iname "node_modules" -type d -exec rm -rf "{}" \; 2> /dev/null" 157 | find $staging_dist_dir -iname "node_modules" -type d -exec rm -rf "{}" \; 2> /dev/null 158 | echo "find $staging_dist_dir -iname "package-lock.json" -type f -exec rm -f "{}" \; 2> /dev/null" 159 | find $staging_dist_dir -iname "package-lock.json" -type f -exec rm -f "{}" \; 2> /dev/null 160 | 161 | # ... For each asset.* source code artifact in the temporary /staging folder... 162 | cd $staging_dist_dir 163 | for d in `find . 
-mindepth 1 -maxdepth 1 -type d`; do 164 | 165 | # Rename the artifact, removing the period for handler compatibility 166 | pfname="$(basename -- $d)" 167 | fname="$(echo $pfname | sed -e 's/\.//g')" 168 | echo "zip -r $fname.zip $fname" 169 | mv $d $fname 170 | cd $staging_dist_dir/$fname 171 | 172 | # Build the artifacts 173 | if ls *.py 1>/dev/null 2>&1; then 174 | echo "====================================" 175 | echo "This is Python runtime" 176 | echo "====================================" 177 | venv_folder=".venv-prod" 178 | rm -fr .venv-test 179 | rm -fr .venv-prod 180 | echo "Initiating virtual environment" 181 | python3 -m venv $venv_folder 182 | source $venv_folder/bin/activate 183 | pip3 install --upgrade pip -q $source_dir --target $venv_folder/lib/python3.*/site-packages 184 | echo "package python artifact" 185 | cd $venv_folder/lib/python3.*/site-packages 186 | zip -qr9 $staging_dist_dir/$fname.zip . -x "aws_cdk/*" 187 | echo "zip -r $staging_dist_dir/$fname" 188 | cd $staging_dist_dir/$fname 189 | rm -rf $venv_folder 190 | zip -grq $staging_dist_dir/$fname.zip . 191 | elif ls *.js 1>/dev/null 2>&1; then 192 | echo "====================================" 193 | echo "This is Node runtime" 194 | echo "====================================" 195 | echo "Clean and rebuild artifacts" 196 | # npm audit fix --force 197 | echo "copy package.json and package-lock.json files" 198 | cp -rf $template_dir/deployment/cdk-solution-helper/*.json . 199 | npm run 200 | npm ci 201 | if [ "$?" = "1" ]; then 202 | echo "ERROR: Seems like package-lock.json does not exist or is out of sync with package.json. Trying npm install instead" 1>&2 203 | npm install 204 | fi 205 | # Zip the artifact 206 | echo "zip -r $staging_dist_dir/$fname" 207 | zip -qr9 $staging_dist_dir/$fname.zip . 208 | else 209 | # Zip the artifact 210 | echo "zip -r $staging_dist_dir/$fname" 211 | zip -rq $staging_dist_dir/$fname.zip . 212 | fi 213 | 214 | cd $staging_dist_dir 215 | # Copy the zipped artifact from /staging to /regional-s3-assets 216 | echo "cp $fname.zip $build_dist_dir" 217 | mv $fname.zip $build_dist_dir 218 | 219 | # Remove the old, unzipped artifact from /staging 220 | echo "rm -rf $fname" 221 | rm -rf $fname 222 | 223 | # ... repeat until all source code artifacts are zipped and placed in the 224 | # ... /regional-s3-assets folder 225 | 226 | done 227 | 228 | echo "------------------------------------------------------------------------------" 229 | echo "[Move] the zip files from staging to regional-s3-assets folder" 230 | echo "------------------------------------------------------------------------------" 231 | for d in `find . 
-mindepth 1 -maxdepth 1`; do 232 | pfname="$(basename -- $d)" 233 | fname="$(echo $pfname | sed -e 's/asset./asset/g')" 234 | mv $d $build_dist_dir/$fname 235 | done 236 | 237 | echo "------------------------------------------------------------------------------" 238 | echo "[Cleanup] Remove temporary files" 239 | echo "------------------------------------------------------------------------------" 240 | 241 | # Delete the temporary /staging folder 242 | echo "rm -rf $staging_dist_dir" 243 | rm -rf $staging_dist_dir 244 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/cdk-solution-helper/README.md: -------------------------------------------------------------------------------- 1 | # cdk-solution-helper 2 | 3 | A lightweight helper function that cleans-up synthesized templates from the AWS Cloud Development Kit (CDK) and prepares 4 | them for use with the AWS Solutions publishing pipeline. This function performs the following tasks: 5 | 6 | #### Lambda function preparation 7 | 8 | Replaces the AssetParameter-style properties that identify source code for Lambda functions with the common variables 9 | used by the AWS Solutions publishing pipeline. 10 | 11 | - `Code.S3Bucket` is assigned the `%%BUCKET_NAME%%` placeholder value. 12 | - `Code.S3Key` is assigned the `%%SOLUTION_NAME%%`/`%%VERSION%%` placeholder value. 13 | - `Handler` is given a prefix identical to the artifact hash, enabling the Lambda function to properly find the handler in the extracted source code package. 14 | 15 | These placeholders are then replaced with the appropriate values using the default find/replace operation run by the pipeline. 16 | 17 | Before: 18 | ``` 19 | "examplefunction67F55935": { 20 | "Type": "AWS::Lambda::Function", 21 | "Properties": { 22 | "Code": { 23 | "S3Bucket": { 24 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3Bucket54E71A95" 25 | }, 26 | "S3Key": { 27 | "Fn::Join": [ 28 | "", 29 | [ 30 | { 31 | "Fn::Select": [ 32 | 0, 33 | { 34 | "Fn::Split": [ 35 | "||", 36 | { 37 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1" 38 | } 39 | ] 40 | } 41 | ] 42 | }, 43 | { 44 | "Fn::Select": [ 45 | 1, 46 | { 47 | "Fn::Split": [ 48 | "||", 49 | { 50 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1" 51 | } 52 | ] 53 | } 54 | ] 55 | } 56 | ] 57 | ] 58 | } 59 | }, ... 60 | Handler: "index.handler", ... 61 | ``` 62 | 63 | After helper function run: 64 | ``` 65 | "examplefunction67F55935": { 66 | "Type": "AWS::Lambda::Function", 67 | "Properties": { 68 | "Code": { 69 | "S3Bucket": "%%BUCKET_NAME%%", 70 | "S3Key": "%%SOLUTION_NAME%%/%%VERSION%%/assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 71 | }, ... 72 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 73 | ``` 74 | 75 | After build script run: 76 | ``` 77 | "examplefunction67F55935": { 78 | "Type": "AWS::Lambda::Function", 79 | "Properties": { 80 | "Code": { 81 | "S3Bucket": "solutions", 82 | "S3Key": "trademarked-solution-name/v1.0.0/asset.d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 83 | }, ... 
84 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 85 | ``` 86 | 87 | After CloudFormation deployment: 88 | ``` 89 | "examplefunction67F55935": { 90 | "Type": "AWS::Lambda::Function", 91 | "Properties": { 92 | "Code": { 93 | "S3Bucket": "solutions-us-east-1", 94 | "S3Key": "trademarked-solution-name/v1.0.0/asset.d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 95 | }, ... 96 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 97 | ``` 98 | 99 | #### Template cleanup 100 | 101 | Cleans-up the parameters section and improves readability by removing the AssetParameter-style fields that would have 102 | been used to specify Lambda source code properties. This allows solution-specific parameters to be highlighted and 103 | removes unnecessary clutter. 104 | 105 | Before: 106 | ``` 107 | "Parameters": { 108 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3Bucket54E71A95": { 109 | "Type": "String", 110 | "Description": "S3 bucket for asset \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 111 | }, 112 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1": { 113 | "Type": "String", 114 | "Description": "S3 key for asset version \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 115 | }, 116 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7ArtifactHash7AA751FE": { 117 | "Type": "String", 118 | "Description": "Artifact hash for asset \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 119 | }, 120 | "CorsEnabled" : { 121 | "Description" : "Would you like to enable Cross-Origin Resource Sharing (CORS) for the image handler API? Select 'Yes' if so.", 122 | "Default" : "No", 123 | "Type" : "String", 124 | "AllowedValues" : [ "Yes", "No" ] 125 | }, 126 | "CorsOrigin" : { 127 | "Description" : "If you selected 'Yes' above, please specify an origin value here. A wildcard (*) value will support any origin.", 128 | "Default" : "*", 129 | "Type" : "String" 130 | } 131 | } 132 | ``` 133 | 134 | After: 135 | ``` 136 | "Parameters": { 137 | "CorsEnabled" : { 138 | "Description" : "Would you like to enable Cross-Origin Resource Sharing (CORS) for the image handler API? Select 'Yes' if so.", 139 | "Default" : "No", 140 | "Type" : "String", 141 | "AllowedValues" : [ "Yes", "No" ] 142 | }, 143 | "CorsOrigin" : { 144 | "Description" : "If you selected 'Yes' above, please specify an origin value here. A wildcard (*) value will support any origin.", 145 | "Default" : "*", 146 | "Type" : "String" 147 | } 148 | } 149 | ``` 150 | 151 | *** 152 | © Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. -------------------------------------------------------------------------------- /spark-on-eks/deployment/cdk-solution-helper/index.js: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | // Imports 4 | const fs = require('fs'); 5 | 6 | // Paths 7 | var currentPath = process.cwd(); 8 | const global_s3_assets = currentPath+'/../deployment/global-s3-assets'; 9 | const solution_name='SparkOnEKS'; 10 | 11 | function setParameter(template) { 12 | const parameters = (template.Parameters) ? 
template.Parameters : {}; 13 | const assetParameters = Object.keys(parameters).filter(function(key) { 14 | return key.includes('BootstrapVersion'); 15 | }); 16 | assetParameters.forEach(function(a) { 17 | template.Parameters[a] = undefined; 18 | }); 19 | const rules = (template.Rules) ? template.Rules : {}; 20 | const rule = Object.keys(rules).filter(function(key) { 21 | return key.includes('CheckBootstrapVersion'); 22 | }); 23 | rule.forEach(function(a) { 24 | template.Rules[a] = undefined; 25 | }) 26 | } 27 | function assetRef(s3BucketRef) { 28 | // Get S3 bucket key references from assets file 29 | const raw_meta = fs.readFileSync(`${global_s3_assets}/${solution_name}.assets.json`); 30 | let template = JSON.parse(raw_meta); 31 | const metadata = (template.files[s3BucketRef]) ? template.files[s3BucketRef] : {}; 32 | var assetPath = metadata.source.path.replace('.json',''); 33 | return assetPath; 34 | } 35 | 36 | // For each template in global_s3_assets ... 37 | fs.readdirSync(global_s3_assets).forEach(file => { 38 | if ( file != `${solution_name}.assets.json`) { 39 | // Import and parse template file 40 | const raw_template = fs.readFileSync(`${global_s3_assets}/${file}`); 41 | let template = JSON.parse(raw_template); 42 | 43 | //1. Clean-up parameters section 44 | setParameter(template); 45 | // setOutput(template); 46 | 47 | const resources = (template.Resources) ? template.Resources : {}; 48 | //3. Clean-up Account ID and region to enable cross account deployment 49 | const rsrctype=["AWS::Lambda::Function","AWS::Lambda::LayerVersion","Custom::CDKBucketDeployment", "AWS::CloudFormation::Stack","AWS::CloudFront::Distribution"] 50 | const focusTemplate = Object.keys(resources).filter(function(key) { 51 | return (resources[key].Type.indexOf(rsrctype) < 0) 52 | }); 53 | focusTemplate.forEach(function(f) { 54 | const fn = template.Resources[f]; 55 | if (fn.Properties.hasOwnProperty('Code') && fn.Properties.Code.hasOwnProperty('S3Bucket')) { 56 | // Set Lambda::Function S3 reference to regional folder 57 | if (! 
String(fn.Properties.Code.S3Bucket.Ref).startsWith('appcode')){ 58 | fn.Properties.Code.S3Key = `%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.Code.S3Key; 59 | fn.Properties.Code.S3Bucket = {'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}; 60 | } 61 | // Set the handler 62 | // const handler = fn.Properties.Handler; 63 | // fn.Properties.Handler = `${handler}`; 64 | } 65 | else if (fn.Properties.hasOwnProperty('Content') && fn.Properties.Content.hasOwnProperty('S3Bucket')) { 66 | // Set Lambda::LayerVersion S3 bucket reference 67 | fn.Properties.Content.S3Key = `%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.Content.S3Key; 68 | fn.Properties.Content.S3Bucket = {'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}; 69 | } 70 | else if (fn.Properties.hasOwnProperty('SourceBucketNames')) { 71 | // Set CDKBucketDeployment S3 bucket reference 72 | fn.Properties.SourceObjectKeys = [`%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.SourceObjectKeys[0]]; 73 | fn.Properties.SourceBucketNames = [{'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}]; 74 | } 75 | else if (fn.Properties.hasOwnProperty('PolicyName') && fn.Properties.PolicyName.includes('CustomCDKBucketDeployment')) { 76 | // Set CDKBucketDeployment S3 bucket Policy reference 77 | fn.Properties.PolicyDocument.Statement.forEach(function(sub,i) { 78 | if (typeof(sub.Resource[i]) === 'object') { 79 | sub.Resource.forEach(function(resource){ 80 | var arrayKey = Object.keys(resource); 81 | if (typeof(resource[arrayKey][1]) === 'object') { 82 | resource[arrayKey][1].filter(function(s){ 83 | if (s.hasOwnProperty('Ref')) { 84 | fn.Properties.PolicyDocument.Statement[i].Resource = [ 85 | {"Fn::Join": ["",["arn:",{"Ref": "AWS::Partition"},":s3:::%%BUCKET_NAME%%-",{"Ref": "AWS::Region"}]]}, 86 | {"Fn::Join": ["",["arn:",{"Ref": "AWS::Partition"},":s3:::%%BUCKET_NAME%%-",{"Ref": "AWS::Region"},"/*"]]} 87 | ] 88 | } 89 | }); 90 | } 91 | }) 92 | } 93 | }); 94 | } 95 | // Set NestedStack S3 bucket reference 96 | else if (fn.Properties.hasOwnProperty('TemplateURL')) { 97 | var key=fn.Properties.TemplateURL['Fn::Join'][1][6].replace('.json','').replace('/',''); 98 | var assetPath = assetRef(key); 99 | fn.Properties.TemplateURL = { 100 | "Fn::Join": [ 101 | "", 102 | [ 103 | "https://s3.", 104 | { 105 | "Ref": "AWS::URLSuffix" 106 | }, 107 | "/" 108 | ,`%%TEMPLATE_OUTPUT_BUCKET%%/%%SOLUTION_NAME%%/%%VERSION%%/${assetPath}` 109 | ]] 110 | }; 111 | } 112 | // Set CloudFront logging bucket 113 | else if (fn.Properties.hasOwnProperty('DistributionConfig')){ 114 | fn.Properties.DistributionConfig.Logging.Bucket= { 115 | "Fn::Join": ["",[fn.Properties.DistributionConfig.Logging.Bucket['Fn::Join'][1][0], 116 | ".s3.",{"Ref": "AWS::Region"},".",{"Ref": "AWS::URLSuffix"}]] 117 | } 118 | } 119 | }); 120 | 121 | //6. 
Output modified template file 122 | const output_template = JSON.stringify(template, null, 2); 123 | fs.writeFileSync(`${global_s3_assets}/${file}`, output_template); 124 | } 125 | }); -------------------------------------------------------------------------------- /spark-on-eks/deployment/cdk-solution-helper/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cdk-solution-helper", 3 | "version": "0.1.0", 4 | "devDependencies": { 5 | "fs": "0.0.1-security" 6 | }, 7 | "dependencies": { 8 | "fs": "0.0.1-security" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/delete_all.sh: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | #!/bin/bash 4 | 5 | export stack_name="${1:-SparkOnEKS}" 6 | export region="${2:-us-east-1}" 7 | 8 | echo "=================================================================================================" 9 | echo " Make sure your CloudFormation stack name $stack_name is correct and exists in region: $region " 10 | echo " If you use a different name, rerun the script with the parameters:" 11 | echo " ./deployment/delete_all.sh " 12 | echo "=================================================================================================" 13 | 14 | code_bucket=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region --query "Stacks[0].Outputs[?OutputKey=='CODEBUCKET'].OutputValue" --output text) 15 | if ! [ -z "$code_bucket" ] 16 | then 17 | if ! [ -z $(aws s3api list-buckets --region $region --query 'Buckets[?Name==`'$code_bucket'`].Name' --output text) ]; then 18 | echo "Delete logs from S3" 19 | aws s3 rm s3://${code_bucket}/vpcRejectlog/ 20 | echo "Delete athena query result from S3" 21 | aws s3 rm s3://${code_bucket}/athena-query-result/ 22 | fi 23 | fi 24 | 25 | # delete glue tables 26 | tbl1=$(aws glue get-tables --region $region --database-name 'default' --query 'TableList[?starts_with(Name,`contact_snapshot`)==`true`]'.Name --output text) 27 | tbl2=$(aws glue get-tables --region $region --database-name 'default' --query 'TableList[?starts_with(Name,`deltalake_contact_jhub`)==`true`]'.Name --output text) 28 | if ! [ -z "$tbl1" ] 29 | then 30 | echo "Drop a Delta Lake table default.contact_snapshot" 31 | aws athena start-query-execution --region $region --query-string "DROP TABLE default.contact_snapshot" --result-configuration OutputLocation=s3://$code_bucket/athena-query-result 32 | fi 33 | if ! [ -z "$tbl2" ] 34 | then 35 | echo "Drop a Delta Lake table default.deltalake_contact_jhub" 36 | aws athena start-query-execution --region $region --query-string "DROP TABLE default.deltalake_contact_jhub" --result-configuration OutputLocation=s3://$code_bucket/athena-query-result 37 | fi 38 | 39 | argoALB=$(aws elbv2 describe-load-balancers --region $region --query 'LoadBalancers[?starts_with(DNSName,`k8s-argo`)==`true`].LoadBalancerArn' --output text) 40 | jhubALB=$(aws elbv2 describe-load-balancers --region $region --query 'LoadBalancers[?starts_with(DNSName,`k8s-jupyter`)==`true`].LoadBalancerArn' --output text) 41 | if ! [ -z "$argoALB" ] 42 | then 43 | echo "Delete Argo ALB" 44 | aws elbv2 delete-load-balancer --load-balancer-arn $argoALB --region $region 45 | sleep 5 46 | fi 47 | if ! 
[ -z "$jhubALB" ] 48 | then 49 | echo "Delete Jupyter ALB" 50 | aws elbv2 delete-load-balancer --load-balancer-arn $jhubALB --region $region 51 | sleep 5 52 | fi 53 | 54 | argoTG=$(aws elbv2 describe-target-groups --region $region --query 'TargetGroups[?starts_with(TargetGroupName,`k8s-argo`)==`true`].TargetGroupArn' --output text) 55 | jhubTG=$(aws elbv2 describe-target-groups --region $region --query 'TargetGroups[?starts_with(TargetGroupName,`k8s-jupyter`)==`true`].TargetGroupArn' --output text) 56 | if ! [ -z "$argoTG" ] 57 | then 58 | sleep 5 59 | echo "Delete Argo Target groups" 60 | aws elbv2 delete-target-group --target-group-arn $argoTG --region $region 61 | fi 62 | if ! [ -z "$jhubTG" ] 63 | then 64 | sleep 5 65 | echo "Delete Jupyter Target groups" 66 | aws elbv2 delete-target-group --target-group-arn $jhubTG --region $region 67 | fi 68 | 69 | # delete the rest from CF 70 | echo "Delete the rest of resources by CloudFormation delete command" 71 | aws cloudformation delete-stack --stack-name $stack_name --region $region -------------------------------------------------------------------------------- /spark-on-eks/deployment/post-deployment.sh: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | #!/bin/bash 4 | 5 | #!/bin/bash 6 | 7 | export stack_name="${1:-SparkOnEKS}" 8 | export region="${2:-us-east-1}" 9 | echo "=================================================================================================" 10 | echo " Make sure your CloudFormation stack name $stack_name is correct and exists in region: $region " 11 | echo " If you use a different name, rerun the script with the parameters:" 12 | echo " ./deployment/post-deployment.sh " 13 | echo "=================================================================================================" 14 | 15 | # 1. install k8s command tools 16 | echo "================================================================================" 17 | echo " Installing kubectl tool on Linux ..." 18 | echo " For other operating system, install the kubectl > 1.27 here:" 19 | echo " https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html" 20 | echo "================================================================================" 21 | curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" 22 | chmod +x kubectl 23 | sudo mkdir -p /usr/local/bin && sudo mv kubectl /usr/local/bin/kubectl && export PATH=$PATH:/usr/local/bin/ 24 | echo "Installed kubectl version: " 25 | kubectl version --client 26 | 27 | echo "================================================================================================" 28 | echo " Installing argoCLI tool on Linux ..." 29 | echo " Check out https://github.com/argoproj/argo-workflows/releases for other OS type installation." 30 | echo "================================================================================================" 31 | VERSION=v3.5.4 32 | sudo curl -sLO https://github.com/argoproj/argo-workflows/releases/download/${VERSION}/argo-linux-amd64.gz && gunzip argo-linux-amd64.gz 33 | chmod +x argo-linux-amd64 && sudo mv ./argo-linux-amd64 /usr/local/bin/argo 34 | argo version --short 35 | 36 | # 2. 
connect to the EKS newly created 37 | echo `aws cloudformation describe-stacks --stack-name $stack_name --region $region --query "Stacks[0].Outputs[?starts_with(OutputKey,'eksclusterEKSConfig')].OutputValue" --output text` | bash 38 | echo "Testing EKS connection..." 39 | kubectl get svc 40 | 41 | # 3. get Jupyter Hub login 42 | LOGIN_URI=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region \ 43 | --query "Stacks[0].Outputs[?OutputKey=='JUPYTERURL'].OutputValue" --output text) 44 | SEC_ID=$(aws secretsmanager list-secrets --region $region --query "SecretList[?not_null(Tags[?Value=='$stack_name'])].Name" --output text) 45 | LOGIN=$(aws secretsmanager get-secret-value --region $region --secret-id $SEC_ID --query SecretString --output text) 46 | echo -e "\n=============================== JupyterHub Login ==============================================" 47 | echo -e "\nJUPYTER_URL: $LOGIN_URI" 48 | echo "LOGIN: $LOGIN" 49 | echo "================================================================================================" -------------------------------------------------------------------------------- /spark-on-eks/images/00-deploy-to-aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/00-deploy-to-aws.png -------------------------------------------------------------------------------- /spark-on-eks/images/3-argo-job-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/3-argo-job-dependency.png -------------------------------------------------------------------------------- /spark-on-eks/images/3-argo-log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/3-argo-log.png -------------------------------------------------------------------------------- /spark-on-eks/images/3-argo-sidemenu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/3-argo-sidemenu.png -------------------------------------------------------------------------------- /spark-on-eks/images/4-auto-scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/4-auto-scaling.png -------------------------------------------------------------------------------- /spark-on-eks/images/4-k8s-retry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/4-k8s-retry.png -------------------------------------------------------------------------------- /spark-on-eks/images/4-spot-console.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/4-spot-console.png -------------------------------------------------------------------------------- /spark-on-eks/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/architecture.png -------------------------------------------------------------------------------- /spark-on-eks/images/driver_interruption_test.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/driver_interruption_test.gif -------------------------------------------------------------------------------- /spark-on-eks/images/executor_interruption_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/executor_interruption_test.png -------------------------------------------------------------------------------- /spark-on-eks/images/fake_data.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/fake_data.gif -------------------------------------------------------------------------------- /spark-on-eks/images/run_jupyter.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/run_jupyter.gif -------------------------------------------------------------------------------- /spark-on-eks/images/submit_job_in_argo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/submit_job_in_argo.gif -------------------------------------------------------------------------------- /spark-on-eks/images/submit_native_spark.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/submit_native_spark.gif -------------------------------------------------------------------------------- /spark-on-eks/images/two_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/two_architecture.png -------------------------------------------------------------------------------- /spark-on-eks/source/app.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | #!/usr/bin/env python3 5 | from aws_cdk import (App,Tags,CfnOutput) 6 | from lib.spark_on_eks_stack import SparkOnEksStack 7 | from lib.cloud_front_stack import NestedStack 8 | 9 | app = App() 10 | eks_name = app.node.try_get_context('cluster_name') 11 | eks_stack = SparkOnEksStack(app, 'SparkOnEKS', eks_name) 12 | # The CloudFront offers a default domain name to enable HTTPS. 13 | # Recommend to issue a TLS certificate with your own domain, delete the CF nested stack 14 | cf_nested_stack = NestedStack(eks_stack,'CreateCloudFront', eks_stack.code_bucket, eks_stack.argo_url, eks_stack.jhub_url) 15 | 16 | Tags.of(eks_stack).add('project', 'sqlbasedetl') 17 | Tags.of(cf_nested_stack).add('project', 'sqlbasedetl') 18 | 19 | # Deployment Output 20 | CfnOutput(eks_stack,'CODE_BUCKET', value=eks_stack.code_bucket) 21 | CfnOutput(eks_stack,'ARGO_URL', value='https://'+ cf_nested_stack.argo_cf) 22 | CfnOutput(eks_stack,'JUPYTER_URL', value='http://'+ cf_nested_stack.jhub_cf) 23 | 24 | app.synth() 25 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/alb-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - iam:CreateServiceLinkedRole 4 | Resource: "*" 5 | Condition: 6 | StringEquals: 7 | iam:AWSServiceName: elasticloadbalancing.amazonaws.com 8 | - Effect: Allow 9 | Action: 10 | - ec2:DescribeAccountAttributes 11 | - ec2:DescribeAddresses 12 | - ec2:DescribeAvailabilityZones 13 | - ec2:DescribeInternetGateways 14 | - ec2:DescribeVpcs 15 | - ec2:DescribeVpcPeeringConnections 16 | - ec2:DescribeSubnets 17 | - ec2:DescribeSecurityGroups 18 | - ec2:DescribeInstances 19 | - ec2:DescribeNetworkInterfaces 20 | - ec2:DescribeTags 21 | - ec2:GetCoipPoolUsage 22 | - ec2:DescribeCoipPools 23 | - elasticloadbalancing:DescribeLoadBalancers 24 | - elasticloadbalancing:DescribeLoadBalancerAttributes 25 | - elasticloadbalancing:DescribeListeners 26 | - elasticloadbalancing:DescribeListenerCertificates 27 | - elasticloadbalancing:DescribeSSLPolicies 28 | - elasticloadbalancing:DescribeRules 29 | - elasticloadbalancing:DescribeTargetGroups 30 | - elasticloadbalancing:DescribeTargetGroupAttributes 31 | - elasticloadbalancing:DescribeTargetHealth 32 | - elasticloadbalancing:DescribeTags 33 | Resource: "*" 34 | - Effect: Allow 35 | Action: 36 | - cognito-idp:DescribeUserPoolClient 37 | - acm:ListCertificates 38 | - acm:DescribeCertificate 39 | - iam:ListServerCertificates 40 | - iam:GetServerCertificate 41 | - waf-regional:GetWebACL 42 | - waf-regional:GetWebACLForResource 43 | - waf-regional:AssociateWebACL 44 | - waf-regional:DisassociateWebACL 45 | - wafv2:GetWebACL 46 | - wafv2:GetWebACLForResource 47 | - wafv2:AssociateWebACL 48 | - wafv2:DisassociateWebACL 49 | - shield:GetSubscriptionState 50 | - shield:DescribeProtection 51 | - shield:CreateProtection 52 | - shield:DeleteProtection 53 | Resource: "*" 54 | - Effect: Allow 55 | Action: 56 | - ec2:AuthorizeSecurityGroupIngress 57 | - ec2:RevokeSecurityGroupIngress 58 | Resource: "*" 59 | - Effect: Allow 60 | Action: 61 | - ec2:CreateSecurityGroup 62 | Resource: "*" 63 | - Effect: Allow 64 | Action: 65 | - ec2:CreateTags 66 | Resource: arn:aws:ec2:*:*:security-group/* 67 | Condition: 68 | StringEquals: 69 | ec2:CreateAction: CreateSecurityGroup 70 | 'Null': 71 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 72 | - Effect: Allow 73 | Action: 74 | - ec2:CreateTags 75 | - 
ec2:DeleteTags 76 | Resource: arn:aws:ec2:*:*:security-group/* 77 | Condition: 78 | 'Null': 79 | aws:RequestTag/elbv2.k8s.aws/cluster: 'true' 80 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 81 | - Effect: Allow 82 | Action: 83 | - ec2:AuthorizeSecurityGroupIngress 84 | - ec2:RevokeSecurityGroupIngress 85 | - ec2:DeleteSecurityGroup 86 | Resource: "*" 87 | Condition: 88 | 'Null': 89 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 90 | - Effect: Allow 91 | Action: 92 | - elasticloadbalancing:CreateLoadBalancer 93 | - elasticloadbalancing:CreateTargetGroup 94 | Resource: "*" 95 | Condition: 96 | 'Null': 97 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 98 | - Effect: Allow 99 | Action: 100 | - elasticloadbalancing:CreateListener 101 | - elasticloadbalancing:DeleteListener 102 | - elasticloadbalancing:CreateRule 103 | - elasticloadbalancing:DeleteRule 104 | Resource: "*" 105 | - Effect: Allow 106 | Action: 107 | - elasticloadbalancing:AddTags 108 | - elasticloadbalancing:RemoveTags 109 | Resource: 110 | - arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 111 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/* 112 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/* 113 | Condition: 114 | 'Null': 115 | aws:RequestTag/elbv2.k8s.aws/cluster: 'true' 116 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 117 | - Effect: Allow 118 | Action: 119 | - elasticloadbalancing:AddTags 120 | - elasticloadbalancing:RemoveTags 121 | Resource: 122 | - arn:aws:elasticloadbalancing:*:*:listener/net/*/*/* 123 | - arn:aws:elasticloadbalancing:*:*:listener/app/*/*/* 124 | - arn:aws:elasticloadbalancing:*:*:listener-rule/net/*/*/* 125 | - arn:aws:elasticloadbalancing:*:*:listener-rule/app/*/*/* 126 | - Effect: Allow 127 | Action: 128 | - elasticloadbalancing:ModifyLoadBalancerAttributes 129 | - elasticloadbalancing:SetIpAddressType 130 | - elasticloadbalancing:SetSecurityGroups 131 | - elasticloadbalancing:SetSubnets 132 | - elasticloadbalancing:DeleteLoadBalancer 133 | - elasticloadbalancing:ModifyTargetGroup 134 | - elasticloadbalancing:ModifyTargetGroupAttributes 135 | - elasticloadbalancing:DeleteTargetGroup 136 | Resource: "*" 137 | Condition: 138 | 'Null': 139 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 140 | - Effect: Allow 141 | Action: 142 | - elasticloadbalancing:AddTags 143 | Resource: 144 | - arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 145 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/* 146 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/* 147 | Condition: 148 | StringEquals: 149 | elasticloadbalancing:CreateAction: 150 | - CreateTargetGroup 151 | - CreateLoadBalancer 152 | 'Null': 153 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 154 | - Effect: Allow 155 | Action: 156 | - elasticloadbalancing:RegisterTargets 157 | - elasticloadbalancing:DeregisterTargets 158 | Resource: arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 159 | - Effect: Allow 160 | Action: 161 | - elasticloadbalancing:SetWebAcl 162 | - elasticloadbalancing:ModifyListener 163 | - elasticloadbalancing:AddListenerCertificates 164 | - elasticloadbalancing:RemoveListenerCertificates 165 | - elasticloadbalancing:ModifyRule 166 | Resource: "*" -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/alb-values.yaml: -------------------------------------------------------------------------------- 1 | # image: 2 | # tag: v2.2.0 3 | region: {{region_name}} 4 | vpcId: {{vpc_id}} 5 | clusterName: {{cluster_name}} 6 | serviceAccount: 7 | create: false 
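  # note (added for clarity): this service account is not created by the Helm chart; it is pre-created by the CDK stack with an IAM role for service accounts (see eks_service_account.py) and referenced here by name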
8 | name: alb-aws-load-balancer-controller 9 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/argo-values.yaml: -------------------------------------------------------------------------------- 1 | controller: 2 | workflowNamespaces: 3 | - argo 4 | init: 5 | serviceAccount: arcjob 6 | workflow: 7 | namespace: spark 8 | serviceAccount: 9 | create: false 10 | name: arcjob 11 | server: 12 | extraArgs: 13 | - --auth-mode 14 | - client 15 | ingress: 16 | enabled: true 17 | annotations: 18 | kubernetes.io/ingress.class: alb 19 | alb.ingress.kubernetes.io/scheme: internet-facing 20 | alb.ingress.kubernetes.io/target-type: ip 21 | alb.ingress.kubernetes.io/success-codes: 200,301,302 22 | alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 2746}]' 23 | alb.ingress.kubernetes.io/manage-backend-security-group-rules: "true" 24 | alb.ingress.kubernetes.io/security-groups: {{INBOUND_SG}} -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/autoscaler-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - autoscaling:DescribeAutoScalingGroups 4 | - autoscaling:DescribeAutoScalingInstances 5 | - autoscaling:DescribeLaunchConfigurations 6 | - autoscaling:DescribeTags 7 | - autoscaling:SetDesiredCapacity 8 | - autoscaling:TerminateInstanceInAutoScalingGroup 9 | - ec2:DescribeLaunchTemplateVersions 10 | Resource: 11 | - "*" 12 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/autoscaler-values.yaml: -------------------------------------------------------------------------------- 1 | autoDiscovery: 2 | clusterName: {{cluster_name}} 3 | awsRegion: {{region_name}} 4 | image: 5 | tag: v1.27.2 6 | nodeSelector: 7 | app: spark 8 | podAnnotations: 9 | cluster-autoscaler.kubernetes.io/safe-to-evict: 'false' 10 | extraArgs: 11 | skip-nodes-with-system-pods: false 12 | scale-down-unneeded-time: 5m 13 | scale-down-unready-time: 10m 14 | rbac: 15 | serviceAccount: 16 | create: false 17 | name: cluster-autoscaler 18 | 19 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/etl-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - s3:ListBucket 4 | - s3:GetBucketLocation 5 | Resource: 6 | - arn:aws:s3:::{{codeBucket}} 7 | - arn:aws:s3:::{{datalakeBucket}} 8 | - arn:aws:s3:::nyc-tlc 9 | - Effect: Allow 10 | Action: 11 | - s3:PutObject 12 | - s3:GetObject 13 | Resource: 14 | - arn:aws:s3:::{{codeBucket}}/* 15 | - arn:aws:s3:::{{datalakeBucket}}/* 16 | - arn:aws:s3:::nyc-tlc/* 17 | - Effect: Allow 18 | Action: 19 | - s3:DeleteObject 20 | Resource: 21 | - arn:aws:s3:::{{codeBucket}}/* 22 | - arn:aws:s3:::{{datalakeBucket}}/* 23 | - Effect: Allow 24 | Action: 25 | - kms:Decrypt 26 | - kms:Encrypt 27 | - kms:GenerateDataKey* 28 | - athena:StartQueryExecution 29 | - athena:GetQueryExecution 30 | - athena:GetQueryResults 31 | - glue:CreateTable 32 | - glue:CreateDatabase 33 | - glue:CreatePartition 34 | - glue:UpdatePartition 35 | - glue:GetDatabase 36 | Resource: 37 | - '*' -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/etl-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: Role 2 | apiVersion: 
rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: etl-workflow-role 5 | namespace: spark 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods","pods/exec","configmaps","services"] 9 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 10 | - apiGroups: ["batch", "extensions"] 11 | resources: ["jobs"] 12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 13 | - apiGroups: [""] 14 | resources: ["events","pods/log","serviceaccounts", "secrets","endpoints"] 15 | verbs: ["list", "get", "watch"] 16 | - apiGroups: [""] 17 | resources: ["persistentvolumeclaims"] 18 | verbs: ["create", "delete", "get", "list"] 19 | - apiGroups: ["argoproj.io"] 20 | resources: ["workflows","workflows/finalizers"] 21 | verbs: ["*"] 22 | - apiGroups: ["argoproj.io"] 23 | resources: ["workflowtemplates","workflowtemplates/finalizers"] 24 | verbs: ["get", "list", "watch"] 25 | 26 | 27 | --- 28 | kind: RoleBinding 29 | apiVersion: rbac.authorization.k8s.io/v1 30 | metadata: 31 | name: {{MY_SA}}-role-binding 32 | namespace: spark 33 | subjects: 34 | - kind: ServiceAccount 35 | name: {{MY_SA}} 36 | namespace: spark 37 | roleRef: 38 | kind: Role 39 | name: etl-workflow-role 40 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/ex-secret-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - secretsmanager:GetResourcePolicy 4 | - secretsmanager:GetSecretValue 5 | - secretsmanager:DescribeSecret 6 | - secretsmanager:ListSecretVersionIds 7 | Resource: {{secretsmanager}} 8 | - Effect: Allow 9 | Action: 10 | - secretsmanager:GetRandomPassword 11 | - secretsmanager:ListSecrets 12 | - kms:Decrypt 13 | - kms:Encrypt 14 | Resource: 15 | - "*" 16 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/ex-secret-values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | tag: 7.2.0 3 | env: 4 | AWS_REGION: {{region_name}} 5 | AWS_DEFAULT_REGION: {{region_name}} 6 | serviceAccount: 7 | create: false 8 | name: external-secrets-controller 9 | securityContext: 10 | fsGroup: 65534 11 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/jupyter-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: {{MY_SA}}-role-binding 5 | namespace: jupyter 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{MY_SA}} 9 | namespace: jupyter 10 | roleRef: 11 | kind: Role 12 | name: hub 13 | apiGroup: rbac.authorization.k8s.io 14 | 15 | --- 16 | apiVersion: networking.k8s.io/v1 17 | kind: Ingress 18 | metadata: 19 | name: jupyterhub 20 | namespace: jupyter 21 | annotations: 22 | kubernetes.io/ingress.class: alb 23 | alb.ingress.kubernetes.io/scheme: internet-facing 24 | alb.ingress.kubernetes.io/target-type: ip 25 | alb.ingress.kubernetes.io/success-codes: 200,301,302 26 | alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]' 27 | alb.ingress.kubernetes.io/manage-backend-security-group-rules: "true" 28 | alb.ingress.kubernetes.io/security-groups: {{INBOUND_SG}} 29 | labels: 30 | app: jupyterhub 31 | spec: 32 | rules: 33 | - host: "" 34 | http: 35 | paths: 36 | - path: / 37 | pathType: Prefix 38 | backend: 39 | service: 40 | name: 
proxy-public 41 | port: 42 | number: 80 43 | 44 | --- 45 | apiVersion: kubernetes-client.io/v1 46 | kind: ExternalSecret 47 | metadata: 48 | name: jupyter-external-secret 49 | namespace: jupyter 50 | spec: 51 | backendType: secretsManager 52 | region: {{REGION}} 53 | data: 54 | - key: {{SECRET_NAME}} 55 | name: password 56 | property: password 57 | 58 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/jupyter-values.yaml: -------------------------------------------------------------------------------- 1 | hub: 2 | db: 3 | type: sqlite-memory 4 | extraConfig: 5 | overrideServiceAccount: | 6 | import os, sys 7 | 8 | c.JupyterHub.authenticator_class = 'jupyterhub.auth.DummyAuthenticator' 9 | c.DummyAuthenticator.password = os.environ['LOGIN'] 10 | c.Authenticator.admin_users = {"service-admin"} 11 | c.JupyterHub.service_tokens = { 12 | "secret-token": "service-admin", 13 | } 14 | # this script allows serviceAccountName to use dynamic naming based on {unescaped_username}" 15 | async def override_service_account_hook(kube_spawner): 16 | if kube_spawner.service_account is not None: 17 | kube_spawner.service_account = kube_spawner._expand_user_properties(kube_spawner.service_account) 18 | kube_spawner.env['USER_NAME'] = kube_spawner._expand_user_properties("{unescaped_username}") 19 | c.KubeSpawner.pre_spawn_hook = override_service_account_hook 20 | 21 | # setup timeout 22 | c.JupyterHub.cookie_max_age_days = 0.0105 23 | c.Authenticator.refresh_pre_spawn = True 24 | # c.JupyterHub.services = [ 25 | # { 26 | # "name": "idle_culler", 27 | # "admin": True, 28 | # "command": [sys.executable, "-m", "jupyterhub_idle_culler", "--timeout=1800"], 29 | # } 30 | # ] 31 | 32 | extraEnv: 33 | - name: LOGIN 34 | valueFrom: 35 | secretKeyRef: 36 | name: jupyter-external-secret 37 | key: password 38 | nodeSelector: 39 | lifecycle: Ec2Spot 40 | readinessProbe: 41 | initialDelaySeconds: 30 42 | periodSeconds: 10 43 | 44 | proxy: 45 | secretToken: "*****" 46 | service: 47 | type: ClusterIP 48 | chp: 49 | nodeSelector: 50 | lifecycle: OnDemand 51 | 52 | singleuser: 53 | defaultUrl: "/lab" 54 | nodeSelector: 55 | lifecycle: OnDemand 56 | image: 57 | name: ghcr.io/tripl-ai/arc-jupyter 58 | tag: arc-jupyter_3.14.2_scala_2.12_hadoop_3.2.0_1.1.0 59 | pullPolicy: Always 60 | lifecycleHooks: 61 | postStart: 62 | exec: 63 | command: 64 | - "bash" 65 | - "-c" 66 | - > 67 | cp -r /opt/.jupyter $HOME/.jupyter; 68 | echo "git clone https://github.com/aws-samples/sql-based-etl-on-amazon-eks/spark-on-eks"; 69 | git clone --depth 1 https://github.com/aws-samples/sql-based-etl-on-amazon-eks spark-on-eks; 70 | cd spark-on-eks; git filter-branch --prune-empty --subdirectory-filter spark-on-eks HEAD; 71 | 72 | serviceAccountName: "{username}" 73 | cpu: 74 | guarantee: 0.25 75 | limit: 0.5 76 | memory: 77 | guarantee: 4G 78 | limit: 4G 79 | extraEnv: 80 | CONF_ALLOW_EXPORT: "true" 81 | JAVA_OPTS: -Xmx4G 82 | ETL_CONF_DATALAKE_LOC: {{codeBucket}} 83 | ETL_CONF_AWS_REGION: {{region}} 84 | storage: 85 | type: none 86 | # storage: 87 | # type: dynamic 88 | # capacity: 10G 89 | # homeMountPath: '/home/{username}/data' 90 | # # mount to EBS 91 | # dynamic: 92 | # storageClass: gp2 93 | profileList: 94 | - default: True 95 | display_name: "Small (default): Arc-Jupyter Development Environment" 96 | description: "4GB Memory & 1vCPUs" 97 | kubespawner_override: 98 | cpu_guarantee: 0.5 99 | cpu_limit: 1 100 | mem_guarantee: 4G 101 | mem_limit: 10G 102 | - display_name: "Big 
Arc-Jupyter Development Environment" 103 | description: "15GB Memory & 2vCPUs" 104 | kubespawner_override: 105 | cpu_guarantee: 0.5 106 | cpu_limit: 2 107 | mem_guarantee: 10G 108 | mem_limit: 15G 109 | 110 | prePuller: 111 | hook: 112 | enabled: false 113 | 114 | # autoscaling setting 115 | scheduling: 116 | userScheduler: 117 | enabled: false 118 | 119 | cull: 120 | timeout: 1800 121 | 122 | # debug: 123 | # enabled: true 124 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/native-spark-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: s3:ListBucket 3 | Resource: 4 | - arn:aws:s3:::{{codeBucket}} 5 | - arn:aws:s3:::{{datalakeBucket}} 6 | - arn:aws:s3:::nyc-tlc 7 | - Effect: Allow 8 | Action: 9 | - s3:PutObject 10 | - s3:GetObject 11 | Resource: 12 | - arn:aws:s3:::{{codeBucket}}/* 13 | - arn:aws:s3:::{{datalakeBucket}}/* 14 | - arn:aws:s3:::nyc-tlc/* 15 | - Effect: Allow 16 | Action: 17 | - s3:DeleteObject 18 | Resource: 19 | - arn:aws:s3:::{{codeBucket}}/* 20 | - Effect: Allow 21 | Action: 22 | - kms:Encrypt 23 | - kms:Decrypt 24 | - kms:GenerateDataKey* 25 | - kms:DescribeKey 26 | Resource: 27 | - '*' -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/native-spark-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: RoleBinding 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: {{MY_SA}}-role-binding 5 | namespace: spark 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{MY_SA}} 9 | namespace: spark 10 | roleRef: 11 | kind: Role 12 | name: etl-workflow-role 13 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/spark-operator-values.yaml: -------------------------------------------------------------------------------- 1 | # image: 2 | # tag: v1beta2-1.2.3-3.1.1 3 | serviceAccounts: 4 | spark: 5 | create: false 6 | sparkoperator: 7 | create: true 8 | metrics: 9 | # -- Disable prometheus metric scraping 10 | enable: false 11 | webhook: 12 | enable: true 13 | port: 443 14 | 15 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/spark-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: WorkflowTemplate 3 | metadata: 4 | name: spark-template 5 | namespace: spark 6 | spec: 7 | templates: 8 | - name: smalljob 9 | retryStrategy: 10 | limit: 3 11 | retryPolicy: "Always" 12 | inputs: 13 | # override defaults here 14 | parameters: 15 | - name: jobId 16 | - name: configUri 17 | - name: image 18 | value: ghcr.io/tripl-ai/arc:latest 19 | - name: pullPolicy 20 | value: "Always" 21 | - name: executorInstances 22 | value: "1" 23 | - name: executorCores 24 | value: "1" 25 | - name: executorMemory 26 | value: "1" 27 | - name: sparkConf 28 | value: "" 29 | - name: tags 30 | value: "" 31 | - name: parameters 32 | value: "" 33 | # to execute each stage in a Jupyter notebook, we can control it by matching the environment. Some stages may not be required in the prod environment. 
34 | - name: environment 35 | value: test 36 | metadata: 37 | labels: 38 | app: spark 39 | workflowId: "{{workflow.uid}}" 40 | script: 41 | resources: 42 | limits: 43 | cpu: "1" 44 | memory: "1Gi" 45 | image: "{{inputs.parameters.image}}" 46 | command: ["/bin/sh"] 47 | source: | 48 | # verbose logging 49 | set -ex 50 | 51 | # print current hostname and ip 52 | hostname 53 | hostname -I 54 | 55 | # submit job 56 | /opt/spark/bin/spark-submit \ 57 | --master k8s://kubernetes.default.svc:443 \ 58 | --deploy-mode client \ 59 | --class ai.tripl.arc.ARC \ 60 | --name arc \ 61 | --conf spark.authenticate=true \ 62 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 63 | --conf spark.driver.host=$(hostname -I) \ 64 | --conf spark.driver.memory=921m \ 65 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 66 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 67 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 68 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 69 | --conf spark.io.encryption.enabled=true \ 70 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 71 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 72 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 73 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 74 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 75 | --conf spark.kubernetes.driver.limit.cores=1 \ 76 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 77 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 78 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 79 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 80 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 81 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 82 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 83 | --conf spark.network.crypto.enabled=true \ 84 | --conf spark.sql.ansi.enabled=true \ 85 | {{inputs.parameters.sparkConf}} \ 86 | local:///opt/spark/jars/arc.jar \ 87 | --etl.config.uri={{inputs.parameters.configUri}} \ 88 | --etl.config.job.id={{inputs.parameters.jobId}} \ 89 | --etl.config.environment={{inputs.parameters.environment}} \ 90 | --etl.config.ignoreEnvironments=false \ 91 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 92 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 93 | {{inputs.parameters.parameters}} 94 | 95 | - name: mediumjob 96 | retryStrategy: 97 | limit: 3 98 | retryPolicy: "Always" 99 | inputs: 100 | # override defaults here 101 | parameters: 102 | - name: jobId 103 | - name: configUri 104 | - name: image 105 | value: ghcr.io/tripl-ai/arc:latest 106 | - name: pullPolicy 107 | value: "Always" 108 | - name: executorInstances 109 | value: "2" 110 | - name: executorCores 111 | value: "2" 112 | - name: executorMemory 113 | value: "10" 114 | - name: sparkConf 115 | value: "" 116 | - name: tags 117 | value: "" 118 | - name: parameters 119 | value: "" 120 | # to execute each stage in a Jupyter notebook, we can control it by matching the environment. Some stages may not be required in the prod environment. 
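  # e.g. an Arc notebook stage declaring "environments": ["production", "test"] only runs when the environment value below matches one of them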
121 | - name: environment 122 | value: test 123 | metadata: 124 | labels: 125 | app: spark 126 | workflowId: "{{workflow.uid}}" 127 | script: 128 | resources: 129 | limits: 130 | cpu: "2" 131 | memory: "13Gi" 132 | image: "{{inputs.parameters.image}}" 133 | command: ["/bin/sh"] 134 | source: | 135 | # verbose logging 136 | set -ex 137 | 138 | # print current hostname and ip 139 | hostname 140 | hostname -I 141 | 142 | # submit job 143 | /opt/spark/bin/spark-submit \ 144 | --master k8s://kubernetes.default.svc:443 \ 145 | --deploy-mode client \ 146 | --class ai.tripl.arc.ARC \ 147 | --name arc \ 148 | --conf spark.authenticate=true \ 149 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 150 | --conf spark.driver.host=$(hostname -I) \ 151 | --conf spark.driver.memory=2g \ 152 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 153 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 154 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 155 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 156 | --conf spark.io.encryption.enabled=true \ 157 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 158 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 159 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 160 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 161 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 162 | --conf spark.kubernetes.driver.limit.cores=1 \ 163 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 164 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 165 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 166 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 167 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 168 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 169 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 170 | --conf spark.network.crypto.enabled=true \ 171 | --conf spark.sql.ansi.enabled=true \ 172 | {{inputs.parameters.sparkConf}} \ 173 | local:///opt/spark/jars/arc.jar \ 174 | --etl.config.uri={{inputs.parameters.configUri}} \ 175 | --etl.config.job.id={{inputs.parameters.jobId}} \ 176 | --etl.config.environment={{inputs.parameters.environment}} \ 177 | --etl.config.ignoreEnvironments=false \ 178 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 179 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 180 | {{inputs.parameters.parameters}} 181 | 182 | - name: largejob 183 | retryStrategy: 184 | limit: 3 185 | retryPolicy: "Always" 186 | inputs: 187 | # override defaults here 188 | parameters: 189 | - name: jobId 190 | - name: configUri 191 | - name: image 192 | value: ghcr.io/tripl-ai/arc:latest 193 | - name: pullPolicy 194 | value: "Always" 195 | - name: executorInstances 196 | value: "3" 197 | - name: executorCores 198 | value: "2" 199 | - name: executorMemory 200 | value: "12" 201 | - name: sparkConf 202 | value: "" 203 | - name: tags 204 | value: "" 205 | - name: parameters 206 | value: "" 207 | # to execute each stage in a Jupyter notebook, we can control it by matching the 
environment. Some stages may not be required in the prod environment. 208 | - name: environment 209 | value: test 210 | metadata: 211 | labels: 212 | app: spark 213 | workflowId: "{{workflow.uid}}" 214 | script: 215 | resources: 216 | limits: 217 | cpu: "3" 218 | memory: "13Gi" 219 | image: "{{inputs.parameters.image}}" 220 | command: ["/bin/sh"] 221 | source: | 222 | # verbose logging 223 | set -ex 224 | 225 | # print current hostname and ip 226 | hostname 227 | hostname -I 228 | 229 | # submit job 230 | /opt/spark/bin/spark-submit \ 231 | --master k8s://kubernetes.default.svc:443 \ 232 | --deploy-mode client \ 233 | --class ai.tripl.arc.ARC \ 234 | --name arc \ 235 | --conf spark.authenticate=true \ 236 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 237 | --conf spark.driver.host=$(hostname -I) \ 238 | --conf spark.driver.memory=4g \ 239 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 240 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 241 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 242 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 243 | --conf spark.io.encryption.enabled=true \ 244 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 245 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 246 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 247 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 248 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 249 | --conf spark.kubernetes.driver.limit.cores=1 \ 250 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 251 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 252 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 253 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 254 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 255 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 256 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 257 | --conf spark.network.crypto.enabled=true \ 258 | --conf spark.sql.ansi.enabled=true \ 259 | {{inputs.parameters.sparkConf}} \ 260 | local:///opt/spark/jars/arc.jar \ 261 | --etl.config.uri={{inputs.parameters.configUri}} \ 262 | --etl.config.job.id={{inputs.parameters.jobId}} \ 263 | --etl.config.environment={{inputs.parameters.environment}} \ 264 | --etl.config.ignoreEnvironments=false \ 265 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 266 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 267 | {{inputs.parameters.parameters}} 268 | 269 | - name: sparklocal 270 | inputs: 271 | retryStrategy: 272 | limit: 3 273 | retryPolicy: "Always" 274 | # override defaults here 275 | parameters: 276 | - name: jobId 277 | - name: configUri 278 | - name: image 279 | value: ghcr.io/tripl-ai/arc:latest 280 | - name: executorInstances 281 | value: "1" 282 | - name: executorCores 283 | value: "1" 284 | - name: executorMemory 285 | value: "1" 286 | - name: sparkConf 287 | value: "" 288 | - name: tags 289 | value: "" 290 | - name: parameters 291 | value: "" 292 | - name: pullPolicy 293 | value: IfNotPresent 294 | - name: environment 
295 | value: test 296 | metadata: 297 | labels: 298 | app: spark 299 | workflowId: "{{workflow.uid}}" 300 | podSpecPatch: | 301 | containers: 302 | - name: main 303 | resources: 304 | requests: 305 | cpu: "{{inputs.parameters.executorCores}}" 306 | memory: "{{inputs.parameters.executorMemory}}Gi" 307 | script: 308 | image: "{{inputs.parameters.image}}" 309 | command: ["/bin/sh"] 310 | source: | 311 | # verbose logging 312 | set -ex 313 | 314 | # print current hostname and ip 315 | hostname 316 | hostname -I 317 | 318 | # submit job 319 | # driver memory is set at 90% of executorMemory 320 | /opt/spark/bin/spark-submit \ 321 | --master local[{{inputs.parameters.executorCores}}] \ 322 | --driver-memory $(({{inputs.parameters.executorMemory}} * 1024 * 90/100))m \ 323 | --driver-java-options "-XX:+UseG1GC" \ 324 | --class ai.tripl.arc.ARC \ 325 | --name arc \ 326 | --conf spark.driver.host=$(hostname -I) \ 327 | --conf spark.driver.pod.name=$(hostname)-driver \ 328 | --conf spark.io.encryption.enabled=true \ 329 | --conf spark.sql.adaptive.enabled=true \ 330 | --conf spark.network.crypto.enabled=true \ 331 | --conf spark.ui.enabled=true \ 332 | --conf spark.sql.ansi.enabled=true \ 333 | {{inputs.parameters.sparkConf}} \ 334 | local:///opt/spark/jars/arc.jar \ 335 | --etl.config.uri={{inputs.parameters.configUri}} \ 336 | --etl.config.job.id={{inputs.parameters.jobId}} \ 337 | --etl.config.environment={{inputs.parameters.environment}} \ 338 | --etl.config.ignoreEnvironments=false \ 339 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 340 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 341 | {{inputs.parameters.parameters}} -------------------------------------------------------------------------------- /spark-on-eks/source/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | "cluster_name": "spark-on-eks", 5 | "solution_name": "sql-based-etl-with-apache-spark-on-amazon-eks", 6 | "version": "2.0.0", 7 | "@aws-cdk/core:stackRelativeExports": true, 8 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/native-spark-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: word-count 5 | namespace: spark 6 | spec: 7 | type: Python 8 | pythonVersion: "3" 9 | mode: cluster 10 | image: ghcr.io/tripl-ai/arc:arc_3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 11 | imagePullPolicy: Always 12 | mainApplicationFile: "s3a://$(BUCKET_PARAM)/app_code/job/wordcount.py" 13 | arguments: ["s3a://nyc-tlc/csv_backup/yellow_tripdata*.csv","s3a://$(BUCKET_PARAM)/app_code/output/native"] 14 | sparkVersion: "3.0.3" 15 | sparkConf: 16 | "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" 17 | "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" 18 | "spark.kubernetes.allocation.batch.size": "15" 19 | "spark.io.encryption.enabled": "true" 20 | "spark.kubernetes.local.dirs.tmpfs": "true" 21 | volumes: 22 | - name: spark-local-dir-1 23 | hostPath: 24 | path: "/tmp" 25 | type: Directory 26 |
dynamicAllocation: 27 | enabled: true 28 | initialExecutors: 1 29 | minExecutors: 1 30 | maxExecutors: 20 31 | restartPolicy: 32 | type: OnFailure 33 | onFailureRetries: 3 34 | onFailureRetryInterval: 10 35 | onSubmissionFailureRetries: 5 36 | onSubmissionFailureRetryInterval: 5 37 | driver: 38 | # driver run on Spot 39 | affinity: 40 | nodeAffinity: 41 | requiredDuringSchedulingIgnoredDuringExecution: 42 | nodeSelectorTerms: 43 | - matchExpressions: 44 | - key: lifecycle 45 | operator: In 46 | values: 47 | - Ec2Spot 48 | # - OnDemand 49 | env: 50 | - name: BUCKET_PARAM 51 | valueFrom: 52 | configMapKeyRef: 53 | name: special-config 54 | key: codeBucket 55 | cores: 1 56 | memory: "1G" 57 | labels: 58 | role: driver 59 | serviceAccount: nativejob 60 | volumeMounts: 61 | - name: spark-local-dir-1 62 | mountPath: "/tmp" 63 | executor: 64 | # executors run on Spot 65 | affinity: 66 | nodeAffinity: 67 | requiredDuringSchedulingIgnoredDuringExecution: 68 | nodeSelectorTerms: 69 | - matchExpressions: 70 | - key: lifecycle 71 | operator: In 72 | values: 73 | - Ec2Spot 74 | cores: 1 75 | memory: "4G" 76 | labels: 77 | role: executor 78 | volumeMounts: 79 | - name: spark-local-dir-1 80 | mountPath: "/tmp" 81 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/notebook/Spark_streaming_job.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf \n", 10 | "numRows=5\n", 11 | "streaming=false" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# 1. Extract static data" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "{\n", 28 | " \"type\": \"DelimitedExtract\",\n", 29 | " \"name\": \"extract initial table\",\n", 30 | " \"environments\": [\"dev\", \"test\"],\n", 31 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/initial_contacts.csv\",\n", 32 | " \"schemaURI\":\"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 33 | " \"outputView\": \"initial_raw\", \n", 34 | " \"delimiter\": \"Comma\",\n", 35 | " \"header\": false,\n", 36 | " \"quote\": \"None\"\n", 37 | "}" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "{\n", 47 | " \"type\": \"TypingTransform\",\n", 48 | " \"name\": \"apply table schema 0\",\n", 49 | " \"environments\": [\"dev\", \"test\"],\n", 50 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 51 | " \"inputView\": \"initial_raw\", \n", 52 | " \"outputView\": \"initial_typed\",\n", 53 | " \"numPartitions\": 1\n", 54 | " \"persist\":true\n", 55 | "}" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "{\n", 65 | " \"type\": \"DelimitedExtract\",\n", 66 | " \"name\": \"extract updated data\",\n", 67 | " \"environments\": [\"dev\", \"test\"],\n", 68 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/update_contacts.csv\",\n", 69 | " \"schemaURI\":\"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 70 | " \"outputView\": \"delta_raw\", \n", 71 | " \"delimiter\": \"Comma\",\n", 72 | " \"header\": false,\n", 73 | " 
\"quote\": \"None\"\n", 74 | "}" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "{\n", 84 | " \"type\": \"TypingTransform\",\n", 85 | " \"name\": \"apply table schema 0\",\n", 86 | " \"environments\": [\"dev\", \"test\"],\n", 87 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 88 | " \"inputView\": \"delta_raw\", \n", 89 | " \"outputView\": \"delta_typed\",\n", 90 | " \"numPartitions\": 1\n", 91 | " \"persist\":true\n", 92 | "}" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "# 2. Turn on Spark Streaming" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "%conf \n", 109 | "streaming=true\n", 110 | "streamingDuration=30" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "# 2.1 Convert static data to stream\n", 118 | "- Initial stream = Initial dataset\n", 119 | "- Delta stream = Incremental dataset" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "{\n", 129 | " \"type\": \"RateExtract\",\n", 130 | " \"name\": \"create a streaming source\",\n", 131 | " \"environments\": [\n", 132 | " \"production\",\n", 133 | " \"test\"\n", 134 | " ],\n", 135 | " \"outputView\": \"initial_stream\",\n", 136 | " \"numPartitions\": 1,\n", 137 | " \"rowsPerSecond\": 5\n", 138 | "}" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "%sql outputView=\"stream_a\" name=\"simulate a stream\" sqlParams=input_table=initial_typed,stream_table=initial_stream numPartitions=1\n", 148 | "\n", 149 | "SELECT *\n", 150 | "FROM ${stream_table} \n", 151 | "INNER JOIN ${input_table}\n", 152 | "ON ${input_table}._index = ${stream_table}.value" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "{\n", 162 | " \"type\": \"RateExtract\",\n", 163 | " \"name\": \"create a streaming source\",\n", 164 | " \"environments\": [\n", 165 | " \"production\",\n", 166 | " \"test\"\n", 167 | " ],\n", 168 | " \"outputView\": \"delta_stream\",\n", 169 | " \"numPartitions\": 1,\n", 170 | " \"rowsPerSecond\": 5\n", 171 | "}" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "%sql outputView=\"stream_b\" name=\"simulate b stream\" sqlParams=input_table=delta_typed,stream_table=delta_stream numPartitions=1\n", 181 | "\n", 182 | "SELECT *\n", 183 | "FROM ${stream_table} \n", 184 | "INNER JOIN ${input_table}\n", 185 | "ON ${input_table}._index = ${stream_table}.value" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "%sql outputView=\"join_streams\" name=\"join two streams\"\n", 195 | "\n", 196 | "SELECT\n", 197 | " initial.id as initial_id,\n", 198 | " initial.name as initial_name,\n", 199 | " initial.email as initial_email,\n", 200 | " initial.state as initial_state,\n", 201 | " delta.email as delta_email,\n", 202 | " delta.state as delta_state\n", 203 | "FROM stream_a initial\n", 204 | "INNER JOIN stream_b delta\n", 205 | 
"ON initial.id = delta.id\n", 206 | "where initial.email<>delta.email or initial.state<>delta.state\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Arc", 220 | "language": "javascript", 221 | "name": "arc" 222 | }, 223 | "language_info": { 224 | "file_extension": "arc", 225 | "mimetype": "text/arc", 226 | "name": "arc", 227 | "nbconvert_exporter": "text", 228 | "version": "2.4.2" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 2 233 | } 234 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/notebook/nyctaxi-job.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf \n", 10 | "numRows=5\n", 11 | "showLog=true" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%env \n", 21 | "ETL_CONF_DATA_URL=s3a://nyc-tlc/csv_backup\n", 22 | "ETL_CONF_JOB_URL=https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "{\n", 32 | " \"type\": \"DelimitedExtract\",\n", 33 | " \"name\": \"extract data from green_tripdata schema 0\",\n", 34 | " \"environments\": [\"production\", \"test\"],\n", 35 | " \"inputURI\": ${ETL_CONF_DATA_URL}\"/green_tripdata_2013-08.csv\",\n", 36 | " \"outputView\": \"green_tripdata0_raw\", \n", 37 | " \"delimiter\": \"Comma\",\n", 38 | " \"quote\" : \"DoubleQuote\",\n", 39 | " \"header\": true,\n", 40 | " \"persist\": true\n", 41 | "}" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "{\n", 51 | " \"type\": \"TypingTransform\",\n", 52 | " \"name\": \"apply green_tripdata schema 0 data types\",\n", 53 | " \"environments\": [\"production\", \"test\"],\n", 54 | " \"schemaURI\": ${ETL_CONF_JOB_URL}\"/green_tripdata0.json\",\n", 55 | " \"inputView\": \"green_tripdata0_raw\", \n", 56 | " \"outputView\": \"green_tripdata0\"\n", 57 | "}" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "%sqlvalidate name=\"ensure no errors exist after data typing\" environments=production,test\n", 67 | "SELECT\n", 68 | " SUM(error) = 0 AS valid\n", 69 | " ,TO_JSON(\n", 70 | " NAMED_STRUCT(\n", 71 | " 'count', COUNT(error), \n", 72 | " 'errors', SUM(error)\n", 73 | " )\n", 74 | " ) AS message\n", 75 | "FROM (\n", 76 | " SELECT \n", 77 | " CASE \n", 78 | " WHEN SIZE(_errors) > 0 THEN 1 \n", 79 | " ELSE 0 \n", 80 | " END AS error \n", 81 | " FROM green_tripdata0\n", 82 | ") input_table" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "%sql name=\"ensure a query can be executed\" environments=production,test persist=true outputView=green_trip_filtered\n", 92 | "SELECT * \n", 93 | "FROM green_tripdata0\n", 94 | "WHERE store_and_fwd_flag = TRUE" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": 
[], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Arc", 108 | "language": "javascript", 109 | "name": "arc" 110 | }, 111 | "language_info": { 112 | "file_extension": "arc", 113 | "mimetype": "text/arc", 114 | "name": "arc", 115 | "nbconvert_exporter": "text", 116 | "version": "2.2.0" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 2 121 | } 122 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/nyctaxi-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: nyctaxi-job- 5 | namespace: spark 6 | spec: 7 | serviceAccountName: arcjob 8 | ttlStrategy: 9 | # keep workflows for 30m 10 | secondsAfterCompletion: 1800 11 | entrypoint: nyctaxi 12 | templates: 13 | - name: nyctaxi 14 | dag: 15 | tasks: 16 | - name: step1-query 17 | templateRef: 18 | name: spark-template 19 | template: sparklocal 20 | arguments: 21 | parameters: 22 | - name: jobId 23 | value: nyctaxi 24 | - name: tags 25 | value: "project=sqlbasedetl owner=myowner costcenter=66666" 26 | - name: configUri 27 | value: https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes/nyctaxi.ipynb 28 | - name: image 29 | value: ghcr.io/tripl-ai/arc:arc_3.11.1_spark_3.1.2_scala_2.12_hadoop_3.2.0_1.0.0 30 | - name: parameters 31 | value: "--ETL_CONF_DATA_URL=s3a://nyc-tlc/csv_backup --ETL_CONF_JOB_URL=https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes" 32 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/scd2-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: scd2-job- 5 | namespace: spark 6 | spec: 7 | serviceAccountName: arcjob 8 | entrypoint: scd2-process 9 | arguments: 10 | parameters: 11 | - name: codeBucket 12 | value: cfn_value 13 | templates: 14 | - name: scd2-process 15 | dag: 16 | tasks: 17 | - name: initial-load 18 | templateRef: 19 | name: spark-template 20 | template: smalljob 21 | arguments: 22 | parameters: 23 | - name: jobId 24 | value: initial-load 25 | - name: image 26 | value: ghcr.io/tripl-ai/arc:arc_3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 27 | - name: configUri 28 | value: "s3a://{{workflow.parameters.codeBucket}}/app_code/job/initial_load.ipynb" 29 | - name: parameters 30 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 31 | - name: delta-load 32 | templateRef: 33 | name: spark-template 34 | template: smalljob 35 | arguments: 36 | parameters: 37 | - name: jobId 38 | value: delta-load 39 | - name: image 40 | value: ghcr.io/tripl-ai/arc:arc_3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 41 | - name: configUri 42 | value: "s3a://{{workflow.parameters.codeBucket}}/app_code/job/delta_load.ipynb" 43 | - name: parameters 44 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 45 | - name: SCD2-merge 46 | dependencies: [initial-load, delta-load] 47 | templateRef: 48 | name: spark-template 49 | template: smalljob 50 | arguments: 51 | parameters: 52 | - name: jobId 53 | value: SCD2-merge 54 | - name: image 55 | value: ghcr.io/tripl-ai/arc:arc_3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 56 | - name: configUri 57 | value: 
"s3a://{{workflow.parameters.codeBucket}}/app_code/job/scd2_merge.ipynb" 58 | - name: parameters 59 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 60 | - name: sparkConf 61 | value: "--conf spark.databricks.delta.merge.repartitionBeforeWrite.enabled=true" 62 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/test/TEST-arc-jupyter.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: arc-jupyter 5 | namespace: spark 6 | spec: 7 | serviceAccountName: arcjob 8 | securityContext: 9 | fsGroup: 65534 10 | containers: 11 | - name: arc-jupyter 12 | image: ghcr.io/tripl-ai/arc-jupyter:latest 13 | imagePullPolicy: IfNotPresent 14 | env: 15 | - name: JAVA_OPTS 16 | value: "-Xmx4G" 17 | - name: CONF_NUM_ROWS 18 | value: "10" 19 | - name: CONF_STORAGE_LEVEL 20 | value: "MEMORY_ONLY_SER" 21 | - name: conf_spark_sql_extensions 22 | value: "io.delta.sql.DeltaSparkSessionExtension" 23 | resources: 24 | requests: 25 | cpu: "1" 26 | memory: "5Gi" 27 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/test/TEST-cron-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: CronWorkflow 3 | metadata: 4 | namespace: spark 5 | generateName: word-count- 6 | spec: 7 | schedule: "* 1 * * *" 8 | concurrencyPolicy: "Replace" 9 | startingDeadlineSeconds: 4500 10 | workflowSpec: 11 | serviceAccountName: nativejob 12 | entrypoint: spotinterruption 13 | # must complete in 4h 14 | activeDeadlineSeconds: 14400 15 | ttlStrategy: 16 | secondsAfterCompletion: 28800 17 | templates: 18 | - name: spotinterruption 19 | inputs: 20 | parameters: 21 | - name: image 22 | value: ghcr.io/tripl-ai/arc:latest 23 | script: 24 | image: "{{inputs.parameters.image}}" 25 | resources: 26 | requests: 27 | cpu: "1" 28 | memory: "1Gi" 29 | command: ["/bin/sh"] 30 | source: | 31 | # verbose logging 32 | set -ex 33 | 34 | # submit job 35 | /opt/spark/bin/spark-submit \ 36 | --master k8s://kubernetes.default.svc:443 \ 37 | --deploy-mode cluster \ 38 | --name 'Word Count' \ 39 | --conf spark.kubernetes.allocation.batch.size=10 \ 40 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 41 | --conf spark.kubernetes.container.image.pullPolicy=Always \ 42 | --conf spark.kubernetes.namespace=spark \ 43 | --conf spark.driver.memory=1g \ 44 | --conf spark.kubernetes.driver.request.cores=2 \ 45 | --conf spark.kubernetes.driver.limit.cores=3 \ 46 | --conf spark.executor.instances=10 \ 47 | --conf spark.executor.memory=10g \ 48 | --conf spark.kubernetes.executor.request.cores=2 \ 49 | --conf spark.kubernetes.executor.limit.cores=3 \ 50 | --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ 51 | --conf spark.hadoop.fs.s3a.fast.upload=true \ 52 | --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider \ 53 | # --conf spark.kubernetes.driver.podTemplateFile='s3://$(BUCKET_PARAM)/app_code/job/driver-pod-template.yaml' \ 54 | # --conf spark.kubernetes.executor.podTemplateFile='s3://$(BUCKET_PARAM)/app_code/job/executor-pod-template.yaml' \ 55 | --conf spark.kubernetes.authenticate.driver.serviceAccountName=nativejob \ 56 | "s3a://{{codeBucket}}/app_code/job/wordcount.py" \ 57 | "s3a://amazon-reviews-pds/parquet/" \ 58 | 
"s3a://{{codeBucket}}/app_code/output/native" 59 | 60 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/eks_base_app.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | from aws_cdk import Aws 4 | from constructs import Construct 5 | from aws_cdk.aws_eks import ICluster, KubernetesManifest 6 | from lib.util.manifest_reader import * 7 | import os 8 | 9 | class EksBaseAppConst(Construct): 10 | @property 11 | def secret_created(self): 12 | return self._ext_secret 13 | 14 | def __init__(self,scope: Construct, id: str, eks_cluster: ICluster, **kwargs,) -> None: 15 | super().__init__(scope, id, **kwargs) 16 | 17 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 18 | 19 | # Add ALB ingress controller to EKS 20 | self._alb = eks_cluster.add_helm_chart('ALBChart', 21 | chart='aws-load-balancer-controller', 22 | repository='https://aws.github.io/eks-charts', 23 | release='alb', 24 | version='1.5.5', 25 | create_namespace=False, 26 | namespace='kube-system', 27 | values=load_yaml_replace_var_local(source_dir+'/app_resources/alb-values.yaml', 28 | fields={ 29 | "{{region_name}}": Aws.REGION, 30 | "{{cluster_name}}": eks_cluster.cluster_name, 31 | "{{vpc_id}}": eks_cluster.vpc.vpc_id 32 | } 33 | ) 34 | ) 35 | # Add Cluster Autoscaler to EKS 36 | _var_mapping = { 37 | "{{region_name}}": Aws.REGION, 38 | "{{cluster_name}}": eks_cluster.cluster_name, 39 | } 40 | eks_cluster.add_helm_chart('ClusterAutoScaler', 41 | chart='cluster-autoscaler', 42 | repository='https://kubernetes.github.io/autoscaler', 43 | release='nodescaler', 44 | create_namespace=False, 45 | namespace='kube-system', 46 | values=load_yaml_replace_var_local(source_dir+'/app_resources/autoscaler-values.yaml',_var_mapping) 47 | ) 48 | 49 | # Add container insight (CloudWatch Log) to EKS 50 | KubernetesManifest(self,'ContainerInsight', 51 | cluster=eks_cluster, 52 | manifest=load_yaml_replace_var_remotely('https://raw.githubusercontent.com/aws-samples/amazon-cloudwatch-container-insights/latest/k8s-deployment-manifest-templates/deployment-mode/daemonset/container-insights-monitoring/quickstart/cwagent-fluentd-quickstart.yaml', 53 | fields=_var_mapping, 54 | multi_resource=True 55 | ) 56 | ) 57 | # Add external secrets controller to EKS 58 | self._ext_secret = eks_cluster.add_helm_chart('SecretContrChart', 59 | chart='kubernetes-external-secrets', 60 | repository='https://external-secrets.github.io/kubernetes-external-secrets/', 61 | release='external-secrets', 62 | create_namespace=False, 63 | namespace='kube-system', 64 | values=load_yaml_replace_var_local(source_dir+'/app_resources/ex-secret-values.yaml', 65 | fields={ 66 | '{{region_name}}': Aws.REGION 67 | } 68 | ) 69 | ) 70 | self._ext_secret.node.add_dependency(self._alb) 71 | # Add Spark Operator to EKS 72 | eks_cluster.add_helm_chart('SparkOperatorChart', 73 | chart='spark-operator', 74 | repository='https://kubeflow.github.io/spark-operator', 75 | release='spark-operator', 76 | version='1.1.27', 77 | create_namespace=True, 78 | values=load_yaml_replace_var_local(source_dir+'/app_resources/spark-operator-values.yaml',fields={'':''}) 79 | ) -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/eks_cluster.py: -------------------------------------------------------------------------------- 1 
| # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import (aws_eks as eks,aws_ec2 as ec2) 5 | from aws_cdk.aws_iam import IRole 6 | from constructs import Construct 7 | from aws_cdk.lambda_layer_kubectl_v27 import KubectlV27Layer 8 | 9 | class EksConst(Construct): 10 | 11 | @property 12 | def my_cluster(self): 13 | return self._my_cluster 14 | 15 | def __init__(self,scope: Construct, id:str, eksname: str, eksvpc: ec2.IVpc, noderole: IRole, eks_adminrole: IRole, **kwargs) -> None: 16 | super().__init__(scope, id, **kwargs) 17 | 18 | # 1. Create EKS cluster without node group 19 | self._my_cluster = eks.Cluster(self,'EKS', 20 | vpc= eksvpc, 21 | cluster_name=eksname, 22 | masters_role=eks_adminrole, 23 | output_cluster_name=True, 24 | version= eks.KubernetesVersion.V1_27, 25 | endpoint_access= eks.EndpointAccess.PUBLIC_AND_PRIVATE, 26 | default_capacity=0, 27 | kubectl_layer=KubectlV27Layer(self, 'kubectlV27Layer') 28 | ) 29 | 30 | # 2. Add Managed NodeGroup to EKS as the compute resource to run Spark jobs 31 | _managed_node = self._my_cluster.add_nodegroup_capacity('onDemand-mn', 32 | nodegroup_name = 'etl-ondemand', 33 | node_role = noderole, 34 | desired_size = 1, 35 | max_size = 5, 36 | disk_size = 50, 37 | instance_types = [ec2.InstanceType('m5.xlarge')], 38 | labels = {'app':'spark', 'lifecycle':'OnDemand'}, 39 | subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,one_per_az=True), 40 | tags = {'Name':'OnDemand-'+eksname,'k8s.io/cluster-autoscaler/enabled': 'true', 'k8s.io/cluster-autoscaler/'+eksname: 'owned'} 41 | ) 42 | 43 | 44 | # 3. Add Spot managed NodeGroup to EKS (run Spark executors on Spot) 45 | _spot_node = self._my_cluster.add_nodegroup_capacity('spot-mn', 46 | nodegroup_name = 'etl-spot', 47 | node_role = noderole, 48 | desired_size = 1, 49 | max_size = 30, 50 | disk_size = 50, 51 | instance_types=[ec2.InstanceType("r5.xlarge"),ec2.InstanceType("r4.xlarge"),ec2.InstanceType("r5a.xlarge")], 52 | labels = {'app':'spark', 'lifecycle':'Ec2Spot'}, 53 | capacity_type=eks.CapacityType.SPOT, 54 | tags = {'Name':'Spot-'+eksname, 'k8s.io/cluster-autoscaler/enabled': 'true', 'k8s.io/cluster-autoscaler/'+eksname: 'owned'} 55 | ) 56 | 57 | # # 4. Add Fargate profile to EKS, without setting up cluster-autoscaler 58 | # self._my_cluster.add_fargate_profile('FargateEnabled', 59 | # selectors =[{ 60 | # "namespace": "spark" 61 | # }], 62 | # fargate_profile_name='sparkETL' 63 | # ) 64 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/eks_service_account.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import aws_iam as iam 5 | from constructs import Construct 6 | from aws_cdk.aws_secretsmanager import ISecret 7 | from aws_cdk.aws_eks import ICluster 8 | from lib.util.manifest_reader import * 9 | import os 10 | 11 | class EksSAConst(Construct): 12 | 13 | def __init__(self,scope: Construct, id:str, eks_cluster: ICluster, secret: ISecret, **kwargs,) -> None: 14 | super().__init__(scope, id, **kwargs) 15 | 16 | # //************************************v*************************************************************// 17 | # //***************************** SERVICE ACCOUNT, RBAC and IAM ROLES *******************************// 18 | # //****** Associating IAM role to K8s Service Account to provide fine-grain security control ******// 19 | # //***********************************************************************************************// 20 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 21 | 22 | # Cluster Auto-scaler 23 | self._scaler_sa = eks_cluster.add_service_account('AutoScalerSa', 24 | name='cluster-autoscaler', 25 | namespace='kube-system' 26 | ) 27 | _scaler_role = load_yaml_local(source_dir+'/app_resources/autoscaler-iam-role.yaml') 28 | for statmt in _scaler_role: 29 | self._scaler_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) 30 | 31 | # ALB Ingress 32 | self._alb_sa = eks_cluster.add_service_account('ALBServiceAcct', 33 | name='alb-aws-load-balancer-controller', 34 | namespace='kube-system' 35 | ) 36 | _alb_role = load_yaml_local(source_dir+'/app_resources/alb-iam-role.yaml') 37 | for statmt in _alb_role: 38 | self._alb_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) 39 | 40 | # External secret controller 41 | self._secrets_sa = eks_cluster.add_service_account('ExSecretController', 42 | name='external-secrets-controller', 43 | namespace="kube-system" 44 | ) 45 | self._secrets_sa.node.add_dependency(secret) 46 | _secrets_role = load_yaml_replace_var_local(source_dir+'/app_resources/ex-secret-iam-role.yaml', 47 | fields={"{{secretsmanager}}": secret.secret_arn+"*"} 48 | ) 49 | for statmt in _secrets_role: 50 | self._secrets_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/iam_roles.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | import typing 5 | 6 | from aws_cdk import (Tags, aws_iam as iam) 7 | from typing import List 8 | from constructs import Construct 9 | 10 | class IamConst(Construct): 11 | 12 | @property 13 | def managed_node_role(self): 14 | return self._managed_node_role 15 | 16 | @property 17 | def admin_role(self): 18 | return self._clusterAdminRole 19 | 20 | def __init__(self,scope: Construct, id:str, cluster_name:str, **kwargs,) -> None: 21 | super().__init__(scope, id, **kwargs) 22 | 23 | # EKS admin role 24 | self._clusterAdminRole = iam.Role(self, 'clusterAdmin', 25 | assumed_by= iam.AccountRootPrincipal() 26 | ) 27 | self._clusterAdminRole.add_to_policy(iam.PolicyStatement( 28 | resources=["*"], 29 | actions=[ 30 | "eks:Describe*", 31 | "eks:List*", 32 | "eks:AccessKubernetesApi", 33 | "ssm:GetParameter", 34 | "iam:ListRoles" 35 | ], 36 | )) 37 | Tags.of(self._clusterAdminRole).add( 38 | key='eks/%s/type' % cluster_name, 39 | value='admin-role' 40 | ) 41 | 42 | # Managed Node Group Instance Role 43 | _managed_node_managed_policies = ( 44 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEKSWorkerNodePolicy'), 45 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEKS_CNI_Policy'), 46 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEC2ContainerRegistryReadOnly'), 47 | iam.ManagedPolicy.from_aws_managed_policy_name('CloudWatchAgentServerPolicy'), 48 | ) 49 | self._managed_node_role = iam.Role(self,'NodeInstance-Role', 50 | role_name= cluster_name + '-NodeInstanceRole', 51 | path='/', 52 | assumed_by=iam.ServicePrincipal('ec2.amazonaws.com'), 53 | managed_policies=list(_managed_node_managed_policies), 54 | ) -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/network_sg.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import (Tags, aws_ec2 as ec2) 5 | from constructs import Construct 6 | import lib.util.get_aws_managed_prefix as custom 7 | 8 | class NetworkSgConst(Construct): 9 | 10 | @property 11 | def vpc(self): 12 | return self._vpc 13 | 14 | @property 15 | def alb_jhub_sg(self): 16 | return self._alb_jhub_sg 17 | @property 18 | def alb_argo_sg(self): 19 | return self._alb_argo_sg 20 | 21 | # @property 22 | # def efs_sg(self): 23 | # return self._eks_efs_sg 24 | 25 | 26 | def __init__(self,scope: Construct, id:str, eksname:str, codebucket: str, **kwargs) -> None: 27 | super().__init__(scope, id, **kwargs) 28 | 29 | # //*************************************************// 30 | # //******************* NETWORK ********************// 31 | # //************************************************// 32 | # create VPC 33 | self._vpc = ec2.Vpc(self, 'eksVpc',max_azs=2, nat_gateways=1) 34 | Tags.of(self._vpc).add('Name', eksname + 'EksVpc') 35 | 36 | # ALB security group for Jupyter & Argo 37 | prefixlist_peer=ec2.Peer.prefix_list( 38 | custom.AwsManagedPrefixList(self,'cr-getprefixId', 39 | custom.AwsManagedPrefixListProps(name='com.amazonaws.global.cloudfront.origin-facing') 40 | ).prefixlist_id 41 | ) 42 | self._alb_jhub_sg=ec2.SecurityGroup(self,'JupyterALBInboundSG', vpc=self._vpc,description='Security Group for Jupyter ALB') 43 | self._alb_argo_sg=ec2.SecurityGroup(self,'ArgoALBInboundSG', vpc=self._vpc,description='Security Group for Argo ALB') 44 | self._alb_jhub_sg.add_ingress_rule(prefixlist_peer,ec2.Port.tcp(port=80)) 45 | self._alb_argo_sg.add_ingress_rule(prefixlist_peer,ec2.Port.tcp(port=2746)) 46 | Tags.of(self._alb_jhub_sg).add('Name','SparkOnEKS-JhubSg') 47 | Tags.of(self._alb_argo_sg).add('Name','SparkOnEKS-ArgoSg') 48 | 49 | # VPC endpoint security group 50 | self._vpc_endpoint_sg = ec2.SecurityGroup(self,'EndpointSg', 51 | vpc=self._vpc, 52 | description='Security Group for Endpoint', 53 | ) 54 | self._vpc_endpoint_sg.add_ingress_rule(ec2.Peer.ipv4(self._vpc.vpc_cidr_block),ec2.Port.tcp(port=443)) 55 | Tags.of(self._vpc_endpoint_sg).add('Name','SparkOnEKS-VPCEndpointSg') 56 | 57 | # Add VPC endpoint 58 | self._vpc.add_gateway_endpoint("S3GatewayEndpoint", 59 | service=ec2.GatewayVpcEndpointAwsService.S3, 60 | subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC), 61 | ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)]) 62 | 63 | # self._vpc.add_interface_endpoint("EcrDockerEndpoint",service=ec2.InterfaceVpcEndpointAwsService.ECR_DOCKER, security_groups=[self._vpc_endpoint_sg]) 64 | self._vpc.add_interface_endpoint("CWLogsEndpoint", service=ec2.InterfaceVpcEndpointAwsService.CLOUDWATCH_LOGS,security_groups=[self._vpc_endpoint_sg]) 65 | self._vpc.add_interface_endpoint("AthenaEndpoint", service=ec2.InterfaceVpcEndpointAwsService.ATHENA,security_groups=[self._vpc_endpoint_sg]) 66 | self._vpc.add_interface_endpoint("KMSEndpoint", service=ec2.InterfaceVpcEndpointAwsService.KMS,security_groups=[self._vpc_endpoint_sg]) -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/s3_app_code.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import (RemovalPolicy, aws_s3 as s3, aws_s3_deployment as s3deploy) 5 | from constructs import Construct 6 | import os 7 | 8 | class S3AppCodeConst(Construct): 9 | 10 | @property 11 | def code_bucket(self): 12 | return self._code_bucket 13 | 14 | def __init__(self,scope: Construct, id: str, **kwargs,) -> None: 15 | super().__init__(scope, id, **kwargs) 16 | 17 | # Upload application code to S3 bucket 18 | artifact_bucket=s3.Bucket(self, id, 19 | block_public_access=s3.BlockPublicAccess.BLOCK_ALL, 20 | encryption=s3.BucketEncryption.KMS_MANAGED, 21 | removal_policy=RemovalPolicy.DESTROY, 22 | auto_delete_objects=True, 23 | access_control = s3.BucketAccessControl.LOG_DELIVERY_WRITE, 24 | object_ownership=s3.ObjectOwnership.OBJECT_WRITER 25 | ) 26 | 27 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0] 28 | s3deploy.BucketDeployment(self, "DeployCode", 29 | sources=[s3deploy.Source.asset(source_dir+'/deployment/app_code')], 30 | destination_bucket= artifact_bucket, 31 | destination_key_prefix="app_code" 32 | ) 33 | self._code_bucket = artifact_bucket.bucket_name 34 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/spark_permission.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import (aws_iam as iam) 5 | from constructs import Construct 6 | from aws_cdk.aws_eks import ICluster, KubernetesManifest 7 | from lib.util.manifest_reader import load_yaml_replace_var_local 8 | import os 9 | 10 | class SparkOnEksSAConst(Construct): 11 | 12 | @property 13 | def jupyter_sa(self): 14 | return self._jupyter_sa.service_account_name 15 | 16 | def __init__(self,scope: Construct, id: str, 17 | eks_cluster: ICluster, 18 | login_name: str, 19 | code_bucket: str, 20 | datalake_bucket: str, 21 | **kwargs,) -> None: 22 | super().__init__(scope, id, **kwargs) 23 | 24 | # //******************************************************************************************// 25 | # //************************ SETUP PERMISSION FOR ARC SPARK JOBS ****************************// 26 | # //******* create k8s namespace, service account, and IAM role for service account ********// 27 | # //***************************************************************************************// 28 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 29 | 30 | # create k8s namespace 31 | etl_ns = eks_cluster.add_manifest('SparkNamespace',{ 32 | "apiVersion": "v1", 33 | "kind": "Namespace", 34 | "metadata": { 35 | "name": "spark", 36 | "labels": {"name":"spark"} 37 | } 38 | } 39 | ) 40 | jupyter_ns = eks_cluster.add_manifest('jhubNamespace',{ 41 | "apiVersion": "v1", 42 | "kind": "Namespace", 43 | "metadata": { 44 | "name": "jupyter", 45 | "labels": {"name":"spark"} 46 | } 47 | } 48 | ) 49 | 50 | # create k8s service account 51 | self._etl_sa = eks_cluster.add_service_account('ETLSa', 52 | name='arcjob', 53 | namespace='spark' 54 | ) 55 | self._etl_sa.node.add_dependency(etl_ns) 56 | 57 | _etl_rb = KubernetesManifest(self,'ETLRoleBinding', 58 | cluster=eks_cluster, 59 | manifest=load_yaml_replace_var_local(source_dir+'/app_resources/etl-rbac.yaml', 60 | fields= { 61 | "{{MY_SA}}": self._etl_sa.service_account_name 62 | }, 63 | multi_resource=True) 64 | ) 65 | _etl_rb.node.add_dependency(self._etl_sa) 66 | 67 | self._jupyter_sa = 
eks_cluster.add_service_account('jhubServiceAcct', 68 | name=login_name, 69 | namespace='jupyter' 70 | ) 71 | self._jupyter_sa.node.add_dependency(jupyter_ns) 72 | 73 | # Associate AWS IAM role to K8s Service Account 74 | datalake_bucket=code_bucket if not datalake_bucket.strip() else datalake_bucket 75 | _bucket_setting={ 76 | "{{codeBucket}}": code_bucket, 77 | "{{datalakeBucket}}": datalake_bucket 78 | } 79 | _etl_iam = load_yaml_replace_var_local(source_dir+'/app_resources/etl-iam-role.yaml',fields=_bucket_setting) 80 | for statmnt in _etl_iam: 81 | self._etl_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 82 | self._jupyter_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 83 | 84 | # # //*************************************************************************************// 85 | # # //******************** SETUP PERMISSION FOR NATIVE SPARK JOBS **********************// 86 | # # //***********************************************************************************// 87 | self._spark_sa = eks_cluster.add_service_account('NativeSparkSa', 88 | name='nativejob', 89 | namespace='spark' 90 | ) 91 | self._spark_sa.node.add_dependency(etl_ns) 92 | 93 | _spark_rb = eks_cluster.add_manifest('sparkRoleBinding', 94 | load_yaml_replace_var_local(source_dir+'/app_resources/native-spark-rbac.yaml', 95 | fields= { 96 | "{{MY_SA}}": self._spark_sa.service_account_name 97 | }) 98 | ) 99 | _spark_rb.node.add_dependency(self._spark_sa) 100 | 101 | _native_spark_iam = load_yaml_replace_var_local(source_dir+'/app_resources/native-spark-iam-role.yaml',fields=_bucket_setting) 102 | for statmnt in _native_spark_iam: 103 | self._spark_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 104 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cloud_front_stack.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import ( 5 | NestedStack, Fn, 6 | aws_cloudfront_origins as origins, 7 | aws_cloudfront as cf, 8 | aws_elasticloadbalancingv2 as alb, 9 | aws_s3 as s3 10 | ) 11 | from constructs import Construct 12 | 13 | class NestedStack(NestedStack): 14 | 15 | @property 16 | def jhub_cf(self): 17 | return self._jhub_cf 18 | 19 | @property 20 | def argo_cf(self): 21 | return self._argo_cf 22 | 23 | def __init__(self, scope: Construct, id: str,logbucket: str,argo_alb_dns_name: str, jhub_alb_dns_name: str, **kwargs) -> None: 24 | super().__init__(scope, id, **kwargs) 25 | 26 | # //**********************************************************************************************************// 27 | # //*************************** Add CloudFront to enable HTTPS Endpoint (OPTIONAL) **************************// 28 | # //***** The recommended approach is to generate your own SSL certificate via AWS Certificate Manager ******// 29 | # //*************************** and attach it to the Application Load Balancer *****************************// 30 | # //*******************************************************************************************************// 31 | self._bucket=s3.Bucket.from_bucket_name(self,'cf_logbucket', logbucket) 32 | self._jhub_cf = add_distribution(self, 'jhub_dist', jhub_alb_dns_name, 80, self._bucket) 33 | self._argo_cf = add_distribution(self, 'argo_dist', argo_alb_dns_name, 2746, self._bucket) 34 | 35 | 36 | def add_distribution(scope: Construct, id: str, alb_dns_name: str, port: int, logbucket: s3.IBucket 37 | ) -> str: 38 | # Front the ALB with a CloudFront distribution and return the distribution's domain name (a string) 39 | load_balancer_arn=Fn.get_att(alb_dns_name,"DNSName") 40 | security_group_id=Fn.get_att(alb_dns_name,"SecurityGroups") 41 | 42 | alb2 = alb.ApplicationLoadBalancer.from_application_load_balancer_attributes(scope, id, 43 | load_balancer_arn=load_balancer_arn.to_string(), 44 | security_group_id=security_group_id.to_string(), 45 | load_balancer_dns_name=alb_dns_name 46 | ) 47 | _origin = origins.LoadBalancerV2Origin(alb2, 48 | http_port=port, 49 | protocol_policy=cf.OriginProtocolPolicy.HTTP_ONLY 50 | ) 51 | dist = cf.Distribution(scope, "CF-"+id, 52 | default_behavior={ 53 | "origin": _origin, 54 | "allowed_methods": cf.AllowedMethods.ALLOW_ALL, 55 | "cache_policy": cf.CachePolicy.CACHING_DISABLED, 56 | "origin_request_policy": cf.OriginRequestPolicy.ALL_VIEWER, 57 | "viewer_protocol_policy": cf.ViewerProtocolPolicy.REDIRECT_TO_HTTPS 58 | }, 59 | minimum_protocol_version=cf.SecurityPolicyProtocol.TLS_V1_2_2019, 60 | enable_logging=True, 61 | log_bucket=logbucket 62 | ) 63 | return dist.distribution_domain_name 64 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/spark_on_eks_stack.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import (Stack, Duration, RemovalPolicy, Aws, Fn, CfnParameter, aws_eks as eks,aws_secretsmanager as secmger,aws_kms as kms) 5 | from constructs import Construct 6 | from lib.cdk_infra.network_sg import NetworkSgConst 7 | from lib.cdk_infra.iam_roles import IamConst 8 | from lib.cdk_infra.eks_cluster import EksConst 9 | from lib.cdk_infra.eks_service_account import EksSAConst 10 | from lib.cdk_infra.eks_base_app import EksBaseAppConst 11 | from lib.cdk_infra.s3_app_code import S3AppCodeConst 12 | from lib.cdk_infra.spark_permission import SparkOnEksSAConst 13 | from lib.util.manifest_reader import * 14 | import json,os 15 | 16 | class SparkOnEksStack(Stack): 17 | 18 | @property 19 | def code_bucket(self): 20 | return self.app_s3.code_bucket 21 | 22 | @property 23 | def argo_url(self): 24 | return self._argo_alb.value 25 | 26 | @property 27 | def jhub_url(self): 28 | return self._jhub_alb.value 29 | 30 | def __init__(self, scope: Construct, id: str, eksname: str, **kwargs) -> None: 31 | super().__init__(scope, id, **kwargs) 32 | 33 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 34 | 35 | # Cloudformation input params 36 | datalake_bucket = CfnParameter(self, "datalakebucket", type="String", 37 | description="Your existing S3 bucket to be accessed by Jupyter Notebook and ETL job. Default: blank", 38 | default="" 39 | ) 40 | login_name="sparkoneks" 41 | # login_name = CfnParameter(self, "jhubuser", type="String", 42 | # description="Your username login to jupyter hub.", 43 | # default="sparkoneks" 44 | # ) 45 | 46 | # Auto-generate a user login in secrets manager 47 | key = kms.Key(self, 'KMSKey',removal_policy=RemovalPolicy.DESTROY,enable_key_rotation=True) 48 | key.add_alias("alias/secretsManager") 49 | jhub_secret = secmger.Secret(self, 'jHubPwd', 50 | generate_secret_string=secmger.SecretStringGenerator( 51 | exclude_punctuation=True, 52 | secret_string_template=json.dumps({'username': login_name}), 53 | # secret_string_template=json.dumps({'username': login_name.value_as_string}), 54 | generate_string_key="password"), 55 | removal_policy=RemovalPolicy.DESTROY, 56 | encryption_key=key 57 | ) 58 | 59 | # A new bucket to store app code and access logs 60 | self.app_s3 = S3AppCodeConst(self,'appcode') 61 | 62 | # 1. Setup EKS base infrastructure 63 | network_sg = NetworkSgConst(self,'network-sg', eksname, self.app_s3.code_bucket) 64 | iam = IamConst(self,'iam_roles', eksname) 65 | eks_cluster = EksConst(self,'eks_cluster', eksname, network_sg.vpc, iam.managed_node_role, iam.admin_role) 66 | EksSAConst(self, 'eks_sa', eks_cluster.my_cluster, jhub_secret) 67 | base_app=EksBaseAppConst(self, 'eks_base_app', eks_cluster.my_cluster) 68 | 69 | # 2. Setup Spark application access control 70 | app_security = SparkOnEksSAConst(self,'spark_service_account', 71 | eks_cluster.my_cluster, 72 | login_name, 73 | # login_name.value_as_string, 74 | self.app_s3.code_bucket, 75 | datalake_bucket.value_as_string 76 | ) 77 | app_security.node.add_dependency(base_app.secret_created) 78 | # 3. 
Install Arc Jupyter notebook as the Spark ETL IDE 79 | jhub_install= eks_cluster.my_cluster.add_helm_chart('JHubChart', 80 | chart='jupyterhub', 81 | repository='https://jupyterhub.github.io/helm-chart', 82 | release='jupyterhub', 83 | version='1.2.0', 84 | namespace='jupyter', 85 | create_namespace=False, 86 | values=load_yaml_replace_var_local(source_dir+'/app_resources/jupyter-values.yaml', 87 | fields={ 88 | "{{codeBucket}}": self.app_s3.code_bucket, 89 | "{{region}}": Aws.REGION 90 | }) 91 | ) 92 | jhub_install.node.add_dependency(app_security) 93 | 94 | # get Arc Jupyter login from secrets manager 95 | name_parts= Fn.split('-',jhub_secret.secret_name) 96 | name_no_suffix=Fn.join('-',[Fn.select(0, name_parts), Fn.select(1, name_parts)]) 97 | config_hub = eks.KubernetesManifest(self,'JHubConfig', 98 | cluster=eks_cluster.my_cluster, 99 | manifest=load_yaml_replace_var_local(source_dir+'/app_resources/jupyter-config.yaml', 100 | fields= { 101 | "{{MY_SA}}": app_security.jupyter_sa, 102 | "{{REGION}}": Aws.REGION, 103 | "{{SECRET_NAME}}": name_no_suffix, 104 | "{{INBOUND_SG}}": network_sg.alb_jhub_sg.security_group_id 105 | }, 106 | multi_resource=True) 107 | ) 108 | config_hub.node.add_dependency(jhub_install) 109 | 110 | # 4. Install ETL orchestrator - Argo 111 | # can be replaced by another workflow tool, e.g. Airflow 112 | argo_install = eks_cluster.my_cluster.add_helm_chart('ARGOChart', 113 | chart='argo-workflows', 114 | repository='https://argoproj.github.io/argo-helm', 115 | release='argo', 116 | version='0.40.7', 117 | namespace='argo', 118 | create_namespace=True, 119 | values=load_yaml_replace_var_local(source_dir+'/app_resources/argo-values.yaml', 120 | fields= { 121 | "{{INBOUND_SG}}": network_sg.alb_argo_sg.security_group_id 122 | }) 123 | ) 124 | argo_install.node.add_dependency(config_hub) 125 | # Create a Spark workflow template with different T-shirt sizes 126 | submit_tmpl = eks_cluster.my_cluster.add_manifest('SubmitSparkWrktmpl', 127 | load_yaml_local(source_dir+'/app_resources/spark-template.yaml') 128 | ) 129 | submit_tmpl.node.add_dependency(argo_install) 130 | 131 | # 5. (OPTIONAL) Retrieve the ALB DNS name to enable CloudFront in the following nested stack. 132 | # It is recommended to remove the CloudFront component 133 | # and set up a TLS certificate with your own domain name. 
134 | self._jhub_alb=eks.KubernetesObjectValue(self, 'jhubALB', 135 | cluster=eks_cluster.my_cluster, 136 | json_path='..status.loadBalancer.ingress[0].hostname', 137 | object_type='ingress.networking', 138 | object_name='jupyterhub', 139 | object_namespace='jupyter', 140 | timeout=Duration.minutes(10) 141 | ) 142 | self._jhub_alb.node.add_dependency(config_hub) 143 | self._argo_alb = eks.KubernetesObjectValue(self, 'argoALB', 144 | cluster=eks_cluster.my_cluster, 145 | json_path='..status.loadBalancer.ingress[0].hostname', 146 | object_type='ingress.networking', 147 | object_name='argo-argo-workflows-server', 148 | object_namespace='argo', 149 | timeout=Duration.minutes(10) 150 | ) 151 | self._argo_alb.node.add_dependency(argo_install) 152 | 153 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/util/get_aws_managed_prefix.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (Aws, aws_ec2 as ec2,aws_iam as iam, Fn) 2 | from aws_cdk.custom_resources import ( 3 | AwsCustomResource, 4 | AwsCustomResourcePolicy, 5 | PhysicalResourceId, 6 | AwsSdkCall 7 | ) 8 | from constructs import Construct 9 | 10 | class AwsManagedPrefixListProps: 11 | def __init__(self, name: str): 12 | """ 13 | Name of the AWS managed prefix list. 14 | See: https://docs.aws.amazon.com/vpc/latest/userguide/working-with-aws-managed-prefix-lists.html#available-aws-managed-prefix-lists 15 | eg. com.amazonaws.global.cloudfront.origin-facing 16 | """ 17 | self.name = name 18 | 19 | class AwsManagedPrefixList(Construct): 20 | def __init__(self, scope: Construct, id: str, props: AwsManagedPrefixListProps): 21 | super().__init__(scope, id) 22 | res = AwsCustomResource( 23 | self, 'AWSCustomResource', 24 | on_create=self.create(props), 25 | policy=AwsCustomResourcePolicy.from_statements([ 26 | iam.PolicyStatement( 27 | effect=iam.Effect.ALLOW, 28 | actions=['ec2:DescribeManagedPrefixLists'], 29 | resources=['*'], 30 | ), 31 | ]) 32 | ) 33 | self.prefixlist_id=res.get_response_field("PrefixLists.0.PrefixListId") 34 | 35 | def create(self, props): 36 | custom_params = { 37 | 'Filters': [ 38 | { 39 | 'Name': 'prefix-list-name', 40 | 'Values': [props.name], 41 | }, 42 | ] 43 | } 44 | 45 | return AwsSdkCall( 46 | service='EC2', 47 | action='describeManagedPrefixLists', 48 | parameters=custom_params, 49 | physical_resource_id=PhysicalResourceId.of(f"{id}-{Fn.select(0, Fn.split(':', self.node.addr))}"), 50 | region=Aws.REGION 51 | ) -------------------------------------------------------------------------------- /spark-on-eks/source/lib/util/manifest_reader.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | import yaml 5 | import urllib.request as request 6 | import os.path as path 7 | import sys 8 | 9 | def load_yaml_remotely(url, multi_resource=False): 10 | try: 11 | file_to_parse = request.urlopen(url) 12 | if multi_resource: 13 | yaml_data = list(yaml.full_load_all(file_to_parse)) 14 | else: 15 | yaml_data = yaml.full_load(file_to_parse) 16 | # print(yaml_data) 17 | except: 18 | print("Cannot read yaml config file {}, check formatting." 
19 | "".format(file_to_parse)) 20 | sys.exit(1) 21 | 22 | return yaml_data 23 | 24 | def load_yaml_local(yaml_file, multi_resource=False): 25 | 26 | file_to_parse=path.join(path.dirname(__file__), yaml_file) 27 | if not path.exists(file_to_parse): 28 | print("The file {} does not exist" 29 | "".format(file_to_parse)) 30 | sys.exit(1) 31 | 32 | try: 33 | with open(file_to_parse, 'r') as yaml_stream: 34 | if multi_resource: 35 | yaml_data = list(yaml.full_load_all(yaml_stream)) 36 | else: 37 | yaml_data = yaml.full_load(yaml_stream) 38 | # print(yaml_data) 39 | except: 40 | print("Cannot read yaml config file {}, check formatting." 41 | "".format(file_to_parse)) 42 | sys.exit(1) 43 | 44 | return yaml_data 45 | 46 | def load_yaml_replace_var_remotely(url, fields, multi_resource=False): 47 | try: 48 | with request.urlopen(url) as f: 49 | file_to_replace = f.read().decode('utf-8') 50 | for searchwrd,replwrd in fields.items(): 51 | file_to_replace = file_to_replace.replace(searchwrd, replwrd) 52 | 53 | if multi_resource: 54 | yaml_data = list(yaml.full_load_all(file_to_replace)) 55 | else: 56 | yaml_data = yaml.full_load(file_to_replace) 57 | # print(yaml_data) 58 | except request.URLError as e: 59 | print(e.reason) 60 | sys.exit(1) 61 | 62 | return yaml_data 63 | 64 | 65 | def load_yaml_replace_var_local(yaml_file, fields, multi_resource=False, write_output=False): 66 | 67 | file_to_replace=path.join(path.dirname(__file__), yaml_file) 68 | if not path.exists(file_to_replace): 69 | print("The file {} does not exist" 70 | "".format(file_to_replace)) 71 | sys.exit(1) 72 | 73 | try: 74 | with open(file_to_replace, 'r') as f: 75 | filedata = f.read() 76 | 77 | for searchwrd, replwrd in fields.items(): 78 | filedata = filedata.replace(searchwrd, replwrd) 79 | if multi_resource: 80 | yaml_data = list(yaml.full_load_all(filedata)) 81 | else: 82 | yaml_data = yaml.full_load(filedata) 83 | if write_output: 84 | with open(file_to_replace, "w") as f: 85 | yaml.dump(yaml_data, f, default_flow_style=False, allow_unicode = True, sort_keys=False) 86 | 87 | # print(yaml_data) 88 | except request.URLError as e: 89 | print(e.reason) 90 | sys.exit(1) 91 | 92 | return yaml_data 93 | -------------------------------------------------------------------------------- /spark-on-eks/source/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "aws-cdk": "^2.105.0" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /spark-on-eks/source/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | pytest -------------------------------------------------------------------------------- /spark-on-eks/source/setup.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | import setuptools 4 | 5 | try: 6 | with open("../README.md") as fp: 7 | long_description = fp.read() 8 | except IOError as e: 9 | long_description = '' 10 | 11 | setuptools.setup( 12 | name="sql-based-etl", 13 | version="3.0.0", 14 | 15 | description="A CDK v2 Python app for SQL-based ETL", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | 19 | author="meloyang", 20 | 21 | package_dir={"": "./"}, 22 | packages=setuptools.find_packages(where="./"), 23 | 24 | install_requires=[ 25 | "aws-cdk-lib==2.105.0", 26 | "aws-cdk.lambda-layer-kubectl-v27==2.0.0", 27 | "constructs>=10.0.0,<11.0.0", 28 | "pyyaml==6.0.1", 29 | ], 30 | 31 | python_requires=">=3.8", 32 | 33 | classifiers=[ 34 | "Development Status :: 4 - Beta", 35 | 36 | "Intended Audience :: Developers", 37 | 38 | "License :: OSI Approved :: MIT License", 39 | 40 | "Programming Language :: JavaScript", 41 | "Programming Language :: Python :: 3 :: Only", 42 | "Programming Language :: Python :: 3.8", 43 | "Programming Language :: Python :: 3.9", 44 | "Programming Language :: Python :: 3.10", 45 | "Programming Language :: Python :: 3.11", 46 | 47 | "Topic :: Software Development :: Code Generators", 48 | "Topic :: Utilities", 49 | 50 | "Typing :: Typed", 51 | ], 52 | ) 53 | --------------------------------------------------------------------------------