├── .gitignore ├── CHANGELOG.md ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE.txt ├── README.md ├── THIRD-PARTY-NOTICES.txt ├── emr-on-eks ├── README.md ├── deprovision.sh ├── green_taxi_load.ipynb ├── green_taxi_load.ipynb.license ├── green_taxi_schema.json ├── green_taxi_schema.json.license ├── provision.sh └── submit_arc_job.sh └── spark-on-eks ├── README.md ├── deployment ├── app_code │ ├── data │ │ ├── initial_contacts.csv │ │ └── update_contacts.csv │ ├── job │ │ ├── delta_load.ipynb │ │ ├── driver-pod-template.yaml │ │ ├── executor-pod-template.yaml │ │ ├── green_taxi_load.ipynb │ │ ├── initial_load.ipynb │ │ ├── msk_consumer.py │ │ ├── scd2_merge.ipynb │ │ └── wordcount.py │ ├── meta │ │ ├── contact_meta_0.json │ │ └── green_taxi_schema.json │ └── sql │ │ ├── add_calc_field_for_scd2.sql │ │ ├── create_table_contact.sql │ │ └── sqlvalidate_errors.sql ├── build-s3-dist.sh ├── cdk-solution-helper │ ├── README.md │ ├── index.js │ └── package.json ├── delete_all.sh └── post-deployment.sh ├── images ├── 00-deploy-to-aws.png ├── 3-argo-job-dependency.png ├── 3-argo-log.png ├── 3-argo-sidemenu.png ├── 4-auto-scaling.png ├── 4-k8s-retry.png ├── 4-spot-console.png ├── architecture.png ├── driver_interruption_test.gif ├── executor_interruption_test.png ├── fake_data.gif ├── run_jupyter.gif ├── submit_job_in_argo.gif ├── submit_native_spark.gif └── two_architecture.png └── source ├── app.py ├── app_resources ├── alb-iam-role.yaml ├── alb-values.yaml ├── argo-values.yaml ├── autoscaler-iam-role.yaml ├── autoscaler-values.yaml ├── etl-iam-role.yaml ├── etl-rbac.yaml ├── ex-secret-iam-role.yaml ├── ex-secret-values.yaml ├── jupyter-config.yaml ├── jupyter-values.yaml ├── native-spark-iam-role.yaml ├── native-spark-rbac.yaml ├── spark-operator-values.yaml └── spark-template.yaml ├── cdk.json ├── example ├── native-spark-job-scheduler.yaml ├── notebook │ ├── Spark_streaming_job.ipynb │ ├── nyctaxi-job.ipynb │ └── scd2-job.ipynb ├── nyctaxi-job-scheduler.yaml ├── scd2-job-scheduler.yaml └── test │ ├── TEST-arc-jupyter.yaml │ └── TEST-cron-job-scheduler.yaml ├── lib ├── cdk_infra │ ├── eks_base_app.py │ ├── eks_cluster.py │ ├── eks_service_account.py │ ├── iam_roles.py │ ├── network_sg.py │ ├── s3_app_code.py │ └── spark_permission.py ├── cloud_front_stack.py ├── spark_on_eks_stack.py └── util │ ├── get_aws_managed_prefix.py │ └── manifest_reader.py ├── package.json ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # SPDX-FileCopyrightText: 2021 Amazon.com, Inc. or its affiliates. 
2 | # 3 | # SPDX-License-Identifier: MIT-0 4 | # Git 5 | .git 6 | 7 | ### VisualStudioCode ### 8 | .vscode/* 9 | ### IntelliJ/ PyCharm ### 10 | .idea/* 11 | # System Files 12 | **/.DS_Store 13 | # CDK 14 | **/cdk.out 15 | **/cdk.context.json 16 | *.swp 17 | **/node_modules 18 | **/package-lock.json 19 | 20 | # compiled output 21 | **/global-s3-assets 22 | **/regional-s3-assets 23 | **/open-source 24 | 25 | ### Python ### 26 | # Byte-compiled / optimized / DLL files 27 | __pycache__/ 28 | *.py[cod] 29 | *$py.class 30 | # Python Distribution / packaging 31 | *.egg-info/ 32 | *.egg 33 | # Python Virtual Environments 34 | **/venv* 35 | **/.venv* 36 | **/.env 37 | ## Python Testing 38 | .pytest_cache 39 | **/.pytest_cache 40 | **/.coverage 41 | **/coverage-reports/ -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Change Log 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), 5 | and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). 6 | 7 | ## [3.0.0] - 2023-12-14 8 | ### Added 9 | - Added a compulsory CFN input parameter to restrict inbound CIDRs for ALB security group 10 | ## [2.0.0] - 2021-11-19 11 | ### Upgrade 12 | - upgrade the entire cdk code from version 1 to version 2 13 | ## [1.0.0] - 2020-12-04 14 | ### Added 15 | - All files, initial version 16 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. 
You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | 16 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SQL-based ETL with Spark on EKS 2 | 3 | We introduce a quality-aware design to increase data processing productivity, by leveraging the open-source [Arc data framework](https://arc.tripl.ai/) for a user-centered, declarative ETL solution. We take into account the needs and expected skills of customers in data analytics, and accelerate their interaction with ETL practice in order to foster simplicity while maximizing efficiency. 4 | 5 | The sample provides two ways of running the solution shown in the architecture diagram: 6 | 1. Spark on EKS with the Argo Workflows tool 7 | 2. [EMR on EKS](https://aws.amazon.com/emr/features/eks/) 8 | 9 | ![](/spark-on-eks/images/two_architecture.png) 10 | 11 | ### Test job in Jupyter 12 | ![](/spark-on-eks/images/run_jupyter.gif) 13 | 14 | 15 | ### Test Spark driver self-recovery (100% Spot) 16 | ![](/spark-on-eks/images/driver_interruption_test.gif) 17 | 18 | ### Submit a Spark job with the Argo tool 19 | ![](/spark-on-eks/images/submit_job_in_argo.gif) 20 | 21 | 22 | ## Prerequisites 23 | To run the sample solution on a local machine, you should have the following prerequisites: 24 | 1. Python 3.6 or later. Download Python [here](https://www.python.org/downloads/). 25 | 2. AWS CLI version 1. 26 | Windows: [MSI installer](https://docs.aws.amazon.com/cli/latest/userguide/install-windows.html#install-msi-on-windows) 27 | Linux, macOS or Unix: [Bundled installer](https://docs.aws.amazon.com/cli/latest/userguide/install-macos.html#install-macosos-bundled) 28 | 3. The AWS CLI is configured to communicate with services in your deployment account. Otherwise, either set your profile via `export AWS_PROFILE=`, or run the following command to set up your AWS account access. 29 | ```bash 30 | aws configure 31 | ``` 32 | If you don’t want to install anything on your computer, use [AWS CloudShell](https://aws.amazon.com/cloudshell/), a browser-based shell that makes it easy to run scripts with the AWS Command Line Interface (AWS CLI). 33 | 34 | ## Clone the project 35 | Download the sample code either to your computer or to your [AWS CloudShell Console](https://console.aws.amazon.com/cloudshell/home?region=us-east-1). 36 | 37 | ```bash 38 | git clone https://github.com/aws-samples/sql-based-etl-on-amazon-eks.git 39 | cd sql-based-etl-on-amazon-eks 40 | ``` 41 | 42 | ## Deploy Infrastructure 43 | 44 | The provisioning takes about 30 minutes to complete. See the `Troubleshooting` section if you have any problems during the deployment. 45 | 46 | The example solution provides two options to submit ETL jobs. See the detailed deployment instructions: 47 | 48 | 1. [Spark on EKS](/spark-on-eks/README.md) 49 | 2. [EMR on EKS](/emr-on-eks/README.md) 50 | 51 | 52 | ## Troubleshooting 53 | 1. If you see the error `Credentials were refreshed, but the refreshed credentials are still expired` in AWS CloudShell, click the **Actions** button and create a `New tab`. 54 | 55 | 2. If you see the issue `[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: unable to get local issuer certificate (_ssl.c:1123)`, it most likely means there is no default certificate authority for your Python installation on macOS. Refer to this [answer](https://stackoverflow.com/questions/52805115/0nd); running `Install Certificates.command` should fix your local environment. 
Otherwise, use [Cloud9](https://aws.amazon.com/cloud9/details/) to deploy the CDK instead. 56 | 57 | 58 | ## Security 59 | 60 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 61 | 62 | ## License 63 | 64 | This library is licensed under the MIT-0 License. See the [LICENSE](LICENSE.txt) file. -------------------------------------------------------------------------------- /THIRD-PARTY-NOTICES.txt: -------------------------------------------------------------------------------- 1 | ** aws-cdk -- v1.96.0 -- https://github.com/aws/aws-cdk -- Apache-2.0 2 | ** cdk-solution-init-pkg; version 1.0.0 -- https://aws.amazon.com/solutions/ -- Apahe-2.0 3 | ** Arc -- v3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 -- https://arc.tripl.ai/ -- MIT License 4 | ** Arc Jupyter - v3.14.2_scala_2.12_hadoop_3.2.0_1.1.0 -- https://github.com/tripl-ai/arc-jupyter -- MIT License 5 | ** argo-workflows -- v3.5.4 -- https://github.com/argoproj/argo-helm -- Apache-2.0 6 | ** JupyterHub -- v1.5.0 -- https://jupyterhub.github.io/helm-chart/ -- revised BSD license 7 | ** k8s-cluster-autoscaler -- v1.27.2 -- https://github.com/kubernetes/autoscaler -- Apache-2.0 8 | ** amazon-cloudwatch-container-insights -- latest version -- https://github.com/aws-samples/amazon-cloudwatch-container-insights -- MIT-0 9 | ** aws-load-balancer-controller -- v2.5.4 -- https://github.com/aws/eks-charts/ -- Apache-2.0 10 | ** kubernetes-external-secrets -- v8.5.5 -- https://github.com/external-secrets/kubernetes-external-secrets -- MIT License 11 | ** spark-on-k8s-operator -- v1beta2-1.2.3-3.1.1 -- https://github.com/GoogleCloudPlatform/spark-on-k8s-operator -- Apache-2.0 12 | 13 | Apache License 14 | 15 | Version 2.0, January 2004 16 | 17 | http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND 18 | DISTRIBUTION 19 | 20 | 1. Definitions. 21 | 22 | "License" shall mean the terms and conditions for use, reproduction, and 23 | distribution as defined by Sections 1 through 9 of this document. 24 | 25 | "Licensor" shall mean the copyright owner or entity authorized by the 26 | copyright owner that is granting the License. 27 | 28 | "Legal Entity" shall mean the union of the acting entity and all other 29 | entities that control, are controlled by, or are under common control 30 | with that entity. For the purposes of this definition, "control" means 31 | (i) the power, direct or indirect, to cause the direction or management 32 | of such entity, whether by contract or otherwise, or (ii) ownership of 33 | fifty percent (50%) or more of the outstanding shares, or (iii) 34 | beneficial ownership of such entity. 35 | 36 | "You" (or "Your") shall mean an individual or Legal Entity exercising 37 | permissions granted by this License. 38 | 39 | "Source" form shall mean the preferred form for making modifications, 40 | including but not limited to software source code, documentation source, 41 | and configuration files. 42 | 43 | "Object" form shall mean any form resulting from mechanical 44 | transformation or translation of a Source form, including but not limited 45 | to compiled object code, generated documentation, and conversions to 46 | other media types. 47 | 48 | "Work" shall mean the work of authorship, whether in Source or Object 49 | form, made available under the License, as indicated by a copyright 50 | notice that is included in or attached to the work (an example is 51 | provided in the Appendix below). 
52 | 53 | "Derivative Works" shall mean any work, whether in Source or Object form, 54 | that is based on (or derived from) the Work and for which the editorial 55 | revisions, annotations, elaborations, or other modifications represent, 56 | as a whole, an original work of authorship. For the purposes of this 57 | License, Derivative Works shall not include works that remain separable 58 | from, or merely link (or bind by name) to the interfaces of, the Work and 59 | Derivative Works thereof. 60 | 61 | "Contribution" shall mean any work of authorship, including the original 62 | version of the Work and any modifications or additions to that Work or 63 | Derivative Works thereof, that is intentionally submitted to Licensor for 64 | inclusion in the Work by the copyright owner or by an individual or Legal 65 | Entity authorized to submit on behalf of the copyright owner. For the 66 | purposes of this definition, "submitted" means any form of electronic, 67 | verbal, or written communication sent to the Licensor or its 68 | representatives, including but not limited to communication on electronic 69 | mailing lists, source code control systems, and issue tracking systems 70 | that are managed by, or on behalf of, the Licensor for the purpose of 71 | discussing and improving the Work, but excluding communication that is 72 | conspicuously marked or otherwise designated in writing by the copyright 73 | owner as "Not a Contribution." 74 | 75 | "Contributor" shall mean Licensor and any individual or Legal Entity on 76 | behalf of whom a Contribution has been received by Licensor and 77 | subsequently incorporated within the Work. 78 | 79 | 2. Grant of Copyright License. Subject to the terms and conditions of this 80 | License, each Contributor hereby grants to You a perpetual, worldwide, 81 | non-exclusive, no-charge, royalty-free, irrevocable copyright license to 82 | reproduce, prepare Derivative Works of, publicly display, publicly perform, 83 | sublicense, and distribute the Work and such Derivative Works in Source or 84 | Object form. 85 | 86 | 3. Grant of Patent License. Subject to the terms and conditions of this 87 | License, each Contributor hereby grants to You a perpetual, worldwide, 88 | non-exclusive, no-charge, royalty-free, irrevocable (except as stated in 89 | this section) patent license to make, have made, use, offer to sell, sell, 90 | import, and otherwise transfer the Work, where such license applies only to 91 | those patent claims licensable by such Contributor that are necessarily 92 | infringed by their Contribution(s) alone or by combination of their 93 | Contribution(s) with the Work to which such Contribution(s) was submitted. 94 | If You institute patent litigation against any entity (including a 95 | cross-claim or counterclaim in a lawsuit) alleging that the Work or a 96 | Contribution incorporated within the Work constitutes direct or contributory 97 | patent infringement, then any patent licenses granted to You under this 98 | License for that Work shall terminate as of the date such litigation is 99 | filed. 100 | 101 | 4. Redistribution. 
You may reproduce and distribute copies of the Work or 102 | Derivative Works thereof in any medium, with or without modifications, and 103 | in Source or Object form, provided that You meet the following conditions: 104 | 105 | (a) You must give any other recipients of the Work or Derivative Works a 106 | copy of this License; and 107 | 108 | (b) You must cause any modified files to carry prominent notices stating 109 | that You changed the files; and 110 | 111 | (c) You must retain, in the Source form of any Derivative Works that You 112 | distribute, all copyright, patent, trademark, and attribution notices 113 | from the Source form of the Work, excluding those notices that do not 114 | pertain to any part of the Derivative Works; and 115 | 116 | (d) If the Work includes a "NOTICE" text file as part of its 117 | distribution, then any Derivative Works that You distribute must include 118 | a readable copy of the attribution notices contained within such NOTICE 119 | file, excluding those notices that do not pertain to any part of the 120 | Derivative Works, in at least one of the following places: within a 121 | NOTICE text file distributed as part of the Derivative Works; within the 122 | Source form or documentation, if provided along with the Derivative 123 | Works; or, within a display generated by the Derivative Works, if and 124 | wherever such third-party notices normally appear. The contents of the 125 | NOTICE file are for informational purposes only and do not modify the 126 | License. You may add Your own attribution notices within Derivative Works 127 | that You distribute, alongside or as an addendum to the NOTICE text from 128 | the Work, provided that such additional attribution notices cannot be 129 | construed as modifying the License. 130 | 131 | You may add Your own copyright statement to Your modifications and may 132 | provide additional or different license terms and conditions for use, 133 | reproduction, or distribution of Your modifications, or for any such 134 | Derivative Works as a whole, provided Your use, reproduction, and 135 | distribution of the Work otherwise complies with the conditions stated in 136 | this License. 137 | 138 | 5. Submission of Contributions. Unless You explicitly state otherwise, any 139 | Contribution intentionally submitted for inclusion in the Work by You to the 140 | Licensor shall be under the terms and conditions of this License, without 141 | any additional terms or conditions. Notwithstanding the above, nothing 142 | herein shall supersede or modify the terms of any separate license agreement 143 | you may have executed with Licensor regarding such Contributions. 144 | 145 | 6. Trademarks. This License does not grant permission to use the trade 146 | names, trademarks, service marks, or product names of the Licensor, except 147 | as required for reasonable and customary use in describing the origin of the 148 | Work and reproducing the content of the NOTICE file. 149 | 150 | 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in 151 | writing, Licensor provides the Work (and each Contributor provides its 152 | Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 153 | KIND, either express or implied, including, without limitation, any 154 | warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or 155 | FITNESS FOR A PARTICULAR PURPOSE. 
You are solely responsible for determining 156 | the appropriateness of using or redistributing the Work and assume any risks 157 | associated with Your exercise of permissions under this License. 158 | 159 | 8. Limitation of Liability. In no event and under no legal theory, whether 160 | in tort (including negligence), contract, or otherwise, unless required by 161 | applicable law (such as deliberate and grossly negligent acts) or agreed to 162 | in writing, shall any Contributor be liable to You for damages, including 163 | any direct, indirect, special, incidental, or consequential damages of any 164 | character arising as a result of this License or out of the use or inability 165 | to use the Work (including but not limited to damages for loss of goodwill, 166 | work stoppage, computer failure or malfunction, or any and all other 167 | commercial damages or losses), even if such Contributor has been advised of 168 | the possibility of such damages. 169 | 170 | 9. Accepting Warranty or Additional Liability. While redistributing the Work 171 | or Derivative Works thereof, You may choose to offer, and charge a fee for, 172 | acceptance of support, warranty, indemnity, or other liability obligations 173 | and/or rights consistent with this License. However, in accepting such 174 | obligations, You may act only on Your own behalf and on Your sole 175 | responsibility, not on behalf of any other Contributor, and only if You 176 | agree to indemnify, defend, and hold each Contributor harmless for any 177 | liability incurred by, or claims asserted against, such Contributor by 178 | reason of your accepting any such warranty or additional liability. END OF 179 | TERMS AND CONDITIONS 180 | 181 | APPENDIX: How to apply the Apache License to your work. 182 | 183 | To apply the Apache License to your work, attach the following boilerplate 184 | notice, with the fields enclosed by brackets "[]" replaced with your own 185 | identifying information. (Don't include the brackets!) The text should be 186 | enclosed in the appropriate comment syntax for the file format. We also 187 | recommend that a file or class name and description of purpose be included on 188 | the same "printed page" as the copyright notice for easier identification 189 | within third-party archives. 190 | 191 | Copyright [yyyy] [name of copyright owner] 192 | 193 | Licensed under the Apache License, Version 2.0 (the "License"); 194 | 195 | you may not use this file except in compliance with the License. 196 | 197 | You may obtain a copy of the License at 198 | 199 | http://www.apache.org/licenses/LICENSE-2.0 200 | 201 | Unless required by applicable law or agreed to in writing, software 202 | 203 | distributed under the License is distributed on an "AS IS" BASIS, 204 | 205 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 206 | 207 | See the License for the specific language governing permissions and 208 | 209 | limitations under the License. 210 | 211 | * For cdk-solution-init-pkg see also this required NOTICE: 212 | Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 213 | Licensed under the Apache License Version 2.0 (the "License"). You may not 214 | use this file except 215 | in compliance with the License. A copy of the License is located at 216 | http://www.apache.org/licenses/ 217 | or in the "license" file accompanying this file. This file is distributed 218 | on an "AS IS" BASIS, 219 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. 
See the 220 | License for the 221 | specific language governing permissions and limitations under the License. -------------------------------------------------------------------------------- /emr-on-eks/README.md: -------------------------------------------------------------------------------- 1 | 6 | 7 | # Arc ETL framework on EMR on EKS 8 | AWS launched [EMR on EKS](https://aws.amazon.com/emr/features/eks/), and this sample demonstrates an end-to-end process to provision an EKS cluster and execute a Spark ETL job defined as a [Jupyter notebook](green_taxi_load.ipynb) using the [Arc framework](https://arc.tripl.ai/getting-started/). 9 | 10 | # Provisioning 11 | 1. Open AWS CloudShell in us-east-1: [link to AWS CloudShell](https://console.aws.amazon.com/cloudshell/home?region=us-east-1) 12 | 2. Run the following command to provision a new EKS cluster `eks-cluster` backed by Fargate and build a virtual EMR cluster `emr-on-eks-cluster`: 13 | ```bash 14 | curl https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/main/emr-on-eks/provision.sh | bash 15 | ``` 16 | 3. Once provisioning is complete (~20 min), run the following command to submit a new Spark job on the virtual EMR cluster: 17 | ```bash 18 | curl https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/main/emr-on-eks/submit_arc_job.sh | bash 19 | ``` 20 | The sample job will create an output S3 bucket, load the [TLC green taxi trip records](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) from the public `s3://nyc-tlc/csv_backup/green_tripdata*.csv` location, apply the schema, convert the data into Parquet, and store it in the output S3 bucket. 21 | 22 | The job is defined as a [Jupyter notebook, green_taxi_load.ipynb](green_taxi_load.ipynb), using the [Arc framework](https://arc.tripl.ai/getting-started/), and the applied schema is defined in [green_taxi_schema.json](green_taxi_schema.json). 23 | 24 | 25 | ## AWS Resources 26 | * EKS cluster: [link to AWS Console](https://console.aws.amazon.com/eks/home?region=us-east-1#/clusters/eks-cluster) 27 | * Virtual EMR clusters and jobs: [link to AWS Console](https://console.aws.amazon.com/elasticmapreduce/home?region=us-east-1#virtual-cluster-list:) 28 | * CloudWatch EMR job logs: [link to AWS Console](https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/$252Faws$252Feks$252Feks-cluster$252Fjobs) 29 | * S3 buckets - navigate to the output S3 bucket: [link to AWS Console](https://s3.console.aws.amazon.com/s3/home?region=us-east-1) 30 | 31 | ## EKS Resources 32 | To review the execution process, run: 33 | ``` 34 | kubectl get po -n emr 35 | ``` 36 | 37 | # Cleanup 38 | To clean up resources, run: 39 | ```bash 40 | curl https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/main/emr-on-eks/deprovision.sh | bash 41 | ``` 42 | 43 | 44 | 45 | That's it! 46 | -------------------------------------------------------------------------------- /emr-on-eks/deprovision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-FileCopyrightText: Copyright 2021 Amazon.com, Inc. or its affiliates. 
4 | # SPDX-License-Identifier: MIT-0 5 | 6 | # Define params 7 | export AWS_DEFAULT_REGION=us-east-1 8 | export EKSCLUSTERNAME=eks-cluster 9 | export EMRCLUSTERNAME=emr-on-$EKSCLUSTERNAME 10 | export ROLENAME=${EMRCLUSTERNAME}-execution-role 11 | 12 | #submit test job 13 | export EMRCLUSTERID=$(aws emr-containers list-virtual-clusters --query "virtualClusters[?name == '${EMRCLUSTERNAME}' && state == 'RUNNING'].id" --output text) 14 | export ACCOUNTID=$(aws sts get-caller-identity --query Account --output text) 15 | export ROLEARN=arn:aws:iam::$ACCOUNTID:role/$ROLENAME 16 | export OUTPUTS3BUCKET=${EMRCLUSTERNAME}-${ACCOUNTID} 17 | export POLICYARN=arn:aws:iam::$ACCOUNTID:policy/${ROLENAME}-policy 18 | 19 | # install eksctl (https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up-eksctl.html) 20 | curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp 21 | sudo mv /tmp/eksctl /usr/local/bin 22 | 23 | # update aws CLI to the latest version (we will require aws cli version >= 2.1.14) 24 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip" 25 | unzip -q -o /tmp/awscliv2.zip -d /tmp 26 | sudo /tmp/aws/install --update 27 | 28 | # clean up resources 29 | aws emr-containers delete-virtual-cluster --id $EMRCLUSTERID 30 | eksctl delete cluster --name=$EKSCLUSTERNAME 31 | aws iam detach-role-policy --role-name $ROLENAME --policy-arn $POLICYARN 32 | aws iam delete-role --role-name $ROLENAME 33 | aws iam delete-policy --policy-arn $POLICYARN 34 | aws s3 rm s3://$OUTPUTS3BUCKET --recursive 35 | aws s3api delete-bucket --bucket $OUTPUTS3BUCKET 36 | 37 | -------------------------------------------------------------------------------- /emr-on-eks/green_taxi_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%env\n", 10 | "SCHEMA=https://\n", 11 | "OUTPUT=s3://\n", 12 | "ETL_CONF_ENV=production" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "{\n", 22 | " \"type\": \"DelimitedExtract\",\n", 23 | " \"name\": \"extract csv data from nyc_tripdata\",\n", 24 | " \"environments\": [\"production\", \"test\"],\n", 25 | " \"inputURI\": \"s3a://nyc-tlc/csv_backup/green_tripdata_*.csv\",\n", 26 | " \"outputView\": \"green_tripdata0_raw\", \n", 27 | " \"delimiter\": \"Comma\",\n", 28 | " \"quote\" : \"DoubleQuote\",\n", 29 | " \"header\": true,\n", 30 | " \"persist\": true\n", 31 | "}" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "{\n", 41 | " \"type\": \"TypingTransform\",\n", 42 | " \"name\": \"apply green_tripdata schema 0 data types\",\n", 43 | " \"environments\": [\"production\", \"test\"],\n", 44 | " \"schemaURI\": ${SCHEMA},\n", 45 | " \"inputView\": \"green_tripdata0_raw\", \n", 46 | " \"outputView\": \"green_tripdata0\"\n", 47 | "}" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "%sql name=\"aggregate the result by month and year\" outputView=green_trip_summery environments=production,test persist=true\n", 57 | "\n", 58 | "SELECT \n", 59 | " year(lpep_pickup_datetime) AS trip_year\n", 60 | " 
,month(lpep_pickup_datetime) AS trip_month\n", 61 | " ,vendor_id\n", 62 | " ,sum(coalesce(trip_distance,0)) AS total_distance\n", 63 | " ,sum(coalesce(total_amount,0)) AS total_fee\n", 64 | "FROM green_tripdata0\n", 65 | "GROUP BY trip_year, trip_month, vendor_id" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "{\n", 75 | " \"type\": \"ParquetLoad\",\n", 76 | " \"name\": \"write out green_tripdata0 dataset as Parquet\",\n", 77 | " \"environments\": [\"production\", \"test\"],\n", 78 | " \"inputView\": \"green_trip_summery\",\n", 79 | " \"outputURI\": ${OUTPUT},\n", 80 | " \"saveMode\": \"Overwrite\"\n", 81 | "}" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Arc", 95 | "language": "javascript", 96 | "name": "arc" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": "javascript", 100 | "file_extension": ".json", 101 | "mimetype": "javascript", 102 | "name": "arc", 103 | "nbconvert_exporter": "arcexport", 104 | "version": "3.12.1" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 4 109 | } 110 | -------------------------------------------------------------------------------- /emr-on-eks/green_taxi_load.ipynb.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2021 Amazon.com, Inc. or its affiliates. 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /emr-on-eks/green_taxi_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "f457e562-5c7a-4215-a754-ab749509f3fb", 4 | "name": "vendor_id", 5 | "description": "A code indicating the TPEP provider that provided the record.", 6 | "trim": true, 7 | "nullable": true, 8 | "type": "integer", 9 | "nullableValues": [ 10 | "", 11 | "null" 12 | ] 13 | }, 14 | { 15 | "id": "d61934ed-e32e-406b-bd18-8d6b7296a8c0", 16 | "name": "lpep_pickup_datetime", 17 | "description": "The date and time when the meter was engaged.", 18 | "trim": true, 19 | "nullable": true, 20 | "type": "timestamp", 21 | "formatters": [ 22 | "uuuu-MM-dd HH:mm:ss" 23 | ], 24 | "timezoneId": "America/New_York", 25 | "nullableValues": [ 26 | "", 27 | "null" 28 | ] 29 | }, 30 | { 31 | "id": "d61934ed-e32e-406b-bd18-8d6b7296a8c0", 32 | "name": "lpep_dropoff_datetime", 33 | "description": "The date and time when the meter was disengaged.", 34 | "trim": true, 35 | "nullable": true, 36 | "type": "timestamp", 37 | "formatters": [ 38 | "uuuu-MM-dd HH:mm:ss" 39 | ], 40 | "timezoneId": "America/New_York", 41 | "nullableValues": [ 42 | "", 43 | "null" 44 | ] 45 | }, 46 | { 47 | "id": "aa315986-9fa9-4aa2-a72e-411196648351", 48 | "name": "store_and_fwd_flag", 49 | "description": "This flag indicates whether the trip record was held in vehicle memory before sending to the vendor, aka 'store and forward', because the vehicle did not have a connection to the server.", 50 | "trim": true, 51 | "nullable": true, 52 | "type": "boolean", 53 | "nullableValues": [ 54 | "", 55 | "null" 56 | ], 57 | "trueValues": [ 58 | "Y" 59 | ], 60 | "falseValues": [ 61 | "N" 62 | ] 63 | }, 64 | { 65 | "id": "ce66288c-65c1-45b7-83b4-5de3f38f89b7", 66 | "name": "rate_code_id", 67 | "description": "The final rate code in effect at the end of the trip.", 68 | "trim": 
true, 69 | "nullable": true, 70 | "type": "integer", 71 | "nullableValues": [ 72 | "", 73 | "null" 74 | ] 75 | }, 76 | { 77 | "id": "2d7b4a53-5203-4273-bd4a-3bbc742539ec", 78 | "name": "pickup_longitude", 79 | "description": "Longitude where the meter was engaged.", 80 | "trim": true, 81 | "nullable": true, 82 | "type": "decimal", 83 | "nullableValues": [ 84 | "0" 85 | ], 86 | "precision": 18, 87 | "scale": 14 88 | }, 89 | { 90 | "id": "a183ecd0-6169-429c-8bc0-0df4f08526e8", 91 | "name": "pickup_latitude", 92 | "description": "Latitude where the meter was engaged.", 93 | "trim": true, 94 | "nullable": true, 95 | "type": "decimal", 96 | "nullableValues": [ 97 | "0" 98 | ], 99 | "precision": 18, 100 | "scale": 14 101 | }, 102 | { 103 | "id": "a3d6135c-202f-4ba6-ab25-93fa6c28bc97", 104 | "name": "dropoff_longitude", 105 | "description": "Longitude where the meter was disengaged.", 106 | "trim": true, 107 | "nullable": true, 108 | "type": "decimal", 109 | "nullableValues": [ 110 | "0" 111 | ], 112 | "precision": 18, 113 | "scale": 14 114 | }, 115 | { 116 | "id": "77160ee6-5040-4444-a731-45902b32911f", 117 | "name": "dropoff_latitude", 118 | "description": "Latitude where the meter was disengaged.", 119 | "trim": true, 120 | "nullable": true, 121 | "type": "decimal", 122 | "nullableValues": [ 123 | "0" 124 | ], 125 | "precision": 18, 126 | "scale": 14 127 | }, 128 | { 129 | "id": "ef1fe668-7850-4ef5-966b-0813d2024c32", 130 | "name": "passenger_count", 131 | "description": "The number of passengers in the vehicle. This is a driver-entered value.", 132 | "trim": true, 133 | "nullable": true, 134 | "type": "integer", 135 | "nullableValues": [ 136 | "", 137 | "null" 138 | ] 139 | }, 140 | { 141 | "id": "77160ee6-5040-4444-a731-45902b32911f", 142 | "name": "trip_distance", 143 | "description": "The elapsed trip distance in miles reported by the taximeter.", 144 | "trim": true, 145 | "nullable": true, 146 | "type": "decimal", 147 | "nullableValues": [ 148 | "0", 149 | "null" 150 | ], 151 | "precision": 18, 152 | "scale": 15 153 | }, 154 | { 155 | "id": "e71597c1-67ae-4176-9ae3-ae4dbe0886b9", 156 | "name": "fare_amount", 157 | "description": "The time-and-distance fare calculated by the meter.", 158 | "trim": true, 159 | "nullable": true, 160 | "type": "decimal", 161 | "nullableValues": [ 162 | "", 163 | "null" 164 | ], 165 | "precision": 10, 166 | "scale": 2 167 | }, 168 | { 169 | "id": "77d91cb6-22e4-4dba-883a-eee0c8690f31", 170 | "name": "extra", 171 | "description": "Miscellaneous extras and surcharges. Currently, this only includes the $0.50 and $1 rush hour and overnight charges.", 172 | "trim": true, 173 | "nullable": true, 174 | "type": "decimal", 175 | "nullableValues": [ 176 | "", 177 | "null" 178 | ], 179 | "precision": 10, 180 | "scale": 2 181 | }, 182 | { 183 | "id": "aebe7970-91dc-4155-b9a9-78dbcf836ac8", 184 | "name": "mta_tax", 185 | "description": "$0.50 MTA tax that is automatically triggered based on the metered rate in use.", 186 | "trim": true, 187 | "nullable": true, 188 | "type": "decimal", 189 | "nullableValues": [ 190 | "", 191 | "null" 192 | ], 193 | "precision": 10, 194 | "scale": 2 195 | }, 196 | { 197 | "id": "3630c209-a88c-4dd7-ab43-276234f04252", 198 | "name": "tip_amount", 199 | "description": "Tip amount – This field is automatically populated for credit card tips. 
Cash tips are not included.", 200 | "trim": true, 201 | "nullable": true, 202 | "type": "decimal", 203 | "nullableValues": [ 204 | "", 205 | "null" 206 | ], 207 | "precision": 10, 208 | "scale": 2 209 | }, 210 | { 211 | "id": "9d10371c-c08c-461a-a1a9-e5cd0c46655c", 212 | "name": "tolls_amount", 213 | "description": "Total amount of all tolls paid in trip.", 214 | "trim": true, 215 | "nullable": true, 216 | "type": "decimal", 217 | "nullableValues": [ 218 | "", 219 | "null" 220 | ], 221 | "precision": 10, 222 | "scale": 2 223 | }, 224 | { 225 | "id": "f59aba58-2a8c-40f9-830b-f1abafe80b7f", 226 | "name": "ehail_fee", 227 | "description": "Fee for allowing passengers to 'e-hail' a New York City taxicab via downloadable smartphone applications.", 228 | "trim": true, 229 | "nullable": true, 230 | "type": "decimal", 231 | "nullableValues": [ 232 | "", 233 | "null" 234 | ], 235 | "precision": 10, 236 | "scale": 2 237 | }, 238 | { 239 | "id": "1414fd4b-32ed-430c-a4b0-a569e7144bbb", 240 | "name": "total_amount", 241 | "description": "The total amount charged to passengers. Does not include cash tips.", 242 | "trim": true, 243 | "nullable": true, 244 | "type": "decimal", 245 | "nullableValues": [ 246 | "", 247 | "null" 248 | ], 249 | "precision": 10, 250 | "scale": 2 251 | }, 252 | { 253 | "id": "5b43ec13-dc16-40bd-8af5-4e2f85285e15", 254 | "name": "payment_type", 255 | "description": "A numeric code signifying how the passenger paid for the trip.", 256 | "trim": true, 257 | "nullable": true, 258 | "type": "integer", 259 | "nullableValues": [ 260 | "", 261 | "null" 262 | ] 263 | }, 264 | { 265 | "id": "bccf357f-6671-4168-998a-c991fdcf7fe0", 266 | "name": "trip_type", 267 | "description": "A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver.", 268 | "trim": true, 269 | "nullable": true, 270 | "type": "integer", 271 | "nullableValues": [ 272 | "", 273 | "null" 274 | ] 275 | } 276 | ] -------------------------------------------------------------------------------- /emr-on-eks/green_taxi_schema.json.license: -------------------------------------------------------------------------------- 1 | SPDX-FileCopyrightText: 2021 Amazon.com, Inc. or its affiliates. 2 | 3 | SPDX-License-Identifier: MIT-0 -------------------------------------------------------------------------------- /emr-on-eks/provision.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPDX-FileCopyrightText: Copyright 2021 Amazon.com, Inc. or its affiliates. 
4 | # SPDX-License-Identifier: MIT-0 5 | 6 | # Define params 7 | export AWS_DEFAULT_REGION=us-east-1 8 | export EKSCLUSTERNAME=eks-cluster 9 | export EMRCLUSTERNAME=emr-on-$EKSCLUSTERNAME 10 | export ROLENAME=${EMRCLUSTERNAME}-execution-role 11 | 12 | # Using EKS Fargate mode, uncomment to use EKS EC2 mode 13 | EKSCTL_PARAM="--fargate" 14 | # EKSCTL_PARAM="--nodes 6 --node-type t3.xlarge" 15 | 16 | # install eksctl (https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up-eksctl.html) 17 | curl --silent --location "https://github.com/weaveworks/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp 18 | sudo mv /tmp/eksctl /usr/local/bin 19 | 20 | # update aws CLI to the latest version (we will require aws cli version >= 2.1.14) 21 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip" 22 | unzip -q -o /tmp/awscliv2.zip -d /tmp 23 | sudo /tmp/aws/install --update 24 | 25 | # install kubectl 26 | curl -L "https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl" \ 27 | -o "/tmp/kubectl" 28 | chmod +x /tmp/kubectl 29 | sudo mv /tmp/kubectl /usr/local/bin 30 | 31 | # Provision eks cluster called “eks-fargate” backed by fargate 32 | eksctl create cluster --name $EKSCLUSTERNAME --with-oidc --zones ${AWS_DEFAULT_REGION}a,${AWS_DEFAULT_REGION}b $EKSCTL_PARAM 33 | aws eks update-kubeconfig --name $EKSCLUSTERNAME 34 | 35 | # Create kubernetes namespace 'emr' for EMR 36 | kubectl create namespace emr 37 | 38 | # Create fargate profile 'fp-emr' for namespace 'emr' 39 | eksctl create fargateprofile --cluster $EKSCLUSTERNAME --name fp-emr --namespace emr 40 | 41 | # Wait for EKS cluster to finish provisioning, enable all logging 42 | # Enable cluster access for Amazon EMR on EKS (https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/setting-up-cluster-access.html) in the 'emr' kubernetes namespace by running: 43 | eksctl create iamidentitymapping --cluster $EKSCLUSTERNAME --namespace "emr" --service-name "emr-containers" 44 | eksctl utils update-cluster-logging --cluster $EKSCLUSTERNAME --enable-types all --approve 45 | 46 | # create S3 bucket for output 47 | export ACCOUNTID=$(aws sts get-caller-identity --query Account --output text) 48 | export OUTPUTS3BUCKET=${EMRCLUSTERNAME}-${ACCOUNTID} 49 | aws s3api create-bucket --bucket $OUTPUTS3BUCKET 50 | 51 | # Create a job execution role (https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/creating-job-execution-role.html) 52 | cat > /tmp/job-execution-policy.json < /tmp/trust-policy.json <= 2.1.14) 19 | curl "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "/tmp/awscliv2.zip" 20 | unzip -q -o /tmp/awscliv2.zip -d /tmp 21 | sudo /tmp/aws/install --update 22 | 23 | # install kubectl 24 | curl -L "https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl" \ 25 | -o "/tmp/kubectl" 26 | chmod +x /tmp/kubectl 27 | sudo mv /tmp/kubectl /usr/local/bin 28 | 29 | # sumbit job 30 | 31 | aws emr-containers start-job-run --virtual-cluster-id $EMRCLUSTERID \ 32 | --name arc-job --execution-role-arn $ROLEARN --release-label emr-6.2.0-latest \ 33 | --job-driver '{"sparkSubmitJobDriver": {"entryPoint": "https://repo1.maven.org/maven2/ai/tripl/arc_2.12/3.6.2/arc_2.12-3.6.2.jar", 
"entryPointArguments":["--etl.config.uri=https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/main/emr-on-eks/green_taxi_load.ipynb"], "sparkSubmitParameters": "--packages com.typesafe:config:1.4.0 --class ai.tripl.arc.ARC --conf spark.executor.instances=10 --conf spark.executor.memory=4G --conf spark.driver.memory=2G --conf spark.executor.cores=2 --conf spark.kubernetes.driverEnv.ETL_CONF_ENV=production --conf spark.kubernetes.driverEnv.OUTPUT=s3://'$OUTPUTS3BUCKET'/output/ --conf spark.kubernetes.driverEnv.SCHEMA=https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/main/emr-on-eks/green_taxi_schema.json"}}' \ 34 | --configuration-overrides '{"monitoringConfiguration": {"cloudWatchMonitoringConfiguration": {"logGroupName": "/aws/eks/'$EKSCLUSTERNAME'/jobs", "logStreamNamePrefix": "arc-job"}}}' 35 | 36 | echo "Job submitted" 37 | echo "Navigate to https://console.aws.amazon.com/emr/home?#/eks/clusters/"${EMRCLUSTERID}" to view job status" 38 | 39 | echo "Navigate to the output S3 bucket here https://s3.console.aws.amazon.com/s3/buckets/"${OUTPUTS3BUCKET}" to view outputs" 40 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/delta_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf numRows=5 logger=true" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": { 15 | "ExecuteTime": { 16 | "end_time": "2020-03-18T22:38:05.895407Z", 17 | "start_time": "2020-03-18T22:37:48.160Z" 18 | } 19 | }, 20 | "source": [ 21 | "## 2. Ingest A New Incremental CSV File\n", 22 | "### Look at record 12, the `state` is changed in the file" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "{\n", 32 | " \"type\": \"DelimitedExtract\",\n", 33 | " \"name\": \"extract incremental data\",\n", 34 | " \"environments\": [\"dev\", \"test\"],\n", 35 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/update_contacts.csv\",\n", 36 | " \"outputView\": \"delta_raw\", \n", 37 | " \"delimiter\": \"Comma\",\n", 38 | " \"header\": false,\n", 39 | " \"authentication\": {\n", 40 | " \"method\": \"AmazonIAM\"\n", 41 | " }\n", 42 | "}" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "## 2.2 Apply Data Type (reused schema file)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "{\n", 59 | " \"type\": \"TypingTransform\",\n", 60 | " \"name\": \"apply table schema 0 to incremental load\",\n", 61 | " \"environments\": [\"dev\", \"test\"],\n", 62 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 63 | " \"inputView\": \"delta_raw\", \n", 64 | " \"outputView\": \"delta_typed\",\n", 65 | " \"authentication\": {\n", 66 | " \"method\": \"AmazonIAM\"\n", 67 | " }\n", 68 | "}" 69 | ] 70 | }, 71 | { 72 | "cell_type": "markdown", 73 | "metadata": { 74 | "ExecuteTime": { 75 | "end_time": "2020-06-07T15:02:50.155313Z", 76 | "start_time": "2020-06-07T15:02:50.125Z" 77 | } 78 | }, 79 | "source": [ 80 | "## 2.3 Data Quality Control (reused sql script)" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": null, 86 | "metadata": {}, 87 | 
"outputs": [], 88 | "source": [ 89 | "%sqlvaildate outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=delta_typed\n", 90 | "\n", 91 | "SELECT SUM(error) = 0 AS valid\n", 92 | " ,TO_JSON(\n", 93 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 94 | " ) AS message\n", 95 | "FROM \n", 96 | "(\n", 97 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 98 | " FROM ${inputView}\n", 99 | ") base" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": { 105 | "ExecuteTime": { 106 | "end_time": "2020-05-31T05:01:13.796275Z", 107 | "start_time": "2020-05-31T05:01:13.734Z" 108 | } 109 | }, 110 | "source": [ 111 | "## 2.4 Add Calculated Fields (reused sql script)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": null, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "%env \n", 121 | "ETL_CONF_CURRENT_TIMESTAMP=CURRENT_TIMESTAMP()" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": null, 127 | "metadata": {}, 128 | "outputs": [], 129 | "source": [ 130 | "%sql outputView=\"update_load\" name=\"add calc field for SCD\" environments=dev,test sqlParams=table_name=delta_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 131 | "\n", 132 | "SELECT id,name,email,state, CAST(${now} AS timestamp) AS valid_from, CAST(null AS timestamp) AS valid_to\n", 133 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 134 | "FROM ${table_name}" 135 | ] 136 | }, 137 | { 138 | "cell_type": "markdown", 139 | "metadata": {}, 140 | "source": [ 141 | "## 2.5 Output Incremental data to Delta Lake\n", 142 | "### Delta Lake is an optimized data lake to support Time Travel, ACID transaction" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "{\n", 152 | " \"type\": \"DeltaLakeLoad\",\n", 153 | " \"name\": \"Initial load to Data Lake\",\n", 154 | " \"environments\": [\"dev\", \"test\"],\n", 155 | " \"inputView\": \"update_load\",\n", 156 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/delta_load/\",\n", 157 | " \"numPartitions\": 2\n", 158 | " \"saveMode\": \"Overwrite\",\n", 159 | " \"authentication\": {\n", 160 | " \"method\": \"AmazonIAM\"\n", 161 | " }\n", 162 | "}" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": null, 168 | "metadata": {}, 169 | "outputs": [], 170 | "source": [] 171 | } 172 | ], 173 | "metadata": { 174 | "kernelspec": { 175 | "display_name": "Arc", 176 | "language": "javascript", 177 | "name": "arc" 178 | }, 179 | "language_info": { 180 | "codemirror_mode": "javascript", 181 | "file_extension": ".json", 182 | "mimetype": "javascript", 183 | "name": "arc", 184 | "nbconvert_exporter": "arcexport", 185 | "version": "3.8.0" 186 | } 187 | }, 188 | "nbformat": 4, 189 | "nbformat_minor": 4 190 | } -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/driver-pod-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | labels: 5 | spark-role: driver 6 | namespace: spark 7 | spec: 8 | serviceAccountName: nativejob 9 | affinity: 10 | nodeAffinity: 11 | requiredDuringSchedulingIgnoredDuringExecution: 12 | nodeSelectorTerms: 13 | - matchExpressions: 14 | - key: lifecycle 15 | operator: In 16 | values: 
17 | - OnDemand 18 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/executor-pod-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | labels: 5 | spark-role: executor 6 | namespace: spark 7 | spec: 8 | serviceAccountName: nativejob 9 | affinity: 10 | nodeAffinity: 11 | requiredDuringSchedulingIgnoredDuringExecution: 12 | nodeSelectorTerms: 13 | - matchExpressions: 14 | - key: lifecycle 15 | operator: In 16 | values: 17 | - Ec2Spot -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/green_taxi_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%env\n", 10 | "SCHEMA=https://\n", 11 | "OUTPUT=s3://\n", 12 | "ETL_CONF_ENV=production" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": null, 18 | "metadata": {}, 19 | "outputs": [], 20 | "source": [ 21 | "{\n", 22 | " \"type\": \"DelimitedExtract\",\n", 23 | " \"name\": \"extract csv data from nyc_tripdata\",\n", 24 | " \"environments\": [\"production\", \"test\"],\n", 25 | " \"inputURI\": \"s3a://nyc-tlc/trip*data/green_tripdata_*.csv\",\n", 26 | " \"outputView\": \"green_tripdata0_raw\", \n", 27 | " \"delimiter\": \"Comma\",\n", 28 | " \"quote\" : \"DoubleQuote\",\n", 29 | " \"header\": true,\n", 30 | " \"persist\": true\n", 31 | "}" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": null, 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "{\n", 41 | " \"type\": \"TypingTransform\",\n", 42 | " \"name\": \"apply green_tripdata schema 0 data types\",\n", 43 | " \"environments\": [\"production\", \"test\"],\n", 44 | " \"schemaURI\": ${SCHEMA},\n", 45 | " \"inputView\": \"green_tripdata0_raw\", \n", 46 | " \"outputView\": \"green_tripdata0\"\n", 47 | "}" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": null, 53 | "metadata": {}, 54 | "outputs": [], 55 | "source": [ 56 | "%sql name=\"aggregate the result by month and year\" outputView=green_trip_summery environments=production,test persist=true\n", 57 | "\n", 58 | "SELECT \n", 59 | " year(lpep_pickup_datetime) AS trip_year\n", 60 | " ,month(lpep_pickup_datetime) AS trip_month\n", 61 | " ,vendor_id\n", 62 | " ,sum(coalesce(trip_distance,0)) AS total_distance\n", 63 | " ,sum(coalesce(total_amount,0)) AS total_fee\n", 64 | "FROM green_tripdata0\n", 65 | "GROUP BY trip_year, trip_month, vendor_id" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "{\n", 75 | " \"type\": \"ParquetLoad\",\n", 76 | " \"name\": \"write out green_tripdata0 dataset as Parquet\",\n", 77 | " \"environments\": [\"production\", \"test\"],\n", 78 | " \"inputView\": \"green_trip_summery\",\n", 79 | " \"outputURI\": ${OUTPUT},\n", 80 | " \"saveMode\": \"Overwrite\"\n", 81 | "}" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": null, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [] 90 | } 91 | ], 92 | "metadata": { 93 | "kernelspec": { 94 | "display_name": "Arc", 95 | "language": "javascript", 96 | "name": "arc" 97 | }, 98 | "language_info": { 99 | "codemirror_mode": "javascript", 100 | "file_extension": ".json", 101 | 
"mimetype": "javascript", 102 | "name": "arc", 103 | "nbconvert_exporter": "arcexport", 104 | "version": "3.12.1" 105 | } 106 | }, 107 | "nbformat": 4, 108 | "nbformat_minor": 4 109 | } 110 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/initial_load.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf numRows=5 logger=true" 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "# 1. Initial Table Load" 17 | ] 18 | }, 19 | { 20 | "cell_type": "code", 21 | "execution_count": null, 22 | "metadata": {}, 23 | "outputs": [], 24 | "source": [ 25 | "{\n", 26 | " \"type\": \"DelimitedExtract\",\n", 27 | " \"name\": \"extract initial table\",\n", 28 | " \"environments\": [\"dev\", \"test\"],\n", 29 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/initial_contacts.csv\",\n", 30 | " \"outputView\": \"initial_raw\", \n", 31 | " \"delimiter\": \"Comma\",\n", 32 | " \"header\": false,\n", 33 | " \"quote\": \"None\",\n", 34 | " \"authentication\": {\n", 35 | " \"method\": \"AmazonIAM\"\n", 36 | " }\n", 37 | "}" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "## Check Original Data Schema" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": null, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "%printschema \n", 54 | "initial_raw" 55 | ] 56 | }, 57 | { 58 | "cell_type": "markdown", 59 | "metadata": { 60 | "ExecuteTime": { 61 | "start_time": "2020-03-03T08:30:30.028Z" 62 | } 63 | }, 64 | "source": [ 65 | "## 1.2 Apply Data Type" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": null, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "{\n", 75 | " \"type\": \"TypingTransform\",\n", 76 | " \"name\": \"apply table schema 0\",\n", 77 | " \"environments\": [\"dev\", \"test\"],\n", 78 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 79 | " \"inputView\": \"initial_raw\", \n", 80 | " \"outputView\": \"initial_typed\",\n", 81 | " \"authentication\": {\n", 82 | " \"method\": \"AmazonIAM\"\n", 83 | " }\n", 84 | "}" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": {}, 90 | "source": [ 91 | "## Check Typed Data Schema & Stats" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": null, 97 | "metadata": {}, 98 | "outputs": [], 99 | "source": [ 100 | "%printschema \n", 101 | "initial_typed" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "## 1.3 Data Quality Control" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "%sqlvaildate outputView=\"fail_fast\" name=\"validation\" description=\"fail the job if data transform is failed\" environments=dev,test sqlParams=inputView=initial_typed\n", 118 | "\n", 119 | "SELECT SUM(error) = 0 AS valid\n", 120 | " ,TO_JSON(\n", 121 | " NAMED_STRUCT('count', COUNT(error), 'errors', SUM(error))\n", 122 | " ) AS message\n", 123 | "FROM \n", 124 | "(\n", 125 | " SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error \n", 126 | " FROM ${inputView}\n", 127 | ") base" 128 | ] 129 | }, 130 | { 131 | "cell_type": "markdown", 
132 | "metadata": {}, 133 | "source": [ 134 | "## 1.4 Add Calculated Fields for SCD Type 2\n", 135 | "### CURRENT_TIMESTAMP will be passed in automatically, when the ETL job is triggered" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": {}, 142 | "outputs": [], 143 | "source": [ 144 | "%env \n", 145 | "ETL_CONF_CURRENT_TIMESTAMP=current_timestamp()" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "%sql outputView=\"initial_load\" name=\"add calc field for SCD\" environments=dev,test sqlParams=table_name=initial_typed,now=${ETL_CONF_CURRENT_TIMESTAMP}\n", 155 | "\n", 156 | "SELECT id,name,email,state, CAST(${now} AS timestamp) AS valid_from, CAST(null AS timestamp) AS valid_to\n", 157 | ",1 AS iscurrent, md5(concat(name,email,state)) AS checksum \n", 158 | "FROM ${table_name}" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## 1.5 Load to Delta Lake as the initial daily snaptshot table\n", 166 | "### Delta Lake is an optimized data lake to support Time Travel, ACID transaction" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": null, 172 | "metadata": {}, 173 | "outputs": [], 174 | "source": [ 175 | "{\n", 176 | " \"type\": \"DeltaLakeLoad\",\n", 177 | " \"name\": \"Initial load to Data Lake\",\n", 178 | " \"environments\": [\"dev\", \"test\"],\n", 179 | " \"inputView\": \"initial_load\",\n", 180 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/\",\n", 181 | " \"numPartitions\": 2\n", 182 | " \"saveMode\": \"Overwrite\",\n", 183 | " \"authentication\": {\n", 184 | " \"method\": \"AmazonIAM\"\n", 185 | " }\n", 186 | "}" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": null, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [] 195 | } 196 | ], 197 | "metadata": { 198 | "kernelspec": { 199 | "display_name": "Arc", 200 | "language": "javascript", 201 | "name": "arc" 202 | }, 203 | "language_info": { 204 | "codemirror_mode": "javascript", 205 | "file_extension": ".json", 206 | "mimetype": "javascript", 207 | "name": "arc", 208 | "nbconvert_exporter": "arcexport", 209 | "version": "3.8.0" 210 | } 211 | }, 212 | "nbformat": 4, 213 | "nbformat_minor": 4 214 | } -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/msk_consumer.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql.types import * 3 | from pyspark.sql.functions import * 4 | import pyspark 5 | import sys 6 | 7 | spark = SparkSession.builder \ 8 | .appName("Spark Structured Streaming from Kafka") \ 9 | .getOrCreate() 10 | 11 | sdfRides = spark \ 12 | .readStream \ 13 | .format("kafka") \ 14 | .option("kafka.bootstrap.servers", sys.argv[1]) \ 15 | .option("subscribe", "taxirides") \ 16 | .option("startingOffsets", "latest") \ 17 | .option("auto.offset.reset", "latest") \ 18 | .load() \ 19 | .selectExpr("decode(CAST(value AS STRING),'utf-8') as value") 20 | 21 | # sdfFares = spark \ 22 | # .readStream \ 23 | # .format("kafka") \ 24 | # .option("kafka.bootstrap.servers", "b-1.emr-eks-msk.wz7wsg.c4.kafka.ap-southeast-2.amazonaws.com:9092") \ 25 | # .option("subscribe", "taxifares") \ 26 | # .option("startingOffsets", "latest") \ 27 | # .load() \ 28 | # 
.selectExpr("decode(CAST(value AS STRING),'utf-8') as value") 29 | 30 | # taxiFaresSchema = StructType([ \ 31 | # StructField("rideId", LongType()), StructField("taxiId", LongType()), \ 32 | # StructField("driverId", LongType()), StructField("startTime", TimestampType()), \ 33 | # StructField("paymentType", StringType()), StructField("tip", FloatType()), \ 34 | # StructField("tolls", FloatType()), StructField("totalFare", FloatType())]) 35 | 36 | taxiRidesSchema = StructType([ \ 37 | StructField("rideId", LongType()), StructField("isStart", StringType()), \ 38 | StructField("endTime", TimestampType()), StructField("startTime", TimestampType()), \ 39 | StructField("startLon", FloatType()), StructField("startLat", FloatType()), \ 40 | StructField("endLon", FloatType()), StructField("endLat", FloatType()), \ 41 | StructField("passengerCnt", ShortType()), StructField("taxiId", LongType()), \ 42 | StructField("driverId", LongType()),StructField("timestamp", TimestampType())]) 43 | 44 | def parse_data_from_kafka_message(sdf, schema): 45 | assert sdf.isStreaming == True, "DataFrame doesn't receive streaming data" 46 | col = split(sdf['value'], ',') #split attributes to nested array in one Column 47 | #now expand col to multiple top-level columns 48 | for idx, field in enumerate(schema): 49 | sdf = sdf.withColumn(field.name, col.getItem(idx).cast(field.dataType)) 50 | if field.name=="timestamp": 51 | sdf = sdf.withColumn(field.name, current_timestamp()) 52 | return sdf.select([field.name for field in schema]) 53 | 54 | sdfRides = parse_data_from_kafka_message(sdfRides, taxiRidesSchema) 55 | # sdfFares = parse_data_from_kafka_message(sdfFares, taxiFaresSchema) 56 | 57 | query = sdfRides.withWatermark("timestamp", "10 seconds") \ 58 | .groupBy("driverId", window("timestamp", "10 seconds", "5 seconds")).count() 59 | 60 | # query.writeStream \ 61 | # .outputMode("append") \ 62 | # .format("console") \ 63 | # .option("checkpointLocation", "s3://testtestmelody/stream/checkpoint/consumer_taxi2") \ 64 | # .option("truncate", False) \ 65 | # .start() \ 66 | # .awaitTermination() 67 | 68 | output=query.select(to_json(struct("*")).alias("value")) \ 69 | .selectExpr("CAST(value AS STRING)") \ 70 | .writeStream \ 71 | .outputMode("append") \ 72 | .format("kafka") \ 73 | .option("kafka.bootstrap.servers", sys.argv[1]) \ 74 | .option("topic", sys.argv[3]) \ 75 | .option("checkpointLocation", sys.argv[2]) \ 76 | .start() 77 | 78 | output.awaitTermination() 79 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/scd2_merge.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "## 3. 
Read initial & incremental tables from Delta Lake" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": null, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "{\n", 17 | " \"type\": \"DeltaLakeExtract\",\n", 18 | " \"name\": \"read initial load table\",\n", 19 | " \"description\": \"read initial load table\",\n", 20 | " \"environments\": [\n", 21 | " \"dev\",\n", 22 | " \"test\"\n", 23 | " ],\n", 24 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/\",\n", 25 | " \"outputView\": \"current_snapshot\"\n", 26 | "}" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": null, 32 | "metadata": {}, 33 | "outputs": [], 34 | "source": [ 35 | "{\n", 36 | " \"type\": \"DeltaLakeExtract\",\n", 37 | " \"name\": \"read contact Delta Lake table\",\n", 38 | " \"description\": \"read contact table\",\n", 39 | " \"environments\": [\n", 40 | " \"dev\",\n", 41 | " \"test\"\n", 42 | " ],\n", 43 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/delta_load/\",\n", 44 | " \"outputView\": \"delta_data\"\n", 45 | "}" 46 | ] 47 | }, 48 | { 49 | "cell_type": "markdown", 50 | "metadata": { 51 | "ExecuteTime": { 52 | "end_time": "2020-05-31T05:03:33.741024Z", 53 | "start_time": "2020-05-31T05:03:33.247Z" 54 | } 55 | }, 56 | "source": [ 57 | "## 3.2 Prepare Datasets for SCD Type2 Insert\n", 58 | "\n", 59 | "- Generate extra rows for changed records.\n", 60 | "- A 'null' mergeKey means the record will be inserted rather than updating an existing record, per the SCD Type 2 rule" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": null, 66 | "metadata": {}, 67 | "outputs": [], 68 | "source": [ 69 | "%sql outputView=\"staged_update\" name=\"generate extra rows for SCD\" environments=dev,test\n", 70 | "\n", 71 | "SELECT NULL AS mergeKey, new.*\n", 72 | "FROM current_snapshot old\n", 73 | "INNER JOIN delta_data new\n", 74 | "ON old.id = new.id\n", 75 | "WHERE old.iscurrent=true\n", 76 | "AND old.checksum<>new.checksum\n", 77 | "\n", 78 | "UNION\n", 79 | "\n", 80 | "SELECT id AS mergeKey, *\n", 81 | "FROM delta_data" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## 3.3 Implement the Type 2 SCD merge operation" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": {}, 95 | "outputs": [], 96 | "source": [ 97 | "%conf logger=true" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": null, 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "{\n", 107 | " \"type\": \"DeltaLakeMergeLoad\",\n", 108 | " \"name\": \"merge with existing contacts data\",\n", 109 | " \"environments\": [\n", 110 | " \"dev\",\n", 111 | " \"test\"\n", 112 | " ],\n", 113 | " \"inputView\": \"staged_update\",\n", 114 | " \"outputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/\",\n", 115 | " \"condition\": \"source.mergeKey = target.id\",\n", 116 | " \"whenMatchedUpdate\": {\n", 117 | " \"condition\": \"target.iscurrent = true AND source.checksum <> target.checksum\",\n", 118 | " \"values\": {\n", 119 | " \"valid_to\": ${ETL_CONF_CURRENT_TIMESTAMP},\n", 120 | " \"iscurrent\": false\n", 121 | " }\n", 122 | " },\n", 123 | " \"whenNotMatchedByTargetInsert\": {},\n", 124 | " \"numPartitions\": 1\n", 125 | "}" 126 | ] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "## 3.4 Create a Delta Lake table in Athena\n", 133 | "### Build up a Glue Data 
Catalog via Athena. This step could be done by a Glue Crawler; however, refreshing partitions and creating/updating the data catalog at the end of each ETL process keeps data lineage control in a single place." 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "{\n", 143 | " \"type\": \"JDBCExecute\",\n", 144 | " \"name\": \"Create glue data catalog\",\n", 145 | " \"environments\": [\n", 146 | " \"dev\",\n", 147 | " \"test\"\n", 148 | " ],\n", 149 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/sql/create_table_contact.sql\",\n", 150 | " \"jdbcURL\": \"jdbc:awsathena://AwsRegion=\"${AWS_DEFAULT_REGION}\";S3OutputLocation=s3://\"${ETL_CONF_DATALAKE_LOC}\"/athena-query-result;AwsCredentialsProviderClass=com.amazonaws.auth.WebIdentityTokenCredentialsProvider\",\n", 151 | " \"sqlParams\":{\n", 152 | " \"datalake_loc\": \"'s3://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot/_symlink_format_manifest/'\",\n", 153 | " \"table_name\": \"default.contact_snapshot\"\n", 154 | " }\n", 155 | "}" 156 | ] 157 | }, 158 | { 159 | "cell_type": "markdown", 160 | "metadata": {}, 161 | "source": [ 162 | "# 4. Query Delta Lake (validation steps)\n", 163 | "### To stop the following validation steps from executing in a production ETL job, use a fake environment `uat`\n", 164 | "### The same queries can be run in Athena" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": null, 170 | "metadata": {}, 171 | "outputs": [], 172 | "source": [ 173 | "{\n", 174 | " \"type\": \"DeltaLakeExtract\",\n", 175 | " \"name\": \"read contact Delta Lake table\",\n", 176 | " \"description\": \"read contact table\",\n", 177 | " \"environments\": [\n", 178 | " \"uat\"\n", 179 | " ],\n", 180 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/output/contact_snapshot\",\n", 181 | " \"outputView\": \"contact_snapshot\"\n", 182 | "}" 183 | ] 184 | }, 185 | { 186 | "cell_type": "markdown", 187 | "metadata": {}, 188 | "source": [ 189 | "## Confirm 92 records are expired" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": null, 195 | "metadata": {}, 196 | "outputs": [], 197 | "source": [ 198 | "%sql outputView=\"expired_count\" name=\"expired_count\" environments=uat\n", 199 | "SELECT count(*) FROM contact_snapshot WHERE valid_to is not null" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": {}, 206 | "outputs": [], 207 | "source": [ 208 | "%metadata \n", 209 | "contact_snapshot" 210 | ] 211 | }, 212 | { 213 | "cell_type": "markdown", 214 | "metadata": {}, 215 | "source": [ 216 | "## Confirm we now have 1192 records" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "metadata": {}, 223 | "outputs": [], 224 | "source": [ 225 | "%sql outputView=\"total_count\" name=\"total_count\" environments=uat\n", 226 | "SELECT count(*) FROM contact_snapshot" 227 | ] 228 | }, 229 | { 230 | "cell_type": "markdown", 231 | "metadata": {}, 232 | "source": [ 233 | "## View one of the changed records" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "%sql outputView=\"validate_type2\" name=\"validate_type2\" environments=uat\n", 243 | "SELECT * FROM contact_snapshot WHERE id=12" 244 | ] 245 | }, 246 | { 247 | "cell_type": "code", 248 | "execution_count": null, 249 | "metadata": 
{}, 250 | "outputs": [], 251 | "source": [] 252 | } 253 | ], 254 | "metadata": { 255 | "kernelspec": { 256 | "display_name": "Arc", 257 | "language": "javascript", 258 | "name": "arc" 259 | }, 260 | "language_info": { 261 | "codemirror_mode": "javascript", 262 | "file_extension": ".json", 263 | "mimetype": "javascript", 264 | "name": "arc", 265 | "nbconvert_exporter": "arcexport", 266 | "version": "3.8.0" 267 | } 268 | }, 269 | "nbformat": 4, 270 | "nbformat_minor": 4 271 | } -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/job/wordcount.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.sql import SparkSession 3 | spark = SparkSession.builder.appName('NYC taxi vendor count').getOrCreate() 4 | df = spark.read.option("header",True).csv(sys.argv[1]) 5 | df.filter(df["vendor_name"].isNotNull()).select("vendor_name").groupBy("vendor_name").count().write.mode("overwrite").parquet(sys.argv[2]) 6 | exit() -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/meta/contact_meta_0.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "name": "id", 4 | "description": "contact id", 5 | "trim": true, 6 | "nullable": false, 7 | "primaryKey": true, 8 | "type": "integer" 9 | }, 10 | { 11 | "name": "name", 12 | "description": "contact name", 13 | "trim": true, 14 | "nullable": true, 15 | "primaryKey": false, 16 | "type": "string", 17 | "nullableValues": [ 18 | "", 19 | "null" 20 | ] 21 | }, 22 | { 23 | "name": "email", 24 | "description": "contact email", 25 | "trim": true, 26 | "nullable": true, 27 | "primaryKey": false, 28 | "type": "string", 29 | "nullableValues": [ 30 | "", 31 | "null" 32 | ] 33 | }, 34 | { 35 | "name": "state", 36 | "description": "state in the country of the contact", 37 | "trim": true, 38 | "nullable": true, 39 | "primaryKey": false, 40 | "type": "string", 41 | "nullableValues": [ 42 | "", 43 | "null" 44 | ] 45 | } 46 | ] -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/meta/green_taxi_schema.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "id": "f457e562-5c7a-4215-a754-ab749509f3fb", 4 | "name": "vendor_id", 5 | "description": "A code indicating the TPEP provider that provided the record.", 6 | "trim": true, 7 | "nullable": true, 8 | "type": "integer", 9 | "nullableValues": [ 10 | "", 11 | "null" 12 | ] 13 | }, 14 | { 15 | "id": "d61934ed-e32e-406b-bd18-8d6b7296a8c0", 16 | "name": "lpep_pickup_datetime", 17 | "description": "The date and time when the meter was engaged.", 18 | "trim": true, 19 | "nullable": true, 20 | "type": "timestamp", 21 | "formatters": [ 22 | "uuuu-MM-dd HH:mm:ss" 23 | ], 24 | "timezoneId": "America/New_York", 25 | "nullableValues": [ 26 | "", 27 | "null" 28 | ] 29 | }, 30 | { 31 | "id": "d61934ed-e32e-406b-bd18-8d6b7296a8c0", 32 | "name": "lpep_dropoff_datetime", 33 | "description": "The date and time when the meter was disengaged.", 34 | "trim": true, 35 | "nullable": true, 36 | "type": "timestamp", 37 | "formatters": [ 38 | "uuuu-MM-dd HH:mm:ss" 39 | ], 40 | "timezoneId": "America/New_York", 41 | "nullableValues": [ 42 | "", 43 | "null" 44 | ] 45 | }, 46 | { 47 | "id": "aa315986-9fa9-4aa2-a72e-411196648351", 48 | "name": "store_and_fwd_flag", 49 | "description": "This flag indicates whether the trip 
record was held in vehicle memory before sending to the vendor, aka 'store and forward', because the vehicle did not have a connection to the server.", 50 | "trim": true, 51 | "nullable": true, 52 | "type": "boolean", 53 | "nullableValues": [ 54 | "", 55 | "null" 56 | ], 57 | "trueValues": [ 58 | "Y" 59 | ], 60 | "falseValues": [ 61 | "N" 62 | ] 63 | }, 64 | { 65 | "id": "ce66288c-65c1-45b7-83b4-5de3f38f89b7", 66 | "name": "rate_code_id", 67 | "description": "The final rate code in effect at the end of the trip.", 68 | "trim": true, 69 | "nullable": true, 70 | "type": "integer", 71 | "nullableValues": [ 72 | "", 73 | "null" 74 | ] 75 | }, 76 | { 77 | "id": "2d7b4a53-5203-4273-bd4a-3bbc742539ec", 78 | "name": "pickup_longitude", 79 | "description": "Longitude where the meter was engaged.", 80 | "trim": true, 81 | "nullable": true, 82 | "type": "decimal", 83 | "nullableValues": [ 84 | "0" 85 | ], 86 | "precision": 18, 87 | "scale": 14 88 | }, 89 | { 90 | "id": "a183ecd0-6169-429c-8bc0-0df4f08526e8", 91 | "name": "pickup_latitude", 92 | "description": "Latitude where the meter was engaged.", 93 | "trim": true, 94 | "nullable": true, 95 | "type": "decimal", 96 | "nullableValues": [ 97 | "0" 98 | ], 99 | "precision": 18, 100 | "scale": 14 101 | }, 102 | { 103 | "id": "a3d6135c-202f-4ba6-ab25-93fa6c28bc97", 104 | "name": "dropoff_longitude", 105 | "description": "Longitude where the meter was disengaged.", 106 | "trim": true, 107 | "nullable": true, 108 | "type": "decimal", 109 | "nullableValues": [ 110 | "0" 111 | ], 112 | "precision": 18, 113 | "scale": 14 114 | }, 115 | { 116 | "id": "77160ee6-5040-4444-a731-45902b32911f", 117 | "name": "dropoff_latitude", 118 | "description": "Latitude where the meter was disengaged.", 119 | "trim": true, 120 | "nullable": true, 121 | "type": "decimal", 122 | "nullableValues": [ 123 | "0" 124 | ], 125 | "precision": 18, 126 | "scale": 14 127 | }, 128 | { 129 | "id": "ef1fe668-7850-4ef5-966b-0813d2024c32", 130 | "name": "passenger_count", 131 | "description": "The number of passengers in the vehicle. This is a driver-entered value.", 132 | "trim": true, 133 | "nullable": true, 134 | "type": "integer", 135 | "nullableValues": [ 136 | "", 137 | "null" 138 | ] 139 | }, 140 | { 141 | "id": "77160ee6-5040-4444-a731-45902b32911f", 142 | "name": "trip_distance", 143 | "description": "The elapsed trip distance in miles reported by the taximeter.", 144 | "trim": true, 145 | "nullable": true, 146 | "type": "decimal", 147 | "nullableValues": [ 148 | "0", 149 | "null" 150 | ], 151 | "precision": 18, 152 | "scale": 15 153 | }, 154 | { 155 | "id": "e71597c1-67ae-4176-9ae3-ae4dbe0886b9", 156 | "name": "fare_amount", 157 | "description": "The time-and-distance fare calculated by the meter.", 158 | "trim": true, 159 | "nullable": true, 160 | "type": "decimal", 161 | "nullableValues": [ 162 | "", 163 | "null" 164 | ], 165 | "precision": 10, 166 | "scale": 2 167 | }, 168 | { 169 | "id": "77d91cb6-22e4-4dba-883a-eee0c8690f31", 170 | "name": "extra", 171 | "description": "Miscellaneous extras and surcharges. 
Currently, this only includes the $0.50 and $1 rush hour and overnight charges.", 172 | "trim": true, 173 | "nullable": true, 174 | "type": "decimal", 175 | "nullableValues": [ 176 | "", 177 | "null" 178 | ], 179 | "precision": 10, 180 | "scale": 2 181 | }, 182 | { 183 | "id": "aebe7970-91dc-4155-b9a9-78dbcf836ac8", 184 | "name": "mta_tax", 185 | "description": "$0.50 MTA tax that is automatically triggered based on the metered rate in use.", 186 | "trim": true, 187 | "nullable": true, 188 | "type": "decimal", 189 | "nullableValues": [ 190 | "", 191 | "null" 192 | ], 193 | "precision": 10, 194 | "scale": 2 195 | }, 196 | { 197 | "id": "3630c209-a88c-4dd7-ab43-276234f04252", 198 | "name": "tip_amount", 199 | "description": "Tip amount – This field is automatically populated for credit card tips. Cash tips are not included.", 200 | "trim": true, 201 | "nullable": true, 202 | "type": "decimal", 203 | "nullableValues": [ 204 | "", 205 | "null" 206 | ], 207 | "precision": 10, 208 | "scale": 2 209 | }, 210 | { 211 | "id": "9d10371c-c08c-461a-a1a9-e5cd0c46655c", 212 | "name": "tolls_amount", 213 | "description": "Total amount of all tolls paid in trip.", 214 | "trim": true, 215 | "nullable": true, 216 | "type": "decimal", 217 | "nullableValues": [ 218 | "", 219 | "null" 220 | ], 221 | "precision": 10, 222 | "scale": 2 223 | }, 224 | { 225 | "id": "f59aba58-2a8c-40f9-830b-f1abafe80b7f", 226 | "name": "ehail_fee", 227 | "description": "Fee for allowing passengers to 'e-hail' a New York City taxicab via downloadable smartphone applications.", 228 | "trim": true, 229 | "nullable": true, 230 | "type": "decimal", 231 | "nullableValues": [ 232 | "", 233 | "null" 234 | ], 235 | "precision": 10, 236 | "scale": 2 237 | }, 238 | { 239 | "id": "1414fd4b-32ed-430c-a4b0-a569e7144bbb", 240 | "name": "total_amount", 241 | "description": "The total amount charged to passengers. 
Does not include cash tips.", 242 | "trim": true, 243 | "nullable": true, 244 | "type": "decimal", 245 | "nullableValues": [ 246 | "", 247 | "null" 248 | ], 249 | "precision": 10, 250 | "scale": 2 251 | }, 252 | { 253 | "id": "5b43ec13-dc16-40bd-8af5-4e2f85285e15", 254 | "name": "payment_type", 255 | "description": "A numeric code signifying how the passenger paid for the trip.", 256 | "trim": true, 257 | "nullable": true, 258 | "type": "integer", 259 | "nullableValues": [ 260 | "", 261 | "null" 262 | ] 263 | }, 264 | { 265 | "id": "bccf357f-6671-4168-998a-c991fdcf7fe0", 266 | "name": "trip_type", 267 | "description": "A code indicating whether the trip was a street-hail or a dispatch that is automatically assigned based on the metered rate in use but can be altered by the driver.", 268 | "trim": true, 269 | "nullable": true, 270 | "type": "integer", 271 | "nullableValues": [ 272 | "", 273 | "null" 274 | ] 275 | } 276 | ] -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/sql/add_calc_field_for_scd2.sql: -------------------------------------------------------------------------------- 1 | SELECT id 2 | , name 3 | , email 4 | , state 5 | , ${CURRENT_TIMESTAMP} AS valid_from 6 | , CAST(null AS timestamp) AS valid_to 7 | , 1 AS iscurrent 8 | , md5(concat(name,email,state)) AS checksum 9 | FROM ${table_name} -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/sql/create_table_contact.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE IF NOT EXISTS ${table_name}( 2 | `id` int 3 | ,`name` string 4 | ,`email` string 5 | ,`state` string 6 | ,`valid_from` timestamp 7 | ,`valid_to` timestamp 8 | ,`iscurrent` tinyint 9 | ,`checksum` string 10 | ) 11 | ROW FORMAT SERDE 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe' 12 | STORED AS INPUTFORMAT 'org.apache.hadoop.hive.ql.io.SymlinkTextInputFormat' 13 | OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' 14 | LOCATION ${datalake_loc} 15 | TBLPROPERTIES ( 16 | 'classification'='parquet', 17 | 'parquet.compress'='SNAPPY' 18 | ) -------------------------------------------------------------------------------- /spark-on-eks/deployment/app_code/sql/sqlvalidate_errors.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | SUM(error) = 0 AS valid 3 | ,TO_JSON( 4 | NAMED_STRUCT( 5 | 'count', COUNT(error), 6 | 'errors', SUM(error) 7 | ) 8 | ) AS message 9 | FROM ( 10 | SELECT CASE WHEN SIZE(_errors) > 0 THEN 1 ELSE 0 END AS error 11 | FROM ${inputView} 12 | ) base -------------------------------------------------------------------------------- /spark-on-eks/deployment/build-s3-dist.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # This script packages your project into a solution distributable that can be 4 | # used as an input to the solution builder validation pipeline. 5 | # 6 | # Important notes and prereq's: 7 | # 1. The initialize-repo.sh script must have been run in order for this script to 8 | # function properly. 9 | # 2. This script should be run from the repo's root folder. 10 | # 11 | # This script will perform the following tasks: 12 | # 1. Remove any old dist files from previous runs. 13 | # 2. 
Install dependencies for the cdk-solution-helper; responsible for 14 | # converting standard 'cdk synth' output into solution assets. 15 | # 3. Build and synthesize your CDK project. 16 | # 4. Run the cdk-solution-helper on template outputs and organize 17 | # those outputs into the /global-s3-assets folder. 18 | # 5. Organize source code artifacts into the /regional-s3-assets folder. 19 | # 6. Remove any temporary files used for staging. 20 | # 21 | # Parameters: 22 | # - source-bucket-base-name: Name for the S3 bucket location where the template will source the Lambda 23 | # code from. The template will append '-[region_name]' to this bucket name. 24 | # For example: ./build-s3-dist.sh solutions trademarked-solution-name v1.0.0 25 | # The template will then expect the source code to be located in the solutions-[region_name] bucket 26 | # - solution-name: name of the solution for consistency 27 | # - version-code: version of the package 28 | 29 | # Important: CDK global version number 30 | cdk_version=2.105.0 31 | 32 | # Check to see if the required parameters have been provided: 33 | if [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then 34 | echo "Please provide the base source bucket name, trademark approved solution name and version where the lambda code will eventually reside." 35 | echo "For example: ./build-s3-dist.sh solutions trademarked-solution-name v1.0.0 template-bucket-name" 36 | exit 1 37 | fi 38 | 39 | # Get reference for all important folders 40 | template_dir="$PWD" 41 | staging_dist_dir="$template_dir/staging" 42 | template_dist_dir="$template_dir/deployment/global-s3-assets" 43 | build_dist_dir="$template_dir/deployment/regional-s3-assets" 44 | source_dir="$template_dir/source" 45 | 46 | echo "------------------------------------------------------------------------------" 47 | echo "[Init] Remove any old dist files from previous runs" 48 | echo "------------------------------------------------------------------------------" 49 | 50 | echo "rm -rf $template_dist_dir" 51 | rm -rf $template_dist_dir 52 | echo "mkdir -p $template_dist_dir" 53 | mkdir -p $template_dist_dir 54 | echo "rm -rf $build_dist_dir" 55 | rm -rf $build_dist_dir 56 | echo "mkdir -p $build_dist_dir" 57 | mkdir -p $build_dist_dir 58 | echo "rm -rf $staging_dist_dir" 59 | rm -rf $staging_dist_dir 60 | echo "mkdir -p $staging_dist_dir" 61 | mkdir -p $staging_dist_dir 62 | 63 | echo "------------------------------------------------------------------------------" 64 | echo "[Init] Install dependencies for the cdk-solution-helper" 65 | echo "------------------------------------------------------------------------------" 66 | 67 | echo "cd $template_dir/deployment/cdk-solution-helper" 68 | cd $template_dir/deployment/cdk-solution-helper 69 | echo "npm install" 70 | # npm audit fix --force 71 | npm install 72 | 73 | cd $template_dir 74 | echo "pip3 install -q $source_dir" 75 | python3 -m venv .env 76 | source .env/bin/activate 77 | pip3 install --upgrade pip -q $source_dir 78 | echo "cd $source_dir" 79 | cd $source_dir 80 | 81 | echo "------------------------------------------------------------------------------" 82 | echo "[Synth] CDK Project" 83 | echo "------------------------------------------------------------------------------" 84 | 85 | # Install the aws-cdk package locally (the local binary is used for synth) 86 | echo "npm install aws-cdk@$cdk_version" 87 | # npm audit fix --force 88 | npm install aws-cdk@$cdk_version 89 | 90 | # Run 'cdk synth' to generate raw solution outputs 91 | echo "cdk synth --output=$staging_dist_dir" 92 | node_modules/aws-cdk/bin/cdk 
synth --output=$staging_dist_dir 93 | 94 | # Remove unnecessary output files 95 | echo "cd $staging_dist_dir" 96 | cd $staging_dist_dir 97 | echo "rm tree.json manifest.json cdk.out" 98 | rm tree.json manifest.json cdk.out 99 | 100 | echo "------------------------------------------------------------------------------" 101 | echo "[Packing] Template artifacts" 102 | echo "------------------------------------------------------------------------------" 103 | 104 | # Move outputs from staging to template_dist_dir 105 | echo "Move outputs from staging to template_dist_dir" 106 | mv $staging_dist_dir/*.json $template_dist_dir/ 107 | 108 | # Rename all *.template.json files to *.template 109 | echo "Rename all *.template.json to *.template" 110 | echo "copy templates and rename" 111 | for f in $template_dist_dir/*.template.json; do 112 | mv -- "$f" "${f%.template.json}.template" 113 | done 114 | 115 | # Run the helper to clean-up the templates and remove unnecessary CDK elements 116 | echo "Run the helper to clean-up the templates and remove unnecessary CDK elements" 117 | echo "node $template_dir/deployment/cdk-solution-helper/index" 118 | node $template_dir/deployment/cdk-solution-helper/index 119 | if [ "$?" = "1" ]; then 120 | echo "(cdk-solution-helper) ERROR: there is likely output above." 1>&2 121 | exit 1 122 | fi 123 | 124 | # Find and replace bucket_name, solution_name, and version 125 | echo "Find and replace bucket_name, solution_name, and version" 126 | cd $template_dist_dir 127 | echo "Updating code source bucket in template with $1" 128 | replace="s/%%BUCKET_NAME%%/$1/g" 129 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 130 | sed -i '' -e $replace $template_dist_dir/*.template 131 | replace="s/%%SOLUTION_NAME%%/$2/g" 132 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 133 | sed -i '' -e $replace $template_dist_dir/*.template 134 | replace="s/%%VERSION%%/$3/g" 135 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 136 | sed -i '' -e $replace $template_dist_dir/*.template 137 | 138 | # Generate CFN template and zip code assets in a user's single bucket 139 | if [ -z "$4" ]; then 140 | replace="s/%%TEMPLATE_OUTPUT_BUCKET%%/$1"-"$AWS_REGION/g" 141 | echo "User's template bucket is: $replace" 142 | else 143 | replace="s/%%TEMPLATE_OUTPUT_BUCKET%%/$4/g" 144 | fi 145 | 146 | echo "sed -i '' -e $replace $template_dist_dir/*.template" 147 | sed -i '' -e $replace $template_dist_dir/*.template 148 | 149 | rm $template_dist_dir/*.json 150 | 151 | echo "------------------------------------------------------------------------------" 152 | echo "[Packing] Source code artifacts" 153 | echo "------------------------------------------------------------------------------" 154 | 155 | # General cleanup of node_modules and package-lock.json files 156 | echo "find $staging_dist_dir -iname "node_modules" -type d -exec rm -rf "{}" \; 2> /dev/null" 157 | find $staging_dist_dir -iname "node_modules" -type d -exec rm -rf "{}" \; 2> /dev/null 158 | echo "find $staging_dist_dir -iname "package-lock.json" -type f -exec rm -f "{}" \; 2> /dev/null" 159 | find $staging_dist_dir -iname "package-lock.json" -type f -exec rm -f "{}" \; 2> /dev/null 160 | 161 | # ... For each asset.* source code artifact in the temporary /staging folder... 162 | cd $staging_dist_dir 163 | for d in `find . 
-mindepth 1 -maxdepth 1 -type d`; do 164 | 165 | # Rename the artifact, removing the period for handler compatibility 166 | pfname="$(basename -- $d)" 167 | fname="$(echo $pfname | sed -e 's/\.//g')" 168 | echo "zip -r $fname.zip $fname" 169 | mv $d $fname 170 | cd $staging_dist_dir/$fname 171 | 172 | # Build the artifacts 173 | if ls *.py 1>/dev/null 2>&1; then 174 | echo "====================================" 175 | echo "This is Python runtime" 176 | echo "====================================" 177 | venv_folder=".venv-prod" 178 | rm -fr .venv-test 179 | rm -fr .venv-prod 180 | echo "Initiating virtual environment" 181 | python3 -m venv $venv_folder 182 | source $venv_folder/bin/activate 183 | pip3 install --upgrade pip -q $source_dir --target $venv_folder/lib/python3.*/site-packages 184 | echo "package python artifact" 185 | cd $venv_folder/lib/python3.*/site-packages 186 | zip -qr9 $staging_dist_dir/$fname.zip . -x "aws_cdk/*" 187 | echo "zip -r $staging_dist_dir/$fname" 188 | cd $staging_dist_dir/$fname 189 | rm -rf $venv_folder 190 | zip -grq $staging_dist_dir/$fname.zip . 191 | elif ls *.js 1>/dev/null 2>&1; then 192 | echo "====================================" 193 | echo "This is Node runtime" 194 | echo "====================================" 195 | echo "Clean and rebuild artifacts" 196 | # npm audit fix --force 197 | echo "copy package.json and package-lock.json files" 198 | cp -rf $template_dir/deployment/cdk-solution-helper/*.json . 199 | npm run 200 | npm ci 201 | if [ "$?" = "1" ]; then 202 | echo "ERROR: Seems like package-lock.json does not exist or is out of sync with package.json. Trying npm install instead" 1>&2 203 | npm install 204 | fi 205 | # Zip the artifact 206 | echo "zip -r $staging_dist_dir/$fname" 207 | zip -qr9 $staging_dist_dir/$fname.zip . 208 | else 209 | # Zip the artifact 210 | echo "zip -r $staging_dist_dir/$fname" 211 | zip -rq $staging_dist_dir/$fname.zip . 212 | fi 213 | 214 | cd $staging_dist_dir 215 | # Copy the zipped artifact from /staging to /regional-s3-assets 216 | echo "cp $fname.zip $build_dist_dir" 217 | mv $fname.zip $build_dist_dir 218 | 219 | # Remove the old, unzipped artifact from /staging 220 | echo "rm -rf $fname" 221 | rm -rf $fname 222 | 223 | # ... repeat until all source code artifacts are zipped and placed in the 224 | # ... /regional-s3-assets folder 225 | 226 | done 227 | 228 | echo "------------------------------------------------------------------------------" 229 | echo "[Move] the zip files from staging to regional-s3-assets folder" 230 | echo "------------------------------------------------------------------------------" 231 | for d in `find . 
-mindepth 1 -maxdepth 1`; do 232 | pfname="$(basename -- $d)" 233 | fname="$(echo $pfname | sed -e 's/asset./asset/g')" 234 | mv $d $build_dist_dir/$fname 235 | done 236 | 237 | echo "------------------------------------------------------------------------------" 238 | echo "[Cleanup] Remove temporary files" 239 | echo "------------------------------------------------------------------------------" 240 | 241 | # Delete the temporary /staging folder 242 | echo "rm -rf $staging_dist_dir" 243 | rm -rf $staging_dist_dir 244 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/cdk-solution-helper/README.md: -------------------------------------------------------------------------------- 1 | # cdk-solution-helper 2 | 3 | A lightweight helper function that cleans-up synthesized templates from the AWS Cloud Development Kit (CDK) and prepares 4 | them for use with the AWS Solutions publishing pipeline. This function performs the following tasks: 5 | 6 | #### Lambda function preparation 7 | 8 | Replaces the AssetParameter-style properties that identify source code for Lambda functions with the common variables 9 | used by the AWS Solutions publishing pipeline. 10 | 11 | - `Code.S3Bucket` is assigned the `%%BUCKET_NAME%%` placeholder value. 12 | - `Code.S3Key` is assigned the `%%SOLUTION_NAME%%`/`%%VERSION%%` placeholder value. 13 | - `Handler` is given a prefix identical to the artifact hash, enabling the Lambda function to properly find the handler in the extracted source code package. 14 | 15 | These placeholders are then replaced with the appropriate values using the default find/replace operation run by the pipeline. 16 | 17 | Before: 18 | ``` 19 | "examplefunction67F55935": { 20 | "Type": "AWS::Lambda::Function", 21 | "Properties": { 22 | "Code": { 23 | "S3Bucket": { 24 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3Bucket54E71A95" 25 | }, 26 | "S3Key": { 27 | "Fn::Join": [ 28 | "", 29 | [ 30 | { 31 | "Fn::Select": [ 32 | 0, 33 | { 34 | "Fn::Split": [ 35 | "||", 36 | { 37 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1" 38 | } 39 | ] 40 | } 41 | ] 42 | }, 43 | { 44 | "Fn::Select": [ 45 | 1, 46 | { 47 | "Fn::Split": [ 48 | "||", 49 | { 50 | "Ref": "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1" 51 | } 52 | ] 53 | } 54 | ] 55 | } 56 | ] 57 | ] 58 | } 59 | }, ... 60 | Handler: "index.handler", ... 61 | ``` 62 | 63 | After helper function run: 64 | ``` 65 | "examplefunction67F55935": { 66 | "Type": "AWS::Lambda::Function", 67 | "Properties": { 68 | "Code": { 69 | "S3Bucket": "%%BUCKET_NAME%%", 70 | "S3Key": "%%SOLUTION_NAME%%/%%VERSION%%/assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 71 | }, ... 72 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 73 | ``` 74 | 75 | After build script run: 76 | ``` 77 | "examplefunction67F55935": { 78 | "Type": "AWS::Lambda::Function", 79 | "Properties": { 80 | "Code": { 81 | "S3Bucket": "solutions", 82 | "S3Key": "trademarked-solution-name/v1.0.0/asset.d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 83 | }, ... 
84 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 85 | ``` 86 | 87 | After CloudFormation deployment: 88 | ``` 89 | "examplefunction67F55935": { 90 | "Type": "AWS::Lambda::Function", 91 | "Properties": { 92 | "Code": { 93 | "S3Bucket": "solutions-us-east-1", 94 | "S3Key": "trademarked-solution-name/v1.0.0/asset.d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7.zip" 95 | }, ... 96 | "Handler": "assetd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7/index.handler" 97 | ``` 98 | 99 | #### Template cleanup 100 | 101 | Cleans-up the parameters section and improves readability by removing the AssetParameter-style fields that would have 102 | been used to specify Lambda source code properties. This allows solution-specific parameters to be highlighted and 103 | removes unnecessary clutter. 104 | 105 | Before: 106 | ``` 107 | "Parameters": { 108 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3Bucket54E71A95": { 109 | "Type": "String", 110 | "Description": "S3 bucket for asset \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 111 | }, 112 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7S3VersionKeyC789D8B1": { 113 | "Type": "String", 114 | "Description": "S3 key for asset version \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 115 | }, 116 | "AssetParametersd513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7ArtifactHash7AA751FE": { 117 | "Type": "String", 118 | "Description": "Artifact hash for asset \"d513e93e266931de36e1c7e79c27b196f84ab928fce63d364d9152ca501551f7\"" 119 | }, 120 | "CorsEnabled" : { 121 | "Description" : "Would you like to enable Cross-Origin Resource Sharing (CORS) for the image handler API? Select 'Yes' if so.", 122 | "Default" : "No", 123 | "Type" : "String", 124 | "AllowedValues" : [ "Yes", "No" ] 125 | }, 126 | "CorsOrigin" : { 127 | "Description" : "If you selected 'Yes' above, please specify an origin value here. A wildcard (*) value will support any origin.", 128 | "Default" : "*", 129 | "Type" : "String" 130 | } 131 | } 132 | ``` 133 | 134 | After: 135 | ``` 136 | "Parameters": { 137 | "CorsEnabled" : { 138 | "Description" : "Would you like to enable Cross-Origin Resource Sharing (CORS) for the image handler API? Select 'Yes' if so.", 139 | "Default" : "No", 140 | "Type" : "String", 141 | "AllowedValues" : [ "Yes", "No" ] 142 | }, 143 | "CorsOrigin" : { 144 | "Description" : "If you selected 'Yes' above, please specify an origin value here. A wildcard (*) value will support any origin.", 145 | "Default" : "*", 146 | "Type" : "String" 147 | } 148 | } 149 | ``` 150 | 151 | *** 152 | © Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. -------------------------------------------------------------------------------- /spark-on-eks/deployment/cdk-solution-helper/index.js: -------------------------------------------------------------------------------- 1 | // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | // Imports 4 | const fs = require('fs'); 5 | 6 | // Paths 7 | var currentPath = process.cwd(); 8 | const global_s3_assets = currentPath+'/../deployment/global-s3-assets'; 9 | const solution_name='SparkOnEKS'; 10 | 11 | function setParameter(template) { 12 | const parameters = (template.Parameters) ? 
template.Parameters : {}; 13 | const assetParameters = Object.keys(parameters).filter(function(key) { 14 | return key.includes('BootstrapVersion'); 15 | }); 16 | assetParameters.forEach(function(a) { 17 | template.Parameters[a] = undefined; 18 | }); 19 | const rules = (template.Rules) ? template.Rules : {}; 20 | const rule = Object.keys(rules).filter(function(key) { 21 | return key.includes('CheckBootstrapVersion'); 22 | }); 23 | rule.forEach(function(a) { 24 | template.Rules[a] = undefined; 25 | }) 26 | } 27 | function assetRef(s3BucketRef) { 28 | // Get S3 bucket key references from assets file 29 | const raw_meta = fs.readFileSync(`${global_s3_assets}/${solution_name}.assets.json`); 30 | let template = JSON.parse(raw_meta); 31 | const metadata = (template.files[s3BucketRef]) ? template.files[s3BucketRef] : {}; 32 | var assetPath = metadata.source.path.replace('.json',''); 33 | return assetPath; 34 | } 35 | 36 | // For each template in global_s3_assets ... 37 | fs.readdirSync(global_s3_assets).forEach(file => { 38 | if ( file != `${solution_name}.assets.json`) { 39 | // Import and parse template file 40 | const raw_template = fs.readFileSync(`${global_s3_assets}/${file}`); 41 | let template = JSON.parse(raw_template); 42 | 43 | //1. Clean-up parameters section 44 | setParameter(template); 45 | // setOutput(template); 46 | 47 | const resources = (template.Resources) ? template.Resources : {}; 48 | //3. Clean-up Account ID and region to enable cross account deployment 49 | const rsrctype=["AWS::Lambda::Function","AWS::Lambda::LayerVersion","Custom::CDKBucketDeployment", "AWS::CloudFormation::Stack","AWS::CloudFront::Distribution"] 50 | const focusTemplate = Object.keys(resources).filter(function(key) { 51 | return (resources[key].Type.indexOf(rsrctype) < 0) 52 | }); 53 | focusTemplate.forEach(function(f) { 54 | const fn = template.Resources[f]; 55 | if (fn.Properties.hasOwnProperty('Code') && fn.Properties.Code.hasOwnProperty('S3Bucket')) { 56 | // Set Lambda::Function S3 reference to regional folder 57 | if (! 
String(fn.Properties.Code.S3Bucket.Ref).startsWith('appcode')){ 58 | fn.Properties.Code.S3Key = `%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.Code.S3Key; 59 | fn.Properties.Code.S3Bucket = {'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}; 60 | } 61 | // Set the handler 62 | // const handler = fn.Properties.Handler; 63 | // fn.Properties.Handler = `${handler}`; 64 | } 65 | else if (fn.Properties.hasOwnProperty('Content') && fn.Properties.Content.hasOwnProperty('S3Bucket')) { 66 | // Set Lambda::LayerVersion S3 bucket reference 67 | fn.Properties.Content.S3Key = `%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.Content.S3Key; 68 | fn.Properties.Content.S3Bucket = {'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}; 69 | } 70 | else if (fn.Properties.hasOwnProperty('SourceBucketNames')) { 71 | // Set CDKBucketDeployment S3 bucket reference 72 | fn.Properties.SourceObjectKeys = [`%%SOLUTION_NAME%%/%%VERSION%%/asset`+fn.Properties.SourceObjectKeys[0]]; 73 | fn.Properties.SourceBucketNames = [{'Fn::Sub': '%%BUCKET_NAME%%-${AWS::Region}'}]; 74 | } 75 | else if (fn.Properties.hasOwnProperty('PolicyName') && fn.Properties.PolicyName.includes('CustomCDKBucketDeployment')) { 76 | // Set CDKBucketDeployment S3 bucket Policy reference 77 | fn.Properties.PolicyDocument.Statement.forEach(function(sub,i) { 78 | if (typeof(sub.Resource[i]) === 'object') { 79 | sub.Resource.forEach(function(resource){ 80 | var arrayKey = Object.keys(resource); 81 | if (typeof(resource[arrayKey][1]) === 'object') { 82 | resource[arrayKey][1].filter(function(s){ 83 | if (s.hasOwnProperty('Ref')) { 84 | fn.Properties.PolicyDocument.Statement[i].Resource = [ 85 | {"Fn::Join": ["",["arn:",{"Ref": "AWS::Partition"},":s3:::%%BUCKET_NAME%%-",{"Ref": "AWS::Region"}]]}, 86 | {"Fn::Join": ["",["arn:",{"Ref": "AWS::Partition"},":s3:::%%BUCKET_NAME%%-",{"Ref": "AWS::Region"},"/*"]]} 87 | ] 88 | } 89 | }); 90 | } 91 | }) 92 | } 93 | }); 94 | } 95 | // Set NestedStack S3 bucket reference 96 | else if (fn.Properties.hasOwnProperty('TemplateURL')) { 97 | var key=fn.Properties.TemplateURL['Fn::Join'][1][6].replace('.json','').replace('/',''); 98 | var assetPath = assetRef(key); 99 | fn.Properties.TemplateURL = { 100 | "Fn::Join": [ 101 | "", 102 | [ 103 | "https://s3.", 104 | { 105 | "Ref": "AWS::URLSuffix" 106 | }, 107 | "/" 108 | ,`%%TEMPLATE_OUTPUT_BUCKET%%/%%SOLUTION_NAME%%/%%VERSION%%/${assetPath}` 109 | ]] 110 | }; 111 | } 112 | // Set CloudFront logging bucket 113 | else if (fn.Properties.hasOwnProperty('DistributionConfig')){ 114 | fn.Properties.DistributionConfig.Logging.Bucket= { 115 | "Fn::Join": ["",[fn.Properties.DistributionConfig.Logging.Bucket['Fn::Join'][1][0], 116 | ".s3.",{"Ref": "AWS::Region"},".",{"Ref": "AWS::URLSuffix"}]] 117 | } 118 | } 119 | }); 120 | 121 | //6. 
Output modified template file 122 | const output_template = JSON.stringify(template, null, 2); 123 | fs.writeFileSync(`${global_s3_assets}/${file}`, output_template); 124 | } 125 | }); -------------------------------------------------------------------------------- /spark-on-eks/deployment/cdk-solution-helper/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "cdk-solution-helper", 3 | "version": "0.1.0", 4 | "devDependencies": { 5 | "fs": "0.0.1-security" 6 | }, 7 | "dependencies": { 8 | "fs": "0.0.1-security" 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /spark-on-eks/deployment/delete_all.sh: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | #!/bin/bash 4 | 5 | export stack_name="${1:-SparkOnEKS}" 6 | export region="${2:-us-east-1}" 7 | 8 | echo "=================================================================================================" 9 | echo " Make sure your CloudFormation stack name $stack_name is correct and exists in region: $region " 10 | echo " If you use a different name, rerun the script with the parameters:" 11 | echo " ./deployment/delete_all.sh " 12 | echo "=================================================================================================" 13 | 14 | code_bucket=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region --query "Stacks[0].Outputs[?OutputKey=='CODEBUCKET'].OutputValue" --output text) 15 | if ! [ -z "$code_bucket" ] 16 | then 17 | if ! [ -z $(aws s3api list-buckets --region $region --query 'Buckets[?Name==`'$code_bucket'`].Name' --output text) ]; then 18 | echo "Delete logs from S3" 19 | aws s3 rm s3://${code_bucket}/vpcRejectlog/ 20 | echo "Delete athena query result from S3" 21 | aws s3 rm s3://${code_bucket}/athena-query-result/ 22 | fi 23 | fi 24 | 25 | # delete glue tables 26 | tbl1=$(aws glue get-tables --region $region --database-name 'default' --query 'TableList[?starts_with(Name,`contact_snapshot`)==`true`]'.Name --output text) 27 | tbl2=$(aws glue get-tables --region $region --database-name 'default' --query 'TableList[?starts_with(Name,`deltalake_contact_jhub`)==`true`]'.Name --output text) 28 | if ! [ -z "$tbl1" ] 29 | then 30 | echo "Drop a Delta Lake table default.contact_snapshot" 31 | aws athena start-query-execution --region $region --query-string "DROP TABLE default.contact_snapshot" --result-configuration OutputLocation=s3://$code_bucket/athena-query-result 32 | fi 33 | if ! [ -z "$tbl2" ] 34 | then 35 | echo "Drop a Delta Lake table default.deltalake_contact_jhub" 36 | aws athena start-query-execution --region $region --query-string "DROP TABLE default.deltalake_contact_jhub" --result-configuration OutputLocation=s3://$code_bucket/athena-query-result 37 | fi 38 | 39 | argoALB=$(aws elbv2 describe-load-balancers --region $region --query 'LoadBalancers[?starts_with(DNSName,`k8s-argo`)==`true`].LoadBalancerArn' --output text) 40 | jhubALB=$(aws elbv2 describe-load-balancers --region $region --query 'LoadBalancers[?starts_with(DNSName,`k8s-jupyter`)==`true`].LoadBalancerArn' --output text) 41 | if ! [ -z "$argoALB" ] 42 | then 43 | echo "Delete Argo ALB" 44 | aws elbv2 delete-load-balancer --load-balancer-arn $argoALB --region $region 45 | sleep 5 46 | fi 47 | if ! 
[ -z "$jhubALB" ] 48 | then 49 | echo "Delete Jupyter ALB" 50 | aws elbv2 delete-load-balancer --load-balancer-arn $jhubALB --region $region 51 | sleep 5 52 | fi 53 | 54 | argoTG=$(aws elbv2 describe-target-groups --region $region --query 'TargetGroups[?starts_with(TargetGroupName,`k8s-argo`)==`true`].TargetGroupArn' --output text) 55 | jhubTG=$(aws elbv2 describe-target-groups --region $region --query 'TargetGroups[?starts_with(TargetGroupName,`k8s-jupyter`)==`true`].TargetGroupArn' --output text) 56 | if ! [ -z "$argoTG" ] 57 | then 58 | sleep 5 59 | echo "Delete Argo Target groups" 60 | aws elbv2 delete-target-group --target-group-arn $argoTG --region $region 61 | fi 62 | if ! [ -z "$jhubTG" ] 63 | then 64 | sleep 5 65 | echo "Delete Jupyter Target groups" 66 | aws elbv2 delete-target-group --target-group-arn $jhubTG --region $region 67 | fi 68 | 69 | # delete the rest from CF 70 | echo "Delete the rest of resources by CloudFormation delete command" 71 | aws cloudformation delete-stack --stack-name $stack_name --region $region -------------------------------------------------------------------------------- /spark-on-eks/deployment/post-deployment.sh: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | #!/bin/bash 4 | 5 | #!/bin/bash 6 | 7 | export stack_name="${1:-SparkOnEKS}" 8 | export region="${2:-us-east-1}" 9 | echo "=================================================================================================" 10 | echo " Make sure your CloudFormation stack name $stack_name is correct and exists in region: $region " 11 | echo " If you use a different name, rerun the script with the parameters:" 12 | echo " ./deployment/post-deployment.sh " 13 | echo "=================================================================================================" 14 | 15 | # 1. install k8s command tools 16 | echo "================================================================================" 17 | echo " Installing kubectl tool on Linux ..." 18 | echo " For other operating system, install the kubectl > 1.27 here:" 19 | echo " https://docs.aws.amazon.com/eks/latest/userguide/install-kubectl.html" 20 | echo "================================================================================" 21 | curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" 22 | chmod +x kubectl 23 | sudo mkdir -p /usr/local/bin && sudo mv kubectl /usr/local/bin/kubectl && export PATH=$PATH:/usr/local/bin/ 24 | echo "Installed kubectl version: " 25 | kubectl version --client 26 | 27 | echo "================================================================================================" 28 | echo " Installing argoCLI tool on Linux ..." 29 | echo " Check out https://github.com/argoproj/argo-workflows/releases for other OS type installation." 30 | echo "================================================================================================" 31 | VERSION=v3.5.4 32 | sudo curl -sLO https://github.com/argoproj/argo-workflows/releases/download/${VERSION}/argo-linux-amd64.gz && gunzip argo-linux-amd64.gz 33 | chmod +x argo-linux-amd64 && sudo mv ./argo-linux-amd64 /usr/local/bin/argo 34 | argo version --short 35 | 36 | # 2. 
connect to the EKS newly created 37 | echo `aws cloudformation describe-stacks --stack-name $stack_name --region $region --query "Stacks[0].Outputs[?starts_with(OutputKey,'eksclusterEKSConfig')].OutputValue" --output text` | bash 38 | echo "Testing EKS connection..." 39 | kubectl get svc 40 | 41 | # 3. get Jupyter Hub login 42 | LOGIN_URI=$(aws cloudformation describe-stacks --stack-name $stack_name --region $region \ 43 | --query "Stacks[0].Outputs[?OutputKey=='JUPYTERURL'].OutputValue" --output text) 44 | SEC_ID=$(aws secretsmanager list-secrets --region $region --query "SecretList[?not_null(Tags[?Value=='$stack_name'])].Name" --output text) 45 | LOGIN=$(aws secretsmanager get-secret-value --region $region --secret-id $SEC_ID --query SecretString --output text) 46 | echo -e "\n=============================== JupyterHub Login ==============================================" 47 | echo -e "\nJUPYTER_URL: $LOGIN_URI" 48 | echo "LOGIN: $LOGIN" 49 | echo "================================================================================================" -------------------------------------------------------------------------------- /spark-on-eks/images/00-deploy-to-aws.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/00-deploy-to-aws.png -------------------------------------------------------------------------------- /spark-on-eks/images/3-argo-job-dependency.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/3-argo-job-dependency.png -------------------------------------------------------------------------------- /spark-on-eks/images/3-argo-log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/3-argo-log.png -------------------------------------------------------------------------------- /spark-on-eks/images/3-argo-sidemenu.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/3-argo-sidemenu.png -------------------------------------------------------------------------------- /spark-on-eks/images/4-auto-scaling.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/4-auto-scaling.png -------------------------------------------------------------------------------- /spark-on-eks/images/4-k8s-retry.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/4-k8s-retry.png -------------------------------------------------------------------------------- /spark-on-eks/images/4-spot-console.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/4-spot-console.png -------------------------------------------------------------------------------- /spark-on-eks/images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/architecture.png -------------------------------------------------------------------------------- /spark-on-eks/images/driver_interruption_test.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/driver_interruption_test.gif -------------------------------------------------------------------------------- /spark-on-eks/images/executor_interruption_test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/executor_interruption_test.png -------------------------------------------------------------------------------- /spark-on-eks/images/fake_data.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/fake_data.gif -------------------------------------------------------------------------------- /spark-on-eks/images/run_jupyter.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/run_jupyter.gif -------------------------------------------------------------------------------- /spark-on-eks/images/submit_job_in_argo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/submit_job_in_argo.gif -------------------------------------------------------------------------------- /spark-on-eks/images/submit_native_spark.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/submit_native_spark.gif -------------------------------------------------------------------------------- /spark-on-eks/images/two_architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/sql-based-etl-on-amazon-eks/16a9ea19df38c0bf5ed11ee123e1287d53bed444/spark-on-eks/images/two_architecture.png -------------------------------------------------------------------------------- /spark-on-eks/source/app.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | #!/usr/bin/env python3 5 | from aws_cdk import (App,Tags,CfnOutput) 6 | from lib.spark_on_eks_stack import SparkOnEksStack 7 | from lib.cloud_front_stack import NestedStack 8 | 9 | app = App() 10 | eks_name = app.node.try_get_context('cluster_name') 11 | eks_stack = SparkOnEksStack(app, 'SparkOnEKS', eks_name) 12 | # The CloudFront offers a default domain name to enable HTTPS. 13 | # Recommend to issue a TLS certificate with your own domain, delete the CF nested stack 14 | cf_nested_stack = NestedStack(eks_stack,'CreateCloudFront', eks_stack.code_bucket, eks_stack.argo_url, eks_stack.jhub_url) 15 | 16 | Tags.of(eks_stack).add('project', 'sqlbasedetl') 17 | Tags.of(cf_nested_stack).add('project', 'sqlbasedetl') 18 | 19 | # Deployment Output 20 | CfnOutput(eks_stack,'CODE_BUCKET', value=eks_stack.code_bucket) 21 | CfnOutput(eks_stack,'ARGO_URL', value='https://'+ cf_nested_stack.argo_cf) 22 | CfnOutput(eks_stack,'JUPYTER_URL', value='http://'+ cf_nested_stack.jhub_cf) 23 | 24 | app.synth() 25 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/alb-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - iam:CreateServiceLinkedRole 4 | Resource: "*" 5 | Condition: 6 | StringEquals: 7 | iam:AWSServiceName: elasticloadbalancing.amazonaws.com 8 | - Effect: Allow 9 | Action: 10 | - ec2:DescribeAccountAttributes 11 | - ec2:DescribeAddresses 12 | - ec2:DescribeAvailabilityZones 13 | - ec2:DescribeInternetGateways 14 | - ec2:DescribeVpcs 15 | - ec2:DescribeVpcPeeringConnections 16 | - ec2:DescribeSubnets 17 | - ec2:DescribeSecurityGroups 18 | - ec2:DescribeInstances 19 | - ec2:DescribeNetworkInterfaces 20 | - ec2:DescribeTags 21 | - ec2:GetCoipPoolUsage 22 | - ec2:DescribeCoipPools 23 | - elasticloadbalancing:DescribeLoadBalancers 24 | - elasticloadbalancing:DescribeLoadBalancerAttributes 25 | - elasticloadbalancing:DescribeListeners 26 | - elasticloadbalancing:DescribeListenerCertificates 27 | - elasticloadbalancing:DescribeSSLPolicies 28 | - elasticloadbalancing:DescribeRules 29 | - elasticloadbalancing:DescribeTargetGroups 30 | - elasticloadbalancing:DescribeTargetGroupAttributes 31 | - elasticloadbalancing:DescribeTargetHealth 32 | - elasticloadbalancing:DescribeTags 33 | Resource: "*" 34 | - Effect: Allow 35 | Action: 36 | - cognito-idp:DescribeUserPoolClient 37 | - acm:ListCertificates 38 | - acm:DescribeCertificate 39 | - iam:ListServerCertificates 40 | - iam:GetServerCertificate 41 | - waf-regional:GetWebACL 42 | - waf-regional:GetWebACLForResource 43 | - waf-regional:AssociateWebACL 44 | - waf-regional:DisassociateWebACL 45 | - wafv2:GetWebACL 46 | - wafv2:GetWebACLForResource 47 | - wafv2:AssociateWebACL 48 | - wafv2:DisassociateWebACL 49 | - shield:GetSubscriptionState 50 | - shield:DescribeProtection 51 | - shield:CreateProtection 52 | - shield:DeleteProtection 53 | Resource: "*" 54 | - Effect: Allow 55 | Action: 56 | - ec2:AuthorizeSecurityGroupIngress 57 | - ec2:RevokeSecurityGroupIngress 58 | Resource: "*" 59 | - Effect: Allow 60 | Action: 61 | - ec2:CreateSecurityGroup 62 | Resource: "*" 63 | - Effect: Allow 64 | Action: 65 | - ec2:CreateTags 66 | Resource: arn:aws:ec2:*:*:security-group/* 67 | Condition: 68 | StringEquals: 69 | ec2:CreateAction: CreateSecurityGroup 70 | 'Null': 71 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 72 | - Effect: Allow 73 | Action: 74 | - ec2:CreateTags 75 | - 
ec2:DeleteTags 76 | Resource: arn:aws:ec2:*:*:security-group/* 77 | Condition: 78 | 'Null': 79 | aws:RequestTag/elbv2.k8s.aws/cluster: 'true' 80 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 81 | - Effect: Allow 82 | Action: 83 | - ec2:AuthorizeSecurityGroupIngress 84 | - ec2:RevokeSecurityGroupIngress 85 | - ec2:DeleteSecurityGroup 86 | Resource: "*" 87 | Condition: 88 | 'Null': 89 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 90 | - Effect: Allow 91 | Action: 92 | - elasticloadbalancing:CreateLoadBalancer 93 | - elasticloadbalancing:CreateTargetGroup 94 | Resource: "*" 95 | Condition: 96 | 'Null': 97 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 98 | - Effect: Allow 99 | Action: 100 | - elasticloadbalancing:CreateListener 101 | - elasticloadbalancing:DeleteListener 102 | - elasticloadbalancing:CreateRule 103 | - elasticloadbalancing:DeleteRule 104 | Resource: "*" 105 | - Effect: Allow 106 | Action: 107 | - elasticloadbalancing:AddTags 108 | - elasticloadbalancing:RemoveTags 109 | Resource: 110 | - arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 111 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/* 112 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/* 113 | Condition: 114 | 'Null': 115 | aws:RequestTag/elbv2.k8s.aws/cluster: 'true' 116 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 117 | - Effect: Allow 118 | Action: 119 | - elasticloadbalancing:AddTags 120 | - elasticloadbalancing:RemoveTags 121 | Resource: 122 | - arn:aws:elasticloadbalancing:*:*:listener/net/*/*/* 123 | - arn:aws:elasticloadbalancing:*:*:listener/app/*/*/* 124 | - arn:aws:elasticloadbalancing:*:*:listener-rule/net/*/*/* 125 | - arn:aws:elasticloadbalancing:*:*:listener-rule/app/*/*/* 126 | - Effect: Allow 127 | Action: 128 | - elasticloadbalancing:ModifyLoadBalancerAttributes 129 | - elasticloadbalancing:SetIpAddressType 130 | - elasticloadbalancing:SetSecurityGroups 131 | - elasticloadbalancing:SetSubnets 132 | - elasticloadbalancing:DeleteLoadBalancer 133 | - elasticloadbalancing:ModifyTargetGroup 134 | - elasticloadbalancing:ModifyTargetGroupAttributes 135 | - elasticloadbalancing:DeleteTargetGroup 136 | Resource: "*" 137 | Condition: 138 | 'Null': 139 | aws:ResourceTag/elbv2.k8s.aws/cluster: 'false' 140 | - Effect: Allow 141 | Action: 142 | - elasticloadbalancing:AddTags 143 | Resource: 144 | - arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 145 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/* 146 | - arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/* 147 | Condition: 148 | StringEquals: 149 | elasticloadbalancing:CreateAction: 150 | - CreateTargetGroup 151 | - CreateLoadBalancer 152 | 'Null': 153 | aws:RequestTag/elbv2.k8s.aws/cluster: 'false' 154 | - Effect: Allow 155 | Action: 156 | - elasticloadbalancing:RegisterTargets 157 | - elasticloadbalancing:DeregisterTargets 158 | Resource: arn:aws:elasticloadbalancing:*:*:targetgroup/*/* 159 | - Effect: Allow 160 | Action: 161 | - elasticloadbalancing:SetWebAcl 162 | - elasticloadbalancing:ModifyListener 163 | - elasticloadbalancing:AddListenerCertificates 164 | - elasticloadbalancing:RemoveListenerCertificates 165 | - elasticloadbalancing:ModifyRule 166 | Resource: "*" -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/alb-values.yaml: -------------------------------------------------------------------------------- 1 | # image: 2 | # tag: v2.2.0 3 | region: {{region_name}} 4 | vpcId: {{vpc_id}} 5 | clusterName: {{cluster_name}} 6 | serviceAccount: 7 | create: false 
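  # note (added for clarity): this service account is not created by the Helm chart; it is pre-created by the CDK stack with an IAM role for service accounts (see eks_service_account.py) and referenced here by name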
8 | name: alb-aws-load-balancer-controller 9 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/argo-values.yaml: -------------------------------------------------------------------------------- 1 | controller: 2 | workflowNamespaces: 3 | - argo 4 | init: 5 | serviceAccount: arcjob 6 | workflow: 7 | namespace: spark 8 | serviceAccount: 9 | create: false 10 | name: arcjob 11 | server: 12 | extraArgs: 13 | - --auth-mode 14 | - client 15 | ingress: 16 | enabled: true 17 | annotations: 18 | kubernetes.io/ingress.class: alb 19 | alb.ingress.kubernetes.io/scheme: internet-facing 20 | alb.ingress.kubernetes.io/target-type: ip 21 | alb.ingress.kubernetes.io/success-codes: 200,301,302 22 | alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 2746}]' 23 | alb.ingress.kubernetes.io/manage-backend-security-group-rules: "true" 24 | alb.ingress.kubernetes.io/security-groups: {{INBOUND_SG}} -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/autoscaler-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - autoscaling:DescribeAutoScalingGroups 4 | - autoscaling:DescribeAutoScalingInstances 5 | - autoscaling:DescribeLaunchConfigurations 6 | - autoscaling:DescribeTags 7 | - autoscaling:SetDesiredCapacity 8 | - autoscaling:TerminateInstanceInAutoScalingGroup 9 | - ec2:DescribeLaunchTemplateVersions 10 | Resource: 11 | - "*" 12 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/autoscaler-values.yaml: -------------------------------------------------------------------------------- 1 | autoDiscovery: 2 | clusterName: {{cluster_name}} 3 | awsRegion: {{region_name}} 4 | image: 5 | tag: v1.27.2 6 | nodeSelector: 7 | app: spark 8 | podAnnotations: 9 | cluster-autoscaler.kubernetes.io/safe-to-evict: 'false' 10 | extraArgs: 11 | skip-nodes-with-system-pods: false 12 | scale-down-unneeded-time: 5m 13 | scale-down-unready-time: 10m 14 | rbac: 15 | serviceAccount: 16 | create: false 17 | name: cluster-autoscaler 18 | 19 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/etl-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - s3:ListBucket 4 | - s3:GetBucketLocation 5 | Resource: 6 | - arn:aws:s3:::{{codeBucket}} 7 | - arn:aws:s3:::{{datalakeBucket}} 8 | - arn:aws:s3:::nyc-tlc 9 | - Effect: Allow 10 | Action: 11 | - s3:PutObject 12 | - s3:GetObject 13 | Resource: 14 | - arn:aws:s3:::{{codeBucket}}/* 15 | - arn:aws:s3:::{{datalakeBucket}}/* 16 | - arn:aws:s3:::nyc-tlc/* 17 | - Effect: Allow 18 | Action: 19 | - s3:DeleteObject 20 | Resource: 21 | - arn:aws:s3:::{{codeBucket}}/* 22 | - arn:aws:s3:::{{datalakeBucket}}/* 23 | - Effect: Allow 24 | Action: 25 | - kms:Decrypt 26 | - kms:Encrypt 27 | - kms:GenerateDataKey* 28 | - athena:StartQueryExecution 29 | - athena:GetQueryExecution 30 | - athena:GetQueryResults 31 | - glue:CreateTable 32 | - glue:CreateDatabase 33 | - glue:CreatePartition 34 | - glue:UpdatePartition 35 | - glue:GetDatabase 36 | Resource: 37 | - '*' -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/etl-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: Role 2 | apiVersion: 
rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: etl-workflow-role 5 | namespace: spark 6 | rules: 7 | - apiGroups: [""] 8 | resources: ["pods","pods/exec","configmaps","services"] 9 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 10 | - apiGroups: ["batch", "extensions"] 11 | resources: ["jobs"] 12 | verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] 13 | - apiGroups: [""] 14 | resources: ["events","pods/log","serviceaccounts", "secrets","endpoints"] 15 | verbs: ["list", "get", "watch"] 16 | - apiGroups: [""] 17 | resources: ["persistentvolumeclaims"] 18 | verbs: ["create", "delete", "get", "list"] 19 | - apiGroups: ["argoproj.io"] 20 | resources: ["workflows","workflows/finalizers"] 21 | verbs: ["*"] 22 | - apiGroups: ["argoproj.io"] 23 | resources: ["workflowtemplates","workflowtemplates/finalizers"] 24 | verbs: ["get", "list", "watch"] 25 | 26 | 27 | --- 28 | kind: RoleBinding 29 | apiVersion: rbac.authorization.k8s.io/v1 30 | metadata: 31 | name: {{MY_SA}}-role-binding 32 | namespace: spark 33 | subjects: 34 | - kind: ServiceAccount 35 | name: {{MY_SA}} 36 | namespace: spark 37 | roleRef: 38 | kind: Role 39 | name: etl-workflow-role 40 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/ex-secret-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: 3 | - secretsmanager:GetResourcePolicy 4 | - secretsmanager:GetSecretValue 5 | - secretsmanager:DescribeSecret 6 | - secretsmanager:ListSecretVersionIds 7 | Resource: {{secretsmanager}} 8 | - Effect: Allow 9 | Action: 10 | - secretsmanager:GetRandomPassword 11 | - secretsmanager:ListSecrets 12 | - kms:Decrypt 13 | - kms:Encrypt 14 | Resource: 15 | - "*" 16 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/ex-secret-values.yaml: -------------------------------------------------------------------------------- 1 | image: 2 | tag: 7.2.0 3 | env: 4 | AWS_REGION: {{region_name}} 5 | AWS_DEFAULT_REGION: {{region_name}} 6 | serviceAccount: 7 | create: false 8 | name: external-secrets-controller 9 | securityContext: 10 | fsGroup: 65534 11 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/jupyter-config.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: RoleBinding 3 | metadata: 4 | name: {{MY_SA}}-role-binding 5 | namespace: jupyter 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{MY_SA}} 9 | namespace: jupyter 10 | roleRef: 11 | kind: Role 12 | name: hub 13 | apiGroup: rbac.authorization.k8s.io 14 | 15 | --- 16 | apiVersion: networking.k8s.io/v1 17 | kind: Ingress 18 | metadata: 19 | name: jupyterhub 20 | namespace: jupyter 21 | annotations: 22 | kubernetes.io/ingress.class: alb 23 | alb.ingress.kubernetes.io/scheme: internet-facing 24 | alb.ingress.kubernetes.io/target-type: ip 25 | alb.ingress.kubernetes.io/success-codes: 200,301,302 26 | alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}]' 27 | alb.ingress.kubernetes.io/manage-backend-security-group-rules: "true" 28 | alb.ingress.kubernetes.io/security-groups: {{INBOUND_SG}} 29 | labels: 30 | app: jupyterhub 31 | spec: 32 | rules: 33 | - host: "" 34 | http: 35 | paths: 36 | - path: / 37 | pathType: Prefix 38 | backend: 39 | service: 40 | name: 
proxy-public 41 | port: 42 | number: 80 43 | 44 | --- 45 | apiVersion: kubernetes-client.io/v1 46 | kind: ExternalSecret 47 | metadata: 48 | name: jupyter-external-secret 49 | namespace: jupyter 50 | spec: 51 | backendType: secretsManager 52 | region: {{REGION}} 53 | data: 54 | - key: {{SECRET_NAME}} 55 | name: password 56 | property: password 57 | 58 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/jupyter-values.yaml: -------------------------------------------------------------------------------- 1 | hub: 2 | db: 3 | type: sqlite-memory 4 | extraConfig: 5 | overrideServiceAccount: | 6 | import os, sys 7 | 8 | c.JupyterHub.authenticator_class = 'jupyterhub.auth.DummyAuthenticator' 9 | c.DummyAuthenticator.password = os.environ['LOGIN'] 10 | c.Authenticator.admin_users = {"service-admin"} 11 | c.JupyterHub.service_tokens = { 12 | "secret-token": "service-admin", 13 | } 14 | # this script allows serviceAccountName to use dynamic naming based on {unescaped_username}" 15 | async def override_service_account_hook(kube_spawner): 16 | if kube_spawner.service_account is not None: 17 | kube_spawner.service_account = kube_spawner._expand_user_properties(kube_spawner.service_account) 18 | kube_spawner.env['USER_NAME'] = kube_spawner._expand_user_properties("{unescaped_username}") 19 | c.KubeSpawner.pre_spawn_hook = override_service_account_hook 20 | 21 | # setup timeout 22 | c.JupyterHub.cookie_max_age_days = 0.0105 23 | c.Authenticator.refresh_pre_spawn = True 24 | # c.JupyterHub.services = [ 25 | # { 26 | # "name": "idle_culler", 27 | # "admin": True, 28 | # "command": [sys.executable, "-m", "jupyterhub_idle_culler", "--timeout=1800"], 29 | # } 30 | # ] 31 | 32 | extraEnv: 33 | - name: LOGIN 34 | valueFrom: 35 | secretKeyRef: 36 | name: jupyter-external-secret 37 | key: password 38 | nodeSelector: 39 | lifecycle: Ec2Spot 40 | readinessProbe: 41 | initialDelaySeconds: 30 42 | periodSeconds: 10 43 | 44 | proxy: 45 | secretToken: "*****" 46 | service: 47 | type: ClusterIP 48 | chp: 49 | nodeSelector: 50 | lifecycle: OnDemand 51 | 52 | singleuser: 53 | defaultUrl: "/lab" 54 | nodeSelector: 55 | lifecycle: OnDemand 56 | image: 57 | name: ghcr.io/tripl-ai/arc-jupyter 58 | tag: arc-jupyter_3.14.2_scala_2.12_hadoop_3.2.0_1.1.0 59 | pullPolicy: Always 60 | lifecycleHooks: 61 | postStart: 62 | exec: 63 | command: 64 | - "bash" 65 | - "-c" 66 | - > 67 | cp -r /opt/.jupyter $HOME/.jupyter; 68 | echo "git clone https://github.com/aws-samples/sql-based-etl-on-amazon-eks/spark-on-eks"; 69 | git clone --depth 1 https://github.com/aws-samples/sql-based-etl-on-amazon-eks spark-on-eks; 70 | cd spark-on-eks; git filter-branch --prune-empty --subdirectory-filter spark-on-eks HEAD; 71 | 72 | serviceAccountName: "{username}" 73 | cpu: 74 | guarantee: 0.25 75 | limit: 0.5 76 | memory: 77 | guarantee: 4G 78 | limit: 4G 79 | extraEnv: 80 | CONF_ALLOW_EXPORT: "true" 81 | JAVA_OPTS: -Xmx4G 82 | ETL_CONF_DATALAKE_LOC: {{codeBucket}} 83 | ETL_CONF_AWS_REGION: {{region}} 84 | storage: 85 | type: none 86 | # storage: 87 | # type: dynamic 88 | # capacity: 10G 89 | # homeMountPath: '/home/{username}/data' 90 | # # mount to EBS 91 | # dynamic: 92 | # storageClass: gp2 93 | profileList: 94 | - default: True 95 | display_name: "Small (default): Arc-Jupyter Development Environment" 96 | description: "4GB Memory & 1vCPUs" 97 | kubespawner_override: 98 | cpu_guarantee: 0.5 99 | cpu_limit: 1 100 | mem_guarantee: 4G 101 | mem_limit: 10G 102 | - display_name: "Big 
Arc-Jupyter Development Environment" 103 | description: "15GB Memory & 2vCPUs" 104 | kubespawner_override: 105 | cpu_guarantee: 0.5 106 | cpu_limit: 2 107 | mem_guarantee: 10G 108 | mem_limit: 15G 109 | 110 | prePuller: 111 | hook: 112 | enabled: false 113 | 114 | # autoscaling setting 115 | scheduling: 116 | userScheduler: 117 | enabled: false 118 | 119 | cull: 120 | timeout: 1800 121 | 122 | # debug: 123 | # enabled: true 124 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/native-spark-iam-role.yaml: -------------------------------------------------------------------------------- 1 | - Effect: Allow 2 | Action: s3:ListBucket 3 | Resource: 4 | - arn:aws:s3:::{{codeBucket}} 5 | - arn:aws:s3:::{{datalakeBucket}} 6 | - arn:aws:s3:::nyc-tlc 7 | - Effect: Allow 8 | Action: 9 | - s3:PutObject 10 | - s3:GetObject 11 | Resource: 12 | - arn:aws:s3:::{{codeBucket}}/* 13 | - arn:aws:s3:::{{datalakeBucket}}/* 14 | - arn:aws:s3:::nyc-tlc/* 15 | - Effect: Allow 16 | Action: 17 | - s3:DeleteObject 18 | Resource: 19 | - arn:aws:s3:::{{codeBucket}}/* 20 | - Effect: Allow 21 | Action: 22 | - kms:Encrypt 23 | - kms:Decrypt 24 | - kms:GenerateDataKey* 25 | - kms:DescribeKey 26 | Resource: 27 | - '*' -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/native-spark-rbac.yaml: -------------------------------------------------------------------------------- 1 | kind: RoleBinding 2 | apiVersion: rbac.authorization.k8s.io/v1 3 | metadata: 4 | name: {{MY_SA}}-role-binding 5 | namespace: spark 6 | subjects: 7 | - kind: ServiceAccount 8 | name: {{MY_SA}} 9 | namespace: spark 10 | roleRef: 11 | kind: Role 12 | name: etl-workflow-role 13 | apiGroup: rbac.authorization.k8s.io -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/spark-operator-values.yaml: -------------------------------------------------------------------------------- 1 | # image: 2 | # tag: v1beta2-1.2.3-3.1.1 3 | serviceAccounts: 4 | spark: 5 | create: false 6 | sparkoperator: 7 | create: true 8 | metrics: 9 | # -- Disable prometheus metric scraping 10 | enable: false 11 | webhook: 12 | enable: true 13 | port: 443 14 | 15 | -------------------------------------------------------------------------------- /spark-on-eks/source/app_resources/spark-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: WorkflowTemplate 3 | metadata: 4 | name: spark-template 5 | namespace: spark 6 | spec: 7 | templates: 8 | - name: smalljob 9 | retryStrategy: 10 | limit: 3 11 | retryPolicy: "Always" 12 | inputs: 13 | # override defaults here 14 | parameters: 15 | - name: jobId 16 | - name: configUri 17 | - name: image 18 | value: ghcr.io/tripl-ai/arc:latest 19 | - name: pullPolicy 20 | value: "Always" 21 | - name: executorInstances 22 | value: "1" 23 | - name: executorCores 24 | value: "1" 25 | - name: executorMemory 26 | value: "1" 27 | - name: sparkConf 28 | value: "" 29 | - name: tags 30 | value: "" 31 | - name: parameters 32 | value: "" 33 | # to execute each stage in a Jupyter notebook, we can control it by matching the environment. Some stages may not be required in the prod environment. 
34 | - name: environment 35 | value: test 36 | metadata: 37 | labels: 38 | app: spark 39 | workflowId: "{{workflow.uid}}" 40 | script: 41 | resources: 42 | limits: 43 | cpu: "1" 44 | memory: "1Gi" 45 | image: "{{inputs.parameters.image}}" 46 | command: ["/bin/sh"] 47 | source: | 48 | # verbose logging 49 | set -ex 50 | 51 | # print current hostname and ip 52 | hostname 53 | hostname -I 54 | 55 | # submit job 56 | /opt/spark/bin/spark-submit \ 57 | --master k8s://kubernetes.default.svc:443 \ 58 | --deploy-mode client \ 59 | --class ai.tripl.arc.ARC \ 60 | --name arc \ 61 | --conf spark.authenticate=true \ 62 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 63 | --conf spark.driver.host=$(hostname -I) \ 64 | --conf spark.driver.memory=921m \ 65 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 66 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 67 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 68 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 69 | --conf spark.io.encryption.enabled=true \ 70 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 71 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 72 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 73 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 74 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 75 | --conf spark.kubernetes.driver.limit.cores=1 \ 76 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 77 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 78 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 79 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 80 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 81 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 82 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 83 | --conf spark.network.crypto.enabled=true \ 84 | --conf spark.sql.ansi.enabled=true \ 85 | {{inputs.parameters.sparkConf}} \ 86 | local:///opt/spark/jars/arc.jar \ 87 | --etl.config.uri={{inputs.parameters.configUri}} \ 88 | --etl.config.job.id={{inputs.parameters.jobId}} \ 89 | --etl.config.environment={{inputs.parameters.environment}} \ 90 | --etl.config.ignoreEnvironments=false \ 91 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 92 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 93 | {{inputs.parameters.parameters}} 94 | 95 | - name: mediumjob 96 | retryStrategy: 97 | limit: 3 98 | retryPolicy: "Always" 99 | inputs: 100 | # override defaults here 101 | parameters: 102 | - name: jobId 103 | - name: configUri 104 | - name: image 105 | value: ghcr.io/tripl-ai/arc:latest 106 | - name: pullPolicy 107 | value: "Always" 108 | - name: executorInstances 109 | value: "2" 110 | - name: executorCores 111 | value: "2" 112 | - name: executorMemory 113 | value: "10" 114 | - name: sparkConf 115 | value: "" 116 | - name: tags 117 | value: "" 118 | - name: parameters 119 | value: "" 120 | # to execute each stage in a Jupyter notebook, we can control it by matching the environment. Some stages may not be required in the prod environment. 
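  # e.g. an Arc notebook stage declaring "environments": ["production", "test"] only runs when the environment value below matches one of them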
121 | - name: environment 122 | value: test 123 | metadata: 124 | labels: 125 | app: spark 126 | workflowId: "{{workflow.uid}}" 127 | script: 128 | resources: 129 | limits: 130 | cpu: "2" 131 | memory: "13Gi" 132 | image: "{{inputs.parameters.image}}" 133 | command: ["/bin/sh"] 134 | source: | 135 | # verbose logging 136 | set -ex 137 | 138 | # print current hostname and ip 139 | hostname 140 | hostname -I 141 | 142 | # submit job 143 | /opt/spark/bin/spark-submit \ 144 | --master k8s://kubernetes.default.svc:443 \ 145 | --deploy-mode client \ 146 | --class ai.tripl.arc.ARC \ 147 | --name arc \ 148 | --conf spark.authenticate=true \ 149 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 150 | --conf spark.driver.host=$(hostname -I) \ 151 | --conf spark.driver.memory=2g \ 152 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 153 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 154 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 155 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 156 | --conf spark.io.encryption.enabled=true \ 157 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 158 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 159 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 160 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 161 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 162 | --conf spark.kubernetes.driver.limit.cores=1 \ 163 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 164 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 165 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 166 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 167 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 168 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 169 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 170 | --conf spark.network.crypto.enabled=true \ 171 | --conf spark.sql.ansi.enabled=true \ 172 | {{inputs.parameters.sparkConf}} \ 173 | local:///opt/spark/jars/arc.jar \ 174 | --etl.config.uri={{inputs.parameters.configUri}} \ 175 | --etl.config.job.id={{inputs.parameters.jobId}} \ 176 | --etl.config.environment={{inputs.parameters.environment}} \ 177 | --etl.config.ignoreEnvironments=false \ 178 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 179 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 180 | {{inputs.parameters.parameters}} 181 | 182 | - name: largejob 183 | retryStrategy: 184 | limit: 3 185 | retryPolicy: "Always" 186 | inputs: 187 | # override defaults here 188 | parameters: 189 | - name: jobId 190 | - name: configUri 191 | - name: image 192 | value: ghcr.io/tripl-ai/arc:latest 193 | - name: pullPolicy 194 | value: "Always" 195 | - name: executorInstances 196 | value: "3" 197 | - name: executorCores 198 | value: "2" 199 | - name: executorMemory 200 | value: "12" 201 | - name: sparkConf 202 | value: "" 203 | - name: tags 204 | value: "" 205 | - name: parameters 206 | value: "" 207 | # to execute each stage in a Jupyter notebook, we can control it by matching the 
environment. Some stages may not be required in the prod environment. 208 | - name: environment 209 | value: test 210 | metadata: 211 | labels: 212 | app: spark 213 | workflowId: "{{workflow.uid}}" 214 | script: 215 | resources: 216 | limits: 217 | cpu: "3" 218 | memory: "13Gi" 219 | image: "{{inputs.parameters.image}}" 220 | command: ["/bin/sh"] 221 | source: | 222 | # verbose logging 223 | set -ex 224 | 225 | # print current hostname and ip 226 | hostname 227 | hostname -I 228 | 229 | # submit job 230 | /opt/spark/bin/spark-submit \ 231 | --master k8s://kubernetes.default.svc:443 \ 232 | --deploy-mode client \ 233 | --class ai.tripl.arc.ARC \ 234 | --name arc \ 235 | --conf spark.authenticate=true \ 236 | --conf spark.driver.extraJavaOptions="-XX:+UseG1GC" \ 237 | --conf spark.driver.host=$(hostname -I) \ 238 | --conf spark.driver.memory=4g \ 239 | --conf spark.executor.cores={{inputs.parameters.executorCores}} \ 240 | --conf spark.executor.extraJavaOptions="-XX:+UseG1GC" \ 241 | --conf spark.executor.instances={{inputs.parameters.executorInstances}} \ 242 | --conf spark.executor.memory={{inputs.parameters.executorMemory}}G \ 243 | --conf spark.io.encryption.enabled=true \ 244 | --conf spark.kubernetes.authenticate.caCertFile=/var/run/secrets/kubernetes.io/serviceaccount/ca.crt \ 245 | --conf spark.kubernetes.authenticate.driver.serviceAccountName={{workflow.serviceAccountName}} \ 246 | --conf spark.kubernetes.authenticate.oauthTokenFile=/var/run/secrets/kubernetes.io/serviceaccount/token \ 247 | --conf spark.kubernetes.container.image.pullPolicy={{inputs.parameters.pullPolicy}} \ 248 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 249 | --conf spark.kubernetes.driver.limit.cores=1 \ 250 | --conf spark.kubernetes.driver.pod.name=$(hostname) \ 251 | --conf spark.kubernetes.executor.label.workflowId={{workflow.uid}} \ 252 | --conf spark.kubernetes.executor.limit.cores={{inputs.parameters.executorCores}} \ 253 | --conf spark.kubernetes.executor.podNamePrefix=$(hostname) \ 254 | --conf spark.kubernetes.executor.request.cores={{inputs.parameters.executorCores}} \ 255 | --conf spark.kubernetes.local.dirs.tmpfs=true \ 256 | --conf spark.kubernetes.namespace={{workflow.namespace}} \ 257 | --conf spark.network.crypto.enabled=true \ 258 | --conf spark.sql.ansi.enabled=true \ 259 | {{inputs.parameters.sparkConf}} \ 260 | local:///opt/spark/jars/arc.jar \ 261 | --etl.config.uri={{inputs.parameters.configUri}} \ 262 | --etl.config.job.id={{inputs.parameters.jobId}} \ 263 | --etl.config.environment={{inputs.parameters.environment}} \ 264 | --etl.config.ignoreEnvironments=false \ 265 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 266 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 267 | {{inputs.parameters.parameters}} 268 | 269 | - name: sparklocal 270 | inputs: 271 | retryStrategy: 272 | limit: 3 273 | retryPolicy: "Always" 274 | # override defaults here 275 | parameters: 276 | - name: jobId 277 | - name: configUri 278 | - name: image 279 | value: ghcr.io/tripl-ai/arc:latest 280 | - name: executorInstances 281 | value: "1" 282 | - name: executorCores 283 | value: "1" 284 | - name: executorMemory 285 | value: "1" 286 | - name: sparkConf 287 | value: "" 288 | - name: tags 289 | value: "" 290 | - name: parameters 291 | value: "" 292 | - name: pullPolicy 293 | value: IfNotPresent 294 | - name: environment 
295 | value: test 296 | metadata: 297 | labels: 298 | app: spark 299 | workflowId: "{{workflow.uid}}" 300 | podSpecPatch: | 301 | containers: 302 | - name: main 303 | resources: 304 | requests: 305 | cpu: "{{inputs.parameters.executorCores}}" 306 | memory: "{{inputs.parameters.executorMemory}}Gi" 307 | script: 308 | image: "{{inputs.parameters.image}}" 309 | command: ["/bin/sh"] 310 | source: | 311 | # verbose logging 312 | set -ex 313 | 314 | # print current hostname and ip 315 | hostname 316 | hostname -I 317 | 318 | # submit job 319 | # driver memory is set at 90% of executorMemory 320 | /opt/spark/bin/spark-submit \ 321 | --master local[{{inputs.parameters.executorCores}}] \ 322 | --driver-memory $(({{inputs.parameters.executorMemory}} * 1024 * 90/100))m \ 323 | --driver-java-options "-XX:+UseG1GC" \ 324 | --class ai.tripl.arc.ARC \ 325 | --name arc \ 326 | --conf spark.driver.host=$(hostname -I) \ 327 | --conf spark.driver.pod.name=$(hostname)-driver \ 328 | --conf spark.io.encryption.enabled=true \ 329 | --conf spark.sql.adaptive.enabled=true \ 330 | --conf spark.network.crypto.enabled=true \ 331 | --conf spark.ui.enabled=true \ 332 | --conf spark.sql.ansi.enabled=true \ 333 | {{inputs.parameters.sparkConf}} \ 334 | local:///opt/spark/jars/arc.jar \ 335 | --etl.config.uri={{inputs.parameters.configUri}} \ 336 | --etl.config.job.id={{inputs.parameters.jobId}} \ 337 | --etl.config.environment={{inputs.parameters.environment}} \ 338 | --etl.config.ignoreEnvironments=false \ 339 | --etl.config.tags="service=arc workflowId={{workflow.uid}} pod={{pod.name}} serviceAccount={{workflow.serviceAccountName}} namespace={{workflow.namespace}} {{inputs.parameters.tags}}" \ 340 | --ETL_CONF_EPOCH=$(date '+%s') --ETL_CONF_CURRENT_TIMESTAMP="'$(date -u '+%Y-%m-%d %H:%M:%S')'" \ 341 | {{inputs.parameters.parameters}} -------------------------------------------------------------------------------- /spark-on-eks/source/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | "cluster_name": "spark-on-eks", 5 | "solution_name": "sql-based-etl-with-apache-spark-on-amazon-eks", 6 | "version": "2.0.0", 7 | "@aws-cdk/core:stackRelativeExports": true, 8 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false 9 | } 10 | } 11 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/native-spark-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: "sparkoperator.k8s.io/v1beta2" 2 | kind: SparkApplication 3 | metadata: 4 | name: word-count 5 | namespace: spark 6 | spec: 7 | type: Python 8 | pythonVersion: "3" 9 | mode: cluster 10 | image: ghcr.io/tripl-ai/arc:arc_3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 11 | imagePullPolicy: Always 12 | mainApplicationFile: "s3a://$(BUCKET_PARAM)/app_code/job/wordcount.py" 13 | arguments: ["s3a://nyc-tlc/csv_backup/yellow_tripdata*.csv","s3a://$(BUCKET_PARAM)/app_code/output/native"] 14 | sparkVersion: "3.0.3" 15 | sparkConf: 16 | "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem" 17 | "spark.hadoop.fs.s3a.aws.credentials.provider": "com.amazonaws.auth.WebIdentityTokenCredentialsProvider" 18 | "spark.kubernetes.allocation.batch.size": "15" 19 | "spark.io.encryption.enabled": "true" 20 | "spark.kubernetes.local.dirs.tmpfs": "true" 21 | volumes: 22 | - name: spark-local-dir-1 23 | hostPath: 24 | path: "/tmp" 25 | type: Directory 26 |
dynamicAllocation: 27 | enabled: true 28 | initialExecutors: 1 29 | minExecutors: 1 30 | maxExecutors: 20 31 | restartPolicy: 32 | type: OnFailure 33 | onFailureRetries: 3 34 | onFailureRetryInterval: 10 35 | onSubmissionFailureRetries: 5 36 | onSubmissionFailureRetryInterval: 5 37 | driver: 38 | # driver run on Spot 39 | affinity: 40 | nodeAffinity: 41 | requiredDuringSchedulingIgnoredDuringExecution: 42 | nodeSelectorTerms: 43 | - matchExpressions: 44 | - key: lifecycle 45 | operator: In 46 | values: 47 | - Ec2Spot 48 | # - OnDemand 49 | env: 50 | - name: BUCKET_PARAM 51 | valueFrom: 52 | configMapKeyRef: 53 | name: special-config 54 | key: codeBucket 55 | cores: 1 56 | memory: "1G" 57 | labels: 58 | role: driver 59 | serviceAccount: nativejob 60 | volumeMounts: 61 | - name: spark-local-dir-1 62 | mountPath: "/tmp" 63 | executor: 64 | # executors run on Spot 65 | affinity: 66 | nodeAffinity: 67 | requiredDuringSchedulingIgnoredDuringExecution: 68 | nodeSelectorTerms: 69 | - matchExpressions: 70 | - key: lifecycle 71 | operator: In 72 | values: 73 | - Ec2Spot 74 | cores: 1 75 | memory: "4G" 76 | labels: 77 | role: executor 78 | volumeMounts: 79 | - name: spark-local-dir-1 80 | mountPath: "/tmp" 81 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/notebook/Spark_streaming_job.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf \n", 10 | "numRows=5\n", 11 | "streaming=false" 12 | ] 13 | }, 14 | { 15 | "cell_type": "markdown", 16 | "metadata": {}, 17 | "source": [ 18 | "# 1. Extract static data" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": null, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "{\n", 28 | " \"type\": \"DelimitedExtract\",\n", 29 | " \"name\": \"extract initial table\",\n", 30 | " \"environments\": [\"dev\", \"test\"],\n", 31 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/initial_contacts.csv\",\n", 32 | " \"schemaURI\":\"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 33 | " \"outputView\": \"initial_raw\", \n", 34 | " \"delimiter\": \"Comma\",\n", 35 | " \"header\": false,\n", 36 | " \"quote\": \"None\"\n", 37 | "}" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": {}, 44 | "outputs": [], 45 | "source": [ 46 | "{\n", 47 | " \"type\": \"TypingTransform\",\n", 48 | " \"name\": \"apply table schema 0\",\n", 49 | " \"environments\": [\"dev\", \"test\"],\n", 50 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 51 | " \"inputView\": \"initial_raw\", \n", 52 | " \"outputView\": \"initial_typed\",\n", 53 | " \"numPartitions\": 1\n", 54 | " \"persist\":true\n", 55 | "}" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": null, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "{\n", 65 | " \"type\": \"DelimitedExtract\",\n", 66 | " \"name\": \"extract updated data\",\n", 67 | " \"environments\": [\"dev\", \"test\"],\n", 68 | " \"inputURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/data/update_contacts.csv\",\n", 69 | " \"schemaURI\":\"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 70 | " \"outputView\": \"delta_raw\", \n", 71 | " \"delimiter\": \"Comma\",\n", 72 | " \"header\": false,\n", 73 | " 
\"quote\": \"None\"\n", 74 | "}" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": null, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "{\n", 84 | " \"type\": \"TypingTransform\",\n", 85 | " \"name\": \"apply table schema 0\",\n", 86 | " \"environments\": [\"dev\", \"test\"],\n", 87 | " \"schemaURI\": \"s3a://\"${ETL_CONF_DATALAKE_LOC}\"/app_code/meta/contact_meta_0.json\",\n", 88 | " \"inputView\": \"delta_raw\", \n", 89 | " \"outputView\": \"delta_typed\",\n", 90 | " \"numPartitions\": 1\n", 91 | " \"persist\":true\n", 92 | "}" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "# 2. Turn on Spark Streaming" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": {}, 106 | "outputs": [], 107 | "source": [ 108 | "%conf \n", 109 | "streaming=true\n", 110 | "streamingDuration=30" 111 | ] 112 | }, 113 | { 114 | "cell_type": "markdown", 115 | "metadata": {}, 116 | "source": [ 117 | "# 2.1 Convert static data to stream\n", 118 | "- Initial stream = Initial dataset\n", 119 | "- Delta stream = Incremental dataset" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": null, 125 | "metadata": {}, 126 | "outputs": [], 127 | "source": [ 128 | "{\n", 129 | " \"type\": \"RateExtract\",\n", 130 | " \"name\": \"create a streaming source\",\n", 131 | " \"environments\": [\n", 132 | " \"production\",\n", 133 | " \"test\"\n", 134 | " ],\n", 135 | " \"outputView\": \"initial_stream\",\n", 136 | " \"numPartitions\": 1,\n", 137 | " \"rowsPerSecond\": 5\n", 138 | "}" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": null, 144 | "metadata": {}, 145 | "outputs": [], 146 | "source": [ 147 | "%sql outputView=\"stream_a\" name=\"simulate a stream\" sqlParams=input_table=initial_typed,stream_table=initial_stream numPartitions=1\n", 148 | "\n", 149 | "SELECT *\n", 150 | "FROM ${stream_table} \n", 151 | "INNER JOIN ${input_table}\n", 152 | "ON ${input_table}._index = ${stream_table}.value" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": null, 158 | "metadata": {}, 159 | "outputs": [], 160 | "source": [ 161 | "{\n", 162 | " \"type\": \"RateExtract\",\n", 163 | " \"name\": \"create a streaming source\",\n", 164 | " \"environments\": [\n", 165 | " \"production\",\n", 166 | " \"test\"\n", 167 | " ],\n", 168 | " \"outputView\": \"delta_stream\",\n", 169 | " \"numPartitions\": 1,\n", 170 | " \"rowsPerSecond\": 5\n", 171 | "}" 172 | ] 173 | }, 174 | { 175 | "cell_type": "code", 176 | "execution_count": null, 177 | "metadata": {}, 178 | "outputs": [], 179 | "source": [ 180 | "%sql outputView=\"stream_b\" name=\"simulate b stream\" sqlParams=input_table=delta_typed,stream_table=delta_stream numPartitions=1\n", 181 | "\n", 182 | "SELECT *\n", 183 | "FROM ${stream_table} \n", 184 | "INNER JOIN ${input_table}\n", 185 | "ON ${input_table}._index = ${stream_table}.value" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": null, 191 | "metadata": {}, 192 | "outputs": [], 193 | "source": [ 194 | "%sql outputView=\"join_streams\" name=\"join two streams\"\n", 195 | "\n", 196 | "SELECT\n", 197 | " initial.id as initial_id,\n", 198 | " initial.name as initial_name,\n", 199 | " initial.email as initial_email,\n", 200 | " initial.state as initial_state,\n", 201 | " delta.email as delta_email,\n", 202 | " delta.state as delta_state\n", 203 | "FROM stream_a initial\n", 204 | "INNER JOIN stream_b delta\n", 205 | 
"ON initial.id = delta.id\n", 206 | "where initial.email<>delta.email or initial.state<>delta.state\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": null, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [] 215 | } 216 | ], 217 | "metadata": { 218 | "kernelspec": { 219 | "display_name": "Arc", 220 | "language": "javascript", 221 | "name": "arc" 222 | }, 223 | "language_info": { 224 | "file_extension": "arc", 225 | "mimetype": "text/arc", 226 | "name": "arc", 227 | "nbconvert_exporter": "text", 228 | "version": "2.4.2" 229 | } 230 | }, 231 | "nbformat": 4, 232 | "nbformat_minor": 2 233 | } 234 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/notebook/nyctaxi-job.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "%conf \n", 10 | "numRows=5\n", 11 | "showLog=true" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": null, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "%env \n", 21 | "ETL_CONF_DATA_URL=s3a://nyc-tlc/csv_backup\n", 22 | "ETL_CONF_JOB_URL=https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": null, 28 | "metadata": {}, 29 | "outputs": [], 30 | "source": [ 31 | "{\n", 32 | " \"type\": \"DelimitedExtract\",\n", 33 | " \"name\": \"extract data from green_tripdata schema 0\",\n", 34 | " \"environments\": [\"production\", \"test\"],\n", 35 | " \"inputURI\": ${ETL_CONF_DATA_URL}\"/green_tripdata_2013-08.csv\",\n", 36 | " \"outputView\": \"green_tripdata0_raw\", \n", 37 | " \"delimiter\": \"Comma\",\n", 38 | " \"quote\" : \"DoubleQuote\",\n", 39 | " \"header\": true,\n", 40 | " \"persist\": true\n", 41 | "}" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": null, 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "{\n", 51 | " \"type\": \"TypingTransform\",\n", 52 | " \"name\": \"apply green_tripdata schema 0 data types\",\n", 53 | " \"environments\": [\"production\", \"test\"],\n", 54 | " \"schemaURI\": ${ETL_CONF_JOB_URL}\"/green_tripdata0.json\",\n", 55 | " \"inputView\": \"green_tripdata0_raw\", \n", 56 | " \"outputView\": \"green_tripdata0\"\n", 57 | "}" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": null, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "%sqlvalidate name=\"ensure no errors exist after data typing\" environments=production,test\n", 67 | "SELECT\n", 68 | " SUM(error) = 0 AS valid\n", 69 | " ,TO_JSON(\n", 70 | " NAMED_STRUCT(\n", 71 | " 'count', COUNT(error), \n", 72 | " 'errors', SUM(error)\n", 73 | " )\n", 74 | " ) AS message\n", 75 | "FROM (\n", 76 | " SELECT \n", 77 | " CASE \n", 78 | " WHEN SIZE(_errors) > 0 THEN 1 \n", 79 | " ELSE 0 \n", 80 | " END AS error \n", 81 | " FROM green_tripdata0\n", 82 | ") input_table" 83 | ] 84 | }, 85 | { 86 | "cell_type": "code", 87 | "execution_count": null, 88 | "metadata": {}, 89 | "outputs": [], 90 | "source": [ 91 | "%sql name=\"ensure a query can be executed\" environments=production,test persist=true outputView=green_trip_filtered\n", 92 | "SELECT * \n", 93 | "FROM green_tripdata0\n", 94 | "WHERE store_and_fwd_flag = TRUE" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": null, 100 | "metadata": {}, 101 | "outputs": 
[], 102 | "source": [] 103 | } 104 | ], 105 | "metadata": { 106 | "kernelspec": { 107 | "display_name": "Arc", 108 | "language": "javascript", 109 | "name": "arc" 110 | }, 111 | "language_info": { 112 | "file_extension": "arc", 113 | "mimetype": "text/arc", 114 | "name": "arc", 115 | "nbconvert_exporter": "text", 116 | "version": "2.2.0" 117 | } 118 | }, 119 | "nbformat": 4, 120 | "nbformat_minor": 2 121 | } 122 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/nyctaxi-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: nyctaxi-job- 5 | namespace: spark 6 | spec: 7 | serviceAccountName: arcjob 8 | ttlStrategy: 9 | # keep workflows for 30m 10 | secondsAfterCompletion: 1800 11 | entrypoint: nyctaxi 12 | templates: 13 | - name: nyctaxi 14 | dag: 15 | tasks: 16 | - name: step1-query 17 | templateRef: 18 | name: spark-template 19 | template: sparklocal 20 | arguments: 21 | parameters: 22 | - name: jobId 23 | value: nyctaxi 24 | - name: tags 25 | value: "project=sqlbasedetl owner=myowner costcenter=66666" 26 | - name: configUri 27 | value: https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes/nyctaxi.ipynb 28 | - name: image 29 | value: ghcr.io/tripl-ai/arc:arc_3.11.1_spark_3.1.2_scala_2.12_hadoop_3.2.0_1.0.0 30 | - name: parameters 31 | value: "--ETL_CONF_DATA_URL=s3a://nyc-tlc/csv_backup --ETL_CONF_JOB_URL=https://raw.githubusercontent.com/tripl-ai/arc-starter/master/examples/kubernetes" 32 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/scd2-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: Workflow 3 | metadata: 4 | generateName: scd2-job- 5 | namespace: spark 6 | spec: 7 | serviceAccountName: arcjob 8 | entrypoint: scd2-process 9 | arguments: 10 | parameters: 11 | - name: codeBucket 12 | value: cfn_value 13 | templates: 14 | - name: scd2-process 15 | dag: 16 | tasks: 17 | - name: initial-load 18 | templateRef: 19 | name: spark-template 20 | template: smalljob 21 | arguments: 22 | parameters: 23 | - name: jobId 24 | value: initial-load 25 | - name: image 26 | value: ghcr.io/tripl-ai/arc:arc_3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 27 | - name: configUri 28 | value: "s3a://{{workflow.parameters.codeBucket}}/app_code/job/initial_load.ipynb" 29 | - name: parameters 30 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 31 | - name: delta-load 32 | templateRef: 33 | name: spark-template 34 | template: smalljob 35 | arguments: 36 | parameters: 37 | - name: jobId 38 | value: delta-load 39 | - name: image 40 | value: ghcr.io/tripl-ai/arc:arc_3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 41 | - name: configUri 42 | value: "s3a://{{workflow.parameters.codeBucket}}/app_code/job/delta_load.ipynb" 43 | - name: parameters 44 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 45 | - name: SCD2-merge 46 | dependencies: [initial-load, delta-load] 47 | templateRef: 48 | name: spark-template 49 | template: smalljob 50 | arguments: 51 | parameters: 52 | - name: jobId 53 | value: SCD2-merge 54 | - name: image 55 | value: ghcr.io/tripl-ai/arc:arc_3.10.0_spark_3.0.3_scala_2.12_hadoop_3.2.0_1.0.0 56 | - name: configUri 57 | value: 
"s3a://{{workflow.parameters.codeBucket}}/app_code/job/scd2_merge.ipynb" 58 | - name: parameters 59 | value: "--ETL_CONF_DATALAKE_LOC={{workflow.parameters.codeBucket}}" 60 | - name: sparkConf 61 | value: "--conf spark.databricks.delta.merge.repartitionBeforeWrite.enabled=true" 62 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/test/TEST-arc-jupyter.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: arc-jupyter 5 | namespace: spark 6 | spec: 7 | serviceAccountName: arcjob 8 | securityContext: 9 | fsGroup: 65534 10 | containers: 11 | - name: arc-jupyter 12 | image: ghcr.io/tripl-ai/arc-jupyter:latest 13 | imagePullPolicy: IfNotPresent 14 | env: 15 | - name: JAVA_OPTS 16 | value: "-Xmx4G" 17 | - name: CONF_NUM_ROWS 18 | value: "10" 19 | - name: CONF_STORAGE_LEVEL 20 | value: "MEMORY_ONLY_SER" 21 | - name: conf_spark_sql_extensions 22 | value: "io.delta.sql.DeltaSparkSessionExtension" 23 | resources: 24 | requests: 25 | cpu: "1" 26 | memory: "5Gi" 27 | -------------------------------------------------------------------------------- /spark-on-eks/source/example/test/TEST-cron-job-scheduler.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: argoproj.io/v1alpha1 2 | kind: CronWorkflow 3 | metadata: 4 | namespace: spark 5 | generateName: word-count- 6 | spec: 7 | schedule: "* 1 * * *" 8 | concurrencyPolicy: "Replace" 9 | startingDeadlineSeconds: 4500 10 | workflowSpec: 11 | serviceAccountName: nativejob 12 | entrypoint: spotinterruption 13 | # must complete in 4h 14 | activeDeadlineSeconds: 14400 15 | ttlStrategy: 16 | secondsAfterCompletion: 28800 17 | templates: 18 | - name: spotinterruption 19 | inputs: 20 | parameters: 21 | - name: image 22 | value: ghcr.io/tripl-ai/arc:latest 23 | script: 24 | image: "{{inputs.parameters.image}}" 25 | resources: 26 | requests: 27 | cpu: "1" 28 | memory: "1Gi" 29 | command: ["/bin/sh"] 30 | source: | 31 | # verbose logging 32 | set -ex 33 | 34 | # submit job 35 | /opt/spark/bin/spark-submit \ 36 | --master k8s://kubernetes.default.svc:443 \ 37 | --deploy-mode cluster \ 38 | --name 'Word Count' \ 39 | --conf spark.kubernetes.allocation.batch.size=10 \ 40 | --conf spark.kubernetes.container.image={{inputs.parameters.image}} \ 41 | --conf spark.kubernetes.container.image.pullPolicy=Always \ 42 | --conf spark.kubernetes.namespace=spark \ 43 | --conf spark.driver.memory=1g \ 44 | --conf spark.kubernetes.driver.request.cores=2 \ 45 | --conf spark.kubernetes.driver.limit.cores=3 \ 46 | --conf spark.executor.instances=10 \ 47 | --conf spark.executor.memory=10g \ 48 | --conf spark.kubernetes.executor.request.cores=2 \ 49 | --conf spark.kubernetes.executor.limit.cores=3 \ 50 | --conf spark.hadoop.fs.s3a.impl=org.apache.hadoop.fs.s3a.S3AFileSystem \ 51 | --conf spark.hadoop.fs.s3a.fast.upload=true \ 52 | --conf spark.hadoop.fs.s3a.aws.credentials.provider=com.amazonaws.auth.WebIdentityTokenCredentialsProvider \ 53 | # --conf spark.kubernetes.driver.podTemplateFile='s3://$(BUCKET_PARAM)/app_code/job/driver-pod-template.yaml' \ 54 | # --conf spark.kubernetes.executor.podTemplateFile='s3://$(BUCKET_PARAM)/app_code/job/executor-pod-template.yaml' \ 55 | --conf spark.kubernetes.authenticate.driver.serviceAccountName=nativejob \ 56 | "s3a://{{codeBucket}}/app_code/job/wordcount.py" \ 57 | "s3a://amazon-reviews-pds/parquet/" \ 58 | 
"s3a://{{codeBucket}}/app_code/output/native" 59 | 60 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/eks_base_app.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | from aws_cdk import Aws 4 | from constructs import Construct 5 | from aws_cdk.aws_eks import ICluster, KubernetesManifest 6 | from lib.util.manifest_reader import * 7 | import os 8 | 9 | class EksBaseAppConst(Construct): 10 | @property 11 | def secret_created(self): 12 | return self._ext_secret 13 | 14 | def __init__(self,scope: Construct, id: str, eks_cluster: ICluster, **kwargs,) -> None: 15 | super().__init__(scope, id, **kwargs) 16 | 17 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 18 | 19 | # Add ALB ingress controller to EKS 20 | self._alb = eks_cluster.add_helm_chart('ALBChart', 21 | chart='aws-load-balancer-controller', 22 | repository='https://aws.github.io/eks-charts', 23 | release='alb', 24 | version='1.5.5', 25 | create_namespace=False, 26 | namespace='kube-system', 27 | values=load_yaml_replace_var_local(source_dir+'/app_resources/alb-values.yaml', 28 | fields={ 29 | "{{region_name}}": Aws.REGION, 30 | "{{cluster_name}}": eks_cluster.cluster_name, 31 | "{{vpc_id}}": eks_cluster.vpc.vpc_id 32 | } 33 | ) 34 | ) 35 | # Add Cluster Autoscaler to EKS 36 | _var_mapping = { 37 | "{{region_name}}": Aws.REGION, 38 | "{{cluster_name}}": eks_cluster.cluster_name, 39 | } 40 | eks_cluster.add_helm_chart('ClusterAutoScaler', 41 | chart='cluster-autoscaler', 42 | repository='https://kubernetes.github.io/autoscaler', 43 | release='nodescaler', 44 | create_namespace=False, 45 | namespace='kube-system', 46 | values=load_yaml_replace_var_local(source_dir+'/app_resources/autoscaler-values.yaml',_var_mapping) 47 | ) 48 | 49 | # Add container insight (CloudWatch Log) to EKS 50 | KubernetesManifest(self,'ContainerInsight', 51 | cluster=eks_cluster, 52 | manifest=load_yaml_replace_var_remotely('https://raw.githubusercontent.com/aws-samples/amazon-cloudwatch-container-insights/latest/k8s-deployment-manifest-templates/deployment-mode/daemonset/container-insights-monitoring/quickstart/cwagent-fluentd-quickstart.yaml', 53 | fields=_var_mapping, 54 | multi_resource=True 55 | ) 56 | ) 57 | # Add external secrets controller to EKS 58 | self._ext_secret = eks_cluster.add_helm_chart('SecretContrChart', 59 | chart='kubernetes-external-secrets', 60 | repository='https://external-secrets.github.io/kubernetes-external-secrets/', 61 | release='external-secrets', 62 | create_namespace=False, 63 | namespace='kube-system', 64 | values=load_yaml_replace_var_local(source_dir+'/app_resources/ex-secret-values.yaml', 65 | fields={ 66 | '{{region_name}}': Aws.REGION 67 | } 68 | ) 69 | ) 70 | self._ext_secret.node.add_dependency(self._alb) 71 | # Add Spark Operator to EKS 72 | eks_cluster.add_helm_chart('SparkOperatorChart', 73 | chart='spark-operator', 74 | repository='https://kubeflow.github.io/spark-operator', 75 | release='spark-operator', 76 | version='1.1.27', 77 | create_namespace=True, 78 | values=load_yaml_replace_var_local(source_dir+'/app_resources/spark-operator-values.yaml',fields={'':''}) 79 | ) -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/eks_cluster.py: -------------------------------------------------------------------------------- 1 
| # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import (aws_eks as eks,aws_ec2 as ec2) 5 | from aws_cdk.aws_iam import IRole 6 | from constructs import Construct 7 | from aws_cdk.lambda_layer_kubectl_v27 import KubectlV27Layer 8 | 9 | class EksConst(Construct): 10 | 11 | @property 12 | def my_cluster(self): 13 | return self._my_cluster 14 | 15 | def __init__(self,scope: Construct, id:str, eksname: str, eksvpc: ec2.IVpc, noderole: IRole, eks_adminrole: IRole, **kwargs) -> None: 16 | super().__init__(scope, id, **kwargs) 17 | 18 | # 1. Create EKS cluster without node group 19 | self._my_cluster = eks.Cluster(self,'EKS', 20 | vpc= eksvpc, 21 | cluster_name=eksname, 22 | masters_role=eks_adminrole, 23 | output_cluster_name=True, 24 | version= eks.KubernetesVersion.V1_27, 25 | endpoint_access= eks.EndpointAccess.PUBLIC_AND_PRIVATE, 26 | default_capacity=0, 27 | kubectl_layer=KubectlV27Layer(self, 'kubectlV27Layer') 28 | ) 29 | 30 | # 2. Add Managed NodeGroup to EKS as the compute resource to run Spark jobs 31 | _managed_node = self._my_cluster.add_nodegroup_capacity('onDemand-mn', 32 | nodegroup_name = 'etl-ondemand', 33 | node_role = noderole, 34 | desired_size = 1, 35 | max_size = 5, 36 | disk_size = 50, 37 | instance_types = [ec2.InstanceType('m5.xlarge')], 38 | labels = {'app':'spark', 'lifecycle':'OnDemand'}, 39 | subnets = ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS,one_per_az=True), 40 | tags = {'Name':'OnDemand-'+eksname,'k8s.io/cluster-autoscaler/enabled': 'true', 'k8s.io/cluster-autoscaler/'+eksname: 'owned'} 41 | ) 42 | 43 | 44 | # 3. Add Spot managed NodeGroup to EKS (run Spark executors on Spot) 45 | _spot_node = self._my_cluster.add_nodegroup_capacity('spot-mn', 46 | nodegroup_name = 'etl-spot', 47 | node_role = noderole, 48 | desired_size = 1, 49 | max_size = 30, 50 | disk_size = 50, 51 | instance_types=[ec2.InstanceType("r5.xlarge"),ec2.InstanceType("r4.xlarge"),ec2.InstanceType("r5a.xlarge")], 52 | labels = {'app':'spark', 'lifecycle':'Ec2Spot'}, 53 | capacity_type=eks.CapacityType.SPOT, 54 | tags = {'Name':'Spot-'+eksname, 'k8s.io/cluster-autoscaler/enabled': 'true', 'k8s.io/cluster-autoscaler/'+eksname: 'owned'} 55 | ) 56 | 57 | # # 4. Add Fargate profile to EKS, without setting up cluster-autoscaler 58 | # self._my_cluster.add_fargate_profile('FargateEnabled', 59 | # selectors =[{ 60 | # "namespace": "spark" 61 | # }], 62 | # fargate_profile_name='sparkETL' 63 | # ) 64 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/eks_service_account.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import aws_iam as iam 5 | from constructs import Construct 6 | from aws_cdk.aws_secretsmanager import ISecret 7 | from aws_cdk.aws_eks import ICluster 8 | from lib.util.manifest_reader import * 9 | import os 10 | 11 | class EksSAConst(Construct): 12 | 13 | def __init__(self,scope: Construct, id:str, eks_cluster: ICluster, secret: ISecret, **kwargs,) -> None: 14 | super().__init__(scope, id, **kwargs) 15 | 16 | # //************************************v*************************************************************// 17 | # //***************************** SERVICE ACCOUNT, RBAC and IAM ROLES *******************************// 18 | # //****** Associating IAM role to K8s Service Account to provide fine-grain security control ******// 19 | # //***********************************************************************************************// 20 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 21 | 22 | # Cluster Auto-scaler 23 | self._scaler_sa = eks_cluster.add_service_account('AutoScalerSa', 24 | name='cluster-autoscaler', 25 | namespace='kube-system' 26 | ) 27 | _scaler_role = load_yaml_local(source_dir+'/app_resources/autoscaler-iam-role.yaml') 28 | for statmt in _scaler_role: 29 | self._scaler_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) 30 | 31 | # ALB Ingress 32 | self._alb_sa = eks_cluster.add_service_account('ALBServiceAcct', 33 | name='alb-aws-load-balancer-controller', 34 | namespace='kube-system' 35 | ) 36 | _alb_role = load_yaml_local(source_dir+'/app_resources/alb-iam-role.yaml') 37 | for statmt in _alb_role: 38 | self._alb_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) 39 | 40 | # External secret controller 41 | self._secrets_sa = eks_cluster.add_service_account('ExSecretController', 42 | name='external-secrets-controller', 43 | namespace="kube-system" 44 | ) 45 | self._secrets_sa.node.add_dependency(secret) 46 | _secrets_role = load_yaml_replace_var_local(source_dir+'/app_resources/ex-secret-iam-role.yaml', 47 | fields={"{{secretsmanager}}": secret.secret_arn+"*"} 48 | ) 49 | for statmt in _secrets_role: 50 | self._secrets_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmt)) -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/iam_roles.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | import typing 5 | 6 | from aws_cdk import (Tags, aws_iam as iam) 7 | from typing import List 8 | from constructs import Construct 9 | 10 | class IamConst(Construct): 11 | 12 | @property 13 | def managed_node_role(self): 14 | return self._managed_node_role 15 | 16 | @property 17 | def admin_role(self): 18 | return self._clusterAdminRole 19 | 20 | def __init__(self,scope: Construct, id:str, cluster_name:str, **kwargs,) -> None: 21 | super().__init__(scope, id, **kwargs) 22 | 23 | # EKS admin role 24 | self._clusterAdminRole = iam.Role(self, 'clusterAdmin', 25 | assumed_by= iam.AccountRootPrincipal() 26 | ) 27 | self._clusterAdminRole.add_to_policy(iam.PolicyStatement( 28 | resources=["*"], 29 | actions=[ 30 | "eks:Describe*", 31 | "eks:List*", 32 | "eks:AccessKubernetesApi", 33 | "ssm:GetParameter", 34 | "iam:ListRoles" 35 | ], 36 | )) 37 | Tags.of(self._clusterAdminRole).add( 38 | key='eks/%s/type' % cluster_name, 39 | value='admin-role' 40 | ) 41 | 42 | # Managed Node Group Instance Role 43 | _managed_node_managed_policies = ( 44 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEKSWorkerNodePolicy'), 45 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEKS_CNI_Policy'), 46 | iam.ManagedPolicy.from_aws_managed_policy_name('AmazonEC2ContainerRegistryReadOnly'), 47 | iam.ManagedPolicy.from_aws_managed_policy_name('CloudWatchAgentServerPolicy'), 48 | ) 49 | self._managed_node_role = iam.Role(self,'NodeInstance-Role', 50 | role_name= cluster_name + '-NodeInstanceRole', 51 | path='/', 52 | assumed_by=iam.ServicePrincipal('ec2.amazonaws.com'), 53 | managed_policies=list(_managed_node_managed_policies), 54 | ) -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/network_sg.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import (Tags, aws_ec2 as ec2) 5 | from constructs import Construct 6 | import lib.util.get_aws_managed_prefix as custom 7 | 8 | class NetworkSgConst(Construct): 9 | 10 | @property 11 | def vpc(self): 12 | return self._vpc 13 | 14 | @property 15 | def alb_jhub_sg(self): 16 | return self._alb_jhub_sg 17 | @property 18 | def alb_argo_sg(self): 19 | return self._alb_argo_sg 20 | 21 | # @property 22 | # def efs_sg(self): 23 | # return self._eks_efs_sg 24 | 25 | 26 | def __init__(self,scope: Construct, id:str, eksname:str, codebucket: str, **kwargs) -> None: 27 | super().__init__(scope, id, **kwargs) 28 | 29 | # //*************************************************// 30 | # //******************* NETWORK ********************// 31 | # //************************************************// 32 | # create VPC 33 | self._vpc = ec2.Vpc(self, 'eksVpc',max_azs=2, nat_gateways=1) 34 | Tags.of(self._vpc).add('Name', eksname + 'EksVpc') 35 | 36 | # ALB security group for Jupyter & Argo 37 | prefixlist_peer=ec2.Peer.prefix_list( 38 | custom.AwsManagedPrefixList(self,'cr-getprefixId', 39 | custom.AwsManagedPrefixListProps(name='com.amazonaws.global.cloudfront.origin-facing') 40 | ).prefixlist_id 41 | ) 42 | self._alb_jhub_sg=ec2.SecurityGroup(self,'JupyterALBInboundSG', vpc=self._vpc,description='Security Group for Jupyter ALB') 43 | self._alb_argo_sg=ec2.SecurityGroup(self,'ArgoALBInboundSG', vpc=self._vpc,description='Security Group for Argo ALB') 44 | self._alb_jhub_sg.add_ingress_rule(prefixlist_peer,ec2.Port.tcp(port=80)) 45 | self._alb_argo_sg.add_ingress_rule(prefixlist_peer,ec2.Port.tcp(port=2746)) 46 | Tags.of(self._alb_jhub_sg).add('Name','SparkOnEKS-JhubSg') 47 | Tags.of(self._alb_argo_sg).add('Name','SparkOnEKS-ArgoSg') 48 | 49 | # VPC endpoint security group 50 | self._vpc_endpoint_sg = ec2.SecurityGroup(self,'EndpointSg', 51 | vpc=self._vpc, 52 | description='Security Group for Endpoint', 53 | ) 54 | self._vpc_endpoint_sg.add_ingress_rule(ec2.Peer.ipv4(self._vpc.vpc_cidr_block),ec2.Port.tcp(port=443)) 55 | Tags.of(self._vpc_endpoint_sg).add('Name','SparkOnEKS-VPCEndpointSg') 56 | 57 | # Add VPC endpoint 58 | self._vpc.add_gateway_endpoint("S3GatewayEndpoint", 59 | service=ec2.GatewayVpcEndpointAwsService.S3, 60 | subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PUBLIC), 61 | ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE_WITH_EGRESS)]) 62 | 63 | # self._vpc.add_interface_endpoint("EcrDockerEndpoint",service=ec2.InterfaceVpcEndpointAwsService.ECR_DOCKER, security_groups=[self._vpc_endpoint_sg]) 64 | self._vpc.add_interface_endpoint("CWLogsEndpoint", service=ec2.InterfaceVpcEndpointAwsService.CLOUDWATCH_LOGS,security_groups=[self._vpc_endpoint_sg]) 65 | self._vpc.add_interface_endpoint("AthenaEndpoint", service=ec2.InterfaceVpcEndpointAwsService.ATHENA,security_groups=[self._vpc_endpoint_sg]) 66 | self._vpc.add_interface_endpoint("KMSEndpoint", service=ec2.InterfaceVpcEndpointAwsService.KMS,security_groups=[self._vpc_endpoint_sg]) -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/s3_app_code.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import (RemovalPolicy, aws_s3 as s3, aws_s3_deployment as s3deploy) 5 | from constructs import Construct 6 | import os 7 | 8 | class S3AppCodeConst(Construct): 9 | 10 | @property 11 | def code_bucket(self): 12 | return self._code_bucket 13 | 14 | def __init__(self,scope: Construct, id: str, **kwargs,) -> None: 15 | super().__init__(scope, id, **kwargs) 16 | 17 | # Upload application code to S3 bucket 18 | artifact_bucket=s3.Bucket(self, id, 19 | block_public_access=s3.BlockPublicAccess.BLOCK_ALL, 20 | encryption=s3.BucketEncryption.KMS_MANAGED, 21 | removal_policy=RemovalPolicy.DESTROY, 22 | auto_delete_objects=True, 23 | access_control = s3.BucketAccessControl.LOG_DELIVERY_WRITE, 24 | object_ownership=s3.ObjectOwnership.OBJECT_WRITER 25 | ) 26 | 27 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0] 28 | s3deploy.BucketDeployment(self, "DeployCode", 29 | sources=[s3deploy.Source.asset(source_dir+'/deployment/app_code')], 30 | destination_bucket= artifact_bucket, 31 | destination_key_prefix="app_code" 32 | ) 33 | self._code_bucket = artifact_bucket.bucket_name 34 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cdk_infra/spark_permission.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import (aws_iam as iam) 5 | from constructs import Construct 6 | from aws_cdk.aws_eks import ICluster, KubernetesManifest 7 | from lib.util.manifest_reader import load_yaml_replace_var_local 8 | import os 9 | 10 | class SparkOnEksSAConst(Construct): 11 | 12 | @property 13 | def jupyter_sa(self): 14 | return self._jupyter_sa.service_account_name 15 | 16 | def __init__(self,scope: Construct, id: str, 17 | eks_cluster: ICluster, 18 | login_name: str, 19 | code_bucket: str, 20 | datalake_bucket: str, 21 | **kwargs,) -> None: 22 | super().__init__(scope, id, **kwargs) 23 | 24 | # //******************************************************************************************// 25 | # //************************ SETUP PERMISSION FOR ARC SPARK JOBS ****************************// 26 | # //******* create k8s namespace, service account, and IAM role for service account ********// 27 | # //***************************************************************************************// 28 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 29 | 30 | # create k8s namespace 31 | etl_ns = eks_cluster.add_manifest('SparkNamespace',{ 32 | "apiVersion": "v1", 33 | "kind": "Namespace", 34 | "metadata": { 35 | "name": "spark", 36 | "labels": {"name":"spark"} 37 | } 38 | } 39 | ) 40 | jupyter_ns = eks_cluster.add_manifest('jhubNamespace',{ 41 | "apiVersion": "v1", 42 | "kind": "Namespace", 43 | "metadata": { 44 | "name": "jupyter", 45 | "labels": {"name":"spark"} 46 | } 47 | } 48 | ) 49 | 50 | # create k8s service account 51 | self._etl_sa = eks_cluster.add_service_account('ETLSa', 52 | name='arcjob', 53 | namespace='spark' 54 | ) 55 | self._etl_sa.node.add_dependency(etl_ns) 56 | 57 | _etl_rb = KubernetesManifest(self,'ETLRoleBinding', 58 | cluster=eks_cluster, 59 | manifest=load_yaml_replace_var_local(source_dir+'/app_resources/etl-rbac.yaml', 60 | fields= { 61 | "{{MY_SA}}": self._etl_sa.service_account_name 62 | }, 63 | multi_resource=True) 64 | ) 65 | _etl_rb.node.add_dependency(self._etl_sa) 66 | 67 | self._jupyter_sa = 
eks_cluster.add_service_account('jhubServiceAcct', 68 | name=login_name, 69 | namespace='jupyter' 70 | ) 71 | self._jupyter_sa.node.add_dependency(jupyter_ns) 72 | 73 | # Associate AWS IAM role to K8s Service Account 74 | datalake_bucket=code_bucket if not datalake_bucket.strip() else datalake_bucket 75 | _bucket_setting={ 76 | "{{codeBucket}}": code_bucket, 77 | "{{datalakeBucket}}": datalake_bucket 78 | } 79 | _etl_iam = load_yaml_replace_var_local(source_dir+'/app_resources/etl-iam-role.yaml',fields=_bucket_setting) 80 | for statmnt in _etl_iam: 81 | self._etl_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 82 | self._jupyter_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 83 | 84 | # # //*************************************************************************************// 85 | # # //******************** SETUP PERMISSION FOR NATIVE SPARK JOBS **********************// 86 | # # //***********************************************************************************// 87 | self._spark_sa = eks_cluster.add_service_account('NativeSparkSa', 88 | name='nativejob', 89 | namespace='spark' 90 | ) 91 | self._spark_sa.node.add_dependency(etl_ns) 92 | 93 | _spark_rb = eks_cluster.add_manifest('sparkRoleBinding', 94 | load_yaml_replace_var_local(source_dir+'/app_resources/native-spark-rbac.yaml', 95 | fields= { 96 | "{{MY_SA}}": self._spark_sa.service_account_name 97 | }) 98 | ) 99 | _spark_rb.node.add_dependency(self._spark_sa) 100 | 101 | _native_spark_iam = load_yaml_replace_var_local(source_dir+'/app_resources/native-spark-iam-role.yaml',fields=_bucket_setting) 102 | for statmnt in _native_spark_iam: 103 | self._spark_sa.add_to_principal_policy(iam.PolicyStatement.from_json(statmnt)) 104 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/cloud_front_stack.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import ( 5 | NestedStack, Fn, 6 | aws_cloudfront_origins as origins, 7 | aws_cloudfront as cf, 8 | aws_elasticloadbalancingv2 as alb, 9 | aws_s3 as s3 10 | ) 11 | from constructs import Construct 12 | 13 | class NestedStack(NestedStack): 14 | 15 | @property 16 | def jhub_cf(self): 17 | return self._jhub_cf 18 | 19 | @property 20 | def argo_cf(self): 21 | return self._argo_cf 22 | 23 | def __init__(self, scope: Construct, id: str,logbucket: str,argo_alb_dns_name: str, jhub_alb_dns_name: str, **kwargs) -> None: 24 | super().__init__(scope, id, **kwargs) 25 | 26 | # //**********************************************************************************************************// 27 | # //*************************** Add CloudFront to enable HTTPS Endpoint (OPTIONAL) **************************// 28 | # //***** The recommended approach is to generate your own SSL certificate via AWS Certificate Manager ******// 29 | # //*************************** and attach it to the Application Load Balancer *****************************// 30 | # //*******************************************************************************************************// 31 | self._bucket=s3.Bucket.from_bucket_name(self,'cf_logbucket', logbucket) 32 | self._jhub_cf = add_distribution(self, 'jhub_dist', jhub_alb_dns_name, 80, self._bucket) 33 | self._argo_cf = add_distribution(self, 'argo_dist', argo_alb_dns_name, 2746, self._bucket) 34 | 35 | 36 | def add_distribution(scope: Construct, id: str, alb_dns_name: str, port: int, logbucket: s3.IBucket 37 | ) -> str: 38 | # Front the ALB with a CloudFront distribution and return the distribution's domain name (a string) 39 | load_balancer_arn=Fn.get_att(alb_dns_name,"DNSName") 40 | security_group_id=Fn.get_att(alb_dns_name,"SecurityGroups") 41 | 42 | alb2 = alb.ApplicationLoadBalancer.from_application_load_balancer_attributes(scope, id, 43 | load_balancer_arn=load_balancer_arn.to_string(), 44 | security_group_id=security_group_id.to_string(), 45 | load_balancer_dns_name=alb_dns_name 46 | ) 47 | _origin = origins.LoadBalancerV2Origin(alb2, 48 | http_port=port, 49 | protocol_policy=cf.OriginProtocolPolicy.HTTP_ONLY 50 | ) 51 | dist = cf.Distribution(scope, "CF-"+id, 52 | default_behavior={ 53 | "origin": _origin, 54 | "allowed_methods": cf.AllowedMethods.ALLOW_ALL, 55 | "cache_policy": cf.CachePolicy.CACHING_DISABLED, 56 | "origin_request_policy": cf.OriginRequestPolicy.ALL_VIEWER, 57 | "viewer_protocol_policy": cf.ViewerProtocolPolicy.REDIRECT_TO_HTTPS 58 | }, 59 | minimum_protocol_version=cf.SecurityPolicyProtocol.TLS_V1_2_2019, 60 | enable_logging=True, 61 | log_bucket=logbucket 62 | ) 63 | return dist.distribution_domain_name 64 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/spark_on_eks_stack.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import (Stack, Duration, RemovalPolicy, Aws, Fn, CfnParameter, aws_eks as eks,aws_secretsmanager as secmger,aws_kms as kms) 5 | from constructs import Construct 6 | from lib.cdk_infra.network_sg import NetworkSgConst 7 | from lib.cdk_infra.iam_roles import IamConst 8 | from lib.cdk_infra.eks_cluster import EksConst 9 | from lib.cdk_infra.eks_service_account import EksSAConst 10 | from lib.cdk_infra.eks_base_app import EksBaseAppConst 11 | from lib.cdk_infra.s3_app_code import S3AppCodeConst 12 | from lib.cdk_infra.spark_permission import SparkOnEksSAConst 13 | from lib.util.manifest_reader import * 14 | import json,os 15 | 16 | class SparkOnEksStack(Stack): 17 | 18 | @property 19 | def code_bucket(self): 20 | return self.app_s3.code_bucket 21 | 22 | @property 23 | def argo_url(self): 24 | return self._argo_alb.value 25 | 26 | @property 27 | def jhub_url(self): 28 | return self._jhub_alb.value 29 | 30 | def __init__(self, scope: Construct, id: str, eksname: str, **kwargs) -> None: 31 | super().__init__(scope, id, **kwargs) 32 | 33 | source_dir=os.path.split(os.environ['VIRTUAL_ENV'])[0]+'/source' 34 | 35 | # Cloudformation input params 36 | datalake_bucket = CfnParameter(self, "datalakebucket", type="String", 37 | description="Your existing S3 bucket to be accessed by Jupyter Notebook and ETL job. Default: blank", 38 | default="" 39 | ) 40 | login_name="sparkoneks" 41 | # login_name = CfnParameter(self, "jhubuser", type="String", 42 | # description="Your username login to jupyter hub.", 43 | # default="sparkoneks" 44 | # ) 45 | 46 | # Auto-generate a user login in secrets manager 47 | key = kms.Key(self, 'KMSKey',removal_policy=RemovalPolicy.DESTROY,enable_key_rotation=True) 48 | key.add_alias("alias/secretsManager") 49 | jhub_secret = secmger.Secret(self, 'jHubPwd', 50 | generate_secret_string=secmger.SecretStringGenerator( 51 | exclude_punctuation=True, 52 | secret_string_template=json.dumps({'username': login_name}), 53 | # secret_string_template=json.dumps({'username': login_name.value_as_string}), 54 | generate_string_key="password"), 55 | removal_policy=RemovalPolicy.DESTROY, 56 | encryption_key=key 57 | ) 58 | 59 | # A new bucket to store app code and access logs 60 | self.app_s3 = S3AppCodeConst(self,'appcode') 61 | 62 | # 1. Setup EKS base infrastructure 63 | network_sg = NetworkSgConst(self,'network-sg', eksname, self.app_s3.code_bucket) 64 | iam = IamConst(self,'iam_roles', eksname) 65 | eks_cluster = EksConst(self,'eks_cluster', eksname, network_sg.vpc, iam.managed_node_role, iam.admin_role) 66 | EksSAConst(self, 'eks_sa', eks_cluster.my_cluster, jhub_secret) 67 | base_app=EksBaseAppConst(self, 'eks_base_app', eks_cluster.my_cluster) 68 | 69 | # 2. Setup Spark application access control 70 | app_security = SparkOnEksSAConst(self,'spark_service_account', 71 | eks_cluster.my_cluster, 72 | login_name, 73 | # login_name.value_as_string, 74 | self.app_s3.code_bucket, 75 | datalake_bucket.value_as_string 76 | ) 77 | app_security.node.add_dependency(base_app.secret_created) 78 | # 3. 
Install Arc Jupyter notebook as the Spark ETL IDE 79 | jhub_install= eks_cluster.my_cluster.add_helm_chart('JHubChart', 80 | chart='jupyterhub', 81 | repository='https://jupyterhub.github.io/helm-chart', 82 | release='jupyterhub', 83 | version='1.2.0', 84 | namespace='jupyter', 85 | create_namespace=False, 86 | values=load_yaml_replace_var_local(source_dir+'/app_resources/jupyter-values.yaml', 87 | fields={ 88 | "{{codeBucket}}": self.app_s3.code_bucket, 89 | "{{region}}": Aws.REGION 90 | }) 91 | ) 92 | jhub_install.node.add_dependency(app_security) 93 | 94 | # get Arc Jupyter login from secrets manager 95 | name_parts= Fn.split('-',jhub_secret.secret_name) 96 | name_no_suffix=Fn.join('-',[Fn.select(0, name_parts), Fn.select(1, name_parts)]) 97 | config_hub = eks.KubernetesManifest(self,'JHubConfig', 98 | cluster=eks_cluster.my_cluster, 99 | manifest=load_yaml_replace_var_local(source_dir+'/app_resources/jupyter-config.yaml', 100 | fields= { 101 | "{{MY_SA}}": app_security.jupyter_sa, 102 | "{{REGION}}": Aws.REGION, 103 | "{{SECRET_NAME}}": name_no_suffix, 104 | "{{INBOUND_SG}}": network_sg.alb_jhub_sg.security_group_id 105 | }, 106 | multi_resource=True) 107 | ) 108 | config_hub.node.add_dependency(jhub_install) 109 | 110 | # 4. Install ETL orchestrator - Argo 111 | # can be replaced by another workflow tool, e.g. Airflow 112 | argo_install = eks_cluster.my_cluster.add_helm_chart('ARGOChart', 113 | chart='argo-workflows', 114 | repository='https://argoproj.github.io/argo-helm', 115 | release='argo', 116 | version='0.40.7', 117 | namespace='argo', 118 | create_namespace=True, 119 | values=load_yaml_replace_var_local(source_dir+'/app_resources/argo-values.yaml', 120 | fields= { 121 | "{{INBOUND_SG}}": network_sg.alb_argo_sg.security_group_id 122 | }) 123 | ) 124 | argo_install.node.add_dependency(config_hub) 125 | # Create a Spark workflow template with different T-shirt sizes 126 | submit_tmpl = eks_cluster.my_cluster.add_manifest('SubmitSparkWrktmpl', 127 | load_yaml_local(source_dir+'/app_resources/spark-template.yaml') 128 | ) 129 | submit_tmpl.node.add_dependency(argo_install) 130 | 131 | # 5. (OPTIONAL) Retrieve the ALB DNS name to enable CloudFront in the following nested stack. 132 | # It is recommended to remove the CloudFront component 133 | # and set up a TLS certificate with your own domain name. 
134 | self._jhub_alb=eks.KubernetesObjectValue(self, 'jhubALB', 135 | cluster=eks_cluster.my_cluster, 136 | json_path='..status.loadBalancer.ingress[0].hostname', 137 | object_type='ingress.networking', 138 | object_name='jupyterhub', 139 | object_namespace='jupyter', 140 | timeout=Duration.minutes(10) 141 | ) 142 | self._jhub_alb.node.add_dependency(config_hub) 143 | self._argo_alb = eks.KubernetesObjectValue(self, 'argoALB', 144 | cluster=eks_cluster.my_cluster, 145 | json_path='..status.loadBalancer.ingress[0].hostname', 146 | object_type='ingress.networking', 147 | object_name='argo-argo-workflows-server', 148 | object_namespace='argo', 149 | timeout=Duration.minutes(10) 150 | ) 151 | self._argo_alb.node.add_dependency(argo_install) 152 | 153 | -------------------------------------------------------------------------------- /spark-on-eks/source/lib/util/get_aws_managed_prefix.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import (Aws, aws_ec2 as ec2,aws_iam as iam, Fn) 2 | from aws_cdk.custom_resources import ( 3 | AwsCustomResource, 4 | AwsCustomResourcePolicy, 5 | PhysicalResourceId, 6 | AwsSdkCall 7 | ) 8 | from constructs import Construct 9 | 10 | class AwsManagedPrefixListProps: 11 | def __init__(self, name: str): 12 | """ 13 | Name of the AWS managed prefix list. 14 | See: https://docs.aws.amazon.com/vpc/latest/userguide/working-with-aws-managed-prefix-lists.html#available-aws-managed-prefix-lists 15 | eg. com.amazonaws.global.cloudfront.origin-facing 16 | """ 17 | self.name = name 18 | 19 | class AwsManagedPrefixList(Construct): 20 | def __init__(self, scope: Construct, id: str, props: AwsManagedPrefixListProps): 21 | super().__init__(scope, id) 22 | res = AwsCustomResource( 23 | self, 'AWSCustomResource', 24 | on_create=self.create(props), 25 | policy=AwsCustomResourcePolicy.from_statements([ 26 | iam.PolicyStatement( 27 | effect=iam.Effect.ALLOW, 28 | actions=['ec2:DescribeManagedPrefixLists'], 29 | resources=['*'], 30 | ), 31 | ]) 32 | ) 33 | self.prefixlist_id=res.get_response_field("PrefixLists.0.PrefixListId") 34 | 35 | def create(self, props): 36 | custom_params = { 37 | 'Filters': [ 38 | { 39 | 'Name': 'prefix-list-name', 40 | 'Values': [props.name], 41 | }, 42 | ] 43 | } 44 | 45 | return AwsSdkCall( 46 | service='EC2', 47 | action='describeManagedPrefixLists', 48 | parameters=custom_params, 49 | physical_resource_id=PhysicalResourceId.of(f"{id}-{Fn.select(0, Fn.split(':', self.node.addr))}"), 50 | region=Aws.REGION 51 | ) -------------------------------------------------------------------------------- /spark-on-eks/source/lib/util/manifest_reader.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # // SPDX-License-Identifier: MIT-0 3 | 4 | import yaml 5 | import urllib.request as request 6 | import os.path as path 7 | import sys 8 | 9 | def load_yaml_remotely(url, multi_resource=False): 10 | try: 11 | file_to_parse = request.urlopen(url) 12 | if multi_resource: 13 | yaml_data = list(yaml.full_load_all(file_to_parse)) 14 | else: 15 | yaml_data = yaml.full_load(file_to_parse) 16 | # print(yaml_data) 17 | except: 18 | print("Cannot read yaml config file {}, check formatting." 
19 | "".format(file_to_parse)) 20 | sys.exit(1) 21 | 22 | return yaml_data 23 | 24 | def load_yaml_local(yaml_file, multi_resource=False): 25 | 26 | file_to_parse=path.join(path.dirname(__file__), yaml_file) 27 | if not path.exists(file_to_parse): 28 | print("The file {} does not exist" 29 | "".format(file_to_parse)) 30 | sys.exit(1) 31 | 32 | try: 33 | with open(file_to_parse, 'r') as yaml_stream: 34 | if multi_resource: 35 | yaml_data = list(yaml.full_load_all(yaml_stream)) 36 | else: 37 | yaml_data = yaml.full_load(yaml_stream) 38 | # print(yaml_data) 39 | except: 40 | print("Cannot read yaml config file {}, check formatting." 41 | "".format(file_to_parse)) 42 | sys.exit(1) 43 | 44 | return yaml_data 45 | 46 | def load_yaml_replace_var_remotely(url, fields, multi_resource=False): 47 | try: 48 | with request.urlopen(url) as f: 49 | file_to_replace = f.read().decode('utf-8') 50 | for searchwrd,replwrd in fields.items(): 51 | file_to_replace = file_to_replace.replace(searchwrd, replwrd) 52 | 53 | if multi_resource: 54 | yaml_data = list(yaml.full_load_all(file_to_replace)) 55 | else: 56 | yaml_data = yaml.full_load(file_to_replace) 57 | # print(yaml_data) 58 | except request.URLError as e: 59 | print(e.reason) 60 | sys.exit(1) 61 | 62 | return yaml_data 63 | 64 | 65 | def load_yaml_replace_var_local(yaml_file, fields, multi_resource=False, write_output=False): 66 | 67 | file_to_replace=path.join(path.dirname(__file__), yaml_file) 68 | if not path.exists(file_to_replace): 69 | print("The file {} does not exist" 70 | "".format(file_to_replace)) 71 | sys.exit(1) 72 | 73 | try: 74 | with open(file_to_replace, 'r') as f: 75 | filedata = f.read() 76 | 77 | for searchwrd, replwrd in fields.items(): 78 | filedata = filedata.replace(searchwrd, replwrd) 79 | if multi_resource: 80 | yaml_data = list(yaml.full_load_all(filedata)) 81 | else: 82 | yaml_data = yaml.full_load(filedata) 83 | if write_output: 84 | with open(file_to_replace, "w") as f: 85 | yaml.dump(yaml_data, f, default_flow_style=False, allow_unicode = True, sort_keys=False) 86 | 87 | # print(yaml_data) 88 | except request.URLError as e: 89 | print(e.reason) 90 | sys.exit(1) 91 | 92 | return yaml_data 93 | -------------------------------------------------------------------------------- /spark-on-eks/source/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "dependencies": { 3 | "aws-cdk": "^2.105.0" 4 | } 5 | } 6 | -------------------------------------------------------------------------------- /spark-on-eks/source/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | pytest -------------------------------------------------------------------------------- /spark-on-eks/source/setup.py: -------------------------------------------------------------------------------- 1 | # // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # // SPDX-License-Identifier: MIT-0 3 | import setuptools 4 | 5 | try: 6 | with open("../README.md") as fp: 7 | long_description = fp.read() 8 | except IOError as e: 9 | long_description = '' 10 | 11 | setuptools.setup( 12 | name="sql-based-etl", 13 | version="3.0.0", 14 | 15 | description="A CDK v2 Python app for SQL-based ETL", 16 | long_description=long_description, 17 | long_description_content_type="text/markdown", 18 | 19 | author="meloyang", 20 | 21 | package_dir={"": "./"}, 22 | packages=setuptools.find_packages(where="./"), 23 | 24 | install_requires=[ 25 | "aws-cdk-lib==2.105.0", 26 | "aws-cdk.lambda-layer-kubectl-v27==2.0.0", 27 | "constructs>=10.0.0,<11.0.0", 28 | "pyyaml==6.0.1", 29 | ], 30 | 31 | python_requires=">=3.8", 32 | 33 | classifiers=[ 34 | "Development Status :: 4 - Beta", 35 | 36 | "Intended Audience :: Developers", 37 | 38 | "License :: OSI Approved :: MIT License", 39 | 40 | "Programming Language :: JavaScript", 41 | "Programming Language :: Python :: 3 :: Only", 42 | "Programming Language :: Python :: 3.8", 43 | "Programming Language :: Python :: 3.9", 44 | "Programming Language :: Python :: 3.10", 45 | "Programming Language :: Python :: 3.11", 46 | 47 | "Topic :: Software Development :: Code Generators", 48 | "Topic :: Utilities", 49 | 50 | "Typing :: Typed", 51 | ], 52 | ) 53 | --------------------------------------------------------------------------------