├── emr ├── airflow │ ├── mwaa_stack │ │ ├── mwaa │ │ │ ├── __init__.py │ │ │ └── mwaa_stack.py │ │ ├── requirements.txt │ │ ├── assets │ │ │ ├── requirements.txt │ │ │ └── dags │ │ │ │ ├── example_emr_job.py │ │ │ │ └── example_emr_eks_job.py │ │ ├── .gitignore │ │ ├── app.py │ │ ├── source.bat │ │ ├── cdk.json │ │ ├── setup.py │ │ └── README.md │ └── README.md ├── eks │ ├── java │ │ └── emr-eks-job-runner │ │ │ ├── .gitignore │ │ │ ├── Makefile │ │ │ ├── src │ │ │ └── main │ │ │ │ └── java │ │ │ │ └── aws │ │ │ │ └── example │ │ │ │ └── emrcontainers │ │ │ │ ├── ExampleConstants.java │ │ │ │ └── StartJobRunExample.java │ │ │ ├── run_example.sh │ │ │ ├── README.md │ │ │ └── pom.xml │ ├── videos │ │ ├── pod_templates │ │ │ ├── spot_pod_template.yaml │ │ │ ├── fargate_pod_template.yaml │ │ │ ├── ondemand_pod_template.yaml │ │ │ └── README.md │ │ ├── README.md │ │ ├── custom_images │ │ │ ├── requirements.txt │ │ │ ├── test │ │ │ │ └── gen_plot.py │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ └── generate_aqi_map.py │ │ └── external_metastores │ │ │ ├── hivejdbc.py │ │ │ ├── gluespark.py │ │ │ └── README.md │ ├── windy_city.py │ └── README.md ├── julia │ ├── julia_notebook.png │ ├── ijulia-kernel.sh │ ├── julia-1.6.1.sh │ └── README.md └── studio │ ├── README.md │ └── notebooks │ └── emr-studio-athena.ipynb ├── cdk ├── big-data-stack │ ├── requirements.txt │ ├── .vscode │ │ └── settings.json │ ├── .gitignore │ ├── stacks │ │ ├── vpc.py │ │ ├── utils.py │ │ ├── rds.py │ │ ├── emr.py │ │ └── eks.py │ ├── source.bat │ ├── cdk.json │ ├── app.py │ ├── setup.py │ └── README.md ├── emr-serverless-job-run │ ├── tests │ │ ├── __init__.py │ │ └── unit │ │ │ ├── __init__.py │ │ │ └── test_emr_serverless_job_run_stack.py │ ├── emr_serverless_job_run │ │ ├── __init__.py │ │ └── emr_serverless_job_run_stack.py │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── .gitignore │ ├── source.bat │ ├── app.py │ ├── cdk.json │ └── README.md └── emr-serverless-vpc-to-vpc │ ├── tests │ ├── __init__.py │ └── unit │ │ ├── __init__.py │ │ └── test_emr_serverless_vpc_to_vpc_stack.py │ ├── emr_serverless_vpc_to_vpc │ ├── __init__.py │ └── emr_serverless_vpc_to_vpc_stack.py │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── diagram.png │ ├── .gitignore │ ├── source.bat │ ├── pg_connect.py │ ├── app.py │ ├── cdk.json │ └── README.md ├── .gitignore ├── spark └── local-k8s │ ├── kind-config.yaml │ ├── spark-shell-pod.yaml │ └── README.md ├── README.md ├── reInvent_2018 └── EMR │ ├── Makefile │ ├── Demo_Links.md │ ├── assets │ ├── scripts │ │ ├── spark_converter.py │ │ └── hive_converter.sql │ ├── notebook_code.md │ └── cloudformation │ │ ├── Spark_Cluster_Versions │ │ ├── v0_Initial_Revision.cf.yml │ │ ├── v1_Security_Settings.cf.yml │ │ ├── v2_Updated_Parameters.cf.yml │ │ ├── v3_Cluster_Size.cf.yml │ │ ├── v4_Auto_Terminate.cf.yml │ │ └── v5_SparkUI.cf.yml │ │ ├── Presto_Cluster.cf.yml │ │ └── Spark_Cluster.cf.yml │ └── create_sc_entries.sh ├── LICENSE └── athena └── Athena_Exploration.md /emr/airflow/mwaa_stack/mwaa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/big-data-stack/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 
2 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/emr_serverless_job_run/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.5 2 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/emr_serverless_vpc_to_vpc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.5 2 | -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | target/ 4 | 5 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.43.1 2 | constructs>=10.0.0,<11.0.0 3 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.43.1 2 | constructs>=10.0.0,<11.0.0 3 | -------------------------------------------------------------------------------- /emr/julia/julia_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dacort/demo-code/HEAD/emr/julia/julia_notebook.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python artifacts 2 | venv/ 3 | 4 | # Notebook checkpoints 5 | .ipynb_chekpoints/ 6 | 7 | # Sekrets 8 | .env 9 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/diagram.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dacort/demo-code/HEAD/cdk/emr-serverless-vpc-to-vpc/diagram.png -------------------------------------------------------------------------------- /cdk/big-data-stack/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "black", 3 | "python.pythonPath": ".venv/bin/python" 4 | } -------------------------------------------------------------------------------- /emr/eks/videos/pod_templates/spot_pod_template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | spec: 4 | nodeSelector: 5 | eks.amazonaws.com/capacityType: SPOT 6 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/assets/requirements.txt: -------------------------------------------------------------------------------- 1 | emr-containers @ https://github.com/dacort/emr-eks-airflow2-plugin/archive/main.zip 2 | apache-airflow[amazon]==2.0.2 3 | -------------------------------------------------------------------------------- /emr/eks/videos/pod_templates/fargate_pod_template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | spec: 4 | nodeSelector: 5 | eks.amazonaws.com/capacityType: FARGATE 6 | -------------------------------------------------------------------------------- /emr/eks/videos/pod_templates/ondemand_pod_template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | spec: 4 | nodeSelector: 5 | eks.amazonaws.com/capacityType: ON_DEMAND 6 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .venv 6 | *.egg-info 7 | 8 | # CDK asset staging directory 9 | .cdk.staging 10 | cdk.out 11 | -------------------------------------------------------------------------------- /cdk/big-data-stack/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .env 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .venv 6 | *.egg-info 7 | 8 | # CDK asset staging directory 9 | .cdk.staging 10 | cdk.out 11 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .env 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /emr/julia/ijulia-kernel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Install IJulia Kernel as the emr-notebook user 4 | sudo -u 
emr-notebook JUPYTER=/emr/notebook-env/bin/jupyter /usr/local/bin/julia -e 'using Pkg; Pkg.add(["IJulia"])' -------------------------------------------------------------------------------- /spark/local-k8s/kind-config.yaml: -------------------------------------------------------------------------------- 1 | # three node (two workers) cluster config 2 | kind: Cluster 3 | apiVersion: kind.x-k8s.io/v1alpha4 4 | nodes: 5 | - role: control-plane 6 | - role: worker 7 | - role: worker 8 | -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/Makefile: -------------------------------------------------------------------------------- 1 | all: target/aws-emr-eks-examples-1.0.jar 2 | 3 | target/aws-emr-eks-examples-1.0.jar: src/main/java/aws/example/emrcontainers/*.java 4 | mvn package 5 | 6 | clean: 7 | mvn clean -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | from aws_cdk import core as cdk 5 | 6 | from mwaa.mwaa_stack import MwaaStack 7 | 8 | 9 | app = cdk.App() 10 | MwaaStack(app, "MwaaStack") 11 | 12 | app.synth() 13 | -------------------------------------------------------------------------------- /emr/julia/julia-1.6.1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | curl -OL https://julialang-s3.julialang.org/bin/linux/x64/1.6/julia-1.6.1-linux-x86_64.tar.gz 3 | 4 | sudo mkdir -p /opt; sudo tar xf julia-1.6.1-linux-x86_64.tar.gz --directory /opt 5 | 6 | sudo ln -s /opt/julia-1.6.1/bin/julia /usr/local/bin/julia 7 | -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/src/main/java/aws/example/emrcontainers/ExampleConstants.java: -------------------------------------------------------------------------------- 1 | package aws.example.emrcontainers; 2 | 3 | public class ExampleConstants { 4 | public static final String EMR_RELEASE_LABEL = "emr-6.3.0-latest"; 5 | public static final long SLEEP_AMOUNT_IN_MS = 1000; 6 | } 7 | -------------------------------------------------------------------------------- /spark/local-k8s/spark-shell-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: spark-shell-pod 5 | namespace: default 6 | labels: 7 | app: spark-shell 8 | spec: 9 | serviceAccount: spark 10 | containers: 11 | - name: spark-kubernetes-driver 12 | image: apache/spark:3.5.2 13 | command: ["/bin/bash"] 14 | args: ["-c", "trap : term INT; sleep infinity & wait"] -------------------------------------------------------------------------------- /cdk/big-data-stack/stacks/vpc.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import core as cdk, aws_ec2 as ec2 2 | 3 | 4 | class VPCStack(cdk.Stack): 5 | vpc: ec2.Vpc 6 | 7 | def __init__(self, scope: cdk.Construct, construct_id: str, **kwargs) -> None: 8 | super().__init__(scope, construct_id, **kwargs) 9 | 10 | # We create a simple VPC here 11 | self.vpc = ec2.Vpc(self, "EMRDemos", max_azs=3) # default is all AZs in region -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dacort's demo code 2 | 3 | Bits of code I use 
during demos 4 | 5 | - [AWS reInvent 2018](reInvent_2018/EMR) - Amazon EMR and AWS Service Catalog integration and Amazon EMR Notebook demo 6 | - [EMR on EKS](emr/eks) - Examples of using Amazon EMR on Amazon EKS, including an example notebook using [NOAA Integrated Surface Database](https://registry.opendata.aws/noaa-isd/) 7 | - [EMR Studio](emr/studio) - Example notebook for [EMR Studio demo](https://youtu.be/oVgyL5W9FPU) 8 | -------------------------------------------------------------------------------- /emr/eks/videos/README.md: -------------------------------------------------------------------------------- 1 | # A video series of EMR on EKS howto guides 2 | 3 | - [Why EMR on EKS](https://youtu.be/2UMz72NRZss) - Describes common usage scenarios for EMR on EKS 4 | - [Running Jobs](https://youtu.be/eEEqzFGqG_M) - Shows how to submit jobs to the EMR Containers API 5 | - [External Metastores](https://youtu.be/zBXK5GTVUKU) - Demo of connecting to an RDS MySQL Hive metastore and Glue Data Catalog 6 | - Related code is in [`./external_metastores/`](./external_metastores) -------------------------------------------------------------------------------- /emr/eks/videos/custom_images/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | black==21.5b2 3 | bokeh==2.3.2 4 | click==8.0.1 5 | Jinja2==3.0.1 6 | MarkupSafe==2.0.1 7 | mypy-extensions==0.4.3 8 | numpy==1.20.3 9 | packaging==20.9 10 | pathspec==0.8.1 11 | Pillow==8.3.2 12 | py4j==0.10.9 13 | pyparsing==2.4.7 14 | pyspark==3.1.3 15 | python-dateutil==2.8.1 16 | PyYAML==5.4.1 17 | regex==2021.4.4 18 | Shapely==1.7.1 19 | six==1.16.0 20 | toml==0.10.2 21 | tornado==6.1 22 | typing-extensions==3.10.0.0 23 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/Makefile: -------------------------------------------------------------------------------- 1 | RELEASE_BUCKET?=damons-reinvent-demo 2 | PREFIX?=reinvent/ 3 | AWS_PROFILE?=default 4 | 5 | deploy: 6 | @aws --profile $(AWS_PROFILE) s3 sync assets/ "s3://$(RELEASE_BUCKET)/$(PREFIX)" 7 | @echo "https://$(RELEASE_BUCKET).s3.amazonaws.com/$(PREFIX)cloudformation/Spark_Cluster.cf.yml" 8 | @echo "https://$(RELEASE_BUCKET).s3.amazonaws.com/$(PREFIX)EMR_Spark_Pipeline.cf.yml" 9 | @echo "https://$(RELEASE_BUCKET).s3.amazonaws.com/$(PREFIX)cloudformation/Presto_Cluster.cf.yml" 10 | 11 | -------------------------------------------------------------------------------- /cdk/big-data-stack/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 
8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /cdk/big-data-stack/stacks/utils.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import core as cdk, aws_s3 as s3 2 | 3 | 4 | def get_or_create_bucket( 5 | stack: cdk.Stack, bucket_id: str, context_key: str = None 6 | ) -> s3.Bucket: 7 | if context_key is None or stack.node.try_get_context(context_key) is None: 8 | return s3.Bucket( 9 | stack, 10 | bucket_id, 11 | ) 12 | else: 13 | bucket_name = stack.node.try_get_context(context_key) 14 | return s3.Bucket.from_bucket_name(stack, bucket_id, bucket_name) -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/run_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ -z $* ]] ; then 3 | echo 'Supply the name of one of the example classes as an argument.' 4 | echo 'If there are arguments to the class, put them in quotes after the class name.' 5 | exit 1 6 | fi 7 | export CLASSPATH=target/emr-eks-examples-1.0.jar 8 | export className=$1 9 | echo "## Running $className..." 
10 | shift 11 | echo "## arguments: $@" 12 | mvn exec:java -Dexec.mainClass="aws.example.emrcontainers.$className" -Dexec.args="$@" -Dexec.cleanupDaemonThreads=false -------------------------------------------------------------------------------- /cdk/big-data-stack/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | "@aws-cdk/core:enableStackNameDuplicates": "true", 5 | "aws-cdk:enableDiffNoFail": "true", 6 | "@aws-cdk/core:stackRelativeExports": "true", 7 | "@aws-cdk/aws-ecr-assets:dockerIgnoreSupport": true, 8 | "@aws-cdk/aws-secretsmanager:parseOwnedSecretName": true, 9 | "@aws-cdk/aws-kms:defaultKeyPolicies": true, 10 | "@aws-cdk/aws-s3:grantWriteWithoutAcl": true, 11 | "@aws-cdk/aws-ecs-patterns:removeDefaultDesiredCount": true 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/pg_connect.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | PG_HOSTNAME = "ip-10-0-XXX-XXX.us-west-2.compute.internal" 4 | 5 | spark = SparkSession.builder.getOrCreate() 6 | 7 | df = ( 8 | spark.read.format("jdbc") 9 | .option( 10 | "url", f"jdbc:postgresql://{PG_HOSTNAME}:5432/postgres" 11 | ) 12 | .option("driver", "org.postgresql.Driver") 13 | .option("dbtable", "users") 14 | .option("user", "remote") 15 | .option("password", "remote") 16 | .load() 17 | ) 18 | 19 | df.show() 20 | print(df.count()) 21 | 22 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/tests/unit/test_emr_serverless_job_run_stack.py: -------------------------------------------------------------------------------- 1 | import aws_cdk as core 2 | import aws_cdk.assertions as assertions 3 | 4 | from emr_serverless_job_run.emr_serverless_job_run_stack import EmrServerlessJobRunStack 5 | 6 | # example tests. 
To run these tests, uncomment this file along with the example 7 | # resource in emr_serverless_job_run/emr_serverless_job_run_stack.py 8 | def test_sqs_queue_created(): 9 | app = core.App() 10 | stack = EmrServerlessJobRunStack(app, "emr-serverless-job-run") 11 | template = assertions.Template.from_stack(stack) 12 | 13 | # template.has_resource_properties("AWS::SQS::Queue", { 14 | # "VisibilityTimeout": 300 15 | # }) 16 | -------------------------------------------------------------------------------- /emr/eks/videos/external_metastores/hivejdbc.py: -------------------------------------------------------------------------------- 1 | from os.path import abspath 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import Row 4 | 5 | # warehouse_location points to the default location for managed databases and tables 6 | warehouse_location = abspath("spark-warehouse") 7 | spark = ( 8 | SparkSession.builder.appName("hive-demo") 9 | .config("spark.sql.warehouse.dir", warehouse_location) 10 | .enableHiveSupport() 11 | .getOrCreate() 12 | ) 13 | spark.sql("SHOW DATABASES").show() 14 | spark.sql("SELECT count(*) FROM rapid7_fdns_any").show() 15 | spark.sql("SELECT * FROM rapid7_fdns_any WHERE name LIKE '%.starlink.com' AND date = (SELECT MAX(date) from rapid7_fdns_any)").show() 16 | spark.stop() -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/tests/unit/test_emr_serverless_vpc_to_vpc_stack.py: -------------------------------------------------------------------------------- 1 | import aws_cdk as core 2 | import aws_cdk.assertions as assertions 3 | 4 | from emr_serverless_vpc_to_vpc.emr_serverless_vpc_to_vpc_stack import EmrServerlessVpcToVpcStack 5 | 6 | # example tests. To run these tests, uncomment this file along with the example 7 | # resource in emr_serverless_vpc_to_vpc/emr_serverless_vpc_to_vpc_stack.py 8 | def test_sqs_queue_created(): 9 | app = core.App() 10 | stack = EmrServerlessVpcToVpcStack(app, "emr-serverless-vpc-to-vpc") 11 | template = assertions.Template.from_stack(stack) 12 | 13 | # template.has_resource_properties("AWS::SQS::Queue", { 14 | # "VisibilityTimeout": 300 15 | # }) 16 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 5 | "@aws-cdk/core:enableStackNameDuplicates": "true", 6 | "aws-cdk:enableDiffNoFail": "true", 7 | "@aws-cdk/core:stackRelativeExports": "true", 8 | "@aws-cdk/aws-ecr-assets:dockerIgnoreSupport": true, 9 | "@aws-cdk/aws-secretsmanager:parseOwnedSecretName": true, 10 | "@aws-cdk/aws-kms:defaultKeyPolicies": true, 11 | "@aws-cdk/aws-s3:grantWriteWithoutAcl": true, 12 | "@aws-cdk/aws-ecs-patterns:removeDefaultDesiredCount": true, 13 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 14 | "@aws-cdk/aws-efs:defaultEncryptionAtRest": true, 15 | "@aws-cdk/aws-lambda:recognizeVersionProps": true 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /emr/eks/videos/external_metastores/gluespark.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser, join, abspath 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import Row 4 | 5 | # warehouse_location points to the default location for managed databases and 
tables 6 | warehouse_location = abspath("spark-warehouse") 7 | spark = ( 8 | SparkSession.builder.appName("glue-demo") 9 | .config("spark.sql.warehouse.dir", warehouse_location) 10 | .enableHiveSupport() 11 | .getOrCreate() 12 | ) 13 | 14 | spark.sql("SHOW DATABASES").show() 15 | spark.sql(""" 16 | SELECT id, snippet.title, 17 | MAX(CAST(statistics.viewcount AS integer)) AS max_views, 18 | MAX(CAST(statistics.likecount AS integer)) AS max_likes 19 | FROM damons_datalake.youtube 20 | GROUP BY 1, 2 21 | ORDER BY 3 DESC 22 | """).show(truncate=False) 23 | spark.stop() -------------------------------------------------------------------------------- /reInvent_2018/EMR/Demo_Links.md: -------------------------------------------------------------------------------- 1 | # Demo Links 2 | 3 | Replace `damons-reinvent-demo` with your own S3 bucket. 😃 4 | 5 | ## Presto Cluster 6 | 7 | This is an example template that can be used to create a new Presto Product in the Service Catalog. 8 | 9 | https://damons-reinvent-demo.s3.amazonaws.com/reinvent/cloudformation/Presto_Cluster.cf.yml 10 | 11 | ## Spark Converter 12 | 13 | Copy and paste the below into Job Parameters 14 | 15 | s3://damons-reinvent-demo/reinvent/scripts/spark_converter.py 16 | s3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz 17 | s3://damons-reinvent-demo/reinvent/spark/amazon_reviews/ 18 | 19 | ## Hive Converter 20 | 21 | Copy and paste the below into Job Parameters 22 | 23 | s3://damons-reinvent-demo/reinvent/scripts/hive_converter.sql 24 | -d INPUT=s3://amazon-reviews-pds/tsv/ 25 | -d OUTPUT=s3://damons-reinvent-demo/reinvent/hive/query_output/ -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import aws_cdk as cdk 5 | 6 | from emr_serverless_job_run.emr_serverless_job_run_stack import EmrServerlessJobRunStack 7 | 8 | 9 | app = cdk.App() 10 | EmrServerlessJobRunStack(app, "EmrServerlessJobRunStack", 11 | # If you don't specify 'env', this stack will be environment-agnostic. 12 | # Account/Region-dependent features and context lookups will not work, 13 | # but a single synthesized template can be deployed anywhere. 14 | 15 | # Uncomment the next line to specialize this stack for the AWS Account 16 | # and Region that are implied by the current CLI configuration. 17 | 18 | #env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), 19 | 20 | # Uncomment the next line if you know exactly what Account and Region you 21 | # want to deploy the stack to. */ 22 | 23 | #env=cdk.Environment(account='123456789012', region='us-east-1'), 24 | 25 | # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html 26 | ) 27 | 28 | app.synth() 29 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import aws_cdk as cdk 5 | 6 | from emr_serverless_vpc_to_vpc.emr_serverless_vpc_to_vpc_stack import EmrServerlessVpcToVpcStack 7 | 8 | 9 | app = cdk.App() 10 | EmrServerlessVpcToVpcStack(app, "EmrServerlessVpcToVpcStack", 11 | # If you don't specify 'env', this stack will be environment-agnostic. 
12 | # Account/Region-dependent features and context lookups will not work, 13 | # but a single synthesized template can be deployed anywhere. 14 | 15 | # Uncomment the next line to specialize this stack for the AWS Account 16 | # and Region that are implied by the current CLI configuration. 17 | 18 | env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), 19 | 20 | # Uncomment the next line if you know exactly what Account and Region you 21 | # want to deploy the stack to. */ 22 | 23 | # env=cdk.Environment(account='568026268536', region='us-west-2'), 24 | 25 | # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html 26 | ) 27 | 28 | app.synth() 29 | -------------------------------------------------------------------------------- /cdk/big-data-stack/stacks/rds.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import core as cdk, aws_ec2 as ec2, aws_rds as rds 2 | 3 | 4 | class RDSStack(cdk.Stack): 5 | instance: rds.DatabaseInstance 6 | 7 | def __init__(self, scope: cdk.Construct, construct_id: str, vpc: ec2.IVpc, **kwargs) -> None: 8 | super().__init__(scope, construct_id, **kwargs) 9 | 10 | self.instance = rds.DatabaseInstance( 11 | self, 12 | construct_id, 13 | engine=rds.DatabaseInstanceEngine.mysql( 14 | version=rds.MysqlEngineVersion.VER_8_0 15 | ), 16 | vpc=vpc, 17 | database_name="metastore", 18 | removal_policy=cdk.RemovalPolicy.DESTROY, 19 | deletion_protection=False 20 | ) 21 | 22 | self.instance.connections.allow_from_any_ipv4(ec2.Port.tcp(3306), "Allow mysql from anywhere") 23 | 24 | # May be able to do this in EMR stack 25 | # .connections.security_groups[0].add_ingress_rule( 26 | # peer = ec2.Peer.ipv4(vpc.vpc_cidr_block), 27 | # connection = ec2.Port.tcp(80), 28 | # description="Allow http inbound from VPC" 29 | # ) 30 | -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/README.md: -------------------------------------------------------------------------------- 1 | # Amazon EMR on EKS examples 2 | 3 | ## Purpose 4 | 5 | Shows how to use the AWS SDK for Java with Amazon EMR on EKS. 6 | 7 | _Amazon EMR on EKS allows users to easily submit Spark jobs on Kubernetes._ 8 | 9 | ## Running the code 10 | 11 | ### Prerequisites 12 | 13 | - You must have an AWS account, and have your default credentials and AWS Region 14 | configured as described in the [AWS Tools and SDKs Shared Configuration and 15 | Credentials Reference Guide](https://docs.aws.amazon.com/credref/latest/refdocs/creds-config-files.html). 16 | 17 | ### Building 18 | 19 | - Run `mvn package` and then `./run_example.sh` with the class name of the code example. 
20 | 21 | ## Code examples 22 | 23 | - [Submit an EMR on EKS job](./src/main/java/aws/example/emrcontainers/StartJobRunExample.java) 24 | 25 | `./run_example.sh StartJobRunExample " "` 26 | 27 | ## Additional Information 28 | - [Amazon EMR on EKS documentatin](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/emr-eks.html) 29 | - [Amazon EMR on EKS Best Practices Guide](https://aws.github.io/aws-emr-containers-best-practices/) -------------------------------------------------------------------------------- /emr/eks/videos/custom_images/test/gen_plot.py: -------------------------------------------------------------------------------- 1 | from bokeh.plotting import figure 2 | from bokeh.io import output_file, show, export_png 3 | from bokeh.io.webdriver import create_chromium_webdriver 4 | 5 | import hashlib 6 | 7 | def sha256sum(filename): 8 | h = hashlib.sha256() 9 | b = bytearray(128*1024) 10 | mv = memoryview(b) 11 | with open(filename, 'rb', buffering=0) as f: 12 | for n in iter(lambda : f.readinto(mv), 0): 13 | h.update(mv[:n]) 14 | return h.hexdigest() 15 | 16 | def generate_plot(filename): 17 | p = figure(plot_width=400, plot_height=400) 18 | 19 | p.circle([1, 2, 3, 4, 5], [6, 7, 2, 4, 5], size=15, line_color="navy", 20 | fill_color="orange", fill_alpha=0.5) 21 | 22 | # --no-sandbox is required per https://stackoverflow.com/q/50642308 23 | # Maybe look into https://github.com/Zenika/alpine-chrome at some point 24 | driver = create_chromium_webdriver(['--no-sandbox']) 25 | export_png(p, filename=filename, webdriver=driver) 26 | # get_screenshot_as_png 27 | 28 | generate_plot("plot.png") 29 | hash = sha256sum("plot.png") 30 | assert hash == "ed2ffa2348560a7254753fe0ff70e811e3a0d1879c1609aeeb32efeb2feecc35" 31 | 32 | print("All good! 
🙌") -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | with open("README.md") as fp: 5 | long_description = fp.read() 6 | 7 | 8 | setuptools.setup( 9 | name="mwaa_stack", 10 | version="0.0.1", 11 | 12 | description="An empty CDK Python app", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | 16 | author="author", 17 | 18 | package_dir={"": "mwaa"}, 19 | packages=setuptools.find_packages(where="mwaa"), 20 | 21 | install_requires=[ 22 | "aws-cdk.core==1.110.0", 23 | "aws-cdk.aws_mwaa", 24 | "aws-cdk.aws_s3_deployment" 25 | ], 26 | 27 | python_requires=">=3.6", 28 | 29 | classifiers=[ 30 | "Development Status :: 4 - Beta", 31 | 32 | "Intended Audience :: Developers", 33 | 34 | "Programming Language :: JavaScript", 35 | "Programming Language :: Python :: 3 :: Only", 36 | "Programming Language :: Python :: 3.6", 37 | "Programming Language :: Python :: 3.7", 38 | "Programming Language :: Python :: 3.8", 39 | 40 | "Topic :: Software Development :: Code Generators", 41 | "Topic :: Utilities", 42 | 43 | "Typing :: Typed", 44 | ], 45 | ) 46 |
-------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" 3 | xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 4 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 5 | <modelVersion>4.0.0</modelVersion> 6 | 7 | <groupId>aws.example.emrcontainers</groupId> 8 | <artifactId>emr-eks-examples</artifactId> 9 | <packaging>jar</packaging> 10 | <version>1.0</version> 11 | <name>Amazon EMR on EKS Examples</name> 12 | 13 | <dependencyManagement> 14 | <dependencies> 15 | <dependency> 16 | <groupId>software.amazon.awssdk</groupId> 17 | <artifactId>bom</artifactId> 18 | <version>2.17.29</version> 19 | <type>pom</type> 20 | <scope>import</scope> 21 | </dependency> 22 | </dependencies> 23 | </dependencyManagement> 24 | 25 | <dependencies> 26 | <dependency> 27 | <groupId>software.amazon.awssdk</groupId> 28 | <artifactId>emrcontainers</artifactId> 29 | <version>2.17.29</version> 30 | </dependency> 31 | </dependencies> 32 | 33 | <properties> 34 | <maven.compiler.source>11</maven.compiler.source> 35 | <maven.compiler.target>11</maven.compiler.target> 36 | </properties> 37 | 38 | </project> 39 | 40 |
-------------------------------------------------------------------------------- /cdk/big-data-stack/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | from stacks.emr_studio import EMRStudio 4 | from stacks.emr_containers import EMRContainersStack 5 | 6 | from aws_cdk import core as cdk 7 | 8 | from stacks.vpc import VPCStack 9 | from stacks.rds import RDSStack 10 | from stacks.emr import EMRStack 11 | from stacks.eks import EKSStack 12 | from stacks.emr_containers import EMRContainersStack 13 | 14 | 15 | app = cdk.App() 16 | 17 | vpc = VPCStack(app, "VPCStack") 18 | 19 | # These two stacks are disabled by default 20 | # I use them when I want to demo EMR with a MySQL-backed Hive metastore 21 | # rds = RDSStack(app, "RDSStack", vpc.vpc) 22 | # emr = EMRStack( 23 | # app, 24 | # "EMRStack", 25 | # vpc.vpc, 26 | # name="EMR with Hive Metastore", 27 | # release_label="emr-5.32.0", 28 | # rds_secret=rds.instance.secret, 29 | # rds_connections=rds.instance.connections, 30 | # ) 31 | 32 | # The EKS stack requires bootstrapping 33 | # Run "cdk bootstrap aws://account/region" 34 | # You can also optionally specify an IAM role name to be mapped to a cluster admin 35 | # `-c eks_admin_role_name=AdminRole` 36 | eks = EKSStack(app, "EKSStack", vpc.vpc) 37 | 38 | # Now add a virtual EMR cluster!
39 | emr_containers = EMRContainersStack(app, "EMRContainers", vpc.vpc, eks.cluster) 40 | 41 | 42 | # We want to add EMR Studio to the mix as well :) 43 | emr_studio = EMRStudio(app, "EMRStudio", vpc.vpc, "big-data-studio") 44 | 45 | 46 | app.synth() 47 | -------------------------------------------------------------------------------- /cdk/big-data-stack/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | with open("README.md") as fp: 5 | long_description = fp.read() 6 | 7 | 8 | setuptools.setup( 9 | name="cdk_emr_metastores", 10 | version="0.0.1", 11 | 12 | description="An empty CDK Python app", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | 16 | author="author", 17 | 18 | package_dir={"": "stacks"}, 19 | packages=setuptools.find_packages(where="stacks"), 20 | 21 | install_requires=[ 22 | "aws-cdk.core==1.95.0", 23 | "aws_cdk.aws_ec2", 24 | "aws-cdk.aws-rds", 25 | "aws-cdk.aws-secretsmanager", 26 | "aws_cdk.aws-emr", 27 | "aws_cdk.aws-eks", 28 | "aws_cdk.aws-emrcontainers", 29 | "aws_cdk.aws-servicecatalog", 30 | ], 31 | 32 | python_requires=">=3.6", 33 | 34 | classifiers=[ 35 | "Development Status :: 4 - Beta", 36 | 37 | "Intended Audience :: Developers", 38 | 39 | "License :: OSI Approved :: Apache Software License", 40 | 41 | "Programming Language :: JavaScript", 42 | "Programming Language :: Python :: 3 :: Only", 43 | "Programming Language :: Python :: 3.6", 44 | "Programming Language :: Python :: 3.7", 45 | "Programming Language :: Python :: 3.8", 46 | 47 | "Topic :: Software Development :: Code Generators", 48 | "Topic :: Utilities", 49 | 50 | "Typing :: Typed", 51 | ], 52 | ) 53 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/scripts/spark_converter.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | if len(sys.argv) > 1: 6 | INPUT_LOCATION = sys.argv[1] 7 | OUTPUT_LOCATION = sys.argv[2] 8 | else: 9 | INPUT_LOCATION = 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz' 10 | OUTPUT_LOCATION = 's3://damons-reinvent-demo/reinvent/spark/amazon_reviews/' 11 | 12 | # Utility to just take an input file and split it 13 | # df = spark.read.option("sep", "\t").option("header","true").csv(INPUT_LOCATION) 14 | # df.repartition(10).write.csv("s3://damons-reinvent-demo/reinvent/source_toys/") 15 | 16 | if __name__ == "__main__": 17 | if len(sys.argv) != 3: 18 | print("Usage: spark_converter ") 19 | sys.exit(-1) 20 | 21 | # Initialize the spark context. 
22 | spark = SparkSession\ 23 | .builder\ 24 | .appName("SparkConverter")\ 25 | .config("spark.sql.parquet.fs.optimized.committer.optimization-enabled", "true")\ 26 | .getOrCreate() 27 | 28 | # Read in the desired TSV 29 | df = spark.read.option('sep', '\t').option('header', 'true').csv(INPUT_LOCATION) 30 | 31 | # Repartition for multiple output files and write out to parquet 32 | df.repartition(10).write.mode('overwrite').parquet(OUTPUT_LOCATION) 33 | 34 | # To run: s3://damons-reinvent-demo/reinvent/scripts/spark_converter.py s3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz s3://damons-reinvent-demo/reinvent/amazon_reviews/ -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": 
true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /emr/eks/videos/custom_images/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 711395599931.dkr.ecr.us-east-2.amazonaws.com/notebook-spark/emr-6.3.0:latest 2 | 3 | USER root 4 | 5 | # Install Chrome 6 | # This generates an image that is 2.89gb 7 | RUN curl https://intoli.com/install-google-chrome.sh | bash && \ 8 | mv /usr/bin/google-chrome-stable /usr/bin/chrome 9 | 10 | # This generates an image that is 3.13gb 11 | # RUN amazon-linux-extras install epel -y && \ 12 | # yum install -y chromium 13 | 14 | # We need to upgrade pip in order to install pyproj 15 | RUN pip3 install --upgrade pip 16 | 17 | # If you pip install as root, use this 18 | RUN pip3 install \ 19 | bokeh==2.3.2 \ 20 | boto3==1.17.93 \ 21 | chromedriver-py==91.0.4472.19.0 \ 22 | geopandas==0.9.0 \ 23 | selenium==3.141.0 \ 24 | shapely==1.7.1 25 | 26 | RUN ln -s /usr/local/lib/python3.7/site-packages/chromedriver_py/chromedriver_linux64 /usr/local/bin/chromedriver 27 | 28 | # Install bokeh sample data to a tmpdir 29 | RUN mkdir /root/.bokeh && \ 30 | echo "sampledata_dir: /usr/local/share/bokeh" > /root/.bokeh/config 31 | 32 | RUN bokeh sampledata 33 | 34 | # Also install census data into the image :) 35 | ADD https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_state_500k.zip /usr/local/share/bokeh/ 36 | ADD https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_county_500k.zip /usr/local/share/bokeh/ 37 | RUN chmod 644 /usr/local/share/bokeh/cb*.zip 38 | 39 | # This is a simple test to make sure generating the image works properly 40 | COPY test /test/ 41 | 42 | USER hadoop:hadoop 43 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/scripts/hive_converter.sql: -------------------------------------------------------------------------------- 1 | -- Summary: This sample shows you how to convert Amazon review stored in S3 using Hive 2 | 3 | -- Create table using sample data in S3. Note: you can replace this S3 path with your own. 
4 | CREATE EXTERNAL TABLE IF NOT EXISTS `amazon_reviews_tsv`( 5 | `marketplace` string, 6 | `customer_id` string, 7 | `review_id` string, 8 | `product_id` string, 9 | `product_parent` string, 10 | `product_title` string, 11 | `product_category` string, 12 | `star_rating` int, 13 | `helpful_votes` int, 14 | `total_votes` int, 15 | `vine` string, 16 | `verified_purchase` string, 17 | `review_headline` string, 18 | `review_body` string, 19 | `review_date` string 20 | ) 21 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 22 | LOCATION 's3://amazon-reviews-pds/tsv/'; 23 | 24 | -- ${INPUT} 25 | 26 | -- Total requests per operating system for a given time frame 27 | SET hive.groupby.position.alias=true; 28 | 29 | INSERT OVERWRITE DIRECTORY '${OUTPUT}/top_toys/' 30 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 31 | SELECT product_id, product_title, count(*) AS num_reviews, avg(star_rating) AS avg_stars 32 | FROM amazon_reviews_tsv where product_category='Toys' 33 | GROUP BY 1, 2 34 | ORDER BY num_reviews DESC 35 | limit 100; 36 | 37 | -- s3://damons-reinvent-demo/reinvent/scripts/hive_converter.sql -d INPUT=s3://amazon-reviews-pds/tsv/ -d OUTPUT=s3://damons-reinvent-demo/reinvent/hive/query_output/ 38 | -- hive-script --run-hive-script --args -f s3://damons-reinvent-demo/reinvent/scripts/hive_converter.sql -d INPUT=s3://amazon-reviews-pds/tsv/ -d OUTPUT=s3://damons-reinvent-demo/reinvent/hive/query_output/ -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Welcome to your CDK Python project! 3 | 4 | This is a blank project for Python development with CDK. 5 | 6 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 7 | 8 | This project is set up like a standard Python project. The initialization 9 | process also creates a virtualenv within this project, stored under the `.venv` 10 | directory. To create the virtualenv it assumes that there is a `python3` 11 | (or `python` for Windows) executable in your path with access to the `venv` 12 | package. If for any reason the automatic creation of the virtualenv fails, 13 | you can create the virtualenv manually. 14 | 15 | To manually create a virtualenv on MacOS and Linux: 16 | 17 | ``` 18 | $ python3 -m venv .venv 19 | ``` 20 | 21 | After the init process completes and the virtualenv is created, you can use the following 22 | step to activate your virtualenv. 23 | 24 | ``` 25 | $ source .venv/bin/activate 26 | ``` 27 | 28 | If you are a Windows platform, you would activate the virtualenv like this: 29 | 30 | ``` 31 | % .venv\Scripts\activate.bat 32 | ``` 33 | 34 | Once the virtualenv is activated, you can install the required dependencies. 35 | 36 | ``` 37 | $ pip install -r requirements.txt 38 | ``` 39 | 40 | At this point you can now synthesize the CloudFormation template for this code. 41 | 42 | ``` 43 | $ cdk synth 44 | ``` 45 | 46 | To add additional dependencies, for example other CDK libraries, just add 47 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 48 | command. 49 | 50 | ## Useful commands 51 | 52 | * `cdk ls` list all stacks in the app 53 | * `cdk synth` emits the synthesized CloudFormation template 54 | * `cdk deploy` deploy this stack to your default AWS account/region 55 | * `cdk diff` compare deployed stack with current state 56 | * `cdk docs` open CDK documentation 57 | 58 | Enjoy! 
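For this particular app, the environment definition lives in `mwaa/mwaa_stack.py` (not reproduced in this README), and the DAGs and `requirements.txt` under `assets/` are shipped to S3 for the Amazon MWAA environment. As a rough sketch of the moving parts, using the CDK v1 `aws_mwaa` and `aws_s3_deployment` modules pinned in `setup.py`, the general shape might look like the following; the resource names, environment class, role ARN, and network IDs below are illustrative placeholders rather than the values used by `MwaaStack`:

```python
from aws_cdk import core as cdk
from aws_cdk import aws_mwaa as mwaa
from aws_cdk import aws_s3 as s3
from aws_cdk import aws_s3_deployment as s3deploy


class MwaaEnvironmentSketch(cdk.Stack):
    """Illustrative sketch only; see mwaa/mwaa_stack.py for the real stack."""

    def __init__(self, scope: cdk.Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        # Versioned bucket that receives the DAGs and requirements.txt from ./assets
        bucket = s3.Bucket(self, "MwaaAssets", versioned=True)
        s3deploy.BucketDeployment(
            self,
            "DeployAssets",
            sources=[s3deploy.Source.asset("assets")],
            destination_bucket=bucket,
        )

        # The MWAA environment itself. The execution role ARN, security group,
        # and subnet IDs below are placeholders only.
        mwaa.CfnEnvironment(
            self,
            "Environment",
            name="mwaa-demo",
            airflow_version="2.0.2",
            environment_class="mw1.small",
            source_bucket_arn=bucket.bucket_arn,
            dag_s3_path="dags",
            requirements_s3_path="requirements.txt",
            execution_role_arn="arn:aws:iam::123456789012:role/mwaa-execution-role",
            network_configuration=mwaa.CfnEnvironment.NetworkConfigurationProperty(
                security_group_ids=["sg-00000000"],
                subnet_ids=["subnet-00000000", "subnet-11111111"],
            ),
        )
```

A real environment additionally needs a valid execution role and VPC networking (private subnets plus a security group), which the placeholders above only stand in for.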
59 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Welcome to your CDK Python project! 3 | 4 | This is a blank project for CDK development with Python. 5 | 6 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 7 | 8 | This project is set up like a standard Python project. The initialization 9 | process also creates a virtualenv within this project, stored under the `.venv` 10 | directory. To create the virtualenv it assumes that there is a `python3` 11 | (or `python` for Windows) executable in your path with access to the `venv` 12 | package. If for any reason the automatic creation of the virtualenv fails, 13 | you can create the virtualenv manually. 14 | 15 | To manually create a virtualenv on MacOS and Linux: 16 | 17 | ``` 18 | $ python3 -m venv .venv 19 | ``` 20 | 21 | After the init process completes and the virtualenv is created, you can use the following 22 | step to activate your virtualenv. 23 | 24 | ``` 25 | $ source .venv/bin/activate 26 | ``` 27 | 28 | If you are a Windows platform, you would activate the virtualenv like this: 29 | 30 | ``` 31 | % .venv\Scripts\activate.bat 32 | ``` 33 | 34 | Once the virtualenv is activated, you can install the required dependencies. 35 | 36 | ``` 37 | $ pip install -r requirements.txt 38 | ``` 39 | 40 | At this point you can now synthesize the CloudFormation template for this code. 41 | 42 | ``` 43 | $ cdk synth 44 | ``` 45 | 46 | To add additional dependencies, for example other CDK libraries, just add 47 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 48 | command. 49 | 50 | ## Useful commands 51 | 52 | * `cdk ls` list all stacks in the app 53 | * `cdk synth` emits the synthesized CloudFormation template 54 | * `cdk deploy` deploy this stack to your default AWS account/region 55 | * `cdk diff` compare deployed stack with current state 56 | * `cdk docs` open CDK documentation 57 | 58 | Enjoy! 59 | -------------------------------------------------------------------------------- /emr/studio/README.md: -------------------------------------------------------------------------------- 1 | # EMR Studio Demo Code 2 | 3 | This is the associated code for the [Intro to Amazon EMR Studio](https://youtu.be/oVgyL5W9FPU) video. 4 | 5 | - [WeatherDay.ipynb](WeatherDay.ipynb) - Notebook that uses [@zflamig](https://github.com/zflamig)'s original [birthday-weather](https://github.com/zflamig/birthday-weather) example that uses [ERA5 Zaar data](https://registry.opendata.aws/ecmwf-era5/) to draw a map of US weather for a given day. 6 | 7 | ## CloudFormation Templates 8 | 9 | There are two templates in this repository for use with EMR Studio. Please note that you can find more examples in the [EMR Studio Samples](https://github.com/aws-samples/emr-studio-samples) repository. 10 | 11 | 1. [`full_studio_dependencies`](./cloudformation/full_studio_dependencies.cfn.yaml) - Creates everything you need in order to use EMR Studio including a new VPC with security groups and subnets tagged appropriately for use with [EMR Managed Policies](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-managed-iam-policies.html). 12 | 2. [`matplotlib_studio`](./cloudformation/matplotlib_studio.cfn.yaml) - Incorporates the above template and also creates a new Studio associated with the AWS SSO username you provide. 
Also includes a Service Catalog cluster template that installs `basemap` for usage with matplotlib and the `WeatherDay` notebook above. 13 | 14 | ## Scheduling Notebooks 15 | 16 | In order to schedule, you need three pieces of information: 17 | - Editor ID 18 | - Cluster ID 19 | - Service role name 20 | 21 | ```shell 22 | export EDITOR_ID=e-AAABBB 23 | export CLUSTER_ID=j-CCCDDD 24 | ``` 25 | 26 | 27 | ```shell 28 | aws emr start-notebook-execution \ 29 | --editor-id ${EDITOR_ID} \ 30 | --notebook-params '{"weather_date": "2019-09-01"}' \ 31 | --relative-path demo-code/emr/studio/WeatherDay.ipynb \ 32 | --notebook-execution-name Summer \ 33 | --execution-engine '{"Id" : "'${CLUSTER_ID}'"}' \ 34 | --service-role EMR_Notebooks_DefaultRole 35 | ``` 36 | 37 | ```shell 38 | aws emr describe-notebook-execution --notebook-execution-id ex-FFFFGGGG 39 | ``` 40 | 41 | ```shell 42 | aws s3 cp s3:///e-AAABBB/executions/ex-FFFFGGGG/WeatherDay.ipynb . 43 | ``` -------------------------------------------------------------------------------- /emr/eks/windy_city.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql import functions as F 5 | from pyspark.sql.types import DoubleType 6 | 7 | NOAA_ISD = "s3://noaa-global-hourly-pds/2021/" 8 | 9 | def topDays(spark, longLeft, latBottom, longRight, latTop): 10 | # Load data for 2021 11 | df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(NOAA_ISD) 12 | 13 | # Convert lat/long columns to doubles 14 | df = df \ 15 | .withColumn('LATITUDE', df.LATITUDE.cast(DoubleType())) \ 16 | .withColumn('LONGITUDE', df.LONGITUDE.cast(DoubleType())) 17 | 18 | # Exclude missing values and filter on our bounding box 19 | seadf = df \ 20 | .filter(F.split(df.WND, ",")[3] != '9999') \ 21 | .filter(df.LATITUDE >= latBottom) \ 22 | .filter(df.LATITUDE <= latTop) \ 23 | .filter(df.LONGITUDE >= longLeft) \ 24 | .filter(df.LONGITUDE <= longRight) 25 | 26 | # Pull out day and windspeed 27 | wind_date_df = seadf \ 28 | .select("DATE", "NAME", "WND") \ 29 | .withColumn("windSpeed", F.split(seadf.WND, ",")[3].cast(DoubleType())/10 ) \ 30 | .withColumn("ymd", F.split(df.DATE, "T")[0]) 31 | 32 | # Find top speed for reach day! 33 | wind_date_df.groupBy("ymd").agg({'windSpeed':'max'}).orderBy("max(windSpeed)", ascending=False).show(50) 34 | 35 | if __name__ == "__main__": 36 | """ 37 | Usage: windy_city [bbox] 38 | e.g. 
-122.46,47.48,-122.22,47.73 for Seattle 39 | """ 40 | spark = SparkSession\ 41 | .builder\ 42 | .appName("WindyCity")\ 43 | .getOrCreate() 44 | 45 | # Use http://tools.geofabrik.de/calc/#type=geofabrik_standard&bbox=-122.459696,47.481002,-122.224433,47.734136&tab=1&proj=EPSG:4326&places=2 to 46 | # test out or find bounding boxes 47 | bbox = [float(val) for val in sys.argv[1].split(',')] if len(sys.argv) > 1 else [-122.459696,47.481002,-122.224433,47.734136] 48 | 49 | topDays(spark, *bbox) 50 | 51 | spark.stop() 52 | 53 | # -122.459696,47.481002,-122.224433,47.734136 54 | # -122.48,47.41,-122.16,47.49 55 | # left_long, bottom_lat, right_long, top_lat 56 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/notebook_code.md: -------------------------------------------------------------------------------- 1 | ```python 2 | print("Hello, world!") 3 | ``` 4 | 5 | ```python 6 | from pyspark.ml.recommendation import ALS 7 | from pyspark.ml.evaluation import RegressionEvaluator 8 | 9 | from pyspark.sql.functions import * 10 | import sys 11 | ``` 12 | 13 | 14 | ```python 15 | toys = spark.read.parquet("s3://amazon-reviews-pds/parquet/product_category=Toys/") 16 | toys.printSchema() 17 | ``` 18 | 19 | ```python 20 | toys.count() 21 | ``` 22 | 23 | ```python 24 | ratings = ( 25 | toys.select("customer_id", "product_id", "star_rating", "product_title") 26 | .withColumn("customer_id_int", abs(hash(col("customer_id")) % sys.maxint)) 27 | .withColumn("product_id_int", abs(hash(col("product_id")) % sys.maxint)) 28 | ).repartition(200) 29 | ``` 30 | 31 | ```python 32 | top_toys = ratings\ 33 | .groupby("product_id_int", "product_title")\ 34 | .agg( 35 | avg(col("star_rating")).alias("avg_rating"), 36 | count("star_rating").alias("count") 37 | )\ 38 | .sort(desc("count"))\ 39 | .limit(25)\ 40 | .withColumn("avg_rating", round(col('avg_rating'), 3))\ 41 | .withColumn("product_title", col("product_title").substr(1, 45)) 42 | top_toys.show(truncate=False) 43 | ``` 44 | 45 | ```python 46 | kids_ratings = ( 47 | toys 48 | .where("lower(review_body) LIKE '%baby%' OR lower(review_body) LIKE '%infant%'") 49 | .select("customer_id", "product_id", "star_rating", "product_title") 50 | .withColumn("customer_id_int", abs(hash(col("customer_id")) % sys.maxint)) 51 | .withColumn("product_id_int", abs(hash(col("product_id")) % sys.maxint)) 52 | ).repartition(200) 53 | ``` 54 | 55 | ```python 56 | top_toys = kids_ratings\ 57 | .groupby("product_id_int", "product_title")\ 58 | .agg( 59 | avg(col("star_rating")).alias("avg_rating"), 60 | count("star_rating").alias("count") 61 | )\ 62 | .sort(desc("count"))\ 63 | .limit(25)\ 64 | .withColumn("avg_rating", round(col('avg_rating'), 3))\ 65 | .withColumn("product_title", col("product_title").substr(1, 45)) 66 | top_toys.show(truncate=False) 67 | ``` 68 | 69 | ```python 70 | (training, test) = ratings.randomSplit([0.8, 0.2]) 71 | 72 | # Build the recommendation model using ALS on the training data 73 | als = ALS(maxIter=5, regParam=0.01, userCol="customer_id_int", itemCol="product_id_int", ratingCol="star_rating", coldStartStrategy="drop") 74 | model = als.fit(training) 75 | ``` -------------------------------------------------------------------------------- /emr/julia/README.md: -------------------------------------------------------------------------------- 1 | # Julia on EMR 2 | 3 | ## Installing Julia 4 | 5 | Julia can be installed with a bootstrap action when creating your EMR cluster. 
6 | 7 | ### Upload Julia installation scripts to S3 8 | 9 | ```shell 10 | S3_BUCKET= 11 | 12 | aws s3 cp julia-1.6.1.sh s3:///boostrap-actions/julia-1.6.1.sh 13 | aws s3 cp ijulia-kernel.sh s3:///artifacts/steps/ijulia-kernel.sh 14 | ``` 15 | 16 | ### Start up an EMR cluster 17 | 18 | ```shell 19 | ACCOUNT_ID= 20 | REGION= 21 | SUBNET_ID= 22 | KEYPAIR= 23 | INSTALL_SCRIPT="s3:///boostrap-actions/julia-1.6.1.sh" 24 | IJULIA_SCRIPT="s3:///artifacts/steps/ijulia-kernel.sh" 25 | 26 | aws emr create-cluster \ 27 | --applications Name=Spark Name=Livy Name=JupyterEnterpriseGateway Name=Hive \ 28 | --bootstrap-actions '[{"Path":"'${INSTALL_SCRIPT}'","Name":"JuliaInstall"}]' \ 29 | --steps '[{"Type":"CUSTOM_JAR","Name":"IJuliaKernelInstall","ActionOnFailure":"TERMINATE_CLUSTER","Jar":"s3://'${REGION}'.elasticmapreduce/libs/script-runner/script-runner.jar","Args":["'${IJULIA_SCRIPT}'"]}]' \ 30 | --ebs-root-volume-size 10 \ 31 | --instance-type c5.2xlarge \ 32 | --instance-count 1 \ 33 | --ec2-attributes SubnetId=${SUBNET_ID},KeyName=${KEYPAIR} \ 34 | --use-default-roles \ 35 | --release-label emr-6.3.0 \ 36 | --log-uri s3n://aws-logs-${ACCOUNT_ID}-${REGION}/elasticmapreduce/ \ 37 | --name 'DS_julia' \ 38 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 39 | --region ${REGION} 40 | ``` 41 | 42 | ## Create a new Julia notebook 43 | 44 | Now create a new Notebook in EMR connected to the cluster you just made. 45 | 46 | When you open the Notebook in JupyterLab, you should see a Julia 1.6.1 icon! 47 | 48 | ![Julia Notebook](julia_notebook.png) 49 | 50 | See the `julia-elly.ipynb` notebook for a full example of how to use Elly and Distributed. 51 | 52 | The code from that notebook is below. 53 | 54 | ```julia 55 | import Pkg; Pkg.add("Elly") 56 | 57 | yarnhost = readchomp(`hostname -i`) 58 | 59 | using Elly 60 | 61 | ENV["HADOOP_USER_NAME"] = "hadoop" 62 | 63 | yarncm = YarnManager( 64 | yarnhost=yarnhost, 65 | rmport=8032, 66 | schedport=8030, 67 | launch_timeout=60, 68 | unmanaged=true # pass true when running in unmanaged mode 69 | ); 70 | 71 | using Distributed 72 | 73 | env = Dict( 74 | "JULIA_LOAD_PATH"=>join([Base.LOAD_PATH..., "/usr/local/julia/packages"], ':'), 75 | "JULIA_DEPOT_PATH"=>join([Base.DEPOT_PATH..., "/usr/local/julia"], ':') 76 | ); 77 | addprocs(yarncm; np=8, env=env); 78 | 79 | @everywhere using Distributed 80 | @everywhere println(readchomp(`hostname -i`)) 81 | ``` -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/emr_serverless_job_run/emr_serverless_job_run_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Stack 2 | from aws_cdk import aws_emrserverless as emrs 3 | from aws_cdk import aws_iam as iam # Duration, 4 | from aws_cdk import custom_resources as custom 5 | from constructs import Construct 6 | 7 | 8 | class EmrServerlessJobRunStack(Stack): 9 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 10 | super().__init__(scope, construct_id, **kwargs) 11 | 12 | # Create a serverless Spark app 13 | serverless_app = emrs.CfnApplication( 14 | self, 15 | "spark_app", 16 | release_label="emr-6.9.0", 17 | type="SPARK", 18 | name="cdk-spark", 19 | ) 20 | 21 | # We need an execution role to run the job, this one has no access to anything 22 | # But will be granted PassRole access by the Lambda that's starting the job. 
23 | role = iam.Role( 24 | scope=self, 25 | id="spark_job_execution_role", 26 | assumed_by=iam.ServicePrincipal("emr-serverless.amazonaws.com"), 27 | ) 28 | 29 | # Create a custom resource that starts a job run 30 | myjobrun = custom.AwsCustomResource( 31 | self, 32 | "serverless-job-run", 33 | on_create={ 34 | "service": "EMRServerless", 35 | "action": "startJobRun", 36 | "parameters": { 37 | "applicationId": serverless_app.attr_application_id, 38 | "executionRoleArn": role.role_arn, 39 | "name": "cdkJob", 40 | "jobDriver": {"sparkSubmit": {"entryPoint": "local:///usr/lib/spark/examples/src/main/python/pi.py"}}, 41 | }, 42 | "physical_resource_id": custom.PhysicalResourceId.from_response( 43 | "jobRunId" 44 | ), 45 | }, 46 | policy=custom.AwsCustomResourcePolicy.from_sdk_calls( 47 | resources=custom.AwsCustomResourcePolicy.ANY_RESOURCE 48 | ), 49 | ) 50 | 51 | # Ensure the Lambda can call startJobRun with the earlier-created role 52 | myjobrun.grant_principal.add_to_policy( 53 | iam.PolicyStatement( 54 | effect=iam.Effect.ALLOW, 55 | resources=[role.role_arn], 56 | actions=["iam:PassRole"], 57 | conditions={ 58 | "StringLike": { 59 | "iam:PassedToService": "emr-serverless.amazonaws.com" 60 | } 61 | }, 62 | ) 63 | ) 64 | -------------------------------------------------------------------------------- /cdk/big-data-stack/README.md: -------------------------------------------------------------------------------- 1 | # CDK Big Data Stack 2 | 3 | This is a Big Data Stack built with Python CDK. *Python3.9 is required.* 4 | 5 | It installs the following: 6 | - VPC 7 | - RDS MySQL Instance 8 | - EMR Cluster 9 | - EKS Cluster 10 | - With k8s Cluster Autoscaler 11 | - With Kubernetes Dashboard 12 | - With Apache Airflow 2.0 13 | - EMR Virtual Cluster (for EKS) 14 | - With Airflow 2.0 plugin 15 | - EMR Studio 16 | - Service Catalog template 17 | 18 | You can use the following step to activate your virtualenv. 19 | 20 | ``` 21 | $ source .venv/bin/activate 22 | ``` 23 | 24 | Once the virtualenv is activated, you can install the required dependencies. 25 | 26 | ``` 27 | $ pip install -r requirements.txt 28 | ``` 29 | 30 | At this point you can now synthesize the CloudFormation template for this code. 31 | 32 | ``` 33 | $ cdk synth 34 | ``` 35 | 36 | To add additional dependencies, for example other CDK libraries, just add 37 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 38 | command. 39 | 40 | To deploy: 41 | 42 | ```shell 43 | cdk deploy --all -c eks_admin_role_name=Admin 44 | ``` 45 | 46 | Where `eks_admin_role_name` is an IAM role that you want to grant admin access to your EKS cluster. 47 | 48 | ## Stack Overview 49 | 50 | ### VPC 51 | 52 | Deploys a simple VPC across 3 availability zones. 53 | 54 | ### RDS 55 | 56 | Creates a MySQL RDS instance to be used as a Hive metastore for EMR. 57 | 58 | ### EMR 59 | 60 | Creates an EMR 5.32 that is configured to connect to the RDS database above. 61 | 62 | A job execution role is created that, currently, has extremely permissive permissions. 63 | 64 | The cluster is creating using the new EMR roles and is configured to use Spark and the Jupyter Enteprise Gateway so you can use it with EMR Notebooks or EMR Studio. 65 | 66 | ### EKS 67 | 68 | Creates an EKS cluster with a single managed Node Group of `m5.xlarge` instances. In addition, it installs the Cluster Autoscaler, Kubernetes Dashboard, and Apache Airflow from the [community-provided Helm Chart](https://github.com/airflow-helm/charts/tree/main/charts/airflow). 
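If you want to verify the add-ons after the stacks deploy, something along these lines should work. The cluster name, region, and namespaces are assumptions here, so check the `cdk deploy` outputs for the real values:

```shell
# Point kubectl at the newly created cluster (name/region are placeholders)
aws eks update-kubeconfig --name <your-eks-cluster-name> --region <your-region>

# The Cluster Autoscaler and Kubernetes Dashboard typically land in kube-system;
# the Airflow release lives in whatever namespace the stack chose.
kubectl get pods --all-namespaces
```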
69 | 70 | An IAM service role is created specifically for EMR. 71 | 72 | Another role is created for Airflow that has permissions to execute jobs against the `emr-containers` API. 73 | 74 | ### EMRContainers 75 | 76 | Creates an EMR Virtual Cluster for running EMR jobs on EKS. 77 | 78 | ## Useful commands 79 | 80 | * `cdk ls` list all stacks in the app 81 | * `cdk synth` emits the synthesized CloudFormation template 82 | * `cdk deploy` deploy this stack to your default AWS account/region 83 | * `cdk diff` compare deployed stack with current state 84 | * `cdk docs` open CDK documentation 85 | 86 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/assets/dags/example_emr_job.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from airflow import DAG 4 | from airflow.providers.amazon.aws.operators.emr_create_job_flow import ( 5 | EmrCreateJobFlowOperator, 6 | ) 7 | from airflow.providers.amazon.aws.sensors.emr_job_flow import EmrJobFlowSensor 8 | from airflow.utils.dates import days_ago 9 | 10 | DEFAULT_ARGS = { 11 | "owner": "airflow", 12 | "depends_on_past": False, 13 | "email": ["airflow@example.com"], 14 | "email_on_failure": False, 15 | "email_on_retry": False, 16 | } 17 | 18 | # [START howto_operator_emr_automatic_steps_config] 19 | SPARK_STEPS = [ 20 | { 21 | "Name": "calculate_pi", 22 | "ActionOnFailure": "CONTINUE", 23 | "HadoopJarStep": { 24 | "Jar": "command-runner.jar", 25 | "Args": ["/usr/lib/spark/bin/run-example", "SparkPi", "10"], 26 | }, 27 | } 28 | ] 29 | 30 | 31 | # "LogUri": "s3://my-emr-log-bucket/default_job_flow_location", 32 | JOB_FLOW_OVERRIDES = { 33 | "Name": "PiCalc", 34 | "ReleaseLabel": "emr-6.3.0", 35 | "Instances": { 36 | "InstanceGroups": [ 37 | { 38 | "Name": "Primary node", 39 | "Market": "SPOT", 40 | "InstanceRole": "MASTER", 41 | "InstanceType": "m5.xlarge", 42 | "InstanceCount": 1, 43 | }, 44 | { 45 | "Name": "Core nodes", 46 | "Market": "SPOT", 47 | "InstanceRole": "CORE", 48 | "InstanceType": "m5.xlarge", 49 | "InstanceCount": 1, 50 | }, 51 | ], 52 | "KeepJobFlowAliveWhenNoSteps": False, 53 | "TerminationProtected": False, 54 | }, 55 | "Steps": SPARK_STEPS, 56 | "JobFlowRole": "EMR_EC2_DefaultRole", 57 | "ServiceRole": "EMR_DefaultRole", 58 | } 59 | # [END howto_operator_emr_automatic_steps_config] 60 | 61 | with DAG( 62 | dag_id="emr_job_flow_automatic_steps_dag", 63 | default_args=DEFAULT_ARGS, 64 | dagrun_timeout=timedelta(hours=2), 65 | start_date=days_ago(2), 66 | schedule_interval="0 3 * * *", 67 | tags=["example"], 68 | ) as dag: 69 | 70 | # [START howto_operator_emr_automatic_steps_tasks] 71 | job_flow_creator = EmrCreateJobFlowOperator( 72 | task_id="create_job_flow", 73 | job_flow_overrides=JOB_FLOW_OVERRIDES, 74 | aws_conn_id="aws_default", 75 | emr_conn_id="emr_default", 76 | ) 77 | 78 | job_sensor = EmrJobFlowSensor( 79 | task_id="check_job_flow", 80 | job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}", 81 | aws_conn_id="aws_default", 82 | ) 83 | 84 | job_flow_creator >> job_sensor 85 | # [END howto_operator_emr_automatic_steps_tasks] 86 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/assets/dags/example_emr_eks_job.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. 
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | """ 18 | This is an example dag for an Amazon EMR on EKS Spark job. 19 | """ 20 | from datetime import timedelta 21 | 22 | from airflow import DAG 23 | from airflow.hooks.base_hook import BaseHook 24 | 25 | from emr_containers.operators.emr_containers import EMRContainerOperator 26 | from airflow.utils.dates import days_ago 27 | 28 | 29 | JOB_DRIVER_ARG = { 30 | "sparkSubmitJobDriver": { 31 | "entryPoint": "local:///usr/lib/spark/examples/src/main/python/pi.py", 32 | "sparkSubmitParameters": "--conf spark.executors.instances=2 --conf spark.executors.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1", # noqa: E501 33 | } 34 | } 35 | 36 | CONFIGURATION_OVERRIDES_ARG = { 37 | "monitoringConfiguration": { 38 | "cloudWatchMonitoringConfiguration": { 39 | "logGroupName": "/aws/emr-eks-spark", 40 | "logStreamNamePrefix": "airflow", 41 | } 42 | } 43 | } 44 | 45 | with DAG( 46 | dag_id='emr_eks_pi_job', 47 | dagrun_timeout=timedelta(hours=2), 48 | start_date=days_ago(1), 49 | schedule_interval="@once", 50 | tags=["emr_containers", "example"], 51 | ) as dag: 52 | 53 | # An example of how to get the cluster id and arn from an Airflow (>2.1) connection 54 | # VIRTUAL_CLUSTER_ID = '{{ conn.emr_eks.extra_dejson["virtual_cluster_id"] }}' 55 | # JOB_ROLE_ARN = '{{ conn.emr_eks.extra_dejson["job_role_arn"] }}' 56 | # In 2.0 we just get the connection, but this executes every time the DAG is loaded 57 | c = BaseHook.get_connection("emr_eks") 58 | cluster_args = c.extra_dejson 59 | VIRTUAL_CLUSTER_ID = cluster_args.get('virtual_cluster_id') 60 | JOB_ROLE_ARN = cluster_args.get('job_role_arn') 61 | 62 | 63 | job_starter = EMRContainerOperator( 64 | task_id="start_job", 65 | virtual_cluster_id=VIRTUAL_CLUSTER_ID, 66 | execution_role_arn=JOB_ROLE_ARN, 67 | release_label="emr-6.3.0-latest", 68 | job_driver=JOB_DRIVER_ARG, 69 | configuration_overrides=CONFIGURATION_OVERRIDES_ARG, 70 | name="pi.py", 71 | ) -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v0_Initial_Revision.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | JobArtifacts: 28 | Type: "String" 29 | Description: "Spark script or Hive SQL" 30 | 31 | 
Mappings: 32 | ComputeMapping: 33 | Generic: 34 | "instancetype": "m5.4xlarge" 35 | CPU: 36 | "instancetype": "c5.4xlarge" 37 | Memory: 38 | "instancetype": "r5.4xlarge" 39 | StepMapping: 40 | Spark: 41 | "stepcommand": "spark-submit --deploy-mode cluster" 42 | Hive: 43 | "stepcommand": "hive-script --run-hive-script --args -f" 44 | 45 | Resources: 46 | EMRCluster: 47 | Type: AWS::EMR::Cluster 48 | Properties: 49 | Name: { Ref: ClusterName } 50 | JobFlowRole: "EMR_EC2_DefaultRole" 51 | ServiceRole: "EMR_DefaultRole" 52 | ReleaseLabel: "emr-5.19.0" 53 | Instances: 54 | Ec2SubnetId: "subnet-XXXX" 55 | Ec2KeyName: "sshkeyname" 56 | MasterInstanceGroup: 57 | InstanceCount: 1 58 | InstanceType: 59 | Fn::FindInMap: 60 | - ComputeMapping 61 | - Ref: "ComputeRequirements" 62 | - "instancetype" 63 | Market: "ON_DEMAND" 64 | Name: "Master" 65 | CoreInstanceGroup: 66 | InstanceCount: 2 67 | InstanceType: 68 | Fn::FindInMap: 69 | - ComputeMapping 70 | - Ref: "ComputeRequirements" 71 | - "instancetype" 72 | Market: "ON_DEMAND" 73 | Name: "Core" 74 | Applications: 75 | - Name: "Spark" 76 | - Name: "Ganglia" 77 | - Name: "Hive" 78 | LogUri: 79 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 80 | 81 | EMRLogProcessor: 82 | Type: AWS::EMR::Step 83 | Properties: 84 | ActionOnFailure: "CONTINUE" 85 | HadoopJarStep: 86 | Jar: "command-runner.jar" 87 | Args: !Split 88 | - " " 89 | - Fn::Join: 90 | - " " 91 | - 92 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 93 | - {Ref: JobArtifacts} 94 | JobFlowId: 95 | Ref: EMRCluster 96 | Name: "Log Converter" 97 | 98 | Outputs: 99 | "MasterNodeHadoopURL": 100 | Description: "EMR Resource Manager" 101 | Value: 102 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 103 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v1_Security_Settings.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | JobArtifacts: 28 | Type: "String" 29 | Description: "Spark script or Hive SQL" 30 | 31 | Mappings: 32 | ComputeMapping: 33 | Generic: 34 | "instancetype": "m5.4xlarge" 35 | CPU: 36 | "instancetype": "c5.4xlarge" 37 | Memory: 38 | "instancetype": "r5.4xlarge" 39 | StepMapping: 40 | Spark: 41 | "stepcommand": "spark-submit --deploy-mode cluster" 42 | Hive: 43 | "stepcommand": "hive-script --run-hive-script --args -f" 44 | 45 | Resources: 46 | EMRCluster: 47 | Type: AWS::EMR::Cluster 48 | Properties: 49 | Name: { Ref: ClusterName } 50 | JobFlowRole: "EMR_EC2_DefaultRole" 51 | ServiceRole: "EMR_DefaultRole" 52 | ReleaseLabel: "emr-5.19.0" 53 | Instances: 54 | Ec2SubnetId: "subnet-XXXX" 55 | Ec2KeyName: "sshkeyname" 56 | MasterInstanceGroup: 57 | InstanceCount: 1 58 | InstanceType: 59 | Fn::FindInMap: 60 | - ComputeMapping 61 | - Ref: "ComputeRequirements" 62 | - "instancetype" 63 | Market: "ON_DEMAND" 64 | Name: "Master" 65 | CoreInstanceGroup: 66 | 
InstanceCount: 2 67 | InstanceType: 68 | Fn::FindInMap: 69 | - ComputeMapping 70 | - Ref: "ComputeRequirements" 71 | - "instancetype" 72 | Market: "ON_DEMAND" 73 | Name: "Core" 74 | Applications: 75 | - Name: "Spark" 76 | - Name: "Ganglia" 77 | - Name: "Hive" 78 | LogUri: 79 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 80 | 81 | EMRLogProcessor: 82 | Type: AWS::EMR::Step 83 | Properties: 84 | ActionOnFailure: "CONTINUE" 85 | HadoopJarStep: 86 | Jar: "command-runner.jar" 87 | Args: !Split 88 | - " " 89 | - Fn::Join: 90 | - " " 91 | - 92 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 93 | - {Ref: JobArtifacts} 94 | JobFlowId: 95 | Ref: EMRCluster 96 | Name: "Log Converter" 97 | 98 | Outputs: 99 | "MasterNodeHadoopURL": 100 | Description: "EMR Resource Manager" 101 | Value: 102 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 103 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/README.md: -------------------------------------------------------------------------------- 1 | 2 | # CDK - EMR Serverless VPC to VPC Connectivity 3 | 4 | This is an example app that shows how to create peering between two VPCs to allow an EMR Serverless job to connect to a service in another VPC. 5 | 6 | In this example, we create an EC2 instance with Postgres installed in one VPC, a test instance in another VPC for verifying connectivity, and an EMR Serverless app with the appropriate subnets and security group created. 7 | 8 | ![](diagram.png) 9 | 10 | ## Usage 11 | 12 | Once the infrastructure is deployed, you can copy the sample job to an S3 bucket, replace the placeholders and run the job. 13 | 14 | When running the job, it's assumed you already have an S3 bucket and EMR runtime role. 15 | 16 | ```bash 17 | S3_BUCKET= 18 | APPLICATION_ID= 19 | JOB_ROLE_ARN= 20 | 21 | # First deploy and copy script to s3 22 | # Your Account and Region must be set 23 | CDK_DEFAULT_ACCOUNT=123456789012 CDK_DEFAULT_REGION=us-west-2 cdk deploy 24 | aws s3 cp pg_connect.py s3://${S3_BUCKET}/code/pyspark 25 | 26 | # Now run an EMR Serverless job with the postgresql package 27 | aws emr-serverless start-job-run \ 28 | --application-id $APPLICATION_ID \ 29 | --execution-role-arn $JOB_ROLE_ARN \ 30 | --job-driver '{ 31 | "sparkSubmit": { 32 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/pg_connect.py", 33 | "sparkSubmitParameters": "--packages org.postgresql:postgresql:42.4.0" 34 | } 35 | }' 36 | ``` 37 | 38 | The job will fail, but you should get an error message that the `users` table doesn't exist that indicates EMR Serverless was able to connect to the postgres instance. 39 | 40 | ## Overview 41 | 42 | This project is set up like a standard Python project. The initialization 43 | process also creates a virtualenv within this project, stored under the `.venv` 44 | directory. To create the virtualenv it assumes that there is a `python3` 45 | (or `python` for Windows) executable in your path with access to the `venv` 46 | package. If for any reason the automatic creation of the virtualenv fails, 47 | you can create the virtualenv manually. 48 | 49 | To manually create a virtualenv on MacOS and Linux: 50 | 51 | ``` 52 | $ python3 -m venv .venv 53 | ``` 54 | 55 | After the init process completes and the virtualenv is created, you can use the following 56 | step to activate your virtualenv. 
57 | 58 | ``` 59 | $ source .venv/bin/activate 60 | ``` 61 | 62 | If you are a Windows platform, you would activate the virtualenv like this: 63 | 64 | ``` 65 | % .venv\Scripts\activate.bat 66 | ``` 67 | 68 | Once the virtualenv is activated, you can install the required dependencies. 69 | 70 | ``` 71 | $ pip install -r requirements.txt 72 | ``` 73 | 74 | At this point you can now synthesize the CloudFormation template for this code. 75 | 76 | ``` 77 | $ cdk synth 78 | ``` 79 | 80 | To add additional dependencies, for example other CDK libraries, just add 81 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 82 | command. 83 | 84 | ## Useful commands 85 | 86 | * `cdk ls` list all stacks in the app 87 | * `cdk synth` emits the synthesized CloudFormation template 88 | * `cdk deploy` deploy this stack to your default AWS account/region 89 | * `cdk diff` compare deployed stack with current state 90 | * `cdk docs` open CDK documentation 91 | 92 | Enjoy! 93 | -------------------------------------------------------------------------------- /emr/eks/videos/external_metastores/README.md: -------------------------------------------------------------------------------- 1 | # EMR on EKS Metastores 2 | 3 | ## Getting the database connection string 4 | 5 | ```shell 6 | RDS_SECRETS=$(aws secretsmanager get-secret-value --secret-id 'arn:aws:secretsmanager:us-east-1:123456789012:secret:RDSStackSecret12345' | jq -r '.SecretString' ) 7 | RDS_USERNAME=$(echo $RDS_SECRETS | jq -r '.username') 8 | RDS_PASSWORD=$(echo $RDS_SECRETS | jq -r '.password') 9 | RDS_DATABASE=$(echo $RDS_SECRETS | jq -r '.dbname') 10 | RDS_HOSTNAME=$(echo $RDS_SECRETS | jq -r '.host') 11 | RDS_STRING="jdbc:mysql://${RDS_HOSTNAME}:3306/${RDS_DATABASE}" 12 | ``` 13 | 14 | ## Get the connector jar 15 | 16 | ```shell 17 | curl -O https://downloads.mariadb.com/Connectors/java/latest/mariadb-java-client-2.3.0.jar 18 | aws s3 cp mariadb-java-client-2.3.0.jar s3://${S3_BUCKET}/artifacts/jars/ 19 | ``` 20 | 21 | 22 | ## Try to run the code 23 | 24 | ```shell 25 | aws emr-containers start-job-run \ 26 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 27 | --name dacort-hive \ 28 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 29 | --release-label emr-6.2.0-latest \ 30 | --job-driver '{ 31 | "sparkSubmitJobDriver": { 32 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/hivejdbc.py", 33 | "sparkSubmitParameters": "--jars s3://'${S3_BUCKET}'/artifacts/jars/mariadb-java-client-2.3.0.jar --conf spark.hadoop.javax.jdo.option.ConnectionDriverName=org.mariadb.jdbc.Driver --conf spark.hadoop.javax.jdo.option.ConnectionUserName='${RDS_USERNAME}' --conf spark.hadoop.javax.jdo.option.ConnectionPassword='${RDS_PASSWORD}' --conf spark.hadoop.javax.jdo.option.ConnectionURL='${RDS_STRING}' --conf spark.driver.cores=1 --conf spark.executor.memory=2G --conf spark.driver.memory=2G --conf spark.executor.cores=2 --conf spark.executor.instances=5" 34 | } 35 | }' \ 36 | --configuration-overrides '{ 37 | "monitoringConfiguration": { 38 | "cloudWatchMonitoringConfiguration": { "logGroupName": "/aws/eks/dacort-emr/eks-spark", "logStreamNamePrefix": "hive" } 39 | } 40 | }' 41 | ``` 42 | 43 | https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/$252Faws$252Feks$252Fdacort-emr$252Feks-spark$3FlogStreamNameFilter$3Dhive 44 | 45 | ## Glue 46 | 47 | 48 | ```shell 49 | aws emr-containers start-job-run \ 50 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 51 | --name dacort-glue \ 52 | --execution-role-arn 
${EMR_EKS_EXECUTION_ARN} \ 53 | --release-label emr-6.2.0-latest \ 54 | --job-driver '{ 55 | "sparkSubmitJobDriver": { 56 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/gluespark.py", 57 | "sparkSubmitParameters": "--conf spark.driver.cores=1 --conf spark.executor.memory=1G --conf spark.driver.memory=1G --conf spark.executor.cores=1 --conf spark.executor.instances=1" 58 | 59 | } 60 | }' \ 61 | --configuration-overrides '{ 62 | "applicationConfiguration": [ 63 | { 64 | "classification": "spark-defaults", 65 | "properties": { 66 | "spark.hadoop.hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" 67 | } 68 | } 69 | ], 70 | "monitoringConfiguration": { 71 | "cloudWatchMonitoringConfiguration": { 72 | "logGroupName": "/aws/eks/dacort-emr/eks-spark", 73 | "logStreamNamePrefix": "glue-cat" 74 | } 75 | } 76 | }' 77 | ``` -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v2_Updated_Parameters.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | JobArtifacts: 28 | Type: "String" 29 | Description: "Spark script or Hive SQL" 30 | 31 | Metadata: 32 | AWS::CloudFormation::Interface: 33 | ParameterLabels: 34 | ClusterName: 35 | default: "Cluster Name" 36 | JobType: 37 | default: "Job Type" 38 | ComputeRequirements: 39 | default: "Compute or Memory" 40 | JobArtifacts: 41 | default: "Job Parameters" 42 | ParameterGroups: 43 | - 44 | Label: 45 | default: "Job Configuration" 46 | Parameters: 47 | - JobType 48 | - JobArtifacts 49 | - 50 | Label: 51 | default: "Cluster Configuration" 52 | Parameters: 53 | - ClusterName 54 | - ComputeRequirements 55 | 56 | Mappings: 57 | ComputeMapping: 58 | Generic: 59 | "instancetype": "m5.4xlarge" 60 | CPU: 61 | "instancetype": "c5.4xlarge" 62 | Memory: 63 | "instancetype": "r5.4xlarge" 64 | StepMapping: 65 | Spark: 66 | "stepcommand": "spark-submit --deploy-mode cluster" 67 | Hive: 68 | "stepcommand": "hive-script --run-hive-script --args -f" 69 | 70 | Resources: 71 | EMRCluster: 72 | Type: AWS::EMR::Cluster 73 | Properties: 74 | Name: { Ref: ClusterName } 75 | JobFlowRole: "EMR_EC2_DefaultRole" 76 | ServiceRole: "EMR_DefaultRole" 77 | ReleaseLabel: "emr-5.19.0" 78 | Instances: 79 | Ec2SubnetId: "subnet-XXXX" 80 | Ec2KeyName: "sshkeyname" 81 | MasterInstanceGroup: 82 | InstanceCount: 1 83 | InstanceType: 84 | Fn::FindInMap: 85 | - ComputeMapping 86 | - Ref: "ComputeRequirements" 87 | - "instancetype" 88 | Market: "ON_DEMAND" 89 | Name: "Master" 90 | CoreInstanceGroup: 91 | InstanceCount: 2 92 | InstanceType: 93 | Fn::FindInMap: 94 | - ComputeMapping 95 | - Ref: "ComputeRequirements" 96 | - "instancetype" 97 | Market: "ON_DEMAND" 98 | Name: "Core" 99 | Applications: 100 | - Name: "Spark" 101 | - Name: "Ganglia" 102 | - Name: "Hive" 103 | LogUri: 104 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", 
"/"]] 105 | 106 | EMRLogProcessor: 107 | Type: AWS::EMR::Step 108 | Properties: 109 | ActionOnFailure: "CONTINUE" 110 | HadoopJarStep: 111 | Jar: "command-runner.jar" 112 | Args: !Split 113 | - " " 114 | - Fn::Join: 115 | - " " 116 | - 117 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 118 | - {Ref: JobArtifacts} 119 | JobFlowId: 120 | Ref: EMRCluster 121 | Name: "Log Converter" 122 | 123 | Outputs: 124 | "MasterNodeHadoopURL": 125 | Description: "EMR Resource Manager" 126 | Value: 127 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 128 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v3_Cluster_Size.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | ClusterSize: 28 | Type: "Number" 29 | Description: "Size of cluster" 30 | AllowedValues: 31 | - "2" 32 | - "5" 33 | - "10" 34 | - "20" 35 | Default: "2" 36 | 37 | JobArtifacts: 38 | Type: "String" 39 | Description: "Spark script or Hive SQL" 40 | 41 | Metadata: 42 | AWS::CloudFormation::Interface: 43 | ParameterLabels: 44 | ClusterName: 45 | default: "Cluster Name" 46 | JobType: 47 | default: "Job Type" 48 | ComputeRequirements: 49 | default: "Compute or Memory" 50 | JobArtifacts: 51 | default: "Job Parameters" 52 | ClusterSize: 53 | default: "Number of core nodes" 54 | ParameterGroups: 55 | - 56 | Label: 57 | default: "Cluster Configuration" 58 | Parameters: 59 | - ClusterName 60 | - ComputeRequirements 61 | - ClusterSize 62 | - 63 | Label: 64 | default: "Job Configuration" 65 | Parameters: 66 | - JobType 67 | - JobArtifacts 68 | 69 | Mappings: 70 | ComputeMapping: 71 | Generic: 72 | "instancetype": "m5.4xlarge" 73 | CPU: 74 | "instancetype": "c5.4xlarge" 75 | Memory: 76 | "instancetype": "r5.4xlarge" 77 | StepMapping: 78 | Spark: 79 | "stepcommand": "spark-submit --deploy-mode cluster" 80 | Hive: 81 | "stepcommand": "hive-script --run-hive-script --args -f" 82 | 83 | Resources: 84 | EMRCluster: 85 | Type: AWS::EMR::Cluster 86 | Properties: 87 | Name: { Ref: ClusterName } 88 | JobFlowRole: "EMR_EC2_DefaultRole" 89 | ServiceRole: "EMR_DefaultRole" 90 | ReleaseLabel: "emr-5.19.0" 91 | Instances: 92 | Ec2SubnetId: "subnet-XXXX" 93 | Ec2KeyName: "sshkeyname" 94 | MasterInstanceGroup: 95 | InstanceCount: 1 96 | InstanceType: 97 | Fn::FindInMap: 98 | - ComputeMapping 99 | - Ref: "ComputeRequirements" 100 | - "instancetype" 101 | Market: "ON_DEMAND" 102 | Name: "Master" 103 | CoreInstanceGroup: 104 | InstanceCount: 105 | Ref: ClusterSize 106 | InstanceType: 107 | Fn::FindInMap: 108 | - ComputeMapping 109 | - Ref: "ComputeRequirements" 110 | - "instancetype" 111 | Market: "ON_DEMAND" 112 | Name: "Core" 113 | Applications: 114 | - Name: "Spark" 115 | - Name: "Ganglia" 116 | - Name: "Hive" 117 | LogUri: 118 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 119 | 120 | EMRLogProcessor: 121 | Type: 
AWS::EMR::Step 122 | Properties: 123 | ActionOnFailure: "CONTINUE" 124 | HadoopJarStep: 125 | Jar: "command-runner.jar" 126 | Args: !Split 127 | - " " 128 | - Fn::Join: 129 | - " " 130 | - 131 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 132 | - {Ref: JobArtifacts} 133 | JobFlowId: 134 | Ref: EMRCluster 135 | Name: "Log Converter" 136 | 137 | Outputs: 138 | "MasterNodeHadoopURL": 139 | Description: "EMR Resource Manager" 140 | Value: 141 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 142 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Presto_Cluster.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Auto-scaling Presto Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | ClusterSize: 10 | Type: "Number" 11 | Description: "Size of cluster" 12 | AllowedValues: 13 | - "2" 14 | - "5" 15 | - "10" 16 | - "20" 17 | - "50" 18 | - "100" 19 | Default: "2" 20 | 21 | Metadata: 22 | AWS::CloudFormation::Interface: 23 | ParameterLabels: 24 | ClusterName: 25 | default: "Cluster Name" 26 | ClusterSize: 27 | default: "Max number of auto-scaled Task nodes" 28 | ParameterGroups: 29 | - 30 | Label: 31 | default: "Cluster Configuration" 32 | Parameters: 33 | - ClusterName 34 | - ClusterSize 35 | 36 | Resources: 37 | EMRCluster: 38 | Type: AWS::EMR::Cluster 39 | Properties: 40 | Name: { Ref: ClusterName } 41 | JobFlowRole: "EMR_EC2_DefaultRole" 42 | ServiceRole: "EMR_DefaultRole" 43 | AutoScalingRole: "EMR_AutoScaling_DefaultRole" 44 | ReleaseLabel: "emr-5.19.0" 45 | Instances: 46 | Ec2SubnetId: "subnet-XXXX" 47 | Ec2KeyName: "sshkeyname" 48 | MasterInstanceGroup: 49 | InstanceCount: 1 50 | InstanceType: "r5.4xlarge" 51 | Market: "ON_DEMAND" 52 | Name: "Master" 53 | CoreInstanceGroup: 54 | InstanceCount: 2 55 | InstanceType: "r5.4xlarge" 56 | Market: "ON_DEMAND" 57 | Name: "Core" 58 | Applications: 59 | - Name: "Presto" 60 | - Name: "Ganglia" 61 | - Name: "Hue" 62 | LogUri: 63 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 64 | 65 | AutoScalingInstanceGroup: 66 | Type: AWS::EMR::InstanceGroupConfig 67 | Properties: 68 | InstanceCount: 2 69 | InstanceType: "r5.4xlarge" 70 | InstanceRole: "TASK" 71 | Market: "ON_DEMAND" 72 | Name: "TaskAutoScale" 73 | JobFlowId: 74 | Ref: "EMRCluster" 75 | AutoScalingPolicy: 76 | Constraints: 77 | MaxCapacity: 78 | Ref: ClusterSize 79 | MinCapacity: 2 80 | Rules: 81 | - Name: Scale-out 82 | Description: Scale-out policy 83 | Action: 84 | SimpleScalingPolicyConfiguration: 85 | AdjustmentType: CHANGE_IN_CAPACITY 86 | ScalingAdjustment: 18 87 | CoolDown: 300 88 | Trigger: 89 | CloudWatchAlarmDefinition: 90 | Dimensions: 91 | - Key: JobFlowId 92 | Value: '${emr.clusterId}' 93 | EvaluationPeriods: 1 94 | Namespace: AWS/ElasticMapReduce 95 | Period: 300 96 | ComparisonOperator: GREATER_THAN_OR_EQUAL 97 | Statistic: AVERAGE 98 | Threshold: 1 99 | Unit: COUNT 100 | MetricName: ScaleOutToMax 101 | - Name: Scale-in 102 | Description: Scale-in policy 103 | Action: 104 | SimpleScalingPolicyConfiguration: 105 | AdjustmentType: CHANGE_IN_CAPACITY 106 | ScalingAdjustment: -18 107 | CoolDown: 300 108 | Trigger: 109 | CloudWatchAlarmDefinition: 110 | Dimensions: 111 | - Key: JobFlowId 112 | Value: '${emr.clusterId}' 113 | EvaluationPeriods: 1 114 | Namespace: AWS/ElasticMapReduce 
115 | Period: 300 116 | ComparisonOperator: GREATER_THAN_OR_EQUAL 117 | Statistic: AVERAGE 118 | Threshold: 1 119 | Unit: COUNT 120 | MetricName: ScaleInToMin 121 | 122 | Outputs: 123 | "PrestoUI": 124 | Description: "Presto Admin Console" 125 | Value: 126 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8889" 127 | "HueUI": 128 | Description: "Hue Interface" 129 | Value: 130 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8888" 131 | 132 | -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/src/main/java/aws/example/emrcontainers/StartJobRunExample.java: -------------------------------------------------------------------------------- 1 | package aws.example.emrcontainers; 2 | 3 | import software.amazon.awssdk.services.emrcontainers.EmrContainersClient; 4 | import software.amazon.awssdk.services.emrcontainers.model.*; 5 | 6 | public class StartJobRunExample { 7 | 8 | public static StartJobRunResponse submitEMRContainersJob(EmrContainersClient emrContainersClient, String virtualClusterId, String jobRoleArn) { 9 | SparkSubmitJobDriver sparkSubmit = SparkSubmitJobDriver.builder() 10 | .entryPoint("local:///usr/lib/spark/examples/src/main/python/pi.py") 11 | .entryPointArguments() 12 | .sparkSubmitParameters("--conf spark.executor.instances=1 --conf spark.executor.memory=2G --conf spark.executor.cores=1 --conf spark.driver.cores=1") 13 | .build(); 14 | 15 | JobDriver jobDriver = JobDriver.builder() 16 | .sparkSubmitJobDriver(sparkSubmit) 17 | .build(); 18 | 19 | StartJobRunRequest jobRunRequest = StartJobRunRequest.builder() 20 | .name("pi.py") 21 | .jobDriver(jobDriver) 22 | .executionRoleArn(jobRoleArn) 23 | .virtualClusterId(virtualClusterId) 24 | .releaseLabel(ExampleConstants.EMR_RELEASE_LABEL) 25 | .build(); 26 | 27 | return emrContainersClient.startJobRun(jobRunRequest); 28 | } 29 | 30 | // Wait for an EMR Containers query to complete, fail or to be cancelled 31 | public static void waitForQueryToComplete(EmrContainersClient emrContainersClient, String virtualClusterId, String jobId) throws InterruptedException { 32 | DescribeJobRunResponse jobRunResponse; 33 | DescribeJobRunRequest jobRunRequest = DescribeJobRunRequest.builder() 34 | .virtualClusterId(virtualClusterId) 35 | .id(jobId) 36 | .build(); 37 | 38 | boolean isQueryStillRunning = true; 39 | while (isQueryStillRunning) { 40 | jobRunResponse = emrContainersClient.describeJobRun(jobRunRequest); 41 | JobRunState jobState = jobRunResponse.jobRun().state(); 42 | if (jobState == JobRunState.FAILED) { 43 | throw new RuntimeException("The EMR Containers job failed to run with error message: " + 44 | jobRunResponse.jobRun().failureReasonAsString()); 45 | } else if (jobState == JobRunState.CANCELLED) { 46 | throw new RuntimeException("The EMR Containers job was cancelled."); 47 | } else if (jobState == JobRunState.COMPLETED) { 48 | isQueryStillRunning = false; 49 | } else { 50 | // Sleep an amount of time before retrying again 51 | Thread.sleep(ExampleConstants.SLEEP_AMOUNT_IN_MS); 52 | } 53 | System.out.println("The current status is: " + jobState.toString()); 54 | } 55 | } 56 | 57 | public static void main(String[] args) throws InterruptedException { 58 | final String USAGE = "\n" + 59 | "StartJobRunExample - Run an EMR on EKS job\n\n" + 60 | "Usage: StartJobRunExample \n\n" + 61 | "Where:\n" + 62 | " virtual_cluster_id - The virtual cluster ID of your EMR on EKS cluster.\n\n" + 63 | " job_role_arn - The execution role ARN for the job run.\n"; 64 | 65 | if (args.length < 2) 
{ 66 | System.out.println(USAGE); 67 | System.exit(1); 68 | } 69 | 70 | String virtual_cluster_id = args[0]; 71 | String job_role_arn = args[1]; 72 | 73 | System.out.println("Creating a new job on cluster: " + virtual_cluster_id); 74 | 75 | EmrContainersClient emrContainersClient = EmrContainersClient.builder() 76 | .build(); 77 | 78 | // Create a default job on the provided EMR on EKS cluster 79 | StartJobRunResponse jobRun = submitEMRContainersJob(emrContainersClient, virtual_cluster_id, job_role_arn); 80 | System.out.println("Started job: " + jobRun.id()); 81 | 82 | // Now wait for the job to run to completion 83 | waitForQueryToComplete(emrContainersClient, virtual_cluster_id, jobRun.id()); 84 | emrContainersClient.close(); 85 | 86 | System.out.println("Done!"); 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v4_Auto_Terminate.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | ClusterSize: 28 | Type: "Number" 29 | Description: "Size of cluster" 30 | AllowedValues: 31 | - "2" 32 | - "5" 33 | - "10" 34 | - "20" 35 | Default: "2" 36 | 37 | JobArtifacts: 38 | Type: "String" 39 | Description: "Spark script or Hive SQL" 40 | 41 | AutoTerminateCluster: 42 | Type: "String" 43 | Description: "Terminate the cluster when the job is done" 44 | AllowedValues: 45 | - "True" 46 | - "False" 47 | Default: "False" 48 | 49 | Metadata: 50 | AWS::CloudFormation::Interface: 51 | ParameterLabels: 52 | ClusterName: 53 | default: "Cluster Name" 54 | JobType: 55 | default: "Job Type" 56 | ComputeRequirements: 57 | default: "Compute or Memory" 58 | JobArtifacts: 59 | default: "Job Parameters" 60 | ClusterSize: 61 | default: "Number of core nodes" 62 | AutoTerminateCluster: 63 | default: "Auto terminate EMR cluster" 64 | ParameterGroups: 65 | - 66 | Label: 67 | default: "Cluster Configuration" 68 | Parameters: 69 | - ClusterName 70 | - ComputeRequirements 71 | - ClusterSize 72 | - AutoTerminateCluster 73 | - 74 | Label: 75 | default: "Job Configuration" 76 | Parameters: 77 | - JobType 78 | - JobArtifacts 79 | 80 | Mappings: 81 | ComputeMapping: 82 | Generic: 83 | "instancetype": "m5.4xlarge" 84 | CPU: 85 | "instancetype": "c5.4xlarge" 86 | Memory: 87 | "instancetype": "r5.4xlarge" 88 | StepMapping: 89 | Spark: 90 | "stepcommand": "spark-submit --deploy-mode cluster" 91 | Hive: 92 | "stepcommand": "hive-script --run-hive-script --args -f" 93 | 94 | Resources: 95 | EMRCluster: 96 | Type: AWS::EMR::Cluster 97 | Properties: 98 | Name: { Ref: ClusterName } 99 | JobFlowRole: "EMR_EC2_DefaultRole" 100 | ServiceRole: "EMR_DefaultRole" 101 | ReleaseLabel: "emr-5.19.0" 102 | Instances: 103 | Ec2SubnetId: "subnet-XXXX" 104 | Ec2KeyName: "sshkeyname" 105 | MasterInstanceGroup: 106 | InstanceCount: 1 107 | InstanceType: 108 | Fn::FindInMap: 109 | - ComputeMapping 110 | - Ref: "ComputeRequirements" 111 | - "instancetype" 112 | Market: "ON_DEMAND" 
113 | Name: "Master" 114 | CoreInstanceGroup: 115 | InstanceCount: 116 | Ref: ClusterSize 117 | InstanceType: 118 | Fn::FindInMap: 119 | - ComputeMapping 120 | - Ref: "ComputeRequirements" 121 | - "instancetype" 122 | Market: "ON_DEMAND" 123 | Name: "Core" 124 | Applications: 125 | - Name: "Spark" 126 | - Name: "Ganglia" 127 | - Name: "Hive" 128 | LogUri: 129 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 130 | 131 | EMRLogProcessor: 132 | Type: AWS::EMR::Step 133 | Properties: 134 | ActionOnFailure: "CONTINUE" 135 | HadoopJarStep: 136 | Jar: "command-runner.jar" 137 | Args: !Split 138 | - " " 139 | - Fn::Join: 140 | - " " 141 | - 142 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 143 | - {Ref: JobArtifacts} 144 | JobFlowId: 145 | Ref: EMRCluster 146 | Name: "Log Converter" 147 | 148 | Outputs: 149 | "MasterNodeHadoopURL": 150 | Description: "EMR Resource Manager" 151 | Value: 152 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 153 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/create_sc_entries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Ensure dependencies are installed 4 | if ! [ -x "$(command -v jq)" ]; then 5 | echo 'Error: jq is not installed.' >&2 6 | exit 1 7 | fi 8 | 9 | # Define some environment variables 10 | : ${TARGET_SUBNET:=subnet-XXXX} 11 | : ${TARGET_GRANTEE:=role/Admin} 12 | : ${CLUSTER_SSH_KEY:=sshkeyname} 13 | : ${BUCKET_NAME:=damons-reinvent-demo} 14 | : ${AWS_REGION:=us-east-1} 15 | : ${AWS_PROFILE:=default} 16 | 17 | # Used to retrieve output from AWS CLI commands 18 | TMP_FILE=$(mktemp) 19 | 20 | # macOS uses BSD sed, Linux uses GNU sed 21 | if [[ "$OSTYPE" == "linux-gnu" ]]; then 22 | SED_CMD="sed -i" 23 | elif [[ "$OSTYPE" == "darwin"* ]]; then 24 | SED_CMD="sed -i ''" 25 | else 26 | echo "Unsupported operating system, only Linux and macOS are supported." 27 | exit 1 28 | fi 29 | 30 | # Update settings specific to our desired region in the CloudFormation templates 31 | find assets/cloudformation -type f -exec \ 32 | ${SED_CMD} "s/Ec2SubnetId:.*/Ec2SubnetId: \"${TARGET_SUBNET}\"/" {} + 33 | find assets/cloudformation -type f -exec \ 34 | ${SED_CMD} "s/Ec2KeyName:.*/Ec2KeyName: \"${CLUSTER_SSH_KEY}\"/" {} + 35 | 36 | # Deploy the updated templates 37 | RELEASE_BUCKET=${BUCKET_NAME} AWS_PROFILE=${AWS_PROFILE} make 38 | 39 | # Create a new portfolio 40 | aws --region ${AWS_REGION} servicecatalog create-portfolio \ 41 | --display-name "EMR re:Invent Demo" \ 42 | --provider-name "@dacort" \ 43 | --description "Pre-defined on-demand EMR clusters" \ 44 | | tee ${TMP_FILE} 45 | PORTFOLIO_ID=$(jq -r '.PortfolioDetail.Id' ${TMP_FILE}) 46 | 47 | # Create a product 48 | aws --region ${AWS_REGION} servicecatalog create-product --name "Data Analyst EMR" \ 49 | --owner "@dacort" \ 50 | --description "Provides Hive, Spark, and Hue for interactive queries." 
\ 51 | --product-type CLOUD_FORMATION_TEMPLATE \ 52 | --provisioning-artifact-parameters '{"Name":"Initial revision", "Description": "", "Info":{"LoadTemplateFromURL":"https://s3.amazonaws.com/'${BUCKET_NAME}'/reinvent/cloudformation/Spark_Cluster_Versions/v0_Initial_Revision.cf.yml"},"Type":"CLOUD_FORMATION_TEMPLATE"}' \ 53 | | tee ${TMP_FILE} 54 | PRODUCT_ID=$(jq -r '.ProductViewDetail.ProductViewSummary.ProductId' ${TMP_FILE}) 55 | 56 | # Connect the product to our portfolio 57 | aws --region ${AWS_REGION} servicecatalog associate-product-with-portfolio --product-id ${PRODUCT_ID} --portfolio-id ${PORTFOLIO_ID} 58 | 59 | # Also create a Data Science product 60 | aws --region ${AWS_REGION} servicecatalog create-product --name "Data Science EMR" \ 61 | --owner "@dacort" \ 62 | --description "Provides TensorFlow, JupyterHub, and MXNet for ML queries." \ 63 | --product-type CLOUD_FORMATION_TEMPLATE \ 64 | --provisioning-artifact-parameters '{"Name":"Initial revision", "Description": "", "Info":{"LoadTemplateFromURL":"https://s3.amazonaws.com/'${BUCKET_NAME}'/reinvent/cloudformation/Spark_Cluster_Versions/v0_Initial_Revision.cf.yml"},"Type":"CLOUD_FORMATION_TEMPLATE"}' \ 65 | | tee ${TMP_FILE} 66 | DS_PRODUCT_ID=$(jq -r '.ProductViewDetail.ProductViewSummary.ProductId' ${TMP_FILE}) 67 | 68 | # Connect the product to our portfolio 69 | aws --region ${AWS_REGION} servicecatalog associate-product-with-portfolio --product-id ${DS_PRODUCT_ID} --portfolio-id ${PORTFOLIO_ID} 70 | 71 | # Add different product revisions 72 | VERSIONS=( "Updated security setting:v1_Security_Settings" 73 | "Updated parameter labels:v2_Updated_Parameters" 74 | "Choose your own cluster size!:v3_Cluster_Size" 75 | "Auto-terminate functionality:v4_Auto_Terminate" 76 | "Spark UI:v5_SparkUI" ) 77 | 78 | for version in "${VERSIONS[@]}" ; do 79 | NAME=${version%%:*} 80 | TEMPLATE=${version#*:} 81 | aws --region ${AWS_REGION} servicecatalog create-provisioning-artifact \ 82 | --product-id ${PRODUCT_ID} \ 83 | --parameters '{ 84 | "Name": "'"${NAME}"'", 85 | "Description": "", 86 | "Info": { 87 | "LoadTemplateFromURL": "https://s3.amazonaws.com/'${BUCKET_NAME}'/reinvent/cloudformation/Spark_Cluster_Versions/'${TEMPLATE}'.cf.yml" 88 | }, 89 | "Type": "CLOUD_FORMATION_TEMPLATE" 90 | }' 91 | done 92 | 93 | # Grant access to the portfolio 94 | aws --region ${AWS_REGION} servicecatalog associate-principal-with-portfolio \ 95 | --portfolio-id ${PORTFOLIO_ID} \ 96 | --principal-type IAM \ 97 | --principal-arn arn:aws:iam::$(aws --region ${AWS_REGION} sts get-caller-identity --query Account --output text):${TARGET_GRANTEE} -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | ClusterSize: 28 | Type: "Number" 29 | Description: "Size of cluster" 30 | AllowedValues: 31 | - "2" 32 | - "5" 33 | - "10" 34 | - "20" 35 | Default: "2" 36 | 37 | 
JobArtifacts: 38 | Type: "String" 39 | Description: "Spark script or Hive SQL" 40 | 41 | AutoTerminateCluster: 42 | Type: "String" 43 | Description: "Terminate the cluster when the job is done" 44 | AllowedValues: 45 | - "True" 46 | - "False" 47 | Default: "False" 48 | 49 | Metadata: 50 | AWS::CloudFormation::Interface: 51 | ParameterLabels: 52 | ClusterName: 53 | default: "Cluster Name" 54 | JobType: 55 | default: "Job Type" 56 | ComputeRequirements: 57 | default: "Compute or Memory" 58 | JobArtifacts: 59 | default: "Job Parameters" 60 | ClusterSize: 61 | default: "Number of core nodes" 62 | AutoTerminateCluster: 63 | default: "Auto terminate EMR cluster" 64 | ParameterGroups: 65 | - 66 | Label: 67 | default: "Cluster Configuration" 68 | Parameters: 69 | - ClusterName 70 | - ComputeRequirements 71 | - ClusterSize 72 | - AutoTerminateCluster 73 | - 74 | Label: 75 | default: "Job Configuration" 76 | Parameters: 77 | - JobType 78 | - JobArtifacts 79 | 80 | Mappings: 81 | ComputeMapping: 82 | Generic: 83 | "instancetype": "m5.4xlarge" 84 | CPU: 85 | "instancetype": "c5.4xlarge" 86 | Memory: 87 | "instancetype": "r5.4xlarge" 88 | StepMapping: 89 | Spark: 90 | "stepcommand": "spark-submit --deploy-mode cluster" 91 | Hive: 92 | "stepcommand": "hive-script --run-hive-script --args -f" 93 | 94 | Resources: 95 | EMRCluster: 96 | Type: AWS::EMR::Cluster 97 | Properties: 98 | Name: { Ref: ClusterName } 99 | JobFlowRole: "EMR_EC2_DefaultRole" 100 | ServiceRole: "EMR_DefaultRole" 101 | ReleaseLabel: "emr-5.19.0" 102 | Instances: 103 | Ec2SubnetId: "subnet-XXXX" 104 | Ec2KeyName: "sshkeyname" 105 | MasterInstanceGroup: 106 | InstanceCount: 1 107 | InstanceType: 108 | Fn::FindInMap: 109 | - ComputeMapping 110 | - Ref: "ComputeRequirements" 111 | - "instancetype" 112 | Market: "ON_DEMAND" 113 | Name: "Master" 114 | CoreInstanceGroup: 115 | InstanceCount: 116 | Ref: ClusterSize 117 | InstanceType: 118 | Fn::FindInMap: 119 | - ComputeMapping 120 | - Ref: "ComputeRequirements" 121 | - "instancetype" 122 | Market: "ON_DEMAND" 123 | Name: "Core" 124 | Applications: 125 | - Name: "Spark" 126 | - Name: "Ganglia" 127 | - Name: "Hive" 128 | LogUri: 129 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 130 | 131 | EMRLogProcessor: 132 | Type: AWS::EMR::Step 133 | Properties: 134 | ActionOnFailure: "CONTINUE" 135 | HadoopJarStep: 136 | Jar: "command-runner.jar" 137 | Args: !Split 138 | - " " 139 | - Fn::Join: 140 | - " " 141 | - 142 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 143 | - {Ref: JobArtifacts} 144 | JobFlowId: 145 | Ref: EMRCluster 146 | Name: "Data Converter" 147 | 148 | Outputs: 149 | "MasterNodeHadoopURL": 150 | Description: "EMR Resource Manager" 151 | Value: 152 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 153 | "SparkHistoryServerURL": 154 | Description: "Spark UI" 155 | Value: 156 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:18080" 157 | 158 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v5_SparkUI.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 
| Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | ClusterSize: 28 | Type: "Number" 29 | Description: "Size of cluster" 30 | AllowedValues: 31 | - "2" 32 | - "5" 33 | - "10" 34 | - "20" 35 | Default: "2" 36 | 37 | JobArtifacts: 38 | Type: "String" 39 | Description: "Spark script or Hive SQL" 40 | 41 | AutoTerminateCluster: 42 | Type: "String" 43 | Description: "Terminate the cluster when the job is done" 44 | AllowedValues: 45 | - "True" 46 | - "False" 47 | Default: "False" 48 | 49 | Metadata: 50 | AWS::CloudFormation::Interface: 51 | ParameterLabels: 52 | ClusterName: 53 | default: "Cluster Name" 54 | JobType: 55 | default: "Job Type" 56 | ComputeRequirements: 57 | default: "Compute or Memory" 58 | JobArtifacts: 59 | default: "Job Parameters" 60 | ClusterSize: 61 | default: "Number of core nodes" 62 | AutoTerminateCluster: 63 | default: "Auto terminate EMR cluster" 64 | ParameterGroups: 65 | - 66 | Label: 67 | default: "Cluster Configuration" 68 | Parameters: 69 | - ClusterName 70 | - ComputeRequirements 71 | - ClusterSize 72 | - AutoTerminateCluster 73 | - 74 | Label: 75 | default: "Job Configuration" 76 | Parameters: 77 | - JobType 78 | - JobArtifacts 79 | 80 | Mappings: 81 | ComputeMapping: 82 | Generic: 83 | "instancetype": "m5.4xlarge" 84 | CPU: 85 | "instancetype": "c5.4xlarge" 86 | Memory: 87 | "instancetype": "r5.4xlarge" 88 | StepMapping: 89 | Spark: 90 | "stepcommand": "spark-submit --deploy-mode cluster" 91 | Hive: 92 | "stepcommand": "hive-script --run-hive-script --args -f" 93 | 94 | Resources: 95 | EMRCluster: 96 | Type: AWS::EMR::Cluster 97 | Properties: 98 | Name: { Ref: ClusterName } 99 | JobFlowRole: "EMR_EC2_DefaultRole" 100 | ServiceRole: "EMR_DefaultRole" 101 | ReleaseLabel: "emr-5.19.0" 102 | Instances: 103 | Ec2SubnetId: "subnet-XXXX" 104 | Ec2KeyName: "sshkeyname" 105 | MasterInstanceGroup: 106 | InstanceCount: 1 107 | InstanceType: 108 | Fn::FindInMap: 109 | - ComputeMapping 110 | - Ref: "ComputeRequirements" 111 | - "instancetype" 112 | Market: "ON_DEMAND" 113 | Name: "Master" 114 | CoreInstanceGroup: 115 | InstanceCount: 116 | Ref: ClusterSize 117 | InstanceType: 118 | Fn::FindInMap: 119 | - ComputeMapping 120 | - Ref: "ComputeRequirements" 121 | - "instancetype" 122 | Market: "ON_DEMAND" 123 | Name: "Core" 124 | Applications: 125 | - Name: "Spark" 126 | - Name: "Ganglia" 127 | - Name: "Hive" 128 | LogUri: 129 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 130 | 131 | EMRLogProcessor: 132 | Type: AWS::EMR::Step 133 | Properties: 134 | ActionOnFailure: "CONTINUE" 135 | HadoopJarStep: 136 | Jar: "command-runner.jar" 137 | Args: !Split 138 | - " " 139 | - Fn::Join: 140 | - " " 141 | - 142 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 143 | - {Ref: JobArtifacts} 144 | JobFlowId: 145 | Ref: EMRCluster 146 | Name: "Data Converter" 147 | 148 | Outputs: 149 | "MasterNodeHadoopURL": 150 | Description: "EMR Resource Manager" 151 | Value: 152 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 153 | "SparkHistoryServerURL": 154 | Description: "Spark UI" 155 | Value: 156 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:18080" 157 | 158 | -------------------------------------------------------------------------------- /emr/airflow/README.md: -------------------------------------------------------------------------------- 1 | # Run 
EMR jobs with Airflow 2 | 3 | Associated video: [https://youtu.be/Z--sNHqkM7c](https://youtu.be/Z--sNHqkM7c) 4 | 5 | Airflow is a popular open source workflow management tool. Amazon EMR is a service that allows you to run various big data frameworks like Spark, Hive, and Presto on top of EC2 or EKS. In this demonstration, we'll show you how to schedule a PySpark job using Airflow on: 6 | 7 | - EMR on EC2 8 | - EMR on EKS 9 | 10 | What we want to do is run a sample job on both. 11 | 12 | Let's get started. 13 | 14 | ## Pre-requisites 15 | 16 | - Pre-existing VPC and [EMR default roles](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-roles.html) 17 | - EMR on EKS virtual cluster and execution role (see my [big data stack blog post](https://dacort.dev/posts/cdk-big-data-stack/) for how to deploy this with CDK) 18 | 19 | ## Airflow Operators 20 | 21 | Airflow has several [EMR Operators](https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/operators/emr.html) that can be used to create a cluster, run a job, and terminate a cluster. 22 | 23 | In addition, there's currently an open pull request to integrate EMR on EKS as well. 24 | 25 | For this demo, we'll show how to use: 26 | 1. `EmrCreateJobFlowOperator` to create a new EMR on EC2 cluster, run a job, and automatically terminate the cluster. 27 | 2. `EMRContainerOperator` to submit a job to a pre-existing EMR on EKS virtual cluster. 28 | 29 | ## Configuring Airflow 30 | 31 | ### IAM Permissions 32 | 33 | IAM will need access to start and monitor EMR clusters as well as start and monitor EMR on EKS jobs. 34 | 35 | In addition to the standard permissions for the [MWAA service execution role](https://docs.aws.amazon.com/mwaa/latest/userguide/mwaa-create-role.html), we'll also give it access to create these jobs. 36 | 37 | EMR on EC2 will also need access to `iam:PassRole` for the default EMR roles. 38 | 39 | ```python 40 | iam.PolicyStatement( 41 | actions=[ 42 | "emr-containers:StartJobRun", 43 | "emr-containers:DescribeJobRun", 44 | "emr-containers:CancelJobRun", 45 | ], 46 | effect=iam.Effect.ALLOW, 47 | resources=["*"], 48 | ), 49 | iam.PolicyStatement( 50 | actions=[ 51 | "elasticmapreduce:RunJobFlow", 52 | "elasticmapreduce:DescribeStep", 53 | "elasticmapreduce:DescribeCluster", 54 | ], 55 | effect=iam.Effect.ALLOW, 56 | resources=["*"], 57 | ), 58 | iam.PolicyStatement( 59 | actions=["iam:PassRole"], 60 | effect=iam.Effect.ALLOW, 61 | resources=[ 62 | f"arn:aws:iam::{self.account}:role/EMR_DemoRole", 63 | f"arn:aws:iam::{self.account}:role/EMR_EC2_DemoRole", 64 | f"arn:aws:iam::{self.account}:role/EMR_EC2_DefaultRole", 65 | f"arn:aws:iam::{self.account}:role/EMR_DefaultRole", 66 | ], 67 | ), 68 | ``` 69 | 70 | ### Airflow Connections 71 | 72 | If you like, you can hard-code your connection options in your job or you can store them in a connection. 73 | 74 | At the very least, you need to add `region_name` in the Extra section in your `aws_default` connection. 75 | 76 | ```json 77 | {"region_name":"us-east-1"} 78 | ``` 79 | 80 | EMR on EC2 doesn't need any additional configuration than this because we're going to create the cluster from scratch and use a default set of roles, security groups, and VPC. 81 | 82 | EMR on EKS, however, requires you to already have an execution role and virtual cluster set up. 83 | 84 | Here's an example connection that defines a different region, EMR virtual cluster, and execution role ARN for EMR on EKS. 
85 | 86 | ```json 87 | {"region_name":"us-east-2","virtual_cluster_id":"wfto7bwu9n8ajdohqkri06pc1","job_role_arn":"arn:aws:iam::111122223333:role/emr_eks_default_role"} 88 | ``` 89 | 90 | ## Running Jobs 91 | 92 | ### EMR on EC2 93 | 94 | ONe mistake I made while working on this was I used an instance size of `c5.xlarge` - unfortunately that didn't work with the default SparkPi job, so I had to change it to an `m5.xlarge`. 95 | 96 | Other than that, the example EMR on EC2 job is pretty straight-forward! It'll create a small cluster with a Step (job) defined by default, wait until that step finishes and then EMR will terminate the cluster. 97 | 98 | Let's try to trigger the DAG and see what happens. 99 | 100 | ### EMR on EKS 101 | 102 | Since the pull request has not been merged, we had to deploy our own custom set of plugins to be able to run the EMR on EKS job. 103 | 104 | To do this, we made use of my [example EMR on EKS plugin](https://github.com/dacort/emr-eks-airflow2-plugin) repository and added a reference to this in our `requirements.txt` file. 105 | 106 | ``` 107 | emr-containers @ https://github.com/dacort/emr-eks-airflow2-plugin/archive/main.zip 108 | apache-airflow[amazon]==2.0.2 109 | ``` 110 | 111 | The `apache-airflow[amazon]` requirement is needed for the EMR on EC2 Operator. 112 | 113 | So, with our requirements installed, our connection defined, let's go ahead and trigger the DAG! 114 | -------------------------------------------------------------------------------- /emr/eks/videos/custom_images/README.md: -------------------------------------------------------------------------------- 1 | # EMR on EKS Custom Images 2 | 3 | Use Bokeh with EMR on EKS to draw daily images of Air Quality data in the continental US. 4 | 5 | Demo video: [https://youtu.be/0x4DRKmNPfQ](https://youtu.be/0x4DRKmNPfQ) 6 | 7 | ## Overview 8 | 9 | - First, we need to login to the relevant ECR and pull the latest EMR image we want. 10 | 11 | ```shell 12 | aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 711395599931.dkr.ecr.us-east-2.amazonaws.com 13 | docker pull 711395599931.dkr.ecr.us-east-2.amazonaws.com/notebook-spark/emr-6.3.0:latest 14 | ``` 15 | 16 | - Next, we want to build a Dockerfile that installs the `bokeh` library. 17 | 18 | We also want to populate the bokeh sample dataset directly on the image itself because we use that for our map. 19 | 20 | ```dockerfile 21 | FROM 711395599931.dkr.ecr.us-east-2.amazonaws.com/notebook-spark/emr-6.3.0:latest 22 | 23 | USER root 24 | 25 | # Install Chrome 26 | RUN curl https://intoli.com/install-google-chrome.sh | bash && \ 27 | mv /usr/bin/google-chrome-stable /usr/bin/chrome 28 | 29 | RUN pip3 install \ 30 | bokeh>=2.3.2 \ 31 | chromedriver-py>=91.0.4472.19.0 \ 32 | selenium>=3.141.0 33 | RUN bokeh sampledata 34 | 35 | RUN ln -s /usr/local/lib/python3.7/site-packages/chromedriver_py/chromedriver_linux64 /usr/local/bin/chromedriver 36 | 37 | USER hadoop:hadoop 38 | ``` 39 | 40 | - Now build your image 41 | 42 | ```shell 43 | docker build -t emr-6.3.0-bokeh:latest . 44 | ``` 45 | 46 | - Validate 47 | 48 | I added a simple test script that generates a plot and validates it against a known hash. 49 | 50 | ```shell 51 | docker run --rm -it emr-6.3.0-bokeh python3 /test/gen_plot.py 52 | ``` 53 | 54 | If you see "All good! 🙌" we're good to go! 
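
The actual validation logic lives in `test/gen_plot.py` in this repo — I haven't copied it here, but a script along these lines is roughly what it takes: render a small Bokeh plot headlessly and compare the PNG bytes against a pinned hash. This is a minimal sketch only (assuming Bokeh 2.x and the chromedriver symlink from the Dockerfile above; `EXPECTED_SHA256` is a placeholder, not the repo's real value):

```python
import hashlib
import io

from bokeh.io.export import get_screenshot_as_png
from bokeh.io.webdriver import create_chromium_webdriver
from bokeh.plotting import figure

# Placeholder hash -- pin this to whatever digest your rendered image actually produces.
EXPECTED_SHA256 = "0" * 64

# A small, deterministic plot so repeated renders produce the same PNG.
p = figure(plot_width=400, plot_height=300, toolbar_location=None)
p.line([1, 2, 3, 4], [4, 3, 2, 1], line_width=2)

# Render headlessly using the chromedriver we symlinked into /usr/local/bin.
driver = create_chromium_webdriver(["--no-sandbox"])
image = get_screenshot_as_png(p, driver=driver)

# Hash the PNG bytes and compare against the pinned value.
buf = io.BytesIO()
image.save(buf, format="png")
digest = hashlib.sha256(buf.getvalue()).hexdigest()

if digest == EXPECTED_SHA256:
    print("All good! 🙌")
else:
    raise SystemExit(f"Unexpected image hash: {digest}")
```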
55 | 56 | - Push it to a (private) GH repo 57 | 58 | ```shell 59 | export GH_USERNAME=dacort 60 | echo $CR_PAT | docker login ghcr.io -u ${GH_USERNAME} --password-stdin 61 | docker tag emr-6.3.0-bokeh:latest ghcr.io/${GH_USERNAME}/emr-6.3.0-bokeh:latest 62 | docker push ghcr.io/${GH_USERNAME}/emr-6.3.0-bokeh:latest 63 | ``` 64 | 65 | - Set up a secret to allow for the image pull 66 | 67 | ```shell 68 | DOCKER_AUTH=$(echo -n "${GH_USERNAME}:${CR_PAT}" | base64) 69 | 70 | DOCKER_DATA=$(echo '{ "auths": { "ghcr.io": { "auth":"'${DOCKER_AUTH}'" } } }' | base64) 71 | 72 | cat <<EOF > dockerconfigjson-github-com.yaml 73 | kind: Secret 74 | type: kubernetes.io/dockerconfigjson 75 | apiVersion: v1 76 | metadata: 77 | name: dockerconfigjson-github-com 78 | namespace: emr-jobs 79 | labels: 80 | app: app-name 81 | data: 82 | .dockerconfigjson: ${DOCKER_DATA} 83 | EOF 84 | 85 | kubectl create -f dockerconfigjson-github-com.yaml -n emr-jobs 86 | ``` 87 | 88 | - Now let's run it! 89 | 90 | ```shell 91 | aws emr-containers start-job-run \ 92 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 93 | --name dacort-aqi \ 94 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 95 | --release-label emr-6.3.0-latest \ 96 | --job-driver '{ 97 | "sparkSubmitJobDriver": { 98 | "entryPoint": "s3://'${S3_BUCKET}'/code/generate_aqi_map.py", 99 | "entryPointArguments": ["'${S3_BUCKET}'", "output/airq/"], 100 | "sparkSubmitParameters": "--conf spark.kubernetes.container.image=ghcr.io/dacort/emr-6.3.0-bokeh:latest --conf spark.kubernetes.container.image.pullSecrets=dockerconfigjson-github-com" 101 | } 102 | }' \ 103 | --configuration-overrides '{ 104 | "monitoringConfiguration": { 105 | "s3MonitoringConfiguration": { "logUri": "s3://'${S3_BUCKET}'/logs/" } 106 | } 107 | }' 108 | ``` 109 | 110 | - We should see some air quality data! 111 | 112 | ```shell 113 | aws s3 ls s3://${S3_BUCKET}/output/airq/ 114 | # 2021-06-15 15:44:49 277735 2021-06-15-latest.png 115 | ``` 116 | 117 | ```shell 118 | aws s3 cp s3://${S3_BUCKET}/output/airq/2021-06-15-latest.png . 119 | open 2021-06-15-latest.png 120 | ``` 121 | 122 | ## Testing your code locally 123 | 124 | If you want, you can start up `pyspark` on your image locally to interactively test your code. 125 | 126 | ```shell 127 | docker run --rm -it emr-6.3.0-bokeh pyspark --deploy-mode client --master 'local[1]' 128 | ``` 129 | 130 | Note that if you access AWS resources from within your environment, you'll either need to change your `spark.hadoop.fs.s3.customAWSCredentialsProvider` in your Spark job or set AWS credentials in your environment.
If you have an access key or secret, you can pass those into the `docker run` command like so: 131 | 132 | ```shell 133 | docker run --rm -it \ 134 | -e AWS_ACCESS_KEY_ID \ 135 | -e AWS_SECRET_ACCESS_KEY \ 136 | emr-6.3.0-bokeh \ 137 | pyspark --deploy-mode client --master 'local[1]' 138 | ``` 139 | 140 | ## References 141 | 142 | - https://stackoverflow.com/questions/47087506/flatten-a-fiona-structure-to-dictionary-for-bokeh/47135604#47135604 143 | - https://discourse.bokeh.org/t/questions-re-choropleth/2589/3 144 | - https://towardsdatascience.com/walkthrough-mapping-basics-with-bokeh-and-geopandas-in-python-43f40aa5b7e9 145 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/emr_serverless_vpc_to_vpc/emr_serverless_vpc_to_vpc_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Stack 2 | from aws_cdk import aws_ec2 as ec2 # Duration, 3 | from aws_cdk import aws_emrserverless as emrs 4 | from aws_cdk import aws_iam as iam 5 | from constructs import Construct 6 | 7 | 8 | class EmrServerlessVpcToVpcStack(Stack): 9 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 10 | super().__init__(scope, construct_id, **kwargs) 11 | 12 | # Create two VPCs, ensure their CIDRs don't overlap 13 | vpc1 = ec2.Vpc(self, "EMRServerless_VPC1", max_azs=3, cidr="10.0.0.0/16") 14 | vpc2 = ec2.Vpc(self, "EMRServerless_VPC2", max_azs=3, cidr="10.1.0.0/16") 15 | 16 | # This is necessary on Ubuntu instances to install cfn-init and cfn-signal 17 | user_data = ec2.UserData.for_linux() 18 | user_data.add_commands( 19 | "apt-get update -y", 20 | "apt-get install -y -o DPkg::Lock::Timeout=60 git python3-pip", 21 | "python3 -m pip install -U pip", 22 | "python3 -m pip install https://s3.amazonaws.com/cloudformation-examples/aws-cfn-bootstrap-py3-latest.tar.gz", 23 | "mkdir -p /opt/aws/bin/", 24 | "ln -s /usr/local/bin/cfn-* /opt/aws/bin/", 25 | ) 26 | 27 | # Create a an EC2 instance running postgres in VPC1 and an inbound security group 28 | svc_sg = ec2.SecurityGroup(self, "VPC1_Service", vpc=vpc1) 29 | instance = ec2.Instance( 30 | self, 31 | "pg", 32 | vpc=vpc1, 33 | instance_type=ec2.InstanceType("t2.micro"), 34 | machine_image=ec2.MachineImage.lookup( 35 | name="ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-20211129" 36 | ), 37 | security_group=svc_sg, 38 | init=ec2.CloudFormationInit.from_elements( 39 | ec2.InitCommand.shell_command( 40 | "sudo apt-get install -o DPkg::Lock::Timeout=60 -y postgresql" 41 | ), 42 | ec2.InitCommand.shell_command( 43 | "sudo sh -c 'echo listen_addresses = '*' >> /etc/postgresql/12/main/postgresql.conf'" 44 | ), 45 | ec2.InitCommand.shell_command( 46 | "sudo sh -c 'echo host all all 0.0.0.0/0 md5 >> /etc/postgresql/12/main/postgresql.conf'" 47 | ), 48 | ec2.InitCommand.shell_command( 49 | "sudo systemctl restart postgresql.service" 50 | ), 51 | ec2.InitCommand.shell_command( 52 | "sudo -u postgres psql -c \"CREATE USER remote WITH PASSWORD 'remote';\"" 53 | ), 54 | ), 55 | user_data=user_data, 56 | ) 57 | 58 | # Add SSM policy so we can remote in without SSH 59 | instance.role.add_managed_policy( 60 | iam.ManagedPolicy.from_aws_managed_policy_name( 61 | "AmazonSSMManagedInstanceCore" 62 | ) 63 | ) 64 | 65 | # Create a test EC2 instance in VPC2 with the same security group as our EMR Serverless application 66 | # We can use this to validate connectivity 67 | test_sg = ec2.SecurityGroup(self, "VPC2_Service", vpc=vpc2) 68 | instance2 = 
ec2.Instance( 69 | self, 70 | "test", 71 | vpc=vpc2, 72 | instance_type=ec2.InstanceType("t2.micro"), 73 | machine_image=ec2.MachineImage.lookup( 74 | name="ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-20211129" 75 | ), 76 | security_group=test_sg, 77 | init=ec2.CloudFormationInit.from_elements( 78 | ec2.InitCommand.shell_command( 79 | "sudo apt-get install -o DPkg::Lock::Timeout=60 -y netcat" 80 | ), 81 | ), 82 | user_data=user_data, 83 | ) 84 | instance2.role.add_managed_policy( 85 | iam.ManagedPolicy.from_aws_managed_policy_name( 86 | "AmazonSSMManagedInstanceCore" 87 | ) 88 | ) 89 | 90 | # Peer the two VPCs 91 | fn_vPCPeering_connection = ec2.CfnVPCPeeringConnection( 92 | self, 93 | "MyCfnVPCPeeringConnection", 94 | peer_vpc_id=vpc1.vpc_id, 95 | vpc_id=vpc2.vpc_id, 96 | ) 97 | 98 | # Then create routes between eachof the subnets in each VPC 99 | for idx, subnet in enumerate(vpc2.private_subnets): 100 | ec2.CfnRoute( 101 | self, 102 | f"PeerRoute-{idx}", 103 | route_table_id=subnet.route_table.route_table_id, 104 | destination_cidr_block=vpc1.vpc_cidr_block, 105 | vpc_peering_connection_id=fn_vPCPeering_connection.ref, 106 | ) 107 | 108 | for idx, subnet in enumerate(vpc1.private_subnets): 109 | ec2.CfnRoute( 110 | self, 111 | f"PeerRoute-2-{idx}", 112 | route_table_id=subnet.route_table.route_table_id, 113 | destination_cidr_block=vpc2.vpc_cidr_block, 114 | vpc_peering_connection_id=fn_vPCPeering_connection.ref, 115 | ) 116 | 117 | # Allow postgres from vpc2 to vpc1 118 | svc_sg.add_ingress_rule( 119 | peer=test_sg, 120 | connection=ec2.Port.tcp(5432), 121 | description="Allow Postgres from VPC2", 122 | ) 123 | 124 | # Finally create an EMR Serverless app to test this on with the appropriate subnets and security group 125 | emrs.CfnApplication( 126 | self, 127 | "spark_app", 128 | release_label="emr-6.9.0", 129 | type="SPARK", 130 | name="cdk-spark", 131 | network_configuration=emrs.CfnApplication.NetworkConfigurationProperty( 132 | subnet_ids=vpc2.select_subnets().subnet_ids, 133 | security_group_ids=[test_sg.security_group_id], 134 | ), 135 | ) 136 | -------------------------------------------------------------------------------- /cdk/big-data-stack/stacks/emr.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import ( 2 | core as cdk, 3 | aws_emr as emr, 4 | aws_ec2 as ec2, 5 | aws_iam as iam, 6 | aws_secretsmanager as secrets, 7 | ) 8 | 9 | from stacks.utils import get_or_create_bucket 10 | 11 | 12 | class EMRStack(cdk.Stack): 13 | cluster: emr.CfnCluster 14 | 15 | def __init__( 16 | self, 17 | scope: cdk.Construct, 18 | construct_id: str, 19 | vpc: ec2.IVpc, 20 | name: str, 21 | release_label: str, 22 | rds_secret: secrets.Secret, 23 | rds_connections: ec2.Connections, 24 | log_bucket_name: str = None, 25 | ssh_key_name: str = None, 26 | **kwargs, 27 | ) -> None: 28 | super().__init__(scope, construct_id, **kwargs) 29 | 30 | self.tag_vpc(vpc) 31 | 32 | job_role = self.get_job_role() 33 | service_role = self.get_service_role() 34 | instance_profile = self.create_instance_profile(job_role) 35 | log_bucket = get_or_create_bucket(self, "emr_logs", log_bucket_name) 36 | 37 | # Assign necessary permissions 38 | # EMR needs to be able to PutObject to the log bucket 39 | log_bucket.grant_put(job_role) 40 | 41 | # EMR needs to be able to PassRole to the instance profile role 42 | # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-role-for-ec2.html#emr-ec2-role-least-privilege 43 | # 
https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-role.html 44 | service_role.add_to_policy( 45 | iam.PolicyStatement( 46 | actions=["iam:PassRole"], 47 | resources=[job_role.role_arn], 48 | conditions={ 49 | "StringEquals": {"iam:PassedToService": "ec2.amazonaws.com"} 50 | }, 51 | ) 52 | ) 53 | 54 | # Database configuration variables 55 | rds_hostname = rds_secret.secret_value_from_json("host").to_string() 56 | rds_port = rds_secret.secret_value_from_json("port").to_string() 57 | rds_dbname = rds_secret.secret_value_from_json("dbname").to_string() 58 | 59 | # Desired subnet for the EMR cluster 60 | emr_subnet = vpc.public_subnets[0] 61 | 62 | self.cluster = emr.CfnCluster( 63 | self, 64 | construct_id, 65 | instances=emr.CfnCluster.JobFlowInstancesConfigProperty( 66 | master_instance_group=emr.CfnCluster.InstanceGroupConfigProperty( 67 | instance_count=1, instance_type="m5.xlarge" 68 | ), 69 | core_instance_group=emr.CfnCluster.InstanceGroupConfigProperty( 70 | instance_count=2, instance_type="m5.xlarge" 71 | ), 72 | ec2_subnet_id=emr_subnet.subnet_id, 73 | ), 74 | name=name, 75 | release_label=release_label, 76 | log_uri=f"s3://{log_bucket.bucket_name}/elasticmapreduce/", 77 | job_flow_role=job_role.role_name, 78 | service_role=service_role.role_name, 79 | applications=[ 80 | emr.CfnCluster.ApplicationProperty(name=n) 81 | for n in [ 82 | "Spark", 83 | "Hive", 84 | "Zeppelin", 85 | "Livy", 86 | "JupyterEnterpriseGateway", 87 | ] 88 | ], 89 | visible_to_all_users=True, # Required for EMR Notebooks 90 | configurations=[ 91 | emr.CfnCluster.ConfigurationProperty( 92 | classification="hive-site", 93 | configuration_properties={ 94 | "javax.jdo.option.ConnectionURL": f"jdbc:mysql://{rds_hostname}:{rds_port}/{rds_dbname}?createDatabaseIfNotExist=true", 95 | "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver", 96 | "javax.jdo.option.ConnectionUserName": rds_secret.secret_value_from_json( 97 | "username" 98 | ).to_string(), 99 | "javax.jdo.option.ConnectionPassword": rds_secret.secret_value_from_json( 100 | "password" 101 | ).to_string(), 102 | }, 103 | ), 104 | ], 105 | tags=[ 106 | cdk.CfnTag( 107 | key="for-use-with-amazon-emr-managed-policies", value="true" 108 | ), 109 | ], 110 | ) 111 | 112 | # Wait for the instance profile to be created 113 | self.cluster.add_depends_on(instance_profile) 114 | 115 | # Allow EMR to connect to the RDS database 116 | self.add_rds_ingres(emr_subnet.ipv4_cidr_block, rds_connections) 117 | 118 | def tag_vpc( 119 | self, 120 | vpc: ec2.IVpc, 121 | ) -> None: 122 | # The VPC requires a Tag to allow EMR to create the relevant security groups 123 | cdk.Tags.of(vpc).add("for-use-with-amazon-emr-managed-policies", "true") 124 | 125 | def add_rds_ingres(self, subnet, conn: ec2.Connections) -> None: 126 | conn.security_groups[0].add_ingress_rule( 127 | peer=ec2.Peer.ipv4(subnet), 128 | connection=ec2.Port.tcp(3306), 129 | description="EMR MySQL Access", 130 | ) 131 | 132 | def get_service_role(self) -> iam.Role: 133 | return iam.Role( 134 | self, 135 | "emr_service_role", 136 | assumed_by=iam.ServicePrincipal("elasticmapreduce.amazonaws.com"), 137 | managed_policies=[ 138 | iam.ManagedPolicy.from_aws_managed_policy_name( 139 | "service-role/AmazonEMRServicePolicy_v2" 140 | ) 141 | ], 142 | ) 143 | 144 | def get_job_role(self) -> iam.Role: 145 | """ 146 | Create a new EC2 instance profile role for EMR instances. 147 | This role allows full read-only access to S3. 
148 | """ 149 | return iam.Role( 150 | self, 151 | "EMRJobRole", 152 | assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"), 153 | managed_policies=[ 154 | iam.ManagedPolicy.from_aws_managed_policy_name( 155 | "AmazonS3ReadOnlyAccess" 156 | ) 157 | ] 158 | ) 159 | 160 | def create_instance_profile(self, job_role: iam.Role) -> iam.CfnInstanceProfile: 161 | return iam.CfnInstanceProfile( 162 | self, 163 | "emr_instance_profile", 164 | instance_profile_name=job_role.role_name, 165 | roles=[job_role.role_name], 166 | ) 167 | 168 | def log_writer_policy(self, bucket: str) -> iam.PolicyStatement: 169 | return iam.PolicyStatement( 170 | effect=iam.Effect.ALLOW, 171 | actions=["s3:PutObject"], 172 | resources=[f"arn:aws:s3:::{bucket}/*"], 173 | ) -------------------------------------------------------------------------------- /emr/eks/videos/custom_images/generate_aqi_map.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import io 4 | 5 | import boto3 6 | import geopandas as gpd 7 | from bokeh.io.export import get_screenshot_as_png 8 | from bokeh.io.webdriver import create_chromium_webdriver 9 | from bokeh.models import ColorBar, GeoJSONDataSource, LinearColorMapper 10 | from bokeh.palettes import Reds9 as palette 11 | from bokeh.plotting import figure 12 | from PIL.Image import Image 13 | from pyspark.broadcast import Broadcast 14 | from pyspark.context import SparkContext 15 | from pyspark.sql import SparkSession, dataframe 16 | from pyspark.sql.dataframe import DataFrame 17 | from pyspark.sql.functions import last, udf 18 | from pyspark.sql.types import StringType 19 | from pyspark.sql.window import Window 20 | from shapely.geometry import Point 21 | 22 | STATE_FILE = "file:///usr/local/share/bokeh/cb_2020_us_state_500k.zip" 23 | COUNTY_FILE = "file:///usr/local/share/bokeh/cb_2020_us_county_500k.zip" 24 | EXCLUDED_STATES = ["AK", "HI", "PR", "GU", "VI", "MP", "AS"] 25 | 26 | 27 | def find_first_county_id(longitude: float, latitude: float): 28 | p = Point(longitude, latitude) 29 | for index, geo in bc_county.value.items(): 30 | if geo.intersects(p): 31 | return index 32 | return None 33 | 34 | 35 | find_first_county_id_udf_v2 = udf(find_first_county_id, StringType()) 36 | 37 | 38 | def load_county_data(sc: SparkContext) -> Broadcast: 39 | """ 40 | Loads census.gov polygon data for US counties and broadcasts 41 | a hash of county GEOID to geometry. 
42 | """ 43 | countydf = gpd.read_file(COUNTY_FILE) 44 | return sc.broadcast(dict(zip(countydf["GEOID"], countydf["geometry"]))) 45 | 46 | 47 | def get_latest_aqi_avg_by_county(date) -> DataFrame: 48 | """ 49 | Fetches `date` data from the OpenAQ dataset and performs the following: 50 | - Filters down to US only 51 | - Filters to pm2.5 readings 52 | - Retrieves the most recent reading 53 | - Enriches the dataframe with Census data county GEOID 54 | - Calculates the average reading per county 55 | """ 56 | df = spark.read.json(f"s3://openaq-fetches/realtime-gzipped/{date}/") 57 | 58 | # Filter down to US locations only 59 | usdf = ( 60 | df.where(df.country == "US") 61 | .where(df.parameter == "pm25") 62 | .select("coordinates", "date", "parameter", "unit", "value", "location") 63 | ) 64 | 65 | # Retrieve the most recent pm2.5 reading per county 66 | windowSpec = ( 67 | Window.partitionBy("location") 68 | .orderBy("date.utc") 69 | .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing) 70 | ) 71 | last_reading_df = ( 72 | usdf.withColumn("last_value", last("value").over(windowSpec)) 73 | .select("coordinates", "last_value") 74 | .distinct() 75 | ) 76 | 77 | # Find the county that this reading is from 78 | countydf = last_reading_df.withColumn( 79 | "GEOID", 80 | find_first_county_id_udf_v2( 81 | last_reading_df.coordinates.longitude, last_reading_df.coordinates.latitude 82 | ), 83 | ).select("GEOID", "last_value") 84 | 85 | # Calculate the average reading per county 86 | pm_avg_by_county = ( 87 | countydf.groupBy("GEOID") 88 | .agg({"last_value": "avg"}) 89 | .withColumnRenamed("avg(last_value)", "avg_value") 90 | ) 91 | 92 | return pm_avg_by_county 93 | 94 | 95 | def generate_map(df: dataframe, title: str) -> Image: 96 | """ 97 | Generate an air quality map for the continental US. 98 | """ 99 | palette_r = tuple(reversed(palette)) 100 | 101 | # Read in county and state geo data from census.gov 102 | county_df = gpd.read_file(COUNTY_FILE).query(f"STUSPS not in {EXCLUDED_STATES}") 103 | state_df = gpd.read_file(STATE_FILE).query(f"STUSPS not in {EXCLUDED_STATES}") 104 | 105 | # Merge in our air quality data 106 | county_aqi_df = county_df.merge(df.toPandas(), on="GEOID") 107 | color_column = "avg_value" 108 | 109 | # Convert to a "proper" Albers projection :) 110 | state_json = state_df.to_crs("ESRI:102003").to_json() 111 | county_json = county_aqi_df.to_crs("ESRI:102003").to_json() 112 | 113 | # Now build the plot! 
114 | p = figure( 115 | title=title, 116 | plot_width=1100, 117 | plot_height=700, 118 | toolbar_location=None, 119 | x_axis_location=None, 120 | y_axis_location=None, 121 | tooltips=[ 122 | ("County", "@NAME"), 123 | ("Air Quality Index", "@avg_value"), 124 | ], 125 | ) 126 | color_mapper = LinearColorMapper(palette=palette_r) 127 | p.grid.grid_line_color = None 128 | p.hover.point_policy = "follow_mouse" 129 | p.patches( 130 | "xs", 131 | "ys", 132 | fill_alpha=0.0, 133 | line_color="black", 134 | line_width=0.5, 135 | source=GeoJSONDataSource(geojson=state_json), 136 | ) 137 | p.patches( 138 | "xs", 139 | "ys", 140 | fill_alpha=0.7, 141 | fill_color={"field": color_column, "transform": color_mapper}, 142 | line_color="black", 143 | line_width=0.5, 144 | source=GeoJSONDataSource(geojson=county_json), 145 | ) 146 | 147 | color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, width=10) 148 | p.add_layout(color_bar, "right") 149 | 150 | driver = create_chromium_webdriver(["--no-sandbox"]) 151 | return get_screenshot_as_png(p, height=700, width=1100, driver=driver) 152 | 153 | 154 | def upload_image(image: Image, bucket_name, key): 155 | print(f"Uploading image data to s3://{bucket_name}/{key}") 156 | client = boto3.client("s3") 157 | in_mem_file = io.BytesIO() 158 | image.save(in_mem_file, format="png") 159 | in_mem_file.seek(0) 160 | client.put_object(Bucket=bucket_name, Key=key, Body=in_mem_file) 161 | 162 | 163 | def parse_args() -> argparse.ArgumentParser: 164 | parser = argparse.ArgumentParser() 165 | parser.add_argument("bucket", help="The name of the S3 bucket to upload to.") 166 | parser.add_argument( 167 | "prefix", 168 | help="The prefix where the image file (date-latest.png) will be uploaded.", 169 | ) 170 | parser.add_argument( 171 | "--date", 172 | help="The date to create the AQI map for.", 173 | default=f"{datetime.datetime.utcnow().date()}", 174 | ) 175 | return parser.parse_args() 176 | 177 | 178 | if __name__ == "__main__": 179 | """ 180 | Generates an Air Quality Index (AQI) map for the continential US. 181 | By default, it generates the latest AQI readings for the current date. 182 | 183 | Usage: generate_aqi_map 184 | """ 185 | spark = SparkSession.builder.appName("AirQualityMapper").getOrCreate() 186 | bc_county = load_county_data(spark.sparkContext) 187 | 188 | args = parse_args() 189 | date = args.date 190 | bucket = args.bucket 191 | key = f"{args.prefix}{date}-latest.png" 192 | 193 | pm_reading_by_county = get_latest_aqi_avg_by_county(date) 194 | image = generate_map(pm_reading_by_county, f"US PM2.5 by county for {date}") 195 | upload_image(image, bucket, key) 196 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 
13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/mwaa/mwaa_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import core as cdk 2 | 3 | import aws_cdk.aws_mwaa as mwaa 4 | 5 | from aws_cdk import ( 6 | core as cdk, 7 | aws_ec2 as ec2, 8 | aws_mwaa as mwaa, 9 | aws_s3 as s3, 10 | aws_s3_deployment as s3d, 11 | aws_iam as iam, 12 | ) 13 | 14 | 15 | class MwaaStack(cdk.Stack): 16 | def __init__(self, scope: cdk.Construct, construct_id: str, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | # We'll create a VPC just for this 20 | vpc = ec2.Vpc(self, "mwaa-vpc", max_azs=2) 21 | 22 | # We need a bucket for assets 23 | bucket = s3.Bucket( 24 | self, 25 | "mwaa-bucket", 26 | versioned=True, 27 | auto_delete_objects=True, 28 | removal_policy=cdk.RemovalPolicy.DESTROY, 29 | block_public_access=s3.BlockPublicAccess.BLOCK_ALL, 30 | ) 31 | files = s3d.BucketDeployment( 32 | self, 33 | "mwaa-assets", 34 | sources=[s3d.Source.asset("./assets")], 35 | destination_bucket=bucket, 36 | ) 37 | 38 | # And a service role with additional EMR permissions 39 | # See https://docs.aws.amazon.com/mwaa/latest/userguide/mwaa-create-role.html 40 | mwaa_service_role = iam.Role( 41 | self, 42 | "mwaa-service-role", 43 | assumed_by=iam.CompositePrincipal( 44 | iam.ServicePrincipal("airflow.amazonaws.com"), 45 | iam.ServicePrincipal("airflow-env.amazonaws.com"), 46 | ), 47 | inline_policies={ 48 | "CDKmwaaPolicyDocument": self.mwaa_policy_document( 49 | "dacort-airflow", bucket.bucket_arn 50 | ) 51 | }, 52 | path="/service-role/", 53 | ) 54 | 55 | # And security group 56 | security_group = ec2.SecurityGroup( 57 | self, id="mwaa-sg", vpc=vpc, security_group_name="mwaa-sg" 58 | ) 59 | security_group.connections.allow_internally(ec2.Port.all_traffic(), "MWAA") 60 | 61 | # Enable logging on everything 62 | logging_configuration = mwaa.CfnEnvironment.LoggingConfigurationProperty( 63 | task_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 64 | enabled=True, log_level="INFO" 65 | ), 66 | worker_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 67 | enabled=True, log_level="INFO" 68 | ), 69 | scheduler_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 70 | enabled=True, log_level="INFO" 71 | ), 72 | dag_processing_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 73 | enabled=True, log_level="INFO" 74 | ), 75 | webserver_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 76 | enabled=True, log_level="INFO" 77 | ), 78 | ) 79 | 80 | # Create our MWAA 81 | subnets = [subnet.subnet_id for subnet in vpc.private_subnets] 82 | airflow = mwaa.CfnEnvironment( 83 | self, 84 | "airflow-v2", 85 | name="dacort-airflow", 86 | airflow_version="2.0.2", 87 | dag_s3_path=f"dags/", 88 | source_bucket_arn=bucket.bucket_arn, 89 | execution_role_arn=mwaa_service_role.role_arn, 90 | 
requirements_s3_path="requirements.txt", 91 | webserver_access_mode="PUBLIC_ONLY", 92 | environment_class="mw1.small", 93 | network_configuration=mwaa.CfnEnvironment.NetworkConfigurationProperty( 94 | subnet_ids=subnets, 95 | security_group_ids=[security_group.security_group_id], 96 | ), 97 | logging_configuration=logging_configuration, 98 | ) 99 | airflow.node.add_dependency(files) 100 | 101 | # Register a couple outputs 102 | cdk.CfnOutput(self, "mwaa_bucket", value=bucket.bucket_name) 103 | cdk.CfnOutput(self, "mwaa_url", value=f"https://{airflow.attr_webserver_url}") 104 | 105 | def mwaa_policy_document(self, mwaa_env_name: str, mwaa_bucket_arn: str): 106 | return iam.PolicyDocument( 107 | statements=[ 108 | iam.PolicyStatement( 109 | actions=["airflow:PublishMetrics"], 110 | effect=iam.Effect.ALLOW, 111 | resources=[ 112 | f"arn:aws:airflow:{self.region}:{self.account}:environment/{mwaa_env_name}" 113 | ], 114 | ), 115 | iam.PolicyStatement( 116 | actions=["s3:ListAllMyBuckets"], 117 | effect=iam.Effect.DENY, 118 | resources=[f"{mwaa_bucket_arn}/*", f"{mwaa_bucket_arn}"], 119 | ), 120 | iam.PolicyStatement( 121 | actions=["s3:*"], 122 | effect=iam.Effect.ALLOW, 123 | resources=[f"{mwaa_bucket_arn}/*", f"{mwaa_bucket_arn}"], 124 | ), 125 | iam.PolicyStatement( 126 | actions=[ 127 | "logs:CreateLogStream", 128 | "logs:CreateLogGroup", 129 | "logs:PutLogEvents", 130 | "logs:GetLogEvents", 131 | "logs:GetLogRecord", 132 | "logs:GetLogGroupFields", 133 | "logs:GetQueryResults", 134 | ], 135 | effect=iam.Effect.ALLOW, 136 | resources=[ 137 | f"arn:aws:logs:{self.region}:{self.account}:log-group:airflow-{mwaa_env_name}-*" 138 | ], 139 | ), 140 | iam.PolicyStatement( 141 | actions=["logs:DescribeLogGroups"], 142 | effect=iam.Effect.ALLOW, 143 | resources=["*"], 144 | ), 145 | iam.PolicyStatement( 146 | actions=[ 147 | "sqs:ChangeMessageVisibility", 148 | "sqs:DeleteMessage", 149 | "sqs:GetQueueAttributes", 150 | "sqs:GetQueueUrl", 151 | "sqs:ReceiveMessage", 152 | "sqs:SendMessage", 153 | ], 154 | effect=iam.Effect.ALLOW, 155 | resources=[f"arn:aws:sqs:{self.region}:*:airflow-celery-*"], 156 | ), 157 | iam.PolicyStatement( 158 | actions=[ 159 | "kms:Decrypt", 160 | "kms:DescribeKey", 161 | "kms:GenerateDataKey*", 162 | "kms:Encrypt", 163 | ], 164 | effect=iam.Effect.ALLOW, 165 | resources=["*"], 166 | conditions={ 167 | "StringEquals": { 168 | "kms:ViaService": [ 169 | f"sqs.{self.region}.amazonaws.com", 170 | f"s3.{self.region}.amazonaws.com", 171 | ] 172 | } 173 | }, 174 | ), 175 | iam.PolicyStatement( 176 | actions=[ 177 | "emr-containers:StartJobRun", 178 | "emr-containers:DescribeJobRun", 179 | "emr-containers:CancelJobRun", 180 | ], 181 | effect=iam.Effect.ALLOW, 182 | resources=["*"], 183 | ), 184 | iam.PolicyStatement( 185 | actions=[ 186 | "elasticmapreduce:RunJobFlow", 187 | "elasticmapreduce:DescribeStep", 188 | "elasticmapreduce:DescribeCluster", 189 | ], 190 | effect=iam.Effect.ALLOW, 191 | resources=["*"], 192 | ), 193 | iam.PolicyStatement( 194 | actions=["iam:PassRole"], 195 | effect=iam.Effect.ALLOW, 196 | resources=[ 197 | f"arn:aws:iam::{self.account}:role/EMR_DemoRole", 198 | f"arn:aws:iam::{self.account}:role/EMR_EC2_DemoRole", 199 | f"arn:aws:iam::{self.account}:role/EMR_EC2_DefaultRole", 200 | f"arn:aws:iam::{self.account}:role/EMR_DefaultRole", 201 | ], 202 | ), 203 | ] 204 | ) 205 | -------------------------------------------------------------------------------- /spark/local-k8s/README.md: 
-------------------------------------------------------------------------------- 1 | # Spark on Local Kubernetes 2 | 3 | This is a demo of how to get Spark up and running with a local (KIND) Kubernetes environment. 4 | 5 | 6 | ## Pre-requisites 7 | 8 | - [KIND](https://kind.sigs.k8s.io/) 9 | - Docker, `kubectl`, `helm` 10 | 11 | ## Install and start KIND 12 | 13 | ```bash 14 | kind create cluster --config kind-config.yaml 15 | kubectl cluster-info --context kind-kind 16 | ``` 17 | 18 | ``` 19 | ❯ kubectl cluster-info --context kind-kind 20 | Kubernetes control plane is running at https://127.0.0.1:61563 21 | CoreDNS is running at https://127.0.0.1:61563/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy 22 | 23 | To further debug and diagnose cluster problems, use 'kubectl cluster-info dump'. 24 | ``` 25 | 26 | By default, `kind` adds the cluster info to your `~/.kube/config` file. If you need to get it later (maybe your config got updated), you can always use something like this: 27 | 28 | ```bash 29 | kind get kubeconfig > ~/.kube/kind-config 30 | KUBECONFIG=~/.kube/kind-config kubectl get pods 31 | ``` 32 | 33 | ### (Optional) Install Official Apache Spark K8s Operator 34 | 35 | ```bash 36 | helm install spark-kubernetes-operator \ 37 | https://nightlies.apache.org/spark/charts/spark-kubernetes-operator-0.1.0-SNAPSHOT.tgz 38 | ``` 39 | 40 | #### uninstall process 41 | 42 | ```bash 43 | helm uninstall spark-kubernetes-operator 44 | kubectl delete crd sparkapplications.spark.apache.org 45 | kubectl delete crd sparkclusters.spark.apache.org 46 | ``` 47 | 48 | ## Verify Installs 49 | 50 | ```bash 51 | kubectl --context=kind-kind get pods --all-namespaces 52 | ``` 53 | 54 | _remaining commands assume your `kubectl` context is set to `kind-kind`_ 55 | 56 | ``` 57 | ❯ kubectl --context=kind-kind get pods --all-namespaces 58 | NAMESPACE NAME READY STATUS RESTARTS AGE 59 | kube-system coredns-7db6d8ff4d-7x252 1/1 Running 0 2m36s 60 | kube-system coredns-7db6d8ff4d-kjk6v 1/1 Running 0 2m36s 61 | kube-system etcd-kind-control-plane 1/1 Running 0 2m52s 62 | kube-system kindnet-62kqj 1/1 Running 0 2m36s 63 | kube-system kube-apiserver-kind-control-plane 1/1 Running 0 2m51s 64 | kube-system kube-controller-manager-kind-control-plane 1/1 Running 0 2m51s 65 | kube-system kube-proxy-ggk2n 1/1 Running 0 2m36s 66 | kube-system kube-scheduler-kind-control-plane 1/1 Running 0 2m51s 67 | local-path-storage local-path-provisioner-988d74bc-dkc6k 1/1 Running 0 2m36s 68 | spark-operator spark-operator-7b7b54cf75-8p9jb 1/1 Running 0 25s 69 | ``` 70 | 71 | 72 | ## Run a Spark job 73 | 74 | ```bash 75 | kubectl --context=kind-kind create -f - <<-EOF 76 | apiVersion: spark.apache.org/v1alpha1 77 | kind: SparkApplication 78 | metadata: 79 | name: pi-python 80 | spec: 81 | pyFiles: "local:///opt/spark/examples/src/main/python/pi.py" 82 | sparkConf: 83 | spark.dynamicAllocation.enabled: "true" 84 | spark.dynamicAllocation.shuffleTracking.enabled: "true" 85 | spark.dynamicAllocation.maxExecutors: "3" 86 | spark.log.structuredLogging.enabled: "false" 87 | spark.kubernetes.authenticate.driver.serviceAccountName: "spark" 88 | spark.kubernetes.container.image: "apache/spark:4.0.0-preview2" 89 | applicationTolerations: 90 | resourceRetainPolicy: OnFailure 91 | runtimeVersions: 92 | sparkVersion: "4.0.0-preview2" 93 | EOF 94 | ``` 95 | 96 | Once the container image downloads and the container starts, you can watch the logs with: 97 | 98 | ```bash 99 | kubectl logs -f pi-python-0-driver 100 | ``` 101 | 102 | To delete 
the app, use: 103 | 104 | ```bash 105 | kubectl delete sparkapp/pi-python 106 | ``` 107 | 108 | ## Let's try S3 tables 109 | 110 | Per the docs on [S3 tables with Apache Spark](https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-integrating-open-source-spark.html). 111 | 112 | - Create a table bucket in a region near me 113 | 114 | ```bash 115 | aws s3tables create-table-bucket \ 116 | --region us-west-2 \ 117 | --name dacort-berg 118 | ``` 119 | 120 | ```json 121 | { 122 | "arn": "arn:aws:s3tables:us-west-2::bucket/dacort-berg" 123 | } 124 | ``` 125 | 126 | - Spin up a Spark SQL shell 127 | 128 | First, we create a persistent pod we can exec into. 129 | 130 | ```bash 131 | kubectl apply -f spark-shell-pod.yaml 132 | ``` 133 | 134 | Then start up Spark SQL 135 | 136 | _note that we assume you already have your AWS CLI setup and can export credentials_ 137 | 138 | ```bash 139 | export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) 140 | export TABLE_BUCKET_NAME=dacort-berg 141 | 142 | kubectl exec -it spark-shell-pod -- /bin/bash -c "export AWS_REGION=us-west-2;$(aws configure export-credentials --format env | tr '\n' ';') \ 143 | /opt/spark/bin/spark-sql \ 144 | --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,software.amazon.awssdk:s3tables:2.29.26,software.amazon.awssdk:s3:2.29.26,software.amazon.awssdk:sts:2.29.26,software.amazon.awssdk:kms:2.29.26,software.amazon.awssdk:glue:2.29.26,software.amazon.awssdk:dynamodb:2.29.26,software.amazon.s3tables:s3-tables-catalog-for-iceberg-runtime:0.1.3 \ 145 | --conf spark.jars.ivy=/opt/spark/work-dir/.ivy2 \ 146 | --conf spark.sql.catalog.s3tablesbucket=org.apache.iceberg.spark.SparkCatalog \ 147 | --conf spark.sql.catalog.s3tablesbucket.catalog-impl=software.amazon.s3tables.iceberg.S3TablesCatalog \ 148 | --conf spark.sql.catalog.s3tablesbucket.warehouse=arn:aws:s3tables:us-west-2:${AWS_ACCOUNT_ID}:bucket/${TABLE_BUCKET_NAME} \ 149 | --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions" 150 | ``` 151 | 152 | - Create a new S3 Table 153 | 154 | ```sql 155 | CREATE NAMESPACE IF NOT EXISTS s3tablesbucket.default; 156 | 157 | CREATE TABLE IF NOT EXISTS s3tablesbucket.default.`demo` 158 | ( id INT, name STRING, value INT ) 159 | USING iceberg; 160 | 161 | INSERT INTO s3tablesbucket.default.demo VALUES (1, 'damon', 33), (2, 'dad', 34); 162 | 163 | SELECT * FROM s3tablesbucket.default.demo; 164 | ``` 165 | 166 | ``` 167 | spark-sql (default)> SELECT * FROM s3tablesbucket.default.demo; 168 | 1 damon 33 169 | 2 dad 34 170 | Time taken: 3.455 seconds, Fetched 2 row(s) 171 | ``` 172 | 173 | ## Reading S3 Tables with other query engines (DuckDB) 174 | 175 | The neat(?) thing about S3 Tables is that it's just Iceberg behind the scenes. 
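
As a side note, you can also poke at the table bucket from Python. A minimal boto3 sketch — assuming a recent boto3 release that ships the `s3tables` client, and the same account, region, and bucket name used above:

```python
import boto3

# Mirror the table bucket created earlier; adjust the region/name if yours differ.
account_id = boto3.client("sts").get_caller_identity()["Account"]
bucket_arn = f"arn:aws:s3tables:us-west-2:{account_id}:bucket/dacort-berg"

s3tables = boto3.client("s3tables", region_name="us-west-2")

# List the namespaces and tables that the Spark SQL session created.
for ns in s3tables.list_namespaces(tableBucketARN=bucket_arn).get("namespaces", []):
    print("namespace:", ns.get("namespace"))

for tbl in s3tables.list_tables(tableBucketARN=bucket_arn, namespace="default").get("tables", []):
    print("table:", tbl.get("name"))
```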
176 | 177 | So if you use `aws s3tables get-table`, you can find the metadata location: 178 | 179 | ```bash 180 | aws s3tables get-table --table-bucket-arn arn:aws:s3tables:us-west-2:${AWS_ACCOUNT_ID}:bucket/${TABLE_BUCKET_NAME} --namespace default --name demo 181 | ``` 182 | 183 | ```json 184 | { 185 | "name": "demo", 186 | "type": "customer", 187 | "tableARN": "arn:aws:s3tables:us-west-2::bucket/dacort-berg/table/e0b502d9-5de1-46a4-8633-412b78401be3", 188 | "namespace": [ 189 | "default" 190 | ], 191 | "versionToken": "", 192 | "metadataLocation": "s3://502d9-5de1-46a4---table-s3/metadata/00001-e76a727d-8a4d-4883-95c7-dea809f2a4cb.metadata.json", 193 | "warehouseLocation": "s3://502d9-5de1-46a4---table-s3", 194 | "createdAt": "2024-12-18T21:20:39.347151+00:00", 195 | "createdBy": "", 196 | "modifiedAt": "2024-12-18T21:25:22.327612+00:00", 197 | "ownerAccountId": "", 198 | "format": "ICEBERG" 199 | } 200 | ``` 201 | 202 | If you take the `metadataLocation` from the response and use that in DuckDB (with the `iceberg`, `httpfs` extensions installed and an [S3 secret created](https://duckdb.org/docs/configuration/secrets_manager.html#temporary-secrets))...it seems to work! 203 | 204 | _using duckdb v1.1.3 19864453f7_ 205 | 206 | ```sql 207 | -- Install/load Iceberg and https extensions 208 | -- Set up S3 access to my specific region 209 | INSTALL iceberg; 210 | LOAD iceberg; 211 | INSTALL https; 212 | LOAD https; 213 | CREATE SECRET secret1 ( 214 | TYPE S3, 215 | PROVIDER CREDENTIAL_CHAIN, 216 | ENDPOINT 's3.us-west-2.amazonaws.com' 217 | ); 218 | 219 | -- Query using the metadat file from above! 220 | SELECT count(*) 221 | FROM iceberg_scan('s3://502d9-5de1-46a4---table-s3/metadata/00001-e76a727d-8a4d-4883-95c7-dea809f2a4cb.metadata.json'); 222 | ``` 223 | 224 | ``` 225 | ┌──────────────┐ 226 | │ count_star() │ 227 | │ int64 │ 228 | ├──────────────┤ 229 | │ 2 │ 230 | └──────────────┘ 231 | ``` 232 | 233 | ```sql 234 | SELECT * FROM iceberg_scan('s3://502d9-5de1-46a4---table-s3/metadata/00001-e76a727d-8a4d-4883-95c7-dea809f2a4cb.metadata.json'); 235 | ``` 236 | 237 | ``` 238 | ┌───────┬─────────┬───────┐ 239 | │ id │ name │ value │ 240 | │ int32 │ varchar │ int32 │ 241 | ├───────┼─────────┼───────┤ 242 | │ 1 │ damon │ 33 │ 243 | │ 2 │ dad │ 34 │ 244 | └───────┴─────────┴───────┘ 245 | ``` 246 | 247 | 🤯 248 | 249 | - What happens if I insert more data? 250 | 251 | The `metadataLocation` gets updated and we can, of course, query each different version of the table. 
🎉 252 | -------------------------------------------------------------------------------- /emr/eks/README.md: -------------------------------------------------------------------------------- 1 | # EMR on EKS Demo 2 | 3 | ## Spin up an EMR on EC2 Cluster 4 | 5 | Typically with EMR you figure out the following: 6 | 7 | - The EMR version you want to run 8 | - The VPC/Subnet to run your cluster in 9 | - The SSH keypair to use 10 | - The S3 bucket to send your cluster logs to 11 | - The different applications to run on the cluster 12 | - The instance types, count, and configuration 13 | 14 | The command below spins up a cluster in `us-east-1` with Spark on EMR 5.32.0 15 | 16 | ```shell 17 | VERSION=emr-5.32.0 18 | KEYPAIR= 19 | SUBNET_ID= 20 | LOG_BUCKET= 21 | 22 | aws emr create-cluster --applications Name=Spark Name=Zeppelin \ 23 | --ec2-attributes '{"KeyName":"'${KEYPAIR}'","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"'${SUBNET_ID}'"}' \ 24 | --service-role EMR_DefaultRole \ 25 | --enable-debugging \ 26 | --release-label ${VERSION} \ 27 | --log-uri "s3n://${LOG_BUCKET}/elasticmapreduce/" \ 28 | --name 'dacort-spark' \ 29 | --instance-groups '[{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","InstanceType":"m5.xlarge","Name":"Master Instance Group"},{"InstanceCount":2,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"CORE","InstanceType":"m5.xlarge","Name":"Core Instance Group"}]' \ 30 | --configurations '[{"Classification":"spark","Properties":{}}]' \ 31 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 32 | --region us-east-1 33 | ``` 34 | 35 | So...let's take a quick look at some data and see what it takes to run analysis on EMR on EKS. 👇 36 | 37 | ## Explore a dataset 38 | 39 | Idea: "What was the max wind speed in Seattle in 2021?" or "Average hourly rainfall when there was rain" 40 | 41 | We can use the [NOAA Integrated Surface Database](https://registry.opendata.aws/noaa-isd/) hourly data in CSV format. 42 | 43 | ```shell 44 | aws s3 ls s3://noaa-global-hourly-pds/2021/ --no-sign-request 45 | ``` 46 | 47 | See the code in [`windy_city.py`](./windy_city.py) for a full example. 48 | 49 |
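
If you'd rather submit it programmatically, you can also run `windy_city.py` on the cluster above as an EMR step from Python — a rough boto3 sketch (the cluster ID and bucket here are placeholders you'd swap for your own):

```python
import boto3

emr = boto3.client("emr", region_name="us-east-1")

# Placeholders: the cluster ID returned by create-cluster and your own code bucket.
cluster_id = "j-XXXXXXXXXXXXX"
script_path = "s3://YOUR_BUCKET/code/pyspark/windy_city.py"

# Run the script as a step via command-runner, the same way the console does.
response = emr.add_job_flow_steps(
    JobFlowId=cluster_id,
    Steps=[
        {
            "Name": "windy-city",
            "ActionOnFailure": "CONTINUE",
            "HadoopJarStep": {
                "Jar": "command-runner.jar",
                "Args": ["spark-submit", "--deploy-mode", "cluster", script_path],
            },
        }
    ],
)
print(response["StepIds"])
```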
50 | Here is exploratory code we can use in a pyspark shell once we've SSH'ed into our EMR cluster. 51 | 52 | ```python 53 | from pyspark.sql import functions as F 54 | from pyspark.sql.types import DoubleType 55 | 56 | # Reads the 2021 ISD data 57 | df = spark.read.format("csv") \ 58 | .option("header", "true")\ 59 | .option("inferSchema", "true") \ 60 | .load("s3://noaa-global-hourly-pds/2021/") 61 | 62 | # Shows a sample row from Seattle 63 | df \ 64 | .withColumn('LATITUDE', df.LATITUDE.cast(DoubleType())) \ 65 | .withColumn('LONGITUDE', df.LONGITUDE.cast(DoubleType())) \ 66 | .filter(df.LATITUDE >= 47.41).filter(df.LATITUDE <= 47.49) \ 67 | .filter(df.LONGITUDE >= -122.48).filter(df.LONGITUDE <= -122.16) \ 68 | .take(1) 69 | 70 | 71 | # See if we can split the wind speed properly 72 | seadf = df \ 73 | .withColumn('LATITUDE', df.LATITUDE.cast(DoubleType())) \ 74 | .withColumn('LONGITUDE', df.LONGITUDE.cast(DoubleType())) \ 75 | .filter(df.LATITUDE >= 47.41).filter(df.LATITUDE <= 47.49) \ 76 | .filter(df.LONGITUDE >= -122.48).filter(df.LONGITUDE <= -122.16) 77 | 78 | seadf.select("DATE", "NAME", "WND") \ 79 | .withColumn("windSpeed", F.split(df.WND, ",")[3].cast(DoubleType())/10 ) \ 80 | .take(10) 81 | # [Row(DATE='2021-01-01T00:00:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='200,1,N,0046,1', windSpeed=4.6), Row(DATE='2021-01-01T00:17:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='200,5,N,0041,5', windSpeed=4.1), Row(DATE='2021-01-01T00:37:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='170,5,N,0031,5', windSpeed=3.1), Row(DATE='2021-01-01T00:53:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='190,5,N,0041,5', windSpeed=4.1), Row(DATE='2021-01-01T01:53:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='190,5,N,0051,5', windSpeed=5.1), Row(DATE='2021-01-01T02:39:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='180,5,N,0041,5', windSpeed=4.1), Row(DATE='2021-01-01T02:53:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='180,5,N,0041,5', windSpeed=4.1), Row(DATE='2021-01-01T03:32:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='190,5,N,0036,5', windSpeed=3.6), Row(DATE='2021-01-01T03:53:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='190,5,N,0041,5', windSpeed=4.1), Row(DATE='2021-01-01T04:49:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='180,5,N,0031,5', windSpeed=3.1)] 82 | 83 | # OK, now create our slim dataframe and get top wind speed per day 84 | wind_date_df = seadf.select("DATE", "NAME", "WND") \ 85 | .withColumn("windSpeed", F.split(df.WND, ",")[3].cast(DoubleType())/10 ) \ 86 | .withColumn("ymd", F.split(df.DATE, "T")[0]) \ 87 | .filter(seadf.windSpeed != 999.9) 88 | 89 | wind_date_df.groupBy("ymd") \ 90 | .agg({'windSpeed':'max'}) \ 91 | .orderBy("ymd") \ 92 | .show() 93 | ``` 94 | 95 |
96 | 97 | And the output... 98 | 99 | ``` 100 | >>> wind_date_df.groupBy("ymd").agg({'windSpeed':'max'}).orderBy("ymd").show() 101 | +----------+--------------+ 102 | | ymd|max(windSpeed)| 103 | +----------+--------------+ 104 | |2021-01-01| 9.3| 105 | |2021-01-02| 10.3| 106 | |2021-01-03| 10.3| 107 | |2021-01-04| 8.2| 108 | |2021-01-05| 9.8| 109 | |2021-01-06| 8.2| 110 | |2021-01-07| 4.6| 111 | |2021-01-08| 8.8| 112 | |2021-01-09| 6.2| 113 | |2021-01-10| 7.2| 114 | |2021-01-11| 10.3| 115 | |2021-01-12| 6.7| 116 | |2021-01-13| 13.9| 117 | +----------+--------------+ 118 | ``` 119 | 120 | 👏 121 | 122 | ## EMR on EKS 123 | 124 | ### EKS Setup 125 | 126 | First we need to have an EKS cluster already running with the EMR namespace configured. If you don't already have an EKS cluster running, you'll likely need Admin access to your account to get this all set up. 127 | 128 | You can follow the [EMR on EKS Getting started guide](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/getting-started.html). 129 | 130 | A couple notes: 131 | 132 | - When creating the [job execution role](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/creating-job-execution-role.html), select `Another AWS account` as the trusted entity and use your Account ID. 133 | - You will need to create a Node Group for Fargate profile for the namespace you created in EMR for the jobs to run. 134 | 135 | As an example for #2 above, I created an EKS Fargate-only cluster and had to run the following command to create the desired profile: 136 | 137 | ```shell 138 | eksctl create fargateprofile \ 139 | --cluster \ 140 | --name emr-profile \ 141 | --namespace 142 | ``` 143 | 144 | ## EMR Setup 145 | 146 | Now that you've got a running EKS cluster(!), configured your execution roles and created an EMR Virtual Cluster that's mapped to EKS 😅 go ahead and upload your code to S3 and run a job! 147 | 148 | ```shell 149 | S3_BUCKET= 150 | aws s3 cp windy_city.py s3://${S3_BUCKET}/code/pyspark/windy_city.py 151 | ``` 152 | 153 | Fill in your EMR on EKS Cluster ID and Execution role. I've configured this job to log to S3, but you can also use CloudFront as noted in [EMR EKS Job Parameters](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/emr-eks-jobs-CLI.html#emr-eks-jobs-parameters). Just make sure your execution role has the right permissions. 154 | 155 | ```shell 156 | S3_BUCKET= 157 | EMR_EKS_CLUSTER_ID= 158 | EMR_EKS_EXECUTION_ROLE=arn:aws:iam:::role/ 159 | 160 | aws emr-containers start-job-run \ 161 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 162 | --name dacort-windycity \ 163 | --execution-role-arn ${EMR_EKS_EXECUTION_ROLE} \ 164 | --release-label emr-5.32.0-latest \ 165 | --job-driver '{ 166 | "sparkSubmitJobDriver": { 167 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/windy_city.py", 168 | "sparkSubmitParameters": "--conf spark.executor.instances=2 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1" 169 | } 170 | }' \ 171 | --configuration-overrides '{ 172 | "monitoringConfiguration": { 173 | "s3MonitoringConfiguration": { "logUri": "s3://'${S3_BUCKET}'/emr-eks-logs/windy_city" } 174 | } 175 | }' 176 | ``` 177 | 178 | That command should spin up your Spark job on EKS and write the output to S3! 🙌 179 | 180 | You should see the top wind speed per day in your Spark driver `stdout.gz` file on S3 after the job finishes. 181 | 182 | - Want to run it on EMR 6.2.0? 
Change `--release-label` to `emr-6.2.0-latest` 183 | - Want to run the windy city script for San Francisco? Add `"entryPointArguments": ["-123.18,37.64,-122.28,37.93"]` to the `sparkSubmitJobDriver` JSON 184 | 185 | ## Cleanup 186 | 187 | 1. Make sure you don't have any managed endpoints for EMR Studio 188 | 189 | ```shell 190 | # List existing managed endpoints for your virtual cluster 191 | aws emr-containers list-managed-endpoints \ 192 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 193 | --output text \ 194 | --query 'endpoints[*].[id,state,name]' 195 | 196 | # Delete them if you do 197 | for endpoint_id in $(aws emr-containers list-managed-endpoints --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} --output text --query 'endpoints[*].[id]'); do 198 | echo "Deleting ${endpoint_id}" 199 | aws emr-containers delete-managed-endpoint \ 200 | --id ${endpoint_id} \ 201 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} 202 | done 203 | ``` 204 | 205 | 2. Delete the virtual cluster 206 | 207 | ```shell 208 | aws emr-containers delete-virtual-cluster --id ${EMR_EKS_CLUSTER_ID} 209 | ``` 210 | -------------------------------------------------------------------------------- /athena/Athena_Exploration.md: -------------------------------------------------------------------------------- 1 | # Athena exploration 2 | 3 | Walkthrough of different Athena functionality using the [Amazon Customer Reviews](https://registry.opendata.aws/amazon-reviews/) open dataset. 4 | 5 | This dataset provides both TSV and Parquet versions of over 130 million customer reviews since 1995. 6 | 7 | ## Table Definitions 8 | 9 | Create a table in Athena over the TSV dataset. 10 | 11 | ```sql 12 | CREATE EXTERNAL TABLE amazon_reviews_tsv( 13 | marketplace string, 14 | customer_id string, 15 | review_id string, 16 | product_id string, 17 | product_parent string, 18 | product_title string, 19 | product_category string, 20 | star_rating int, 21 | helpful_votes int, 22 | total_votes int, 23 | vine string, 24 | verified_purchase string, 25 | review_headline string, 26 | review_body string, 27 | review_date date) 28 | ROW FORMAT DELIMITED 29 | FIELDS TERMINATED BY '\t' 30 | ESCAPED BY '\\' 31 | LINES TERMINATED BY '\n' 32 | LOCATION 33 | 's3://amazon-reviews-pds/tsv/' 34 | TBLPROPERTIES ("skip.header.line.count"="1"); 35 | ``` 36 | 37 | Run a simple query to preview the data. 38 | 39 | ```sql 40 | SELECT * FROM "amazon_reviews_tsv" 41 | WHERE marketplace = 'US' 42 | limit 10; 43 | ``` 44 | 45 | Create a table over the Parquet dataset. It's partitioned by `product_category`. 46 | 47 | Run a couple aggregation queries to see the amount of data scanned is minimal (kb-mb) compared to the full size of the data on S3 (~50 GiB). 
48 | 49 | ```sql 50 | CREATE EXTERNAL TABLE `amazon_reviews_parquet`( 51 | `marketplace` string, 52 | `customer_id` string, 53 | `review_id` string, 54 | `product_id` string, 55 | `product_parent` string, 56 | `product_title` string, 57 | `star_rating` int, 58 | `helpful_votes` int, 59 | `total_votes` int, 60 | `vine` string, 61 | `verified_purchase` string, 62 | `review_headline` string, 63 | `review_body` string, 64 | `review_date` bigint, 65 | `year` int) 66 | PARTITIONED BY ( `product_category` string ) 67 | STORED AS PARQUET 68 | LOCATION 's3://amazon-reviews-pds/parquet'; 69 | ``` 70 | 71 | ```sql 72 | SELECT product_id, product_title, count(*) as num_reviews, avg(star_rating) as avg_stars 73 | FROM amazon_reviews_parquet where product_category='Toys' 74 | GROUP BY 1, 2 75 | ORDER BY 3 DESC 76 | limit 100; 77 | 78 | SELECT COUNT(*) FROM amazon_reviews_parquet where product_category='Toys' AND year >= 2012; 79 | 80 | SELECT * FROM amazon_reviews_parquet 81 | WHERE product_category='Toys' 82 | LIMIT 100; 83 | ``` 84 | 85 | ## CTAS Example 86 | 87 | Re-partition by marketplace and year to allow for efficient queries. (This takes ~5 minutes to run.) 88 | 89 | By default, the results are stored in a bucket automatically created in your account for Athena output: `aws-athena-query-results-<account-id>-<region>`. 90 | 91 | See [Athena CTAS examples](https://docs.aws.amazon.com/athena/latest/ug/ctas-examples.html) for how to specify a specific S3 location with the `external_location` parameter. 92 | 93 | ```sql 94 | CREATE TABLE amazon_reviews_by_marketplace 95 | WITH ( 96 | format='PARQUET', 97 | partitioned_by = ARRAY['marketplace', 'year'] 98 | ) AS 99 | SELECT customer_id, review_id, product_id, product_parent, product_title, product_category, star_rating, helpful_votes, total_votes, verified_purchase, review_headline, review_body, review_date, 100 | marketplace, 101 | year(review_date) as year 102 | FROM amazon_reviews_tsv 103 | WHERE "$path" LIKE '%tsv.gz' 104 | -- Run time: 4 minutes 43 seconds, Data scanned: 32.24 GB 105 | ``` 106 | 107 | Compare the query times and data scanned with and without a partition filter: 108 | 109 | ```sql 110 | SELECT product_id, COUNT(*) FROM amazon_reviews_by_marketplace 111 | GROUP BY 1 ORDER BY 2 DESC LIMIT 10 112 | -- Run time: 6.7 seconds, Data scanned: 790.26 MB 113 | ``` 114 | 115 | vs. 116 | 117 | ```sql 118 | SELECT product_id, COUNT(*) FROM amazon_reviews_by_marketplace 119 | WHERE marketplace='US' AND year = 2013 120 | GROUP BY 1 ORDER BY 2 DESC LIMIT 10 121 | -- Run time: 3.87 seconds, Data scanned: 145 MB 122 | ``` 123 | 124 | ## Optimization Techniques 125 | 126 | ### Sorting by a specific field 127 | 128 | If you frequently query data based on an ID and expect a limited amount of data to be returned, you can sort the original dataset by that ID and write it out to a limited number of objects on S3. Athena will use the [parquet metadata](#parquet-metadata) to determine if it should read the underlying data. 129 | 130 | One option is to use CTAS to create a derivative dataset and sort on the specific fields. This can take a while to run due to the sorting and the execution plan. 
131 | 132 | ```sql 133 | CREATE TABLE amazon_reviews_sorted 134 | WITH ( 135 | format='PARQUET' 136 | ) AS 137 | SELECT product_id, customer_id, product_parent, star_rating, helpful_votes, total_votes, verified_purchase, marketplace, product_category, review_date 138 | FROM amazon_reviews_by_marketplace 139 | ORDER BY product_id ASC 140 | -- Run time: 18 minutes 13 seconds, Data scanned: 2.44 GB 141 | ``` 142 | 143 | Note that this only outputs seven heavily-skewed files, but all rows for a specific `product_id` should be in one file. 144 | 145 | ```sql 146 | SELECT "$path", product_id, COUNT(*) FROM amazon_reviews_sorted 147 | WHERE product_id = 'B00E8KLWB4' 148 | GROUP BY 1, 2 ORDER BY 1 ASC 149 | -- Run time: 4.18 seconds, Data scanned: 81.9 MB 150 | ``` 151 | 152 | vs. 153 | 154 | ```sql 155 | CREATE TABLE amazon_reviews_unsorted 156 | WITH ( 157 | format='PARQUET', 158 | bucketed_by = ARRAY['review_id'], 159 | bucket_count = 30 160 | ) AS 161 | SELECT review_id, product_id, customer_id, product_parent, star_rating, helpful_votes, total_votes, verified_purchase, marketplace, product_category, review_date 162 | FROM amazon_reviews_by_marketplace 163 | -- Run time: 40.04 seconds, Data scanned: 2.44 GB 164 | ``` 165 | 166 | Here we use the bucketing functionality to distribute the data across 30 buckets. We bucket on `review_id` as it is high cardinality and will give an even distribution. 167 | 168 | ```sql 169 | SELECT "$path", product_id, COUNT(*) FROM amazon_reviews_unsorted 170 | WHERE product_id = 'B00E8KLWB4' 171 | GROUP BY 1, 2 ORDER BY 1 ASC 172 | -- Run time: 4.39 seconds, Data scanned: 834.36 MB 173 | ``` 174 | 175 | Initially I tried to bucket by `product_id`, but that still puts each `product_id` in a single file. 176 | The data isn't sorted across files, though, because the bucket is chosen by hashing the field. 177 | Instead, we bucket on `review_id`, which effectively randomizes the `product_id`s. 178 | 179 | It's tough to control sorting and the number of output files using CTAS, but Spark can do this well. Using something like EMR Notebooks or AWS Glue, we read the original dataset and use `repartitionByRange` to sort `product_id` into 30 different output files. 180 | 181 | ```python 182 | (spark.read.parquet("s3://amazon-reviews-pds/parquet/") 183 | .select("marketplace", "customer_id", "review_id", "product_id", "product_parent", "star_rating") 184 | .repartitionByRange(30, "product_id") 185 | .write.mode("overwrite") 186 | .parquet("s3://<your-bucket>/amazon-reviews-sorted-subset/", compression="gzip") 187 | ) 188 | ``` 189 | 190 | And then back in Athena... 191 | 192 | ```sql 193 | CREATE EXTERNAL TABLE amazon_reviews_spark_sorted ( 194 | marketplace string, 195 | customer_id string, 196 | review_id string, 197 | product_id string, 198 | product_parent string, 199 | star_rating int 200 | ) 201 | STORED AS PARQUET 202 | LOCATION 's3://<your-bucket>/amazon-reviews-sorted-subset/' 203 | ``` 204 | 205 | ```sql 206 | SELECT "$path", COUNT(*) FROM amazon_reviews_spark_sorted 207 | GROUP BY 1 ORDER BY 1 ASC 208 | -- About 5-6M records per file 209 | ``` 210 | 211 | ## Parquet metadata 212 | 213 | [parquet-tools](https://github.com/apache/parquet-mr/tree/master/parquet-tools) is a fantastic utility for analyzing the content of Parquet files. 
214 | 215 | If you're on a mac, it's available via homebrew: `brew install parquet-tools` 216 | 217 | Download a sample Parquet file and print out the metadata: 218 | 219 | ```shell 220 | curl -O https://s3.amazonaws.com/amazon-reviews-pds/parquet/product_category=Watches/part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet 221 | parquet-tools meta part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet 222 | ``` 223 | 224 | You'll see a bunch of detailed information about the file including number of rows, minimum and maximum values, and the schema. 225 | 226 | _Some rows left out for brevity_ 227 | 228 | ``` 229 | file: file:/private/tmp/part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet 230 | creator: parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c) 231 | 232 | file schema: spark_schema 233 | -------------------------------------------------------------------------------- 234 | product_title: OPTIONAL BINARY O:UTF8 R:0 D:1 235 | star_rating: OPTIONAL INT32 R:0 D:1 236 | helpful_votes: OPTIONAL INT32 R:0 D:1 237 | review_date: OPTIONAL INT32 O:DATE R:0 D:1 238 | year: OPTIONAL INT32 R:0 D:1 239 | 240 | row group 1: RC:97608 TS:39755962 OFFSET:4 241 | -------------------------------------------------------------------------------- 242 | product_title: BINARY SNAPPY DO:0 FPO:3243045 SZ:3170609/6450771/2.03 VC:97608 ENC:PLAIN,PLAIN_DICTIONARY,RLE,BIT_PACKED ST:[no stats for this column] 243 | star_rating: INT32 SNAPPY DO:0 FPO:6413654 SZ:36016/36709/1.02 VC:97608 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED ST:[min: 1, max: 5, num_nulls: 0] 244 | helpful_votes: INT32 SNAPPY DO:0 FPO:6449670 SZ:48348/93031/1.92 VC:97608 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED ST:[min: 0, max: 753, num_nulls: 0] 245 | review_date: INT32 SNAPPY DO:0 FPO:23689606 SZ:35674/146381/4.10 VC:97608 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED ST:[min: 2001-04-05, max: 2015-08-31, num_nulls: 0] 246 | year: INT32 SNAPPY DO:0 FPO:23725280 SZ:2004/37279/18.60 VC:97608 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED ST:[min: 2001, max: 2015, num_nulls: 0] 247 | ``` 248 | 249 | 250 | More detailed information on the different fields for each column is [here](https://github.com/apache/parquet-mr/tree/master/parquet-tools#meta-legend). 251 | 252 | Note that current versions of the tool may not show string statistics by default as they could be incorrect: [PARQUET-686](https://issues.apache.org/jira/browse/PARQUET-686). 
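253 | 
254 | If you'd rather inspect the same footer programmatically (for example, from a notebook), the `pyarrow` library exposes the row group and column statistics as well. This is a minimal sketch; it assumes you've installed `pyarrow` (`pip install pyarrow`) and downloaded the sample file above into the current directory.
255 | 
256 | ```python
257 | import pyarrow.parquet as pq
258 | 
259 | # Open the sample file downloaded with curl above
260 | pf = pq.ParquetFile("part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet")
261 | 
262 | # File-level metadata: row count, number of row groups, writer version
263 | print(pf.metadata)
264 | 
265 | # Per-column statistics (min/max/null count) for the first row group
266 | rg = pf.metadata.row_group(0)
267 | for i in range(rg.num_columns):
268 |     col = rg.column(i)
269 |     print(col.path_in_schema, col.statistics)
270 | ```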
-------------------------------------------------------------------------------- /cdk/big-data-stack/stacks/eks.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import ( 2 | core as cdk, 3 | aws_eks as eks, 4 | aws_ec2 as ec2, 5 | aws_iam as iam, 6 | ) 7 | 8 | from plugins.eks.autoscaler import ClusterAutoscaler 9 | 10 | 11 | class EKSStack(cdk.Stack): 12 | cluster_name: str 13 | cluster: eks.Cluster 14 | 15 | def __init__( 16 | self, 17 | scope: cdk.Construct, 18 | construct_id: str, 19 | vpc: ec2.IVpc, 20 | instance_type: str = "m5.xlarge", 21 | **kwargs, 22 | ) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | self.cluster_name = "data-team" 26 | 27 | # EKS cluster 28 | self.cluster = eks.Cluster( 29 | self, 30 | "EksForSpark", 31 | cluster_name=self.cluster_name, 32 | version=eks.KubernetesVersion.V1_19, 33 | default_capacity=0, 34 | endpoint_access=eks.EndpointAccess.PUBLIC_AND_PRIVATE, 35 | vpc=vpc, 36 | vpc_subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE)], 37 | ) 38 | 39 | # Default node group 40 | ng = self.cluster.add_nodegroup_capacity( 41 | "base-node-group", 42 | instance_types=[ec2.InstanceType(instance_type)], 43 | min_size=1, 44 | max_size=20, 45 | disk_size=50, 46 | ) 47 | 48 | # Add a Spot node group as well for additional capacity 49 | spot_ng = self.cluster.add_nodegroup_capacity( 50 | "spot-node-group", 51 | capacity_type=eks.CapacityType.SPOT, 52 | instance_types=[ec2.InstanceType(it) for it in ['c4.2xlarge', 'c5.2xlarge', 'c5d.2xlarge', 'c5a.2xlarge', 'c5n.2xlarge']], 53 | min_size=1, 54 | max_size=20, 55 | ) 56 | 57 | # Add a Graviton node group as well for additional capacity 58 | # graviton_ng = self.cluster.add_nodegroup_capacity( 59 | # "graviton-node-group", 60 | # instance_types=[ec2.InstanceType(it) for it in ['m6g.2xlarge', 'c6g.2xlarge', 'r6g.2xlarge']], 61 | # min_size=1, 62 | # max_size=20, 63 | # ) 64 | 65 | self.add_admin_role_to_cluster() 66 | self.add_cluster_admin() 67 | 68 | # Cluster AutoScaling FTW 69 | ClusterAutoscaler( 70 | self.cluster_name, self, self.cluster, [ng, spot_ng] 71 | ).enable_autoscaling() 72 | 73 | # We like to use the Kubernetes Dashboard 74 | self.enable_dashboard() 75 | 76 | # Install Airflow as well 77 | # TODO: Make this optional 78 | # self.enable_airflow() 79 | 80 | # This is emr-specific, but we have to do it here to prevent circular dependencies 81 | self.map_iam_to_eks() 82 | 83 | def add_admin_role_to_cluster(self) -> None: 84 | admin_role_name = self.node.try_get_context("eks_admin_role_name") 85 | if admin_role_name is None: 86 | return 87 | 88 | account_id = cdk.Aws.ACCOUNT_ID 89 | admin_role = iam.Role.from_role_arn( 90 | self, "admin_role", f"arn:aws:iam::{account_id}:role/{admin_role_name}" 91 | ) 92 | self.cluster.aws_auth.add_masters_role(admin_role) 93 | 94 | def add_cluster_admin(self, name="eks-admin"): 95 | # Add admin privileges so we can sign in to the dashboard as the service account 96 | sa = self.cluster.add_manifest( 97 | "eks-admin-sa", 98 | { 99 | "apiVersion": "v1", 100 | "kind": "ServiceAccount", 101 | "metadata": { 102 | "name": name, 103 | "namespace": "kube-system", 104 | }, 105 | }, 106 | ) 107 | binding = self.cluster.add_manifest( 108 | "eks-admin-rbac", 109 | { 110 | "apiVersion": "rbac.authorization.k8s.io/v1beta1", 111 | "kind": "ClusterRoleBinding", 112 | "metadata": {"name": name}, 113 | "roleRef": { 114 | "apiGroup": "rbac.authorization.k8s.io", 115 | "kind": "ClusterRole", 116 | "name": 
"cluster-admin", 117 | }, 118 | "subjects": [ 119 | { 120 | "kind": "ServiceAccount", 121 | "name": name, 122 | "namespace": "kube-system", 123 | } 124 | ], 125 | }, 126 | ) 127 | 128 | def enable_dashboard(self, namespace: str = "kubernetes-dashboard"): 129 | chart = self.cluster.add_helm_chart( 130 | "kubernetes-dashboard", 131 | namespace=namespace, 132 | chart="kubernetes-dashboard", 133 | repository="https://kubernetes.github.io/dashboard/", 134 | values={ 135 | "fullnameOverride": "kubernetes-dashboard", # This must be set to acccess the UI via `kubectl proxy` 136 | "extraArgs": ["--token-ttl=0"], 137 | }, 138 | ) 139 | 140 | def map_iam_to_eks(self): 141 | service_role_name = f"arn:aws:iam::{cdk.Aws.ACCOUNT_ID}:role/AWSServiceRoleForAmazonEMRContainers" 142 | emrsvcrole = iam.Role.from_role_arn( 143 | self, "EmrSvcRole", service_role_name, mutable=False 144 | ) 145 | self.cluster.aws_auth.add_role_mapping( 146 | emrsvcrole, groups=[], username="emr-containers" 147 | ) 148 | 149 | def add_emr_containers_for_airflow(self) -> eks.ServiceAccount: 150 | sa = self.cluster.add_service_account( 151 | "AirflowServiceAccount", namespace="airflow" 152 | ) 153 | 154 | sa.add_to_principal_policy( 155 | iam.PolicyStatement( 156 | actions=[ 157 | "emr-containers:StartJobRun", 158 | "emr-containers:ListJobRuns", 159 | "emr-containers:DescribeJobRun", 160 | "emr-containers:CancelJobRun", 161 | ], 162 | resources=["*"], 163 | ) 164 | ) 165 | 166 | return sa 167 | 168 | def enable_airflow(self, namespace: str = "airflow"): 169 | # While `add_helm_chart` will create the namespace for us if it doesn't exist, 170 | # we have to create it here because we need to create a service role for emr-containers. 171 | ns = self.cluster.add_manifest( 172 | "airflow-namespace", 173 | {"apiVersion": "v1", "kind": "Namespace", "metadata": {"name": namespace}}, 174 | ) 175 | # This is specific to emr-containers and Airflow so we can run EMR on EKS jobs 176 | service_role = self.add_emr_containers_for_airflow() 177 | service_role.node.add_dependency(ns) 178 | 179 | volume = self.cluster.add_manifest("multiaz-volume", self.gp2_multiazvolume()) 180 | chart = self.cluster.add_helm_chart( 181 | "airflow", 182 | namespace=namespace, 183 | chart="airflow", 184 | repository="https://airflow-helm.github.io/charts", 185 | version="8.0.5", 186 | values={ 187 | "airflow": { 188 | "config": { 189 | "AIRFLOW__LOGGING__REMOTE_LOGGING": "False", 190 | }, 191 | "executor": "KubernetesExecutor", 192 | "image": { 193 | "repository": "ghcr.io/dacort/airflow-emr-eks", 194 | "tag": "latest", 195 | "pullPolicy": "Always", 196 | }, 197 | "extraEnv": [ 198 | { 199 | "name": "AIRFLOW__CORE__FERNET_KEY", 200 | "valueFrom": { 201 | "secretKeyRef": { 202 | "name": "airflow-fernet-key", 203 | "key": "value", 204 | } 205 | }, 206 | }, 207 | { 208 | "name": "AWS_DEFAULT_REGION", 209 | "value": cdk.Aws.REGION, 210 | }, 211 | ], 212 | }, 213 | "web": {"resources": {"limits": {"cpu": "1", "memory": "1Gi"}}}, 214 | "workers": {"enabled": False}, 215 | "flower": {"enabled": False}, 216 | "redis": {"enabled": False}, 217 | "dags": { 218 | "gitSync": { 219 | "enabled": True, 220 | "repo": "https://github.com/dacort/airflow-example-dags.git", 221 | "branch": "main", 222 | "resources": {"requests": {"cpu": "50m", "memory": "64Mi"}}, 223 | } 224 | }, 225 | "postgresql": {"persistence": {"storageClass": "multiazvolume"}}, 226 | "serviceAccount": { 227 | "create": False, 228 | "name": service_role.service_account_name, 229 | "annotations": { 230 | 
"eks.amazonaws.com/role-arn": service_role.role.role_arn 231 | }, 232 | }, 233 | }, 234 | ) 235 | chart.node.add_dependency(ns) 236 | chart.node.add_dependency(volume) 237 | 238 | # Display the command necessarty to port-forward the Airflow Web UI 239 | airflow_forward_cmd = f'kubectl port-forward --namespace {namespace} $(kubectl get pods --namespace {namespace} -l "component=web,app=airflow" -o jsonpath="{{.items[0].metadata.name}}") 8080:8080' 240 | cdk.CfnOutput(self, "AirflowLoginCommand", value=airflow_forward_cmd) 241 | 242 | def gp2_multiazvolume(self): 243 | return { 244 | "kind": "StorageClass", 245 | "apiVersion": "storage.k8s.io/v1", 246 | "metadata": {"name": "multiazvolume"}, 247 | "provisioner": "kubernetes.io/aws-ebs", 248 | "parameters": {"type": "gp2", "iopsPerGB": "10", "fsType": "ext4"}, 249 | "volumeBindingMode": "WaitForFirstConsumer", 250 | } 251 | 252 | 253 | # Helpful references 254 | # https://betterprogramming.pub/how-to-organize-your-aws-cdk-project-f1c463aa966e 255 | # https://github.com/aftouh/cdk-template 256 | # 257 | # https://faun.pub/spawning-an-autoscaling-eks-cluster-52977aa8b467 -------------------------------------------------------------------------------- /emr/eks/videos/pod_templates/README.md: -------------------------------------------------------------------------------- 1 | # EMR on EKS Pod Templates 2 | 3 | ## Demos 4 | - Running Spark jobs with Dynamic Resource Allocation (DRA) 5 | - Using pod templates to optimize job cost (Spot and Fargate) 6 | - Using pod templates to run sidecar containers 7 | 8 | ## Step 1 - Running a simple Spark job 9 | 10 | - Using a local Spark example, submit a job with static executor config 11 | 12 | ```shell 13 | aws emr-containers start-job-run \ 14 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 15 | --name dacort-pi-static \ 16 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 17 | --release-label emr-5.33.0-latest \ 18 | --job-driver '{ 19 | "sparkSubmitJobDriver": { 20 | "entryPoint": "local:///usr/lib/spark/examples/src/main/python/pi.py", 21 | "sparkSubmitParameters": "--conf spark.executor.instances=20 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1" 22 | } 23 | }' \ 24 | --configuration-overrides '{ 25 | "monitoringConfiguration": { 26 | "s3MonitoringConfiguration": { 27 | "logUri": "s3://'${S3_BUCKET}'/emr-eks-logs/pi/" 28 | } 29 | } 30 | }' 31 | ``` 32 | 33 | ## Step 2 - Dynamic Resource Allocation 34 | 35 | _Notes_ 36 | - Only works with Spark 3.x / EMR 6 37 | 38 | To enable DRA, we'll add an `applicationConfiguration` section to the `--configuration-overrides` parameter to specifically enable it and define the executor behavior. 
This is the section we'll add: 39 | 40 | ```json 41 | { 42 | "classification": "spark-defaults", 43 | "properties": { 44 | "spark.dynamicAllocation.enabled": "true", 45 | "spark.dynamicAllocation.shuffleTracking.enabled":"true", 46 | "spark.dynamicAllocation.minExecutors":"5", 47 | "spark.dynamicAllocation.maxExecutors":"100", 48 | "spark.dynamicAllocation.initialExecutors":"10" 49 | } 50 | } 51 | ``` 52 | 53 | ```shell 54 | aws emr-containers start-job-run \ 55 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 56 | --name dacort-windycity-dra \ 57 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 58 | --release-label emr-6.2.0-latest \ 59 | --job-driver '{ 60 | "sparkSubmitJobDriver": { 61 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/windy_city.py", 62 | "sparkSubmitParameters": "--conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1" 63 | } 64 | }' \ 65 | --configuration-overrides '{ 66 | "applicationConfiguration": [ 67 | { 68 | "classification": "spark-defaults", 69 | "properties": { 70 | "spark.dynamicAllocation.enabled": "true", 71 | "spark.dynamicAllocation.shuffleTracking.enabled":"true", 72 | "spark.dynamicAllocation.minExecutors":"2", 73 | "spark.dynamicAllocation.maxExecutors":"100", 74 | "spark.dynamicAllocation.initialExecutors":"5" 75 | } 76 | } 77 | ] 78 | }' 79 | ``` 80 | 81 | ## Step 3 - Limit executors to Spot 82 | 83 | For this step, we make use of [pod templates](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/pod-templates.html). Pod templates allow you to specify how or where the containers will run in your cluster. 84 | 85 | 86 | 1. Create pod templates for Spot and On-Demand 87 | 88 | - `spot_pod_template.yaml` 89 | ```yaml 90 | apiVersion: v1 91 | kind: Pod 92 | spec: 93 | nodeSelector: 94 | eks.amazonaws.com/capacityType: SPOT 95 | ``` 96 | 97 | - `ondemand_pod_template.yaml` 98 | ```yaml 99 | apiVersion: v1 100 | kind: Pod 101 | spec: 102 | nodeSelector: 103 | eks.amazonaws.com/capacityType: ON_DEMAND 104 | ``` 105 | 106 | 2. Upload those templates to S3 107 | 108 | ```shell 109 | aws s3 cp spot_pod_template.yaml s3:///artifacts/pod_templates/ 110 | aws s3 cp ondemand_pod_template.yaml s3:///artifacts/pod_templates/ 111 | ``` 112 | 113 | 3. Run your Spark job with the pod template specified for the executor! 114 | 115 | We also specify on-demand for the driver because we want to ensure the driver persists for the entire length of the job. 116 | 117 | ```shell 118 | aws emr-containers start-job-run \ 119 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 120 | --name dacort-windycity-spot \ 121 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 122 | --release-label emr-5.33.0-latest \ 123 | --job-driver '{ 124 | "sparkSubmitJobDriver": { 125 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/windy_city.py", 126 | "sparkSubmitParameters": "--conf spark.executor.instances=5 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1 --conf spark.kubernetes.driver.podTemplateFile=s3://'${S3_BUCKET}'/artifacts/pod_templates/ondemand_pod_template.yaml --conf spark.kubernetes.executor.podTemplateFile=s3://'${S3_BUCKET}'/artifacts/pod_templates/spot_pod_template.yaml" 127 | } 128 | }' 129 | ``` 130 | 131 | And now verify the driver and executors are running where we expect! 
132 | 133 | ```shell 134 | kubectl describe node \ 135 | $(kubectl get pods -n emr-jobs --selector=spark-role=driver -o jsonpath='{.items[*].spec.nodeName}' ) \ 136 | | grep -i capacityType 137 | # eks.amazonaws.com/capacityType=ON_DEMAND 138 | ``` 139 | 140 | 141 | ```shell 142 | kubectl describe node \ 143 | $(kubectl get pods -n emr-jobs --selector=spark-role=executor -o jsonpath='{.items[*].spec.nodeName}' ) \ 144 | | grep -i capacityType 145 | # eks.amazonaws.com/capacityType=SPOT 146 | # eks.amazonaws.com/capacityType=SPOT 147 | # eks.amazonaws.com/capacityType=SPOT 148 | # eks.amazonaws.com/capacityType=SPOT 149 | # eks.amazonaws.com/capacityType=SPOT 150 | ``` 151 | 152 | ## Step 3.5 - Pod Templates: Limit executors to Fargate 153 | 154 | - Ensure you have the Fargate role 155 | 156 | ```shell 157 | aws iam create-role --role-name AmazonEKSFargatePodExecutionRole --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"eks-fargate-pods.amazonaws.com"},"Action":"sts:AssumeRole"}]}' 158 | aws iam attach-role-policy --role-name AmazonEKSFargatePodExecutionRole --policy-arn arn:aws:iam::aws:policy/AmazonEKSFargatePodExecutionRolePolicy 159 | ``` 160 | 161 | - Create a new Fargate profile in the appropriate namespace 162 | 163 | We need to add a specific selector, otherwise *all* jobs in the `emr-jobs` namespace will run on Fargate. 164 | 165 | ```shell 166 | aws eks create-fargate-profile \ 167 | --cluster-name data-team \ 168 | --fargate-profile-name spark-fargate-executors \ 169 | --selectors 'namespace=emr-jobs,labels={eks.amazonaws.com/capacityType=FARGATE}' \ 170 | --pod-execution-role-arn ${FARGATE_EXECUTION_ARN} 171 | ``` 172 | 173 | - Assign Spark executor label 174 | 175 | In order to properly run our executors _only_ on Fargate, we'll add a label to the spark-submit parameters: 176 | 177 | ``` 178 | --conf spark.kubernetes.executor.label.eks.amazonaws.com/capacityType=FARGATE 179 | ``` 180 | 181 | - Run our job! 182 | 183 | ```shell 184 | aws emr-containers start-job-run \ 185 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 186 | --name dacort-windycity-fargate \ 187 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 188 | --release-label emr-5.33.0-latest \ 189 | --job-driver '{ 190 | "sparkSubmitJobDriver": { 191 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/windy_city.py", 192 | "sparkSubmitParameters": "--conf spark.executor.instances=5 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1 --conf spark.kubernetes.driver.podTemplateFile=s3://'${S3_BUCKET}'/artifacts/pod_templates/ondemand_pod_template.yaml --conf spark.kubernetes.executor.label.eks.amazonaws.com/capacityType=FARGATE" 193 | } 194 | }' 195 | ``` 196 | 197 | ## Step 4 - Running a sidecar container 198 | 199 | Sidecar containers can be used to add additional functionality alongside your Spark drivers and/or executors. A common use-case is to forward logs to a centralized logging provider. 200 | 201 | For this example, we'll use a custom [Spark Tweeter](https://github.com/dacort/spark-tweeter) container that tweets out when a job starts and stops. 
202 | 203 | - First, we need to add our Twitter credentials as a Kubernetes secret 204 | 205 | ```shell 206 | kubectl create secret generic -n emr-jobs twitter-creds \ 207 | --from-literal=consumer_key=${CONSUMER_KEY} \ 208 | --from-literal=consumer_secret=${CONSUMER_SECRET} \ 209 | --from-literal=access_token=${ACCESS_TOKEN} \ 210 | --from-literal=access_token_secret=${ACCESS_TOKEN_SECRET} 211 | 212 | ``` 213 | 214 | - Then we create a sidecar yaml file and upload that to S3 215 | 216 | ```yaml 217 | # tweetcar.yaml 218 | apiVersion: v1 219 | kind: Pod 220 | spec: 221 | containers: 222 | - name: side-car-tweeter 223 | image: ghcr.io/dacort/spark-tweeter:latest 224 | env: 225 | - name: CONSUMER_KEY 226 | valueFrom: 227 | secretKeyRef: 228 | name: twitter-creds 229 | key: consumer_key 230 | - name: CONSUMER_SECRET 231 | valueFrom: 232 | secretKeyRef: 233 | name: twitter-creds 234 | key: consumer_secret 235 | - name: ACCESS_TOKEN 236 | valueFrom: 237 | secretKeyRef: 238 | name: twitter-creds 239 | key: access_token 240 | - name: ACCESS_TOKEN_SECRET 241 | valueFrom: 242 | secretKeyRef: 243 | name: twitter-creds 244 | key: access_token_secret 245 | - name: EMR_COMMS_MOUNT 246 | value: /var/log/fluentd 247 | resources: {} 248 | volumeMounts: 249 | - name: emr-container-application-log-dir 250 | mountPath: /var/log/spark/user 251 | - name: emr-container-communicate 252 | mountPath: /var/log/fluentd 253 | ``` 254 | 255 | ```shell 256 | aws s3 cp tweetcar.yaml s3:///artifacts/pod_templates/tweetcar.yaml 257 | ``` 258 | 259 | - Now run your Spark job with the sidecar mounted on the Driver 260 | 261 | ```shell 262 | aws emr-containers start-job-run \ 263 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 264 | --name dacort-tweeter \ 265 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 266 | --release-label emr-5.33.0-latest \ 267 | --job-driver '{ 268 | "sparkSubmitJobDriver": { 269 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/windy_city.py", 270 | "sparkSubmitParameters": "--conf spark.executor.instances=20 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1 --conf spark.kubernetes.driver.podTemplateFile=s3://'${S3_BUCKET}'/artifacts/pod_templates/tweetcar.yaml" 271 | } 272 | }' 273 | ``` -------------------------------------------------------------------------------- /emr/studio/notebooks/emr-studio-athena.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "90cd3a9c", 6 | "metadata": {}, 7 | "source": [ 8 | "# Querying Athena from EMR Studio\n", 9 | "\n", 10 | "1. 
Install the [pyathena](https://github.com/laughingman7743/PyAthena/) library.\n", 11 | "\n", 12 | "_If this is the first time installing the library on the cluster, you'll need to restart your Kernel._" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "id": "75c9710d", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Defaulting to user installation because normal site-packages is not writeable\n", 26 | "Requirement already satisfied: pyathena in /home/emr-notebook/.local/lib/python3.7/site-packages (2.3.0)\n", 27 | "Requirement already satisfied: botocore>=1.5.52 in /home/emr-notebook/.local/lib/python3.7/site-packages (from pyathena) (1.21.54)\n", 28 | "Requirement already satisfied: boto3>=1.4.4 in /home/emr-notebook/.local/lib/python3.7/site-packages (from pyathena) (1.18.54)\n", 29 | "Requirement already satisfied: tenacity>=4.1.0 in /mnt/notebook-env/lib/python3.7/site-packages (from pyathena) (8.0.0)\n", 30 | "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/emr-notebook/.local/lib/python3.7/site-packages (from boto3>=1.4.4->pyathena) (0.10.0)\n", 31 | "Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/emr-notebook/.local/lib/python3.7/site-packages (from boto3>=1.4.4->pyathena) (0.5.0)\n", 32 | "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /mnt/notebook-env/lib/python3.7/site-packages (from botocore>=1.5.52->pyathena) (1.26.6)\n", 33 | "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /mnt/notebook-env/lib/python3.7/site-packages (from botocore>=1.5.52->pyathena) (2.8.2)\n", 34 | "Requirement already satisfied: six>=1.5 in /mnt/notebook-env/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore>=1.5.52->pyathena) (1.15.0)\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "!/emr/notebook-env/bin/pip install pyathena" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "2fc5de2d", 45 | "metadata": {}, 46 | "source": [ 47 | "2. Connect to Athena and query!" 
48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "id": "46ac6b1c", 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "('20210923175137', '20210923175137_2_2', '258241be-2f7c-4bad-99f7-f65a2a1bc032', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_2-155-196_20210923175137.parquet', 0.7004607721204296, 0.9911546157239198, 'driver-204', 0.14602911545960373, 0.430070203188727, 89.89115113672493, 'rider-204', 1631914555102, '258241be-2f7c-4bad-99f7-f65a2a1bc032', 'americas-brazil-sao_paulo')\n", 61 | "('20210923175441', '20210923175441_1_1', '130b2891-b8e8-45cb-86c5-9ba17bf67e9c', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_1-189-243_20210923175441.parquet', 0.4789745387904072, 0.14781856144057215, 'driver-022', 0.10509642405359532, 0.07682825311613706, 30.429177017810616, 'rider-022', 1632072813847, '130b2891-b8e8-45cb-86c5-9ba17bf67e9c', 'americas-brazil-sao_paulo')\n", 62 | "('20210923175137', '20210923175137_2_7', 'e8c22378-8cf4-480a-bbcb-2d727e72bbe9', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_2-155-196_20210923175137.parquet', 0.011933855867048981, 0.16258177392270334, 'driver-204', 0.9635314017496284, 0.6451866124948767, 69.09535493302582, 'rider-204', 1631855582705, 'e8c22378-8cf4-480a-bbcb-2d727e72bbe9', 'americas-brazil-sao_paulo')\n", 63 | "('20210923175441', '20210923175441_1_3', '853e3137-1404-476a-9706-bc6862a0b0c1', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_1-189-243_20210923175441.parquet', 0.7863419548389983, 0.09622419308555896, 'driver-022', 0.4461749593405654, 0.8047885824928995, 89.58715055088675, 'rider-022', 1632239701039, '853e3137-1404-476a-9706-bc6862a0b0c1', 'americas-brazil-sao_paulo')\n", 64 | "('20210923175441', '20210923175441_1_4', '8077ddc9-b591-41a7-ac05-dd418485e567', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_1-189-243_20210923175441.parquet', 0.4557704708784922, 0.19566457205271448, 'driver-022', 0.04316839215753254, 0.49689215534636744, 98.07565038092373, 'rider-022', 1632278101830, '8077ddc9-b591-41a7-ac05-dd418485e567', 'americas-brazil-sao_paulo')\n", 65 | "('20210923175441', '20210923175441_1_6', 'c306ec62-fd26-447e-b071-8e928c3601cc', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_1-189-243_20210923175441.parquet', 0.2357445292459669, 0.20216983131886535, 'driver-022', 0.7985867991529113, 0.6627849637400387, 45.92862425905784, 'rider-022', 1632070802354, 'c306ec62-fd26-447e-b071-8e928c3601cc', 'americas-brazil-sao_paulo')\n", 66 | "('20210923175441', '20210923175441_1_8', 'aff4ca90-b07d-4598-a643-510db2396646', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_1-189-243_20210923175441.parquet', 0.5714612197743915, 0.8660402414940012, 'driver-022', 0.42204161309648225, 0.7826771915638148, 56.15793735580833, 'rider-022', 1632216837224, 'aff4ca90-b07d-4598-a643-510db2396646', 'americas-brazil-sao_paulo')\n", 67 | "('20210923175137', '20210923175137_1_1', '623fb531-d340-4093-9814-ee4e2d903446', 'asia-india-chennai', 'e587c8cb-7287-429f-9206-6bbd89e091f3-0_1-155-195_20210923175137.parquet', 0.5715455359501902, 0.8806745328835989, 'driver-204', 0.4957985534250222, 0.17496376187467866, 96.4500716154594, 'rider-204', 1632184230245, '623fb531-d340-4093-9814-ee4e2d903446', 'asia-india-chennai')\n", 68 | "('20210923175137', '20210923175137_1_6', '9775d219-1cfe-4534-acc0-9740f00e7516', 
'asia-india-chennai', 'e587c8cb-7287-429f-9206-6bbd89e091f3-0_1-155-195_20210923175137.parquet', 0.8529563766655098, 0.18417876489592633, 'driver-204', 0.5762896261799536, 0.9686943663190588, 51.299844734112945, 'rider-204', 1632148019323, '9775d219-1cfe-4534-acc0-9740f00e7516', 'asia-india-chennai')\n", 69 | "('20210923175441', '20210923175441_0_2', 'e9185a6b-85ec-4cd2-8280-0c876a8a0b2d', 'americas-united_states-san_francisco', 'fd89688b-1001-4686-a31a-17eb31e1c86d-0_0-189-242_20210923175441.parquet', 0.29715343023010277, 0.32560695311233856, 'driver-022', 0.5819606196949516, 0.49547619941585996, 7.078489064840843, 'rider-022', 1632032168497, 'e9185a6b-85ec-4cd2-8280-0c876a8a0b2d', 'americas-united_states-san_francisco')\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "from pyathena import connect\n", 75 | "\n", 76 | "cursor = connect(\n", 77 | " s3_staging_dir=\"s3:///queries/\",\n", 78 | " region_name=\"us-west-2\"\n", 79 | ").cursor()\n", 80 | "\n", 81 | "cursor.execute(\"SELECT * FROM hudi_trips LIMIT 10\")\n", 82 | "\n", 83 | "for row in cursor:\n", 84 | " print(row)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "253f33a8", 90 | "metadata": {}, 91 | "source": [ 92 | "You can also query from different databases if you want." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "id": "ebf0e256", 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "[('dea_pain_pills_parquet',), ('dea_pain_pills_tsv',), ('dea_pain_pills_tsv_gz',)]\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "cursor.execute(\"SHOW TABLES FROM intro_data_proc\")\n", 111 | "print(cursor.fetchall())" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "cc88eb45", 117 | "metadata": {}, 118 | "source": [ 119 | "## Using SparkSQL\n", 120 | "\n", 121 | "If your EMR Cluster is using the [Glue Data Catalog](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-glue.html), you can also query your data on S3 with SparkSQL - just switch to the PySpark kernel and use the `%%sql` magic." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 1, 127 | "id": "eb4c3c9a", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "Starting Spark application\n" 135 | ] 136 | }, 137 | { 138 | "data": { 139 | "text/html": [ 140 | "\n", 141 | "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
6application_1632519122277_0008pysparkidleLinkLink
" 143 | ], 144 | "text/plain": [ 145 | "" 146 | ] 147 | }, 148 | "metadata": {}, 149 | "output_type": "display_data" 150 | }, 151 | { 152 | "data": { 153 | "application/vnd.jupyter.widget-view+json": { 154 | "model_id": "", 155 | "version_major": 2, 156 | "version_minor": 0 157 | }, 158 | "text/plain": [ 159 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 160 | ] 161 | }, 162 | "metadata": {}, 163 | "output_type": "display_data" 164 | }, 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "SparkSession available as 'spark'.\n" 170 | ] 171 | }, 172 | { 173 | "data": { 174 | "application/vnd.jupyter.widget-view+json": { 175 | "model_id": "", 176 | "version_major": 2, 177 | "version_minor": 0 178 | }, 179 | "text/plain": [ 180 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 181 | ] 182 | }, 183 | "metadata": {}, 184 | "output_type": "display_data" 185 | }, 186 | { 187 | "data": { 188 | "application/vnd.jupyter.widget-view+json": { 189 | "model_id": "5381587a33234d798bb3b4fcca2fd8e3", 190 | "version_major": 2, 191 | "version_minor": 0 192 | }, 193 | "text/plain": [ 194 | "VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…" 195 | ] 196 | }, 197 | "metadata": {}, 198 | "output_type": "display_data" 199 | }, 200 | { 201 | "data": { 202 | "application/vnd.jupyter.widget-view+json": { 203 | "model_id": "e053cb95ec0b496fab9a4007dd7c7001", 204 | "version_major": 2, 205 | "version_minor": 0 206 | }, 207 | "text/plain": [ 208 | "Output()" 209 | ] 210 | }, 211 | "metadata": {}, 212 | "output_type": "display_data" 213 | } 214 | ], 215 | "source": [ 216 | "%%sql\n", 217 | "\n", 218 | "SHOW TABLES from intro_data_proc" 219 | ] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "PySpark", 225 | "language": "", 226 | "name": "pysparkkernel" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "python", 231 | "version": 3 232 | }, 233 | "mimetype": "text/x-python", 234 | "name": "pyspark", 235 | "pygments_lexer": "python3" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 5 240 | } 241 | --------------------------------------------------------------------------------