├── emr ├── airflow │ ├── mwaa_stack │ │ ├── mwaa │ │ │ ├── __init__.py │ │ │ └── mwaa_stack.py │ │ ├── requirements.txt │ │ ├── assets │ │ │ ├── requirements.txt │ │ │ └── dags │ │ │ │ ├── example_emr_job.py │ │ │ │ └── example_emr_eks_job.py │ │ ├── .gitignore │ │ ├── app.py │ │ ├── source.bat │ │ ├── cdk.json │ │ ├── setup.py │ │ └── README.md │ └── README.md ├── eks │ ├── java │ │ └── emr-eks-job-runner │ │ │ ├── .gitignore │ │ │ ├── Makefile │ │ │ ├── src │ │ │ └── main │ │ │ │ └── java │ │ │ │ └── aws │ │ │ │ └── example │ │ │ │ └── emrcontainers │ │ │ │ ├── ExampleConstants.java │ │ │ │ └── StartJobRunExample.java │ │ │ ├── run_example.sh │ │ │ ├── README.md │ │ │ └── pom.xml │ ├── videos │ │ ├── pod_templates │ │ │ ├── spot_pod_template.yaml │ │ │ ├── fargate_pod_template.yaml │ │ │ ├── ondemand_pod_template.yaml │ │ │ └── README.md │ │ ├── README.md │ │ ├── custom_images │ │ │ ├── requirements.txt │ │ │ ├── test │ │ │ │ └── gen_plot.py │ │ │ ├── Dockerfile │ │ │ ├── README.md │ │ │ └── generate_aqi_map.py │ │ └── external_metastores │ │ │ ├── hivejdbc.py │ │ │ ├── gluespark.py │ │ │ └── README.md │ ├── windy_city.py │ └── README.md ├── julia │ ├── julia_notebook.png │ ├── ijulia-kernel.sh │ ├── julia-1.6.1.sh │ └── README.md └── studio │ ├── README.md │ └── notebooks │ └── emr-studio-athena.ipynb ├── cdk ├── big-data-stack │ ├── requirements.txt │ ├── .vscode │ │ └── settings.json │ ├── .gitignore │ ├── stacks │ │ ├── vpc.py │ │ ├── utils.py │ │ ├── rds.py │ │ ├── emr.py │ │ └── eks.py │ ├── source.bat │ ├── cdk.json │ ├── app.py │ ├── setup.py │ └── README.md ├── emr-serverless-job-run │ ├── tests │ │ ├── __init__.py │ │ └── unit │ │ │ ├── __init__.py │ │ │ └── test_emr_serverless_job_run_stack.py │ ├── emr_serverless_job_run │ │ ├── __init__.py │ │ └── emr_serverless_job_run_stack.py │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── .gitignore │ ├── source.bat │ ├── app.py │ ├── cdk.json │ └── README.md └── emr-serverless-vpc-to-vpc │ ├── tests │ ├── __init__.py │ └── unit │ │ ├── __init__.py │ │ └── test_emr_serverless_vpc_to_vpc_stack.py │ ├── emr_serverless_vpc_to_vpc │ ├── __init__.py │ └── emr_serverless_vpc_to_vpc_stack.py │ ├── requirements-dev.txt │ ├── requirements.txt │ ├── diagram.png │ ├── .gitignore │ ├── source.bat │ ├── pg_connect.py │ ├── app.py │ ├── cdk.json │ └── README.md ├── .gitignore ├── spark └── local-k8s │ ├── kind-config.yaml │ ├── spark-shell-pod.yaml │ └── README.md ├── README.md ├── reInvent_2018 └── EMR │ ├── Makefile │ ├── Demo_Links.md │ ├── assets │ ├── scripts │ │ ├── spark_converter.py │ │ └── hive_converter.sql │ ├── notebook_code.md │ └── cloudformation │ │ ├── Spark_Cluster_Versions │ │ ├── v0_Initial_Revision.cf.yml │ │ ├── v1_Security_Settings.cf.yml │ │ ├── v2_Updated_Parameters.cf.yml │ │ ├── v3_Cluster_Size.cf.yml │ │ ├── v4_Auto_Terminate.cf.yml │ │ └── v5_SparkUI.cf.yml │ │ ├── Presto_Cluster.cf.yml │ │ └── Spark_Cluster.cf.yml │ └── create_sc_entries.sh ├── LICENSE └── athena └── Athena_Exploration.md /emr/airflow/mwaa_stack/mwaa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/big-data-stack/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 
2 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/requirements.txt: -------------------------------------------------------------------------------- 1 | -e . 2 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/emr_serverless_job_run/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.5 2 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/emr_serverless_vpc_to_vpc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/requirements-dev.txt: -------------------------------------------------------------------------------- 1 | pytest==6.2.5 2 | -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/.gitignore: -------------------------------------------------------------------------------- 1 | .idea/ 2 | *.iml 3 | target/ 4 | 5 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.43.1 2 | constructs>=10.0.0,<11.0.0 3 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/requirements.txt: -------------------------------------------------------------------------------- 1 | aws-cdk-lib==2.43.1 2 | constructs>=10.0.0,<11.0.0 3 | -------------------------------------------------------------------------------- /emr/julia/julia_notebook.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dacort/demo-code/HEAD/emr/julia/julia_notebook.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python artifacts 2 | venv/ 3 | 4 | # Notebook checkpoints 5 | .ipynb_chekpoints/ 6 | 7 | # Sekrets 8 | .env 9 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/diagram.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/dacort/demo-code/HEAD/cdk/emr-serverless-vpc-to-vpc/diagram.png -------------------------------------------------------------------------------- /cdk/big-data-stack/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "python.formatting.provider": "black", 3 | "python.pythonPath": ".venv/bin/python" 4 | } -------------------------------------------------------------------------------- /emr/eks/videos/pod_templates/spot_pod_template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | spec: 4 | nodeSelector: 5 | eks.amazonaws.com/capacityType: SPOT 6 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/assets/requirements.txt: -------------------------------------------------------------------------------- 1 | emr-containers @ https://github.com/dacort/emr-eks-airflow2-plugin/archive/main.zip 2 | apache-airflow[amazon]==2.0.2 3 | -------------------------------------------------------------------------------- /emr/eks/videos/pod_templates/fargate_pod_template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | spec: 4 | nodeSelector: 5 | eks.amazonaws.com/capacityType: FARGATE 6 | -------------------------------------------------------------------------------- /emr/eks/videos/pod_templates/ondemand_pod_template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | spec: 4 | nodeSelector: 5 | eks.amazonaws.com/capacityType: ON_DEMAND 6 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .venv 6 | *.egg-info 7 | 8 | # CDK asset staging directory 9 | .cdk.staging 10 | cdk.out 11 | -------------------------------------------------------------------------------- /cdk/big-data-stack/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .env 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .venv 6 | *.egg-info 7 | 8 | # CDK asset staging directory 9 | .cdk.staging 10 | cdk.out 11 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/.gitignore: -------------------------------------------------------------------------------- 1 | *.swp 2 | package-lock.json 3 | __pycache__ 4 | .pytest_cache 5 | .env 6 | .venv 7 | *.egg-info 8 | 9 | # CDK asset staging directory 10 | .cdk.staging 11 | cdk.out 12 | -------------------------------------------------------------------------------- /emr/julia/ijulia-kernel.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | # Install IJulia Kernel as the emr-notebook user 4 | sudo -u 
emr-notebook JUPYTER=/emr/notebook-env/bin/jupyter /usr/local/bin/julia -e 'using Pkg; Pkg.add(["IJulia"])' -------------------------------------------------------------------------------- /spark/local-k8s/kind-config.yaml: -------------------------------------------------------------------------------- 1 | # three node (two workers) cluster config 2 | kind: Cluster 3 | apiVersion: kind.x-k8s.io/v1alpha4 4 | nodes: 5 | - role: control-plane 6 | - role: worker 7 | - role: worker 8 | -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/Makefile: -------------------------------------------------------------------------------- 1 | all: target/aws-emr-eks-examples-1.0.jar 2 | 3 | target/aws-emr-eks-examples-1.0.jar: src/main/java/aws/example/emrcontainers/*.java 4 | mvn package 5 | 6 | clean: 7 | mvn clean -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | from aws_cdk import core as cdk 5 | 6 | from mwaa.mwaa_stack import MwaaStack 7 | 8 | 9 | app = cdk.App() 10 | MwaaStack(app, "MwaaStack") 11 | 12 | app.synth() 13 | -------------------------------------------------------------------------------- /emr/julia/julia-1.6.1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | curl -OL https://julialang-s3.julialang.org/bin/linux/x64/1.6/julia-1.6.1-linux-x86_64.tar.gz 3 | 4 | sudo mkdir -p /opt; sudo tar xf julia-1.6.1-linux-x86_64.tar.gz --directory /opt 5 | 6 | sudo ln -s /opt/julia-1.6.1/bin/julia /usr/local/bin/julia 7 | -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/src/main/java/aws/example/emrcontainers/ExampleConstants.java: -------------------------------------------------------------------------------- 1 | package aws.example.emrcontainers; 2 | 3 | public class ExampleConstants { 4 | public static final String EMR_RELEASE_LABEL = "emr-6.3.0-latest"; 5 | public static final long SLEEP_AMOUNT_IN_MS = 1000; 6 | } 7 | -------------------------------------------------------------------------------- /spark/local-k8s/spark-shell-pod.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Pod 3 | metadata: 4 | name: spark-shell-pod 5 | namespace: default 6 | labels: 7 | app: spark-shell 8 | spec: 9 | serviceAccount: spark 10 | containers: 11 | - name: spark-kubernetes-driver 12 | image: apache/spark:3.5.2 13 | command: ["/bin/bash"] 14 | args: ["-c", "trap : term INT; sleep infinity & wait"] -------------------------------------------------------------------------------- /cdk/big-data-stack/stacks/vpc.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import core as cdk, aws_ec2 as ec2 2 | 3 | 4 | class VPCStack(cdk.Stack): 5 | vpc: ec2.Vpc 6 | 7 | def __init__(self, scope: cdk.Construct, construct_id: str, **kwargs) -> None: 8 | super().__init__(scope, construct_id, **kwargs) 9 | 10 | # We create a simple VPC here 11 | self.vpc = ec2.Vpc(self, "EMRDemos", max_azs=3) # default is all AZs in region -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dacort's demo code 2 | 3 | Bits of code I use 
during demos 4 | 5 | - [AWS reInvent 2018](reInvent_2018/EMR) - Amazon EMR and AWS Service Catalog integration and Amazon EMR Notebook demo 6 | - [EMR on EKS](emr/eks) - Examples of using Amazon EMR on Amazon EKS, including an example notebook using [NOAA Integrated Surface Database](https://registry.opendata.aws/noaa-isd/) 7 | - [EMR Studio](emr/studio) - Example notebook for [EMR Studio demo](https://youtu.be/oVgyL5W9FPU) 8 | -------------------------------------------------------------------------------- /emr/eks/videos/README.md: -------------------------------------------------------------------------------- 1 | # A video series of EMR on EKS howto guides 2 | 3 | - [Why EMR on EKS](https://youtu.be/2UMz72NRZss) - Describes common usage scenarios for EMR on EKS 4 | - [Running Jobs](https://youtu.be/eEEqzFGqG_M) - Shows how to submit jobs to the EMR Containers API 5 | - [External Metastores](https://youtu.be/zBXK5GTVUKU) - Demo of connecting to an RDS MySQL Hive metastore and Glue Data Catalog 6 | - Related code is in [`./external_metastores/`](./external_metastores) -------------------------------------------------------------------------------- /emr/eks/videos/custom_images/requirements.txt: -------------------------------------------------------------------------------- 1 | appdirs==1.4.4 2 | black==21.5b2 3 | bokeh==2.3.2 4 | click==8.0.1 5 | Jinja2==3.0.1 6 | MarkupSafe==2.0.1 7 | mypy-extensions==0.4.3 8 | numpy==1.20.3 9 | packaging==20.9 10 | pathspec==0.8.1 11 | Pillow==8.3.2 12 | py4j==0.10.9 13 | pyparsing==2.4.7 14 | pyspark==3.1.3 15 | python-dateutil==2.8.1 16 | PyYAML==5.4.1 17 | regex==2021.4.4 18 | Shapely==1.7.1 19 | six==1.16.0 20 | toml==0.10.2 21 | tornado==6.1 22 | typing-extensions==3.10.0.0 23 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/Makefile: -------------------------------------------------------------------------------- 1 | RELEASE_BUCKET?=damons-reinvent-demo 2 | PREFIX?=reinvent/ 3 | AWS_PROFILE?=default 4 | 5 | deploy: 6 | @aws --profile $(AWS_PROFILE) s3 sync assets/ "s3://$(RELEASE_BUCKET)/$(PREFIX)" 7 | @echo "https://$(RELEASE_BUCKET).s3.amazonaws.com/$(PREFIX)cloudformation/Spark_Cluster.cf.yml" 8 | @echo "https://$(RELEASE_BUCKET).s3.amazonaws.com/$(PREFIX)EMR_Spark_Pipeline.cf.yml" 9 | @echo "https://$(RELEASE_BUCKET).s3.amazonaws.com/$(PREFIX)cloudformation/Presto_Cluster.cf.yml" 10 | 11 | -------------------------------------------------------------------------------- /cdk/big-data-stack/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 
8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/source.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | 3 | rem The sole purpose of this script is to make the command 4 | rem 5 | rem source .venv/bin/activate 6 | rem 7 | rem (which activates a Python virtualenv on Linux or Mac OS X) work on Windows. 8 | rem On Windows, this command just runs this batch file (the argument is ignored). 9 | rem 10 | rem Now we don't need to document a Windows command for activating a virtualenv. 11 | 12 | echo Executing .venv\Scripts\activate.bat for you 13 | .venv\Scripts\activate.bat 14 | -------------------------------------------------------------------------------- /cdk/big-data-stack/stacks/utils.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import core as cdk, aws_s3 as s3 2 | 3 | 4 | def get_or_create_bucket( 5 | stack: cdk.Stack, bucket_id: str, context_key: str = None 6 | ) -> s3.Bucket: 7 | if context_key is None or stack.node.try_get_context(context_key) is None: 8 | return s3.Bucket( 9 | stack, 10 | bucket_id, 11 | ) 12 | else: 13 | bucket_name = stack.node.try_get_context(context_key) 14 | return s3.Bucket.from_bucket_name(stack, bucket_id, bucket_name) -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/run_example.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [[ -z $* ]] ; then 3 | echo 'Supply the name of one of the example classes as an argument.' 4 | echo 'If there are arguments to the class, put them in quotes after the class name.' 5 | exit 1 6 | fi 7 | export CLASSPATH=target/emr-eks-examples-1.0.jar 8 | export className=$1 9 | echo "## Running $className..." 
10 | shift 11 | echo "## arguments: $@" 12 | mvn exec:java -Dexec.mainClass="aws.example.emrcontainers.$className" -Dexec.args="$@" -Dexec.cleanupDaemonThreads=false -------------------------------------------------------------------------------- /cdk/big-data-stack/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | "@aws-cdk/core:enableStackNameDuplicates": "true", 5 | "aws-cdk:enableDiffNoFail": "true", 6 | "@aws-cdk/core:stackRelativeExports": "true", 7 | "@aws-cdk/aws-ecr-assets:dockerIgnoreSupport": true, 8 | "@aws-cdk/aws-secretsmanager:parseOwnedSecretName": true, 9 | "@aws-cdk/aws-kms:defaultKeyPolicies": true, 10 | "@aws-cdk/aws-s3:grantWriteWithoutAcl": true, 11 | "@aws-cdk/aws-ecs-patterns:removeDefaultDesiredCount": true 12 | } 13 | } 14 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/pg_connect.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | PG_HOSTNAME = "ip-10-0-XXX-XXX.us-west-2.compute.internal" 4 | 5 | spark = SparkSession.builder.getOrCreate() 6 | 7 | df = ( 8 | spark.read.format("jdbc") 9 | .option( 10 | "url", f"jdbc:postgresql://{PG_HOSTNAME}:5432/postgres" 11 | ) 12 | .option("driver", "org.postgresql.Driver") 13 | .option("dbtable", "users") 14 | .option("user", "remote") 15 | .option("password", "remote") 16 | .load() 17 | ) 18 | 19 | df.show() 20 | print(df.count()) 21 | 22 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/tests/unit/test_emr_serverless_job_run_stack.py: -------------------------------------------------------------------------------- 1 | import aws_cdk as core 2 | import aws_cdk.assertions as assertions 3 | 4 | from emr_serverless_job_run.emr_serverless_job_run_stack import EmrServerlessJobRunStack 5 | 6 | # example tests. 
To run these tests, uncomment this file along with the example 7 | # resource in emr_serverless_job_run/emr_serverless_job_run_stack.py 8 | def test_sqs_queue_created(): 9 | app = core.App() 10 | stack = EmrServerlessJobRunStack(app, "emr-serverless-job-run") 11 | template = assertions.Template.from_stack(stack) 12 | 13 | # template.has_resource_properties("AWS::SQS::Queue", { 14 | # "VisibilityTimeout": 300 15 | # }) 16 | -------------------------------------------------------------------------------- /emr/eks/videos/external_metastores/hivejdbc.py: -------------------------------------------------------------------------------- 1 | from os.path import abspath 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import Row 4 | 5 | # warehouse_location points to the default location for managed databases and tables 6 | warehouse_location = abspath("spark-warehouse") 7 | spark = ( 8 | SparkSession.builder.appName("hive-demo") 9 | .config("spark.sql.warehouse.dir", warehouse_location) 10 | .enableHiveSupport() 11 | .getOrCreate() 12 | ) 13 | spark.sql("SHOW DATABASES").show() 14 | spark.sql("SELECT count(*) FROM rapid7_fdns_any").show() 15 | spark.sql("SELECT * FROM rapid7_fdns_any WHERE name LIKE '%.starlink.com' AND date = (SELECT MAX(date) from rapid7_fdns_any)").show() 16 | spark.stop() -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/tests/unit/test_emr_serverless_vpc_to_vpc_stack.py: -------------------------------------------------------------------------------- 1 | import aws_cdk as core 2 | import aws_cdk.assertions as assertions 3 | 4 | from emr_serverless_vpc_to_vpc.emr_serverless_vpc_to_vpc_stack import EmrServerlessVpcToVpcStack 5 | 6 | # example tests. To run these tests, uncomment this file along with the example 7 | # resource in emr_serverless_vpc_to_vpc/emr_serverless_vpc_to_vpc_stack.py 8 | def test_sqs_queue_created(): 9 | app = core.App() 10 | stack = EmrServerlessVpcToVpcStack(app, "emr-serverless-vpc-to-vpc") 11 | template = assertions.Template.from_stack(stack) 12 | 13 | # template.has_resource_properties("AWS::SQS::Queue", { 14 | # "VisibilityTimeout": 300 15 | # }) 16 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "context": { 4 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 5 | "@aws-cdk/core:enableStackNameDuplicates": "true", 6 | "aws-cdk:enableDiffNoFail": "true", 7 | "@aws-cdk/core:stackRelativeExports": "true", 8 | "@aws-cdk/aws-ecr-assets:dockerIgnoreSupport": true, 9 | "@aws-cdk/aws-secretsmanager:parseOwnedSecretName": true, 10 | "@aws-cdk/aws-kms:defaultKeyPolicies": true, 11 | "@aws-cdk/aws-s3:grantWriteWithoutAcl": true, 12 | "@aws-cdk/aws-ecs-patterns:removeDefaultDesiredCount": true, 13 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 14 | "@aws-cdk/aws-efs:defaultEncryptionAtRest": true, 15 | "@aws-cdk/aws-lambda:recognizeVersionProps": true 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /emr/eks/videos/external_metastores/gluespark.py: -------------------------------------------------------------------------------- 1 | from os.path import expanduser, join, abspath 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import Row 4 | 5 | # warehouse_location points to the default location for managed databases and 
tables 6 | warehouse_location = abspath("spark-warehouse") 7 | spark = ( 8 | SparkSession.builder.appName("glue-demo") 9 | .config("spark.sql.warehouse.dir", warehouse_location) 10 | .enableHiveSupport() 11 | .getOrCreate() 12 | ) 13 | 14 | spark.sql("SHOW DATABASES").show() 15 | spark.sql(""" 16 | SELECT id, snippet.title, 17 | MAX(CAST(statistics.viewcount AS integer)) AS max_views, 18 | MAX(CAST(statistics.likecount AS integer)) AS max_likes 19 | FROM damons_datalake.youtube 20 | GROUP BY 1, 2 21 | ORDER BY 3 DESC 22 | """).show(truncate=False) 23 | spark.stop() -------------------------------------------------------------------------------- /reInvent_2018/EMR/Demo_Links.md: -------------------------------------------------------------------------------- 1 | # Demo Links 2 | 3 | Replace `damons-reinvent-demo` with your own S3 bucket. 😃 4 | 5 | ## Presto Cluster 6 | 7 | This is an example template that can be used to create a new Presto Product in the Service Catalog. 8 | 9 | https://damons-reinvent-demo.s3.amazonaws.com/reinvent/cloudformation/Presto_Cluster.cf.yml 10 | 11 | ## Spark Converter 12 | 13 | Copy and paste the below into Job Parameters 14 | 15 | s3://damons-reinvent-demo/reinvent/scripts/spark_converter.py 16 | s3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz 17 | s3://damons-reinvent-demo/reinvent/spark/amazon_reviews/ 18 | 19 | ## Hive Converter 20 | 21 | Copy and paste the below into Job Parameters 22 | 23 | s3://damons-reinvent-demo/reinvent/scripts/hive_converter.sql 24 | -d INPUT=s3://amazon-reviews-pds/tsv/ 25 | -d OUTPUT=s3://damons-reinvent-demo/reinvent/hive/query_output/ -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import aws_cdk as cdk 5 | 6 | from emr_serverless_job_run.emr_serverless_job_run_stack import EmrServerlessJobRunStack 7 | 8 | 9 | app = cdk.App() 10 | EmrServerlessJobRunStack(app, "EmrServerlessJobRunStack", 11 | # If you don't specify 'env', this stack will be environment-agnostic. 12 | # Account/Region-dependent features and context lookups will not work, 13 | # but a single synthesized template can be deployed anywhere. 14 | 15 | # Uncomment the next line to specialize this stack for the AWS Account 16 | # and Region that are implied by the current CLI configuration. 17 | 18 | #env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), 19 | 20 | # Uncomment the next line if you know exactly what Account and Region you 21 | # want to deploy the stack to. */ 22 | 23 | #env=cdk.Environment(account='123456789012', region='us-east-1'), 24 | 25 | # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html 26 | ) 27 | 28 | app.synth() 29 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | 4 | import aws_cdk as cdk 5 | 6 | from emr_serverless_vpc_to_vpc.emr_serverless_vpc_to_vpc_stack import EmrServerlessVpcToVpcStack 7 | 8 | 9 | app = cdk.App() 10 | EmrServerlessVpcToVpcStack(app, "EmrServerlessVpcToVpcStack", 11 | # If you don't specify 'env', this stack will be environment-agnostic. 
12 | # Account/Region-dependent features and context lookups will not work, 13 | # but a single synthesized template can be deployed anywhere. 14 | 15 | # Uncomment the next line to specialize this stack for the AWS Account 16 | # and Region that are implied by the current CLI configuration. 17 | 18 | env=cdk.Environment(account=os.getenv('CDK_DEFAULT_ACCOUNT'), region=os.getenv('CDK_DEFAULT_REGION')), 19 | 20 | # Uncomment the next line if you know exactly what Account and Region you 21 | # want to deploy the stack to. */ 22 | 23 | # env=cdk.Environment(account='568026268536', region='us-west-2'), 24 | 25 | # For more information, see https://docs.aws.amazon.com/cdk/latest/guide/environments.html 26 | ) 27 | 28 | app.synth() 29 | -------------------------------------------------------------------------------- /cdk/big-data-stack/stacks/rds.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import core as cdk, aws_ec2 as ec2, aws_rds as rds 2 | 3 | 4 | class RDSStack(cdk.Stack): 5 | instance: rds.DatabaseInstance 6 | 7 | def __init__(self, scope: cdk.Construct, construct_id: str, vpc: ec2.IVpc, **kwargs) -> None: 8 | super().__init__(scope, construct_id, **kwargs) 9 | 10 | self.instance = rds.DatabaseInstance( 11 | self, 12 | construct_id, 13 | engine=rds.DatabaseInstanceEngine.mysql( 14 | version=rds.MysqlEngineVersion.VER_8_0 15 | ), 16 | vpc=vpc, 17 | database_name="metastore", 18 | removal_policy=cdk.RemovalPolicy.DESTROY, 19 | deletion_protection=False 20 | ) 21 | 22 | self.instance.connections.allow_from_any_ipv4(ec2.Port.tcp(3306), "Allow mysql from anywhere") 23 | 24 | # May be able to do this in EMR stack 25 | # .connections.security_groups[0].add_ingress_rule( 26 | # peer = ec2.Peer.ipv4(vpc.vpc_cidr_block), 27 | # connection = ec2.Port.tcp(80), 28 | # description="Allow http inbound from VPC" 29 | # ) 30 | -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/README.md: -------------------------------------------------------------------------------- 1 | # Amazon EMR on EKS examples 2 | 3 | ## Purpose 4 | 5 | Shows how to use the AWS SDK for Java with Amazon EMR on EKS. 6 | 7 | _Amazon EMR on EKS allows users to easily submit Spark jobs on Kubernetes._ 8 | 9 | ## Running the code 10 | 11 | ### Prerequisites 12 | 13 | - You must have an AWS account, and have your default credentials and AWS Region 14 | configured as described in the [AWS Tools and SDKs Shared Configuration and 15 | Credentials Reference Guide](https://docs.aws.amazon.com/credref/latest/refdocs/creds-config-files.html). 16 | 17 | ### Building 18 | 19 | - Run `mvn package` and then `./run_example.sh` with the class name of the code example. 
20 | 21 | ## Code examples 22 | 23 | - [Submit an EMR on EKS job](./src/main/java/aws/example/emrcontainers/StartJobRunExample.java) 24 | 25 | `./run_example.sh StartJobRunExample " "` 26 | 27 | ## Additional Information 28 | - [Amazon EMR on EKS documentatin](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/emr-eks.html) 29 | - [Amazon EMR on EKS Best Practices Guide](https://aws.github.io/aws-emr-containers-best-practices/) -------------------------------------------------------------------------------- /emr/eks/videos/custom_images/test/gen_plot.py: -------------------------------------------------------------------------------- 1 | from bokeh.plotting import figure 2 | from bokeh.io import output_file, show, export_png 3 | from bokeh.io.webdriver import create_chromium_webdriver 4 | 5 | import hashlib 6 | 7 | def sha256sum(filename): 8 | h = hashlib.sha256() 9 | b = bytearray(128*1024) 10 | mv = memoryview(b) 11 | with open(filename, 'rb', buffering=0) as f: 12 | for n in iter(lambda : f.readinto(mv), 0): 13 | h.update(mv[:n]) 14 | return h.hexdigest() 15 | 16 | def generate_plot(filename): 17 | p = figure(plot_width=400, plot_height=400) 18 | 19 | p.circle([1, 2, 3, 4, 5], [6, 7, 2, 4, 5], size=15, line_color="navy", 20 | fill_color="orange", fill_alpha=0.5) 21 | 22 | # --no-sandbox is required per https://stackoverflow.com/q/50642308 23 | # Maybe look into https://github.com/Zenika/alpine-chrome at some point 24 | driver = create_chromium_webdriver(['--no-sandbox']) 25 | export_png(p, filename=filename, webdriver=driver) 26 | # get_screenshot_as_png 27 | 28 | generate_plot("plot.png") 29 | hash = sha256sum("plot.png") 30 | assert hash == "ed2ffa2348560a7254753fe0ff70e811e3a0d1879c1609aeeb32efeb2feecc35" 31 | 32 | print("All good! 
🙌") -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | with open("README.md") as fp: 5 | long_description = fp.read() 6 | 7 | 8 | setuptools.setup( 9 | name="mwaa_stack", 10 | version="0.0.1", 11 | 12 | description="An empty CDK Python app", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | 16 | author="author", 17 | 18 | package_dir={"": "mwaa"}, 19 | packages=setuptools.find_packages(where="mwaa"), 20 | 21 | install_requires=[ 22 | "aws-cdk.core==1.110.0", 23 | "aws-cdk.aws_mwaa", 24 | "aws-cdk.aws_s3_deployment" 25 | ], 26 | 27 | python_requires=">=3.6", 28 | 29 | classifiers=[ 30 | "Development Status :: 4 - Beta", 31 | 32 | "Intended Audience :: Developers", 33 | 34 | "Programming Language :: JavaScript", 35 | "Programming Language :: Python :: 3 :: Only", 36 | "Programming Language :: Python :: 3.6", 37 | "Programming Language :: Python :: 3.7", 38 | "Programming Language :: Python :: 3.8", 39 | 40 | "Topic :: Software Development :: Code Generators", 41 | "Topic :: Utilities", 42 | 43 | "Typing :: Typed", 44 | ], 45 | ) 46 |
-------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/pom.xml: -------------------------------------------------------------------------------- 1 | <?xml version="1.0" encoding="UTF-8"?> 2 | <project xmlns="http://maven.apache.org/POM/4.0.0" 3 | xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 4 | xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> 5 | <modelVersion>4.0.0</modelVersion> 6 | 7 | <groupId>aws.example.emrcontainers</groupId> 8 | <artifactId>emr-eks-examples</artifactId> 9 | <packaging>jar</packaging> 10 | <version>1.0</version> 11 | <name>Amazon EMR on EKS Examples</name> 12 | 13 | <dependencyManagement> 14 | <dependencies> 15 | <dependency> 16 | <groupId>software.amazon.awssdk</groupId> 17 | <artifactId>bom</artifactId> 18 | <version>2.17.29</version> 19 | <type>pom</type> 20 | <scope>import</scope> 21 | </dependency> 22 | </dependencies> 23 | </dependencyManagement> 24 | 25 | <dependencies> 26 | <dependency> 27 | <groupId>software.amazon.awssdk</groupId> 28 | <artifactId>emrcontainers</artifactId> 29 | <version>2.17.29</version> 30 | </dependency> 31 | </dependencies> 32 | 33 | <properties> 34 | <maven.compiler.source>11</maven.compiler.source> 35 | <maven.compiler.target>11</maven.compiler.target> 36 | </properties> 37 | 38 | </project> 39 | 40 |
-------------------------------------------------------------------------------- /cdk/big-data-stack/app.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | import os 3 | from stacks.emr_studio import EMRStudio 4 | from stacks.emr_containers import EMRContainersStack 5 | 6 | from aws_cdk import core as cdk 7 | 8 | from stacks.vpc import VPCStack 9 | from stacks.rds import RDSStack 10 | from stacks.emr import EMRStack 11 | from stacks.eks import EKSStack 12 | from stacks.emr_containers import EMRContainersStack 13 | 14 | 15 | app = cdk.App() 16 | 17 | vpc = VPCStack(app, "VPCStack") 18 | 19 | # These two stacks are disabled by default 20 | # I use them when I want to demo EMR with a MySQL-backed Hive metastore 21 | # rds = RDSStack(app, "RDSStack", vpc.vpc) 22 | # emr = EMRStack( 23 | # app, 24 | # "EMRStack", 25 | # vpc.vpc, 26 | # name="EMR with Hive Metastore", 27 | # release_label="emr-5.32.0", 28 | # rds_secret=rds.instance.secret, 29 | # rds_connections=rds.instance.connections, 30 | # ) 31 | 32 | # The EKS stack requires bootstrapping 33 | # Run "cdk bootstrap aws://account/region" 34 | # You can also optionally specify an IAM role name to be mapped to a cluster admin 35 | # `-c eks_admin_role_name=AdminRole` 36 | eks = EKSStack(app, "EKSStack", vpc.vpc) 37 | 38 | # Now add a virtual EMR cluster!
39 | emr_containers = EMRContainersStack(app, "EMRContainers", vpc.vpc, eks.cluster) 40 | 41 | 42 | # We want to add EMR Studio to the mix as well :) 43 | emr_studio = EMRStudio(app, "EMRStudio", vpc.vpc, "big-data-studio") 44 | 45 | 46 | app.synth() 47 | -------------------------------------------------------------------------------- /cdk/big-data-stack/setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | 4 | with open("README.md") as fp: 5 | long_description = fp.read() 6 | 7 | 8 | setuptools.setup( 9 | name="cdk_emr_metastores", 10 | version="0.0.1", 11 | 12 | description="An empty CDK Python app", 13 | long_description=long_description, 14 | long_description_content_type="text/markdown", 15 | 16 | author="author", 17 | 18 | package_dir={"": "stacks"}, 19 | packages=setuptools.find_packages(where="stacks"), 20 | 21 | install_requires=[ 22 | "aws-cdk.core==1.95.0", 23 | "aws_cdk.aws_ec2", 24 | "aws-cdk.aws-rds", 25 | "aws-cdk.aws-secretsmanager", 26 | "aws_cdk.aws-emr", 27 | "aws_cdk.aws-eks", 28 | "aws_cdk.aws-emrcontainers", 29 | "aws_cdk.aws-servicecatalog", 30 | ], 31 | 32 | python_requires=">=3.6", 33 | 34 | classifiers=[ 35 | "Development Status :: 4 - Beta", 36 | 37 | "Intended Audience :: Developers", 38 | 39 | "License :: OSI Approved :: Apache Software License", 40 | 41 | "Programming Language :: JavaScript", 42 | "Programming Language :: Python :: 3 :: Only", 43 | "Programming Language :: Python :: 3.6", 44 | "Programming Language :: Python :: 3.7", 45 | "Programming Language :: Python :: 3.8", 46 | 47 | "Topic :: Software Development :: Code Generators", 48 | "Topic :: Utilities", 49 | 50 | "Typing :: Typed", 51 | ], 52 | ) 53 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/scripts/spark_converter.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | if len(sys.argv) > 1: 6 | INPUT_LOCATION = sys.argv[1] 7 | OUTPUT_LOCATION = sys.argv[2] 8 | else: 9 | INPUT_LOCATION = 's3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz' 10 | OUTPUT_LOCATION = 's3://damons-reinvent-demo/reinvent/spark/amazon_reviews/' 11 | 12 | # Utility to just take an input file and split it 13 | # df = spark.read.option("sep", "\t").option("header","true").csv(INPUT_LOCATION) 14 | # df.repartition(10).write.csv("s3://damons-reinvent-demo/reinvent/source_toys/") 15 | 16 | if __name__ == "__main__": 17 | if len(sys.argv) != 3: 18 | print("Usage: spark_converter ") 19 | sys.exit(-1) 20 | 21 | # Initialize the spark context. 
22 | spark = SparkSession\ 23 | .builder\ 24 | .appName("SparkConverter")\ 25 | .config("spark.sql.parquet.fs.optimized.committer.optimization-enabled", "true")\ 26 | .getOrCreate() 27 | 28 | # Read in the desired TSV 29 | df = spark.read.option('sep', '\t').option('header', 'true').csv(INPUT_LOCATION) 30 | 31 | # Repartition for multiple output files and write out to parquet 32 | df.repartition(10).write.mode('overwrite').parquet(OUTPUT_LOCATION) 33 | 34 | # To run: s3://damons-reinvent-demo/reinvent/scripts/spark_converter.py s3://amazon-reviews-pds/tsv/amazon_reviews_us_Toys_v1_00.tsv.gz s3://damons-reinvent-demo/reinvent/amazon_reviews/ -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "python3 app.py", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "requirements*.txt", 11 | "source.bat", 12 | "**/__init__.py", 13 | "python/__pycache__", 14 | "tests" 15 | ] 16 | }, 17 | "context": { 18 | "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": true, 19 | "@aws-cdk/core:stackRelativeExports": true, 20 | "@aws-cdk/aws-rds:lowercaseDbIdentifier": true, 21 | "@aws-cdk/aws-lambda:recognizeVersionProps": true, 22 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 23 | "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": true, 24 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 25 | "@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 26 | "@aws-cdk/core:checkSecretUsage": true, 27 | "@aws-cdk/aws-iam:minimizePolicies": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 30 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 31 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": 
true, 32 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 33 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 34 | "@aws-cdk/core:enablePartitionLiterals": true, 35 | "@aws-cdk/core:target-partitions": [ 36 | "aws", 37 | "aws-cn" 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /emr/eks/videos/custom_images/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM 711395599931.dkr.ecr.us-east-2.amazonaws.com/notebook-spark/emr-6.3.0:latest 2 | 3 | USER root 4 | 5 | # Install Chrome 6 | # This generates an image that is 2.89gb 7 | RUN curl https://intoli.com/install-google-chrome.sh | bash && \ 8 | mv /usr/bin/google-chrome-stable /usr/bin/chrome 9 | 10 | # This generates an image that is 3.13gb 11 | # RUN amazon-linux-extras install epel -y && \ 12 | # yum install -y chromium 13 | 14 | # We need to upgrade pip in order to install pyproj 15 | RUN pip3 install --upgrade pip 16 | 17 | # If you pip install as root, use this 18 | RUN pip3 install \ 19 | bokeh==2.3.2 \ 20 | boto3==1.17.93 \ 21 | chromedriver-py==91.0.4472.19.0 \ 22 | geopandas==0.9.0 \ 23 | selenium==3.141.0 \ 24 | shapely==1.7.1 25 | 26 | RUN ln -s /usr/local/lib/python3.7/site-packages/chromedriver_py/chromedriver_linux64 /usr/local/bin/chromedriver 27 | 28 | # Install bokeh sample data to a tmpdir 29 | RUN mkdir /root/.bokeh && \ 30 | echo "sampledata_dir: /usr/local/share/bokeh" > /root/.bokeh/config 31 | 32 | RUN bokeh sampledata 33 | 34 | # Also install census data into the image :) 35 | ADD https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_state_500k.zip /usr/local/share/bokeh/ 36 | ADD https://www2.census.gov/geo/tiger/GENZ2020/shp/cb_2020_us_county_500k.zip /usr/local/share/bokeh/ 37 | RUN chmod 644 /usr/local/share/bokeh/cb*.zip 38 | 39 | # This is a simple test to make sure generating the image works properly 40 | COPY test /test/ 41 | 42 | USER hadoop:hadoop 43 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/scripts/hive_converter.sql: -------------------------------------------------------------------------------- 1 | -- Summary: This sample shows you how to convert Amazon review stored in S3 using Hive 2 | 3 | -- Create table using sample data in S3. Note: you can replace this S3 path with your own. 
4 | CREATE EXTERNAL TABLE IF NOT EXISTS `amazon_reviews_tsv`( 5 | `marketplace` string, 6 | `customer_id` string, 7 | `review_id` string, 8 | `product_id` string, 9 | `product_parent` string, 10 | `product_title` string, 11 | `product_category` string, 12 | `star_rating` int, 13 | `helpful_votes` int, 14 | `total_votes` int, 15 | `vine` string, 16 | `verified_purchase` string, 17 | `review_headline` string, 18 | `review_body` string, 19 | `review_date` string 20 | ) 21 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 22 | LOCATION 's3://amazon-reviews-pds/tsv/'; 23 | 24 | -- ${INPUT} 25 | 26 | -- Total requests per operating system for a given time frame 27 | SET hive.groupby.position.alias=true; 28 | 29 | INSERT OVERWRITE DIRECTORY '${OUTPUT}/top_toys/' 30 | ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' 31 | SELECT product_id, product_title, count(*) AS num_reviews, avg(star_rating) AS avg_stars 32 | FROM amazon_reviews_tsv where product_category='Toys' 33 | GROUP BY 1, 2 34 | ORDER BY num_reviews DESC 35 | limit 100; 36 | 37 | -- s3://damons-reinvent-demo/reinvent/scripts/hive_converter.sql -d INPUT=s3://amazon-reviews-pds/tsv/ -d OUTPUT=s3://damons-reinvent-demo/reinvent/hive/query_output/ 38 | -- hive-script --run-hive-script --args -f s3://damons-reinvent-demo/reinvent/scripts/hive_converter.sql -d INPUT=s3://amazon-reviews-pds/tsv/ -d OUTPUT=s3://damons-reinvent-demo/reinvent/hive/query_output/ -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Welcome to your CDK Python project! 3 | 4 | This is a blank project for Python development with CDK. 5 | 6 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 7 | 8 | This project is set up like a standard Python project. The initialization 9 | process also creates a virtualenv within this project, stored under the `.venv` 10 | directory. To create the virtualenv it assumes that there is a `python3` 11 | (or `python` for Windows) executable in your path with access to the `venv` 12 | package. If for any reason the automatic creation of the virtualenv fails, 13 | you can create the virtualenv manually. 14 | 15 | To manually create a virtualenv on MacOS and Linux: 16 | 17 | ``` 18 | $ python3 -m venv .venv 19 | ``` 20 | 21 | After the init process completes and the virtualenv is created, you can use the following 22 | step to activate your virtualenv. 23 | 24 | ``` 25 | $ source .venv/bin/activate 26 | ``` 27 | 28 | If you are a Windows platform, you would activate the virtualenv like this: 29 | 30 | ``` 31 | % .venv\Scripts\activate.bat 32 | ``` 33 | 34 | Once the virtualenv is activated, you can install the required dependencies. 35 | 36 | ``` 37 | $ pip install -r requirements.txt 38 | ``` 39 | 40 | At this point you can now synthesize the CloudFormation template for this code. 41 | 42 | ``` 43 | $ cdk synth 44 | ``` 45 | 46 | To add additional dependencies, for example other CDK libraries, just add 47 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 48 | command. 49 | 50 | ## Useful commands 51 | 52 | * `cdk ls` list all stacks in the app 53 | * `cdk synth` emits the synthesized CloudFormation template 54 | * `cdk deploy` deploy this stack to your default AWS account/region 55 | * `cdk diff` compare deployed stack with current state 56 | * `cdk docs` open CDK documentation 57 | 58 | Enjoy! 
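For this particular app, the environment definition lives in `mwaa/mwaa_stack.py` (not reproduced in this README), and the DAGs and `requirements.txt` under `assets/` are shipped to S3 for the Amazon MWAA environment. As a rough sketch of the moving parts, using the CDK v1 `aws_mwaa` and `aws_s3_deployment` modules pinned in `setup.py`, the general shape might look like the following; the resource names, environment class, role ARN, and network IDs below are illustrative placeholders rather than the values used by `MwaaStack`:

```python
from aws_cdk import core as cdk
from aws_cdk import aws_mwaa as mwaa
from aws_cdk import aws_s3 as s3
from aws_cdk import aws_s3_deployment as s3deploy


class MwaaEnvironmentSketch(cdk.Stack):
    """Illustrative sketch only; see mwaa/mwaa_stack.py for the real stack."""

    def __init__(self, scope: cdk.Construct, construct_id: str, **kwargs) -> None:
        super().__init__(scope, construct_id, **kwargs)

        # Versioned bucket that receives the DAGs and requirements.txt from ./assets
        bucket = s3.Bucket(self, "MwaaAssets", versioned=True)
        s3deploy.BucketDeployment(
            self,
            "DeployAssets",
            sources=[s3deploy.Source.asset("assets")],
            destination_bucket=bucket,
        )

        # The MWAA environment itself. The execution role ARN, security group,
        # and subnet IDs below are placeholders only.
        mwaa.CfnEnvironment(
            self,
            "Environment",
            name="mwaa-demo",
            airflow_version="2.0.2",
            environment_class="mw1.small",
            source_bucket_arn=bucket.bucket_arn,
            dag_s3_path="dags",
            requirements_s3_path="requirements.txt",
            execution_role_arn="arn:aws:iam::123456789012:role/mwaa-execution-role",
            network_configuration=mwaa.CfnEnvironment.NetworkConfigurationProperty(
                security_group_ids=["sg-00000000"],
                subnet_ids=["subnet-00000000", "subnet-11111111"],
            ),
        )
```

A real environment additionally needs a valid execution role and VPC networking (private subnets plus a security group), which the placeholders above only stand in for.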
59 | -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/README.md: -------------------------------------------------------------------------------- 1 | 2 | # Welcome to your CDK Python project! 3 | 4 | This is a blank project for CDK development with Python. 5 | 6 | The `cdk.json` file tells the CDK Toolkit how to execute your app. 7 | 8 | This project is set up like a standard Python project. The initialization 9 | process also creates a virtualenv within this project, stored under the `.venv` 10 | directory. To create the virtualenv it assumes that there is a `python3` 11 | (or `python` for Windows) executable in your path with access to the `venv` 12 | package. If for any reason the automatic creation of the virtualenv fails, 13 | you can create the virtualenv manually. 14 | 15 | To manually create a virtualenv on MacOS and Linux: 16 | 17 | ``` 18 | $ python3 -m venv .venv 19 | ``` 20 | 21 | After the init process completes and the virtualenv is created, you can use the following 22 | step to activate your virtualenv. 23 | 24 | ``` 25 | $ source .venv/bin/activate 26 | ``` 27 | 28 | If you are a Windows platform, you would activate the virtualenv like this: 29 | 30 | ``` 31 | % .venv\Scripts\activate.bat 32 | ``` 33 | 34 | Once the virtualenv is activated, you can install the required dependencies. 35 | 36 | ``` 37 | $ pip install -r requirements.txt 38 | ``` 39 | 40 | At this point you can now synthesize the CloudFormation template for this code. 41 | 42 | ``` 43 | $ cdk synth 44 | ``` 45 | 46 | To add additional dependencies, for example other CDK libraries, just add 47 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 48 | command. 49 | 50 | ## Useful commands 51 | 52 | * `cdk ls` list all stacks in the app 53 | * `cdk synth` emits the synthesized CloudFormation template 54 | * `cdk deploy` deploy this stack to your default AWS account/region 55 | * `cdk diff` compare deployed stack with current state 56 | * `cdk docs` open CDK documentation 57 | 58 | Enjoy! 59 | -------------------------------------------------------------------------------- /emr/studio/README.md: -------------------------------------------------------------------------------- 1 | # EMR Studio Demo Code 2 | 3 | This is the associated code for the [Intro to Amazon EMR Studio](https://youtu.be/oVgyL5W9FPU) video. 4 | 5 | - [WeatherDay.ipynb](WeatherDay.ipynb) - Notebook that uses [@zflamig](https://github.com/zflamig)'s original [birthday-weather](https://github.com/zflamig/birthday-weather) example that uses [ERA5 Zaar data](https://registry.opendata.aws/ecmwf-era5/) to draw a map of US weather for a given day. 6 | 7 | ## CloudFormation Templates 8 | 9 | There are two templates in this repository for use with EMR Studio. Please note that you can find more examples in the [EMR Studio Samples](https://github.com/aws-samples/emr-studio-samples) repository. 10 | 11 | 1. [`full_studio_dependencies`](./cloudformation/full_studio_dependencies.cfn.yaml) - Creates everything you need in order to use EMR Studio including a new VPC with security groups and subnets tagged appropriately for use with [EMR Managed Policies](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-managed-iam-policies.html). 12 | 2. [`matplotlib_studio`](./cloudformation/matplotlib_studio.cfn.yaml) - Incorporates the above template and also creates a new Studio associated with the AWS SSO username you provide. 
Also includes a Service Catalog cluster template that installs `basemap` for usage with matplotlib and the `WeatherDay` notebook above. 13 | 14 | ## Scheduling Notebooks 15 | 16 | In order to schedule, you need three pieces of information: 17 | - Editor ID 18 | - Cluster ID 19 | - Service role name 20 | 21 | ```shell 22 | export EDITOR_ID=e-AAABBB 23 | export CLUSTER_ID=j-CCCDDD 24 | ``` 25 | 26 | 27 | ```shell 28 | aws emr start-notebook-execution \ 29 | --editor-id ${EDITOR_ID} \ 30 | --notebook-params '{"weather_date": "2019-09-01"}' \ 31 | --relative-path demo-code/emr/studio/WeatherDay.ipynb \ 32 | --notebook-execution-name Summer \ 33 | --execution-engine '{"Id" : "'${CLUSTER_ID}'"}' \ 34 | --service-role EMR_Notebooks_DefaultRole 35 | ``` 36 | 37 | ```shell 38 | aws emr describe-notebook-execution --notebook-execution-id ex-FFFFGGGG 39 | ``` 40 | 41 | ```shell 42 | aws s3 cp s3:///e-AAABBB/executions/ex-FFFFGGGG/WeatherDay.ipynb . 43 | ``` -------------------------------------------------------------------------------- /emr/eks/windy_city.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql import functions as F 5 | from pyspark.sql.types import DoubleType 6 | 7 | NOAA_ISD = "s3://noaa-global-hourly-pds/2021/" 8 | 9 | def topDays(spark, longLeft, latBottom, longRight, latTop): 10 | # Load data for 2021 11 | df = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(NOAA_ISD) 12 | 13 | # Convert lat/long columns to doubles 14 | df = df \ 15 | .withColumn('LATITUDE', df.LATITUDE.cast(DoubleType())) \ 16 | .withColumn('LONGITUDE', df.LONGITUDE.cast(DoubleType())) 17 | 18 | # Exclude missing values and filter on our bounding box 19 | seadf = df \ 20 | .filter(F.split(df.WND, ",")[3] != '9999') \ 21 | .filter(df.LATITUDE >= latBottom) \ 22 | .filter(df.LATITUDE <= latTop) \ 23 | .filter(df.LONGITUDE >= longLeft) \ 24 | .filter(df.LONGITUDE <= longRight) 25 | 26 | # Pull out day and windspeed 27 | wind_date_df = seadf \ 28 | .select("DATE", "NAME", "WND") \ 29 | .withColumn("windSpeed", F.split(seadf.WND, ",")[3].cast(DoubleType())/10 ) \ 30 | .withColumn("ymd", F.split(df.DATE, "T")[0]) 31 | 32 | # Find top speed for reach day! 33 | wind_date_df.groupBy("ymd").agg({'windSpeed':'max'}).orderBy("max(windSpeed)", ascending=False).show(50) 34 | 35 | if __name__ == "__main__": 36 | """ 37 | Usage: windy_city [bbox] 38 | e.g. 
-122.46,47.48,-122.22,47.73 for Seattle 39 | """ 40 | spark = SparkSession\ 41 | .builder\ 42 | .appName("WindyCity")\ 43 | .getOrCreate() 44 | 45 | # Use http://tools.geofabrik.de/calc/#type=geofabrik_standard&bbox=-122.459696,47.481002,-122.224433,47.734136&tab=1&proj=EPSG:4326&places=2 to 46 | # test out or find bounding boxes 47 | bbox = [float(val) for val in sys.argv[1].split(',')] if len(sys.argv) > 1 else [-122.459696,47.481002,-122.224433,47.734136] 48 | 49 | topDays(spark, *bbox) 50 | 51 | spark.stop() 52 | 53 | # -122.459696,47.481002,-122.224433,47.734136 54 | # -122.48,47.41,-122.16,47.49 55 | # left_long, bottom_lat, right_long, top_lat 56 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/notebook_code.md: -------------------------------------------------------------------------------- 1 | ```python 2 | print("Hello, world!") 3 | ``` 4 | 5 | ```python 6 | from pyspark.ml.recommendation import ALS 7 | from pyspark.ml.evaluation import RegressionEvaluator 8 | 9 | from pyspark.sql.functions import * 10 | import sys 11 | ``` 12 | 13 | 14 | ```python 15 | toys = spark.read.parquet("s3://amazon-reviews-pds/parquet/product_category=Toys/") 16 | toys.printSchema() 17 | ``` 18 | 19 | ```python 20 | toys.count() 21 | ``` 22 | 23 | ```python 24 | ratings = ( 25 | toys.select("customer_id", "product_id", "star_rating", "product_title") 26 | .withColumn("customer_id_int", abs(hash(col("customer_id")) % sys.maxint)) 27 | .withColumn("product_id_int", abs(hash(col("product_id")) % sys.maxint)) 28 | ).repartition(200) 29 | ``` 30 | 31 | ```python 32 | top_toys = ratings\ 33 | .groupby("product_id_int", "product_title")\ 34 | .agg( 35 | avg(col("star_rating")).alias("avg_rating"), 36 | count("star_rating").alias("count") 37 | )\ 38 | .sort(desc("count"))\ 39 | .limit(25)\ 40 | .withColumn("avg_rating", round(col('avg_rating'), 3))\ 41 | .withColumn("product_title", col("product_title").substr(1, 45)) 42 | top_toys.show(truncate=False) 43 | ``` 44 | 45 | ```python 46 | kids_ratings = ( 47 | toys 48 | .where("lower(review_body) LIKE '%baby%' OR lower(review_body) LIKE '%infant%'") 49 | .select("customer_id", "product_id", "star_rating", "product_title") 50 | .withColumn("customer_id_int", abs(hash(col("customer_id")) % sys.maxint)) 51 | .withColumn("product_id_int", abs(hash(col("product_id")) % sys.maxint)) 52 | ).repartition(200) 53 | ``` 54 | 55 | ```python 56 | top_toys = kids_ratings\ 57 | .groupby("product_id_int", "product_title")\ 58 | .agg( 59 | avg(col("star_rating")).alias("avg_rating"), 60 | count("star_rating").alias("count") 61 | )\ 62 | .sort(desc("count"))\ 63 | .limit(25)\ 64 | .withColumn("avg_rating", round(col('avg_rating'), 3))\ 65 | .withColumn("product_title", col("product_title").substr(1, 45)) 66 | top_toys.show(truncate=False) 67 | ``` 68 | 69 | ```python 70 | (training, test) = ratings.randomSplit([0.8, 0.2]) 71 | 72 | # Build the recommendation model using ALS on the training data 73 | als = ALS(maxIter=5, regParam=0.01, userCol="customer_id_int", itemCol="product_id_int", ratingCol="star_rating", coldStartStrategy="drop") 74 | model = als.fit(training) 75 | ``` -------------------------------------------------------------------------------- /emr/julia/README.md: -------------------------------------------------------------------------------- 1 | # Julia on EMR 2 | 3 | ## Installing Julia 4 | 5 | Julia can be installed with a bootstrap action when creating your EMR cluster. 
6 | 7 | ### Upload Julia installation scripts to S3 8 | 9 | ```shell 10 | S3_BUCKET= 11 | 12 | aws s3 cp julia-1.6.1.sh s3:///boostrap-actions/julia-1.6.1.sh 13 | aws s3 cp ijulia-kernel.sh s3:///artifacts/steps/ijulia-kernel.sh 14 | ``` 15 | 16 | ### Start up an EMR cluster 17 | 18 | ```shell 19 | ACCOUNT_ID= 20 | REGION= 21 | SUBNET_ID= 22 | KEYPAIR= 23 | INSTALL_SCRIPT="s3:///boostrap-actions/julia-1.6.1.sh" 24 | IJULIA_SCRIPT="s3:///artifacts/steps/ijulia-kernel.sh" 25 | 26 | aws emr create-cluster \ 27 | --applications Name=Spark Name=Livy Name=JupyterEnterpriseGateway Name=Hive \ 28 | --bootstrap-actions '[{"Path":"'${INSTALL_SCRIPT}'","Name":"JuliaInstall"}]' \ 29 | --steps '[{"Type":"CUSTOM_JAR","Name":"IJuliaKernelInstall","ActionOnFailure":"TERMINATE_CLUSTER","Jar":"s3://'${REGION}'.elasticmapreduce/libs/script-runner/script-runner.jar","Args":["'${IJULIA_SCRIPT}'"]}]' \ 30 | --ebs-root-volume-size 10 \ 31 | --instance-type c5.2xlarge \ 32 | --instance-count 1 \ 33 | --ec2-attributes SubnetId=${SUBNET_ID},KeyName=${KEYPAIR} \ 34 | --use-default-roles \ 35 | --release-label emr-6.3.0 \ 36 | --log-uri s3n://aws-logs-${ACCOUNT_ID}-${REGION}/elasticmapreduce/ \ 37 | --name 'DS_julia' \ 38 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 39 | --region ${REGION} 40 | ``` 41 | 42 | ## Create a new Julia notebook 43 | 44 | Now create a new Notebook in EMR connected to the cluster you just made. 45 | 46 | When you open the Notebook in JupyterLab, you should see a Julia 1.6.1 icon! 47 | 48 | ![Julia Notebook](julia_notebook.png) 49 | 50 | See the `julia-elly.ipynb` notebook for a full example of how to use Elly and Distributed. 51 | 52 | The code from that notebook is below. 53 | 54 | ```julia 55 | import Pkg; Pkg.add("Elly") 56 | 57 | yarnhost = readchomp(`hostname -i`) 58 | 59 | using Elly 60 | 61 | ENV["HADOOP_USER_NAME"] = "hadoop" 62 | 63 | yarncm = YarnManager( 64 | yarnhost=yarnhost, 65 | rmport=8032, 66 | schedport=8030, 67 | launch_timeout=60, 68 | unmanaged=true # pass true when running in unmanaged mode 69 | ); 70 | 71 | using Distributed 72 | 73 | env = Dict( 74 | "JULIA_LOAD_PATH"=>join([Base.LOAD_PATH..., "/usr/local/julia/packages"], ':'), 75 | "JULIA_DEPOT_PATH"=>join([Base.DEPOT_PATH..., "/usr/local/julia"], ':') 76 | ); 77 | addprocs(yarncm; np=8, env=env); 78 | 79 | @everywhere using Distributed 80 | @everywhere println(readchomp(`hostname -i`)) 81 | ``` -------------------------------------------------------------------------------- /cdk/emr-serverless-job-run/emr_serverless_job_run/emr_serverless_job_run_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Stack 2 | from aws_cdk import aws_emrserverless as emrs 3 | from aws_cdk import aws_iam as iam # Duration, 4 | from aws_cdk import custom_resources as custom 5 | from constructs import Construct 6 | 7 | 8 | class EmrServerlessJobRunStack(Stack): 9 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 10 | super().__init__(scope, construct_id, **kwargs) 11 | 12 | # Create a serverless Spark app 13 | serverless_app = emrs.CfnApplication( 14 | self, 15 | "spark_app", 16 | release_label="emr-6.9.0", 17 | type="SPARK", 18 | name="cdk-spark", 19 | ) 20 | 21 | # We need an execution role to run the job, this one has no access to anything 22 | # But will be granted PassRole access by the Lambda that's starting the job. 
23 | role = iam.Role( 24 | scope=self, 25 | id="spark_job_execution_role", 26 | assumed_by=iam.ServicePrincipal("emr-serverless.amazonaws.com"), 27 | ) 28 | 29 | # Create a custom resource that starts a job run 30 | myjobrun = custom.AwsCustomResource( 31 | self, 32 | "serverless-job-run", 33 | on_create={ 34 | "service": "EMRServerless", 35 | "action": "startJobRun", 36 | "parameters": { 37 | "applicationId": serverless_app.attr_application_id, 38 | "executionRoleArn": role.role_arn, 39 | "name": "cdkJob", 40 | "jobDriver": {"sparkSubmit": {"entryPoint": "local:///usr/lib/spark/examples/src/main/python/pi.py"}}, 41 | }, 42 | "physical_resource_id": custom.PhysicalResourceId.from_response( 43 | "jobRunId" 44 | ), 45 | }, 46 | policy=custom.AwsCustomResourcePolicy.from_sdk_calls( 47 | resources=custom.AwsCustomResourcePolicy.ANY_RESOURCE 48 | ), 49 | ) 50 | 51 | # Ensure the Lambda can call startJobRun with the earlier-created role 52 | myjobrun.grant_principal.add_to_policy( 53 | iam.PolicyStatement( 54 | effect=iam.Effect.ALLOW, 55 | resources=[role.role_arn], 56 | actions=["iam:PassRole"], 57 | conditions={ 58 | "StringLike": { 59 | "iam:PassedToService": "emr-serverless.amazonaws.com" 60 | } 61 | }, 62 | ) 63 | ) 64 | -------------------------------------------------------------------------------- /cdk/big-data-stack/README.md: -------------------------------------------------------------------------------- 1 | # CDK Big Data Stack 2 | 3 | This is a Big Data Stack built with Python CDK. *Python3.9 is required.* 4 | 5 | It installs the following: 6 | - VPC 7 | - RDS MySQL Instance 8 | - EMR Cluster 9 | - EKS Cluster 10 | - With k8s Cluster Autoscaler 11 | - With Kubernetes Dashboard 12 | - With Apache Airflow 2.0 13 | - EMR Virtual Cluster (for EKS) 14 | - With Airflow 2.0 plugin 15 | - EMR Studio 16 | - Service Catalog template 17 | 18 | You can use the following step to activate your virtualenv. 19 | 20 | ``` 21 | $ source .venv/bin/activate 22 | ``` 23 | 24 | Once the virtualenv is activated, you can install the required dependencies. 25 | 26 | ``` 27 | $ pip install -r requirements.txt 28 | ``` 29 | 30 | At this point you can now synthesize the CloudFormation template for this code. 31 | 32 | ``` 33 | $ cdk synth 34 | ``` 35 | 36 | To add additional dependencies, for example other CDK libraries, just add 37 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 38 | command. 39 | 40 | To deploy: 41 | 42 | ```shell 43 | cdk deploy --all -c eks_admin_role_name=Admin 44 | ``` 45 | 46 | Where `eks_admin_role_name` is an IAM role that you want to grant admin access to your EKS cluster. 47 | 48 | ## Stack Overview 49 | 50 | ### VPC 51 | 52 | Deploys a simple VPC across 3 availability zones. 53 | 54 | ### RDS 55 | 56 | Creates a MySQL RDS instance to be used as a Hive metastore for EMR. 57 | 58 | ### EMR 59 | 60 | Creates an EMR 5.32 that is configured to connect to the RDS database above. 61 | 62 | A job execution role is created that, currently, has extremely permissive permissions. 63 | 64 | The cluster is creating using the new EMR roles and is configured to use Spark and the Jupyter Enteprise Gateway so you can use it with EMR Notebooks or EMR Studio. 65 | 66 | ### EKS 67 | 68 | Creates an EKS cluster with a single managed Node Group of `m5.xlarge` instances. In addition, it installs the Cluster Autoscaler, Kubernetes Dashboard, and Apache Airflow from the [community-provided Helm Chart](https://github.com/airflow-helm/charts/tree/main/charts/airflow). 
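If you want to verify the add-ons after the stacks deploy, something along these lines should work. The cluster name, region, and namespaces are assumptions here, so check the `cdk deploy` outputs for the real values:

```shell
# Point kubectl at the newly created cluster (name/region are placeholders)
aws eks update-kubeconfig --name <your-eks-cluster-name> --region <your-region>

# The Cluster Autoscaler and Kubernetes Dashboard typically land in kube-system;
# the Airflow release lives in whatever namespace the stack chose.
kubectl get pods --all-namespaces
```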
69 | 70 | An IAM service role is created specifically for EMR. 71 | 72 | Another role is created for Airflow that has permissions to execute jobs against the `emr-containers` API. 73 | 74 | ### EMRContainers 75 | 76 | Creates an EMR Virtual Cluster for running EMR jobs on EKS. 77 | 78 | ## Useful commands 79 | 80 | * `cdk ls` list all stacks in the app 81 | * `cdk synth` emits the synthesized CloudFormation template 82 | * `cdk deploy` deploy this stack to your default AWS account/region 83 | * `cdk diff` compare deployed stack with current state 84 | * `cdk docs` open CDK documentation 85 | 86 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/assets/dags/example_emr_job.py: -------------------------------------------------------------------------------- 1 | from datetime import timedelta 2 | 3 | from airflow import DAG 4 | from airflow.providers.amazon.aws.operators.emr_create_job_flow import ( 5 | EmrCreateJobFlowOperator, 6 | ) 7 | from airflow.providers.amazon.aws.sensors.emr_job_flow import EmrJobFlowSensor 8 | from airflow.utils.dates import days_ago 9 | 10 | DEFAULT_ARGS = { 11 | "owner": "airflow", 12 | "depends_on_past": False, 13 | "email": ["airflow@example.com"], 14 | "email_on_failure": False, 15 | "email_on_retry": False, 16 | } 17 | 18 | # [START howto_operator_emr_automatic_steps_config] 19 | SPARK_STEPS = [ 20 | { 21 | "Name": "calculate_pi", 22 | "ActionOnFailure": "CONTINUE", 23 | "HadoopJarStep": { 24 | "Jar": "command-runner.jar", 25 | "Args": ["/usr/lib/spark/bin/run-example", "SparkPi", "10"], 26 | }, 27 | } 28 | ] 29 | 30 | 31 | # "LogUri": "s3://my-emr-log-bucket/default_job_flow_location", 32 | JOB_FLOW_OVERRIDES = { 33 | "Name": "PiCalc", 34 | "ReleaseLabel": "emr-6.3.0", 35 | "Instances": { 36 | "InstanceGroups": [ 37 | { 38 | "Name": "Primary node", 39 | "Market": "SPOT", 40 | "InstanceRole": "MASTER", 41 | "InstanceType": "m5.xlarge", 42 | "InstanceCount": 1, 43 | }, 44 | { 45 | "Name": "Core nodes", 46 | "Market": "SPOT", 47 | "InstanceRole": "CORE", 48 | "InstanceType": "m5.xlarge", 49 | "InstanceCount": 1, 50 | }, 51 | ], 52 | "KeepJobFlowAliveWhenNoSteps": False, 53 | "TerminationProtected": False, 54 | }, 55 | "Steps": SPARK_STEPS, 56 | "JobFlowRole": "EMR_EC2_DefaultRole", 57 | "ServiceRole": "EMR_DefaultRole", 58 | } 59 | # [END howto_operator_emr_automatic_steps_config] 60 | 61 | with DAG( 62 | dag_id="emr_job_flow_automatic_steps_dag", 63 | default_args=DEFAULT_ARGS, 64 | dagrun_timeout=timedelta(hours=2), 65 | start_date=days_ago(2), 66 | schedule_interval="0 3 * * *", 67 | tags=["example"], 68 | ) as dag: 69 | 70 | # [START howto_operator_emr_automatic_steps_tasks] 71 | job_flow_creator = EmrCreateJobFlowOperator( 72 | task_id="create_job_flow", 73 | job_flow_overrides=JOB_FLOW_OVERRIDES, 74 | aws_conn_id="aws_default", 75 | emr_conn_id="emr_default", 76 | ) 77 | 78 | job_sensor = EmrJobFlowSensor( 79 | task_id="check_job_flow", 80 | job_flow_id="{{ task_instance.xcom_pull(task_ids='create_job_flow', key='return_value') }}", 81 | aws_conn_id="aws_default", 82 | ) 83 | 84 | job_flow_creator >> job_sensor 85 | # [END howto_operator_emr_automatic_steps_tasks] 86 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/assets/dags/example_emr_eks_job.py: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. 
See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | """ 18 | This is an example dag for an Amazon EMR on EKS Spark job. 19 | """ 20 | from datetime import timedelta 21 | 22 | from airflow import DAG 23 | from airflow.hooks.base_hook import BaseHook 24 | 25 | from emr_containers.operators.emr_containers import EMRContainerOperator 26 | from airflow.utils.dates import days_ago 27 | 28 | 29 | JOB_DRIVER_ARG = { 30 | "sparkSubmitJobDriver": { 31 | "entryPoint": "local:///usr/lib/spark/examples/src/main/python/pi.py", 32 | "sparkSubmitParameters": "--conf spark.executors.instances=2 --conf spark.executors.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1", # noqa: E501 33 | } 34 | } 35 | 36 | CONFIGURATION_OVERRIDES_ARG = { 37 | "monitoringConfiguration": { 38 | "cloudWatchMonitoringConfiguration": { 39 | "logGroupName": "/aws/emr-eks-spark", 40 | "logStreamNamePrefix": "airflow", 41 | } 42 | } 43 | } 44 | 45 | with DAG( 46 | dag_id='emr_eks_pi_job', 47 | dagrun_timeout=timedelta(hours=2), 48 | start_date=days_ago(1), 49 | schedule_interval="@once", 50 | tags=["emr_containers", "example"], 51 | ) as dag: 52 | 53 | # An example of how to get the cluster id and arn from an Airflow (>2.1) connection 54 | # VIRTUAL_CLUSTER_ID = '{{ conn.emr_eks.extra_dejson["virtual_cluster_id"] }}' 55 | # JOB_ROLE_ARN = '{{ conn.emr_eks.extra_dejson["job_role_arn"] }}' 56 | # In 2.0 we just get the connection, but this executes every time the DAG is loaded 57 | c = BaseHook.get_connection("emr_eks") 58 | cluster_args = c.extra_dejson 59 | VIRTUAL_CLUSTER_ID = cluster_args.get('virtual_cluster_id') 60 | JOB_ROLE_ARN = cluster_args.get('job_role_arn') 61 | 62 | 63 | job_starter = EMRContainerOperator( 64 | task_id="start_job", 65 | virtual_cluster_id=VIRTUAL_CLUSTER_ID, 66 | execution_role_arn=JOB_ROLE_ARN, 67 | release_label="emr-6.3.0-latest", 68 | job_driver=JOB_DRIVER_ARG, 69 | configuration_overrides=CONFIGURATION_OVERRIDES_ARG, 70 | name="pi.py", 71 | ) -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v0_Initial_Revision.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | JobArtifacts: 28 | Type: "String" 29 | Description: "Spark script or Hive SQL" 30 | 31 | 
Mappings: 32 | ComputeMapping: 33 | Generic: 34 | "instancetype": "m5.4xlarge" 35 | CPU: 36 | "instancetype": "c5.4xlarge" 37 | Memory: 38 | "instancetype": "r5.4xlarge" 39 | StepMapping: 40 | Spark: 41 | "stepcommand": "spark-submit --deploy-mode cluster" 42 | Hive: 43 | "stepcommand": "hive-script --run-hive-script --args -f" 44 | 45 | Resources: 46 | EMRCluster: 47 | Type: AWS::EMR::Cluster 48 | Properties: 49 | Name: { Ref: ClusterName } 50 | JobFlowRole: "EMR_EC2_DefaultRole" 51 | ServiceRole: "EMR_DefaultRole" 52 | ReleaseLabel: "emr-5.19.0" 53 | Instances: 54 | Ec2SubnetId: "subnet-XXXX" 55 | Ec2KeyName: "sshkeyname" 56 | MasterInstanceGroup: 57 | InstanceCount: 1 58 | InstanceType: 59 | Fn::FindInMap: 60 | - ComputeMapping 61 | - Ref: "ComputeRequirements" 62 | - "instancetype" 63 | Market: "ON_DEMAND" 64 | Name: "Master" 65 | CoreInstanceGroup: 66 | InstanceCount: 2 67 | InstanceType: 68 | Fn::FindInMap: 69 | - ComputeMapping 70 | - Ref: "ComputeRequirements" 71 | - "instancetype" 72 | Market: "ON_DEMAND" 73 | Name: "Core" 74 | Applications: 75 | - Name: "Spark" 76 | - Name: "Ganglia" 77 | - Name: "Hive" 78 | LogUri: 79 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 80 | 81 | EMRLogProcessor: 82 | Type: AWS::EMR::Step 83 | Properties: 84 | ActionOnFailure: "CONTINUE" 85 | HadoopJarStep: 86 | Jar: "command-runner.jar" 87 | Args: !Split 88 | - " " 89 | - Fn::Join: 90 | - " " 91 | - 92 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 93 | - {Ref: JobArtifacts} 94 | JobFlowId: 95 | Ref: EMRCluster 96 | Name: "Log Converter" 97 | 98 | Outputs: 99 | "MasterNodeHadoopURL": 100 | Description: "EMR Resource Manager" 101 | Value: 102 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 103 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v1_Security_Settings.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | JobArtifacts: 28 | Type: "String" 29 | Description: "Spark script or Hive SQL" 30 | 31 | Mappings: 32 | ComputeMapping: 33 | Generic: 34 | "instancetype": "m5.4xlarge" 35 | CPU: 36 | "instancetype": "c5.4xlarge" 37 | Memory: 38 | "instancetype": "r5.4xlarge" 39 | StepMapping: 40 | Spark: 41 | "stepcommand": "spark-submit --deploy-mode cluster" 42 | Hive: 43 | "stepcommand": "hive-script --run-hive-script --args -f" 44 | 45 | Resources: 46 | EMRCluster: 47 | Type: AWS::EMR::Cluster 48 | Properties: 49 | Name: { Ref: ClusterName } 50 | JobFlowRole: "EMR_EC2_DefaultRole" 51 | ServiceRole: "EMR_DefaultRole" 52 | ReleaseLabel: "emr-5.19.0" 53 | Instances: 54 | Ec2SubnetId: "subnet-XXXX" 55 | Ec2KeyName: "sshkeyname" 56 | MasterInstanceGroup: 57 | InstanceCount: 1 58 | InstanceType: 59 | Fn::FindInMap: 60 | - ComputeMapping 61 | - Ref: "ComputeRequirements" 62 | - "instancetype" 63 | Market: "ON_DEMAND" 64 | Name: "Master" 65 | CoreInstanceGroup: 66 | 
InstanceCount: 2 67 | InstanceType: 68 | Fn::FindInMap: 69 | - ComputeMapping 70 | - Ref: "ComputeRequirements" 71 | - "instancetype" 72 | Market: "ON_DEMAND" 73 | Name: "Core" 74 | Applications: 75 | - Name: "Spark" 76 | - Name: "Ganglia" 77 | - Name: "Hive" 78 | LogUri: 79 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 80 | 81 | EMRLogProcessor: 82 | Type: AWS::EMR::Step 83 | Properties: 84 | ActionOnFailure: "CONTINUE" 85 | HadoopJarStep: 86 | Jar: "command-runner.jar" 87 | Args: !Split 88 | - " " 89 | - Fn::Join: 90 | - " " 91 | - 92 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 93 | - {Ref: JobArtifacts} 94 | JobFlowId: 95 | Ref: EMRCluster 96 | Name: "Log Converter" 97 | 98 | Outputs: 99 | "MasterNodeHadoopURL": 100 | Description: "EMR Resource Manager" 101 | Value: 102 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 103 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/README.md: -------------------------------------------------------------------------------- 1 | 2 | # CDK - EMR Serverless VPC to VPC Connectivity 3 | 4 | This is an example app that shows how to create peering between two VPCs to allow an EMR Serverless job to connect to a service in another VPC. 5 | 6 | In this example, we create an EC2 instance with Postgres installed in one VPC, a test instance in another VPC for verifying connectivity, and an EMR Serverless app with the appropriate subnets and security group created. 7 | 8 | ![](diagram.png) 9 | 10 | ## Usage 11 | 12 | Once the infrastructure is deployed, you can copy the sample job to an S3 bucket, replace the placeholders and run the job. 13 | 14 | When running the job, it's assumed you already have an S3 bucket and EMR runtime role. 15 | 16 | ```bash 17 | S3_BUCKET= 18 | APPLICATION_ID= 19 | JOB_ROLE_ARN= 20 | 21 | # First deploy and copy script to s3 22 | # Your Account and Region must be set 23 | CDK_DEFAULT_ACCOUNT=123456789012 CDK_DEFAULT_REGION=us-west-2 cdk deploy 24 | aws s3 cp pg_connect.py s3://${S3_BUCKET}/code/pyspark 25 | 26 | # Now run an EMR Serverless job with the postgresql package 27 | aws emr-serverless start-job-run \ 28 | --application-id $APPLICATION_ID \ 29 | --execution-role-arn $JOB_ROLE_ARN \ 30 | --job-driver '{ 31 | "sparkSubmit": { 32 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/pg_connect.py", 33 | "sparkSubmitParameters": "--packages org.postgresql:postgresql:42.4.0" 34 | } 35 | }' 36 | ``` 37 | 38 | The job will fail, but you should get an error message that the `users` table doesn't exist that indicates EMR Serverless was able to connect to the postgres instance. 39 | 40 | ## Overview 41 | 42 | This project is set up like a standard Python project. The initialization 43 | process also creates a virtualenv within this project, stored under the `.venv` 44 | directory. To create the virtualenv it assumes that there is a `python3` 45 | (or `python` for Windows) executable in your path with access to the `venv` 46 | package. If for any reason the automatic creation of the virtualenv fails, 47 | you can create the virtualenv manually. 48 | 49 | To manually create a virtualenv on MacOS and Linux: 50 | 51 | ``` 52 | $ python3 -m venv .venv 53 | ``` 54 | 55 | After the init process completes and the virtualenv is created, you can use the following 56 | step to activate your virtualenv. 
57 | 58 | ``` 59 | $ source .venv/bin/activate 60 | ``` 61 | 62 | If you are a Windows platform, you would activate the virtualenv like this: 63 | 64 | ``` 65 | % .venv\Scripts\activate.bat 66 | ``` 67 | 68 | Once the virtualenv is activated, you can install the required dependencies. 69 | 70 | ``` 71 | $ pip install -r requirements.txt 72 | ``` 73 | 74 | At this point you can now synthesize the CloudFormation template for this code. 75 | 76 | ``` 77 | $ cdk synth 78 | ``` 79 | 80 | To add additional dependencies, for example other CDK libraries, just add 81 | them to your `setup.py` file and rerun the `pip install -r requirements.txt` 82 | command. 83 | 84 | ## Useful commands 85 | 86 | * `cdk ls` list all stacks in the app 87 | * `cdk synth` emits the synthesized CloudFormation template 88 | * `cdk deploy` deploy this stack to your default AWS account/region 89 | * `cdk diff` compare deployed stack with current state 90 | * `cdk docs` open CDK documentation 91 | 92 | Enjoy! 93 | -------------------------------------------------------------------------------- /emr/eks/videos/external_metastores/README.md: -------------------------------------------------------------------------------- 1 | # EMR on EKS Metastores 2 | 3 | ## Getting the database connection string 4 | 5 | ```shell 6 | RDS_SECRETS=$(aws secretsmanager get-secret-value --secret-id 'arn:aws:secretsmanager:us-east-1:123456789012:secret:RDSStackSecret12345' | jq -r '.SecretString' ) 7 | RDS_USERNAME=$(echo $RDS_SECRETS | jq -r '.username') 8 | RDS_PASSWORD=$(echo $RDS_SECRETS | jq -r '.password') 9 | RDS_DATABASE=$(echo $RDS_SECRETS | jq -r '.dbname') 10 | RDS_HOSTNAME=$(echo $RDS_SECRETS | jq -r '.host') 11 | RDS_STRING="jdbc:mysql://${RDS_HOSTNAME}:3306/${RDS_DATABASE}" 12 | ``` 13 | 14 | ## Get the connector jar 15 | 16 | ```shell 17 | curl -O https://downloads.mariadb.com/Connectors/java/latest/mariadb-java-client-2.3.0.jar 18 | aws s3 cp mariadb-java-client-2.3.0.jar s3://${S3_BUCKET}/artifacts/jars/ 19 | ``` 20 | 21 | 22 | ## Try to run the code 23 | 24 | ```shell 25 | aws emr-containers start-job-run \ 26 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 27 | --name dacort-hive \ 28 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 29 | --release-label emr-6.2.0-latest \ 30 | --job-driver '{ 31 | "sparkSubmitJobDriver": { 32 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/hivejdbc.py", 33 | "sparkSubmitParameters": "--jars s3://'${S3_BUCKET}'/artifacts/jars/mariadb-java-client-2.3.0.jar --conf spark.hadoop.javax.jdo.option.ConnectionDriverName=org.mariadb.jdbc.Driver --conf spark.hadoop.javax.jdo.option.ConnectionUserName='${RDS_USERNAME}' --conf spark.hadoop.javax.jdo.option.ConnectionPassword='${RDS_PASSWORD}' --conf spark.hadoop.javax.jdo.option.ConnectionURL='${RDS_STRING}' --conf spark.driver.cores=1 --conf spark.executor.memory=2G --conf spark.driver.memory=2G --conf spark.executor.cores=2 --conf spark.executor.instances=5" 34 | } 35 | }' \ 36 | --configuration-overrides '{ 37 | "monitoringConfiguration": { 38 | "cloudWatchMonitoringConfiguration": { "logGroupName": "/aws/eks/dacort-emr/eks-spark", "logStreamNamePrefix": "hive" } 39 | } 40 | }' 41 | ``` 42 | 43 | https://console.aws.amazon.com/cloudwatch/home?region=us-east-1#logsV2:log-groups/log-group/$252Faws$252Feks$252Fdacort-emr$252Feks-spark$3FlogStreamNameFilter$3Dhive 44 | 45 | ## Glue 46 | 47 | 48 | ```shell 49 | aws emr-containers start-job-run \ 50 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 51 | --name dacort-glue \ 52 | --execution-role-arn 
${EMR_EKS_EXECUTION_ARN} \ 53 | --release-label emr-6.2.0-latest \ 54 | --job-driver '{ 55 | "sparkSubmitJobDriver": { 56 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/gluespark.py", 57 | "sparkSubmitParameters": "--conf spark.driver.cores=1 --conf spark.executor.memory=1G --conf spark.driver.memory=1G --conf spark.executor.cores=1 --conf spark.executor.instances=1" 58 | 59 | } 60 | }' \ 61 | --configuration-overrides '{ 62 | "applicationConfiguration": [ 63 | { 64 | "classification": "spark-defaults", 65 | "properties": { 66 | "spark.hadoop.hive.metastore.client.factory.class": "com.amazonaws.glue.catalog.metastore.AWSGlueDataCatalogHiveClientFactory" 67 | } 68 | } 69 | ], 70 | "monitoringConfiguration": { 71 | "cloudWatchMonitoringConfiguration": { 72 | "logGroupName": "/aws/eks/dacort-emr/eks-spark", 73 | "logStreamNamePrefix": "glue-cat" 74 | } 75 | } 76 | }' 77 | ``` -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v2_Updated_Parameters.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | JobArtifacts: 28 | Type: "String" 29 | Description: "Spark script or Hive SQL" 30 | 31 | Metadata: 32 | AWS::CloudFormation::Interface: 33 | ParameterLabels: 34 | ClusterName: 35 | default: "Cluster Name" 36 | JobType: 37 | default: "Job Type" 38 | ComputeRequirements: 39 | default: "Compute or Memory" 40 | JobArtifacts: 41 | default: "Job Parameters" 42 | ParameterGroups: 43 | - 44 | Label: 45 | default: "Job Configuration" 46 | Parameters: 47 | - JobType 48 | - JobArtifacts 49 | - 50 | Label: 51 | default: "Cluster Configuration" 52 | Parameters: 53 | - ClusterName 54 | - ComputeRequirements 55 | 56 | Mappings: 57 | ComputeMapping: 58 | Generic: 59 | "instancetype": "m5.4xlarge" 60 | CPU: 61 | "instancetype": "c5.4xlarge" 62 | Memory: 63 | "instancetype": "r5.4xlarge" 64 | StepMapping: 65 | Spark: 66 | "stepcommand": "spark-submit --deploy-mode cluster" 67 | Hive: 68 | "stepcommand": "hive-script --run-hive-script --args -f" 69 | 70 | Resources: 71 | EMRCluster: 72 | Type: AWS::EMR::Cluster 73 | Properties: 74 | Name: { Ref: ClusterName } 75 | JobFlowRole: "EMR_EC2_DefaultRole" 76 | ServiceRole: "EMR_DefaultRole" 77 | ReleaseLabel: "emr-5.19.0" 78 | Instances: 79 | Ec2SubnetId: "subnet-XXXX" 80 | Ec2KeyName: "sshkeyname" 81 | MasterInstanceGroup: 82 | InstanceCount: 1 83 | InstanceType: 84 | Fn::FindInMap: 85 | - ComputeMapping 86 | - Ref: "ComputeRequirements" 87 | - "instancetype" 88 | Market: "ON_DEMAND" 89 | Name: "Master" 90 | CoreInstanceGroup: 91 | InstanceCount: 2 92 | InstanceType: 93 | Fn::FindInMap: 94 | - ComputeMapping 95 | - Ref: "ComputeRequirements" 96 | - "instancetype" 97 | Market: "ON_DEMAND" 98 | Name: "Core" 99 | Applications: 100 | - Name: "Spark" 101 | - Name: "Ganglia" 102 | - Name: "Hive" 103 | LogUri: 104 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", 
"/"]] 105 | 106 | EMRLogProcessor: 107 | Type: AWS::EMR::Step 108 | Properties: 109 | ActionOnFailure: "CONTINUE" 110 | HadoopJarStep: 111 | Jar: "command-runner.jar" 112 | Args: !Split 113 | - " " 114 | - Fn::Join: 115 | - " " 116 | - 117 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 118 | - {Ref: JobArtifacts} 119 | JobFlowId: 120 | Ref: EMRCluster 121 | Name: "Log Converter" 122 | 123 | Outputs: 124 | "MasterNodeHadoopURL": 125 | Description: "EMR Resource Manager" 126 | Value: 127 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 128 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v3_Cluster_Size.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | ClusterSize: 28 | Type: "Number" 29 | Description: "Size of cluster" 30 | AllowedValues: 31 | - "2" 32 | - "5" 33 | - "10" 34 | - "20" 35 | Default: "2" 36 | 37 | JobArtifacts: 38 | Type: "String" 39 | Description: "Spark script or Hive SQL" 40 | 41 | Metadata: 42 | AWS::CloudFormation::Interface: 43 | ParameterLabels: 44 | ClusterName: 45 | default: "Cluster Name" 46 | JobType: 47 | default: "Job Type" 48 | ComputeRequirements: 49 | default: "Compute or Memory" 50 | JobArtifacts: 51 | default: "Job Parameters" 52 | ClusterSize: 53 | default: "Number of core nodes" 54 | ParameterGroups: 55 | - 56 | Label: 57 | default: "Cluster Configuration" 58 | Parameters: 59 | - ClusterName 60 | - ComputeRequirements 61 | - ClusterSize 62 | - 63 | Label: 64 | default: "Job Configuration" 65 | Parameters: 66 | - JobType 67 | - JobArtifacts 68 | 69 | Mappings: 70 | ComputeMapping: 71 | Generic: 72 | "instancetype": "m5.4xlarge" 73 | CPU: 74 | "instancetype": "c5.4xlarge" 75 | Memory: 76 | "instancetype": "r5.4xlarge" 77 | StepMapping: 78 | Spark: 79 | "stepcommand": "spark-submit --deploy-mode cluster" 80 | Hive: 81 | "stepcommand": "hive-script --run-hive-script --args -f" 82 | 83 | Resources: 84 | EMRCluster: 85 | Type: AWS::EMR::Cluster 86 | Properties: 87 | Name: { Ref: ClusterName } 88 | JobFlowRole: "EMR_EC2_DefaultRole" 89 | ServiceRole: "EMR_DefaultRole" 90 | ReleaseLabel: "emr-5.19.0" 91 | Instances: 92 | Ec2SubnetId: "subnet-XXXX" 93 | Ec2KeyName: "sshkeyname" 94 | MasterInstanceGroup: 95 | InstanceCount: 1 96 | InstanceType: 97 | Fn::FindInMap: 98 | - ComputeMapping 99 | - Ref: "ComputeRequirements" 100 | - "instancetype" 101 | Market: "ON_DEMAND" 102 | Name: "Master" 103 | CoreInstanceGroup: 104 | InstanceCount: 105 | Ref: ClusterSize 106 | InstanceType: 107 | Fn::FindInMap: 108 | - ComputeMapping 109 | - Ref: "ComputeRequirements" 110 | - "instancetype" 111 | Market: "ON_DEMAND" 112 | Name: "Core" 113 | Applications: 114 | - Name: "Spark" 115 | - Name: "Ganglia" 116 | - Name: "Hive" 117 | LogUri: 118 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 119 | 120 | EMRLogProcessor: 121 | Type: 
AWS::EMR::Step 122 | Properties: 123 | ActionOnFailure: "CONTINUE" 124 | HadoopJarStep: 125 | Jar: "command-runner.jar" 126 | Args: !Split 127 | - " " 128 | - Fn::Join: 129 | - " " 130 | - 131 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 132 | - {Ref: JobArtifacts} 133 | JobFlowId: 134 | Ref: EMRCluster 135 | Name: "Log Converter" 136 | 137 | Outputs: 138 | "MasterNodeHadoopURL": 139 | Description: "EMR Resource Manager" 140 | Value: 141 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 142 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Presto_Cluster.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: Auto-scaling Presto Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | ClusterSize: 10 | Type: "Number" 11 | Description: "Size of cluster" 12 | AllowedValues: 13 | - "2" 14 | - "5" 15 | - "10" 16 | - "20" 17 | - "50" 18 | - "100" 19 | Default: "2" 20 | 21 | Metadata: 22 | AWS::CloudFormation::Interface: 23 | ParameterLabels: 24 | ClusterName: 25 | default: "Cluster Name" 26 | ClusterSize: 27 | default: "Max number of auto-scaled Task nodes" 28 | ParameterGroups: 29 | - 30 | Label: 31 | default: "Cluster Configuration" 32 | Parameters: 33 | - ClusterName 34 | - ClusterSize 35 | 36 | Resources: 37 | EMRCluster: 38 | Type: AWS::EMR::Cluster 39 | Properties: 40 | Name: { Ref: ClusterName } 41 | JobFlowRole: "EMR_EC2_DefaultRole" 42 | ServiceRole: "EMR_DefaultRole" 43 | AutoScalingRole: "EMR_AutoScaling_DefaultRole" 44 | ReleaseLabel: "emr-5.19.0" 45 | Instances: 46 | Ec2SubnetId: "subnet-XXXX" 47 | Ec2KeyName: "sshkeyname" 48 | MasterInstanceGroup: 49 | InstanceCount: 1 50 | InstanceType: "r5.4xlarge" 51 | Market: "ON_DEMAND" 52 | Name: "Master" 53 | CoreInstanceGroup: 54 | InstanceCount: 2 55 | InstanceType: "r5.4xlarge" 56 | Market: "ON_DEMAND" 57 | Name: "Core" 58 | Applications: 59 | - Name: "Presto" 60 | - Name: "Ganglia" 61 | - Name: "Hue" 62 | LogUri: 63 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 64 | 65 | AutoScalingInstanceGroup: 66 | Type: AWS::EMR::InstanceGroupConfig 67 | Properties: 68 | InstanceCount: 2 69 | InstanceType: "r5.4xlarge" 70 | InstanceRole: "TASK" 71 | Market: "ON_DEMAND" 72 | Name: "TaskAutoScale" 73 | JobFlowId: 74 | Ref: "EMRCluster" 75 | AutoScalingPolicy: 76 | Constraints: 77 | MaxCapacity: 78 | Ref: ClusterSize 79 | MinCapacity: 2 80 | Rules: 81 | - Name: Scale-out 82 | Description: Scale-out policy 83 | Action: 84 | SimpleScalingPolicyConfiguration: 85 | AdjustmentType: CHANGE_IN_CAPACITY 86 | ScalingAdjustment: 18 87 | CoolDown: 300 88 | Trigger: 89 | CloudWatchAlarmDefinition: 90 | Dimensions: 91 | - Key: JobFlowId 92 | Value: '${emr.clusterId}' 93 | EvaluationPeriods: 1 94 | Namespace: AWS/ElasticMapReduce 95 | Period: 300 96 | ComparisonOperator: GREATER_THAN_OR_EQUAL 97 | Statistic: AVERAGE 98 | Threshold: 1 99 | Unit: COUNT 100 | MetricName: ScaleOutToMax 101 | - Name: Scale-in 102 | Description: Scale-in policy 103 | Action: 104 | SimpleScalingPolicyConfiguration: 105 | AdjustmentType: CHANGE_IN_CAPACITY 106 | ScalingAdjustment: -18 107 | CoolDown: 300 108 | Trigger: 109 | CloudWatchAlarmDefinition: 110 | Dimensions: 111 | - Key: JobFlowId 112 | Value: '${emr.clusterId}' 113 | EvaluationPeriods: 1 114 | Namespace: AWS/ElasticMapReduce 
115 | Period: 300 116 | ComparisonOperator: GREATER_THAN_OR_EQUAL 117 | Statistic: AVERAGE 118 | Threshold: 1 119 | Unit: COUNT 120 | MetricName: ScaleInToMin 121 | 122 | Outputs: 123 | "PrestoUI": 124 | Description: "Presto Admin Console" 125 | Value: 126 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8889" 127 | "HueUI": 128 | Description: "Hue Interface" 129 | Value: 130 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8888" 131 | 132 | -------------------------------------------------------------------------------- /emr/eks/java/emr-eks-job-runner/src/main/java/aws/example/emrcontainers/StartJobRunExample.java: -------------------------------------------------------------------------------- 1 | package aws.example.emrcontainers; 2 | 3 | import software.amazon.awssdk.services.emrcontainers.EmrContainersClient; 4 | import software.amazon.awssdk.services.emrcontainers.model.*; 5 | 6 | public class StartJobRunExample { 7 | 8 | public static StartJobRunResponse submitEMRContainersJob(EmrContainersClient emrContainersClient, String virtualClusterId, String jobRoleArn) { 9 | SparkSubmitJobDriver sparkSubmit = SparkSubmitJobDriver.builder() 10 | .entryPoint("local:///usr/lib/spark/examples/src/main/python/pi.py") 11 | .entryPointArguments() 12 | .sparkSubmitParameters("--conf spark.executor.instances=1 --conf spark.executor.memory=2G --conf spark.executor.cores=1 --conf spark.driver.cores=1") 13 | .build(); 14 | 15 | JobDriver jobDriver = JobDriver.builder() 16 | .sparkSubmitJobDriver(sparkSubmit) 17 | .build(); 18 | 19 | StartJobRunRequest jobRunRequest = StartJobRunRequest.builder() 20 | .name("pi.py") 21 | .jobDriver(jobDriver) 22 | .executionRoleArn(jobRoleArn) 23 | .virtualClusterId(virtualClusterId) 24 | .releaseLabel(ExampleConstants.EMR_RELEASE_LABEL) 25 | .build(); 26 | 27 | return emrContainersClient.startJobRun(jobRunRequest); 28 | } 29 | 30 | // Wait for an EMR Containers query to complete, fail or to be cancelled 31 | public static void waitForQueryToComplete(EmrContainersClient emrContainersClient, String virtualClusterId, String jobId) throws InterruptedException { 32 | DescribeJobRunResponse jobRunResponse; 33 | DescribeJobRunRequest jobRunRequest = DescribeJobRunRequest.builder() 34 | .virtualClusterId(virtualClusterId) 35 | .id(jobId) 36 | .build(); 37 | 38 | boolean isQueryStillRunning = true; 39 | while (isQueryStillRunning) { 40 | jobRunResponse = emrContainersClient.describeJobRun(jobRunRequest); 41 | JobRunState jobState = jobRunResponse.jobRun().state(); 42 | if (jobState == JobRunState.FAILED) { 43 | throw new RuntimeException("The EMR Containers job failed to run with error message: " + 44 | jobRunResponse.jobRun().failureReasonAsString()); 45 | } else if (jobState == JobRunState.CANCELLED) { 46 | throw new RuntimeException("The EMR Containers job was cancelled."); 47 | } else if (jobState == JobRunState.COMPLETED) { 48 | isQueryStillRunning = false; 49 | } else { 50 | // Sleep an amount of time before retrying again 51 | Thread.sleep(ExampleConstants.SLEEP_AMOUNT_IN_MS); 52 | } 53 | System.out.println("The current status is: " + jobState.toString()); 54 | } 55 | } 56 | 57 | public static void main(String[] args) throws InterruptedException { 58 | final String USAGE = "\n" + 59 | "StartJobRunExample - Run an EMR on EKS job\n\n" + 60 | "Usage: StartJobRunExample \n\n" + 61 | "Where:\n" + 62 | " virtual_cluster_id - The virtual cluster ID of your EMR on EKS cluster.\n\n" + 63 | " job_role_arn - The execution role ARN for the job run.\n"; 64 | 65 | if (args.length < 2) 
{ 66 | System.out.println(USAGE); 67 | System.exit(1); 68 | } 69 | 70 | String virtual_cluster_id = args[0]; 71 | String job_role_arn = args[1]; 72 | 73 | System.out.println("Creating a new job on cluster: " + virtual_cluster_id); 74 | 75 | EmrContainersClient emrContainersClient = EmrContainersClient.builder() 76 | .build(); 77 | 78 | // Create a default job on the provided EMR on EKS cluster 79 | StartJobRunResponse jobRun = submitEMRContainersJob(emrContainersClient, virtual_cluster_id, job_role_arn); 80 | System.out.println("Started job: " + jobRun.id()); 81 | 82 | // Now wait for the job to run to completion 83 | waitForQueryToComplete(emrContainersClient, virtual_cluster_id, jobRun.id()); 84 | emrContainersClient.close(); 85 | 86 | System.out.println("Done!"); 87 | } 88 | 89 | } 90 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v4_Auto_Terminate.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | ClusterSize: 28 | Type: "Number" 29 | Description: "Size of cluster" 30 | AllowedValues: 31 | - "2" 32 | - "5" 33 | - "10" 34 | - "20" 35 | Default: "2" 36 | 37 | JobArtifacts: 38 | Type: "String" 39 | Description: "Spark script or Hive SQL" 40 | 41 | AutoTerminateCluster: 42 | Type: "String" 43 | Description: "Terminate the cluster when the job is done" 44 | AllowedValues: 45 | - "True" 46 | - "False" 47 | Default: "False" 48 | 49 | Metadata: 50 | AWS::CloudFormation::Interface: 51 | ParameterLabels: 52 | ClusterName: 53 | default: "Cluster Name" 54 | JobType: 55 | default: "Job Type" 56 | ComputeRequirements: 57 | default: "Compute or Memory" 58 | JobArtifacts: 59 | default: "Job Parameters" 60 | ClusterSize: 61 | default: "Number of core nodes" 62 | AutoTerminateCluster: 63 | default: "Auto terminate EMR cluster" 64 | ParameterGroups: 65 | - 66 | Label: 67 | default: "Cluster Configuration" 68 | Parameters: 69 | - ClusterName 70 | - ComputeRequirements 71 | - ClusterSize 72 | - AutoTerminateCluster 73 | - 74 | Label: 75 | default: "Job Configuration" 76 | Parameters: 77 | - JobType 78 | - JobArtifacts 79 | 80 | Mappings: 81 | ComputeMapping: 82 | Generic: 83 | "instancetype": "m5.4xlarge" 84 | CPU: 85 | "instancetype": "c5.4xlarge" 86 | Memory: 87 | "instancetype": "r5.4xlarge" 88 | StepMapping: 89 | Spark: 90 | "stepcommand": "spark-submit --deploy-mode cluster" 91 | Hive: 92 | "stepcommand": "hive-script --run-hive-script --args -f" 93 | 94 | Resources: 95 | EMRCluster: 96 | Type: AWS::EMR::Cluster 97 | Properties: 98 | Name: { Ref: ClusterName } 99 | JobFlowRole: "EMR_EC2_DefaultRole" 100 | ServiceRole: "EMR_DefaultRole" 101 | ReleaseLabel: "emr-5.19.0" 102 | Instances: 103 | Ec2SubnetId: "subnet-XXXX" 104 | Ec2KeyName: "sshkeyname" 105 | MasterInstanceGroup: 106 | InstanceCount: 1 107 | InstanceType: 108 | Fn::FindInMap: 109 | - ComputeMapping 110 | - Ref: "ComputeRequirements" 111 | - "instancetype" 112 | Market: "ON_DEMAND" 
113 | Name: "Master" 114 | CoreInstanceGroup: 115 | InstanceCount: 116 | Ref: ClusterSize 117 | InstanceType: 118 | Fn::FindInMap: 119 | - ComputeMapping 120 | - Ref: "ComputeRequirements" 121 | - "instancetype" 122 | Market: "ON_DEMAND" 123 | Name: "Core" 124 | Applications: 125 | - Name: "Spark" 126 | - Name: "Ganglia" 127 | - Name: "Hive" 128 | LogUri: 129 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 130 | 131 | EMRLogProcessor: 132 | Type: AWS::EMR::Step 133 | Properties: 134 | ActionOnFailure: "CONTINUE" 135 | HadoopJarStep: 136 | Jar: "command-runner.jar" 137 | Args: !Split 138 | - " " 139 | - Fn::Join: 140 | - " " 141 | - 142 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 143 | - {Ref: JobArtifacts} 144 | JobFlowId: 145 | Ref: EMRCluster 146 | Name: "Log Converter" 147 | 148 | Outputs: 149 | "MasterNodeHadoopURL": 150 | Description: "EMR Resource Manager" 151 | Value: 152 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 153 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/create_sc_entries.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Ensure dependencies are installed 4 | if ! [ -x "$(command -v jq)" ]; then 5 | echo 'Error: jq is not installed.' >&2 6 | exit 1 7 | fi 8 | 9 | # Define some environment variables 10 | : ${TARGET_SUBNET:=subnet-XXXX} 11 | : ${TARGET_GRANTEE:=role/Admin} 12 | : ${CLUSTER_SSH_KEY:=sshkeyname} 13 | : ${BUCKET_NAME:=damons-reinvent-demo} 14 | : ${AWS_REGION:=us-east-1} 15 | : ${AWS_PROFILE:=default} 16 | 17 | # Used to retrieve output from AWS CLI commands 18 | TMP_FILE=$(mktemp) 19 | 20 | # macOS uses BSD sed, Linux uses GNU sed 21 | if [[ "$OSTYPE" == "linux-gnu" ]]; then 22 | SED_CMD="sed -i" 23 | elif [[ "$OSTYPE" == "darwin"* ]]; then 24 | SED_CMD="sed -i ''" 25 | else 26 | echo "Unsupported operating system, only Linux and macOS are supported." 27 | exit 1 28 | fi 29 | 30 | # Update settings specific to our desired region in the CloudFormation templates 31 | find assets/cloudformation -type f -exec \ 32 | ${SED_CMD} "s/Ec2SubnetId:.*/Ec2SubnetId: \"${TARGET_SUBNET}\"/" {} + 33 | find assets/cloudformation -type f -exec \ 34 | ${SED_CMD} "s/Ec2KeyName:.*/Ec2KeyName: \"${CLUSTER_SSH_KEY}\"/" {} + 35 | 36 | # Deploy the updated templates 37 | RELEASE_BUCKET=${BUCKET_NAME} AWS_PROFILE=${AWS_PROFILE} make 38 | 39 | # Create a new portfolio 40 | aws --region ${AWS_REGION} servicecatalog create-portfolio \ 41 | --display-name "EMR re:Invent Demo" \ 42 | --provider-name "@dacort" \ 43 | --description "Pre-defined on-demand EMR clusters" \ 44 | | tee ${TMP_FILE} 45 | PORTFOLIO_ID=$(jq -r '.PortfolioDetail.Id' ${TMP_FILE}) 46 | 47 | # Create a product 48 | aws --region ${AWS_REGION} servicecatalog create-product --name "Data Analyst EMR" \ 49 | --owner "@dacort" \ 50 | --description "Provides Hive, Spark, and Hue for interactive queries." 
\ 51 | --product-type CLOUD_FORMATION_TEMPLATE \ 52 | --provisioning-artifact-parameters '{"Name":"Initial revision", "Description": "", "Info":{"LoadTemplateFromURL":"https://s3.amazonaws.com/'${BUCKET_NAME}'/reinvent/cloudformation/Spark_Cluster_Versions/v0_Initial_Revision.cf.yml"},"Type":"CLOUD_FORMATION_TEMPLATE"}' \ 53 | | tee ${TMP_FILE} 54 | PRODUCT_ID=$(jq -r '.ProductViewDetail.ProductViewSummary.ProductId' ${TMP_FILE}) 55 | 56 | # Connect the product to our portfolio 57 | aws --region ${AWS_REGION} servicecatalog associate-product-with-portfolio --product-id ${PRODUCT_ID} --portfolio-id ${PORTFOLIO_ID} 58 | 59 | # Also create a Data Science product 60 | aws --region ${AWS_REGION} servicecatalog create-product --name "Data Science EMR" \ 61 | --owner "@dacort" \ 62 | --description "Provides TensorFlow, JupyterHub, and MXNet for ML queries." \ 63 | --product-type CLOUD_FORMATION_TEMPLATE \ 64 | --provisioning-artifact-parameters '{"Name":"Initial revision", "Description": "", "Info":{"LoadTemplateFromURL":"https://s3.amazonaws.com/'${BUCKET_NAME}'/reinvent/cloudformation/Spark_Cluster_Versions/v0_Initial_Revision.cf.yml"},"Type":"CLOUD_FORMATION_TEMPLATE"}' \ 65 | | tee ${TMP_FILE} 66 | DS_PRODUCT_ID=$(jq -r '.ProductViewDetail.ProductViewSummary.ProductId' ${TMP_FILE}) 67 | 68 | # Connect the product to our portfolio 69 | aws --region ${AWS_REGION} servicecatalog associate-product-with-portfolio --product-id ${DS_PRODUCT_ID} --portfolio-id ${PORTFOLIO_ID} 70 | 71 | # Add different product revisions 72 | VERSIONS=( "Updated security setting:v1_Security_Settings" 73 | "Updated parameter labels:v2_Updated_Parameters" 74 | "Choose your own cluster size!:v3_Cluster_Size" 75 | "Auto-terminate functionality:v4_Auto_Terminate" 76 | "Spark UI:v5_SparkUI" ) 77 | 78 | for version in "${VERSIONS[@]}" ; do 79 | NAME=${version%%:*} 80 | TEMPLATE=${version#*:} 81 | aws --region ${AWS_REGION} servicecatalog create-provisioning-artifact \ 82 | --product-id ${PRODUCT_ID} \ 83 | --parameters '{ 84 | "Name": "'"${NAME}"'", 85 | "Description": "", 86 | "Info": { 87 | "LoadTemplateFromURL": "https://s3.amazonaws.com/'${BUCKET_NAME}'/reinvent/cloudformation/Spark_Cluster_Versions/'${TEMPLATE}'.cf.yml" 88 | }, 89 | "Type": "CLOUD_FORMATION_TEMPLATE" 90 | }' 91 | done 92 | 93 | # Grant access to the portfolio 94 | aws --region ${AWS_REGION} servicecatalog associate-principal-with-portfolio \ 95 | --portfolio-id ${PORTFOLIO_ID} \ 96 | --principal-type IAM \ 97 | --principal-arn arn:aws:iam::$(aws --region ${AWS_REGION} sts get-caller-identity --query Account --output text):${TARGET_GRANTEE} -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 | Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | ClusterSize: 28 | Type: "Number" 29 | Description: "Size of cluster" 30 | AllowedValues: 31 | - "2" 32 | - "5" 33 | - "10" 34 | - "20" 35 | Default: "2" 36 | 37 | 
JobArtifacts: 38 | Type: "String" 39 | Description: "Spark script or Hive SQL" 40 | 41 | AutoTerminateCluster: 42 | Type: "String" 43 | Description: "Terminate the cluster when the job is done" 44 | AllowedValues: 45 | - "True" 46 | - "False" 47 | Default: "False" 48 | 49 | Metadata: 50 | AWS::CloudFormation::Interface: 51 | ParameterLabels: 52 | ClusterName: 53 | default: "Cluster Name" 54 | JobType: 55 | default: "Job Type" 56 | ComputeRequirements: 57 | default: "Compute or Memory" 58 | JobArtifacts: 59 | default: "Job Parameters" 60 | ClusterSize: 61 | default: "Number of core nodes" 62 | AutoTerminateCluster: 63 | default: "Auto terminate EMR cluster" 64 | ParameterGroups: 65 | - 66 | Label: 67 | default: "Cluster Configuration" 68 | Parameters: 69 | - ClusterName 70 | - ComputeRequirements 71 | - ClusterSize 72 | - AutoTerminateCluster 73 | - 74 | Label: 75 | default: "Job Configuration" 76 | Parameters: 77 | - JobType 78 | - JobArtifacts 79 | 80 | Mappings: 81 | ComputeMapping: 82 | Generic: 83 | "instancetype": "m5.4xlarge" 84 | CPU: 85 | "instancetype": "c5.4xlarge" 86 | Memory: 87 | "instancetype": "r5.4xlarge" 88 | StepMapping: 89 | Spark: 90 | "stepcommand": "spark-submit --deploy-mode cluster" 91 | Hive: 92 | "stepcommand": "hive-script --run-hive-script --args -f" 93 | 94 | Resources: 95 | EMRCluster: 96 | Type: AWS::EMR::Cluster 97 | Properties: 98 | Name: { Ref: ClusterName } 99 | JobFlowRole: "EMR_EC2_DefaultRole" 100 | ServiceRole: "EMR_DefaultRole" 101 | ReleaseLabel: "emr-5.19.0" 102 | Instances: 103 | Ec2SubnetId: "subnet-XXXX" 104 | Ec2KeyName: "sshkeyname" 105 | MasterInstanceGroup: 106 | InstanceCount: 1 107 | InstanceType: 108 | Fn::FindInMap: 109 | - ComputeMapping 110 | - Ref: "ComputeRequirements" 111 | - "instancetype" 112 | Market: "ON_DEMAND" 113 | Name: "Master" 114 | CoreInstanceGroup: 115 | InstanceCount: 116 | Ref: ClusterSize 117 | InstanceType: 118 | Fn::FindInMap: 119 | - ComputeMapping 120 | - Ref: "ComputeRequirements" 121 | - "instancetype" 122 | Market: "ON_DEMAND" 123 | Name: "Core" 124 | Applications: 125 | - Name: "Spark" 126 | - Name: "Ganglia" 127 | - Name: "Hive" 128 | LogUri: 129 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 130 | 131 | EMRLogProcessor: 132 | Type: AWS::EMR::Step 133 | Properties: 134 | ActionOnFailure: "CONTINUE" 135 | HadoopJarStep: 136 | Jar: "command-runner.jar" 137 | Args: !Split 138 | - " " 139 | - Fn::Join: 140 | - " " 141 | - 142 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 143 | - {Ref: JobArtifacts} 144 | JobFlowId: 145 | Ref: EMRCluster 146 | Name: "Data Converter" 147 | 148 | Outputs: 149 | "MasterNodeHadoopURL": 150 | Description: "EMR Resource Manager" 151 | Value: 152 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 153 | "SparkHistoryServerURL": 154 | Description: "Spark UI" 155 | Value: 156 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:18080" 157 | 158 | -------------------------------------------------------------------------------- /reInvent_2018/EMR/assets/cloudformation/Spark_Cluster_Versions/v5_SparkUI.cf.yml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: 2010-09-09 2 | Description: On-Demand EMR Cluster 3 | 4 | Parameters: 5 | ClusterName: 6 | Type: "String" 7 | Description: "Name your cluster" 8 | 9 | JobType: 10 | Type: "String" 11 | Description: "Select your job type" 12 | AllowedValues: 13 | - "Spark" 14 | - "Hive" 15 | - "Interactive" 16 
| Default: "Spark" 17 | 18 | ComputeRequirements: 19 | Type: "String" 20 | Description: "Compute requirements" 21 | AllowedValues: 22 | - "Generic" 23 | - "CPU" 24 | - "Memory" 25 | Default: "Generic" 26 | 27 | ClusterSize: 28 | Type: "Number" 29 | Description: "Size of cluster" 30 | AllowedValues: 31 | - "2" 32 | - "5" 33 | - "10" 34 | - "20" 35 | Default: "2" 36 | 37 | JobArtifacts: 38 | Type: "String" 39 | Description: "Spark script or Hive SQL" 40 | 41 | AutoTerminateCluster: 42 | Type: "String" 43 | Description: "Terminate the cluster when the job is done" 44 | AllowedValues: 45 | - "True" 46 | - "False" 47 | Default: "False" 48 | 49 | Metadata: 50 | AWS::CloudFormation::Interface: 51 | ParameterLabels: 52 | ClusterName: 53 | default: "Cluster Name" 54 | JobType: 55 | default: "Job Type" 56 | ComputeRequirements: 57 | default: "Compute or Memory" 58 | JobArtifacts: 59 | default: "Job Parameters" 60 | ClusterSize: 61 | default: "Number of core nodes" 62 | AutoTerminateCluster: 63 | default: "Auto terminate EMR cluster" 64 | ParameterGroups: 65 | - 66 | Label: 67 | default: "Cluster Configuration" 68 | Parameters: 69 | - ClusterName 70 | - ComputeRequirements 71 | - ClusterSize 72 | - AutoTerminateCluster 73 | - 74 | Label: 75 | default: "Job Configuration" 76 | Parameters: 77 | - JobType 78 | - JobArtifacts 79 | 80 | Mappings: 81 | ComputeMapping: 82 | Generic: 83 | "instancetype": "m5.4xlarge" 84 | CPU: 85 | "instancetype": "c5.4xlarge" 86 | Memory: 87 | "instancetype": "r5.4xlarge" 88 | StepMapping: 89 | Spark: 90 | "stepcommand": "spark-submit --deploy-mode cluster" 91 | Hive: 92 | "stepcommand": "hive-script --run-hive-script --args -f" 93 | 94 | Resources: 95 | EMRCluster: 96 | Type: AWS::EMR::Cluster 97 | Properties: 98 | Name: { Ref: ClusterName } 99 | JobFlowRole: "EMR_EC2_DefaultRole" 100 | ServiceRole: "EMR_DefaultRole" 101 | ReleaseLabel: "emr-5.19.0" 102 | Instances: 103 | Ec2SubnetId: "subnet-XXXX" 104 | Ec2KeyName: "sshkeyname" 105 | MasterInstanceGroup: 106 | InstanceCount: 1 107 | InstanceType: 108 | Fn::FindInMap: 109 | - ComputeMapping 110 | - Ref: "ComputeRequirements" 111 | - "instancetype" 112 | Market: "ON_DEMAND" 113 | Name: "Master" 114 | CoreInstanceGroup: 115 | InstanceCount: 116 | Ref: ClusterSize 117 | InstanceType: 118 | Fn::FindInMap: 119 | - ComputeMapping 120 | - Ref: "ComputeRequirements" 121 | - "instancetype" 122 | Market: "ON_DEMAND" 123 | Name: "Core" 124 | Applications: 125 | - Name: "Spark" 126 | - Name: "Ganglia" 127 | - Name: "Hive" 128 | LogUri: 129 | Fn::Join: ["", ["s3://aws-logs-", Ref: "AWS::AccountId", "-", Ref: "AWS::Region", "/", "elasticmapreduce", "/"]] 130 | 131 | EMRLogProcessor: 132 | Type: AWS::EMR::Step 133 | Properties: 134 | ActionOnFailure: "CONTINUE" 135 | HadoopJarStep: 136 | Jar: "command-runner.jar" 137 | Args: !Split 138 | - " " 139 | - Fn::Join: 140 | - " " 141 | - 142 | - Fn::FindInMap: [StepMapping, {Ref: JobType}, "stepcommand"] 143 | - {Ref: JobArtifacts} 144 | JobFlowId: 145 | Ref: EMRCluster 146 | Name: "Data Converter" 147 | 148 | Outputs: 149 | "MasterNodeHadoopURL": 150 | Description: "EMR Resource Manager" 151 | Value: 152 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:8088" 153 | "SparkHistoryServerURL": 154 | Description: "Spark UI" 155 | Value: 156 | Fn::Sub: "http://${EMRCluster.MasterPublicDNS}:18080" 157 | 158 | -------------------------------------------------------------------------------- /emr/airflow/README.md: -------------------------------------------------------------------------------- 1 | # Run 
EMR jobs with Airflow 2 | 3 | Associated video: [https://youtu.be/Z--sNHqkM7c](https://youtu.be/Z--sNHqkM7c) 4 | 5 | Airflow is a popular open source workflow management tool. Amazon EMR is a service that allows you to run various big data frameworks like Spark, Hive, and Presto on top of EC2 or EKS. In this demonstration, we'll show you how to schedule a PySpark job using Airflow on: 6 | 7 | - EMR on EC2 8 | - EMR on EKS 9 | 10 | What we want to do is run a sample job on both. 11 | 12 | Let's get started. 13 | 14 | ## Pre-requisites 15 | 16 | - Pre-existing VPC and [EMR default roles](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-roles.html) 17 | - EMR on EKS virtual cluster and execution role (see my [big data stack blog post](https://dacort.dev/posts/cdk-big-data-stack/) for how to deploy this with CDK) 18 | 19 | ## Airflow Operators 20 | 21 | Airflow has several [EMR Operators](https://airflow.apache.org/docs/apache-airflow-providers-amazon/stable/operators/emr.html) that can be used to create a cluster, run a job, and terminate a cluster. 22 | 23 | In addition, there's currently an open pull request to integrate EMR on EKS as well. 24 | 25 | For this demo, we'll show how to use: 26 | 1. `EmrCreateJobFlowOperator` to create a new EMR on EC2 cluster, run a job, and automatically terminate the cluster. 27 | 2. `EMRContainerOperator` to submit a job to a pre-existing EMR on EKS virtual cluster. 28 | 29 | ## Configuring Airflow 30 | 31 | ### IAM Permissions 32 | 33 | IAM will need access to start and monitor EMR clusters as well as start and monitor EMR on EKS jobs. 34 | 35 | In addition to the standard permissions for the [MWAA service execution role](https://docs.aws.amazon.com/mwaa/latest/userguide/mwaa-create-role.html), we'll also give it access to create these jobs. 36 | 37 | EMR on EC2 will also need access to `iam:PassRole` for the default EMR roles. 38 | 39 | ```python 40 | iam.PolicyStatement( 41 | actions=[ 42 | "emr-containers:StartJobRun", 43 | "emr-containers:DescribeJobRun", 44 | "emr-containers:CancelJobRun", 45 | ], 46 | effect=iam.Effect.ALLOW, 47 | resources=["*"], 48 | ), 49 | iam.PolicyStatement( 50 | actions=[ 51 | "elasticmapreduce:RunJobFlow", 52 | "elasticmapreduce:DescribeStep", 53 | "elasticmapreduce:DescribeCluster", 54 | ], 55 | effect=iam.Effect.ALLOW, 56 | resources=["*"], 57 | ), 58 | iam.PolicyStatement( 59 | actions=["iam:PassRole"], 60 | effect=iam.Effect.ALLOW, 61 | resources=[ 62 | f"arn:aws:iam::{self.account}:role/EMR_DemoRole", 63 | f"arn:aws:iam::{self.account}:role/EMR_EC2_DemoRole", 64 | f"arn:aws:iam::{self.account}:role/EMR_EC2_DefaultRole", 65 | f"arn:aws:iam::{self.account}:role/EMR_DefaultRole", 66 | ], 67 | ), 68 | ``` 69 | 70 | ### Airflow Connections 71 | 72 | If you like, you can hard-code your connection options in your job or you can store them in a connection. 73 | 74 | At the very least, you need to add `region_name` in the Extra section in your `aws_default` connection. 75 | 76 | ```json 77 | {"region_name":"us-east-1"} 78 | ``` 79 | 80 | EMR on EC2 doesn't need any additional configuration than this because we're going to create the cluster from scratch and use a default set of roles, security groups, and VPC. 81 | 82 | EMR on EKS, however, requires you to already have an execution role and virtual cluster set up. 83 | 84 | Here's an example connection that defines a different region, EMR virtual cluster, and execution role ARN for EMR on EKS. 
85 | 86 | ```json 87 | {"region_name":"us-east-2","virtual_cluster_id":"wfto7bwu9n8ajdohqkri06pc1","job_role_arn":"arn:aws:iam::111122223333:role/emr_eks_default_role"} 88 | ``` 89 | 90 | ## Running Jobs 91 | 92 | ### EMR on EC2 93 | 94 | ONe mistake I made while working on this was I used an instance size of `c5.xlarge` - unfortunately that didn't work with the default SparkPi job, so I had to change it to an `m5.xlarge`. 95 | 96 | Other than that, the example EMR on EC2 job is pretty straight-forward! It'll create a small cluster with a Step (job) defined by default, wait until that step finishes and then EMR will terminate the cluster. 97 | 98 | Let's try to trigger the DAG and see what happens. 99 | 100 | ### EMR on EKS 101 | 102 | Since the pull request has not been merged, we had to deploy our own custom set of plugins to be able to run the EMR on EKS job. 103 | 104 | To do this, we made use of my [example EMR on EKS plugin](https://github.com/dacort/emr-eks-airflow2-plugin) repository and added a reference to this in our `requirements.txt` file. 105 | 106 | ``` 107 | emr-containers @ https://github.com/dacort/emr-eks-airflow2-plugin/archive/main.zip 108 | apache-airflow[amazon]==2.0.2 109 | ``` 110 | 111 | The `apache-airflow[amazon]` requirement is needed for the EMR on EC2 Operator. 112 | 113 | So, with our requirements installed, our connection defined, let's go ahead and trigger the DAG! 114 | -------------------------------------------------------------------------------- /emr/eks/videos/custom_images/README.md: -------------------------------------------------------------------------------- 1 | # EMR on EKS Custom Images 2 | 3 | Use Bokeh with EMR on EKS to draw daily images of Air Quality data in the continental US. 4 | 5 | Demo video: [https://youtu.be/0x4DRKmNPfQ](https://youtu.be/0x4DRKmNPfQ) 6 | 7 | ## Overview 8 | 9 | - First, we need to login to the relevant ECR and pull the latest EMR image we want. 10 | 11 | ```shell 12 | aws ecr get-login-password --region us-east-2 | docker login --username AWS --password-stdin 711395599931.dkr.ecr.us-east-2.amazonaws.com 13 | docker pull 711395599931.dkr.ecr.us-east-2.amazonaws.com/notebook-spark/emr-6.3.0:latest 14 | ``` 15 | 16 | - Next, we want to build a Dockerfile that installs the `bokeh` library. 17 | 18 | We also want to populate the bokeh sample dataset directly on the image itself because we use that for our map. 19 | 20 | ```dockerfile 21 | FROM 711395599931.dkr.ecr.us-east-2.amazonaws.com/notebook-spark/emr-6.3.0:latest 22 | 23 | USER root 24 | 25 | # Install Chrome 26 | RUN curl https://intoli.com/install-google-chrome.sh | bash && \ 27 | mv /usr/bin/google-chrome-stable /usr/bin/chrome 28 | 29 | RUN pip3 install \ 30 | bokeh>=2.3.2 \ 31 | chromedriver-py>=91.0.4472.19.0 \ 32 | selenium>=3.141.0 33 | RUN bokeh sampledata 34 | 35 | RUN ln -s /usr/local/lib/python3.7/site-packages/chromedriver_py/chromedriver_linux64 /usr/local/bin/chromedriver 36 | 37 | USER hadoop:hadoop 38 | ``` 39 | 40 | - Now build your image 41 | 42 | ```shell 43 | docker build -t emr-6.3.0-bokeh:latest . 44 | ``` 45 | 46 | - Validate 47 | 48 | I added a simple test script that generates a plot and validates it against a known hash. 49 | 50 | ```shell 51 | docker run --rm -it emr-6.3.0-bokeh python3 /test/gen_plot.py 52 | ``` 53 | 54 | If you see "All good! 🙌" we're good to go! 
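
The actual validation logic lives in `test/gen_plot.py` in this repo — I haven't copied it here, but a script along these lines is roughly what it takes: render a small Bokeh plot headlessly and compare the PNG bytes against a pinned hash. This is a minimal sketch only (assuming Bokeh 2.x and the chromedriver symlink from the Dockerfile above; `EXPECTED_SHA256` is a placeholder, not the repo's real value):

```python
import hashlib
import io

from bokeh.io.export import get_screenshot_as_png
from bokeh.io.webdriver import create_chromium_webdriver
from bokeh.plotting import figure

# Placeholder hash -- pin this to whatever digest your rendered image actually produces.
EXPECTED_SHA256 = "0" * 64

# A small, deterministic plot so repeated renders produce the same PNG.
p = figure(plot_width=400, plot_height=300, toolbar_location=None)
p.line([1, 2, 3, 4], [4, 3, 2, 1], line_width=2)

# Render headlessly using the chromedriver we symlinked into /usr/local/bin.
driver = create_chromium_webdriver(["--no-sandbox"])
image = get_screenshot_as_png(p, driver=driver)

# Hash the PNG bytes and compare against the pinned value.
buf = io.BytesIO()
image.save(buf, format="png")
digest = hashlib.sha256(buf.getvalue()).hexdigest()

if digest == EXPECTED_SHA256:
    print("All good! 🙌")
else:
    raise SystemExit(f"Unexpected image hash: {digest}")
```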
55 | 56 | - Push it to a (private) GH repo 57 | 58 | ```shell 59 | export GH_USERNAME=dacort 60 | echo $CR_PAT | docker login ghcr.io -u ${GH_USERNAME} --password-stdin 61 | docker tag emr-6.3.0-bokeh:latest ghcr.io/${GH_USERNAME}/emr-6.3.0-bokeh:latest 62 | docker push ghcr.io/${GH_USERNAME}/emr-6.3.0-bokeh:latest 63 | ``` 64 | 65 | - Set up a secret to allow for the image pull 66 | 67 | ```shell 68 | DOCKER_AUTH=$(echo -n "${GH_USERNAME}:${CR_PAT}" | base64) 69 | 70 | DOCKER_DATA=$(echo '{ "auths": { "ghcr.io": { "auth":"'${DOCKER_AUTH}'" } } }' | base64) 71 | 72 | cat <<EOF > dockerconfigjson-github-com.yaml 73 | kind: Secret 74 | type: kubernetes.io/dockerconfigjson 75 | apiVersion: v1 76 | metadata: 77 | name: dockerconfigjson-github-com 78 | namespace: emr-jobs 79 | labels: 80 | app: app-name 81 | data: 82 | .dockerconfigjson: ${DOCKER_DATA} 83 | EOF 84 | 85 | kubectl create -f dockerconfigjson-github-com.yaml -n emr-jobs 86 | ``` 87 | 88 | - Now let's run it! 89 | 90 | ```shell 91 | aws emr-containers start-job-run \ 92 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 93 | --name dacort-aqi \ 94 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 95 | --release-label emr-6.3.0-latest \ 96 | --job-driver '{ 97 | "sparkSubmitJobDriver": { 98 | "entryPoint": "s3://'${S3_BUCKET}'/code/generate_aqi_map.py", 99 | "entryPointArguments": ["'${S3_BUCKET}'", "output/airq/"], 100 | "sparkSubmitParameters": "--conf spark.kubernetes.container.image=ghcr.io/dacort/emr-6.3.0-bokeh:latest --conf spark.kubernetes.container.image.pullSecrets=dockerconfigjson-github-com" 101 | } 102 | }' \ 103 | --configuration-overrides '{ 104 | "monitoringConfiguration": { 105 | "s3MonitoringConfiguration": { "logUri": "s3://'${S3_BUCKET}'/logs/" } 106 | } 107 | }' 108 | ``` 109 | 110 | - We should see some air quality data! 111 | 112 | ```shell 113 | aws s3 ls s3://${S3_BUCKET}/output/airq/ 114 | # 2021-06-15 15:44:49 277735 2021-06-15-latest.png 115 | ``` 116 | 117 | ```shell 118 | aws s3 cp s3://${S3_BUCKET}/output/airq/2021-06-15-latest.png . 119 | open 2021-06-15-latest.png 120 | ``` 121 | 122 | ## Testing your code locally 123 | 124 | If you want, you can start up `pyspark` on your image locally to interactively test your code. 125 | 126 | ```shell 127 | docker run --rm -it emr-6.3.0-bokeh pyspark --deploy-mode client --master 'local[1]' 128 | ``` 129 | 130 | Note that if you access AWS resources from within your environment, you'll either need to change your `spark.hadoop.fs.s3.customAWSCredentialsProvider` in your Spark job or set AWS credentials in your environment.
If you have an access key or secret, you can pass those into the `docker run` command like so: 131 | 132 | ```shell 133 | docker run --rm -it \ 134 | -e AWS_ACCESS_KEY_ID \ 135 | -e AWS_SECRET_ACCESS_KEY \ 136 | emr-6.3.0-bokeh \ 137 | pyspark --deploy-mode client --master 'local[1]' 138 | ``` 139 | 140 | ## References 141 | 142 | - https://stackoverflow.com/questions/47087506/flatten-a-fiona-structure-to-dictionary-for-bokeh/47135604#47135604 143 | - https://discourse.bokeh.org/t/questions-re-choropleth/2589/3 144 | - https://towardsdatascience.com/walkthrough-mapping-basics-with-bokeh-and-geopandas-in-python-43f40aa5b7e9 145 | -------------------------------------------------------------------------------- /cdk/emr-serverless-vpc-to-vpc/emr_serverless_vpc_to_vpc/emr_serverless_vpc_to_vpc_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import Stack 2 | from aws_cdk import aws_ec2 as ec2 # Duration, 3 | from aws_cdk import aws_emrserverless as emrs 4 | from aws_cdk import aws_iam as iam 5 | from constructs import Construct 6 | 7 | 8 | class EmrServerlessVpcToVpcStack(Stack): 9 | def __init__(self, scope: Construct, construct_id: str, **kwargs) -> None: 10 | super().__init__(scope, construct_id, **kwargs) 11 | 12 | # Create two VPCs, ensure their CIDRs don't overlap 13 | vpc1 = ec2.Vpc(self, "EMRServerless_VPC1", max_azs=3, cidr="10.0.0.0/16") 14 | vpc2 = ec2.Vpc(self, "EMRServerless_VPC2", max_azs=3, cidr="10.1.0.0/16") 15 | 16 | # This is necessary on Ubuntu instances to install cfn-init and cfn-signal 17 | user_data = ec2.UserData.for_linux() 18 | user_data.add_commands( 19 | "apt-get update -y", 20 | "apt-get install -y -o DPkg::Lock::Timeout=60 git python3-pip", 21 | "python3 -m pip install -U pip", 22 | "python3 -m pip install https://s3.amazonaws.com/cloudformation-examples/aws-cfn-bootstrap-py3-latest.tar.gz", 23 | "mkdir -p /opt/aws/bin/", 24 | "ln -s /usr/local/bin/cfn-* /opt/aws/bin/", 25 | ) 26 | 27 | # Create a an EC2 instance running postgres in VPC1 and an inbound security group 28 | svc_sg = ec2.SecurityGroup(self, "VPC1_Service", vpc=vpc1) 29 | instance = ec2.Instance( 30 | self, 31 | "pg", 32 | vpc=vpc1, 33 | instance_type=ec2.InstanceType("t2.micro"), 34 | machine_image=ec2.MachineImage.lookup( 35 | name="ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-20211129" 36 | ), 37 | security_group=svc_sg, 38 | init=ec2.CloudFormationInit.from_elements( 39 | ec2.InitCommand.shell_command( 40 | "sudo apt-get install -o DPkg::Lock::Timeout=60 -y postgresql" 41 | ), 42 | ec2.InitCommand.shell_command( 43 | "sudo sh -c 'echo listen_addresses = '*' >> /etc/postgresql/12/main/postgresql.conf'" 44 | ), 45 | ec2.InitCommand.shell_command( 46 | "sudo sh -c 'echo host all all 0.0.0.0/0 md5 >> /etc/postgresql/12/main/postgresql.conf'" 47 | ), 48 | ec2.InitCommand.shell_command( 49 | "sudo systemctl restart postgresql.service" 50 | ), 51 | ec2.InitCommand.shell_command( 52 | "sudo -u postgres psql -c \"CREATE USER remote WITH PASSWORD 'remote';\"" 53 | ), 54 | ), 55 | user_data=user_data, 56 | ) 57 | 58 | # Add SSM policy so we can remote in without SSH 59 | instance.role.add_managed_policy( 60 | iam.ManagedPolicy.from_aws_managed_policy_name( 61 | "AmazonSSMManagedInstanceCore" 62 | ) 63 | ) 64 | 65 | # Create a test EC2 instance in VPC2 with the same security group as our EMR Serverless application 66 | # We can use this to validate connectivity 67 | test_sg = ec2.SecurityGroup(self, "VPC2_Service", vpc=vpc2) 68 | instance2 = 
ec2.Instance( 69 | self, 70 | "test", 71 | vpc=vpc2, 72 | instance_type=ec2.InstanceType("t2.micro"), 73 | machine_image=ec2.MachineImage.lookup( 74 | name="ubuntu/images/hvm-ssd/ubuntu-focal-20.04-amd64-server-20211129" 75 | ), 76 | security_group=test_sg, 77 | init=ec2.CloudFormationInit.from_elements( 78 | ec2.InitCommand.shell_command( 79 | "sudo apt-get install -o DPkg::Lock::Timeout=60 -y netcat" 80 | ), 81 | ), 82 | user_data=user_data, 83 | ) 84 | instance2.role.add_managed_policy( 85 | iam.ManagedPolicy.from_aws_managed_policy_name( 86 | "AmazonSSMManagedInstanceCore" 87 | ) 88 | ) 89 | 90 | # Peer the two VPCs 91 | fn_vPCPeering_connection = ec2.CfnVPCPeeringConnection( 92 | self, 93 | "MyCfnVPCPeeringConnection", 94 | peer_vpc_id=vpc1.vpc_id, 95 | vpc_id=vpc2.vpc_id, 96 | ) 97 | 98 | # Then create routes between eachof the subnets in each VPC 99 | for idx, subnet in enumerate(vpc2.private_subnets): 100 | ec2.CfnRoute( 101 | self, 102 | f"PeerRoute-{idx}", 103 | route_table_id=subnet.route_table.route_table_id, 104 | destination_cidr_block=vpc1.vpc_cidr_block, 105 | vpc_peering_connection_id=fn_vPCPeering_connection.ref, 106 | ) 107 | 108 | for idx, subnet in enumerate(vpc1.private_subnets): 109 | ec2.CfnRoute( 110 | self, 111 | f"PeerRoute-2-{idx}", 112 | route_table_id=subnet.route_table.route_table_id, 113 | destination_cidr_block=vpc2.vpc_cidr_block, 114 | vpc_peering_connection_id=fn_vPCPeering_connection.ref, 115 | ) 116 | 117 | # Allow postgres from vpc2 to vpc1 118 | svc_sg.add_ingress_rule( 119 | peer=test_sg, 120 | connection=ec2.Port.tcp(5432), 121 | description="Allow Postgres from VPC2", 122 | ) 123 | 124 | # Finally create an EMR Serverless app to test this on with the appropriate subnets and security group 125 | emrs.CfnApplication( 126 | self, 127 | "spark_app", 128 | release_label="emr-6.9.0", 129 | type="SPARK", 130 | name="cdk-spark", 131 | network_configuration=emrs.CfnApplication.NetworkConfigurationProperty( 132 | subnet_ids=vpc2.select_subnets().subnet_ids, 133 | security_group_ids=[test_sg.security_group_id], 134 | ), 135 | ) 136 | -------------------------------------------------------------------------------- /cdk/big-data-stack/stacks/emr.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import ( 2 | core as cdk, 3 | aws_emr as emr, 4 | aws_ec2 as ec2, 5 | aws_iam as iam, 6 | aws_secretsmanager as secrets, 7 | ) 8 | 9 | from stacks.utils import get_or_create_bucket 10 | 11 | 12 | class EMRStack(cdk.Stack): 13 | cluster: emr.CfnCluster 14 | 15 | def __init__( 16 | self, 17 | scope: cdk.Construct, 18 | construct_id: str, 19 | vpc: ec2.IVpc, 20 | name: str, 21 | release_label: str, 22 | rds_secret: secrets.Secret, 23 | rds_connections: ec2.Connections, 24 | log_bucket_name: str = None, 25 | ssh_key_name: str = None, 26 | **kwargs, 27 | ) -> None: 28 | super().__init__(scope, construct_id, **kwargs) 29 | 30 | self.tag_vpc(vpc) 31 | 32 | job_role = self.get_job_role() 33 | service_role = self.get_service_role() 34 | instance_profile = self.create_instance_profile(job_role) 35 | log_bucket = get_or_create_bucket(self, "emr_logs", log_bucket_name) 36 | 37 | # Assign necessary permissions 38 | # EMR needs to be able to PutObject to the log bucket 39 | log_bucket.grant_put(job_role) 40 | 41 | # EMR needs to be able to PassRole to the instance profile role 42 | # https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-role-for-ec2.html#emr-ec2-role-least-privilege 43 | # 
https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-iam-role.html 44 | service_role.add_to_policy( 45 | iam.PolicyStatement( 46 | actions=["iam:PassRole"], 47 | resources=[job_role.role_arn], 48 | conditions={ 49 | "StringEquals": {"iam:PassedToService": "ec2.amazonaws.com"} 50 | }, 51 | ) 52 | ) 53 | 54 | # Database configuration variables 55 | rds_hostname = rds_secret.secret_value_from_json("host").to_string() 56 | rds_port = rds_secret.secret_value_from_json("port").to_string() 57 | rds_dbname = rds_secret.secret_value_from_json("dbname").to_string() 58 | 59 | # Desired subnet for the EMR cluster 60 | emr_subnet = vpc.public_subnets[0] 61 | 62 | self.cluster = emr.CfnCluster( 63 | self, 64 | construct_id, 65 | instances=emr.CfnCluster.JobFlowInstancesConfigProperty( 66 | master_instance_group=emr.CfnCluster.InstanceGroupConfigProperty( 67 | instance_count=1, instance_type="m5.xlarge" 68 | ), 69 | core_instance_group=emr.CfnCluster.InstanceGroupConfigProperty( 70 | instance_count=2, instance_type="m5.xlarge" 71 | ), 72 | ec2_subnet_id=emr_subnet.subnet_id, 73 | ), 74 | name=name, 75 | release_label=release_label, 76 | log_uri=f"s3://{log_bucket.bucket_name}/elasticmapreduce/", 77 | job_flow_role=job_role.role_name, 78 | service_role=service_role.role_name, 79 | applications=[ 80 | emr.CfnCluster.ApplicationProperty(name=n) 81 | for n in [ 82 | "Spark", 83 | "Hive", 84 | "Zeppelin", 85 | "Livy", 86 | "JupyterEnterpriseGateway", 87 | ] 88 | ], 89 | visible_to_all_users=True, # Required for EMR Notebooks 90 | configurations=[ 91 | emr.CfnCluster.ConfigurationProperty( 92 | classification="hive-site", 93 | configuration_properties={ 94 | "javax.jdo.option.ConnectionURL": f"jdbc:mysql://{rds_hostname}:{rds_port}/{rds_dbname}?createDatabaseIfNotExist=true", 95 | "javax.jdo.option.ConnectionDriverName": "org.mariadb.jdbc.Driver", 96 | "javax.jdo.option.ConnectionUserName": rds_secret.secret_value_from_json( 97 | "username" 98 | ).to_string(), 99 | "javax.jdo.option.ConnectionPassword": rds_secret.secret_value_from_json( 100 | "password" 101 | ).to_string(), 102 | }, 103 | ), 104 | ], 105 | tags=[ 106 | cdk.CfnTag( 107 | key="for-use-with-amazon-emr-managed-policies", value="true" 108 | ), 109 | ], 110 | ) 111 | 112 | # Wait for the instance profile to be created 113 | self.cluster.add_depends_on(instance_profile) 114 | 115 | # Allow EMR to connect to the RDS database 116 | self.add_rds_ingres(emr_subnet.ipv4_cidr_block, rds_connections) 117 | 118 | def tag_vpc( 119 | self, 120 | vpc: ec2.IVpc, 121 | ) -> None: 122 | # The VPC requires a Tag to allow EMR to create the relevant security groups 123 | cdk.Tags.of(vpc).add("for-use-with-amazon-emr-managed-policies", "true") 124 | 125 | def add_rds_ingres(self, subnet, conn: ec2.Connections) -> None: 126 | conn.security_groups[0].add_ingress_rule( 127 | peer=ec2.Peer.ipv4(subnet), 128 | connection=ec2.Port.tcp(3306), 129 | description="EMR MySQL Access", 130 | ) 131 | 132 | def get_service_role(self) -> iam.Role: 133 | return iam.Role( 134 | self, 135 | "emr_service_role", 136 | assumed_by=iam.ServicePrincipal("elasticmapreduce.amazonaws.com"), 137 | managed_policies=[ 138 | iam.ManagedPolicy.from_aws_managed_policy_name( 139 | "service-role/AmazonEMRServicePolicy_v2" 140 | ) 141 | ], 142 | ) 143 | 144 | def get_job_role(self) -> iam.Role: 145 | """ 146 | Create a new EC2 instance profile role for EMR instances. 147 | This role allows full read-only access to S3. 
148 | """ 149 | return iam.Role( 150 | self, 151 | "EMRJobRole", 152 | assumed_by=iam.ServicePrincipal("ec2.amazonaws.com"), 153 | managed_policies=[ 154 | iam.ManagedPolicy.from_aws_managed_policy_name( 155 | "AmazonS3ReadOnlyAccess" 156 | ) 157 | ] 158 | ) 159 | 160 | def create_instance_profile(self, job_role: iam.Role) -> iam.CfnInstanceProfile: 161 | return iam.CfnInstanceProfile( 162 | self, 163 | "emr_instance_profile", 164 | instance_profile_name=job_role.role_name, 165 | roles=[job_role.role_name], 166 | ) 167 | 168 | def log_writer_policy(self, bucket: str) -> iam.PolicyStatement: 169 | return iam.PolicyStatement( 170 | effect=iam.Effect.ALLOW, 171 | actions=["s3:PutObject"], 172 | resources=[f"arn:aws:s3:::{bucket}/*"], 173 | ) -------------------------------------------------------------------------------- /emr/eks/videos/custom_images/generate_aqi_map.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import datetime 3 | import io 4 | 5 | import boto3 6 | import geopandas as gpd 7 | from bokeh.io.export import get_screenshot_as_png 8 | from bokeh.io.webdriver import create_chromium_webdriver 9 | from bokeh.models import ColorBar, GeoJSONDataSource, LinearColorMapper 10 | from bokeh.palettes import Reds9 as palette 11 | from bokeh.plotting import figure 12 | from PIL.Image import Image 13 | from pyspark.broadcast import Broadcast 14 | from pyspark.context import SparkContext 15 | from pyspark.sql import SparkSession, dataframe 16 | from pyspark.sql.dataframe import DataFrame 17 | from pyspark.sql.functions import last, udf 18 | from pyspark.sql.types import StringType 19 | from pyspark.sql.window import Window 20 | from shapely.geometry import Point 21 | 22 | STATE_FILE = "file:///usr/local/share/bokeh/cb_2020_us_state_500k.zip" 23 | COUNTY_FILE = "file:///usr/local/share/bokeh/cb_2020_us_county_500k.zip" 24 | EXCLUDED_STATES = ["AK", "HI", "PR", "GU", "VI", "MP", "AS"] 25 | 26 | 27 | def find_first_county_id(longitude: float, latitude: float): 28 | p = Point(longitude, latitude) 29 | for index, geo in bc_county.value.items(): 30 | if geo.intersects(p): 31 | return index 32 | return None 33 | 34 | 35 | find_first_county_id_udf_v2 = udf(find_first_county_id, StringType()) 36 | 37 | 38 | def load_county_data(sc: SparkContext) -> Broadcast: 39 | """ 40 | Loads census.gov polygon data for US counties and broadcasts 41 | a hash of county GEOID to geometry. 
42 | """ 43 | countydf = gpd.read_file(COUNTY_FILE) 44 | return sc.broadcast(dict(zip(countydf["GEOID"], countydf["geometry"]))) 45 | 46 | 47 | def get_latest_aqi_avg_by_county(date) -> DataFrame: 48 | """ 49 | Fetches `date` data from the OpenAQ dataset and performs the following: 50 | - Filters down to US only 51 | - Filters to pm2.5 readings 52 | - Retrieves the most recent reading 53 | - Enriches the dataframe with Census data county GEOID 54 | - Calculates the average reading per county 55 | """ 56 | df = spark.read.json(f"s3://openaq-fetches/realtime-gzipped/{date}/") 57 | 58 | # Filter down to US locations only 59 | usdf = ( 60 | df.where(df.country == "US") 61 | .where(df.parameter == "pm25") 62 | .select("coordinates", "date", "parameter", "unit", "value", "location") 63 | ) 64 | 65 | # Retrieve the most recent pm2.5 reading per county 66 | windowSpec = ( 67 | Window.partitionBy("location") 68 | .orderBy("date.utc") 69 | .rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing) 70 | ) 71 | last_reading_df = ( 72 | usdf.withColumn("last_value", last("value").over(windowSpec)) 73 | .select("coordinates", "last_value") 74 | .distinct() 75 | ) 76 | 77 | # Find the county that this reading is from 78 | countydf = last_reading_df.withColumn( 79 | "GEOID", 80 | find_first_county_id_udf_v2( 81 | last_reading_df.coordinates.longitude, last_reading_df.coordinates.latitude 82 | ), 83 | ).select("GEOID", "last_value") 84 | 85 | # Calculate the average reading per county 86 | pm_avg_by_county = ( 87 | countydf.groupBy("GEOID") 88 | .agg({"last_value": "avg"}) 89 | .withColumnRenamed("avg(last_value)", "avg_value") 90 | ) 91 | 92 | return pm_avg_by_county 93 | 94 | 95 | def generate_map(df: dataframe, title: str) -> Image: 96 | """ 97 | Generate an air quality map for the continental US. 98 | """ 99 | palette_r = tuple(reversed(palette)) 100 | 101 | # Read in county and state geo data from census.gov 102 | county_df = gpd.read_file(COUNTY_FILE).query(f"STUSPS not in {EXCLUDED_STATES}") 103 | state_df = gpd.read_file(STATE_FILE).query(f"STUSPS not in {EXCLUDED_STATES}") 104 | 105 | # Merge in our air quality data 106 | county_aqi_df = county_df.merge(df.toPandas(), on="GEOID") 107 | color_column = "avg_value" 108 | 109 | # Convert to a "proper" Albers projection :) 110 | state_json = state_df.to_crs("ESRI:102003").to_json() 111 | county_json = county_aqi_df.to_crs("ESRI:102003").to_json() 112 | 113 | # Now build the plot! 
114 | p = figure( 115 | title=title, 116 | plot_width=1100, 117 | plot_height=700, 118 | toolbar_location=None, 119 | x_axis_location=None, 120 | y_axis_location=None, 121 | tooltips=[ 122 | ("County", "@NAME"), 123 | ("Air Quality Index", "@avg_value"), 124 | ], 125 | ) 126 | color_mapper = LinearColorMapper(palette=palette_r) 127 | p.grid.grid_line_color = None 128 | p.hover.point_policy = "follow_mouse" 129 | p.patches( 130 | "xs", 131 | "ys", 132 | fill_alpha=0.0, 133 | line_color="black", 134 | line_width=0.5, 135 | source=GeoJSONDataSource(geojson=state_json), 136 | ) 137 | p.patches( 138 | "xs", 139 | "ys", 140 | fill_alpha=0.7, 141 | fill_color={"field": color_column, "transform": color_mapper}, 142 | line_color="black", 143 | line_width=0.5, 144 | source=GeoJSONDataSource(geojson=county_json), 145 | ) 146 | 147 | color_bar = ColorBar(color_mapper=color_mapper, label_standoff=12, width=10) 148 | p.add_layout(color_bar, "right") 149 | 150 | driver = create_chromium_webdriver(["--no-sandbox"]) 151 | return get_screenshot_as_png(p, height=700, width=1100, driver=driver) 152 | 153 | 154 | def upload_image(image: Image, bucket_name, key): 155 | print(f"Uploading image data to s3://{bucket_name}/{key}") 156 | client = boto3.client("s3") 157 | in_mem_file = io.BytesIO() 158 | image.save(in_mem_file, format="png") 159 | in_mem_file.seek(0) 160 | client.put_object(Bucket=bucket_name, Key=key, Body=in_mem_file) 161 | 162 | 163 | def parse_args() -> argparse.ArgumentParser: 164 | parser = argparse.ArgumentParser() 165 | parser.add_argument("bucket", help="The name of the S3 bucket to upload to.") 166 | parser.add_argument( 167 | "prefix", 168 | help="The prefix where the image file (date-latest.png) will be uploaded.", 169 | ) 170 | parser.add_argument( 171 | "--date", 172 | help="The date to create the AQI map for.", 173 | default=f"{datetime.datetime.utcnow().date()}", 174 | ) 175 | return parser.parse_args() 176 | 177 | 178 | if __name__ == "__main__": 179 | """ 180 | Generates an Air Quality Index (AQI) map for the continential US. 181 | By default, it generates the latest AQI readings for the current date. 182 | 183 | Usage: generate_aqi_map 184 | """ 185 | spark = SparkSession.builder.appName("AirQualityMapper").getOrCreate() 186 | bc_county = load_county_data(spark.sparkContext) 187 | 188 | args = parse_args() 189 | date = args.date 190 | bucket = args.bucket 191 | key = f"{args.prefix}{date}-latest.png" 192 | 193 | pm_reading_by_county = get_latest_aqi_avg_by_county(date) 194 | image = generate_map(pm_reading_by_county, f"US PM2.5 by county for {date}") 195 | upload_image(image, bucket, key) 196 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Creative Commons Legal Code 2 | 3 | CC0 1.0 Universal 4 | 5 | CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE 6 | LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN 7 | ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS 8 | INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES 9 | REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS 10 | PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM 11 | THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED 12 | HEREUNDER. 
13 | 14 | Statement of Purpose 15 | 16 | The laws of most jurisdictions throughout the world automatically confer 17 | exclusive Copyright and Related Rights (defined below) upon the creator 18 | and subsequent owner(s) (each and all, an "owner") of an original work of 19 | authorship and/or a database (each, a "Work"). 20 | 21 | Certain owners wish to permanently relinquish those rights to a Work for 22 | the purpose of contributing to a commons of creative, cultural and 23 | scientific works ("Commons") that the public can reliably and without fear 24 | of later claims of infringement build upon, modify, incorporate in other 25 | works, reuse and redistribute as freely as possible in any form whatsoever 26 | and for any purposes, including without limitation commercial purposes. 27 | These owners may contribute to the Commons to promote the ideal of a free 28 | culture and the further production of creative, cultural and scientific 29 | works, or to gain reputation or greater distribution for their Work in 30 | part through the use and efforts of others. 31 | 32 | For these and/or other purposes and motivations, and without any 33 | expectation of additional consideration or compensation, the person 34 | associating CC0 with a Work (the "Affirmer"), to the extent that he or she 35 | is an owner of Copyright and Related Rights in the Work, voluntarily 36 | elects to apply CC0 to the Work and publicly distribute the Work under its 37 | terms, with knowledge of his or her Copyright and Related Rights in the 38 | Work and the meaning and intended legal effect of CC0 on those rights. 39 | 40 | 1. Copyright and Related Rights. A Work made available under CC0 may be 41 | protected by copyright and related or neighboring rights ("Copyright and 42 | Related Rights"). Copyright and Related Rights include, but are not 43 | limited to, the following: 44 | 45 | i. the right to reproduce, adapt, distribute, perform, display, 46 | communicate, and translate a Work; 47 | ii. moral rights retained by the original author(s) and/or performer(s); 48 | iii. publicity and privacy rights pertaining to a person's image or 49 | likeness depicted in a Work; 50 | iv. rights protecting against unfair competition in regards to a Work, 51 | subject to the limitations in paragraph 4(a), below; 52 | v. rights protecting the extraction, dissemination, use and reuse of data 53 | in a Work; 54 | vi. database rights (such as those arising under Directive 96/9/EC of the 55 | European Parliament and of the Council of 11 March 1996 on the legal 56 | protection of databases, and under any national implementation 57 | thereof, including any amended or successor version of such 58 | directive); and 59 | vii. other similar, equivalent or corresponding rights throughout the 60 | world based on applicable law or treaty, and any national 61 | implementations thereof. 62 | 63 | 2. Waiver. 
To the greatest extent permitted by, but not in contravention 64 | of, applicable law, Affirmer hereby overtly, fully, permanently, 65 | irrevocably and unconditionally waives, abandons, and surrenders all of 66 | Affirmer's Copyright and Related Rights and associated claims and causes 67 | of action, whether now known or unknown (including existing as well as 68 | future claims and causes of action), in the Work (i) in all territories 69 | worldwide, (ii) for the maximum duration provided by applicable law or 70 | treaty (including future time extensions), (iii) in any current or future 71 | medium and for any number of copies, and (iv) for any purpose whatsoever, 72 | including without limitation commercial, advertising or promotional 73 | purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each 74 | member of the public at large and to the detriment of Affirmer's heirs and 75 | successors, fully intending that such Waiver shall not be subject to 76 | revocation, rescission, cancellation, termination, or any other legal or 77 | equitable action to disrupt the quiet enjoyment of the Work by the public 78 | as contemplated by Affirmer's express Statement of Purpose. 79 | 80 | 3. Public License Fallback. Should any part of the Waiver for any reason 81 | be judged legally invalid or ineffective under applicable law, then the 82 | Waiver shall be preserved to the maximum extent permitted taking into 83 | account Affirmer's express Statement of Purpose. In addition, to the 84 | extent the Waiver is so judged Affirmer hereby grants to each affected 85 | person a royalty-free, non transferable, non sublicensable, non exclusive, 86 | irrevocable and unconditional license to exercise Affirmer's Copyright and 87 | Related Rights in the Work (i) in all territories worldwide, (ii) for the 88 | maximum duration provided by applicable law or treaty (including future 89 | time extensions), (iii) in any current or future medium and for any number 90 | of copies, and (iv) for any purpose whatsoever, including without 91 | limitation commercial, advertising or promotional purposes (the 92 | "License"). The License shall be deemed effective as of the date CC0 was 93 | applied by Affirmer to the Work. Should any part of the License for any 94 | reason be judged legally invalid or ineffective under applicable law, such 95 | partial invalidity or ineffectiveness shall not invalidate the remainder 96 | of the License, and in such case Affirmer hereby affirms that he or she 97 | will not (i) exercise any of his or her remaining Copyright and Related 98 | Rights in the Work or (ii) assert any associated claims and causes of 99 | action with respect to the Work, in either case contrary to Affirmer's 100 | express Statement of Purpose. 101 | 102 | 4. Limitations and Disclaimers. 103 | 104 | a. No trademark or patent rights held by Affirmer are waived, abandoned, 105 | surrendered, licensed or otherwise affected by this document. 106 | b. Affirmer offers the Work as-is and makes no representations or 107 | warranties of any kind concerning the Work, express, implied, 108 | statutory or otherwise, including without limitation warranties of 109 | title, merchantability, fitness for a particular purpose, non 110 | infringement, or the absence of latent or other defects, accuracy, or 111 | the present or absence of errors, whether or not discoverable, all to 112 | the greatest extent permissible under applicable law. 113 | c. 
Affirmer disclaims responsibility for clearing rights of other persons 114 | that may apply to the Work or any use thereof, including without 115 | limitation any person's Copyright and Related Rights in the Work. 116 | Further, Affirmer disclaims responsibility for obtaining any necessary 117 | consents, permissions or other rights required for any use of the 118 | Work. 119 | d. Affirmer understands and acknowledges that Creative Commons is not a 120 | party to this document and has no duty or obligation with respect to 121 | this CC0 or use of the Work. 122 | -------------------------------------------------------------------------------- /emr/airflow/mwaa_stack/mwaa/mwaa_stack.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import core as cdk 2 | 3 | import aws_cdk.aws_mwaa as mwaa 4 | 5 | from aws_cdk import ( 6 | core as cdk, 7 | aws_ec2 as ec2, 8 | aws_mwaa as mwaa, 9 | aws_s3 as s3, 10 | aws_s3_deployment as s3d, 11 | aws_iam as iam, 12 | ) 13 | 14 | 15 | class MwaaStack(cdk.Stack): 16 | def __init__(self, scope: cdk.Construct, construct_id: str, **kwargs) -> None: 17 | super().__init__(scope, construct_id, **kwargs) 18 | 19 | # We'll create a VPC just for this 20 | vpc = ec2.Vpc(self, "mwaa-vpc", max_azs=2) 21 | 22 | # We need a bucket for assets 23 | bucket = s3.Bucket( 24 | self, 25 | "mwaa-bucket", 26 | versioned=True, 27 | auto_delete_objects=True, 28 | removal_policy=cdk.RemovalPolicy.DESTROY, 29 | block_public_access=s3.BlockPublicAccess.BLOCK_ALL, 30 | ) 31 | files = s3d.BucketDeployment( 32 | self, 33 | "mwaa-assets", 34 | sources=[s3d.Source.asset("./assets")], 35 | destination_bucket=bucket, 36 | ) 37 | 38 | # And a service role with additional EMR permissions 39 | # See https://docs.aws.amazon.com/mwaa/latest/userguide/mwaa-create-role.html 40 | mwaa_service_role = iam.Role( 41 | self, 42 | "mwaa-service-role", 43 | assumed_by=iam.CompositePrincipal( 44 | iam.ServicePrincipal("airflow.amazonaws.com"), 45 | iam.ServicePrincipal("airflow-env.amazonaws.com"), 46 | ), 47 | inline_policies={ 48 | "CDKmwaaPolicyDocument": self.mwaa_policy_document( 49 | "dacort-airflow", bucket.bucket_arn 50 | ) 51 | }, 52 | path="/service-role/", 53 | ) 54 | 55 | # And security group 56 | security_group = ec2.SecurityGroup( 57 | self, id="mwaa-sg", vpc=vpc, security_group_name="mwaa-sg" 58 | ) 59 | security_group.connections.allow_internally(ec2.Port.all_traffic(), "MWAA") 60 | 61 | # Enable logging on everything 62 | logging_configuration = mwaa.CfnEnvironment.LoggingConfigurationProperty( 63 | task_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 64 | enabled=True, log_level="INFO" 65 | ), 66 | worker_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 67 | enabled=True, log_level="INFO" 68 | ), 69 | scheduler_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 70 | enabled=True, log_level="INFO" 71 | ), 72 | dag_processing_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 73 | enabled=True, log_level="INFO" 74 | ), 75 | webserver_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 76 | enabled=True, log_level="INFO" 77 | ), 78 | ) 79 | 80 | # Create our MWAA 81 | subnets = [subnet.subnet_id for subnet in vpc.private_subnets] 82 | airflow = mwaa.CfnEnvironment( 83 | self, 84 | "airflow-v2", 85 | name="dacort-airflow", 86 | airflow_version="2.0.2", 87 | dag_s3_path=f"dags/", 88 | source_bucket_arn=bucket.bucket_arn, 89 | execution_role_arn=mwaa_service_role.role_arn, 90 | 
requirements_s3_path="requirements.txt", 91 | webserver_access_mode="PUBLIC_ONLY", 92 | environment_class="mw1.small", 93 | network_configuration=mwaa.CfnEnvironment.NetworkConfigurationProperty( 94 | subnet_ids=subnets, 95 | security_group_ids=[security_group.security_group_id], 96 | ), 97 | logging_configuration=logging_configuration, 98 | ) 99 | airflow.node.add_dependency(files) 100 | 101 | # Register a couple outputs 102 | cdk.CfnOutput(self, "mwaa_bucket", value=bucket.bucket_name) 103 | cdk.CfnOutput(self, "mwaa_url", value=f"https://{airflow.attr_webserver_url}") 104 | 105 | def mwaa_policy_document(self, mwaa_env_name: str, mwaa_bucket_arn: str): 106 | return iam.PolicyDocument( 107 | statements=[ 108 | iam.PolicyStatement( 109 | actions=["airflow:PublishMetrics"], 110 | effect=iam.Effect.ALLOW, 111 | resources=[ 112 | f"arn:aws:airflow:{self.region}:{self.account}:environment/{mwaa_env_name}" 113 | ], 114 | ), 115 | iam.PolicyStatement( 116 | actions=["s3:ListAllMyBuckets"], 117 | effect=iam.Effect.DENY, 118 | resources=[f"{mwaa_bucket_arn}/*", f"{mwaa_bucket_arn}"], 119 | ), 120 | iam.PolicyStatement( 121 | actions=["s3:*"], 122 | effect=iam.Effect.ALLOW, 123 | resources=[f"{mwaa_bucket_arn}/*", f"{mwaa_bucket_arn}"], 124 | ), 125 | iam.PolicyStatement( 126 | actions=[ 127 | "logs:CreateLogStream", 128 | "logs:CreateLogGroup", 129 | "logs:PutLogEvents", 130 | "logs:GetLogEvents", 131 | "logs:GetLogRecord", 132 | "logs:GetLogGroupFields", 133 | "logs:GetQueryResults", 134 | ], 135 | effect=iam.Effect.ALLOW, 136 | resources=[ 137 | f"arn:aws:logs:{self.region}:{self.account}:log-group:airflow-{mwaa_env_name}-*" 138 | ], 139 | ), 140 | iam.PolicyStatement( 141 | actions=["logs:DescribeLogGroups"], 142 | effect=iam.Effect.ALLOW, 143 | resources=["*"], 144 | ), 145 | iam.PolicyStatement( 146 | actions=[ 147 | "sqs:ChangeMessageVisibility", 148 | "sqs:DeleteMessage", 149 | "sqs:GetQueueAttributes", 150 | "sqs:GetQueueUrl", 151 | "sqs:ReceiveMessage", 152 | "sqs:SendMessage", 153 | ], 154 | effect=iam.Effect.ALLOW, 155 | resources=[f"arn:aws:sqs:{self.region}:*:airflow-celery-*"], 156 | ), 157 | iam.PolicyStatement( 158 | actions=[ 159 | "kms:Decrypt", 160 | "kms:DescribeKey", 161 | "kms:GenerateDataKey*", 162 | "kms:Encrypt", 163 | ], 164 | effect=iam.Effect.ALLOW, 165 | resources=["*"], 166 | conditions={ 167 | "StringEquals": { 168 | "kms:ViaService": [ 169 | f"sqs.{self.region}.amazonaws.com", 170 | f"s3.{self.region}.amazonaws.com", 171 | ] 172 | } 173 | }, 174 | ), 175 | iam.PolicyStatement( 176 | actions=[ 177 | "emr-containers:StartJobRun", 178 | "emr-containers:DescribeJobRun", 179 | "emr-containers:CancelJobRun", 180 | ], 181 | effect=iam.Effect.ALLOW, 182 | resources=["*"], 183 | ), 184 | iam.PolicyStatement( 185 | actions=[ 186 | "elasticmapreduce:RunJobFlow", 187 | "elasticmapreduce:DescribeStep", 188 | "elasticmapreduce:DescribeCluster", 189 | ], 190 | effect=iam.Effect.ALLOW, 191 | resources=["*"], 192 | ), 193 | iam.PolicyStatement( 194 | actions=["iam:PassRole"], 195 | effect=iam.Effect.ALLOW, 196 | resources=[ 197 | f"arn:aws:iam::{self.account}:role/EMR_DemoRole", 198 | f"arn:aws:iam::{self.account}:role/EMR_EC2_DemoRole", 199 | f"arn:aws:iam::{self.account}:role/EMR_EC2_DefaultRole", 200 | f"arn:aws:iam::{self.account}:role/EMR_DefaultRole", 201 | ], 202 | ), 203 | ] 204 | ) 205 | -------------------------------------------------------------------------------- /spark/local-k8s/README.md: 
-------------------------------------------------------------------------------- 1 | # Spark on Local Kubernetes 2 | 3 | This is a demo of how to get Spark up and running with a local (KIND) Kubernetes environment. 4 | 5 | 6 | ## Pre-requisites 7 | 8 | - [KIND](https://kind.sigs.k8s.io/) 9 | - Docker, `kubectl`, `helm` 10 | 11 | ## Install and start KIND 12 | 13 | ```bash 14 | kind create cluster --config kind-config.yaml 15 | kubectl cluster-info --context kind-kind 16 | ``` 17 | 18 | ``` 19 | ❯ kubectl cluster-info --context kind-kind 20 | Kubernetes control plane is running at https://127.0.0.1:61563 21 | CoreDNS is running at https://127.0.0.1:61563/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy 22 | 23 | To further debug and diagnose cluster problems, use 'kubectl cluster-info dump'. 24 | ``` 25 | 26 | By default, `kind` adds the cluster info to your `~/.kube/config` file. If you need to get it later (maybe your config got updated), you can always use something like this: 27 | 28 | ```bash 29 | kind get kubeconfig > ~/.kube/kind-config 30 | KUBECONFIG=~/.kube/kind-config kubectl get pods 31 | ``` 32 | 33 | ### (Optional) Install Official Apache Spark K8s Operator 34 | 35 | ```bash 36 | helm install spark-kubernetes-operator \ 37 | https://nightlies.apache.org/spark/charts/spark-kubernetes-operator-0.1.0-SNAPSHOT.tgz 38 | ``` 39 | 40 | #### uninstall process 41 | 42 | ```bash 43 | helm uninstall spark-kubernetes-operator 44 | kubectl delete crd sparkapplications.spark.apache.org 45 | kubectl delete crd sparkclusters.spark.apache.org 46 | ``` 47 | 48 | ## Verify Installs 49 | 50 | ```bash 51 | kubectl --context=kind-kind get pods --all-namespaces 52 | ``` 53 | 54 | _remaining commands assume your `kubectl` context is set to `kind-kind`_ 55 | 56 | ``` 57 | ❯ kubectl --context=kind-kind get pods --all-namespaces 58 | NAMESPACE NAME READY STATUS RESTARTS AGE 59 | kube-system coredns-7db6d8ff4d-7x252 1/1 Running 0 2m36s 60 | kube-system coredns-7db6d8ff4d-kjk6v 1/1 Running 0 2m36s 61 | kube-system etcd-kind-control-plane 1/1 Running 0 2m52s 62 | kube-system kindnet-62kqj 1/1 Running 0 2m36s 63 | kube-system kube-apiserver-kind-control-plane 1/1 Running 0 2m51s 64 | kube-system kube-controller-manager-kind-control-plane 1/1 Running 0 2m51s 65 | kube-system kube-proxy-ggk2n 1/1 Running 0 2m36s 66 | kube-system kube-scheduler-kind-control-plane 1/1 Running 0 2m51s 67 | local-path-storage local-path-provisioner-988d74bc-dkc6k 1/1 Running 0 2m36s 68 | spark-operator spark-operator-7b7b54cf75-8p9jb 1/1 Running 0 25s 69 | ``` 70 | 71 | 72 | ## Run a Spark job 73 | 74 | ```bash 75 | kubectl --context=kind-kind create -f - <<-EOF 76 | apiVersion: spark.apache.org/v1alpha1 77 | kind: SparkApplication 78 | metadata: 79 | name: pi-python 80 | spec: 81 | pyFiles: "local:///opt/spark/examples/src/main/python/pi.py" 82 | sparkConf: 83 | spark.dynamicAllocation.enabled: "true" 84 | spark.dynamicAllocation.shuffleTracking.enabled: "true" 85 | spark.dynamicAllocation.maxExecutors: "3" 86 | spark.log.structuredLogging.enabled: "false" 87 | spark.kubernetes.authenticate.driver.serviceAccountName: "spark" 88 | spark.kubernetes.container.image: "apache/spark:4.0.0-preview2" 89 | applicationTolerations: 90 | resourceRetainPolicy: OnFailure 91 | runtimeVersions: 92 | sparkVersion: "4.0.0-preview2" 93 | EOF 94 | ``` 95 | 96 | Once the container image downloads and the container starts, you can watch the logs with: 97 | 98 | ```bash 99 | kubectl logs -f pi-python-0-driver 100 | ``` 101 | 102 | To delete 
the app, use: 103 | 104 | ```bash 105 | kubectl delete sparkapp/pi-python 106 | ``` 107 | 108 | ## Let's try S3 tables 109 | 110 | Per the docs on [S3 tables with Apache Spark](https://docs.aws.amazon.com/AmazonS3/latest/userguide/s3-tables-integrating-open-source-spark.html). 111 | 112 | - Create a table bucket in a region near me 113 | 114 | ```bash 115 | aws s3tables create-table-bucket \ 116 | --region us-west-2 \ 117 | --name dacort-berg 118 | ``` 119 | 120 | ```json 121 | { 122 | "arn": "arn:aws:s3tables:us-west-2::bucket/dacort-berg" 123 | } 124 | ``` 125 | 126 | - Spin up a Spark SQL shell 127 | 128 | First, we create a persistent pod we can exec into. 129 | 130 | ```bash 131 | kubectl apply -f spark-shell-pod.yaml 132 | ``` 133 | 134 | Then start up Spark SQL 135 | 136 | _note that we assume you already have your AWS CLI setup and can export credentials_ 137 | 138 | ```bash 139 | export AWS_ACCOUNT_ID=$(aws sts get-caller-identity --query Account --output text) 140 | export TABLE_BUCKET_NAME=dacort-berg 141 | 142 | kubectl exec -it spark-shell-pod -- /bin/bash -c "export AWS_REGION=us-west-2;$(aws configure export-credentials --format env | tr '\n' ';') \ 143 | /opt/spark/bin/spark-sql \ 144 | --packages org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.6.1,software.amazon.awssdk:s3tables:2.29.26,software.amazon.awssdk:s3:2.29.26,software.amazon.awssdk:sts:2.29.26,software.amazon.awssdk:kms:2.29.26,software.amazon.awssdk:glue:2.29.26,software.amazon.awssdk:dynamodb:2.29.26,software.amazon.s3tables:s3-tables-catalog-for-iceberg-runtime:0.1.3 \ 145 | --conf spark.jars.ivy=/opt/spark/work-dir/.ivy2 \ 146 | --conf spark.sql.catalog.s3tablesbucket=org.apache.iceberg.spark.SparkCatalog \ 147 | --conf spark.sql.catalog.s3tablesbucket.catalog-impl=software.amazon.s3tables.iceberg.S3TablesCatalog \ 148 | --conf spark.sql.catalog.s3tablesbucket.warehouse=arn:aws:s3tables:us-west-2:${AWS_ACCOUNT_ID}:bucket/${TABLE_BUCKET_NAME} \ 149 | --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions" 150 | ``` 151 | 152 | - Create a new S3 Table 153 | 154 | ```sql 155 | CREATE NAMESPACE IF NOT EXISTS s3tablesbucket.default; 156 | 157 | CREATE TABLE IF NOT EXISTS s3tablesbucket.default.`demo` 158 | ( id INT, name STRING, value INT ) 159 | USING iceberg; 160 | 161 | INSERT INTO s3tablesbucket.default.demo VALUES (1, 'damon', 33), (2, 'dad', 34); 162 | 163 | SELECT * FROM s3tablesbucket.default.demo; 164 | ``` 165 | 166 | ``` 167 | spark-sql (default)> SELECT * FROM s3tablesbucket.default.demo; 168 | 1 damon 33 169 | 2 dad 34 170 | Time taken: 3.455 seconds, Fetched 2 row(s) 171 | ``` 172 | 173 | ## Reading S3 Tables with other query engines (DuckDB) 174 | 175 | The neat(?) thing about S3 Tables is that it's just Iceberg behind the scenes. 
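
As a side note, you can also poke at the table bucket from Python. A minimal boto3 sketch — assuming a recent boto3 release that ships the `s3tables` client, and the same account, region, and bucket name used above:

```python
import boto3

# Mirror the table bucket created earlier; adjust the region/name if yours differ.
account_id = boto3.client("sts").get_caller_identity()["Account"]
bucket_arn = f"arn:aws:s3tables:us-west-2:{account_id}:bucket/dacort-berg"

s3tables = boto3.client("s3tables", region_name="us-west-2")

# List the namespaces and tables that the Spark SQL session created.
for ns in s3tables.list_namespaces(tableBucketARN=bucket_arn).get("namespaces", []):
    print("namespace:", ns.get("namespace"))

for tbl in s3tables.list_tables(tableBucketARN=bucket_arn, namespace="default").get("tables", []):
    print("table:", tbl.get("name"))
```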
176 | 177 | So if you use `aws s3tables get-table`, you can find the metadata location: 178 | 179 | ```bash 180 | aws s3tables get-table --table-bucket-arn arn:aws:s3tables:us-west-2:${AWS_ACCOUNT_ID}:bucket/${TABLE_BUCKET_NAME} --namespace default --name demo 181 | ``` 182 | 183 | ```json 184 | { 185 | "name": "demo", 186 | "type": "customer", 187 | "tableARN": "arn:aws:s3tables:us-west-2::bucket/dacort-berg/table/e0b502d9-5de1-46a4-8633-412b78401be3", 188 | "namespace": [ 189 | "default" 190 | ], 191 | "versionToken": "", 192 | "metadataLocation": "s3://502d9-5de1-46a4---table-s3/metadata/00001-e76a727d-8a4d-4883-95c7-dea809f2a4cb.metadata.json", 193 | "warehouseLocation": "s3://502d9-5de1-46a4---table-s3", 194 | "createdAt": "2024-12-18T21:20:39.347151+00:00", 195 | "createdBy": "", 196 | "modifiedAt": "2024-12-18T21:25:22.327612+00:00", 197 | "ownerAccountId": "", 198 | "format": "ICEBERG" 199 | } 200 | ``` 201 | 202 | If you take the `metadataLocation` from the response and use that in DuckDB (with the `iceberg`, `httpfs` extensions installed and an [S3 secret created](https://duckdb.org/docs/configuration/secrets_manager.html#temporary-secrets))...it seems to work! 203 | 204 | _using duckdb v1.1.3 19864453f7_ 205 | 206 | ```sql 207 | -- Install/load Iceberg and https extensions 208 | -- Set up S3 access to my specific region 209 | INSTALL iceberg; 210 | LOAD iceberg; 211 | INSTALL https; 212 | LOAD https; 213 | CREATE SECRET secret1 ( 214 | TYPE S3, 215 | PROVIDER CREDENTIAL_CHAIN, 216 | ENDPOINT 's3.us-west-2.amazonaws.com' 217 | ); 218 | 219 | -- Query using the metadat file from above! 220 | SELECT count(*) 221 | FROM iceberg_scan('s3://502d9-5de1-46a4---table-s3/metadata/00001-e76a727d-8a4d-4883-95c7-dea809f2a4cb.metadata.json'); 222 | ``` 223 | 224 | ``` 225 | ┌──────────────┐ 226 | │ count_star() │ 227 | │ int64 │ 228 | ├──────────────┤ 229 | │ 2 │ 230 | └──────────────┘ 231 | ``` 232 | 233 | ```sql 234 | SELECT * FROM iceberg_scan('s3://502d9-5de1-46a4---table-s3/metadata/00001-e76a727d-8a4d-4883-95c7-dea809f2a4cb.metadata.json'); 235 | ``` 236 | 237 | ``` 238 | ┌───────┬─────────┬───────┐ 239 | │ id │ name │ value │ 240 | │ int32 │ varchar │ int32 │ 241 | ├───────┼─────────┼───────┤ 242 | │ 1 │ damon │ 33 │ 243 | │ 2 │ dad │ 34 │ 244 | └───────┴─────────┴───────┘ 245 | ``` 246 | 247 | 🤯 248 | 249 | - What happens if I insert more data? 250 | 251 | The `metadataLocation` gets updated and we can, of course, query each different version of the table. 
🎉 252 | -------------------------------------------------------------------------------- /emr/eks/README.md: -------------------------------------------------------------------------------- 1 | # EMR on EKS Demo 2 | 3 | ## Spin up an EMR on EC2 Cluster 4 | 5 | Typically with EMR you figure out the following: 6 | 7 | - The EMR version you want to run 8 | - The VPC/Subnet to run your cluster in 9 | - The SSH keypair to use 10 | - The S3 bucket to send your cluster logs to 11 | - The different applications to run on the cluster 12 | - The instance types, count, and configuration 13 | 14 | The command below spins up a cluster in `us-east-1` with Spark on EMR 5.32.0 15 | 16 | ```shell 17 | VERSION=emr-5.32.0 18 | KEYPAIR= 19 | SUBNET_ID= 20 | LOG_BUCKET= 21 | 22 | aws emr create-cluster --applications Name=Spark Name=Zeppelin \ 23 | --ec2-attributes '{"KeyName":"'${KEYPAIR}'","InstanceProfile":"EMR_EC2_DefaultRole","SubnetId":"'${SUBNET_ID}'"}' \ 24 | --service-role EMR_DefaultRole \ 25 | --enable-debugging \ 26 | --release-label ${VERSION} \ 27 | --log-uri "s3n://${LOG_BUCKET}/elasticmapreduce/" \ 28 | --name 'dacort-spark' \ 29 | --instance-groups '[{"InstanceCount":1,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"MASTER","InstanceType":"m5.xlarge","Name":"Master Instance Group"},{"InstanceCount":2,"EbsConfiguration":{"EbsBlockDeviceConfigs":[{"VolumeSpecification":{"SizeInGB":32,"VolumeType":"gp2"},"VolumesPerInstance":2}]},"InstanceGroupType":"CORE","InstanceType":"m5.xlarge","Name":"Core Instance Group"}]' \ 30 | --configurations '[{"Classification":"spark","Properties":{}}]' \ 31 | --scale-down-behavior TERMINATE_AT_TASK_COMPLETION \ 32 | --region us-east-1 33 | ``` 34 | 35 | So...let's take a quick look at some data and see what it takes to run analysis on EMR on EKS. 👇 36 | 37 | ## Explore a dataset 38 | 39 | Idea: "What was the max wind speed in Seattle in 2021?" or "Average hourly rainfall when there was rain" 40 | 41 | We can use the [NOAA Integrated Surface Database](https://registry.opendata.aws/noaa-isd/) hourly data in CSV format. 42 | 43 | ```shell 44 | aws s3 ls s3://noaa-global-hourly-pds/2021/ --no-sign-request 45 | ``` 46 | 47 | See the code in [`windy_city.py`](./windy_city.py) for a full example. 48 | 49 |
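
If you'd rather submit it programmatically, you can also run `windy_city.py` on the cluster above as an EMR step from Python — a rough boto3 sketch (the cluster ID and bucket here are placeholders you'd swap for your own):

```python
import boto3

emr = boto3.client("emr", region_name="us-east-1")

# Placeholders: the cluster ID returned by create-cluster and your own code bucket.
cluster_id = "j-XXXXXXXXXXXXX"
script_path = "s3://YOUR_BUCKET/code/pyspark/windy_city.py"

# Run the script as a step via command-runner, the same way the console does.
response = emr.add_job_flow_steps(
    JobFlowId=cluster_id,
    Steps=[
        {
            "Name": "windy-city",
            "ActionOnFailure": "CONTINUE",
            "HadoopJarStep": {
                "Jar": "command-runner.jar",
                "Args": ["spark-submit", "--deploy-mode", "cluster", script_path],
            },
        }
    ],
)
print(response["StepIds"])
```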
50 | Here is exploratory code we can use in a pyspark shell once we've SSH'ed into our EMR cluster. 51 | 52 | ```python 53 | from pyspark.sql import functions as F 54 | from pyspark.sql.types import DoubleType 55 | 56 | # Reads the 2021 ISD data 57 | df = spark.read.format("csv") \ 58 | .option("header", "true")\ 59 | .option("inferSchema", "true") \ 60 | .load("s3://noaa-global-hourly-pds/2021/") 61 | 62 | # Shows a sample row from Seattle 63 | df \ 64 | .withColumn('LATITUDE', df.LATITUDE.cast(DoubleType())) \ 65 | .withColumn('LONGITUDE', df.LONGITUDE.cast(DoubleType())) \ 66 | .filter(df.LATITUDE >= 47.41).filter(df.LATITUDE <= 47.49) \ 67 | .filter(df.LONGITUDE >= -122.48).filter(df.LONGITUDE <= -122.16) \ 68 | .take(1) 69 | 70 | 71 | # See if we can split the wind speed properly 72 | seadf = df \ 73 | .withColumn('LATITUDE', df.LATITUDE.cast(DoubleType())) \ 74 | .withColumn('LONGITUDE', df.LONGITUDE.cast(DoubleType())) \ 75 | .filter(df.LATITUDE >= 47.41).filter(df.LATITUDE <= 47.49) \ 76 | .filter(df.LONGITUDE >= -122.48).filter(df.LONGITUDE <= -122.16) 77 | 78 | seadf.select("DATE", "NAME", "WND") \ 79 | .withColumn("windSpeed", F.split(df.WND, ",")[3].cast(DoubleType())/10 ) \ 80 | .take(10) 81 | # [Row(DATE='2021-01-01T00:00:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='200,1,N,0046,1', windSpeed=4.6), Row(DATE='2021-01-01T00:17:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='200,5,N,0041,5', windSpeed=4.1), Row(DATE='2021-01-01T00:37:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='170,5,N,0031,5', windSpeed=3.1), Row(DATE='2021-01-01T00:53:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='190,5,N,0041,5', windSpeed=4.1), Row(DATE='2021-01-01T01:53:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='190,5,N,0051,5', windSpeed=5.1), Row(DATE='2021-01-01T02:39:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='180,5,N,0041,5', windSpeed=4.1), Row(DATE='2021-01-01T02:53:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='180,5,N,0041,5', windSpeed=4.1), Row(DATE='2021-01-01T03:32:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='190,5,N,0036,5', windSpeed=3.6), Row(DATE='2021-01-01T03:53:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='190,5,N,0041,5', windSpeed=4.1), Row(DATE='2021-01-01T04:49:00', NAME='SEATTLE TACOMA AIRPORT, WA US', WND='180,5,N,0031,5', windSpeed=3.1)] 82 | 83 | # OK, now create our slim dataframe and get top wind speed per day 84 | wind_date_df = seadf.select("DATE", "NAME", "WND") \ 85 | .withColumn("windSpeed", F.split(df.WND, ",")[3].cast(DoubleType())/10 ) \ 86 | .withColumn("ymd", F.split(df.DATE, "T")[0]) \ 87 | .filter(seadf.windSpeed != 999.9) 88 | 89 | wind_date_df.groupBy("ymd") \ 90 | .agg({'windSpeed':'max'}) \ 91 | .orderBy("ymd") \ 92 | .show() 93 | ``` 94 | 95 |
96 | 97 | And the output... 98 | 99 | ``` 100 | >>> wind_date_df.groupBy("ymd").agg({'windSpeed':'max'}).orderBy("ymd").show() 101 | +----------+--------------+ 102 | | ymd|max(windSpeed)| 103 | +----------+--------------+ 104 | |2021-01-01| 9.3| 105 | |2021-01-02| 10.3| 106 | |2021-01-03| 10.3| 107 | |2021-01-04| 8.2| 108 | |2021-01-05| 9.8| 109 | |2021-01-06| 8.2| 110 | |2021-01-07| 4.6| 111 | |2021-01-08| 8.8| 112 | |2021-01-09| 6.2| 113 | |2021-01-10| 7.2| 114 | |2021-01-11| 10.3| 115 | |2021-01-12| 6.7| 116 | |2021-01-13| 13.9| 117 | +----------+--------------+ 118 | ``` 119 | 120 | 👏 121 | 122 | ## EMR on EKS 123 | 124 | ### EKS Setup 125 | 126 | First we need to have an EKS cluster already running with the EMR namespace configured. If you don't already have an EKS cluster running, you'll likely need Admin access to your account to get this all set up. 127 | 128 | You can follow the [EMR on EKS Getting started guide](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/getting-started.html). 129 | 130 | A couple notes: 131 | 132 | - When creating the [job execution role](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/creating-job-execution-role.html), select `Another AWS account` as the trusted entity and use your Account ID. 133 | - You will need to create a Node Group for Fargate profile for the namespace you created in EMR for the jobs to run. 134 | 135 | As an example for #2 above, I created an EKS Fargate-only cluster and had to run the following command to create the desired profile: 136 | 137 | ```shell 138 | eksctl create fargateprofile \ 139 | --cluster \ 140 | --name emr-profile \ 141 | --namespace 142 | ``` 143 | 144 | ## EMR Setup 145 | 146 | Now that you've got a running EKS cluster(!), configured your execution roles and created an EMR Virtual Cluster that's mapped to EKS 😅 go ahead and upload your code to S3 and run a job! 147 | 148 | ```shell 149 | S3_BUCKET= 150 | aws s3 cp windy_city.py s3://${S3_BUCKET}/code/pyspark/windy_city.py 151 | ``` 152 | 153 | Fill in your EMR on EKS Cluster ID and Execution role. I've configured this job to log to S3, but you can also use CloudFront as noted in [EMR EKS Job Parameters](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/emr-eks-jobs-CLI.html#emr-eks-jobs-parameters). Just make sure your execution role has the right permissions. 154 | 155 | ```shell 156 | S3_BUCKET= 157 | EMR_EKS_CLUSTER_ID= 158 | EMR_EKS_EXECUTION_ROLE=arn:aws:iam:::role/ 159 | 160 | aws emr-containers start-job-run \ 161 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 162 | --name dacort-windycity \ 163 | --execution-role-arn ${EMR_EKS_EXECUTION_ROLE} \ 164 | --release-label emr-5.32.0-latest \ 165 | --job-driver '{ 166 | "sparkSubmitJobDriver": { 167 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/windy_city.py", 168 | "sparkSubmitParameters": "--conf spark.executor.instances=2 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1" 169 | } 170 | }' \ 171 | --configuration-overrides '{ 172 | "monitoringConfiguration": { 173 | "s3MonitoringConfiguration": { "logUri": "s3://'${S3_BUCKET}'/emr-eks-logs/windy_city" } 174 | } 175 | }' 176 | ``` 177 | 178 | That command should spin up your Spark job on EKS and write the output to S3! 🙌 179 | 180 | You should see the top wind speed per day in your Spark driver `stdout.gz` file on S3 after the job finishes. 181 | 182 | - Want to run it on EMR 6.2.0? 
Change `--release-label` to `emr-6.2.0-latest` 183 | - Want to run the windy city script for San Francisco? Add `"entryPointArguments": ["-123.18,37.64,-122.28,37.93"]` to the `sparkSubmitJobDriver` JSON 184 | 185 | ## Cleanup 186 | 187 | 1. Make sure you don't have any managed endpoints for EMR Studio 188 | 189 | ```shell 190 | # List existing managed endpoints for your virtual cluster 191 | aws emr-containers list-managed-endpoints \ 192 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 193 | --output text \ 194 | --query 'endpoints[*].[id,state,name]' 195 | 196 | # Delete them if you do 197 | for endpoint_id in $(aws emr-containers list-managed-endpoints --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} --output text --query 'endpoints[*].[id]'); do 198 | echo "Deleting ${endpoint_id}" 199 | aws emr-containers delete-managed-endpoint \ 200 | --id ${endpoint_id} \ 201 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} 202 | done 203 | ``` 204 | 205 | 2. Delete the virtual cluster 206 | 207 | ```shell 208 | aws emr-containers delete-virtual-cluster --id ${EMR_EKS_CLUSTER_ID} 209 | ``` 210 | -------------------------------------------------------------------------------- /athena/Athena_Exploration.md: -------------------------------------------------------------------------------- 1 | # Athena exploration 2 | 3 | Walkthrough of different Athena functionality using the [Amazon Customer Reviews](https://registry.opendata.aws/amazon-reviews/) open dataset. 4 | 5 | This dataset provides both TSV and Parquet versions of over 130 million customer reviews since 1995. 6 | 7 | ## Table Definitions 8 | 9 | Create a table in Athena over the TSV dataset. 10 | 11 | ```sql 12 | CREATE EXTERNAL TABLE amazon_reviews_tsv( 13 | marketplace string, 14 | customer_id string, 15 | review_id string, 16 | product_id string, 17 | product_parent string, 18 | product_title string, 19 | product_category string, 20 | star_rating int, 21 | helpful_votes int, 22 | total_votes int, 23 | vine string, 24 | verified_purchase string, 25 | review_headline string, 26 | review_body string, 27 | review_date date) 28 | ROW FORMAT DELIMITED 29 | FIELDS TERMINATED BY '\t' 30 | ESCAPED BY '\\' 31 | LINES TERMINATED BY '\n' 32 | LOCATION 33 | 's3://amazon-reviews-pds/tsv/' 34 | TBLPROPERTIES ("skip.header.line.count"="1"); 35 | ``` 36 | 37 | Run a simple query to preview the data. 38 | 39 | ```sql 40 | SELECT * FROM "amazon_reviews_tsv" 41 | WHERE marketplace = 'US' 42 | limit 10; 43 | ``` 44 | 45 | Create a table over the Parquet dataset. It's partitioned by `product_category`. 46 | 47 | Run a couple aggregation queries to see the amount of data scanned is minimal (kb-mb) compared to the full size of the data on S3 (~50 GiB). 
48 | 49 | ```sql 50 | CREATE EXTERNAL TABLE `amazon_reviews_parquet`( 51 | `marketplace` string, 52 | `customer_id` string, 53 | `review_id` string, 54 | `product_id` string, 55 | `product_parent` string, 56 | `product_title` string, 57 | `star_rating` int, 58 | `helpful_votes` int, 59 | `total_votes` int, 60 | `vine` string, 61 | `verified_purchase` string, 62 | `review_headline` string, 63 | `review_body` string, 64 | `review_date` bigint, 65 | `year` int) 66 | PARTITIONED BY ( `product_category` string ) 67 | STORED AS PARQUET 68 | LOCATION 's3://amazon-reviews-pds/parquet'; 69 | ``` 70 | 71 | ```sql 72 | SELECT product_id, product_title, count(*) as num_reviews, avg(star_rating) as avg_stars 73 | FROM amazon_reviews_parquet where product_category='Toys' 74 | GROUP BY 1, 2 75 | ORDER BY 3 DESC 76 | limit 100; 77 | 78 | SELECT COUNT(*) FROM amazon_reviews_parquet where product_category='Toys' AND year >= 2012; 79 | 80 | SELECT * FROM amazon_reviews_parquet 81 | WHERE product_category='Toys' 82 | LIMIT 100; 83 | ``` 84 | 85 | ## CTAS Example 86 | 87 | Re-partition by marketplace and year to allow for efficient queries. (This takes ~5 minutes to run.) 88 | 89 | By default, the results are stored in a bucket automatically created in your account for Athena output: `aws-athena-query-results-<account-id>-<region>`. 90 | 91 | See [Athena CTAS examples](https://docs.aws.amazon.com/athena/latest/ug/ctas-examples.html) for how to specify a specific S3 location with the `external_location` parameter. 92 | 93 | ```sql 94 | CREATE TABLE amazon_reviews_by_marketplace 95 | WITH ( 96 | format='PARQUET', 97 | partitioned_by = ARRAY['marketplace', 'year'] 98 | ) AS 99 | SELECT customer_id, review_id, product_id, product_parent, product_title, product_category, star_rating, helpful_votes, total_votes, verified_purchase, review_headline, review_body, review_date, 100 | marketplace, 101 | year(review_date) as year 102 | FROM amazon_reviews_tsv 103 | WHERE "$path" LIKE '%tsv.gz' 104 | -- Run time: 4 minutes 43 seconds, Data scanned: 32.24 GB 105 | ``` 106 | 107 | Compare the query times and data scanned with and without a partition filter: 108 | 109 | ```sql 110 | SELECT product_id, COUNT(*) FROM amazon_reviews_by_marketplace 111 | GROUP BY 1 ORDER BY 2 DESC LIMIT 10 112 | -- Run time: 6.7 seconds, Data scanned: 790.26 MB 113 | ``` 114 | 115 | vs. 116 | 117 | ```sql 118 | SELECT product_id, COUNT(*) FROM amazon_reviews_by_marketplace 119 | WHERE marketplace='US' AND year = 2013 120 | GROUP BY 1 ORDER BY 2 DESC LIMIT 10 121 | -- Run time: 3.87 seconds, Data scanned: 145 MB 122 | ``` 123 | 124 | ## Optimization Techniques 125 | 126 | ### Sorting by a specific field 127 | 128 | If you frequently query data based on an ID and expect a limited amount of data to be returned, you can sort the original dataset by that ID and write it out to a limited number of objects on S3. Athena will use the [parquet metadata](#parquet-metadata) to determine if it should read the underlying data. 129 | 130 | One option is to use CTAS to create a derivative dataset and sort on the specific fields. This can take a while to run due to the sorting and the execution plan. 
131 | 132 | ```sql 133 | CREATE TABLE amazon_reviews_sorted 134 | WITH ( 135 | format='PARQUET' 136 | ) AS 137 | SELECT product_id, customer_id, product_parent, star_rating, helpful_votes, total_votes, verified_purchase, marketplace, product_category, review_date 138 | FROM amazon_reviews_by_marketplace 139 | ORDER BY product_id ASC 140 | -- Run time: 18 minutes 13 seconds, Data scanned: 2.44 GB 141 | ``` 142 | 143 | Note that this only outputs seven heavily-skewed files, but all rows for a specific `product_id` should be in one file. 144 | 145 | ```sql 146 | SELECT "$path", product_id, COUNT(*) FROM amazon_reviews_sorted 147 | WHERE product_id = 'B00E8KLWB4' 148 | GROUP BY 1, 2 ORDER BY 1 ASC 149 | -- Run time: 4.18 seconds, Data scanned: 81.9 MB 150 | ``` 151 | 152 | vs. 153 | 154 | ```sql 155 | CREATE TABLE amazon_reviews_unsorted 156 | WITH ( 157 | format='PARQUET', 158 | bucketed_by = ARRAY['review_id'], 159 | bucket_count = 30 160 | ) AS 161 | SELECT review_id, product_id, customer_id, product_parent, star_rating, helpful_votes, total_votes, verified_purchase, marketplace, product_category, review_date 162 | FROM amazon_reviews_by_marketplace 163 | -- Run time: 40.04 seconds, Data scanned: 2.44 GB 164 | ``` 165 | 166 | Here we use the bucketing functionality to distribute the data across 30 buckets. We bucket on `review_id` as it is high cardinality and will give an even distribution. 167 | 168 | ```sql 169 | SELECT "$path", product_id, COUNT(*) FROM amazon_reviews_unsorted 170 | WHERE product_id = 'B00E8KLWB4' 171 | GROUP BY 1, 2 ORDER BY 1 ASC 172 | -- Run time: 4.39 seconds, Data scanned: 834.36 MB 173 | ``` 174 | 175 | Initially I tried to bucket by `product_id`, but that still puts each `product_id` in a single file. 176 | The data isn't sorted across files, though, because the bucket is chosen by hashing the field. 177 | Instead, we bucket on `review_id`, which effectively randomizes the `product_id`s. 178 | 179 | It's tough to control sorting and the number of output files using CTAS, but Spark can do this well. Using something like EMR Notebooks or AWS Glue, we read the original dataset and use `repartitionByRange` to sort `product_id` into 30 different output files. 180 | 181 | ```python 182 | (spark.read.parquet("s3://amazon-reviews-pds/parquet/") 183 | .select("marketplace", "customer_id", "review_id", "product_id", "product_parent", "star_rating") 184 | .repartitionByRange(30, "product_id") 185 | .write.mode("overwrite") 186 | .parquet("s3://<your-bucket>/amazon-reviews-sorted-subset/", compression="gzip") 187 | ) 188 | ``` 189 | 190 | And then back in Athena... 191 | 192 | ```sql 193 | CREATE EXTERNAL TABLE amazon_reviews_spark_sorted ( 194 | marketplace string, 195 | customer_id string, 196 | review_id string, 197 | product_id string, 198 | product_parent string, 199 | star_rating int 200 | ) 201 | STORED AS PARQUET 202 | LOCATION 's3://<your-bucket>/amazon-reviews-sorted-subset/' 203 | ``` 204 | 205 | ```sql 206 | SELECT "$path", COUNT(*) FROM amazon_reviews_spark_sorted 207 | GROUP BY 1 ORDER BY 1 ASC 208 | -- About 5-6M records per file 209 | ``` 210 | 211 | ## Parquet metadata 212 | 213 | [parquet-tools](https://github.com/apache/parquet-mr/tree/master/parquet-tools) is a fantastic utility for analyzing the content of Parquet files. 
214 | 215 | If you're on a mac, it's available via homebrew: `brew install parquet-tools` 216 | 217 | Download a sample Parquet file and print out the metadata: 218 | 219 | ```shell 220 | curl -O https://s3.amazonaws.com/amazon-reviews-pds/parquet/product_category=Watches/part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet 221 | parquet-tools meta part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet 222 | ``` 223 | 224 | You'll see a bunch of detailed information about the file including number of rows, minimum and maximum values, and the schema. 225 | 226 | _Some rows left out for brevity_ 227 | 228 | ``` 229 | file: file:/private/tmp/part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet 230 | creator: parquet-mr version 1.8.2 (build c6522788629e590a53eb79874b95f6c3ff11f16c) 231 | 232 | file schema: spark_schema 233 | -------------------------------------------------------------------------------- 234 | product_title: OPTIONAL BINARY O:UTF8 R:0 D:1 235 | star_rating: OPTIONAL INT32 R:0 D:1 236 | helpful_votes: OPTIONAL INT32 R:0 D:1 237 | review_date: OPTIONAL INT32 O:DATE R:0 D:1 238 | year: OPTIONAL INT32 R:0 D:1 239 | 240 | row group 1: RC:97608 TS:39755962 OFFSET:4 241 | -------------------------------------------------------------------------------- 242 | product_title: BINARY SNAPPY DO:0 FPO:3243045 SZ:3170609/6450771/2.03 VC:97608 ENC:PLAIN,PLAIN_DICTIONARY,RLE,BIT_PACKED ST:[no stats for this column] 243 | star_rating: INT32 SNAPPY DO:0 FPO:6413654 SZ:36016/36709/1.02 VC:97608 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED ST:[min: 1, max: 5, num_nulls: 0] 244 | helpful_votes: INT32 SNAPPY DO:0 FPO:6449670 SZ:48348/93031/1.92 VC:97608 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED ST:[min: 0, max: 753, num_nulls: 0] 245 | review_date: INT32 SNAPPY DO:0 FPO:23689606 SZ:35674/146381/4.10 VC:97608 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED ST:[min: 2001-04-05, max: 2015-08-31, num_nulls: 0] 246 | year: INT32 SNAPPY DO:0 FPO:23725280 SZ:2004/37279/18.60 VC:97608 ENC:PLAIN_DICTIONARY,RLE,BIT_PACKED ST:[min: 2001, max: 2015, num_nulls: 0] 247 | ``` 248 | 249 | 250 | More detailed information on the different fields for each column is [here](https://github.com/apache/parquet-mr/tree/master/parquet-tools#meta-legend). 251 | 252 | Note that current versions of the tool may not show string statistics by default as they could be incorrect: [PARQUET-686](https://issues.apache.org/jira/browse/PARQUET-686). 
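253 | 
254 | If you'd rather inspect the same footer programmatically (for example, from a notebook), the `pyarrow` library exposes the row group and column statistics as well. This is a minimal sketch; it assumes you've installed `pyarrow` (`pip install pyarrow`) and downloaded the sample file above into the current directory.
255 | 
256 | ```python
257 | import pyarrow.parquet as pq
258 | 
259 | # Open the sample file downloaded with curl above
260 | pf = pq.ParquetFile("part-00009-495c48e6-96d6-4650-aa65-3c36a3516ddd.c000.snappy.parquet")
261 | 
262 | # File-level metadata: row count, number of row groups, writer version
263 | print(pf.metadata)
264 | 
265 | # Per-column statistics (min/max/null count) for the first row group
266 | rg = pf.metadata.row_group(0)
267 | for i in range(rg.num_columns):
268 |     col = rg.column(i)
269 |     print(col.path_in_schema, col.statistics)
270 | ```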
-------------------------------------------------------------------------------- /cdk/big-data-stack/stacks/eks.py: -------------------------------------------------------------------------------- 1 | from aws_cdk import ( 2 | core as cdk, 3 | aws_eks as eks, 4 | aws_ec2 as ec2, 5 | aws_iam as iam, 6 | ) 7 | 8 | from plugins.eks.autoscaler import ClusterAutoscaler 9 | 10 | 11 | class EKSStack(cdk.Stack): 12 | cluster_name: str 13 | cluster: eks.Cluster 14 | 15 | def __init__( 16 | self, 17 | scope: cdk.Construct, 18 | construct_id: str, 19 | vpc: ec2.IVpc, 20 | instance_type: str = "m5.xlarge", 21 | **kwargs, 22 | ) -> None: 23 | super().__init__(scope, construct_id, **kwargs) 24 | 25 | self.cluster_name = "data-team" 26 | 27 | # EKS cluster 28 | self.cluster = eks.Cluster( 29 | self, 30 | "EksForSpark", 31 | cluster_name=self.cluster_name, 32 | version=eks.KubernetesVersion.V1_19, 33 | default_capacity=0, 34 | endpoint_access=eks.EndpointAccess.PUBLIC_AND_PRIVATE, 35 | vpc=vpc, 36 | vpc_subnets=[ec2.SubnetSelection(subnet_type=ec2.SubnetType.PRIVATE)], 37 | ) 38 | 39 | # Default node group 40 | ng = self.cluster.add_nodegroup_capacity( 41 | "base-node-group", 42 | instance_types=[ec2.InstanceType(instance_type)], 43 | min_size=1, 44 | max_size=20, 45 | disk_size=50, 46 | ) 47 | 48 | # Add a Spot node group as well for additional capacity 49 | spot_ng = self.cluster.add_nodegroup_capacity( 50 | "spot-node-group", 51 | capacity_type=eks.CapacityType.SPOT, 52 | instance_types=[ec2.InstanceType(it) for it in ['c4.2xlarge', 'c5.2xlarge', 'c5d.2xlarge', 'c5a.2xlarge', 'c5n.2xlarge']], 53 | min_size=1, 54 | max_size=20, 55 | ) 56 | 57 | # Add a Graviton node group as well for additional capacity 58 | # graviton_ng = self.cluster.add_nodegroup_capacity( 59 | # "graviton-node-group", 60 | # instance_types=[ec2.InstanceType(it) for it in ['m6g.2xlarge', 'c6g.2xlarge', 'r6g.2xlarge']], 61 | # min_size=1, 62 | # max_size=20, 63 | # ) 64 | 65 | self.add_admin_role_to_cluster() 66 | self.add_cluster_admin() 67 | 68 | # Cluster AutoScaling FTW 69 | ClusterAutoscaler( 70 | self.cluster_name, self, self.cluster, [ng, spot_ng] 71 | ).enable_autoscaling() 72 | 73 | # We like to use the Kubernetes Dashboard 74 | self.enable_dashboard() 75 | 76 | # Install Airflow as well 77 | # TODO: Make this optional 78 | # self.enable_airflow() 79 | 80 | # This is emr-specific, but we have to do it here to prevent circular dependencies 81 | self.map_iam_to_eks() 82 | 83 | def add_admin_role_to_cluster(self) -> None: 84 | admin_role_name = self.node.try_get_context("eks_admin_role_name") 85 | if admin_role_name is None: 86 | return 87 | 88 | account_id = cdk.Aws.ACCOUNT_ID 89 | admin_role = iam.Role.from_role_arn( 90 | self, "admin_role", f"arn:aws:iam::{account_id}:role/{admin_role_name}" 91 | ) 92 | self.cluster.aws_auth.add_masters_role(admin_role) 93 | 94 | def add_cluster_admin(self, name="eks-admin"): 95 | # Add admin privileges so we can sign in to the dashboard as the service account 96 | sa = self.cluster.add_manifest( 97 | "eks-admin-sa", 98 | { 99 | "apiVersion": "v1", 100 | "kind": "ServiceAccount", 101 | "metadata": { 102 | "name": name, 103 | "namespace": "kube-system", 104 | }, 105 | }, 106 | ) 107 | binding = self.cluster.add_manifest( 108 | "eks-admin-rbac", 109 | { 110 | "apiVersion": "rbac.authorization.k8s.io/v1beta1", 111 | "kind": "ClusterRoleBinding", 112 | "metadata": {"name": name}, 113 | "roleRef": { 114 | "apiGroup": "rbac.authorization.k8s.io", 115 | "kind": "ClusterRole", 116 | "name": 
"cluster-admin", 117 | }, 118 | "subjects": [ 119 | { 120 | "kind": "ServiceAccount", 121 | "name": name, 122 | "namespace": "kube-system", 123 | } 124 | ], 125 | }, 126 | ) 127 | 128 | def enable_dashboard(self, namespace: str = "kubernetes-dashboard"): 129 | chart = self.cluster.add_helm_chart( 130 | "kubernetes-dashboard", 131 | namespace=namespace, 132 | chart="kubernetes-dashboard", 133 | repository="https://kubernetes.github.io/dashboard/", 134 | values={ 135 | "fullnameOverride": "kubernetes-dashboard", # This must be set to acccess the UI via `kubectl proxy` 136 | "extraArgs": ["--token-ttl=0"], 137 | }, 138 | ) 139 | 140 | def map_iam_to_eks(self): 141 | service_role_name = f"arn:aws:iam::{cdk.Aws.ACCOUNT_ID}:role/AWSServiceRoleForAmazonEMRContainers" 142 | emrsvcrole = iam.Role.from_role_arn( 143 | self, "EmrSvcRole", service_role_name, mutable=False 144 | ) 145 | self.cluster.aws_auth.add_role_mapping( 146 | emrsvcrole, groups=[], username="emr-containers" 147 | ) 148 | 149 | def add_emr_containers_for_airflow(self) -> eks.ServiceAccount: 150 | sa = self.cluster.add_service_account( 151 | "AirflowServiceAccount", namespace="airflow" 152 | ) 153 | 154 | sa.add_to_principal_policy( 155 | iam.PolicyStatement( 156 | actions=[ 157 | "emr-containers:StartJobRun", 158 | "emr-containers:ListJobRuns", 159 | "emr-containers:DescribeJobRun", 160 | "emr-containers:CancelJobRun", 161 | ], 162 | resources=["*"], 163 | ) 164 | ) 165 | 166 | return sa 167 | 168 | def enable_airflow(self, namespace: str = "airflow"): 169 | # While `add_helm_chart` will create the namespace for us if it doesn't exist, 170 | # we have to create it here because we need to create a service role for emr-containers. 171 | ns = self.cluster.add_manifest( 172 | "airflow-namespace", 173 | {"apiVersion": "v1", "kind": "Namespace", "metadata": {"name": namespace}}, 174 | ) 175 | # This is specific to emr-containers and Airflow so we can run EMR on EKS jobs 176 | service_role = self.add_emr_containers_for_airflow() 177 | service_role.node.add_dependency(ns) 178 | 179 | volume = self.cluster.add_manifest("multiaz-volume", self.gp2_multiazvolume()) 180 | chart = self.cluster.add_helm_chart( 181 | "airflow", 182 | namespace=namespace, 183 | chart="airflow", 184 | repository="https://airflow-helm.github.io/charts", 185 | version="8.0.5", 186 | values={ 187 | "airflow": { 188 | "config": { 189 | "AIRFLOW__LOGGING__REMOTE_LOGGING": "False", 190 | }, 191 | "executor": "KubernetesExecutor", 192 | "image": { 193 | "repository": "ghcr.io/dacort/airflow-emr-eks", 194 | "tag": "latest", 195 | "pullPolicy": "Always", 196 | }, 197 | "extraEnv": [ 198 | { 199 | "name": "AIRFLOW__CORE__FERNET_KEY", 200 | "valueFrom": { 201 | "secretKeyRef": { 202 | "name": "airflow-fernet-key", 203 | "key": "value", 204 | } 205 | }, 206 | }, 207 | { 208 | "name": "AWS_DEFAULT_REGION", 209 | "value": cdk.Aws.REGION, 210 | }, 211 | ], 212 | }, 213 | "web": {"resources": {"limits": {"cpu": "1", "memory": "1Gi"}}}, 214 | "workers": {"enabled": False}, 215 | "flower": {"enabled": False}, 216 | "redis": {"enabled": False}, 217 | "dags": { 218 | "gitSync": { 219 | "enabled": True, 220 | "repo": "https://github.com/dacort/airflow-example-dags.git", 221 | "branch": "main", 222 | "resources": {"requests": {"cpu": "50m", "memory": "64Mi"}}, 223 | } 224 | }, 225 | "postgresql": {"persistence": {"storageClass": "multiazvolume"}}, 226 | "serviceAccount": { 227 | "create": False, 228 | "name": service_role.service_account_name, 229 | "annotations": { 230 | 
"eks.amazonaws.com/role-arn": service_role.role.role_arn 231 | }, 232 | }, 233 | }, 234 | ) 235 | chart.node.add_dependency(ns) 236 | chart.node.add_dependency(volume) 237 | 238 | # Display the command necessarty to port-forward the Airflow Web UI 239 | airflow_forward_cmd = f'kubectl port-forward --namespace {namespace} $(kubectl get pods --namespace {namespace} -l "component=web,app=airflow" -o jsonpath="{{.items[0].metadata.name}}") 8080:8080' 240 | cdk.CfnOutput(self, "AirflowLoginCommand", value=airflow_forward_cmd) 241 | 242 | def gp2_multiazvolume(self): 243 | return { 244 | "kind": "StorageClass", 245 | "apiVersion": "storage.k8s.io/v1", 246 | "metadata": {"name": "multiazvolume"}, 247 | "provisioner": "kubernetes.io/aws-ebs", 248 | "parameters": {"type": "gp2", "iopsPerGB": "10", "fsType": "ext4"}, 249 | "volumeBindingMode": "WaitForFirstConsumer", 250 | } 251 | 252 | 253 | # Helpful references 254 | # https://betterprogramming.pub/how-to-organize-your-aws-cdk-project-f1c463aa966e 255 | # https://github.com/aftouh/cdk-template 256 | # 257 | # https://faun.pub/spawning-an-autoscaling-eks-cluster-52977aa8b467 -------------------------------------------------------------------------------- /emr/eks/videos/pod_templates/README.md: -------------------------------------------------------------------------------- 1 | # EMR on EKS Pod Templates 2 | 3 | ## Demos 4 | - Running Spark jobs with Dynamic Resource Allocation (DRA) 5 | - Using pod templates to optimize job cost (Spot and Fargate) 6 | - Using pod templates to run sidecar containers 7 | 8 | ## Step 1 - Running a simple Spark job 9 | 10 | - Using a local Spark example, submit a job with static executor config 11 | 12 | ```shell 13 | aws emr-containers start-job-run \ 14 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 15 | --name dacort-pi-static \ 16 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 17 | --release-label emr-5.33.0-latest \ 18 | --job-driver '{ 19 | "sparkSubmitJobDriver": { 20 | "entryPoint": "local:///usr/lib/spark/examples/src/main/python/pi.py", 21 | "sparkSubmitParameters": "--conf spark.executor.instances=20 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1" 22 | } 23 | }' \ 24 | --configuration-overrides '{ 25 | "monitoringConfiguration": { 26 | "s3MonitoringConfiguration": { 27 | "logUri": "s3://'${S3_BUCKET}'/emr-eks-logs/pi/" 28 | } 29 | } 30 | }' 31 | ``` 32 | 33 | ## Step 2 - Dynamic Resource Allocation 34 | 35 | _Notes_ 36 | - Only works with Spark 3.x / EMR 6 37 | 38 | To enable DRA, we'll add an `applicationConfiguration` section to the `--configuration-overrides` parameter to specifically enable it and define the executor behavior. 
This is the section we'll add: 39 | 40 | ```json 41 | { 42 | "classification": "spark-defaults", 43 | "properties": { 44 | "spark.dynamicAllocation.enabled": "true", 45 | "spark.dynamicAllocation.shuffleTracking.enabled":"true", 46 | "spark.dynamicAllocation.minExecutors":"5", 47 | "spark.dynamicAllocation.maxExecutors":"100", 48 | "spark.dynamicAllocation.initialExecutors":"10" 49 | } 50 | } 51 | ``` 52 | 53 | ```shell 54 | aws emr-containers start-job-run \ 55 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 56 | --name dacort-windycity-dra \ 57 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 58 | --release-label emr-6.2.0-latest \ 59 | --job-driver '{ 60 | "sparkSubmitJobDriver": { 61 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/windy_city.py", 62 | "sparkSubmitParameters": "--conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1" 63 | } 64 | }' \ 65 | --configuration-overrides '{ 66 | "applicationConfiguration": [ 67 | { 68 | "classification": "spark-defaults", 69 | "properties": { 70 | "spark.dynamicAllocation.enabled": "true", 71 | "spark.dynamicAllocation.shuffleTracking.enabled":"true", 72 | "spark.dynamicAllocation.minExecutors":"2", 73 | "spark.dynamicAllocation.maxExecutors":"100", 74 | "spark.dynamicAllocation.initialExecutors":"5" 75 | } 76 | } 77 | ] 78 | }' 79 | ``` 80 | 81 | ## Step 3 - Limit executors to Spot 82 | 83 | For this step, we make use of [pod templates](https://docs.aws.amazon.com/emr/latest/EMR-on-EKS-DevelopmentGuide/pod-templates.html). Pod templates allow you to specify how or where the containers will run in your cluster. 84 | 85 | 86 | 1. Create pod templates for Spot and On-Demand 87 | 88 | - `spot_pod_template.yaml` 89 | ```yaml 90 | apiVersion: v1 91 | kind: Pod 92 | spec: 93 | nodeSelector: 94 | eks.amazonaws.com/capacityType: SPOT 95 | ``` 96 | 97 | - `ondemand_pod_template.yaml` 98 | ```yaml 99 | apiVersion: v1 100 | kind: Pod 101 | spec: 102 | nodeSelector: 103 | eks.amazonaws.com/capacityType: ON_DEMAND 104 | ``` 105 | 106 | 2. Upload those templates to S3 107 | 108 | ```shell 109 | aws s3 cp spot_pod_template.yaml s3:///artifacts/pod_templates/ 110 | aws s3 cp ondemand_pod_template.yaml s3:///artifacts/pod_templates/ 111 | ``` 112 | 113 | 3. Run your Spark job with the pod template specified for the executor! 114 | 115 | We also specify on-demand for the driver because we want to ensure the driver persists for the entire length of the job. 116 | 117 | ```shell 118 | aws emr-containers start-job-run \ 119 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 120 | --name dacort-windycity-spot \ 121 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 122 | --release-label emr-5.33.0-latest \ 123 | --job-driver '{ 124 | "sparkSubmitJobDriver": { 125 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/windy_city.py", 126 | "sparkSubmitParameters": "--conf spark.executor.instances=5 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1 --conf spark.kubernetes.driver.podTemplateFile=s3://'${S3_BUCKET}'/artifacts/pod_templates/ondemand_pod_template.yaml --conf spark.kubernetes.executor.podTemplateFile=s3://'${S3_BUCKET}'/artifacts/pod_templates/spot_pod_template.yaml" 127 | } 128 | }' 129 | ``` 130 | 131 | And now verify the driver and executors are running where we expect! 
132 | 133 | ```shell 134 | kubectl describe node \ 135 | $(kubectl get pods -n emr-jobs --selector=spark-role=driver -o jsonpath='{.items[*].spec.nodeName}' ) \ 136 | | grep -i capacityType 137 | # eks.amazonaws.com/capacityType=ON_DEMAND 138 | ``` 139 | 140 | 141 | ```shell 142 | kubectl describe node \ 143 | $(kubectl get pods -n emr-jobs --selector=spark-role=executor -o jsonpath='{.items[*].spec.nodeName}' ) \ 144 | | grep -i capacityType 145 | # eks.amazonaws.com/capacityType=SPOT 146 | # eks.amazonaws.com/capacityType=SPOT 147 | # eks.amazonaws.com/capacityType=SPOT 148 | # eks.amazonaws.com/capacityType=SPOT 149 | # eks.amazonaws.com/capacityType=SPOT 150 | ``` 151 | 152 | ## Step 3.5 - Pod Templates: Limit executors to Fargate 153 | 154 | - Ensure you have the Fargate role 155 | 156 | ```shell 157 | aws iam create-role --role-name AmazonEKSFargatePodExecutionRole --assume-role-policy-document '{"Version":"2012-10-17","Statement":[{"Effect":"Allow","Principal":{"Service":"eks-fargate-pods.amazonaws.com"},"Action":"sts:AssumeRole"}]}' 158 | aws iam attach-role-policy --role-name AmazonEKSFargatePodExecutionRole --policy-arn arn:aws:iam::aws:policy/AmazonEKSFargatePodExecutionRolePolicy 159 | ``` 160 | 161 | - Create a new Fargate profile in the appropriate namespace 162 | 163 | We need to add a specific selector, otherwise *all* jobs in the `emr-jobs` namespace will run on Fargate. 164 | 165 | ```shell 166 | aws eks create-fargate-profile \ 167 | --cluster-name data-team \ 168 | --fargate-profile-name spark-fargate-executors \ 169 | --selectors 'namespace=emr-jobs,labels={eks.amazonaws.com/capacityType=FARGATE}' \ 170 | --pod-execution-role-arn ${FARGATE_EXECUTION_ARN} 171 | ``` 172 | 173 | - Assign Spark executor label 174 | 175 | In order to properly run our executors _only_ on Fargate, we'll add a label to the spark-submit parameters: 176 | 177 | ``` 178 | --conf spark.kubernetes.executor.label.eks.amazonaws.com/capacityType=FARGATE 179 | ``` 180 | 181 | - Run our job! 182 | 183 | ```shell 184 | aws emr-containers start-job-run \ 185 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 186 | --name dacort-windycity-fargate \ 187 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 188 | --release-label emr-5.33.0-latest \ 189 | --job-driver '{ 190 | "sparkSubmitJobDriver": { 191 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/windy_city.py", 192 | "sparkSubmitParameters": "--conf spark.executor.instances=5 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1 --conf spark.kubernetes.driver.podTemplateFile=s3://'${S3_BUCKET}'/artifacts/pod_templates/ondemand_pod_template.yaml --conf spark.kubernetes.executor.label.eks.amazonaws.com/capacityType=FARGATE" 193 | } 194 | }' 195 | ``` 196 | 197 | ## Step 4 - Running a sidecar container 198 | 199 | Sidecar containers can be used to add additional functionality alongside your Spark drivers and/or executors. A common use-case is to forward logs to a centralized logging provider. 200 | 201 | For this example, we'll use a custom [Spark Tweeter](https://github.com/dacort/spark-tweeter) container that tweets out when a job starts and stops. 
202 | 203 | - First, we need to add our Twitter credentials as a Kubernetes secret 204 | 205 | ```shell 206 | kubectl create secret generic -n emr-jobs twitter-creds \ 207 | --from-literal=consumer_key=${CONSUMER_KEY} \ 208 | --from-literal=consumer_secret=${CONSUMER_SECRET} \ 209 | --from-literal=access_token=${ACCESS_TOKEN} \ 210 | --from-literal=access_token_secret=${ACCESS_TOKEN_SECRET} 211 | 212 | ``` 213 | 214 | - Then we create a sidecar yaml file and upload that to S3 215 | 216 | ```yaml 217 | # tweetcar.yaml 218 | apiVersion: v1 219 | kind: Pod 220 | spec: 221 | containers: 222 | - name: side-car-tweeter 223 | image: ghcr.io/dacort/spark-tweeter:latest 224 | env: 225 | - name: CONSUMER_KEY 226 | valueFrom: 227 | secretKeyRef: 228 | name: twitter-creds 229 | key: consumer_key 230 | - name: CONSUMER_SECRET 231 | valueFrom: 232 | secretKeyRef: 233 | name: twitter-creds 234 | key: consumer_secret 235 | - name: ACCESS_TOKEN 236 | valueFrom: 237 | secretKeyRef: 238 | name: twitter-creds 239 | key: access_token 240 | - name: ACCESS_TOKEN_SECRET 241 | valueFrom: 242 | secretKeyRef: 243 | name: twitter-creds 244 | key: access_token_secret 245 | - name: EMR_COMMS_MOUNT 246 | value: /var/log/fluentd 247 | resources: {} 248 | volumeMounts: 249 | - name: emr-container-application-log-dir 250 | mountPath: /var/log/spark/user 251 | - name: emr-container-communicate 252 | mountPath: /var/log/fluentd 253 | ``` 254 | 255 | ```shell 256 | aws s3 cp tweetcar.yaml s3:///artifacts/pod_templates/tweetcar.yaml 257 | ``` 258 | 259 | - Now run your Spark job with the sidecar mounted on the Driver 260 | 261 | ```shell 262 | aws emr-containers start-job-run \ 263 | --virtual-cluster-id ${EMR_EKS_CLUSTER_ID} \ 264 | --name dacort-tweeter \ 265 | --execution-role-arn ${EMR_EKS_EXECUTION_ARN} \ 266 | --release-label emr-5.33.0-latest \ 267 | --job-driver '{ 268 | "sparkSubmitJobDriver": { 269 | "entryPoint": "s3://'${S3_BUCKET}'/code/pyspark/windy_city.py", 270 | "sparkSubmitParameters": "--conf spark.executor.instances=20 --conf spark.executor.memory=2G --conf spark.executor.cores=2 --conf spark.driver.cores=1 --conf spark.kubernetes.driver.podTemplateFile=s3://'${S3_BUCKET}'/artifacts/pod_templates/tweetcar.yaml" 271 | } 272 | }' 273 | ``` -------------------------------------------------------------------------------- /emr/studio/notebooks/emr-studio-athena.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "90cd3a9c", 6 | "metadata": {}, 7 | "source": [ 8 | "# Querying Athena from EMR Studio\n", 9 | "\n", 10 | "1. 
Install the [pyathena](https://github.com/laughingman7743/PyAthena/) library.\n", 11 | "\n", 12 | "_If this is the first time installing the library on the cluster, you'll need to restart your Kernel._" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "id": "75c9710d", 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "Defaulting to user installation because normal site-packages is not writeable\n", 26 | "Requirement already satisfied: pyathena in /home/emr-notebook/.local/lib/python3.7/site-packages (2.3.0)\n", 27 | "Requirement already satisfied: botocore>=1.5.52 in /home/emr-notebook/.local/lib/python3.7/site-packages (from pyathena) (1.21.54)\n", 28 | "Requirement already satisfied: boto3>=1.4.4 in /home/emr-notebook/.local/lib/python3.7/site-packages (from pyathena) (1.18.54)\n", 29 | "Requirement already satisfied: tenacity>=4.1.0 in /mnt/notebook-env/lib/python3.7/site-packages (from pyathena) (8.0.0)\n", 30 | "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /home/emr-notebook/.local/lib/python3.7/site-packages (from boto3>=1.4.4->pyathena) (0.10.0)\n", 31 | "Requirement already satisfied: s3transfer<0.6.0,>=0.5.0 in /home/emr-notebook/.local/lib/python3.7/site-packages (from boto3>=1.4.4->pyathena) (0.5.0)\n", 32 | "Requirement already satisfied: urllib3<1.27,>=1.25.4 in /mnt/notebook-env/lib/python3.7/site-packages (from botocore>=1.5.52->pyathena) (1.26.6)\n", 33 | "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /mnt/notebook-env/lib/python3.7/site-packages (from botocore>=1.5.52->pyathena) (2.8.2)\n", 34 | "Requirement already satisfied: six>=1.5 in /mnt/notebook-env/lib/python3.7/site-packages (from python-dateutil<3.0.0,>=2.1->botocore>=1.5.52->pyathena) (1.15.0)\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "!/emr/notebook-env/bin/pip install pyathena" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "id": "2fc5de2d", 45 | "metadata": {}, 46 | "source": [ 47 | "2. Connect to Athena and query!" 
48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 2, 53 | "id": "46ac6b1c", 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "('20210923175137', '20210923175137_2_2', '258241be-2f7c-4bad-99f7-f65a2a1bc032', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_2-155-196_20210923175137.parquet', 0.7004607721204296, 0.9911546157239198, 'driver-204', 0.14602911545960373, 0.430070203188727, 89.89115113672493, 'rider-204', 1631914555102, '258241be-2f7c-4bad-99f7-f65a2a1bc032', 'americas-brazil-sao_paulo')\n", 61 | "('20210923175441', '20210923175441_1_1', '130b2891-b8e8-45cb-86c5-9ba17bf67e9c', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_1-189-243_20210923175441.parquet', 0.4789745387904072, 0.14781856144057215, 'driver-022', 0.10509642405359532, 0.07682825311613706, 30.429177017810616, 'rider-022', 1632072813847, '130b2891-b8e8-45cb-86c5-9ba17bf67e9c', 'americas-brazil-sao_paulo')\n", 62 | "('20210923175137', '20210923175137_2_7', 'e8c22378-8cf4-480a-bbcb-2d727e72bbe9', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_2-155-196_20210923175137.parquet', 0.011933855867048981, 0.16258177392270334, 'driver-204', 0.9635314017496284, 0.6451866124948767, 69.09535493302582, 'rider-204', 1631855582705, 'e8c22378-8cf4-480a-bbcb-2d727e72bbe9', 'americas-brazil-sao_paulo')\n", 63 | "('20210923175441', '20210923175441_1_3', '853e3137-1404-476a-9706-bc6862a0b0c1', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_1-189-243_20210923175441.parquet', 0.7863419548389983, 0.09622419308555896, 'driver-022', 0.4461749593405654, 0.8047885824928995, 89.58715055088675, 'rider-022', 1632239701039, '853e3137-1404-476a-9706-bc6862a0b0c1', 'americas-brazil-sao_paulo')\n", 64 | "('20210923175441', '20210923175441_1_4', '8077ddc9-b591-41a7-ac05-dd418485e567', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_1-189-243_20210923175441.parquet', 0.4557704708784922, 0.19566457205271448, 'driver-022', 0.04316839215753254, 0.49689215534636744, 98.07565038092373, 'rider-022', 1632278101830, '8077ddc9-b591-41a7-ac05-dd418485e567', 'americas-brazil-sao_paulo')\n", 65 | "('20210923175441', '20210923175441_1_6', 'c306ec62-fd26-447e-b071-8e928c3601cc', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_1-189-243_20210923175441.parquet', 0.2357445292459669, 0.20216983131886535, 'driver-022', 0.7985867991529113, 0.6627849637400387, 45.92862425905784, 'rider-022', 1632070802354, 'c306ec62-fd26-447e-b071-8e928c3601cc', 'americas-brazil-sao_paulo')\n", 66 | "('20210923175441', '20210923175441_1_8', 'aff4ca90-b07d-4598-a643-510db2396646', 'americas-brazil-sao_paulo', '658db9b2-5081-49b1-b6e7-6ff1b1d92448-0_1-189-243_20210923175441.parquet', 0.5714612197743915, 0.8660402414940012, 'driver-022', 0.42204161309648225, 0.7826771915638148, 56.15793735580833, 'rider-022', 1632216837224, 'aff4ca90-b07d-4598-a643-510db2396646', 'americas-brazil-sao_paulo')\n", 67 | "('20210923175137', '20210923175137_1_1', '623fb531-d340-4093-9814-ee4e2d903446', 'asia-india-chennai', 'e587c8cb-7287-429f-9206-6bbd89e091f3-0_1-155-195_20210923175137.parquet', 0.5715455359501902, 0.8806745328835989, 'driver-204', 0.4957985534250222, 0.17496376187467866, 96.4500716154594, 'rider-204', 1632184230245, '623fb531-d340-4093-9814-ee4e2d903446', 'asia-india-chennai')\n", 68 | "('20210923175137', '20210923175137_1_6', '9775d219-1cfe-4534-acc0-9740f00e7516', 
'asia-india-chennai', 'e587c8cb-7287-429f-9206-6bbd89e091f3-0_1-155-195_20210923175137.parquet', 0.8529563766655098, 0.18417876489592633, 'driver-204', 0.5762896261799536, 0.9686943663190588, 51.299844734112945, 'rider-204', 1632148019323, '9775d219-1cfe-4534-acc0-9740f00e7516', 'asia-india-chennai')\n", 69 | "('20210923175441', '20210923175441_0_2', 'e9185a6b-85ec-4cd2-8280-0c876a8a0b2d', 'americas-united_states-san_francisco', 'fd89688b-1001-4686-a31a-17eb31e1c86d-0_0-189-242_20210923175441.parquet', 0.29715343023010277, 0.32560695311233856, 'driver-022', 0.5819606196949516, 0.49547619941585996, 7.078489064840843, 'rider-022', 1632032168497, 'e9185a6b-85ec-4cd2-8280-0c876a8a0b2d', 'americas-united_states-san_francisco')\n" 70 | ] 71 | } 72 | ], 73 | "source": [ 74 | "from pyathena import connect\n", 75 | "\n", 76 | "cursor = connect(\n", 77 | " s3_staging_dir=\"s3:///queries/\",\n", 78 | " region_name=\"us-west-2\"\n", 79 | ").cursor()\n", 80 | "\n", 81 | "cursor.execute(\"SELECT * FROM hudi_trips LIMIT 10\")\n", 82 | "\n", 83 | "for row in cursor:\n", 84 | " print(row)" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "id": "253f33a8", 90 | "metadata": {}, 91 | "source": [ 92 | "You can also query from different databases if you want." 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 3, 98 | "id": "ebf0e256", 99 | "metadata": {}, 100 | "outputs": [ 101 | { 102 | "name": "stdout", 103 | "output_type": "stream", 104 | "text": [ 105 | "[('dea_pain_pills_parquet',), ('dea_pain_pills_tsv',), ('dea_pain_pills_tsv_gz',)]\n" 106 | ] 107 | } 108 | ], 109 | "source": [ 110 | "cursor.execute(\"SHOW TABLES FROM intro_data_proc\")\n", 111 | "print(cursor.fetchall())" 112 | ] 113 | }, 114 | { 115 | "cell_type": "markdown", 116 | "id": "cc88eb45", 117 | "metadata": {}, 118 | "source": [ 119 | "## Using SparkSQL\n", 120 | "\n", 121 | "If your EMR Cluster is using the [Glue Data Catalog](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark-glue.html), you can also query your data on S3 with SparkSQL - just switch to the PySpark kernel and use the `%%sql` magic." 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 1, 127 | "id": "eb4c3c9a", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "Starting Spark application\n" 135 | ] 136 | }, 137 | { 138 | "data": { 139 | "text/html": [ 140 | "\n", 141 | "
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
6application_1632519122277_0008pysparkidleLinkLink
" 143 | ], 144 | "text/plain": [ 145 | "" 146 | ] 147 | }, 148 | "metadata": {}, 149 | "output_type": "display_data" 150 | }, 151 | { 152 | "data": { 153 | "application/vnd.jupyter.widget-view+json": { 154 | "model_id": "", 155 | "version_major": 2, 156 | "version_minor": 0 157 | }, 158 | "text/plain": [ 159 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 160 | ] 161 | }, 162 | "metadata": {}, 163 | "output_type": "display_data" 164 | }, 165 | { 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "SparkSession available as 'spark'.\n" 170 | ] 171 | }, 172 | { 173 | "data": { 174 | "application/vnd.jupyter.widget-view+json": { 175 | "model_id": "", 176 | "version_major": 2, 177 | "version_minor": 0 178 | }, 179 | "text/plain": [ 180 | "FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…" 181 | ] 182 | }, 183 | "metadata": {}, 184 | "output_type": "display_data" 185 | }, 186 | { 187 | "data": { 188 | "application/vnd.jupyter.widget-view+json": { 189 | "model_id": "5381587a33234d798bb3b4fcca2fd8e3", 190 | "version_major": 2, 191 | "version_minor": 0 192 | }, 193 | "text/plain": [ 194 | "VBox(children=(HBox(children=(HTML(value='Type:'), Button(description='Table', layout=Layout(width='70px'), st…" 195 | ] 196 | }, 197 | "metadata": {}, 198 | "output_type": "display_data" 199 | }, 200 | { 201 | "data": { 202 | "application/vnd.jupyter.widget-view+json": { 203 | "model_id": "e053cb95ec0b496fab9a4007dd7c7001", 204 | "version_major": 2, 205 | "version_minor": 0 206 | }, 207 | "text/plain": [ 208 | "Output()" 209 | ] 210 | }, 211 | "metadata": {}, 212 | "output_type": "display_data" 213 | } 214 | ], 215 | "source": [ 216 | "%%sql\n", 217 | "\n", 218 | "SHOW TABLES from intro_data_proc" 219 | ] 220 | } 221 | ], 222 | "metadata": { 223 | "kernelspec": { 224 | "display_name": "PySpark", 225 | "language": "", 226 | "name": "pysparkkernel" 227 | }, 228 | "language_info": { 229 | "codemirror_mode": { 230 | "name": "python", 231 | "version": 3 232 | }, 233 | "mimetype": "text/x-python", 234 | "name": "pyspark", 235 | "pygments_lexer": "python3" 236 | } 237 | }, 238 | "nbformat": 4, 239 | "nbformat_minor": 5 240 | } 241 | --------------------------------------------------------------------------------