├── docker ├── requirements.txt ├── app │ ├── config.ini │ ├── db-details.py │ ├── python_mysql_dbconfig.py │ ├── boto-ecs.py │ └── read-data-q.py ├── Dockerfile └── setup.sh ├── cdk ├── mwaa-cdk │ ├── requirements.txt │ ├── cdk.json │ ├── dags │ │ ├── sample-cdk-dag-od.py │ │ └── sample-cdk-dag.py │ ├── app.py │ └── mwaa_cdk │ │ ├── mwaa_cdk_backend.py │ │ └── mwaa_cdk_env.py └── ecs-cdk │ └── ecs-anywhere │ ├── cdk.context.json │ ├── requirements.txt │ ├── cdk.json │ ├── app.py │ └── ecs_anywhere │ ├── ecs_anywhere_vpc.py │ └── ecs_anywhere_taskdef.py ├── images ├── ecs-reg-1.png ├── ecs-reg-2.png ├── ecs-reg-3.png ├── ecs-reg-4.png ├── airflow-dag-1.png ├── airflow-dag-2.png ├── airflow-dag-3.png └── ricsue-airflow-hybrid.png ├── secrets └── tmp-elt.json ├── README.md ├── dag ├── ecs-external.py ├── ecs-hybrid-ec2.py ├── ecs-hybrid-external.py ├── ecs-hybrid-local.py ├── ecs-hybrid-boto3.py └── ecs-hybrid.py ├── LICENSE └── scripts └── create-ecs-task.py /docker/requirements.txt: -------------------------------------------------------------------------------- 1 | mysql-connector-python==8.0.28 2 | boto3 3 | 4 | -------------------------------------------------------------------------------- /docker/app/config.ini: -------------------------------------------------------------------------------- 1 | [mysql] 2 | host = 3 | database = 4 | user = 5 | password = 6 | -------------------------------------------------------------------------------- /cdk/mwaa-cdk/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | aws-cdk-lib==2.2.0 3 | constructs>=10.0.0,<11.0.0 4 | -------------------------------------------------------------------------------- /images/ecs-reg-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/094459/blogpost-airflow-hybrid/HEAD/images/ecs-reg-1.png -------------------------------------------------------------------------------- /images/ecs-reg-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/094459/blogpost-airflow-hybrid/HEAD/images/ecs-reg-2.png -------------------------------------------------------------------------------- /images/ecs-reg-3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/094459/blogpost-airflow-hybrid/HEAD/images/ecs-reg-3.png -------------------------------------------------------------------------------- /images/ecs-reg-4.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/094459/blogpost-airflow-hybrid/HEAD/images/ecs-reg-4.png -------------------------------------------------------------------------------- /images/airflow-dag-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/094459/blogpost-airflow-hybrid/HEAD/images/airflow-dag-1.png -------------------------------------------------------------------------------- /images/airflow-dag-2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/094459/blogpost-airflow-hybrid/HEAD/images/airflow-dag-2.png -------------------------------------------------------------------------------- /images/airflow-dag-3.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/094459/blogpost-airflow-hybrid/HEAD/images/airflow-dag-3.png -------------------------------------------------------------------------------- /images/ricsue-airflow-hybrid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/094459/blogpost-airflow-hybrid/HEAD/images/ricsue-airflow-hybrid.png -------------------------------------------------------------------------------- /secrets/tmp-elt.json: -------------------------------------------------------------------------------- 1 | { 2 | "username": "{db-user}", 3 | "password": "{db-user-secure password}", 4 | "host": "{db-host-address}", 5 | "database" : "{db}" 6 | } 7 | -------------------------------------------------------------------------------- /cdk/ecs-cdk/ecs-anywhere/cdk.context.json: -------------------------------------------------------------------------------- 1 | { 2 | "availability-zones:account=704533066374:region=eu-west-2": [ 3 | "eu-west-2a", 4 | "eu-west-2b", 5 | "eu-west-2c" 6 | ] 7 | } 8 | -------------------------------------------------------------------------------- /cdk/ecs-cdk/ecs-anywhere/requirements.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | aws-cdk-lib==2.2.0 3 | constructs>=10.0.0,<11.0.0 4 | #aws_cdk.aws_ecs 5 | #aws_cdk.aws_ec2 6 | #aws_cdk.aws_elasticloadbalancingv2 7 | #aws_cdk.aws_iam 8 | #aws_cdk.aws_efs 9 | #aws_cdk.core 10 | 11 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ### Using Apache Airflow to orchestrate hybrid workflows 2 | 3 | This repo contains the code for the blog post, [Using Apache Airflow to orchestrate hybrid workflows 4 | ](https://dev.to/aws/orchestrating-hybrid-workflows-using-amazon-managed-workflows-for-apache-airflow-mwaa-2boc) 5 | 6 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM public.ecr.aws/docker/library/python:latest 2 | 3 | WORKDIR /app 4 | 5 | COPY requirements.txt requirements.txt 6 | RUN pip3 install -r requirements.txt 7 | 8 | COPY . . 
9 | 
10 | #CMD [ "python3", "app/read-data-q.py"]
11 | ENTRYPOINT [ "python3", "app/read-data-q.py" ]
--------------------------------------------------------------------------------
/cdk/mwaa-cdk/cdk.json:
--------------------------------------------------------------------------------
1 | {
2 |   "app": "python app.py",
3 |   "context": {
4 |     "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": false,
5 |     "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": false,
6 |     "@aws-cdk/aws-rds:lowercaseDbIdentifier": false,
7 |     "@aws-cdk/core:stackRelativeExports": false
8 |   }
9 | }
10 | 
--------------------------------------------------------------------------------
/cdk/ecs-cdk/ecs-anywhere/cdk.json:
--------------------------------------------------------------------------------
1 | {
2 |   "app": "python app.py",
3 |   "context": {
4 |     "@aws-cdk/aws-apigateway:usagePlanKeyOrderInsensitiveId": false,
5 |     "@aws-cdk/aws-cloudfront:defaultSecurityPolicyTLSv1.2_2021": false,
6 |     "@aws-cdk/aws-rds:lowercaseDbIdentifier": false,
7 |     "@aws-cdk/core:stackRelativeExports": false
8 |   }
9 | }
10 | 
11 | 
--------------------------------------------------------------------------------
/cdk/mwaa-cdk/dags/sample-cdk-dag-od.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from airflow import DAG
3 | from airflow.operators.dummy_operator import DummyOperator
4 | from airflow.operators.python_operator import PythonOperator
5 | 
6 | def print_hello():
7 |     return 'Hello World'
8 | 
9 | dag = DAG('hello_world_ondemand', description='Hello world example', schedule_interval=None, start_date=datetime(2017, 3, 20), catchup=False)
10 | 
11 | dummy_operator = DummyOperator(task_id='dummy_task', retries = 3, dag=dag)
12 | 
13 | hello_operator = PythonOperator(task_id='hello_task', python_callable=print_hello, dag=dag)
14 | 
15 | dummy_operator >> hello_operator
16 | 
17 | 
--------------------------------------------------------------------------------
/cdk/mwaa-cdk/dags/sample-cdk-dag.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 | from airflow import DAG
3 | from airflow.operators.dummy_operator import DummyOperator
4 | from airflow.operators.python_operator import PythonOperator
5 | 
6 | def print_hello():
7 |     return 'Hello World'
8 | 
9 | dag = DAG('hello_world_schedule', description='Hello world example', schedule_interval='0 12 * * *', start_date=datetime(2017, 3, 20), catchup=False)
10 | 
11 | dummy_operator = DummyOperator(task_id='dummy_task', retries = 3, dag=dag)
12 | 
13 | hello_operator = PythonOperator(task_id='hello_task', python_callable=print_hello, dag=dag)
14 | 
15 | dummy_operator >> hello_operator
16 | 
17 | 
--------------------------------------------------------------------------------
/docker/app/db-details.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | import json
3 | 
4 | 
5 | def get_secret():
6 | 
7 |     secret_name = "rds-airflow-hybrid"
8 |     region_name = "eu-west-2"
9 | 
10 |     # Create a Secrets Manager client
11 |     session = boto3.session.Session()
12 |     client = session.client(
13 |         service_name='secretsmanager',
14 |         region_name=region_name
15 |     )
16 |     get_secret_value_response = client.get_secret_value(SecretId=secret_name)
17 |     info=json.loads(get_secret_value_response['SecretString'])
18 |     pw=info['password']
19 |     un=info['username']
20 |     hs=info['host']
21 |     db=info['database']
22 | 
23 | 
get_secret() 24 | 25 | -------------------------------------------------------------------------------- /dag/ecs-external.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from datetime import datetime, timedelta 3 | from airflow.providers.amazon.aws.operators.ecs import ECSOperator 4 | 5 | 6 | default_args = { 7 | 'owner': 'ubuntu', 8 | 'start_date': datetime(2019, 8, 14), 9 | 'retry_delay': timedelta(seconds=60*60) 10 | } 11 | 12 | with DAG('airflow_dag_test_external', catchup=False, default_args=default_args, schedule_interval=None) as dag: 13 | test = ECSOperator( 14 | task_id="test", 15 | dag=dag, 16 | cluster="test-hybrid", 17 | task_definition="test-external", 18 | launch_type="EXTERNAL", 19 | overrides={ 20 | "containerOverrides": [ ], 21 | }, 22 | awslogs_group="/ecs/test-external", 23 | awslogs_stream_prefix="ecs", 24 | ) 25 | 26 | test -------------------------------------------------------------------------------- /dag/ecs-hybrid-ec2.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from datetime import datetime, timedelta 3 | from airflow.providers.amazon.aws.operators.ecs import ECSOperator 4 | 5 | 6 | default_args = { 7 | 'owner': 'ubuntu', 8 | 'start_date': datetime(2019, 8, 14), 9 | 'retry_delay': timedelta(seconds=60*60) 10 | } 11 | 12 | with DAG('hybrid_airflow_ec2_dag', catchup=False, default_args=default_args, schedule_interval=None) as dag: 13 | 14 | cloudquery = ECSOperator( 15 | task_id="cloudquery", 16 | dag=dag, 17 | cluster="hybrid-airflow-cluster", 18 | task_definition="apache-airflow", 19 | overrides={ }, 20 | launch_type="EC2", 21 | awslogs_group="/ecs/hybrid-airflow", 22 | awslogs_stream_prefix="ecs/Hybrid-ELT-TaskDef" 23 | ) 24 | 25 | cloudquery 26 | -------------------------------------------------------------------------------- /dag/ecs-hybrid-external.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from datetime import datetime, timedelta 3 | from airflow.providers.amazon.aws.operators.ecs import ECSOperator 4 | 5 | 6 | default_args = { 7 | 'owner': 'ubuntu', 8 | 'start_date': datetime(2019, 8, 14), 9 | 'retry_delay': timedelta(seconds=60*60) 10 | } 11 | 12 | with DAG('hybrid_airflow_external_dag', catchup=False, default_args=default_args, schedule_interval=None) as dag: 13 | 14 | remotequery = ECSOperator( 15 | task_id="remotequery", 16 | dag=dag, 17 | cluster="hybrid-airflow-cluster", 18 | task_definition="apache-airflow", 19 | overrides={ }, 20 | launch_type="EXTERNAL", 21 | awslogs_group="/ecs/hybrid-airflow", 22 | awslogs_stream_prefix="ecs/Hybrid-ELT-TaskDef" 23 | ) 24 | 25 | remotequery 26 | -------------------------------------------------------------------------------- /docker/app/python_mysql_dbconfig.py: -------------------------------------------------------------------------------- 1 | from configparser import ConfigParser 2 | 3 | 4 | def read_db_config(filename='config.ini', section='mysql'): 5 | """ Read database configuration file and return a dictionary object 6 | :param filename: name of the configuration file 7 | :param section: section of database configuration 8 | :return: a dictionary of database parameters 9 | """ 10 | # create parser and read ini configuration file 11 | parser = ConfigParser() 12 | parser.read(filename) 13 | 14 | # get section, default to mysql 15 | db = {} 16 | if parser.has_section(section): 17 | items = 
parser.items(section) 18 | for item in items: 19 | db[item[0]] = item[1] 20 | else: 21 | raise Exception('{0} not found in the {1} file'.format(section, filename)) 22 | 23 | return db -------------------------------------------------------------------------------- /cdk/mwaa-cdk/app.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | #!/usr/bin/env python3 4 | 5 | import aws_cdk as cdk 6 | 7 | from mwaa_cdk.mwaa_cdk_backend import MwaaCdkStackBackend 8 | from mwaa_cdk.mwaa_cdk_env import MwaaCdkStackEnv 9 | 10 | env_EU=cdk.Environment(region="eu-central-1", account="704533066374") 11 | mwaa_props = {'dagss3location': '094459-airflow-hybrid-demo','mwaa_env' : 'mwaa-hybrid-demo'} 12 | 13 | app = cdk.App() 14 | 15 | mwaa_hybrid_backend = MwaaCdkStackBackend( 16 | scope=app, 17 | id="mwaa-hybrid-backend", 18 | env=env_EU, 19 | mwaa_props=mwaa_props 20 | ) 21 | 22 | mwaa_hybrid_env = MwaaCdkStackEnv( 23 | scope=app, 24 | id="mwaa-hybrid-environment", 25 | vpc=mwaa_hybrid_backend.vpc, 26 | env=env_EU, 27 | mwaa_props=mwaa_props 28 | ) 29 | 30 | app.synth() 31 | -------------------------------------------------------------------------------- /cdk/ecs-cdk/ecs-anywhere/app.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | #!/usr/bin/env python3 4 | 5 | #from aws_cdk import core 6 | import aws_cdk as cdk 7 | 8 | from ecs_anywhere.ecs_anywhere_vpc import EcsAnywhereVPCStack 9 | from ecs_anywhere.ecs_anywhere_taskdef import EcsAnywhereTaskDefStack 10 | 11 | env_EU=cdk.Environment(region="eu-west-2", account="704533066374") 12 | props = { 13 | 'ecsclustername':'hybrid-airflow', 14 | 'ecstaskdef':'demo-hybrid-airflow', 15 | 'ecr-repo': 'hybrid-airflow', 16 | 'image-tag' : 'airflw-amd64', 17 | 'awsvpccidr':'10.0.0.0/16', 18 | 's3':'094459-hybrid-airflow' 19 | } 20 | 21 | app = cdk.App() 22 | 23 | mydc_vpc = EcsAnywhereVPCStack( 24 | scope=app, 25 | id="ecs-anywhere-vpc", 26 | env=env_EU, 27 | props=props 28 | ) 29 | 30 | mydc_ecs_cicd = EcsAnywhereTaskDefStack( 31 | scope=app, 32 | id="ecs-anywhere-taskdef", 33 | env=env_EU, 34 | vpc=mydc_vpc.vpc, 35 | props=props 36 | ) 37 | 38 | app.synth() -------------------------------------------------------------------------------- /dag/ecs-hybrid-local.py: -------------------------------------------------------------------------------- 1 | from airflow import DAG 2 | from datetime import datetime, timedelta 3 | from airflow.providers.amazon.aws.operators.ecs import ECSOperator 4 | 5 | 6 | default_args = { 7 | 'owner': 'ubuntu', 8 | 'start_date': datetime(2019, 8, 14), 9 | 'retry_delay': timedelta(seconds=60*60) 10 | } 11 | 12 | with DAG('hybrid_airflow_local_dag', catchup=False, default_args=default_args, schedule_interval=None) as dag: 13 | 14 | localquery = ECSOperator( 15 | task_id="localquery", 16 | dag=dag, 17 | cluster="hybrid-airflow-cluster", 18 | task_definition="apache-airflow", 19 | overrides={ "containerOverrides": [ 20 | { 21 | "name": "Hybrid-ELT-TaskDef", 22 | "command" : [ "ricsue-airflow-hybrid","period1/region-data.csv", "select * from customers WHERE country = \"Spain\"", "mydc-airflow-hybrid","eu-west-2" ]} 23 | ] }, 24 | launch_type="EXTERNAL", 25 | awslogs_group="/ecs/hybrid-airflow", 26 | awslogs_stream_prefix="ecs/Hybrid-ELT-TaskDef" 27 | ) 28 | 29 | 
localquery 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 094459 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /cdk/mwaa-cdk/mwaa_cdk/mwaa_cdk_backend.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | from aws_cdk import ( 4 | aws_iam as iam, 5 | aws_ec2 as ec2, 6 | Stack, 7 | CfnOutput 8 | ) 9 | from constructs import Construct 10 | 11 | class MwaaCdkStackBackend(Stack): 12 | 13 | def __init__(self, scope: Construct, id: str, mwaa_props, **kwargs) -> None: 14 | super().__init__(scope, id, **kwargs) 15 | 16 | # Create VPC network 17 | 18 | self.vpc = ec2.Vpc( 19 | self, 20 | id="MWAA-Hybrid-ApacheAirflow-VPC", 21 | cidr="10.192.0.0/16", 22 | max_azs=2, 23 | nat_gateways=1, 24 | subnet_configuration=[ 25 | ec2.SubnetConfiguration( 26 | name="public", cidr_mask=24, 27 | reserved=False, subnet_type=ec2.SubnetType.PUBLIC), 28 | ec2.SubnetConfiguration( 29 | name="private", cidr_mask=24, 30 | reserved=False, subnet_type=ec2.SubnetType.PRIVATE_WITH_NAT) 31 | ], 32 | enable_dns_hostnames=True, 33 | enable_dns_support=True 34 | ) 35 | 36 | 37 | CfnOutput( 38 | self, 39 | id="VPCId", 40 | value=self.vpc.vpc_id, 41 | description="VPC ID", 42 | export_name=f"{self.region}:{self.account}:{self.stack_name}:vpc-id" 43 | ) 44 | 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /scripts/create-ecs-task.py: -------------------------------------------------------------------------------- 1 | #Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | #SPDX-License-Identifier: Apache-2.0 3 | 4 | import boto3 5 | import json 6 | 7 | 8 | client = boto3.client("ecs", region_name="eu-west-2") 9 | 10 | def create_task(): 11 | response = client.register_task_definition( 12 | containerDefinitions=[ 13 | { 14 | "name": "airflow-hybrid-boto3", 15 | "image": "public.ecr.aws/a4b5h6u6/beachgeek:latest", 16 | "cpu": 0, 17 | "portMappings": [], 18 | "essential": True, 19 | "environment": [], 20 | "mountPoints": [], 21 | "volumesFrom": [], 22 | "command": ["ricsue-airflow-hybrid","period1/temp.csv", "select * from customers WHERE location = \"Spain\"", "rds-airflow-hybrid","eu-west-2"], 23 | "logConfiguration": { 24 | "logDriver": "awslogs", 25 | "options": { 26 | "awslogs-group": "/ecs/test-external", 27 | "awslogs-region": "eu-west-2", 28 | "awslogs-stream-prefix": "ecs" 29 | } 30 | } 31 | } 32 | ], 33 | taskRoleArn="arn:aws:iam::704533066374:role/ecsTaskExecutionRole", 34 | executionRoleArn="arn:aws:iam::704533066374:role/ecsTaskExecutionRole", 35 | family= "test-external", 36 | networkMode="HOST", 37 | requiresCompatibilities= [ 38 | "EXTERNAL" 39 | ], 40 | cpu= "256", 41 | memory= "512") 42 | -------------------------------------------------------------------------------- /cdk/ecs-cdk/ecs-anywhere/ecs_anywhere/ecs_anywhere_vpc.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | from aws_cdk import ( 4 | aws_iam as iam, 5 | aws_ec2 as ec2, 6 | Stack, 7 | CfnOutput 8 | ) 9 | from constructs import Construct 10 | 11 | class EcsAnywhereVPCStack(Stack): 12 | 13 | def __init__(self, scope: Construct, id: str, props, **kwargs) -> None: 14 | super().__init__(scope, id, **kwargs) 15 | 16 | # Create VPC networking environment 17 | # Public subnet/ "private"(isolated) subnet 18 | # Customer Gateway and Virtual Private Gateway 19 | # Site to Site VPN Connection 20 | 21 | self.vpc = ec2.Vpc( 22 | self, 23 | id="mydc-vpn-vpc", 24 | cidr=f"{props['awsvpccidr']}", 25 | nat_gateways=1, 26 | subnet_configuration=[ 27 | ec2.SubnetConfiguration( 28 | name="public", cidr_mask=24, 29 | reserved=False, subnet_type=ec2.SubnetType.PUBLIC), 30 | ec2.SubnetConfiguration( 31 | name="private", cidr_mask=24, 32 | #reserved=False, subnet_type=ec2.SubnetType.ISOLATED) 33 | reserved=False, subnet_type=ec2.SubnetType.PRIVATE_WITH_NAT) 34 | ], 35 | max_azs=2, 36 | enable_dns_hostnames=True, 37 | enable_dns_support=True, 38 | vpn_gateway=False 39 | ) 40 | CfnOutput( 41 | self, 42 | id="VPCId", 43 | value=self.vpc.vpc_id, 44 | description="VPC ID", 45 | export_name=f"{self.region}:{self.account}:{self.stack_name}:vpc-id" 46 | ) 47 | 48 | 49 | -------------------------------------------------------------------------------- /docker/setup.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Change these values for your own environment 4 | # it should match what values you use in the CDK app 5 | # if you are using this script together to deploy 6 | # the multi-arch demo 7 | 8 | AWS_DEFAULT_REGION=eu-west-2 9 | AWS_ACCOUNT=704533066374 10 | AWS_ECR_REPO=hybrid-airflow 11 | COMMIT_HASH="airflw" 12 | 13 | # You can alter these values, but the defaults will work for any environment 14 | 15 | IMAGE_TAG=${COMMIT_HASH:=latest} 16 | AMD_TAG=${COMMIT_HASH}-amd64 17 | DOCKER_CLI_EXPERIMENTAL=enabled 18 | 
REPOSITORY_URI=$AWS_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com/$AWS_ECR_REPO 19 | 20 | # Login to ECR 21 | # Old deprecated 22 | # $(aws ecr get-login --region $AWS_DEFAULT_REGION --no-include-email) 23 | aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin $AWS_ACCOUNT.dkr.ecr.$AWS_DEFAULT_REGION.amazonaws.com 24 | 25 | # create AWS ECR Repo 26 | 27 | if (aws ecr describe-repositories --repository-names $AWS_ECR_REPO ) then 28 | echo "Skipping the create repo as already exists" 29 | else 30 | echo "Creating repos as it does not exists" 31 | aws ecr create-repository --region $AWS_DEFAULT_REGION --repository-name $AWS_ECR_REPO 32 | fi 33 | 34 | # Build initial image and upload to ECR Repo 35 | 36 | docker build -t $REPOSITORY_URI:latest . 37 | docker tag $REPOSITORY_URI:latest $REPOSITORY_URI:$AMD_TAG 38 | docker push $REPOSITORY_URI:$AMD_TAG 39 | 40 | # Create the image manifests and upload to ECR 41 | 42 | docker manifest create $REPOSITORY_URI:$COMMIT_HASH $REPOSITORY_URI:$AMD_TAG 43 | docker manifest annotate --arch amd64 $REPOSITORY_URI:$COMMIT_HASH $REPOSITORY_URI:$AMD_TAG 44 | docker manifest inspect $REPOSITORY_URI:$COMMIT_HASH 45 | docker manifest push $REPOSITORY_URI:$COMMIT_HASH 46 | -------------------------------------------------------------------------------- /docker/app/boto-ecs.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | 4 | # Thanks to https://hands-on.cloud/working-with-ecs-in-python-using-boto3/ for a good cheatsheet 5 | 6 | client = boto3.client("ecs", region_name="eu-west-2") 7 | 8 | ## create a new task in ecs 9 | 10 | response = client.register_task_definition( 11 | containerDefinitions=[ 12 | { 13 | "name": "airflow-hybrid-boto3", 14 | "image": "public.ecr.aws/a4b5h6u6/beachgeek:latest", 15 | "cpu": 0, 16 | "portMappings": [], 17 | "essential": True, 18 | "environment": [], 19 | "mountPoints": [], 20 | "volumesFrom": [], 21 | "command": ["ricsue-airflow-hybrid","period1/temp.csv", "select * from customers WHERE location = \"China\"", "rds-airflow-hybrid","eu-west-2"], 22 | "logConfiguration": { 23 | "logDriver": "awslogs", 24 | "options": { 25 | "awslogs-group": "/ecs/test-external", 26 | "awslogs-region": "eu-west-2", 27 | "awslogs-stream-prefix": "ecs" 28 | } 29 | } 30 | } 31 | ], 32 | taskRoleArn="arn:aws:iam::704533066374:role/ecsTaskExecutionRole", 33 | #taskDefinitionArn="arn:aws:ecs:eu-west-2:704533066374:task-definition/test-external:5", 34 | executionRoleArn="arn:aws:iam::704533066374:role/ecsTaskExecutionRole", 35 | family= "test-external", 36 | networkMode="bridge", 37 | requiresCompatibilities= [ 38 | "EXTERNAL" 39 | ], 40 | cpu= "256", 41 | memory= "512") 42 | 43 | print(json.dumps(response, indent=4, default=str)) 44 | 45 | 46 | # it will automatically use the latest version 47 | # ideally you do not want this as this might impact idempotency 48 | # so configure an explict version 49 | 50 | new_taskdef=json.dumps(response['taskDefinition']['revision'], indent=4, default=str) 51 | print("TaskDef is now at :" + str(new_taskdef)) 52 | 53 | 54 | 55 | #run task 56 | # explicity set taskdef 57 | 58 | response2 = client.run_task( 59 | cluster='test-hybrid', 60 | count=1, 61 | launchType='EXTERNAL', 62 | taskDefinition='test-external:{taskdef}'.format(taskdef=new_taskdef) 63 | ) 64 | 65 | print(json.dumps(response2, indent=4, default=str)) -------------------------------------------------------------------------------- 
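Note: docker/app/boto-ecs.py above registers the task definition and starts the task, but exits without checking the outcome. If you want the script to also wait for the run to finish and surface the container exit code, a minimal sketch along these lines could be appended to it. It assumes the client, the test-hybrid cluster and the response2 object created above, and uses the standard boto3 ECS tasks_stopped waiter and describe_tasks call:

# Optional follow-up: block until the task stops, then report the container exit code.
# run_task can return an empty 'tasks' list (with 'failures' populated), so check first.
if response2['tasks']:
    task_arn = response2['tasks'][0]['taskArn']
    waiter = client.get_waiter('tasks_stopped')
    waiter.wait(cluster='test-hybrid', tasks=[task_arn])
    stopped = client.describe_tasks(cluster='test-hybrid', tasks=[task_arn])
    print("Task stopped with exit code:", stopped['tasks'][0]['containers'][0].get('exitCode'))
else:
    print("Task failed to start:", response2['failures'])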
/dag/ecs-hybrid-boto3.py: -------------------------------------------------------------------------------- 1 | #Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | #SPDX-License-Identifier: Apache-2.0 3 | 4 | from airflow import DAG 5 | from datetime import datetime, timedelta 6 | from airflow.operators.python import PythonOperator 7 | import boto3 8 | import json 9 | 10 | default_args = { 11 | 'owner': 'ubuntu', 12 | 'start_date': datetime(2019, 8, 14), 13 | 'retry_delay': timedelta(seconds=60*60) 14 | } 15 | 16 | # Grab variables - fure improvement 17 | 18 | #region 19 | #taskRoleArn 20 | #executionRoleArn 21 | #family 22 | #awslogs-group 23 | #awslogs-stream-prefix 24 | #task-name 25 | #container-image 26 | #command 27 | #cluster 28 | 29 | 30 | client = boto3.client("ecs", region_name="eu-west-2") 31 | 32 | # Function that will take variables and create our new ECS Task Definition 33 | def create_task(ti): 34 | response = client.register_task_definition( 35 | containerDefinitions=[ 36 | { 37 | "name": "airflow-hybrid-boto3", 38 | "image": "public.ecr.aws/xxx/xxx:latest", 39 | "cpu": 0, 40 | "portMappings": [], 41 | "essential": True, 42 | "environment": [], 43 | "mountPoints": [], 44 | "volumesFrom": [], 45 | "command": ["ricsue-airflow-hybrid","period1/temp.csv", "select * from customers WHERE location = \"Spain\"", "rds-airflow-hybrid","eu-west-2"], 46 | "logConfiguration": { 47 | "logDriver": "awslogs", 48 | "options": { 49 | "awslogs-group": "/ecs/test-external", 50 | "awslogs-region": "eu-west-2", 51 | "awslogs-stream-prefix": "ecs" 52 | } 53 | } 54 | } 55 | ], 56 | taskRoleArn="arn:aws:iam::xxx:role/ecsTaskExecutionRole", 57 | executionRoleArn="arn:aws:iam::xxxx:role/ecsTaskExecutionRole", 58 | family= "test-external", 59 | networkMode="bridge", 60 | requiresCompatibilities= [ 61 | "EXTERNAL" 62 | ], 63 | cpu= "256", 64 | memory= "512") 65 | 66 | # we now need to store the version of the new task so we can ensure idemopotency 67 | 68 | new_taskdef=json.dumps(response['taskDefinition']['revision'], indent=4, default=str) 69 | print("TaskDef is now at :" + str(new_taskdef)) 70 | return new_taskdef 71 | 72 | # Function that will run our ECS Task 73 | def run_task(ti): 74 | #new_taskdef=ti.xcom_pull(key='new_taskdef', task_ids=['create_taskdef'][0]) 75 | new_taskdef=ti.xcom_pull(task_ids=['create_taskdef'][0]) 76 | print("TaskDef passed is :" + str(new_taskdef)) 77 | response2 = client.run_task( 78 | cluster='test-hybrid', 79 | count=1, 80 | launchType='EXTERNAL', 81 | taskDefinition='test-external:{taskdef}'.format(taskdef=new_taskdef) 82 | ) 83 | 84 | with DAG('airflow_ecsanywhere_boto3', catchup=False, default_args=default_args, schedule_interval=None) as dag: 85 | first_task=PythonOperator(task_id='create_taskdef', python_callable=create_task, provide_context=True, dag=dag) 86 | second_task=PythonOperator(task_id='run_task', python_callable=run_task, provide_context=True, dag=dag) 87 | 88 | first_task >> second_task -------------------------------------------------------------------------------- /dag/ecs-hybrid.py: -------------------------------------------------------------------------------- 1 | #Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | #SPDX-License-Identifier: Apache-2.0 3 | 4 | from airflow import DAG 5 | from datetime import datetime, timedelta 6 | from airflow.providers.amazon.aws.operators.ecs import ECSOperator 7 | from airflow.operators.python import PythonOperator 8 | import boto3 9 | import json 10 | 11 | 12 | default_args = { 13 | 'owner': 'ubuntu', 14 | 'start_date': datetime(2019, 8, 14), 15 | 'retry_delay': timedelta(seconds=60*60) 16 | } 17 | 18 | # Function that will take variables and create our new ECS Task Definition 19 | def create_task(ti): 20 | client = boto3.client("ecs", region_name="eu-west-2") 21 | response = client.register_task_definition( 22 | containerDefinitions=[ 23 | { 24 | "name": "airflow-hybrid-demo", 25 | "image": "public.ecr.aws/xx/xx:latest", 26 | "cpu": 0, 27 | "portMappings": [], 28 | "essential": True, 29 | "environment": [], 30 | "mountPoints": [], 31 | "volumesFrom": [], 32 | "command": ["ricsue-airflow-hybrid","period1/hq-data.csv", "select * from customers WHERE location = \"Spain\"", "rds-airflow-hybrid","eu-west-2"], 33 | "logConfiguration": { 34 | "logDriver": "awslogs", 35 | "options": { 36 | "awslogs-group": "/ecs/test-external", 37 | "awslogs-region": "eu-west-2", 38 | "awslogs-stream-prefix": "ecs" 39 | } 40 | } 41 | } 42 | ], 43 | taskRoleArn="arn:aws:iam::xx:role/ecsTaskExecutionRole", 44 | executionRoleArn="arn:aws:iam::xx:role/ecsTaskExecutionRole", 45 | family= "test-external", 46 | networkMode="host", 47 | requiresCompatibilities= [ 48 | "EXTERNAL" 49 | ], 50 | cpu= "256", 51 | memory= "512") 52 | 53 | # we now need to store the version of the new task so we can ensure idemopotency 54 | 55 | new_taskdef=json.dumps(response['taskDefinition']['revision'], indent=4, default=str) 56 | print("TaskDef is now at :" + str(new_taskdef)) 57 | return new_taskdef 58 | 59 | 60 | with DAG('hybrid_airflow_dag_test', catchup=False, default_args=default_args, schedule_interval=None) as dag: 61 | create_taskdef = PythonOperator( 62 | task_id='create_taskdef', 63 | provide_context=True, 64 | python_callable=create_task, 65 | dag=dag 66 | ) 67 | 68 | cloudquery = ECSOperator( 69 | task_id="cloudquery", 70 | dag=dag, 71 | cluster="test-hybrid", 72 | task_definition="test-external", 73 | overrides={ }, 74 | launch_type="EC2", 75 | awslogs_group="/ecs/test-external", 76 | awslogs_stream_prefix="ecs" 77 | ) 78 | 79 | # switch between these to change between remote and local MySQL 80 | # "command" : [ "ricsue-airflow-hybrid","period1/region-data.csv", "select * from customers WHERE location = \"Poland\"", "rds-airflow-hybrid","eu-west-2" ]} 81 | # "command" : [ "ricsue-airflow-hybrid","period1/region-data.csv", "select * from regionalcustomers WHERE country = \"Poland\"", "localmysql-airflow-hybrid","eu-west-2" ]} 82 | 83 | remotequery = ECSOperator( 84 | task_id="remotequery", 85 | dag=dag, 86 | cluster="test-hybrid", 87 | task_definition="test-external", 88 | launch_type="EXTERNAL", 89 | overrides={ "containerOverrides": [ 90 | { 91 | "name": "airflow-hybrid-demo", 92 | "command" : [ "ricsue-airflow-hybrid","period1/region-data.csv", "select * from regionalcustomers WHERE country = \"Poland\"", "localmysql-airflow-hybrid","eu-west-2" ]} 93 | ] }, 94 | awslogs_group="/ecs/test-external", 95 | awslogs_stream_prefix="ecs", 96 | ) 97 | 98 | create_taskdef >> cloudquery 99 | create_taskdef >> remotequery -------------------------------------------------------------------------------- /docker/app/read-data-q.py: 
--------------------------------------------------------------------------------
1 | from copy import copy
2 | from mysql.connector import MySQLConnection, Error
3 | from python_mysql_dbconfig import read_db_config
4 | import sys
5 | import csv
6 | import boto3
7 | import json
8 | import socket
9 | def query_with_fetchone(query2run,secret,region):
10 |     try:
11 |         # Grab MySQL connection and database settings. We are using AWS Secrets Manager
12 |         # but you could use another service like Hashicorp Vault
13 |         # We cannot use Apache Airflow to store these as this script runs standalone
14 |         secret_name = secret
15 |         region_name = region
16 |         session = boto3.session.Session()
17 |         client = session.client(
18 |             service_name='secretsmanager',
19 |             region_name=region_name
20 |         )
21 |         get_secret_value_response = client.get_secret_value(SecretId=secret_name)
22 |         info=json.loads(get_secret_value_response['SecretString'])
23 |         pw=info['password']
24 |         un=info['username']
25 |         hs=info['host']
26 |         db=info['database']
27 |         # Output to the log so we can see and confirm WHERE we are running and WHAT
28 |         # we are connecting to
29 | 
30 |         print("Connecting to ",str(hs)," database ", str(db), " as user ", str(un))
31 |         print("Database host IP is :", socket.gethostbyname(hs))
32 |         print("Source IP is ", socket.gethostname())
33 | 
34 |         conn = MySQLConnection(user=un, password=pw, host=hs, database=db)
35 |         cursor = conn.cursor()
36 |         query=query2run
37 |         print("Query is", str(query))
38 |         cursor.execute(query)
39 |         records = cursor.fetchall()
40 |         c = csv.writer(open("temp.csv","w"))
41 |         c.writerows(records)
42 |         print("Records exported:")
43 |         for row in records:
44 |             print(row[0],",",row[1],",",row[2],",",row[3],",",row[4],",",row[5], ",",row[6],",",row[7] )
45 | 
46 |     except Error as e:
47 |         print(e)
48 |         sys.exit(1)
49 | 
50 |     finally:
51 |         cursor.close()
52 |         conn.close()
53 | def upload_to_s3(s3bucket,s3folder,region):
54 |     # We will upload the temp (temp.csv) file and copy it based on the input params of the script (bucket and dir/file)
55 |     try:
56 |         s3 = boto3.client('s3', region_name=region)
57 |         s3.upload_file('temp.csv',s3bucket,s3folder)
58 |     except FileNotFoundError:
59 |         print("The file was not found")
60 |         return False
61 |     except Error as e:
62 |         print(e)
63 |         sys.exit(1)
64 | 
65 | if __name__ == '__main__':
66 |     # All five arguments listed below are required
67 |     if len(sys.argv) < 6:
68 |         raise SystemExit(f"Usage: {sys.argv[0]} S3_BUCKET S3_FOLDER QUERY SECRET_NAME REGION")
69 | 
70 |     # The script needs the following arguments to run
71 |     # 1. Target S3 bucket where the output of the SQL script will be copied
72 |     # 2. Target S3 folder/filename
73 |     # 3. The query to execute
74 |     # 4. The parameter store (we use AWS Secrets) which holds the values on where to find the MySQL database
75 |     # 5. 
The AWS region 76 | s3bucket=sys.argv[1] 77 | s3folder=sys.argv[2] 78 | query2run=sys.argv[3] 79 | secret=sys.argv[4] 80 | region=sys.argv[5] 81 | query_with_fetchone(query2run,secret,region) 82 | upload_to_s3(s3bucket,s3folder,region) 83 | 84 | # demo command to test this from the cli 85 | # for Cloud based MySQL 86 | # python app/read-data-q.py ricsue-airflow-hybrid period1/temp.csv "select * from customers WHERE location = 'Poland' AND (date BETWEEN '2022-01-01 14:15:55' AND '2022-09-29 10:15:55')" rds-airflow-hybrid eu-west-2 87 | # for local/remote based MySQL 88 | # python app/read-data-q.py ricsue-airflow-hybrid period1/temp2.csv "select * from customers WHERE location = 'China' AND (date BETWEEN '2022-01-01 14:15:55' AND '2022-09-29 10:15:55')" localmysql-airflow-hybrid eu-west-2 89 | # other queries you can try, for example 90 | # "select * from customers WHERE location = '{country}' AND (date BETWEEN '{start}' AND '{end}')".format(country=country,start=start,end=end) 91 | 92 | 93 | 94 | -------------------------------------------------------------------------------- /cdk/ecs-cdk/ecs-anywhere/ecs_anywhere/ecs_anywhere_taskdef.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import ( 5 | aws_iam as iam, 6 | aws_ecs as ecs, 7 | aws_ec2 as ec2, 8 | aws_ecr as ecr, 9 | aws_logs as log, 10 | aws_s3 as s3, 11 | aws_autoscaling as autoscaling, 12 | Stack, 13 | CfnOutput 14 | ) 15 | from constructs import Construct 16 | 17 | class EcsAnywhereTaskDefStack(Stack): 18 | 19 | def __init__(self, scope: Construct, id: str, vpc, props, **kwargs) -> None: 20 | super().__init__(scope, id, **kwargs) 21 | 22 | airflow_repo = ecr.Repository.from_repository_name(self, "Hybrid-ELT-Repo", repository_name=f"{props['ecr-repo']}") 23 | airflow_image = ecs.ContainerImage.from_ecr_repository(airflow_repo, f"{props['image-tag']}") 24 | 25 | ecscluster_role = iam.Role( 26 | self, 27 | f"{props['ecsclustername']}-ecsrole", 28 | role_name=f"{props['ecsclustername']}-ECSInstanceRole", 29 | assumed_by=iam.ServicePrincipal("ssm.amazonaws.com"), 30 | managed_policies=[iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSSMManagedInstanceCore")] 31 | ) 32 | ecsfix = ecscluster_role.node.default_child 33 | ecsfix.add_property_override( 34 | "AssumeRolePolicyDocument.Statement.0.Principal.Service", "ssm.amazonaws.com" 35 | ) 36 | ecscluster_role.add_managed_policy(iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AmazonEC2ContainerServiceforEC2Role")) 37 | 38 | ecscluster = ecs.Cluster( 39 | self, 40 | f"{props['ecsclustername']}-ecscluster", 41 | cluster_name=f"{props['ecsclustername']}-cluster", 42 | vpc=vpc 43 | ) 44 | 45 | ecscluster.add_capacity( 46 | "x86AutoScalingGroup", 47 | instance_type=ec2.InstanceType("t2.xlarge"), 48 | desired_capacity=1 49 | ) 50 | 51 | data_lake = s3.Bucket.from_bucket_name(self, "DataLake", f"{props['s3']}") 52 | data_lake_arn = data_lake.bucket_arn 53 | 54 | task_def_policy_document = iam.PolicyDocument( 55 | statements=[ 56 | iam.PolicyStatement( 57 | actions=[ "s3:*" ], 58 | effect=iam.Effect.ALLOW, 59 | resources=[ 60 | f"{data_lake_arn}/*", 61 | f"{data_lake_arn}" 62 | ], 63 | ), 64 | iam.PolicyStatement( 65 | actions=[ 66 | "ecs:RunTask", 67 | "ecs:DescribeTasks", 68 | "ecs:RegisterTaskDefinition", 69 | "ecs:DescribeTaskDefinition", 70 | "ecs:ListTasks", 71 | "ecs:StopTask" 72 | ], 73 | 
effect=iam.Effect.ALLOW, 74 | resources=[ 75 | "*" 76 | ], 77 | ), 78 | iam.PolicyStatement( 79 | actions=[ 80 | "iam:PassRole" 81 | ], 82 | effect=iam.Effect.ALLOW, 83 | resources=[ "*" ], 84 | conditions= { "StringLike": { "iam:PassedToService": "ecs-tasks.amazonaws.com" } }, 85 | ), 86 | iam.PolicyStatement( 87 | actions=[ 88 | "logs:CreateLogStream", 89 | "logs:CreateLogGroup", 90 | "logs:PutLogEvents", 91 | "logs:GetLogEvents", 92 | "logs:GetLogRecord", 93 | "logs:GetLogGroupFields", 94 | "logs:GetQueryResults" 95 | ], 96 | effect=iam.Effect.ALLOW, 97 | resources=[ 98 | f"arn:aws:logs:*:*:log-group:/ecs/{props['ecsclustername']}:log-stream:/ecs/*" 99 | ] 100 | ) 101 | ] 102 | ) 103 | 104 | task_def_policy_document_role = iam.Role( 105 | self, 106 | "ECSTaskDefRole", 107 | role_name=f"{props['ecsclustername']}-ECSTaskDefRole", 108 | assumed_by=iam.ServicePrincipal("ecs-tasks.amazonaws.com"), 109 | inline_policies={"ECSTaskDefPolicyDocument": task_def_policy_document} 110 | ) 111 | 112 | managed_secret_manager_policy = iam.ManagedPolicy.from_aws_managed_policy_name("SecretsManagerReadWrite") 113 | task_def_policy_document_role.add_managed_policy(managed_secret_manager_policy) 114 | 115 | external_task_def_policy_document_role = iam.Role( 116 | self, 117 | "ExternalECSAnywhereRole", 118 | role_name=f"{props['ecsclustername']}-ExternalECSAnywhereRole", 119 | assumed_by=iam.ServicePrincipal("ssm.amazonaws.com") 120 | ) 121 | 122 | extfix = external_task_def_policy_document_role.node.default_child 123 | extfix.add_property_override( 124 | "AssumeRolePolicyDocument.Statement.0.Principal.Service", "ssm.amazonaws.com" 125 | ) 126 | 127 | external_managed_SSM_policy = iam.ManagedPolicy.from_aws_managed_policy_name("AmazonSSMManagedInstanceCore") 128 | external_managed_ECS_policy = iam.ManagedPolicy.from_aws_managed_policy_name("service-role/AmazonEC2ContainerServiceforEC2Role") 129 | external_task_def_policy_document_role.add_managed_policy(external_managed_SSM_policy) 130 | external_task_def_policy_document_role.add_managed_policy(external_managed_ECS_policy) 131 | 132 | 133 | log_group = log.LogGroup( 134 | self, 135 | "LogGroup", 136 | log_group_name=f"/ecs/{props['ecsclustername']}" 137 | ) 138 | ec2_task_definition = ecs.Ec2TaskDefinition( 139 | self, 140 | f"{props['ecsclustername']}-ApacheAirflowTaskDef", 141 | family=f"{props['ecstaskdef']}", 142 | network_mode=ecs.NetworkMode.HOST, 143 | task_role=task_def_policy_document_role 144 | ) 145 | 146 | ## For the purpose of the demo, these values are coded here. 
If you were doing 147 | # this properly you would separate these out and make it more re-usable 148 | 149 | ec2_task_definition.add_container( 150 | "Hybrid-ELT-TaskDef", 151 | image=airflow_image, 152 | memory_limit_mib=1024, 153 | cpu=100, 154 | # Configure CloudWatch logging 155 | logging=ecs.LogDrivers.aws_logs(stream_prefix="ecs",log_group=log_group), 156 | essential=True, 157 | command= [ "094459-hybrid-airflow", "hybrid/hq-data.csv", "select * from customers WHERE country = \"Romania\"", "rds-airflow-hybrid", "eu-west-2" ], 158 | ) 159 | 160 | CfnOutput( 161 | self, 162 | id="ECSClusterName", 163 | value=ecscluster.cluster_name, 164 | description="Name of ECS Cluster created" 165 | ) 166 | CfnOutput( 167 | self, 168 | id="ECSRoleName", 169 | value=ecscluster_role.role_name, 170 | description="Name of ECS Role created" 171 | ) 172 | CfnOutput( 173 | self, 174 | id="ECSAnywhereRoleName", 175 | value=external_task_def_policy_document_role.role_name, 176 | description="Name of ECS Role created" 177 | ) 178 | 179 | 180 | 181 | 182 | -------------------------------------------------------------------------------- /cdk/mwaa-cdk/mwaa_cdk/mwaa_cdk_env.py: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | # SPDX-License-Identifier: MIT-0 3 | 4 | from aws_cdk import ( 5 | aws_iam as iam, 6 | aws_ec2 as ec2, 7 | aws_s3 as s3, 8 | aws_s3_deployment as s3deploy, 9 | aws_mwaa as mwaa, 10 | aws_kms as kms, 11 | Stack, 12 | CfnOutput, 13 | Tags 14 | ) 15 | from constructs import Construct 16 | 17 | class MwaaCdkStackEnv(Stack): 18 | 19 | def __init__(self, scope: Construct, id: str, vpc, mwaa_props, **kwargs) -> None: 20 | super().__init__(scope, id, **kwargs) 21 | 22 | key_suffix = 'Key' 23 | 24 | # Create MWAA S3 Bucket and upload local dags 25 | 26 | s3_tags = { 27 | 'env': f"{mwaa_props['mwaa_env']}", 28 | 'service': 'MWAA Apache AirFlow' 29 | } 30 | 31 | dags_bucket = s3.Bucket( 32 | self, 33 | "mwaa-dags", 34 | bucket_name=f"{mwaa_props['dagss3location'].lower()}", 35 | versioned=True, 36 | block_public_access=s3.BlockPublicAccess.BLOCK_ALL 37 | ) 38 | 39 | for tag in s3_tags: 40 | Tags.of(dags_bucket).add(tag, s3_tags[tag]) 41 | 42 | s3deploy.BucketDeployment(self, "DeployDAG", 43 | sources=[s3deploy.Source.asset("./dags")], 44 | destination_bucket=dags_bucket, 45 | destination_key_prefix="dags", 46 | prune=False, 47 | retain_on_delete=False 48 | ) 49 | 50 | dags_bucket_arn = dags_bucket.bucket_arn 51 | 52 | # Create MWAA IAM Policies and Roles, copied from MWAA documentation site 53 | # After destroy remove cloudwatch log groups, S3 bucket and verify KMS key is removed. 
54 | 55 | mwaa_policy_document = iam.PolicyDocument( 56 | statements=[ 57 | iam.PolicyStatement( 58 | actions=["airflow:PublishMetrics"], 59 | effect=iam.Effect.ALLOW, 60 | resources=[f"arn:aws:airflow:{self.region}:{self.account}:environment/{mwaa_props['mwaa_env']}"], 61 | ), 62 | iam.PolicyStatement( 63 | actions=[ 64 | "s3:ListAllMyBuckets" 65 | ], 66 | effect=iam.Effect.DENY, 67 | resources=[ 68 | f"{dags_bucket_arn}/*", 69 | f"{dags_bucket_arn}" 70 | ], 71 | ), 72 | iam.PolicyStatement( 73 | actions=[ 74 | "s3:*" 75 | ], 76 | effect=iam.Effect.ALLOW, 77 | resources=[ 78 | f"{dags_bucket_arn}/*", 79 | f"{dags_bucket_arn}" 80 | ], 81 | ), 82 | iam.PolicyStatement( 83 | actions=[ 84 | "logs:CreateLogStream", 85 | "logs:CreateLogGroup", 86 | "logs:PutLogEvents", 87 | "logs:GetLogEvents", 88 | "logs:GetLogRecord", 89 | "logs:GetLogGroupFields", 90 | "logs:GetQueryResults", 91 | "logs:DescribeLogGroups" 92 | ], 93 | effect=iam.Effect.ALLOW, 94 | resources=[f"arn:aws:logs:{self.region}:{self.account}:log-group:airflow-{mwaa_props['mwaa_env']}-*"], 95 | ), 96 | iam.PolicyStatement( 97 | actions=[ 98 | "logs:DescribeLogGroups" 99 | ], 100 | effect=iam.Effect.ALLOW, 101 | resources=["*"], 102 | ), 103 | iam.PolicyStatement( 104 | actions=[ 105 | "sqs:ChangeMessageVisibility", 106 | "sqs:DeleteMessage", 107 | "sqs:GetQueueAttributes", 108 | "sqs:GetQueueUrl", 109 | "sqs:ReceiveMessage", 110 | "sqs:SendMessage" 111 | ], 112 | effect=iam.Effect.ALLOW, 113 | resources=[f"arn:aws:sqs:{self.region}:*:airflow-celery-*"], 114 | ), 115 | iam.PolicyStatement( 116 | actions=[ 117 | "ecs:RunTask", 118 | "ecs:DescribeTasks", 119 | "ecs:RegisterTaskDefinition", 120 | "ecs:DescribeTaskDefinition", 121 | "ecs:ListTasks" 122 | ], 123 | effect=iam.Effect.ALLOW, 124 | resources=[ 125 | "*" 126 | ], 127 | ), 128 | iam.PolicyStatement( 129 | actions=[ 130 | "iam:PassRole" 131 | ], 132 | effect=iam.Effect.ALLOW, 133 | resources=[ "*" ], 134 | conditions= { "StringLike": { "iam:PassedToService": "ecs-tasks.amazonaws.com" } }, 135 | ), 136 | iam.PolicyStatement( 137 | actions=[ 138 | "kms:Decrypt", 139 | "kms:DescribeKey", 140 | "kms:GenerateDataKey*", 141 | "kms:Encrypt", 142 | "kms:PutKeyPolicy" 143 | ], 144 | effect=iam.Effect.ALLOW, 145 | resources=["*"], 146 | conditions={ 147 | "StringEquals": { 148 | "kms:ViaService": [ 149 | f"sqs.{self.region}.amazonaws.com", 150 | f"s3.{self.region}.amazonaws.com", 151 | ] 152 | } 153 | }, 154 | ), 155 | ] 156 | ) 157 | 158 | mwaa_service_role = iam.Role( 159 | self, 160 | "mwaa-service-role", 161 | assumed_by=iam.CompositePrincipal( 162 | iam.ServicePrincipal("airflow.amazonaws.com"), 163 | iam.ServicePrincipal("airflow-env.amazonaws.com"), 164 | iam.ServicePrincipal("ecs-tasks.amazonaws.com"), 165 | ), 166 | inline_policies={"CDKmwaaPolicyDocument": mwaa_policy_document}, 167 | path="/service-role/" 168 | ) 169 | 170 | 171 | # Create MWAA Security Group and get networking info 172 | 173 | security_group = ec2.SecurityGroup( 174 | self, 175 | id = "mwaa-sg", 176 | vpc = vpc, 177 | security_group_name = "mwaa-sg" 178 | ) 179 | 180 | security_group_id = security_group.security_group_id 181 | 182 | security_group.connections.allow_internally(ec2.Port.all_traffic(),"MWAA") 183 | 184 | subnets = [subnet.subnet_id for subnet in vpc.private_subnets] 185 | network_configuration = mwaa.CfnEnvironment.NetworkConfigurationProperty( 186 | security_group_ids=[security_group_id], 187 | subnet_ids=subnets, 188 | ) 189 | 190 | # **OPTIONAL** Configure specific MWAA settings - you can 
externalise these if you want 191 | 192 | logging_configuration = mwaa.CfnEnvironment.LoggingConfigurationProperty( 193 | dag_processing_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 194 | enabled=True, 195 | log_level="INFO" 196 | ), 197 | task_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 198 | enabled=True, 199 | log_level="INFO" 200 | ), 201 | worker_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 202 | enabled=True, 203 | log_level="INFO" 204 | ), 205 | scheduler_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 206 | enabled=True, 207 | log_level="INFO" 208 | ), 209 | webserver_logs=mwaa.CfnEnvironment.ModuleLoggingConfigurationProperty( 210 | enabled=True, 211 | log_level="INFO" 212 | ) 213 | ) 214 | 215 | options = { 216 | 'core.load_default_connections': False, 217 | 'core.load_examples': False, 218 | 'webserver.dag_default_view': 'tree', 219 | 'webserver.dag_orientation': 'TB' 220 | } 221 | 222 | tags = { 223 | 'env': f"{mwaa_props['mwaa_env']}", 224 | 'service': 'MWAA Apache AirFlow' 225 | } 226 | 227 | # **OPTIONAL** Create KMS key that MWAA will use for encryption 228 | 229 | kms_mwaa_policy_document = iam.PolicyDocument( 230 | statements=[ 231 | iam.PolicyStatement( 232 | actions=[ 233 | "kms:Create*", 234 | "kms:Describe*", 235 | "kms:Enable*", 236 | "kms:List*", 237 | "kms:Put*", 238 | "kms:Decrypt*", 239 | "kms:Update*", 240 | "kms:Revoke*", 241 | "kms:Disable*", 242 | "kms:Get*", 243 | "kms:Delete*", 244 | "kms:ScheduleKeyDeletion", 245 | "kms:GenerateDataKey*", 246 | "kms:CancelKeyDeletion" 247 | ], 248 | principals=[ 249 | iam.AccountRootPrincipal(), 250 | # Optional: 251 | # iam.ArnPrincipal(f"arn:aws:sts::{self.account}:assumed-role/AWSReservedSSO_rest_of_SSO_account"), 252 | ], 253 | resources=["*"]), 254 | iam.PolicyStatement( 255 | actions=[ 256 | "kms:Decrypt*", 257 | "kms:Describe*", 258 | "kms:GenerateDataKey*", 259 | "kms:Encrypt*", 260 | "kms:ReEncrypt*", 261 | "kms:PutKeyPolicy" 262 | ], 263 | effect=iam.Effect.ALLOW, 264 | resources=["*"], 265 | principals=[iam.ServicePrincipal("logs.amazonaws.com", region=f"{self.region}")], 266 | conditions={"ArnLike": {"kms:EncryptionContext:aws:logs:arn": f"arn:aws:logs:{self.region}:{self.account}:*"}}, 267 | ), 268 | ] 269 | ) 270 | 271 | 272 | 273 | key = kms.Key( 274 | self, 275 | f"{mwaa_props['mwaa_env']}{key_suffix}", 276 | enable_key_rotation=True, 277 | policy=kms_mwaa_policy_document 278 | ) 279 | 280 | key.add_alias(f"alias/{mwaa_props['mwaa_env']}{key_suffix}") 281 | 282 | # Create MWAA environment using all the info above 283 | 284 | managed_airflow = mwaa.CfnEnvironment( 285 | scope=self, 286 | id='airflow-test-environment', 287 | name=f"{mwaa_props['mwaa_env']}", 288 | airflow_configuration_options={'core.default_timezone': 'utc'}, 289 | airflow_version='2.0.2', 290 | dag_s3_path="dags", 291 | environment_class='mw1.small', 292 | execution_role_arn=mwaa_service_role.role_arn, 293 | kms_key=key.key_arn, 294 | logging_configuration=logging_configuration, 295 | max_workers=5, 296 | network_configuration=network_configuration, 297 | #plugins_s3_object_version=None, 298 | #plugins_s3_path=None, 299 | #requirements_s3_object_version=None, 300 | #requirements_s3_path=None, 301 | source_bucket_arn=dags_bucket_arn, 302 | webserver_access_mode='PUBLIC_ONLY', 303 | #weekly_maintenance_window_start=None 304 | ) 305 | 306 | managed_airflow.add_override('Properties.AirflowConfigurationOptions', options) 307 | managed_airflow.add_override('Properties.Tags', tags) 308 
| 309 | CfnOutput( 310 | self, 311 | id="MWAASecurityGroup", 312 | value=security_group_id, 313 | description="Security Group name used by MWAA" 314 | ) 315 | --------------------------------------------------------------------------------
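All of the DAGs in /dag are defined with schedule_interval=None, so they are triggered manually. As a rough sketch of how one of them could be triggered from outside the Airflow UI (assuming the mwaa-hybrid-demo environment name and eu-central-1 region set in cdk/mwaa-cdk/app.py, the hybrid_airflow_dag_test DAG id from dag/ecs-hybrid.py, and the third-party requests library), the MWAA CLI token API can be used along these lines:

import base64
import boto3
import requests

# Environment name and region as configured in cdk/mwaa-cdk/app.py
mwaa = boto3.client('mwaa', region_name='eu-central-1')
token = mwaa.create_cli_token(Name='mwaa-hybrid-demo')

# POST an Airflow CLI command to the environment web server CLI endpoint
resp = requests.post(
    f"https://{token['WebServerHostname']}/aws_mwaa/cli",
    headers={
        'Authorization': f"Bearer {token['CliToken']}",
        'Content-Type': 'text/plain',
    },
    data='dags trigger hybrid_airflow_dag_test',
)

# stdout/stderr come back base64 encoded in the JSON response
print(base64.b64decode(resp.json()['stdout']).decode('utf8'))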