├── .gitignore ├── src ├── dbt-project │ ├── analysis │ │ └── .gitkeep │ ├── data │ │ └── .gitkeep │ ├── macros │ │ └── .gitkeep │ ├── tests │ │ └── .gitkeep │ ├── packages.yml │ ├── profiles.yml │ ├── run-dbt.sh │ ├── models │ │ └── example │ │ │ ├── top_nations.sql │ │ │ └── top_customers.sql │ ├── README.md │ └── dbt_project.yml └── Dockerfile ├── CODE_OF_CONDUCT.md ├── README.md ├── LICENSE ├── config └── buildspec.yml ├── CONTRIBUTING.md └── cloudformation_files ├── redshift.yaml └── etl_automation.yaml /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | -------------------------------------------------------------------------------- /src/dbt-project/analysis/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/dbt-project/data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/dbt-project/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/dbt-project/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /src/dbt-project/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 0.8.4 4 | -------------------------------------------------------------------------------- /src/dbt-project/profiles.yml: -------------------------------------------------------------------------------- 1 | default: 2 | outputs: 3 | default: 4 | type: redshift 5 | threads: 1 6 | host: 7 | port: 5439 8 | user: 9 | pass: 
10 | dbname: dev 11 | schema: 12 | sslmode: require -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Automate deployment of Amazon Redshift ETL jobs with CodeBuild, AWS Batch, and DBT 4 | 5 | Code to accompany AWS blog "Automate deployment of Amazon Redshift ETL jobs with CodeBuild, AWS Batch, and DBT" 6 | 7 | ## Security 8 | 9 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 10 | 11 | ## License 12 | 13 | This library is licensed under the MIT-0 License. See the LICENSE file. 14 | 15 | -------------------------------------------------------------------------------- /src/dbt-project/run-dbt.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | date 3 | echo "This job creates a database using a seed and then runs the dbt job" 4 | echo "jobId: $AWS_BATCH_JOB_ID" 5 | echo "jobQueue: $AWS_BATCH_JQ_NAME" 6 | echo "computeEnvironment: $AWS_BATCH_CE_NAME" 7 | 8 | if [ $# -eq 0 ]; then 9 | echo "No models were specified. Executing all models" 10 | dbt run --profiles-dir . 11 | else 12 | echo "Executing only specified models" 13 | dbt run --profiles-dir . 
-m "$@"  # quoted so every model selector reaches dbt as its own, unsplit argument
14 | fi
15 |
--------------------------------------------------------------------------------
/src/dbt-project/models/example/top_nations.sql:
--------------------------------------------------------------------------------
1 | /*
2 |   Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |   SPDX-License-Identifier: MIT-0
4 | */
5 |
6 |
7 | /*
8 |   Find the 3 nations with the most active customers.
9 |   "Active" customers are the rows of the top_customers model. count(c.c_custkey)
10 |   counts only matched customers, so a nation without any (kept by the left join)
11 |   reports 0 instead of the 1 that count(*) would give for its NULL-padded row.
12 | */
13 |
14 | {{ config(materialized='table') }}
15 |
16 | select n_name, count(c.c_custkey) as active_buyers
17 | from nation n left join {{ ref('top_customers') }} c on n.n_nationkey = c.c_nationkey
18 | group by n_name
19 | order by active_buyers desc
20 | limit 3
--------------------------------------------------------------------------------
/src/dbt-project/models/example/top_customers.sql:
--------------------------------------------------------------------------------
1 | /*
2 |   Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
3 |   SPDX-License-Identifier: MIT-0
4 | */
5 |
6 |
7 | /*
8 |   Find the top 100 customers by number of orders.
9 |   count(o.o_orderkey) counts only matched orders, so a customer without orders
10 |   (kept by the left join) gets total_order = 0 rather than 1.
11 | */
12 |
13 | {{ config(materialized='table') }}
14 |
15 | select c_custkey, c_nationkey, count(o.o_orderkey) total_order
16 | from customer c left join orders o on c.c_custkey = o.o_custkey
17 | group by c_custkey, c_nationkey
18 | order by total_order desc
19 | limit 100
20 |
--------------------------------------------------------------------------------
/src/Dockerfile:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 |
4 | # Container image for the dbt project: Python 3.8 with dbt-core and the
5 | # dbt-redshift adapter, and the project baked in at /dbt-project.
6 | FROM public.ecr.aws/lambda/python:3.8
7 |
8 | # The container command (e.g. ./run-dbt.sh) is handed to `bash -l -c` as the command string.
9 | # NOTE(review): with `bash -c`, extra command words become $0, $1, ... of that string, not
10 | # arguments of the script - confirm how model names are meant to reach run-dbt.sh.
11 | ENTRYPOINT ["/bin/bash", "-l", "-c"]
12 |
13 | # Update system packages in one layer (a full update already covers the kernel package)
14 | # and drop the yum cache so it is not stored in the image.
15 | RUN yum update -y \
16 |     && yum clean all \
17 |     && rm -rf /var/cache/yum
18 |
19 | # Install DBT before copying the project so this layer is reused when only models change.
20 | RUN pip install --no-cache-dir -U pip \
21 |     && pip install --no-cache-dir dbt-core dbt-redshift
22 |
23 | # COPY (not ADD): a plain copy of local files, without ADD's archive/URL handling.
24 | COPY dbt-project /dbt-project
25 |
26 | WORKDIR /dbt-project
27 | # Make the job script executable and fetch the packages listed in packages.yml.
28 | RUN chmod +x run-dbt.sh \
29 |     && dbt deps --profiles-dir .
30 |
31 |
--------------------------------------------------------------------------------
/src/dbt-project/README.md:
--------------------------------------------------------------------------------
1 | Welcome to your new dbt project!
2 |
3 | ### Using the starter project
4 |
5 | Try running the following commands:
6 | - dbt run
7 | - dbt test
8 |
9 |
10 | ### Resources:
11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/overview)
12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers
13 | - Join the [chat](http://slack.getdbt.com/) on Slack for live discussions and support
14 | - Find [dbt events](https://events.getdbt.com) near you
15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices
16 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
3 | Permission is hereby granted, free of charge, to any person obtaining a copy of
4 | this software and associated documentation files (the "Software"), to deal in
5 | the Software without restriction, including without limitation the rights to
6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
7 | the Software, and to permit persons to whom the Software is furnished to do so.
8 |
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
15 |
16 |
--------------------------------------------------------------------------------
/config/buildspec.yml:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0
3 | # CodeBuild spec: fills profiles.yml with the Redshift connection settings, then builds and pushes the dbt image.
4 | version: 0.2
5 | env:
6 |   secrets-manager:
7 |     REDSHIFT_USER: $REDSHIFT_USER_SECRET
8 |     REDSHIFT_PASSWORD: $REDSHIFT_PASSWORD_SECRET
9 | phases:
10 |   install:
11 |     runtime-versions:
12 |       python: 3.7
13 |   pre_build:
14 |     commands:
15 |       - aws --version
16 |       - echo 'region - ' - $AWS_DEFAULT_REGION
17 |       - echo 'repository - ' $REPOSITORY_URI
18 |       - cd src/dbt-project/
19 |       - sed -i -e "s@\(host:\).*@\1 $REDSHIFT_HOST@" profiles.yml
20 |       - sed -i -e "s/\(user:\).*/\1 $REDSHIFT_USER/" profiles.yml
21 |       - sed -i -e "s/\(pass:\).*/\1 $REDSHIFT_PASSWORD/" profiles.yml
22 |       - sed -i -e "s/\(schema:\).*/\1 $SCHEMA/" profiles.yml
23 |       - grep -vE '(user|pass):' profiles.yml  # show the rendered profile in the build log without the credential lines
24 |       - cd ../
25 |       - echo Logging in to Amazon ECR
26 |       - aws ecr get-login-password --region $AWS_DEFAULT_REGION | docker login --username AWS --password-stdin ${REPOSITORY_URI%%/*}
27 |   build:
28 |     commands:
29 |       - echo Build started on `date`
30 |       - echo Building the Docker image...
31 |       - docker build -t $REPOSITORY_URI .
32 |       - docker tag $REPOSITORY_URI $REPOSITORY_URI
33 |   post_build:
34 |     commands:
35 |       - echo Build completed on `date`
36 |       - echo Push the latest Docker Image...
37 |       - docker push $REPOSITORY_URI
38 |
--------------------------------------------------------------------------------
/src/dbt-project/dbt_project.yml:
--------------------------------------------------------------------------------
1 |
2 | # Name your project! Project names should contain only lowercase characters
3 | # and underscores. A good package name should reflect your organization's
4 | # name or the intended use of these models
5 | name: 'my_new_project'
6 | version: '1.0.0'
7 |
8 | config-version: 2
9 |
10 | # This setting configures which "profile" dbt uses for this project.
11 | profile: 'default'
12 |
13 | # These configurations specify where dbt should look for different types of files.
14 | # The `model-paths` config, for example, states that models in this project can be
15 | # found in the "models/" directory. You probably won't need to change these!
16 | model-paths: ["models"]
17 | analysis-paths: ["analysis"]
18 | test-paths: ["tests"]
19 | seed-paths: ["data"]
20 | macro-paths: ["macros"]
21 |
22 | target-path: "target" # directory which will store compiled SQL files
23 | clean-targets: # directories to be removed by `dbt clean`
24 |   - "target"
25 |   - "dbt_packages" # where `dbt deps` installs packages in dbt >= 1.0
26 |   - "dbt_modules" # pre-1.0 package directory, kept so old checkouts are cleaned too
27 |
28 |
29 |
30 | # Configuring models
31 | # Full documentation: https://docs.getdbt.com/docs/configuring-models
32 |
33 | # In this example config, we tell dbt to build all models in the example/ directory
34 | # as views. These settings can be overridden in the individual model files
35 | # using the `{{ config(...) }}` macro (both example models do so with materialized='table').
36 | models:
37 |   my_new_project:
38 |     # Applies to all files under models/example/
39 |     example:
40 |       +materialized: view
41 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project.
Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *master* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 
38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | 61 | We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes. 62 | -------------------------------------------------------------------------------- /cloudformation_files/redshift.yaml: -------------------------------------------------------------------------------- 1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | # SPDX-License-Identifier: MIT-0 3 | 4 | AWSTemplateFormatVersion: 2010-09-09 5 | 6 | Description: 'Template to deploy Redshift and VPC with public and private subnet, and ingest data to Redshift' 7 | 8 | Resources: 9 | 10 | ############# VPC, Private subnet, Public subnet ################## 11 | VPC: 12 | Type: 'AWS::EC2::VPC' 13 | Properties: 14 | CidrBlock: 172.16.0.0/16 15 | InstanceTenancy: default 16 | EnableDnsSupport: true 17 | EnableDnsHostnames: false 18 | 19 | PublicSubnet: 20 | Type: 'AWS::EC2::Subnet' 21 | Properties: 22 | CidrBlock: 172.16.0.0/24 23 | VpcId: !Ref VPC 24 | AvailabilityZone: !Select 25 | - 0 26 | - Fn::GetAZs: !Ref 'AWS::Region' 27 | MapPublicIpOnLaunch: true 28 | Tags: 29 | - Key: Name 30 | Value: PublicSubnet 31 | 32 | PrivateSubnet: 33 | Type: 'AWS::EC2::Subnet' 34 | Properties: 35 | CidrBlock: 172.16.2.0/24 36 | VpcId: !Ref VPC 37 | AvailabilityZone: !Select 38 | - 0 39 | - Fn::GetAZs: !Ref 'AWS::Region' 40 | Tags: 41 | - Key: Name 42 | Value: PrivateSubnet 43 | 44 | NetworkAcl: 45 | Type: 'AWS::EC2::NetworkAcl' 46 | Properties: 47 | VpcId: !Ref VPC 48 | 49 | InboundNetworkAclEntry: 50 | Type: 'AWS::EC2::NetworkAclEntry' 51 | Properties: 52 | CidrBlock: 0.0.0.0/0 53 | Egress: 'False' 54 | Protocol: '-1' 55 | RuleAction: allow 56 | RuleNumber: '100' 57 | NetworkAclId: !Ref NetworkAcl 58 | 59 | OutboundNetworkAclEntry: 60 | Type: 'AWS::EC2::NetworkAclEntry' 61 | Properties: 62 | CidrBlock: 0.0.0.0/0 63 | Egress: 'True' 64 | Protocol: '-1' 65 | RuleAction: allow 66 | RuleNumber: '100' 67 | NetworkAclId: !Ref NetworkAcl 68 | 69 | NetworkAclPublicSubnet: 70 | Type: 'AWS::EC2::SubnetNetworkAclAssociation' 71 | DependsOn: 72 | - InboundNetworkAclEntry 73 | - OutboundNetworkAclEntry 74 | Properties: 75 | NetworkAclId: !Ref NetworkAcl 76 | SubnetId: !Ref PublicSubnet 77 | 78 | NetworkAclPrivateSubnet: 79 | Type: 'AWS::EC2::SubnetNetworkAclAssociation' 80 | DependsOn: 81 | - InboundNetworkAclEntry 82 | - OutboundNetworkAclEntry 83 | 
Properties: 84 | NetworkAclId: !Ref NetworkAcl 85 | SubnetId: !Ref PrivateSubnet 86 | 87 | InternetGateway: 88 | Type: 'AWS::EC2::InternetGateway' 89 | 90 | InternetGatewayAttachment: 91 | Type: 'AWS::EC2::VPCGatewayAttachment' 92 | Properties: 93 | VpcId: !Ref VPC 94 | InternetGatewayId: !Ref InternetGateway 95 | 96 | NatGatewayEIP: 97 | Type: AWS::EC2::EIP 98 | Properties: 99 | Domain: vpc 100 | 101 | NatGateway: 102 | Type: AWS::EC2::NatGateway 103 | Properties: 104 | AllocationId: !GetAtt NatGatewayEIP.AllocationId 105 | SubnetId: 106 | Ref: PublicSubnet 107 | 108 | PublicRouteTable: 109 | Type: 'AWS::EC2::RouteTable' 110 | Properties: 111 | VpcId: !Ref VPC 112 | Tags: 113 | - Key: Name 114 | Value: Public Route Table 115 | 116 | PublicRouteTableSubnet: 117 | Type: 'AWS::EC2::SubnetRouteTableAssociation' 118 | Properties: 119 | RouteTableId: !Ref PublicRouteTable 120 | SubnetId: !Ref PublicSubnet 121 | 122 | PrivateRouteTable: 123 | Type: 'AWS::EC2::RouteTable' 124 | Properties: 125 | VpcId: !Ref VPC 126 | Tags: 127 | - Key: Name 128 | Value: Private Route Table1 129 | 130 | PrivateRouteTableSubnet: 131 | Type: 'AWS::EC2::SubnetRouteTableAssociation' 132 | Properties: 133 | RouteTableId: !Ref PrivateRouteTable 134 | SubnetId: !Ref PrivateSubnet 135 | 136 | Route2InternetGateway: 137 | Type: 'AWS::EC2::Route' 138 | Properties: 139 | DestinationCidrBlock: 0.0.0.0/0 140 | RouteTableId: !Ref PublicRouteTable 141 | GatewayId: !Ref InternetGateway 142 | DependsOn: 143 | - InternetGatewayAttachment 144 | - PublicRouteTableSubnet 145 | 146 | Route2NatGateway: 147 | Type: 'AWS::EC2::Route' 148 | DependsOn: 149 | - PrivateRouteTableSubnet 150 | Properties: 151 | DestinationCidrBlock: 0.0.0.0/0 152 | RouteTableId: !Ref PrivateRouteTable 153 | NatGatewayId: !Ref NatGateway 154 | 155 | 156 | ######### REDSHIFT : IAM role, cluster, security group, secret manager ########## 157 | RedshiftAccessRole: 158 | Type: AWS::IAM::Role 159 | Properties: 160 | AssumeRolePolicyDocument: 
161 | Version: 2012-10-17 162 | Statement: 163 | - 164 | Effect: Allow 165 | Principal: 166 | Service: 167 | - redshift.amazonaws.com 168 | Action: 169 | - sts:AssumeRole 170 | 171 | AccessPolicy: 172 | Type: AWS::IAM::ManagedPolicy 173 | Properties: 174 | ManagedPolicyName: RedshiftDbtAccessPolicy 175 | PolicyDocument: 176 | Version: 2012-10-17 177 | Statement: 178 | - 179 | Effect: Allow 180 | Action: 181 | - s3:Get* 182 | - s3:List* 183 | Resource: 184 | - arn:aws:s3:::aws-bigdata-blog/artifacts/automate-redshift-etl-dbt/* 185 | - arn:aws:s3:::aws-bigdata-blog 186 | Roles: 187 | - !Ref RedshiftAccessRole 188 | 189 | RedshiftSubnetGroup: 190 | Type: 'AWS::Redshift::ClusterSubnetGroup' 191 | Properties: 192 | Description: Redshift in private subnet 193 | SubnetIds: 194 | - !Ref PrivateSubnet 195 | 196 | RedshiftSecurityGroup: 197 | Type: AWS::EC2::SecurityGroup 198 | Properties: 199 | GroupDescription: "Security group for Redshift" 200 | VpcId: !Ref VPC 201 | 202 | RedshiftSecret: 203 | Type: AWS::SecretsManager::Secret 204 | Properties: 205 | Name: "redshift-creds" 206 | Description: 'This is the secret for Redshift cluster' 207 | GenerateSecretString: 208 | SecretStringTemplate: '{"username": "awsuser", "dbname": "dev"}' 209 | GenerateStringKey: 'password' 210 | PasswordLength: 16 211 | ExcludePunctuation: True 212 | 213 | kmsKey: 214 | DeletionPolicy: Delete 215 | UpdateReplacePolicy: Delete 216 | Type: 'AWS::KMS::Key' 217 | Properties: 218 | Description: Key for encrypting Redshift 219 | EnableKeyRotation: true 220 | KeyPolicy: 221 | Version: 2012-10-17 222 | Statement: 223 | - Effect: Allow 224 | Principal: 225 | AWS: 226 | 'Fn::Sub': 'arn:aws:iam::${AWS::AccountId}:root' 227 | Action: 228 | - 'kms:CancelKeyDeletion' 229 | - 'kms:Create*' 230 | - 'kms:Delete*' 231 | - 'kms:Describe*' 232 | - 'kms:Disable*' 233 | - 'kms:Enable*' 234 | - 'kms:Get*' 235 | - 'kms:List*' 236 | - 'kms:Put*' 237 | - 'kms:Revoke*' 238 | - 'kms:ScheduleKeyDeletion' 239 | - 
'kms:TagResource'
240 |               - 'kms:Update*'
241 |               - 'kms:Encrypt'
242 |               - 'kms:Decrypt'
243 |               - 'kms:ReEncrypt*'
244 |               - 'kms:GenerateDataKey*'
245 |             Resource: '*'
246 |
247 |   # Single-node Redshift cluster placed in the private subnet and encrypted with kmsKey.
248 |   RedshiftCluster:
249 |     Type: 'AWS::Redshift::Cluster'
250 |     DependsOn:
251 |       - InternetGatewayAttachment
252 |       - AccessPolicy
253 |     Properties:
254 |       PubliclyAccessible: False
255 |       ClusterType: 'single-node'
256 |       NodeType: 'dc2.large'
257 |       DBName: 'dev'
258 |       MasterUsername: 'awsuser'
259 |       # Resolve the password through the RedshiftSecret resource (Ref returns its ARN) instead of
260 |       # the literal secret name, so CloudFormation creates the secret before the cluster.
261 |       MasterUserPassword: !Sub '{{resolve:secretsmanager:${RedshiftSecret}:SecretString:password}}'
262 |       Port: 5439
263 |       ClusterSubnetGroupName: !Ref RedshiftSubnetGroup
264 |       IamRoles:
265 |         - !GetAtt RedshiftAccessRole.Arn
266 |       VpcSecurityGroupIds:
267 |         - !Ref RedshiftSecurityGroup
268 |       Encrypted: True
269 |       KmsKeyId: !Ref kmsKey
270 |
271 |   # Lambda to ingest data to Redshift
272 |   # Backs the RedshiftDataIngestion custom resource: on Create/Update it recreates the nation,
273 |   # customer and orders tables and COPYs the sample data from S3. Every invocation reports
274 |   # SUCCESS or FAILED to CloudFormation, so a failed load fails the stack instead of hanging it.
275 |   LambdaIngestFunction:
276 |     Type: AWS::Lambda::Function
277 |     DependsOn:
278 |       - Route2InternetGateway
279 |       - Route2NatGateway
280 |       - NetworkAclPrivateSubnet
281 |       - NetworkAclPublicSubnet
282 |     Properties:
283 |       Code:
284 |         ZipFile: !Sub |
285 |           import psycopg2
286 |           from psycopg2 import ProgrammingError
287 |           import boto3
288 |           import json
289 |           import cfnresponse
290 |
291 |           def get_redshift_creds():
292 |               print("get credentials")
293 |               client = boto3.client('secretsmanager', region_name='${AWS::Region}')
294 |               get_secret_value_response = client.get_secret_value(SecretId = "redshift-creds")
295 |               secret = get_secret_value_response['SecretString']
296 |               db_creds = json.loads(secret)
297 |               return db_creds["username"], db_creds["password"]
298 |
299 |           def create_conn(username, password):
300 |               print("get connection")
301 |               conn = psycopg2.connect(
302 |                   dbname="dev",
303 |                   user=username,
304 |                   password=password,
305 |                   port="5439",
306 |                   host="${RedshiftCluster.Endpoint.Address}")
307 |               conn.autocommit = True
308 |               return conn
309 |
310 |           def run_query(conn, query):
311 |               cursor = conn.cursor()
312 |               cursor.execute(query)
313 |               print("executed")
314 |
315 |           def load_sample_data():
316 |               username, password = get_redshift_creds()
317 |               conn = create_conn(username, password)
318 |               run_query(conn, "DROP TABLE IF EXISTS public.orders CASCADE;")
319 |               run_query(conn, "DROP TABLE IF EXISTS public.customer CASCADE;")
320 |               run_query(conn, "DROP TABLE IF EXISTS public.nation CASCADE;")
321 |               run_query(conn, """
322 |                   CREATE TABLE public.nation (
323 |                       N_NATIONKEY bigint NOT NULL PRIMARY KEY,
324 |                       N_NAME varchar(25),
325 |                       N_REGIONKEY bigint,
326 |                       N_COMMENT varchar(152))
327 |                   diststyle all;
328 |               """)
329 |               run_query(conn, """
330 |                   create table public.customer (
331 |                       C_CUSTKEY bigint NOT NULL PRIMARY KEY,
332 |                       C_NAME varchar(25),
333 |                       C_ADDRESS varchar(40),
334 |                       C_NATIONKEY bigint REFERENCES nation(N_NATIONKEY),
335 |                       C_PHONE varchar(15),
336 |                       C_ACCTBAL decimal(18,4),
337 |                       C_MKTSEGMENT varchar(10),
338 |                       C_COMMENT varchar(117))
339 |                   diststyle all;
340 |               """)
341 |               run_query(conn, """
342 |                   create table public.orders (
343 |                       O_ORDERKEY bigint NOT NULL PRIMARY KEY,
344 |                       O_CUSTKEY bigint REFERENCES customer(C_CUSTKEY),
345 |                       O_ORDERSTATUS varchar(1),
346 |                       O_TOTALPRICE decimal(18,4),
347 |                       O_ORDERDATE Date,
348 |                       O_ORDERPRIORITY varchar(15),
349 |                       O_CLERK varchar(15),
350 |                       O_SHIPPRIORITY Integer,
351 |                       O_COMMENT varchar(79))
352 |                   distkey (O_ORDERKEY)
353 |                   sortkey (O_ORDERDATE);
354 |               """)
355 |               run_query(conn, """
356 |                   COPY public.nation FROM 's3://aws-bigdata-blog/artifacts/automate-redshift-etl-dbt/sample_data/nation/nation.tbl.'
357 |                   iam_role '${RedshiftAccessRole.Arn}'
358 |                   region 'us-east-1' lzop delimiter '|' COMPUPDATE PRESET;
359 |               """)
360 |               run_query(conn, """
361 |                   copy public.customer from 's3://aws-bigdata-blog/artifacts/automate-redshift-etl-dbt/sample_data/customer/customer.tbl.'
362 |                   iam_role '${RedshiftAccessRole.Arn}'
363 |                   region 'us-east-1' lzop delimiter '|' COMPUPDATE PRESET;
364 |               """)
365 |               run_query(conn, """
366 |                   copy public.orders from 's3://aws-bigdata-blog/artifacts/automate-redshift-etl-dbt/sample_data/orders/orders.tbl.'
367 |                   iam_role '${RedshiftAccessRole.Arn}'
368 |                   region 'us-east-1' lzop delimiter '|' COMPUPDATE PRESET;
369 |               """)
370 |
371 |           def lambda_handler(event, context):
372 |               print(event)
373 |               status = cfnresponse.SUCCESS
374 |               try:
375 |                   # Delete has nothing to undo; Create/Update load the data.
376 |                   if event["RequestType"] != "Delete":
377 |                       load_sample_data()
378 |               except Exception as e:
379 |                   print(e)
380 |                   status = cfnresponse.FAILED
381 |               cfnresponse.send(event, context, status, {}, {})
382 |
383 |       Handler: "index.lambda_handler"
384 |       Timeout: 900
385 |       Role:
386 |         Fn::GetAtt:
387 |           - LambdaIngestFunctionRole
388 |           - Arn
389 |       Runtime: python3.7
390 |       Layers:
391 |         - !Ref LambdaLayer
392 |       VpcConfig:
393 |         SubnetIds:
394 |           - !Ref PrivateSubnet
395 |         SecurityGroupIds:
396 |           - !Ref LambdaSecurityGroup
397 |
398 |   # NOTE(review): python3.7 is a deprecated Lambda runtime; moving off it presumably needs this layer rebuilt - confirm.
399 |   LambdaLayer:
400 |     Type: AWS::Lambda::LayerVersion
401 |     Properties:
402 |       CompatibleRuntimes:
403 |         - python3.7
404 |       Content:
405 |         S3Bucket: aws-bigdata-blog
406 |         S3Key: artifacts/automate-redshift-etl-dbt/python_libs.zip
407 |       Description: Package for redshift connection
408 |       LayerName: RedshiftUtil
409 |       LicenseInfo: MIT
410 |
411 |   LambdaSecurityGroup:
412 |     Type: AWS::EC2::SecurityGroup
413 |     Properties:
414 |       GroupDescription: EC2 Security Group for Lambda function
415 |       VpcId:
416 |         Ref: VPC
417 |
418 |   # Lets the ingestion Lambda's security group reach the Redshift port (5439).
419 |   LambdaToRedshiftSgRule:
420 |     Type: AWS::EC2::SecurityGroupIngress
421 |     Properties:
422 |       Description: Access from the data ingestion Lambda to Redshift
423 |       GroupId: !Ref RedshiftSecurityGroup
424 |       IpProtocol: tcp
425 |       SourceSecurityGroupId: !Ref LambdaSecurityGroup
426 |       FromPort: 5439
427 |       ToPort: 5439
428 |
429 |   LambdaIngestFunctionRole:
430 |     Type: AWS::IAM::Role
431 |     Properties:
432 |       ManagedPolicyArns:
433 |         - arn:aws:iam::aws:policy/service-role/AWSLambdaVPCAccessExecutionRole
434 |       AssumeRolePolicyDocument:
435 |         Version: 2012-10-17
436 |         Statement:
437 |           - Effect: Allow
438 |             Principal:
439 |               Service:
440 |                 - lambda.amazonaws.com
441 |             Action:
442 |               - sts:AssumeRole
443 |       Path: /
444 |
445 |   # Allows the ingestion Lambda's role to read the Redshift credentials secret.
446 |   LambdaAccessPolicy:
447 |     Type: AWS::IAM::ManagedPolicy
448 |     Properties:
449 |       ManagedPolicyName: LambdaDbtAccessPolicy
450 |       PolicyDocument:
451 |         Version: 2012-10-17
452 |         Statement:
453 |           - Effect: Allow
454 |             Action:
455 |               - "secretsmanager:GetResourcePolicy"
456 |               - "secretsmanager:GetSecretValue"
457 |               - "secretsmanager:DescribeSecret"
458 |             Resource:
459 |               - !Ref RedshiftSecret
460 |       Roles:
461 |         - !Ref LambdaIngestFunctionRole
462 |
463 |   # Custom resource that invokes the ingestion Lambda. It waits for the Lambda's secret-read policy
464 |   # and the security group rule, which CloudFormation would otherwise be free to create afterwards.
465 |   RedshiftDataIngestion:
466 |     Type: Custom::RedshiftDataIngestion
467 |     DependsOn:
468 |       - LambdaAccessPolicy
469 |       - LambdaToRedshiftSgRule
470 |     Properties:
471 |       ServiceToken: !GetAtt LambdaIngestFunction.Arn
472 |
473 | Outputs:
474 |   VPC:
475 |     Description: VPC ID
476 |     Value: !Ref VPC
477 |     Export:
478 |       Name: !Sub "${AWS::StackName}-VPC"
479 |   PrivateSubnet:
480 |     Description: Private subnet ID
481 |     Value: !Ref PrivateSubnet
482 |     Export:
483 |       Name: !Sub "${AWS::StackName}-PrivateSubnet"
484 |   PublicSubnet:
485 |     Description: Public subnet ID
486 |     Value: !Ref PublicSubnet
487 |     Export:
488 |       Name: !Sub "${AWS::StackName}-PublicSubnet"
489 |   RedshiftIamRole:
490 |     Description: ARN of the IAM role used by the Redshift cluster
491 |     Value: !GetAtt RedshiftAccessRole.Arn
492 |   RedshiftClusterName:
493 |     Description: Name of Redshift cluster
494 |     Value: !Ref RedshiftCluster
495 |   RedshiftSecurityGroup:
496 |     Description: Security group of Redshift cluster
497 |     Value: !Ref RedshiftSecurityGroup
498 |     Export:
499 |       Name: !Sub "${AWS::StackName}-RedshiftSecurityGroup"
500 |   RedshiftClusterEndpoint:
501 |     Description: Redshift cluster endpoint
502 |     Value: !Sub "${RedshiftCluster.Endpoint.Address}"
503 |     Export:
504 |       Name: !Sub "${AWS::StackName}-RedshiftClusterEndpoint"
505 |
506 |
--------------------------------------------------------------------------------
/cloudformation_files/etl_automation.yaml:
--------------------------------------------------------------------------------
1 | # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | # SPDX-License-Identifier: MIT-0 3 | 4 | --- 5 | AWSTemplateFormatVersion: '2010-09-09' 6 | Description: 'Template to deploy Redshift ETL automation with DBT' 7 | 8 | Parameters: 9 | GithubRepoUrl: 10 | Type: String 11 | Default: '' 12 | Description: Enter the github repository url of your DBT project e.g. https://github.com/myuser/dbt-batch.git 13 | GithubToken: 14 | Type: String 15 | Default: '' 16 | NoEcho: true 17 | Description: Enter your github token 18 | MonitoringEmail: 19 | Type: String 20 | Default: '' 21 | Description: Email address that will receive monitoring alerts 22 | JobFrequency: 23 | Type: String 24 | Default: 0 4 * * ? * 25 | Description: Frequency of the DBT job in CRON format. Time will be in UTC timezone. For example, "0 4 * * ? *" (without quotes) will run the job everyday at 4:00 UTC. 26 | RedshiftStackName: 27 | Type: String 28 | Description: Name of cloudformation stack where you deployed Redshift cluster 29 | RedshiftSchema: 30 | Type: String 31 | Default: 'public' 32 | Description: Name of Redshift schema with your data 33 | GithubType: 34 | Type: String 35 | Default: 'GITHUB' 36 | AllowedValues: 37 | - GITHUB 38 | - GITHUB_ENTERPRISE 39 | Description: Whether you are using regular GitHub (i.e. github.com) or GitHub Enterprise from your company. 
40 | 41 | 42 | Resources: 43 | 44 | kmsKey: 45 | DeletionPolicy: Delete 46 | UpdateReplacePolicy: Delete 47 | Type: 'AWS::KMS::Key' 48 | Properties: 49 | Description: Key for encrypting resources on ETL automation 50 | EnableKeyRotation: true 51 | KeyPolicy: 52 | Version: 2012-10-17 53 | Statement: 54 | - Effect: Allow 55 | Principal: 56 | AWS: 57 | 'Fn::Sub': 'arn:aws:iam::${AWS::AccountId}:root' 58 | Action: 59 | - 'kms:CancelKeyDeletion' 60 | - 'kms:Create*' 61 | - 'kms:Delete*' 62 | - 'kms:Describe*' 63 | - 'kms:Disable*' 64 | - 'kms:Enable*' 65 | - 'kms:Get*' 66 | - 'kms:List*' 67 | - 'kms:Put*' 68 | - 'kms:Revoke*' 69 | - 'kms:ScheduleKeyDeletion' 70 | - 'kms:TagResource' 71 | - 'kms:Update*' 72 | - 'kms:Encrypt' 73 | - 'kms:Decrypt' 74 | - 'kms:ReEncrypt*' 75 | - 'kms:GenerateDataKey*' 76 | Resource: '*' 77 | 78 | kmsAlias: 79 | DeletionPolicy: Delete 80 | UpdateReplacePolicy: Delete 81 | Type: 'AWS::KMS::Alias' 82 | Properties: 83 | AliasName: !Sub 'alias/${AWS::StackName}-dbt-redshift-etl' 84 | TargetKeyId: 85 | Ref: kmsKey 86 | 87 | # AWS BATCH 88 | SecurityGroup: 89 | Type: AWS::EC2::SecurityGroup 90 | Properties: 91 | GroupDescription: EC2 Security Group for instances launched in the VPC by Batch 92 | VpcId: 93 | Fn::ImportValue: 94 | !Sub "${RedshiftStackName}-VPC" 95 | 96 | BatchToRedshiftSgRule: 97 | Type: AWS::EC2::SecurityGroupIngress 98 | Properties: 99 | Description: Access from AWS Batch to Redshift 100 | GroupId: 101 | Fn::ImportValue: 102 | !Sub "${RedshiftStackName}-RedshiftSecurityGroup" 103 | IpProtocol: tcp 104 | SourceSecurityGroupId: !Ref SecurityGroup 105 | FromPort: 5439 106 | ToPort: 5439 107 | 108 | BatchServiceRole: 109 | Type: AWS::IAM::Role 110 | Properties: 111 | AssumeRolePolicyDocument: 112 | Version: '2012-10-17' 113 | Statement: 114 | - Effect: Allow 115 | Principal: 116 | Service: batch.amazonaws.com 117 | Action: sts:AssumeRole 118 | ManagedPolicyArns: 119 | - arn:aws:iam::aws:policy/service-role/AWSBatchServiceRole 120 | 
121 | EcsInstanceRole: 122 | Type: AWS::IAM::Role 123 | Properties: 124 | AssumeRolePolicyDocument: 125 | Version: '2008-10-17' 126 | Statement: 127 | - Sid: '' 128 | Effect: Allow 129 | Principal: 130 | Service: ecs-tasks.amazonaws.com 131 | Action: sts:AssumeRole 132 | ManagedPolicyArns: 133 | - arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role 134 | - !Ref ECSAccessPolicy 135 | 136 | ECSAccessPolicy: 137 | Type: AWS::IAM::ManagedPolicy 138 | Properties: 139 | ManagedPolicyName: ECSDbtPolicy 140 | PolicyDocument: 141 | Version: 2012-10-17 142 | Statement: 143 | - Effect: Allow 144 | Action: 145 | - 'kms:Decrypt' 146 | - 'kms:GenerateDataKey*' 147 | Resource: !GetAtt kmsKey.Arn 148 | 149 | BatchProcessingJobDefinition: 150 | Type: AWS::Batch::JobDefinition 151 | Properties: 152 | Type: container 153 | JobDefinitionName: 'BatchJobDefinition' 154 | PlatformCapabilities: 155 | - FARGATE 156 | ContainerProperties: 157 | Image: 158 | Fn::Join: 159 | - '' 160 | - - Ref: AWS::AccountId 161 | - .dkr.ecr. 
162 | - Ref: AWS::Region 163 | - ".amazonaws.com/dbt-batch-processing-job-repository:latest" 164 | ResourceRequirements: 165 | - Type: MEMORY 166 | Value: 1024 167 | - Type: VCPU 168 | Value: 0.5 169 | JobRoleArn: !GetAtt EcsInstanceRole.Arn 170 | ExecutionRoleArn: !GetAtt EcsInstanceRole.Arn 171 | Command: 172 | - ./run-dbt.sh 173 | NetworkConfiguration: 174 | AssignPublicIp: ENABLED 175 | RetryStrategy: 176 | Attempts: 1 177 | 178 | BatchProcessingJobQueue: 179 | Type: AWS::Batch::JobQueue 180 | Properties: 181 | JobQueueName: 'BatchProcessingJobQueue' 182 | Priority: 1 183 | ComputeEnvironmentOrder: 184 | - Order: 1 185 | ComputeEnvironment: 186 | Ref: ComputeEnvironment 187 | 188 | ComputeEnvironment: 189 | Type: AWS::Batch::ComputeEnvironment 190 | Properties: 191 | Type: MANAGED 192 | ComputeResources: 193 | Type: FARGATE 194 | MaxvCpus: 32 195 | Subnets: 196 | - Fn::ImportValue: 197 | !Sub "${RedshiftStackName}-PublicSubnet" 198 | SecurityGroupIds: 199 | - Ref: SecurityGroup 200 | ServiceRole: 201 | Fn::GetAtt: [ BatchServiceRole, Arn ] 202 | 203 | ### Email notification 204 | AWSBatchEventsRule: 205 | Type: AWS::Events::Rule 206 | Properties: 207 | Description: Events Rule for AWS Batch 208 | EventPattern: 209 | source: 210 | - aws.batch 211 | detail-type: 212 | - Batch Job State Change 213 | detail: 214 | status: 215 | - "FAILED" 216 | State: ENABLED 217 | Targets: 218 | - Arn: 219 | Ref: ErrorsTopic 220 | Id: cloudwatch-batch-eventrules 221 | InputTransformer: 222 | InputPathsMap: 223 | logStream: "$.detail.container.logStreamName" 224 | time: "$.time" 225 | InputTemplate: "\"Your ETL Batch job has failed at