├── .classpath
├── .gitignore
├── .project
├── CODE_OF_CONDUCT.md
├── CONTRIBUTING.md
├── LICENSE
├── README.md
├── automated-deployment
│   ├── README.md
│   ├── source-account
│   │   ├── deploy.sh
│   │   ├── parameters.json
│   │   ├── tags.json
│   │   └── template.yaml
│   └── target-account
│       ├── deploy.sh
│       ├── tags.json
│       └── template.yaml
├── pom.xml
└── src
    ├── main
    │   └── java
    │       └── com
    │           └── amazonaws
    │               └── gdcreplication
    │                   ├── lambda
    │                   │   ├── DLQImportDatabaseOrTable.java
    │                   │   ├── ExportDatabaseWithTables.java
    │                   │   ├── ExportLargeTable.java
    │                   │   ├── GDCReplicationPlanner.java
    │                   │   ├── ImportDatabaseOrTable.java
    │                   │   └── ImportLargeTable.java
    │                   └── util
    │                       ├── DBReplicationStatus.java
    │                       ├── DDBUtil.java
    │                       ├── GDCUtil.java
    │                       ├── GlueUtil.java
    │                       ├── LargeTable.java
    │                       ├── S3Util.java
    │                       ├── SNSUtil.java
    │                       ├── SQSUtil.java
    │                       ├── TableReplicationStatus.java
    │                       └── TableWithPartitions.java
    └── test
        └── resources
            ├── Glue_Replication.png
            ├── Glue_Table_Anatomy.png
            ├── SNS_Cross_Account_Permissions.txt
            ├── sample_database_schema.json
            ├── sample_ddb_policy_source_and_target_accounts.json
            ├── sample_glue_policy_source_account.json
            ├── sample_glue_policy_target_account.json
            ├── sample_large-table_message_payload.json
            ├── sample_partitions_schema.json
            ├── sample_sns_policy_source_account.json
            ├── sample_sqs_policy_source_and_target_accounts.json
            ├── sample_table-with-partitions_message_payload.json
            └── sample_table_schema.json

--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
(Eclipse JDT classpath file; its XML content was not preserved in this export.)

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
target/
.settings/
build/

--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
	<name>GDCCrossAccountReplicationUtility</name>
	<comment></comment>
	<projects></projects>
	<buildSpec>
		<buildCommand>
			<name>org.eclipse.jdt.core.javabuilder</name>
			<arguments></arguments>
		</buildCommand>
		<buildCommand>
			<name>org.eclipse.m2e.core.maven2Builder</name>
			<arguments></arguments>
		</buildCommand>
	</buildSpec>
	<natures>
		<nature>org.eclipse.jdt.core.javanature</nature>
		<nature>org.eclipse.m2e.core.maven2Nature</nature>
	</natures>
</projectDescription>

--------------------------------------------------------------------------------
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.

--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
# Contributing Guidelines

Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional documentation, we greatly value feedback and contributions from our community.

Please read through this document before submitting any issues or pull requests to ensure we have all the necessary information to effectively respond to your bug report or contribution.
## Reporting Bugs/Feature Requests

We welcome you to use the GitHub issue tracker to report bugs or suggest features.

When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:

* A reproducible test case or series of steps
* The version of our code being used
* Any modifications you've made relevant to the bug
* Anything unusual about your environment or deployment


## Contributing via Pull Requests
Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:

1. You are working against the latest source on the *master* branch.
2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
3. You open an issue to discuss any significant work - we would hate for your time to be wasted.

To send us a pull request, please:

1. Fork the repository.
2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
3. Ensure local tests pass.
4. Commit to your fork using clear commit messages.
5. Send us a pull request, answering any default questions in the pull request interface.
6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.

GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
[creating a pull request](https://help.github.com/articles/creating-a-pull-request/).


## Finding contributions to work on
Looking at the existing issues is a great way to find something to contribute to. Since our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.


## Code of Conduct
This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
opensource-codeofconduct@amazon.com with any additional questions or comments.


## Security issue notifications
If you discover a potential security issue in this project, we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue.


## Licensing

See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.

We may ask you to sign a [Contributor License Agreement (CLA)](http://en.wikipedia.org/wiki/Contributor_License_Agreement) for larger changes.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.

Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# AWS Glue Data Catalog Replication Utility

This utility replicates the AWS Glue Data Catalog from one AWS account to another. Using it, you can replicate Databases, Tables, and Partitions from one source AWS account to one or more target AWS accounts. It uses the AWS Glue APIs / AWS SDK for Java and serverless technologies such as AWS Lambda, Amazon SQS, and Amazon SNS. The architecture of this utility is shown in the following diagram.
![Alt](./src/test/resources/Glue_Replication.png)

## Automated Deployment
Follow the instructions in this [README.md](automated-deployment/README.md) to deploy this utility through CloudFormation in your AWS accounts. Otherwise, follow the guide below for a manual deployment.

## Build Instructions

1. The source code is a standard Maven project; build it using standard Maven commands, e.g. ```mvn -X clean install```, or use the options available in your IDE.
2. The above step generates a jar file, e.g. aws-glue-data-catalog-replication-utility-1.0.0.jar.

## AWS Service Requirements
This utility requires the following AWS services:
### Source Account
- 3 AWS Lambda functions
- 3 Amazon DynamoDB tables
- 2 Amazon SNS Topics
- 1 Amazon SQS Queue
- 1 Amazon S3 Bucket

### Each Target Account
- 3 AWS Lambda functions
- 2 Amazon DynamoDB tables
- 2 Amazon SQS Queues

## Lambda Functions Overview
| Class | Purpose |
|-------------------------------------------------------------- | -------------|
| [GDCReplicationPlannerLambda](./src/main/java/com/amazonaws/gdcreplication/lambda/GDCReplicationPlanner.java) | Lambda function that determines the list of databases to export. It is the driver program that initiates the replication process.|
| [ExportLambda](./src/main/java/com/amazonaws/gdcreplication/lambda/ExportDatabaseWithTables.java) | Lambda function to export databases and tables.|
| [ExportLargeTableLambda](./src/main/java/com/amazonaws/gdcreplication/lambda/ExportLargeTable.java) | Lambda function to export large tables (tables with more than 10 partitions).|
| [ImportLambda](./src/main/java/com/amazonaws/gdcreplication/lambda/ImportDatabaseOrTable.java) | Lambda function to import databases and tables.|
| [ImportLargeTableLambda](./src/main/java/com/amazonaws/gdcreplication/lambda/ImportLargeTable.java) | Lambda function to import large tables.|
| [DLQProcessorLambda](./src/main/java/com/amazonaws/gdcreplication/lambda/DLQImportDatabaseOrTable.java) | Lambda function used to process errors generated by ImportLambda.|

## Deployment Instructions - Source Account

1. Create DynamoDB tables as defined in the following table (a CLI sketch follows the table)

| Table | Purpose | Schema | Capacity |
|-------------------|----------------| ------------ | -------------- |
| glue_database_export_task | audit data for replication planner | Partition key - db_id (String), Sort key - export_run_id (Number) | On-Demand |
| db_status | audit data for databases exported | Partition key - db_id (String), Sort key - export_run_id (Number) | On-Demand |
| table_status | audit data for tables exported | Partition key - table_id (String), Sort key - export_run_id (Number) | On-Demand |
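For reference, a minimal AWS CLI sketch for the first table follows; the region is an example, and the same pattern applies to **db_status** and **table_status** with the key names from the table above:

```bash
# Sketch: create the replication planner audit table with on-demand capacity.
aws dynamodb create-table \
  --table-name glue_database_export_task \
  --attribute-definitions \
      AttributeName=db_id,AttributeType=S \
      AttributeName=export_run_id,AttributeType=N \
  --key-schema \
      AttributeName=db_id,KeyType=HASH \
      AttributeName=export_run_id,KeyType=RANGE \
  --billing-mode PAY_PER_REQUEST \
  --region us-east-1
```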
2. Create two SNS Topics
    1. Topic 1: Name = e.g. ```ReplicationPlannerSNSTopic```
    2. Topic 2: Name = e.g. ```SchemaDistributionSNSTopic```

3. Create an S3 Bucket. It is used to save partitions for large tables (partitions > 10). This bucket must provide cross-account permissions to the IAM roles used by the **ImportLargeTable** Lambda function in the Target Account. Refer to the following AWS resources for more details:
    1. https://aws.amazon.com/premiumsupport/knowledge-center/cross-account-access-s3/
    2. https://docs.aws.amazon.com/AmazonS3/latest/dev/example-walkthroughs-managing-access-example2.html

4. Create one SQS Queue (see the CLI sketch below for steps 2 and 4)
    1. Queue Name = e.g. ```LargeTableSQSQueue```
    2. Queue Type = Standard
    3. Default Visibility Timeout = e.g. 3 minutes 15 seconds. **Note:** It must be higher than the execution timeout of the **ExportLargeTable** Lambda function
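For reference, steps 2 and 4 can also be done with the AWS CLI. A minimal sketch, assuming the example names above:

```bash
# Create the two SNS topics (names are examples).
aws sns create-topic --name ReplicationPlannerSNSTopic --region us-east-1
aws sns create-topic --name SchemaDistributionSNSTopic --region us-east-1

# Create the standard SQS queue. 195 seconds = 3 minutes 15 seconds, which
# must exceed the ExportLargeTable Lambda timeout.
aws sqs create-queue \
  --queue-name LargeTableSQSQueue \
  --attributes VisibilityTimeout=195 \
  --region us-east-1
```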
5. Create a Lambda execution IAM role and attach it to the Lambda functions deployed in the Source Account. This role needs multiple permissions. Refer to the following IAM policies for the required permissions:
    1. You can use the AWS managed policy called **AWSLambdaExecute** (Policy ARN # arn:aws:iam::aws:policy/AWSLambdaExecute)
    2. [sample_sqs_policy_source_and_target_accounts](./src/test/resources/sample_sqs_policy_source_and_target_accounts.json)
    3. [sample_sns_policy_source_account](./src/test/resources/sample_sns_policy_source_account.json)
    4. [sample_glue_policy_source_account](./src/test/resources/sample_glue_policy_source_account.json)
    5. [sample_ddb_policy_source_and_target_accounts](./src/test/resources/sample_ddb_policy_source_and_target_accounts.json)

6. Deploy the **GDCReplicationPlannerLambda** function (a CLI sketch follows the table)
    1. Runtime = Java 8
    2. Function package = the jar file generated earlier. Refer to section [Build Instructions](#build-instructions)
    3. Lambda Handler = ```com.amazonaws.gdcreplication.lambda.GDCReplicationPlanner```
    4. Timeout = e.g. 5 minutes
    5. Memory = e.g. 128 MB
    6. Environment variables = as defined in the following table

| Variable Name | Variable Value |
|---------------------------------- |------------------------- |
| source_glue_catalog_id | Source AWS Account Id |
| ddb_name_gdc_replication_planner | Name of the DDB Table for **glue_database_export_task** of the source account |
| database_prefix_list | List of database prefixes separated by a token, **e.g. raw_data_,processed_data_**. To export all databases, do not add this variable. |
| separator | The separator used in the database_prefix_list, **e.g. ,**. This can be skipped when database_prefix_list is not added. |
| region | e.g. us-east-1 |
| sns_topic_arn_gdc_replication_planner | SNS Topic ARN for **ReplicationPlannerSNSTopic** |
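For reference, a minimal CLI sketch of this deployment follows. The role ARN and account id are placeholders, and the environment values mirror the table above:

```bash
# Sketch: deploy the planner function from the built jar.
aws lambda create-function \
  --function-name GDCReplicationPlannerLambda \
  --runtime java8 \
  --handler com.amazonaws.gdcreplication.lambda.GDCReplicationPlanner \
  --memory-size 128 \
  --timeout 300 \
  --zip-file fileb://target/aws-glue-data-catalog-replication-utility-1.0.0.jar \
  --role arn:aws:iam::<source-account-id>:role/<lambda-execution-role> \
  --environment "Variables={source_glue_catalog_id=<source-account-id>,ddb_name_gdc_replication_planner=glue_database_export_task,region=us-east-1,sns_topic_arn_gdc_replication_planner=arn:aws:sns:us-east-1:<source-account-id>:ReplicationPlannerSNSTopic}" \
  --region us-east-1
```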
7. Deploy the **ExportLambda** function
    1. Runtime = Java 8
    2. Function package = the jar file generated earlier. Refer to section [Build Instructions](#build-instructions)
    3. Lambda Handler = ```com.amazonaws.gdcreplication.lambda.ExportDatabaseWithTables```
    4. Timeout = e.g. 5 minutes
    5. Memory = e.g. 192 MB
    6. Environment variables = as defined in the following table

| Variable Name | Variable Value |
|---------------------------------- |------------------------- |
| source_glue_catalog_id | Source AWS Account Id |
| ddb_name_db_export_status | Name of the DDB Table for **db_status** of the source account |
| ddb_name_table_export_status | Name of the DDB Table for **table_status** of the source account |
| region | e.g. us-east-1 |
| sns_topic_arn_export_dbs_tables | SNS Topic ARN for **SchemaDistributionSNSTopic** |
| sqs_queue_url_large_tables | SQS Queue URL for **LargeTableSQSQueue** |

8. Add **ReplicationPlannerSNSTopic** as a trigger to the **ExportLambda** function

9. Deploy the **ExportLargeTableLambda** function
    1. Runtime = Java 8
    2. Function package = the jar file generated earlier. Refer to section [Build Instructions](#build-instructions)
    3. Lambda Handler = ```com.amazonaws.gdcreplication.lambda.ExportLargeTable```
    4. Timeout = e.g. 3 minutes
    5. Memory = e.g. 256 MB
    6. Environment variables = as defined in the following table

| Variable Name | Variable Value |
|---------------------------------- |------------------------- |
| s3_bucket_name | Name of the S3 Bucket used to save partitions for large tables |
| ddb_name_table_export_status | Name of the DDB Table for **table_status** of the source account |
| region | e.g. us-east-1 |
| sns_topic_arn_export_dbs_tables | SNS Topic ARN for **SchemaDistributionSNSTopic** |

10. Add **LargeTableSQSQueue** as a trigger to the **ExportLargeTableLambda** function
    1. Batch size = 1

11. Cross-account permissions in the Source Account. Grant permissions to the Target Account to subscribe to the second SNS Topic:

```
aws sns add-permission --label lambda-access --aws-account-id TargetAccount \
--topic-arn arn:aws:sns:us-east-1:SourceAccount:SchemaDistributionSNSTopic \
--action-name Subscribe ListSubscriptionsByTopic Receive
```

## Deployment Instructions - Target Account

1. Create DynamoDB tables as defined in the following table

| Table | Purpose | Schema | Capacity |
|-------------------|----------------| ------------ | -------------- |
| db_status | audit data for databases imported | Partition key - db_id (String), Sort key - import_run_id (Number) | On-Demand |
| table_status | audit data for tables imported | Partition key - table_id (String), Sort key - import_run_id (Number) | On-Demand |

2. Create an SQS Queue
    1. Queue Name = ```LargeTableSQSQueue```
    2. Queue Type = Standard
    3. Default Visibility Timeout = e.g. 3 minutes 15 seconds. **Note:** It must be higher than the execution timeout of the **ImportLargeTable** Lambda function

3. Create an SQS Queue for dead-letter processing
    1. Queue Name = ```DeadLetterQueue```
    2. Queue Type = Standard
    3. Default Visibility Timeout = e.g. 3 minutes 15 seconds

4. Create a Lambda execution IAM role and attach it to the Lambda functions deployed in the Target Account. This role needs multiple permissions. Refer to the following IAM policies for the required permissions:
    1. You can use the AWS managed policy called **AWSLambdaExecute** (Policy ARN # arn:aws:iam::aws:policy/AWSLambdaExecute)
    2. [sample_sqs_policy_source_and_target_accounts](./src/test/resources/sample_sqs_policy_source_and_target_accounts.json)
    3. [sample_glue_policy_target_account](./src/test/resources/sample_glue_policy_target_account.json)
    4. [sample_ddb_policy_source_and_target_accounts](./src/test/resources/sample_ddb_policy_source_and_target_accounts.json)

5. Deploy the **ImportLambda** function
    1. Runtime = Java 8
    2. Function package = the jar file generated earlier. Refer to section [Build Instructions](#build-instructions)
    3. Lambda Handler = ```com.amazonaws.gdcreplication.lambda.ImportDatabaseOrTable```
    4. Timeout = e.g. 5 minutes
    5. Memory = e.g. 192 MB
    6. Environment variables = as defined in the following table

| Variable Name | Variable Value |
|---------------------------------- |------------------------- |
| target_glue_catalog_id | Target AWS Account Id |
| ddb_name_db_import_status | Name of the DDB Table for **db_status** of the target account |
| ddb_name_table_import_status | Name of the DDB Table for **table_status** of the target account |
| skip_archive | true |
| region | e.g. us-east-1 |
| sqs_queue_url_large_tables | SQS Queue URL for **LargeTableSQSQueue** |
| dlq_url_sqs | SQS Queue URL for **DeadLetterQueue** |

6. Give **SchemaDistributionSNSTopic** permissions to invoke the Lambda function:

```
aws lambda add-permission --function-name ImportLambda \
--source-arn arn:aws:sns:us-east-1:SourceAccount:SchemaDistributionSNSTopic \
--statement-id sns-x-account --action "lambda:InvokeFunction" \
--principal sns.amazonaws.com
```

7. Subscribe the **ImportLambda** function to **SchemaDistributionSNSTopic**:

```
aws sns subscribe --protocol lambda \
--topic-arn arn:aws:sns:us-east-1:SourceAccount:SchemaDistributionSNSTopic \
--notification-endpoint arn:aws:lambda:us-east-1:TargetAccount:function:ImportLambda
```
Additional References:
- https://docs.aws.amazon.com/lambda/latest/dg/with-sns-example.html#with-sns-create-x-account-permissions

8. Deploy the **ImportLargeTableLambda** function
    1. Runtime = Java 8
    2. Function package = the jar file generated earlier. Refer to section [Build Instructions](#build-instructions)
    3. Lambda Handler = ```com.amazonaws.gdcreplication.lambda.ImportLargeTable```
    4. Timeout = e.g. 3 minutes
    5. Memory = e.g. 256 MB
    6. Environment variables = as defined in the following table

| Variable Name | Variable Value |
|---------------------------------- |---------------------- |
| target_glue_catalog_id | Target AWS Account Id |
| ddb_name_table_import_status | Name of the DDB Table for **table_status** of the target account |
| skip_archive | true |
| region | e.g. us-east-1 |

9. Add **LargeTableSQSQueue** as a trigger to the **ImportLargeTableLambda** function (see the CLI sketch below)
    1. Batch size = 1
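For reference, the trigger from step 9 can be created with the AWS CLI as sketched below. The account id is a placeholder, and the same pattern applies to the **ExportLargeTableLambda** trigger in step 10 of the source instructions:

```bash
# Sketch: map the queue to the import function with a batch size of 1.
aws lambda create-event-source-mapping \
  --function-name ImportLargeTableLambda \
  --event-source-arn arn:aws:sqs:us-east-1:<target-account-id>:LargeTableSQSQueue \
  --batch-size 1 \
  --region us-east-1
```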
10. Deploy the **DLQProcessorLambda** function
    1. Runtime = Java 8
    2. Function package = the jar file generated earlier. Refer to section [Build Instructions](#build-instructions)
    3. Lambda Handler = ```com.amazonaws.gdcreplication.lambda.DLQImportDatabaseOrTable```
    4. Timeout = e.g. 3 minutes
    5. Memory = e.g. 192 MB
    6. Environment variables = as defined in the following table

| Variable Name | Variable Value |
|---------------------------------- |------------------------- |
| target_glue_catalog_id | Target AWS Account Id |
| ddb_name_db_import_status | Name of the DDB Table for **db_status** of the target account |
| ddb_name_table_import_status | Name of the DDB Table for **table_status** of the target account |
| skip_archive | true |
| dlq_url_sqs | SQS Queue URL for **DeadLetterQueue** |
| region | e.g. us-east-1 |

11. Add the Dead Letter SQS Queue as a trigger to the **DLQProcessorLambda** function
    1. Batch size = 1

## Advantages
This solution was designed around three main tenets: simplicity, scalability, and cost-effectiveness.
The following are direct benefits:

1. Target AWS accounts are independent, allowing the solution to scale efficiently.
2. The target accounts always see the latest table information.
3. Lightweight and dependable at scale.
4. The implementation is fully customizable.

## Limitations
The primary limitations are:
1. This utility is NOT intended for real-time replication. Refer to section [Use Case 2 - Ongoing replication](#use-case-2-ongoing-replication) for how to run the replication process as a scheduled job.
2. This utility is NOT intended for two-way replication between AWS accounts.
3. This utility does NOT attempt to resolve database and table name conflicts, which may result in undesirable behavior.

## Applicable Use Cases
### Use Case 1: One-time replication
To do this, you can run the **GDCReplicationPlannerLambda** function using a test event in the AWS Lambda console.

### Use Case 2: Ongoing replication
To do this, you can create a CloudWatch Event Rule in the Source Account and add **GDCReplicationPlannerLambda** as its target (see the sketch after the following references).
Refer to the following AWS documentation for more details:
1. [Schedule Expressions for Rules](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/ScheduledEvents.html)
2. [Tutorial: Schedule AWS Lambda Functions Using CloudWatch Events](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/RunLambdaSchedule.html)
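A minimal CLI sketch of such a rule follows, using the daily 00:30 UTC schedule that the automated deployment defaults to; the rule name matches the one used in the CloudFormation template, and the account id is a placeholder:

```bash
# Sketch: schedule the planner function to run daily at 00:30 UTC.
aws events put-rule \
  --name glue-catalog-replication-trigger \
  --schedule-expression "cron(30 0 * * ? *)" \
  --region us-east-1

# Point the rule at the planner function.
aws events put-targets \
  --rule glue-catalog-replication-trigger \
  --targets "Id"="1","Arn"="arn:aws:lambda:us-east-1:<source-account-id>:function:GDCReplicationPlannerLambda" \
  --region us-east-1

# Allow CloudWatch Events to invoke the function.
aws lambda add-permission \
  --function-name GDCReplicationPlannerLambda \
  --statement-id events-invoke \
  --action "lambda:InvokeFunction" \
  --principal events.amazonaws.com \
  --source-arn arn:aws:events:us-east-1:<source-account-id>:rule/glue-catalog-replication-trigger \
  --region us-east-1
```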
## Replication Mechanism in Target Account
For databases and tables, the actions taken by the import Lambdas depend on the state of the Glue Data Catalog in the target account.
Those actions are summarized in the following table.

|Input Message Type | State of Target Glue Data Catalog | Action Taken in Target Glue Data Catalog |
|-------------------|-----------------------------------|------------ |
|Database | Database already exists | Skip the message |
|Database | Database does not exist | Create Database |
|Table | Table already exists | Update Table |
|Table | Table does not exist | Create Table |

For partitions, the actions are summarized in the following table (a sketch of the corresponding Glue calls follows):

|Partitions in Export | State in Target Glue Data Catalog | Action Taken in Target Account|
|-----------------------|-----------------------------------|------------ |
|Partitions DO NOT exist| Target Table has no partitions | No action taken |
|Partitions DO NOT exist| Target Table has partitions | Delete current partitions |
|Partitions exist | Target Table has no partitions | Create new partitions |
|Partitions exist | Target Table has partitions | Delete current partitions, create new partitions |
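For illustration only, these partition actions correspond to the Glue `BatchDeletePartition` and `BatchCreatePartition` API operations, which the import Lambdas call through the AWS SDK for Java. The CLI sketch below shows the same operations with placeholder values:

```bash
# Sketch: drop the current partitions of the target table, then create the
# new partitions carried in the export. Values are placeholders.
aws glue batch-delete-partition \
  --database-name <database> \
  --table-name <table> \
  --partitions-to-delete '[{"Values": ["2019", "01"]}]'

aws glue batch-create-partition \
  --database-name <database> \
  --table-name <table> \
  --partition-input-list file://new_partitions.json
```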
## License Summary
This sample code is made available under the MIT-0 license. See the LICENSE file.

--------------------------------------------------------------------------------
/automated-deployment/README.md:
--------------------------------------------------------------------------------
This guide explains how to deploy the AWS Glue data catalog utility in both the Source and Target AWS accounts.

## Prerequisites:
1. Administrator access to two AWS accounts:
    1. Source AWS account holding the Glue catalog to replicate
    2. Target AWS account where the Glue catalog will be replicated
2. The ```aws``` [CLI](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-install.html) installed and configured with a profile (```default``` is used if no profile is specified)
3. The ```jq``` [CLI](https://stedolan.github.io/jq/manual/) installed (test that it's present with ```jq --version```; otherwise run ```sudo yum install jq```)

## Deployment

### Source Account:
1. Log in to the Source AWS account console using the Admin role and select the AWS region where the Glue catalog you wish to replicate is located

2. Set up a Cloud9 Environment in the same AWS region (t2.small or higher, Linux AMI) and log into it. While these commands can be run from any machine with a Linux-flavored OS, using a Cloud9 environment ensures that required software (e.g. Git) is pre-installed and that neither your computer nor the Cloud9 instance is polluted by existing environment variables

3. Install ```jq```:
```bash
echo y | sudo yum install jq
```

4. Git clone this repository:
```bash
git clone https://github.com/aws-samples/aws-glue-data-catalog-replication-utility.git
cd ./aws-glue-data-catalog-replication-utility/automated-deployment/source-account/
```

5. Modify the ```parameters.json``` file with relevant values:
```bash
[
  {
    "ParameterKey": "pDatabasePrefixList", # List of database prefixes separated by a token. E.g. raw_data_,processed_data_. To export all databases, leave as is
    "ParameterValue": ""
  },
  {
    "ParameterKey": "pDatabasePrefixSeparator", # The separator used in the database_prefix_list. E.g. ",". To export all databases, leave as is
    "ParameterValue": "|"
  },
  {
    "ParameterKey": "pReplicationSchedule", # Cron expression to schedule and trigger Glue catalog replication. Defaults to every day at 00:30
    "ParameterValue": "cron(30 0 * * ? *)"
  }
]
```

6. After updating the parameters, run:
```bash
./deploy.sh -a <target-account-id>
```
***IMPORTANT***: The ```-a``` parameter is relative to the Target account, NOT the Source. If this is the first time you run the script, it will ask to create an S3 bucket to store CloudFormation artifacts. Type ```y``` when prompted. Following that, the entire infrastructure required to replicate the Glue catalog from the source account will be deployed

7. Navigate to S3 and locate the ```import-large-table-``` bucket. Add a cross-account bucket policy referencing the target account(s). For example:
```bash
{
    "Version": "2012-10-17",
    "Statement": [
        {
            "Effect": "Allow",
            "Principal": {
                "AWS": "arn:aws:iam::<target-account-id>:root"
            },
            "Action": [
                "s3:GetBucketLocation",
                "s3:ListBucket"
            ],
            "Resource": "arn:aws:s3:::import-large-table-b2465b90-638f-11ea-8000-0a52752701a6"
        }
    ]
}
```
You can add statements for as many target accounts as necessary. More details on how to add a bucket policy can be found here:
1. https://aws.amazon.com/premiumsupport/knowledge-center/cross-account-access-s3/
2. https://docs.aws.amazon.com/AmazonS3/latest/dev/example-walkthroughs-managing-access-example2.html

8. This utility replicates your Glue metadata catalog. However, access to the underlying data is still needed if you wish to query it. To achieve that, add a cross-account bucket policy to the bucket holding your data, allowing the target account(s) to access it.

### Target Account:
1. Log in to the Target AWS account console using the Admin role and select the AWS region where you wish to replicate the Glue catalog from the Source account

2. Set up a Cloud9 Environment in the same AWS region (t2.small or higher, Linux AMI) and log into it. While these commands can be run from any machine with a Linux-flavored OS, using a Cloud9 environment ensures that required software (e.g. Git) is pre-installed and that neither your computer nor the Cloud9 instance is polluted by existing environment variables

3. Install ```jq```:
```bash
echo y | sudo yum install jq
```

4. Git clone this repository:
```bash
git clone https://github.com/aws-samples/aws-glue-data-catalog-replication-utility.git
cd ./aws-glue-data-catalog-replication-utility/automated-deployment/target-account/
```

5. Then run:
```bash
./deploy.sh -a <source-account-id> -r <source-region>
```
***IMPORTANT***: The ```-a``` and ```-r``` parameters are relative to the Source account, NOT the Target. If this is the first time you run the script, it will ask to create an S3 bucket to store CloudFormation artifacts. Type ```y``` when prompted.
Following that, the entire infrastructure required to replicate the Glue catalog from the source account will be deployed 98 | 99 | ## Testing the replication: 100 | Back in the Source AWS account in the AWS Lambda console, you can run the GDCReplicationPlanner Lambda function using a Test event to trigger the initial replication -------------------------------------------------------------------------------- /automated-deployment/source-account/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | sflag=false 3 | nflag=false 4 | pflag=false 5 | 6 | DIRNAME=$(dirname "$0") 7 | 8 | usage () { echo " 9 | -h -- Opens up this help message 10 | -a -- Target AWS Account ID 11 | -n -- Name of the CloudFormation stack 12 | -p -- Name of the AWS profile to use 13 | -s -- Name of S3 bucket to upload artifacts to 14 | "; } 15 | options=':a:n:p:s:h' 16 | while getopts $options option 17 | do 18 | case "$option" in 19 | a ) aflag=true; TARGET_ACCOUNT=$OPTARG;; 20 | n ) nflag=true; STACK_NAME=$OPTARG;; 21 | p ) pflag=true; PROFILE=$OPTARG;; 22 | s ) sflag=true; S3_BUCKET=$OPTARG;; 23 | h ) usage; exit;; 24 | \? ) echo "Unknown option: -$OPTARG" >&2; exit 1;; 25 | : ) echo "Missing option argument for -$OPTARG" >&2; exit 1;; 26 | * ) echo "Unimplemented option: -$OPTARG" >&2; exit 1;; 27 | esac 28 | done 29 | 30 | if ! $aflag 31 | then 32 | echo "-a not specified, the target AWS account ID (12 digits) must be specified. Aborting..." >&2 33 | exit 0 34 | fi 35 | if ! $pflag 36 | then 37 | echo "-p not specified, using default..." >&2 38 | PROFILE="default" 39 | SOURCE_REGION=$(aws configure get region --profile ${PROFILE}) 40 | SOURCE_ACCOUNT=$(aws sts get-caller-identity --profile ${PROFILE} | python3 -c "import sys, json; print(json.load(sys.stdin)['Account'])") 41 | fi 42 | if ! $sflag 43 | then 44 | S3_BUCKET=glue-data-catalog-replication-$SOURCE_REGION-$SOURCE_ACCOUNT 45 | fi 46 | if ! $nflag 47 | then 48 | STACK_NAME="glue-data-catalog-replication-source" 49 | fi 50 | 51 | echo "Checking if bucket exists ..." 52 | if ! aws s3 ls $S3_BUCKET --profile $PROFILE; then 53 | echo "S3 bucket named $S3_BUCKET does not exist. Create? [Y/N]" 54 | read choice 55 | if [ $choice == "Y" ] || [ $choice == "y" ]; then 56 | aws s3 mb s3://$S3_BUCKET --profile $PROFILE 57 | else 58 | echo "Bucket does not exist. Deploy aborted." 59 | exit 1 60 | fi 61 | fi 62 | 63 | mkdir $DIRNAME/output 64 | aws cloudformation package --profile $PROFILE --template-file $DIRNAME/template.yaml --s3-bucket $S3_BUCKET --output-template-file $DIRNAME/output/packaged-template.yaml 65 | 66 | echo "Checking if stack exists ..." 67 | if ! aws cloudformation describe-stacks --profile $PROFILE --stack-name $STACK_NAME; then 68 | echo -e "Stack does not exist, creating ..." 69 | aws cloudformation create-stack \ 70 | --stack-name $STACK_NAME \ 71 | --parameters file://$DIRNAME/parameters.json \ 72 | --template-body file://$DIRNAME/output/packaged-template.yaml \ 73 | --tags file://$DIRNAME/tags.json \ 74 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" \ 75 | --profile $PROFILE 76 | 77 | echo "Waiting for stack to be created ..." 78 | aws cloudformation wait stack-create-complete --profile $PROFILE \ 79 | --stack-name $STACK_NAME 80 | else 81 | echo -e "Stack exists, attempting update ..." 
82 | 83 | set +e 84 | update_output=$( aws cloudformation update-stack \ 85 | --profile $PROFILE \ 86 | --stack-name $STACK_NAME \ 87 | --parameters file://$DIRNAME/parameters.json \ 88 | --template-body file://$DIRNAME/output/packaged-template.yaml \ 89 | --tags file://$DIRNAME/tags.json \ 90 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" 2>&1) 91 | status=$? 92 | set -e 93 | 94 | echo "$update_output" 95 | 96 | if [ $status -ne 0 ] ; then 97 | # Don't fail for no-op update 98 | if [[ $update_output == *"ValidationError"* && $update_output == *"No updates"* ]] ; then 99 | echo -e "\nFinished create/update - no updates to be performed"; 100 | exit 0; 101 | else 102 | exit $status 103 | fi 104 | fi 105 | 106 | echo "Waiting for stack update to complete ..." 107 | aws cloudformation wait stack-update-complete --profile $PROFILE \ 108 | --stack-name $STACK_NAME 109 | echo "Finished create/update successfully!" 110 | fi 111 | 112 | echo "Subscribing Target account to SNS Schema Distribution topic..." 113 | aws sns add-permission --label lambda-access --aws-account-id $TARGET_ACCOUNT \ 114 | --topic-arn arn:aws:sns:$SOURCE_REGION:$SOURCE_ACCOUNT:SchemaDistributionSNSTopic \ 115 | --action-name Subscribe ListSubscriptionsByTopic Receive -------------------------------------------------------------------------------- /automated-deployment/source-account/parameters.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "ParameterKey": "pDatabasePrefixList", 4 | "ParameterValue": "" 5 | }, 6 | { 7 | "ParameterKey": "pDatabasePrefixSeparator", 8 | "ParameterValue": "|" 9 | }, 10 | { 11 | "ParameterKey": "pReplicationSchedule", 12 | "ParameterValue": "cron(30 0 * * ? *)" 13 | } 14 | ] -------------------------------------------------------------------------------- /automated-deployment/source-account/tags.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Key": "Project", 4 | "Value": "GlueCatalogReplication" 5 | } 6 | ] -------------------------------------------------------------------------------- /automated-deployment/source-account/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Transform: "AWS::Serverless-2016-10-31" 3 | Description: "AWS Glue Data Catalog Replication Utility - Source Account" 4 | 5 | Parameters: 6 | pDatabasePrefixList: 7 | Description: "List of database prefixes separated by a token. E.g. raw_data_,processed_data_. To export all databases, do not add this variable" 8 | Type: String 9 | Default: "" 10 | pDatabasePrefixSeparator: 11 | Description: "The separator used in the database_prefix_list. E.g. ,. This can be skipped when database_prefix_list is not added" 12 | Type: String 13 | Default: "|" 14 | pReplicationSchedule: 15 | Description: "Cron Expression to schedule and trigger Glue catalog replication" 16 | Type: String 17 | Default: "cron(30 0 * * ? 
*)" 18 | pKmsKeyARNSQS: 19 | Description: "KMS Key ARN for SQS Queue" 20 | Type: String 21 | Default: "" 22 | pKmsKeyARNSNS: 23 | Description: "KMS Key ARN for SNS Topic" 24 | Type: String 25 | Default: "" 26 | 27 | Resources: 28 | ### DynamoDB ### 29 | rGlueDatabaseExportTask: 30 | Type: "AWS::DynamoDB::Table" 31 | Properties: 32 | TableName: "glue_database_export_task" 33 | BillingMode: "PAY_PER_REQUEST" 34 | AttributeDefinitions: 35 | - AttributeName: "db_id" 36 | AttributeType: "S" 37 | - AttributeName: "export_run_id" 38 | AttributeType: "N" 39 | KeySchema: 40 | - 41 | AttributeName: "db_id" 42 | KeyType: "HASH" 43 | - 44 | AttributeName: "export_run_id" 45 | KeyType: "RANGE" 46 | 47 | rDBStatus: 48 | Type: "AWS::DynamoDB::Table" 49 | Properties: 50 | TableName: "db_status" 51 | BillingMode: "PAY_PER_REQUEST" 52 | AttributeDefinitions: 53 | - AttributeName: "db_id" 54 | AttributeType: "S" 55 | - AttributeName: "export_run_id" 56 | AttributeType: "N" 57 | KeySchema: 58 | - 59 | AttributeName: "db_id" 60 | KeyType: "HASH" 61 | - 62 | AttributeName: "export_run_id" 63 | KeyType: "RANGE" 64 | 65 | rTableStatus: 66 | Type: "AWS::DynamoDB::Table" 67 | Properties: 68 | TableName: "table_status" 69 | BillingMode: "PAY_PER_REQUEST" 70 | AttributeDefinitions: 71 | - AttributeName: "table_id" 72 | AttributeType: "S" 73 | - AttributeName: "export_run_id" 74 | AttributeType: "N" 75 | KeySchema: 76 | - 77 | AttributeName: "table_id" 78 | KeyType: "HASH" 79 | - 80 | AttributeName: "export_run_id" 81 | KeyType: "RANGE" 82 | 83 | ### SNS ### 84 | rReplicationPlannerSNSTopic: 85 | Type: AWS::SNS::Topic 86 | Properties: 87 | TopicName: "ReplicationPlannerSNSTopic" 88 | KmsMasterKeyId: !Ref pKmsKeyARNSNS 89 | Subscription: 90 | - Endpoint: !GetAtt rExportLambda.Arn 91 | Protocol: lambda 92 | 93 | rSchemaDistributionSNSTopic: 94 | Type: AWS::SNS::Topic 95 | Properties: 96 | TopicName: "SchemaDistributionSNSTopic" 97 | KmsMasterKeyId: !Ref pKmsKeyARNSNS 98 | 99 | ### S3 ### 100 | rImportLargeTableBucket: 101 | Type: "AWS::S3::Bucket" 102 | Properties: 103 | BucketName: !Join 104 | - '' 105 | - - 'import-large-table-' 106 | - !Select [2, !Split ['/', !Ref "AWS::StackId"] ] 107 | VersioningConfiguration: 108 | Status: Enabled 109 | PublicAccessBlockConfiguration: 110 | BlockPublicAcls: True 111 | BlockPublicPolicy: True 112 | IgnorePublicAcls: True 113 | RestrictPublicBuckets: True 114 | BucketEncryption: 115 | ServerSideEncryptionConfiguration: 116 | - ServerSideEncryptionByDefault: 117 | SSEAlgorithm: AES256 118 | 119 | ### SQS ### 120 | rLargeTableSQSQueue: 121 | Type: "AWS::SQS::Queue" 122 | Properties: 123 | QueueName: "LargeTableSQSQueue" 124 | VisibilityTimeout: 195 125 | KmsMasterKeyId: !Ref pKmsKeyARNSQS 126 | 127 | ### IAM ### 128 | rGlueCatalogReplicationPolicyRole: 129 | Type: "AWS::IAM::Role" 130 | Properties: 131 | AssumeRolePolicyDocument: 132 | Version: "2012-10-17" 133 | Statement: 134 | - Effect: Allow 135 | Principal: 136 | Service: lambda.amazonaws.com 137 | Action: "sts:AssumeRole" 138 | Path: "/" 139 | ManagedPolicyArns: 140 | - "arn:aws:iam::aws:policy/AWSLambdaExecute" 141 | Policies: 142 | - PolicyName: GlueCatalogReplicationPolicy 143 | PolicyDocument: 144 | Version: '2012-10-17' 145 | Statement: 146 | - Effect: Allow 147 | Action: 148 | - "glue:GetDatabase" 149 | - "glue:GetPartition" 150 | - "glue:GetTableVersion" 151 | - "glue:GetTables" 152 | - "glue:GetTableVersions" 153 | - "glue:GetPartitions" 154 | - "glue:BatchDeleteTableVersion" 155 | - "glue:BatchGetPartition" 156 | - 
"glue:GetDatabases" 157 | - "glue:GetTable" 158 | Resource: "*" 159 | - Effect: Allow 160 | Action: 161 | - "sqs:DeleteMessage" 162 | - "sqs:ListQueues" 163 | - "sqs:GetQueueUrl" 164 | - "sqs:ListDeadLetterSourceQueues" 165 | - "sqs:ChangeMessageVisibility" 166 | - "sqs:DeleteMessageBatch" 167 | - "sqs:SendMessageBatch" 168 | - "sqs:ReceiveMessage" 169 | - "sqs:SendMessage" 170 | - "sqs:GetQueueAttributes" 171 | - "sqs:ListQueueTags" 172 | Resource: 173 | - '*' 174 | - Effect: Allow 175 | Action: 176 | - "dynamodb:BatchWriteItem" 177 | - "dynamodb:PutItem" 178 | Resource: 179 | - "*" 180 | - Effect: Allow 181 | Action: 182 | - "sns:Publish" 183 | Resource: 184 | - "*" 185 | 186 | ### Lambda ### 187 | rGDCReplicationPlannerLambda: 188 | Type: "AWS::Serverless::Function" 189 | Properties: 190 | CodeUri: ../aws-glue-data-catalog-replication-utility-1.0.0.jar 191 | FunctionName: "GDCReplicationPlannerLambda" 192 | Environment: 193 | Variables: 194 | source_glue_catalog_id: !Ref 'AWS::AccountId' 195 | ddb_name_gdc_replication_planner: !Ref rGlueDatabaseExportTask 196 | database_prefix_list: !Ref pDatabasePrefixList 197 | separator: !Ref pDatabasePrefixSeparator 198 | region: !Ref 'AWS::Region' 199 | sns_topic_arn_gdc_replication_planner: !Ref rReplicationPlannerSNSTopic 200 | Handler: com.amazonaws.gdcreplication.lambda.GDCReplicationPlanner 201 | Runtime: java8 202 | Description: "Replication Planner Lambda" 203 | MemorySize: 512 204 | Timeout: 300 205 | Role: !GetAtt rGlueCatalogReplicationPolicyRole.Arn 206 | 207 | rReplicationLambdaTriggerRule: 208 | Type: "AWS::Events::Rule" 209 | Properties: 210 | Name: "glue-catalog-replication-trigger" 211 | Description: Glue catalog Replication Lambda Trigger 212 | State: ENABLED 213 | ScheduleExpression: !Ref pReplicationSchedule 214 | Targets: 215 | - Id: "glue-catalog-replication-trigger" 216 | Arn: !GetAtt rGDCReplicationPlannerLambda.Arn 217 | 218 | rPermissionEventsInvokeRoutingLambda: 219 | Type: AWS::Lambda::Permission 220 | Properties: 221 | FunctionName: !Ref rGDCReplicationPlannerLambda 222 | Action: "lambda:InvokeFunction" 223 | Principal: "events.amazonaws.com" 224 | SourceArn: !GetAtt rReplicationLambdaTriggerRule.Arn 225 | 226 | rExportLambda: 227 | Type: "AWS::Serverless::Function" 228 | Properties: 229 | CodeUri: ../aws-glue-data-catalog-replication-utility-1.0.0.jar 230 | FunctionName: "ExportLambda" 231 | Environment: 232 | Variables: 233 | source_glue_catalog_id: !Ref 'AWS::AccountId' 234 | ddb_name_db_export_status: !Ref rDBStatus 235 | ddb_name_table_export_status: !Ref rTableStatus 236 | region: !Ref 'AWS::Region' 237 | sns_topic_arn_export_dbs_tables: !Ref rSchemaDistributionSNSTopic 238 | sqs_queue_url_large_tables: !Ref rLargeTableSQSQueue 239 | Handler: com.amazonaws.gdcreplication.lambda.ExportDatabaseWithTables 240 | Runtime: java8 241 | Description: "Export Lambda" 242 | MemorySize: 512 243 | Timeout: 300 244 | Role: !GetAtt rGlueCatalogReplicationPolicyRole.Arn 245 | 246 | rExportLambdaSNSPermission: 247 | Type: AWS::Lambda::Permission 248 | Properties: 249 | Action: lambda:InvokeFunction 250 | FunctionName: !Ref rExportLambda 251 | Principal: sns.amazonaws.com 252 | SourceArn: !Sub "arn:aws:sns:${AWS::Region}:${AWS::AccountId}:ReplicationPlannerSNSTopic" 253 | 254 | rExportLargeTableLambda: 255 | Type: "AWS::Serverless::Function" 256 | Properties: 257 | CodeUri: ../aws-glue-data-catalog-replication-utility-1.0.0.jar 258 | FunctionName: "ExportLargeTableLambda" 259 | Environment: 260 | Variables: 261 | s3_bucket_name: 
!Ref rImportLargeTableBucket 262 | ddb_name_table_export_status: !Ref rTableStatus 263 | region: !Ref 'AWS::Region' 264 | sns_topic_arn_export_dbs_tables: !Ref rSchemaDistributionSNSTopic 265 | Handler: com.amazonaws.gdcreplication.lambda.ExportLargeTable 266 | Runtime: java8 267 | Description: "Export Large Table Lambda" 268 | MemorySize: 512 269 | Timeout: 180 270 | Role: !GetAtt rGlueCatalogReplicationPolicyRole.Arn 271 | 272 | rExportLargeTableLambdaSQSPermission: 273 | Type: AWS::Lambda::EventSourceMapping 274 | Properties: 275 | BatchSize: 1 276 | Enabled: True 277 | EventSourceArn: !GetAtt rLargeTableSQSQueue.Arn 278 | FunctionName: !GetAtt rExportLargeTableLambda.Arn -------------------------------------------------------------------------------- /automated-deployment/target-account/deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | aflag=false 3 | rflag=false 4 | sflag=false 5 | nflag=false 6 | pflag=false 7 | 8 | DIRNAME=$(dirname "$0") 9 | 10 | usage () { echo " 11 | -h -- Opens up this help message 12 | -a -- Source AWS account ID 13 | -r -- Source AWS account region 14 | -n -- Name of the CloudFormation stack 15 | -p -- Name of the AWS profile to use 16 | -s -- Name of S3 bucket to upload artifacts to 17 | "; } 18 | options=':a:r:n:p:s:h' 19 | while getopts $options option 20 | do 21 | case "$option" in 22 | a ) aflag=true; SOURCE_ACCOUNT=$OPTARG;; 23 | r ) rflag=true; SOURCE_REGION=$OPTARG;; 24 | n ) nflag=true; STACK_NAME=$OPTARG;; 25 | p ) pflag=true; PROFILE=$OPTARG;; 26 | s ) sflag=true; S3_BUCKET=$OPTARG;; 27 | h ) usage; exit;; 28 | \? ) echo "Unknown option: -$OPTARG" >&2; exit 1;; 29 | : ) echo "Missing option argument for -$OPTARG" >&2; exit 1;; 30 | * ) echo "Unimplemented option: -$OPTARG" >&2; exit 1;; 31 | esac 32 | done 33 | 34 | if ! $aflag 35 | then 36 | echo "-a not specified, the source AWS account ID (12 digits) must be specified. Aborting..." >&2 37 | exit 0 38 | fi 39 | if ! $rflag 40 | then 41 | echo "-r not specified, the source AWS region must be specified (e.g. eu-west-1). Aborting..." >&2 42 | exit 0 43 | fi 44 | if ! $pflag 45 | then 46 | echo "-p not specified, using default..." >&2 47 | PROFILE="default" 48 | TARGET_REGION=$(aws configure get region --profile ${PROFILE}) 49 | TARGET_ACCOUNT=$(aws sts get-caller-identity --profile ${PROFILE} | python3 -c "import sys, json; print(json.load(sys.stdin)['Account'])") 50 | fi 51 | if ! $sflag 52 | then 53 | S3_BUCKET=glue-data-catalog-replication-$TARGET_REGION-$TARGET_ACCOUNT 54 | fi 55 | if ! $nflag 56 | then 57 | STACK_NAME="glue-data-catalog-replication-target" 58 | fi 59 | 60 | echo "Checking if bucket exists ..." 61 | if ! aws s3 ls $S3_BUCKET --profile $PROFILE; then 62 | echo "S3 bucket named $S3_BUCKET does not exist. Create? [Y/N]" 63 | read choice 64 | if [ $choice == "Y" ] || [ $choice == "y" ]; then 65 | aws s3 mb s3://$S3_BUCKET --profile $PROFILE 66 | else 67 | echo "Bucket does not exist. Deploy aborted." 68 | exit 1 69 | fi 70 | fi 71 | 72 | mkdir $DIRNAME/output 73 | aws cloudformation package --profile $PROFILE --template-file $DIRNAME/template.yaml --s3-bucket $S3_BUCKET --output-template-file $DIRNAME/output/packaged-template.yaml 74 | 75 | echo "Checking if stack exists ..." 76 | if ! aws cloudformation describe-stacks --profile $PROFILE --stack-name $STACK_NAME; then 77 | echo -e "Stack does not exist, creating ..." 
78 | aws cloudformation create-stack \ 79 | --stack-name $STACK_NAME \ 80 | --template-body file://$DIRNAME/output/packaged-template.yaml \ 81 | --tags file://$DIRNAME/tags.json \ 82 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" \ 83 | --profile $PROFILE 84 | 85 | echo "Waiting for stack to be created ..." 86 | aws cloudformation wait stack-create-complete --profile $PROFILE \ 87 | --stack-name $STACK_NAME 88 | else 89 | echo -e "Stack exists, attempting update ..." 90 | 91 | set +e 92 | update_output=$( aws cloudformation update-stack \ 93 | --profile $PROFILE \ 94 | --stack-name $STACK_NAME \ 95 | --template-body file://$DIRNAME/output/packaged-template.yaml \ 96 | --tags file://$DIRNAME/tags.json \ 97 | --capabilities "CAPABILITY_NAMED_IAM" "CAPABILITY_AUTO_EXPAND" 2>&1) 98 | status=$? 99 | set -e 100 | 101 | echo "$update_output" 102 | 103 | if [ $status -ne 0 ] ; then 104 | # Don't fail for no-op update 105 | if [[ $update_output == *"ValidationError"* && $update_output == *"No updates"* ]] ; then 106 | echo -e "\nFinished create/update - no updates to be performed"; 107 | exit 0; 108 | else 109 | exit $status 110 | fi 111 | fi 112 | 113 | echo "Waiting for stack update to complete ..." 114 | aws cloudformation wait stack-update-complete --profile $PROFILE \ 115 | --stack-name $STACK_NAME 116 | echo "Finished create/update successfully!" 117 | fi 118 | 119 | echo "Subscribing Lambda to Source SNS Schema Distribution topic..." 120 | aws lambda add-permission --function-name ImportLambda \ 121 | --source-arn arn:aws:sns:$SOURCE_REGION:$SOURCE_ACCOUNT:SchemaDistributionSNSTopic \ 122 | --statement-id sns-x-account --action "lambda:InvokeFunction" \ 123 | --principal sns.amazonaws.com 124 | 125 | aws sns subscribe --region $SOURCE_REGION --protocol lambda \ 126 | --topic-arn arn:aws:sns:$SOURCE_REGION:$SOURCE_ACCOUNT:SchemaDistributionSNSTopic \ 127 | --notification-endpoint arn:aws:lambda:$TARGET_REGION:$TARGET_ACCOUNT:function:ImportLambda -------------------------------------------------------------------------------- /automated-deployment/target-account/tags.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "Key": "Project", 4 | "Value": "GlueCatalogReplication" 5 | } 6 | ] -------------------------------------------------------------------------------- /automated-deployment/target-account/template.yaml: -------------------------------------------------------------------------------- 1 | AWSTemplateFormatVersion: "2010-09-09" 2 | Transform: "AWS::Serverless-2016-10-31" 3 | Description: "AWS Glue Data Catalog Replication Utility - Target Account" 4 | 5 | Parameters: 6 | pKmsKeyARNSQS: 7 | Description: "KMS Key ARN for SQS Queue" 8 | Type: String 9 | Default: "" 10 | 11 | Resources: 12 | ### DynamoDB ### 13 | rDBStatus: 14 | Type: "AWS::DynamoDB::Table" 15 | Properties: 16 | TableName: "db_status" 17 | BillingMode: "PAY_PER_REQUEST" 18 | AttributeDefinitions: 19 | - AttributeName: "db_id" 20 | AttributeType: "S" 21 | - AttributeName: "import_run_id" 22 | AttributeType: "N" 23 | KeySchema: 24 | - 25 | AttributeName: "db_id" 26 | KeyType: "HASH" 27 | - 28 | AttributeName: "import_run_id" 29 | KeyType: "RANGE" 30 | 31 | rTableStatus: 32 | Type: "AWS::DynamoDB::Table" 33 | Properties: 34 | TableName: "table_status" 35 | BillingMode: "PAY_PER_REQUEST" 36 | AttributeDefinitions: 37 | - AttributeName: "table_id" 38 | AttributeType: "S" 39 | - AttributeName: "import_run_id" 40 | AttributeType: "N" 41 | KeySchema: 42 | - 43 
| AttributeName: "table_id" 44 | KeyType: "HASH" 45 | - 46 | AttributeName: "import_run_id" 47 | KeyType: "RANGE" 48 | 49 | ### SQS ### 50 | rLargeTableSQSQueue: 51 | Type: "AWS::SQS::Queue" 52 | Properties: 53 | QueueName: "LargeTableSQSQueue" 54 | VisibilityTimeout: 195 55 | KmsMasterKeyId: !Ref pKmsKeyARNSQS 56 | rDeadLetterQueue: 57 | Type: 'AWS::SQS::Queue' 58 | Properties: 59 | QueueName: "DeadLetterQueue" 60 | VisibilityTimeout: 195 61 | KmsMasterKeyId: !Ref pKmsKeyARNSQS 62 | 63 | ### IAM ### 64 | rGlueCatalogReplicationPolicyRole: 65 | Type: "AWS::IAM::Role" 66 | Properties: 67 | AssumeRolePolicyDocument: 68 | Version: "2012-10-17" 69 | Statement: 70 | - Effect: Allow 71 | Principal: 72 | Service: lambda.amazonaws.com 73 | Action: "sts:AssumeRole" 74 | Path: "/" 75 | ManagedPolicyArns: 76 | - "arn:aws:iam::aws:policy/AWSLambdaExecute" 77 | Policies: 78 | - PolicyName: GlueCatalogReplicationPolicy 79 | PolicyDocument: 80 | Version: '2012-10-17' 81 | Statement: 82 | - Effect: Allow 83 | Action: 84 | - "glue:SearchTables" 85 | - "glue:BatchCreatePartition" 86 | - "glue:GetDataCatalogEncryptionSettings" 87 | - "glue:GetTableVersions" 88 | - "glue:GetPartitions" 89 | - "glue:BatchDeletePartition" 90 | - "glue:DeleteTableVersion" 91 | - "glue:UpdateTable" 92 | - "glue:GetSecurityConfiguration" 93 | - "glue:GetResourcePolicy" 94 | - "glue:GetTableVersion" 95 | - "glue:CreatePartition" 96 | - "glue:UpdatePartition" 97 | - "glue:UpdateDatabase" 98 | - "glue:CreateTable" 99 | - "glue:GetTables" 100 | - "glue:BatchGetPartition" 101 | - "glue:GetSecurityConfigurations" 102 | - "glue:GetDatabases" 103 | - "glue:GetTable" 104 | - "glue:GetDatabase" 105 | - "glue:GetPartition" 106 | - "glue:CreateDatabase" 107 | - "glue:BatchDeleteTableVersion" 108 | - "glue:DeletePartition" 109 | Resource: "*" 110 | - Effect: Allow 111 | Action: 112 | - "sqs:DeleteMessage" 113 | - "sqs:ListQueues" 114 | - "sqs:GetQueueUrl" 115 | - "sqs:ListDeadLetterSourceQueues" 116 | - "sqs:ChangeMessageVisibility" 117 | - "sqs:DeleteMessageBatch" 118 | - "sqs:SendMessageBatch" 119 | - "sqs:ReceiveMessage" 120 | - "sqs:SendMessage" 121 | - "sqs:GetQueueAttributes" 122 | - "sqs:ListQueueTags" 123 | Resource: 124 | - '*' 125 | - Effect: Allow 126 | Action: 127 | - "dynamodb:BatchWriteItem" 128 | - "dynamodb:PutItem" 129 | Resource: 130 | - "*" 131 | 132 | ### Lambda ### 133 | rImportLambda: 134 | Type: "AWS::Serverless::Function" 135 | Properties: 136 | CodeUri: ../aws-glue-data-catalog-replication-utility-1.0.0.jar 137 | FunctionName: "ImportLambda" 138 | Environment: 139 | Variables: 140 | target_glue_catalog_id: !Ref 'AWS::AccountId' 141 | ddb_name_db_import_status: !Ref rDBStatus 142 | ddb_name_table_import_status: !Ref rTableStatus 143 | skip_archive: "true" 144 | region: !Ref 'AWS::Region' 145 | sqs_queue_url_large_tables: !Ref rLargeTableSQSQueue 146 | dlq_url_sqs: !Ref rDeadLetterQueue 147 | Handler: com.amazonaws.gdcreplication.lambda.ImportDatabaseOrTable 148 | Runtime: java8 149 | Description: "Import Lambda" 150 | MemorySize: 512 151 | Timeout: 300 152 | Role: !GetAtt rGlueCatalogReplicationPolicyRole.Arn 153 | 154 | rImportLargeTableLambda: 155 | Type: "AWS::Serverless::Function" 156 | Properties: 157 | CodeUri: ../aws-glue-data-catalog-replication-utility-1.0.0.jar 158 | FunctionName: "ImportLargeTableLambda" 159 | Environment: 160 | Variables: 161 | target_glue_catalog_id: !Ref 'AWS::AccountId' 162 | ddb_name_table_import_status: !Ref rTableStatus 163 | skip_archive: "true" 164 | region: !Ref 'AWS::Region' 165 | 
Handler: com.amazonaws.gdcreplication.lambda.ImportLargeTable 166 | Runtime: java8 167 | Description: "Import Large Table Lambda" 168 | MemorySize: 512 169 | Timeout: 180 170 | Role: !GetAtt rGlueCatalogReplicationPolicyRole.Arn 171 | 172 | rImportLargeTableLambdaSQSPermission: 173 | Type: AWS::Lambda::EventSourceMapping 174 | Properties: 175 | BatchSize: 1 176 | Enabled: True 177 | EventSourceArn: !GetAtt rLargeTableSQSQueue.Arn 178 | FunctionName: !GetAtt rImportLargeTableLambda.Arn 179 | 180 | rDLQProcessorLambda: 181 | Type: "AWS::Serverless::Function" 182 | Properties: 183 | CodeUri: ../aws-glue-data-catalog-replication-utility-1.0.0.jar 184 | FunctionName: "DLQProcessorLambda" 185 | Environment: 186 | Variables: 187 | target_glue_catalog_id: !Ref 'AWS::AccountId' 188 | ddb_name_db_import_status: !Ref rDBStatus 189 | ddb_name_table_import_status: !Ref rTableStatus 190 | skip_archive: "true" 191 | dlq_url_sqs: !Ref rDeadLetterQueue 192 | region: !Ref 'AWS::Region' 193 | Handler: com.amazonaws.gdcreplication.lambda.DLQImportDatabaseOrTable 194 | Runtime: java8 195 | Description: "DLQ Lambda" 196 | MemorySize: 512 197 | Timeout: 180 198 | Role: !GetAtt rGlueCatalogReplicationPolicyRole.Arn 199 | 200 | rDLQProcessorLambdaSQSPermission: 201 | Type: AWS::Lambda::EventSourceMapping 202 | Properties: 203 | BatchSize: 1 204 | Enabled: True 205 | EventSourceArn: !GetAtt rDeadLetterQueue.Arn 206 | FunctionName: !GetAtt rDLQProcessorLambda.Arn 207 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>com.amazonaws.glue.catalog.replication</groupId>
  <artifactId>aws-glue-data-catalog-replication-utility</artifactId>
  <version>1.0.0</version>
  <packaging>jar</packaging>

  <build>
    <plugins>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <version>3.7.0</version>
        <configuration>
          <source>1.8</source>
          <target>1.8</target>
        </configuration>
      </plugin>
      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>3.1.0</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <transformers>
                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                  <mainClass>com.amazonaws.gdcreplication.lambda.GDCReplicationPlanner</mainClass>
                </transformer>
              </transformers>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

  <dependencyManagement>
    <dependencies>
      <dependency>
        <groupId>com.amazonaws</groupId>
        <artifactId>aws-java-sdk-bom</artifactId>
        <version>1.11.595</version>
        <type>pom</type>
        <scope>import</scope>
      </dependency>
    </dependencies>
  </dependencyManagement>

  <dependencies>
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-java-sdk-glue</artifactId>
    </dependency>
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-java-sdk-sqs</artifactId>
    </dependency>
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-java-sdk-sns</artifactId>
    </dependency>
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-java-sdk-dynamodb</artifactId>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.13.1</version>
      <scope>test</scope>
    </dependency>
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-lambda-java-events</artifactId>
      <version>2.2.7</version>
    </dependency>
    <dependency>
      <groupId>com.amazonaws</groupId>
      <artifactId>aws-lambda-java-core</artifactId>
      <version>1.2.0</version>
    </dependency>
    <dependency>
      <groupId>com.google.guava</groupId>
      <artifactId>guava</artifactId>
      <version>32.0.0-jre</version>
    </dependency>
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.8.9</version>
    </dependency>
  </dependencies>
</project>
--------------------------------------------------------------------------------
/src/main/java/com/amazonaws/gdcreplication/lambda/DLQImportDatabaseOrTable.java:
--------------------------------------------------------------------------------
1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.lambda; 5 | 6 | import java.util.Map.Entry; 7 | import java.util.Optional; 8 | 9 | import com.amazonaws.ClientConfiguration; 10 | import com.amazonaws.gdcreplication.util.GDCUtil; 11 | import com.amazonaws.gdcreplication.util.TableWithPartitions; 12 | import com.amazonaws.regions.Regions; 13 | import com.amazonaws.services.glue.AWSGlue; 14 | import com.amazonaws.services.glue.AWSGlueClientBuilder; 15 | import com.amazonaws.services.glue.model.Database; 16 | import com.amazonaws.services.lambda.runtime.Context; 17 | import com.amazonaws.services.lambda.runtime.RequestHandler; 18 | import com.amazonaws.services.lambda.runtime.events.SQSEvent; 19 | import com.amazonaws.services.lambda.runtime.events.SQSEvent.MessageAttribute; 20 | import com.amazonaws.services.lambda.runtime.events.SQSEvent.SQSMessage; 21 | import com.amazonaws.services.sqs.AmazonSQS; 22 | import com.amazonaws.services.sqs.AmazonSQSClientBuilder; 23 | import com.google.gson.Gson; 24 | import com.google.gson.JsonSyntaxException; 25 | 26 | public class DLQImportDatabaseOrTable implements RequestHandler { 27 | 28 | @Override 29 | public String handleRequest(SQSEvent event, Context context) { 30 | 31 | String region = Optional.ofNullable(System.getenv("region")).orElse(Regions.US_EAST_1.getName()); 32 | String targetGlueCatalogId = Optional.ofNullable(System.getenv("target_glue_catalog_id")).orElse("1234567890"); 33 | boolean skipTableArchive = Boolean 34 | .parseBoolean(Optional.ofNullable(System.getenv("skip_archive")).orElse("true")); 35 | String ddbTblNameForDBStatusTracking = Optional.ofNullable(System.getenv("ddb_name_db_import_status")) 36 | .orElse("ddb_name_db_import_status"); 37 | String ddbTblNameForTableStatusTracking = Optional.ofNullable(System.getenv("ddb_name_table_import_status")) 38 | .orElse("ddb_name_table_import_status"); 39 | String sqsQueueURL = Optional.ofNullable(System.getenv("dlq_url_sqs")).orElse(""); 40 | 41 | // Print environment variables 42 | printEnvVariables(targetGlueCatalogId, skipTableArchive, ddbTblNameForDBStatusTracking, 43 | ddbTblNameForTableStatusTracking, sqsQueueURL, region); 44 | 45 | // Set client configuration 46 | ClientConfiguration cc = new ClientConfiguration(); 47 | cc.setMaxErrorRetry(10); 48 | 49 | // Create Objects for Glue and SQS 50 | AWSGlue glue = AWSGlueClientBuilder.standard().withRegion(region).withClientConfiguration(cc).build(); 51 | AmazonSQS sqs = AmazonSQSClientBuilder.standard().withRegion(region).withClientConfiguration(cc).build(); 52 | 53 | /** 54 | * Iterate and process all the messages which are part of SQSEvent 55 | */ 56 | System.out.println("Number of messages in SQS Event: " + event.getRecords().size()); 57 | for (SQSMessage msg : event.getRecords()) { 58 | String ddl = new String(msg.getBody()); 59 | String exportBatchId = ""; 60 | String sourceGlueCatalogId = ""; 61 | String schemaType = ""; 62 | boolean isTable = false; 63 | 64 | // Read Message Attributes 65 | for (Entry entry : msg.getMessageAttributes().entrySet()) { 66 | if ("ExportBatchId".equalsIgnoreCase(entry.getKey())) { 67 | exportBatchId = entry.getValue().getStringValue(); 68 | System.out.println("Export Batch Id: " + exportBatchId); 69 | } else if ("SourceGlueDataCatalogId".equalsIgnoreCase(entry.getKey())) { 70 | sourceGlueCatalogId = entry.getValue().getStringValue(); 71 | System.out.println("Source Glue Data Cagalog Id: " + sourceGlueCatalogId); 72 | } else if 
("SchemaType".equalsIgnoreCase(entry.getKey())) { 73 | schemaType = entry.getValue().getStringValue(); 74 | System.out.println("Message Schema Type " + schemaType); 75 | } 76 | } 77 | System.out.println("Schema: " + ddl); 78 | if (schemaType.equalsIgnoreCase("Table")) 79 | isTable = true; 80 | 81 | processsRecord(context, glue, sqs, sqsQueueURL, targetGlueCatalogId, ddbTblNameForDBStatusTracking, 82 | ddbTblNameForTableStatusTracking, ddl, skipTableArchive, exportBatchId, sourceGlueCatalogId, 83 | isTable); 84 | 85 | } 86 | return "Success"; 87 | } 88 | 89 | /** 90 | * Print environment variables 91 | * 92 | * @param targetGlueCatalogId 93 | * @param skipTableArchive 94 | * @param ddbTblNameForDBStatusTracking 95 | * @param ddbTblNameForTableStatusTracking 96 | * @param sqsQueueURL 97 | * @param region 98 | */ 99 | public void printEnvVariables(String targetGlueCatalogId, boolean skipTableArchive, 100 | String ddbTblNameForDBStatusTracking, String ddbTblNameForTableStatusTracking, String sqsQueueURL, 101 | String region) { 102 | System.out.println("Target Catalog Id: " + targetGlueCatalogId); 103 | System.out.println("Skip Table Archive: " + skipTableArchive); 104 | System.out.println("DynamoDB Table for DB Import Auditing: " + ddbTblNameForDBStatusTracking); 105 | System.out.println("DynamoDB Table for Table Import Auditing: " + ddbTblNameForTableStatusTracking); 106 | System.out.println("Dead Letter Queue URL: " + sqsQueueURL); 107 | System.out.println("Region: " + region); 108 | } 109 | 110 | /** 111 | * This method processes a record from SQS 112 | * 113 | * @param context 114 | * @param glue 115 | * @param glueUtil 116 | * @param ddbUtil 117 | * @param targetGlueCatalogId 118 | * @param ddbTblNameForDBStatusTracking 119 | * @param ddbTblNameForTableStatusTracking 120 | * @param message 121 | * @param skipTableArchive 122 | * @param exportBatchId 123 | * @param sourceGlueCatalogId 124 | * @param isTable 125 | */ 126 | public void processsRecord(Context context, AWSGlue glue, AmazonSQS sqs, String sqsQueueURL, 127 | String targetGlueCatalogId, String ddbTblNameForDBStatusTracking, String ddbTblNameForTableStatusTracking, 128 | String message, boolean skipTableArchive, String exportBatchId, String sourceGlueCatalogId, 129 | boolean isTable) { 130 | 131 | boolean isDatabaseType = false; 132 | boolean isTableType = false; 133 | 134 | Database db = null; 135 | TableWithPartitions table = null; 136 | Gson gson = new Gson(); 137 | 138 | if (isTable) { 139 | context.getLogger().log("The input message is of type Glue Table."); 140 | try { 141 | table = gson.fromJson(message, TableWithPartitions.class); 142 | isTableType = true; 143 | } catch (JsonSyntaxException e) { 144 | System.out.println("Cannot parse SNS message to Glue Table Type."); 145 | e.printStackTrace(); 146 | } 147 | } else { 148 | context.getLogger().log("The input message is of type Glue Database."); 149 | try { 150 | db = gson.fromJson(message, Database.class); 151 | isDatabaseType = true; 152 | } catch (JsonSyntaxException e) { 153 | System.out.println("Cannot parse SNS message to Glue Database Type."); 154 | e.printStackTrace(); 155 | } 156 | } 157 | // Execute the business logic based on the message type 158 | GDCUtil gdcUtil = new GDCUtil(); 159 | if (isDatabaseType) { 160 | gdcUtil.processDatabseSchema(glue, sqs, targetGlueCatalogId, db, message, sqsQueueURL, sourceGlueCatalogId, 161 | exportBatchId, ddbTblNameForDBStatusTracking); 162 | } else if (isTableType) { 163 | gdcUtil.processTableSchema(glue, sqs, 
targetGlueCatalogId, sourceGlueCatalogId, table, message, 164 | ddbTblNameForTableStatusTracking, sqsQueueURL, exportBatchId, skipTableArchive); 165 | } 166 | } 167 | } -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/lambda/ExportDatabaseWithTables.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.lambda; 5 | 6 | import java.util.ArrayList; 7 | import java.util.HashMap; 8 | import java.util.List; 9 | import java.util.Map; 10 | import java.util.Optional; 11 | import java.util.concurrent.atomic.AtomicInteger; 12 | 13 | import com.amazonaws.ClientConfiguration; 14 | import com.amazonaws.gdcreplication.util.LargeTable; 15 | import com.amazonaws.gdcreplication.util.DDBUtil; 16 | import com.amazonaws.gdcreplication.util.GlueUtil; 17 | import com.amazonaws.gdcreplication.util.SNSUtil; 18 | import com.amazonaws.gdcreplication.util.SQSUtil; 19 | import com.amazonaws.gdcreplication.util.TableWithPartitions; 20 | import com.amazonaws.regions.Regions; 21 | import com.amazonaws.services.dynamodbv2.model.AttributeValue; 22 | import com.amazonaws.services.dynamodbv2.model.PutRequest; 23 | import com.amazonaws.services.dynamodbv2.model.WriteRequest; 24 | import com.amazonaws.services.glue.AWSGlue; 25 | import com.amazonaws.services.glue.AWSGlueClientBuilder; 26 | import com.amazonaws.services.glue.model.Database; 27 | import com.amazonaws.services.glue.model.Partition; 28 | import com.amazonaws.services.glue.model.Table; 29 | import com.amazonaws.services.lambda.runtime.Context; 30 | import com.amazonaws.services.lambda.runtime.RequestHandler; 31 | import com.amazonaws.services.lambda.runtime.events.SNSEvent; 32 | import com.amazonaws.services.lambda.runtime.events.SNSEvent.MessageAttribute; 33 | import com.amazonaws.services.lambda.runtime.events.SNSEvent.SNSRecord; 34 | import com.amazonaws.services.sns.AmazonSNS; 35 | import com.amazonaws.services.sns.AmazonSNSClientBuilder; 36 | import com.amazonaws.services.sns.model.PublishResult; 37 | import com.amazonaws.services.sqs.AmazonSQS; 38 | import com.amazonaws.services.sqs.AmazonSQSClientBuilder; 39 | import com.google.gson.Gson; 40 | import com.google.gson.JsonSyntaxException; 41 | 42 | /** 43 | * This class has the AWS Lambda Handler method. Upon invocation, it gets an SNS Event from the source SNS 44 | * Topic and gets the message(s) from the event. 45 | * 46 | * For each message, it takes the following actions: 47 | * 1. Parse the message into a Glue Database object 48 | * 2. Check if the database exists in Glue 49 | * 3. If it exists, fetch all tables for the database 50 | * 51 | * For each table, it takes the following actions: 52 | * 1. Convert the Glue Table object to a JSON String (this is a Table DDL) 53 | * 2. Publish the Table DDL to an SNS Topic 54 | * 3. Insert a record to a DynamoDB table for status tracking 55 | * 56 | * @author Ravi Itha, Amazon Web Services, Inc.
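 * 
 * A hedged illustration of the expected input (the attribute names are taken from the handler code below; the values are hypothetical): 
 * 
 *   message_type    = "database"       -- selects parsing of the body into a Glue Database 
 *   export_batch_id = "1566048629146"  -- identifier correlating one export run 
 * 
 * The SNS message body itself is expected to be the Gson-serialized Database DDL.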
57 | * 58 | */ 59 | public class ExportDatabaseWithTables implements RequestHandler { 60 | 61 | @Override 62 | public String handleRequest(SNSEvent request, Context context) { 63 | 64 | String region = Optional.ofNullable(System.getenv("region")).orElse(Regions.US_EAST_1.getName()); 65 | String sourceGlueCatalogId = Optional.ofNullable(System.getenv("source_glue_catalog_id")).orElse("1234567890"); 66 | String topicArn = Optional.ofNullable(System.getenv("sns_topic_arn_export_dbs_tables")) 67 | .orElse("arn:aws:sns:us-east-1:1234567890:GlueExportSNSTopic"); 68 | String ddbTblNameForDBStatusTracking = Optional.ofNullable(System.getenv("ddb_name_db_export_status")) 69 | .orElse("ddb_name_db_export_status"); 70 | String ddbTblNameForTableStatusTracking = Optional.ofNullable(System.getenv("ddb_name_table_export_status")) 71 | .orElse("ddb_name_table_export_status"); 72 | String sqsQueue4LargeTables = Optional.ofNullable(System.getenv("sqs_queue_url_large_tables")).orElse(""); 73 | int partitionThreshold = 10; 74 | 75 | // Client configuration 76 | ClientConfiguration cc = new ClientConfiguration(); 77 | cc.setMaxErrorRetry(10); 78 | 79 | List snsRecods = request.getRecords(); 80 | AWSGlue glue = AWSGlueClientBuilder.standard().withRegion(region).build(); 81 | AmazonSNS sns = AmazonSNSClientBuilder.standard().withRegion(region).build(); 82 | AmazonSQS sqs = AmazonSQSClientBuilder.standard().withRegion(region).withClientConfiguration(cc).build(); 83 | 84 | printEnvVariables(sourceGlueCatalogId, topicArn, ddbTblNameForDBStatusTracking, 85 | ddbTblNameForTableStatusTracking, sqsQueue4LargeTables); 86 | System.out.printf("Number of messages in SNS Event: \n" + snsRecods.size()); 87 | processSNSEvent(snsRecods, context, glue, sns, sqs, sourceGlueCatalogId, ddbTblNameForDBStatusTracking, 88 | ddbTblNameForTableStatusTracking, topicArn, sqsQueue4LargeTables, partitionThreshold); 89 | 90 | return "Message from SNS Topic was processed successfully!"; 91 | } 92 | 93 | /** 94 | * This method prints environment variables 95 | * @param sourceGlueCatalogId 96 | * @param topicArn 97 | * @param ddbTblNameForDBStatusTracking 98 | * @param ddbTblNameForTableStatusTracking 99 | */ 100 | public static void printEnvVariables(String sourceGlueCatalogId, String topicArn, 101 | String ddbTblNameForDBStatusTracking, String ddbTblNameForTableStatusTracking, String sqsQueue4LargeTables) { 102 | System.out.println("SNS Topic Arn: " + topicArn); 103 | System.out.println("Source Catalog Id: " + sourceGlueCatalogId); 104 | System.out.println("DynamoDB Table for DB Export Auditing: " + ddbTblNameForDBStatusTracking); 105 | System.out.println("DynamoDB Table for Table Export Auditing: " + ddbTblNameForTableStatusTracking); 106 | System.out.println("SQS queue for large tables: " + sqsQueue4LargeTables); 107 | } 108 | 109 | /** 110 | * This method processes SNSEvent 111 | * @param snsRecods 112 | * @param context 113 | * @param glue 114 | * @param sns 115 | * @param sourceGlueCatalogId 116 | * @param ddbTblNameForDBStatusTracking 117 | * @param ddbTblNameForTableStatusTracking 118 | * @param topicArn 119 | */ 120 | public static void processSNSEvent(List snsRecods, Context context, AWSGlue glue, AmazonSNS sns, 121 | AmazonSQS sqs, String sourceGlueCatalogId, String ddbTblNameForDBStatusTracking, 122 | String ddbTblNameForTableStatusTracking, String topicArn, String sqsQueue4LargePartTables, 123 | int partitionThreshold) { 124 | Database db = null; 125 | Gson gson = new Gson(); 126 | DDBUtil ddbUtil = new DDBUtil(); 127 | SNSUtil 
snsUtil = new SNSUtil(); 128 | GlueUtil glueUtil = new GlueUtil(); 129 | SQSUtil sqsUtil = new SQSUtil(); 130 | long exportRunId = System.currentTimeMillis(); 131 | 132 | for (SNSRecord snsRecod : snsRecods) { 133 | 134 | List itemList = new ArrayList(); 135 | 136 | boolean isDatabaseType = false; 137 | AtomicInteger numberOfTablesExported = new AtomicInteger(); 138 | String databaseDDL = snsRecod.getSNS().getMessage(); 139 | context.getLogger().log("SNS Message Payload: " + databaseDDL); 140 | Map msgAttributeMap = snsRecod.getSNS().getMessageAttributes(); 141 | MessageAttribute msgAttrMessageType = msgAttributeMap.get("message_type"); 142 | MessageAttribute msgAttrExportBatchId = msgAttributeMap.get("export_batch_id"); 143 | 144 | context.getLogger().log("Message Attribute value: " + msgAttrMessageType.getValue()); 145 | // Convert Message to Glue Database Type 146 | try { 147 | if (msgAttrMessageType.getValue().equalsIgnoreCase("database")) { 148 | db = gson.fromJson(databaseDDL, Database.class); 149 | isDatabaseType = true; 150 | } 151 | } catch (JsonSyntaxException e) { 152 | System.out.println("Cannot parse SNS message to Glue Database Type."); 153 | e.printStackTrace(); 154 | } 155 | if (isDatabaseType) { 156 | // Check if a database exist in Glue 157 | Database database = glueUtil.getDatabaseIfExist(glue, sourceGlueCatalogId, db); 158 | if (Optional.ofNullable(database).isPresent()) { 159 | PublishResult publishDBResponse = snsUtil.publishDatabaseSchemaToSNS(sns, topicArn, databaseDDL, 160 | sourceGlueCatalogId, msgAttrExportBatchId.getValue()); 161 | if (Optional.ofNullable(publishDBResponse.getMessageId()).isPresent()) { 162 | System.out.println("Database schema published to SNS Topic. Message_Id: " 163 | + publishDBResponse.getMessageId()); 164 | ddbUtil.trackDatabaseExportStatus(ddbTblNameForDBStatusTracking, db.getName(), databaseDDL, 165 | publishDBResponse.getMessageId(), sourceGlueCatalogId, exportRunId, msgAttrExportBatchId.getValue(), true); 166 | } else { 167 | ddbUtil.trackDatabaseExportStatus(ddbTblNameForDBStatusTracking, db.getName(), databaseDDL, "", 168 | sourceGlueCatalogId, exportRunId, msgAttrExportBatchId.getValue(), false); 169 | } 170 | // Get Tables for a given Database 171 | List dbTableList = glueUtil.getTables(glue, sourceGlueCatalogId, database.getName()); 172 | for (Table table : dbTableList) { 173 | List partitionList = glueUtil.getPartitions(glue, sourceGlueCatalogId, table.getDatabaseName(), table.getName()); 174 | if(partitionList.size() <= partitionThreshold) { 175 | System.out.printf("Database: %s, Table: %s, num_partitions: %d \n", table.getDatabaseName(), table.getName(), partitionList.size()); 176 | TableWithPartitions tableWithParts = new TableWithPartitions(); 177 | tableWithParts.setPartitionList(partitionList); 178 | tableWithParts.setTable(table); 179 | 180 | // Convert Table to JSON String 181 | String tableDDL = gson.toJson(tableWithParts); 182 | 183 | // Publish a message to Amazon SNS topic. 
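					// A hedged sketch of the payload published below (field names follow the TableWithPartitions bean populated above; the table and partition bodies are abbreviated, and "raw_db"/"sales" are hypothetical names for illustration only): 
					// 
					//   { "table": { "name": "sales", "databaseName": "raw_db", ... }, 
					//     "partitionList": [ { "values": [ "2019-08-01" ], ... } ] }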
184 | PublishResult publishTableResponse = snsUtil.publishTableSchemaToSNS(sns, topicArn, table, tableDDL, 185 | sourceGlueCatalogId, msgAttrExportBatchId.getValue()); 186 | 187 | Map<String, AttributeValue> item = new HashMap<String, AttributeValue>(); 188 | item.put("table_id", new AttributeValue().withS(table.getName().concat("|").concat(table.getDatabaseName()))); 189 | item.put("export_run_id", new AttributeValue().withN(Long.valueOf(exportRunId).toString())); 190 | item.put("export_batch_id", new AttributeValue().withS(msgAttrExportBatchId.getValue())); 191 | item.put("source_glue_catalog_id", new AttributeValue().withS(sourceGlueCatalogId)); 192 | item.put("table_schema", new AttributeValue().withS(tableDDL)); 193 | item.put("is_large_table", new AttributeValue().withS(Boolean.toString(false))); 194 | 195 | if (Optional.ofNullable(publishTableResponse.getMessageId()).isPresent()) { 196 | item.put("sns_msg_id", new AttributeValue().withS(publishTableResponse.getMessageId())); 197 | item.put("is_exported", new AttributeValue().withS(Boolean.toString(true))); 198 | numberOfTablesExported.getAndIncrement(); 199 | } else { 200 | item.put("sns_msg_id", new AttributeValue().withS("")); 201 | item.put("is_exported", new AttributeValue().withS(Boolean.toString(false))); 202 | } 203 | 204 | itemList.add(new WriteRequest().withPutRequest(new PutRequest().withItem(item))); 205 | } else { 206 | LargeTable largeTable = new LargeTable(); 207 | largeTable.setTable(table); 208 | largeTable.setLargeTable(true); 209 | largeTable.setNumberOfPartitions(partitionList.size()); 210 | largeTable.setCatalogId(sourceGlueCatalogId); 211 | 212 | System.out.printf("Database: %s, Table: %s, num_partitions: %d \n", table.getDatabaseName(), table.getName(), partitionList.size()); 213 | System.out.println("This table will be sent to the SQS Queue for further processing."); 214 | 215 | sqsUtil.sendTableSchemaToSQSQueue(sqs, sqsQueue4LargePartTables, largeTable, msgAttrExportBatchId.getValue(), sourceGlueCatalogId); 216 | } 217 | } 218 | System.out.printf("Inserting Table statistics to DynamoDB for database: %s \n", database.getName()); 219 | ddbUtil.insertIntoDynamoDB(itemList, ddbTblNameForTableStatusTracking); 220 | System.out.printf( 221 | "Table export statistics: number of tables in the Database = %d, number of tables exported to SNS = %d. \n", 222 | dbTableList.size(), numberOfTablesExported.get()); 223 | } else 224 | System.out.printf( 225 | "No Database named '%s' exists in the Glue Data Catalog. Tables cannot be retrieved. \n", 226 | db.getName()); /* 'database' is null in this branch, so the parsed 'db' is used here */ 227 | } else { 228 | System.out.println( 229 | "The message received from the SNS Topic seems to be invalid. It could not be converted to the Glue Database Type."); 230 | } 231 | 232 | } 233 | } 234 | } -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/lambda/ExportLargeTable.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.lambda; 5 | 6 | import java.text.SimpleDateFormat; 7 | import java.util.Date; 8 | import java.util.List; 9 | import java.util.Map.Entry; 10 | import java.util.Optional; 11 | import java.util.concurrent.atomic.AtomicInteger; 12 | 13 | import com.amazonaws.ClientConfiguration; 14 | import com.amazonaws.gdcreplication.util.DDBUtil; 15 | import com.amazonaws.gdcreplication.util.GlueUtil; 16 | import com.amazonaws.gdcreplication.util.LargeTable; 17 | import com.amazonaws.gdcreplication.util.S3Util; 18 | import com.amazonaws.gdcreplication.util.SNSUtil; 19 | import com.amazonaws.regions.Regions; 20 | import com.amazonaws.services.glue.AWSGlue; 21 | import com.amazonaws.services.glue.AWSGlueClientBuilder; 22 | import com.amazonaws.services.glue.model.Partition; 23 | import com.amazonaws.services.glue.model.Table; 24 | import com.amazonaws.services.lambda.runtime.Context; 25 | import com.amazonaws.services.lambda.runtime.RequestHandler; 26 | import com.amazonaws.services.lambda.runtime.events.SQSEvent; 27 | import com.amazonaws.services.lambda.runtime.events.SQSEvent.MessageAttribute; 28 | import com.amazonaws.services.lambda.runtime.events.SQSEvent.SQSMessage; 29 | import com.amazonaws.services.sns.AmazonSNS; 30 | import com.amazonaws.services.sns.AmazonSNSClientBuilder; 31 | import com.amazonaws.services.sns.model.PublishResult; 32 | import com.google.gson.Gson; 33 | 34 | public class ExportLargeTable implements RequestHandler { 35 | 36 | @Override 37 | public String handleRequest(SQSEvent event, Context context) { 38 | 39 | String region = Optional.ofNullable(System.getenv("region")).orElse(Regions.US_EAST_1.getName()); 40 | String topicArn = Optional.ofNullable(System.getenv("sns_topic_arn_export_dbs_tables")) 41 | .orElse("arn:aws:sns:us-east-1:1234567890:GlueExportSNSTopic"); 42 | String bucketName = Optional.ofNullable(System.getenv("s3_bucket_name")).orElse(""); 43 | String ddbTblNameForTableStatusTracking = Optional.ofNullable(System.getenv("ddb_name_table_export_status")) 44 | .orElse("ddb_name_table_export_status"); 45 | 46 | // Set client configuration 47 | ClientConfiguration cc = new ClientConfiguration(); 48 | cc.setMaxErrorRetry(10); 49 | 50 | // Create Objects for Glue and SQS 51 | AWSGlue glue = AWSGlueClientBuilder.standard().withRegion(region).withClientConfiguration(cc).build(); 52 | AmazonSNS sns = AmazonSNSClientBuilder.standard().withRegion(region).build(); 53 | 54 | // // Create Objects for Utility classes 55 | DDBUtil ddbUtil = new DDBUtil(); 56 | GlueUtil glueUtil = new GlueUtil(); 57 | S3Util s3Util = new S3Util(); 58 | SNSUtil snsUtil = new SNSUtil(); 59 | 60 | String objectKey = ""; 61 | LargeTable largeTable = null; 62 | boolean recordProcessed = false; 63 | boolean objectCreated = false; 64 | 65 | /** 66 | * Iterate and process all the messages which are part of SQSEvent 67 | */ 68 | System.out.println("Number of messages in SQS Event: " + event.getRecords().size()); 69 | for (SQSMessage msg : event.getRecords()) { 70 | String payLoad = new String(msg.getBody()); 71 | String exportBatchId = ""; 72 | String sourceGlueCatalogId = ""; 73 | String messageType = ""; 74 | 75 | Gson gson = new Gson(); 76 | long exportRunId = System.currentTimeMillis(); 77 | 78 | // Read Message Attributes 79 | for (Entry entry : msg.getMessageAttributes().entrySet()) { 80 | if ("ExportBatchId".equalsIgnoreCase(entry.getKey())) { 81 | exportBatchId = entry.getValue().getStringValue(); 82 | 
System.out.println("Export Batch Id: " + exportBatchId); 83 | } else if ("SourceGlueDataCatalogId".equalsIgnoreCase(entry.getKey())) { 84 | sourceGlueCatalogId = entry.getValue().getStringValue(); 85 | System.out.println("Source Glue Data Cagalog Id: " + sourceGlueCatalogId); 86 | } else if ("SchemaType".equalsIgnoreCase(entry.getKey())) { 87 | messageType = entry.getValue().getStringValue(); 88 | System.out.println("Message Type " + messageType); 89 | } 90 | } 91 | 92 | if (messageType.equalsIgnoreCase("largeTable")) { 93 | largeTable = gson.fromJson(payLoad, LargeTable.class); 94 | if (largeTable.isLargeTable()) { 95 | 96 | // Create object key 97 | SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd"); 98 | StringBuilder date = new StringBuilder(simpleDateFormat.format(new Date())); 99 | objectKey = date.append("_").append(Long.toString(System.currentTimeMillis())).append("_") 100 | .append(sourceGlueCatalogId).append("_").append(largeTable.getTable().getDatabaseName()) 101 | .append("_").append(largeTable.getTable().getName()).append(".txt").toString(); 102 | 103 | String content = getPartitionsAndCreateObjectContent(context, glue, glueUtil, sourceGlueCatalogId, largeTable, exportBatchId); 104 | objectCreated = s3Util.createS3Object(region, bucketName, objectKey, content); 105 | 106 | } 107 | PublishResult publishResponse = null; 108 | String largeTableJSON = ""; 109 | // Send S3 object key to SNS Topic 110 | if (objectCreated && !objectKey.equalsIgnoreCase("")) { 111 | largeTable.setS3ObjectKey(objectKey); 112 | largeTable.setS3BucketName(bucketName); 113 | largeTableJSON = gson.toJson(largeTable); 114 | System.out.println("Large Table JSON: " + largeTableJSON); 115 | publishResponse = snsUtil.publishLargeTableSchemaToSNS(sns, topicArn, region, bucketName, largeTableJSON, 116 | sourceGlueCatalogId, exportBatchId, messageType); 117 | if(Optional.ofNullable(publishResponse).isPresent()) { 118 | System.out.println("Large Table Schema Published to SNS Topic. Message Id: " + publishResponse.getMessageId()); 119 | recordProcessed = true; 120 | } 121 | } 122 | // track status in DDB 123 | if (Optional.ofNullable(publishResponse).isPresent()) { 124 | ddbUtil.trackTableExportStatus(ddbTblNameForTableStatusTracking, 125 | largeTable.getTable().getDatabaseName(), largeTable.getTable().getName(), largeTableJSON, 126 | publishResponse.getMessageId(), sourceGlueCatalogId, exportRunId, exportBatchId, true, true, 127 | bucketName, objectKey); 128 | } else { 129 | ddbUtil.trackTableExportStatus(ddbTblNameForTableStatusTracking, 130 | largeTable.getTable().getDatabaseName(), largeTable.getTable().getName(), largeTableJSON, 131 | publishResponse.getMessageId(), sourceGlueCatalogId, exportRunId, exportBatchId, false, true, 132 | null, null); 133 | } 134 | } 135 | } 136 | if (!recordProcessed) { 137 | System.out.printf( 138 | "Schema for table '%s' of database '%s' could not be exported. This is an exception. It will be retried again. 
\n", 139 | largeTable.getTable().getName(), largeTable.getTable().getDatabaseName()); 140 | throw new RuntimeException(); 141 | } 142 | return "Success"; 143 | } 144 | 145 | /** 146 | * This method processes a record from SQS 147 | * 148 | * @param context 149 | * @param glue 150 | * @param glueUtil 151 | * @param ddbUtil 152 | * @param targetGlueCatalogId 153 | * @param ddbTblNameForDBStatusTracking 154 | * @param ddbTblNameForTableStatusTracking 155 | * @param message 156 | * @param skipTableArchive 157 | * @param exportBatchId 158 | * @param sourceGlueCatalogId 159 | * @param isTable 160 | */ 161 | public String getPartitionsAndCreateObjectContent(Context context, AWSGlue glue, GlueUtil glueUtil, 162 | String sourceGlueCatalogId, LargeTable largeTable, String exportBatchId) { 163 | 164 | StringBuilder sb = new StringBuilder(); 165 | Table table = glueUtil.getTable(glue, sourceGlueCatalogId, largeTable.getTable().getDatabaseName(), 166 | largeTable.getTable().getName()); 167 | if (Optional.ofNullable(table).isPresent()) { 168 | List partitionList = glueUtil.getPartitions(glue, sourceGlueCatalogId, 169 | largeTable.getTable().getDatabaseName(), largeTable.getTable().getName()); 170 | AtomicInteger ai = new AtomicInteger(); 171 | for (Partition p : partitionList) { 172 | Gson gson = new Gson(); 173 | String partitionDDL = gson.toJson(p); 174 | sb.append(String.format("%s%n", partitionDDL)); 175 | System.out.printf("Partition #: %d, schema: %s. \n", ai.incrementAndGet(), partitionDDL); 176 | } 177 | } 178 | return sb.toString(); 179 | } 180 | } -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/lambda/GDCReplicationPlanner.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.lambda; 5 | 6 | import java.util.ArrayList; 7 | import java.util.Collections; 8 | import java.util.List; 9 | import java.util.Optional; 10 | import java.util.StringTokenizer; 11 | import java.util.stream.Collectors; 12 | 13 | import com.amazonaws.gdcreplication.util.DDBUtil; 14 | import com.amazonaws.gdcreplication.util.GlueUtil; 15 | import com.amazonaws.gdcreplication.util.SNSUtil; 16 | import com.amazonaws.regions.Regions; 17 | import com.amazonaws.services.glue.AWSGlue; 18 | import com.amazonaws.services.glue.AWSGlueClientBuilder; 19 | import com.amazonaws.services.glue.model.Database; 20 | import com.amazonaws.services.lambda.runtime.Context; 21 | import com.amazonaws.services.lambda.runtime.RequestHandler; 22 | import com.amazonaws.services.sns.AmazonSNS; 23 | import com.amazonaws.services.sns.AmazonSNSClientBuilder; 24 | 25 | /** 26 | * This class has AWS Lambda Handler method. Upon invocation, it fetches all the 27 | * databases form Glue Catalog, for each database, it takes the following 28 | * actions: 29 | * 1. Convert Glue Database object to JSON String (This is Database DDL) 30 | * 2. Publish the Database DDL to an SNS Topic 31 | * 3. Insert a record to a DynamoDB table for status tracking 32 | * 33 | * @author Ravi Itha, Amazon Web Services, Inc. 
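 * 
 * A hedged example of the prefix filter (the environment variable names are from this class; the database names are hypothetical): with database_prefix_list = "raw_|stage_" and separator = "|", databases raw_sales and stage_hr are exported while analytics_tmp is skipped, because getRequiredDatabases() keeps only the databases whose names start with one of the prefixes.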
34 | * 35 | */ 36 | public class GDCReplicationPlanner implements RequestHandler<Object, String> { 37 | 38 | @Override 39 | public String handleRequest(Object input, Context context) { 40 | 41 | context.getLogger().log("Input: " + input); 42 | 43 | String region = Optional.ofNullable(System.getenv("region")).orElse(Regions.US_EAST_1.getName()); 44 | String sourceGlueCatalogId = Optional.ofNullable(System.getenv("source_glue_catalog_id")).orElse("1234567890"); 45 | String dbPrefixString = Optional.ofNullable(System.getenv("database_prefix_list")).orElse(""); 46 | String separator = Optional.ofNullable(System.getenv("separator")).orElse("|"); 47 | String topicArn = Optional.ofNullable(System.getenv("sns_topic_arn_gdc_replication_planner")) 48 | .orElse("arn:aws:sns:us-east-1:1234567890:GlueExportSNSTopic"); 49 | String ddbTblNameForDBStatusTracking = Optional.ofNullable(System.getenv("ddb_name_gdc_replication_planner")) 50 | .orElse("ddb_name_gdc_replication_planner"); 51 | 52 | // Print environment variables 53 | printEnvVariables(sourceGlueCatalogId, topicArn, ddbTblNameForDBStatusTracking, dbPrefixString, separator); 54 | 55 | // Create clients for Glue and SNS 56 | AWSGlue glue = AWSGlueClientBuilder.standard().withRegion(region).build(); 57 | AmazonSNS sns = AmazonSNSClientBuilder.standard().withRegion(region).build(); 58 | 59 | // Create Objects for Utility classes 60 | DDBUtil ddbUtil = new DDBUtil(); 61 | SNSUtil snsUtil = new SNSUtil(); 62 | GlueUtil glueUtil = new GlueUtil(); 63 | 64 | // Get databases from Glue 65 | int numberOfDatabasesExported = 0; 66 | List<Database> dBList = glueUtil.getDatabases(glue, sourceGlueCatalogId); 67 | 68 | // When the database prefix string is empty or not provided, it exports all databases; 69 | // otherwise, it exports only the databases whose names start with one of the prefixes 70 | if (dbPrefixString.equalsIgnoreCase("")) { 71 | numberOfDatabasesExported = snsUtil.publishDatabaseSchemasToSNS(sns, dBList, topicArn, ddbUtil, 72 | ddbTblNameForDBStatusTracking, sourceGlueCatalogId); 73 | } else { 74 | // Tokenize the database prefix string to a List of database prefixes 75 | List<String> dbPrefixList = tokenizeDatabasePrefixString(dbPrefixString, separator); 76 | // Identify required databases to export 77 | List<Database> dBsListToExport = getRequiredDatabases(dBList, dbPrefixList); 78 | // Publish schemas for databases to SNS Topic 79 | numberOfDatabasesExported = snsUtil.publishDatabaseSchemasToSNS(sns, dBsListToExport, topicArn, ddbUtil, 80 | ddbTblNameForDBStatusTracking, sourceGlueCatalogId); 81 | } 82 | System.out.printf( 83 | "Database export statistics: number of databases in the catalog = %d, number of databases exported to SNS = %d. 
\n", 84 | dBList.size(), numberOfDatabasesExported); 85 | return "Lambda function to get a list of Databases completed successfully!"; 86 | } 87 | 88 | /** 89 | * This method prints environment variables 90 | * @param sourceGlueCatalogId 91 | * @param topicArn 92 | * @param ddbTblNameForDBStatusTracking 93 | */ 94 | public static void printEnvVariables(String sourceGlueCatalogId, String topicArn, 95 | String ddbTblNameForDBStatusTracking, String dbPrefixString, String separator) { 96 | System.out.println("SNS Topic Arn: " + topicArn); 97 | System.out.println("Source Catalog Id: " + sourceGlueCatalogId); 98 | System.out.println("Database Prefix String: " + dbPrefixString); 99 | System.out.println("Prefix Separator: " + separator); 100 | System.out.println("DynamoDB Table to track GDC Replication Planning: " + ddbTblNameForDBStatusTracking); 101 | } 102 | 103 | /** 104 | * Tokenize the Data Prefix String to a List of Prefixes 105 | * @param dbPrefixString 106 | * @param token 107 | * @return 108 | */ 109 | public static List tokenizeDatabasePrefixString(String str, String separator) { 110 | 111 | List dbPrefixesList = Collections.list(new StringTokenizer(str, separator)).stream() 112 | .map(token -> (String) token) 113 | .collect(Collectors.toList()); 114 | System.out.println("Number of database prefixes: " + dbPrefixesList.size()); 115 | return dbPrefixesList; 116 | } 117 | 118 | /** 119 | * 120 | * @param dBList 121 | * @param requiredDBPrefixList 122 | * @return 123 | */ 124 | public static List getRequiredDatabases(List dBList, List dbPrefixesList){ 125 | 126 | List dBsToExportList = new ArrayList(); 127 | for(Database database : dBList) { 128 | for(String dbPrefix : dbPrefixesList) { 129 | if(database.getName().toLowerCase().startsWith(dbPrefix)) { 130 | dBsToExportList.add(database); 131 | break; 132 | } 133 | } 134 | } 135 | System.out.printf("Number of databases in Glue Catalog: %d, number of databases to be exported: %d \n", dBList.size(), dBsToExportList.size()); 136 | return dBsToExportList; 137 | } 138 | } -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/lambda/ImportDatabaseOrTable.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.lambda; 5 | 6 | import java.util.List; 7 | import java.util.Map; 8 | import java.util.Optional; 9 | 10 | import com.amazonaws.ClientConfiguration; 11 | import com.amazonaws.gdcreplication.util.GDCUtil; 12 | import com.amazonaws.gdcreplication.util.LargeTable; 13 | import com.amazonaws.gdcreplication.util.SQSUtil; 14 | import com.amazonaws.gdcreplication.util.TableWithPartitions; 15 | import com.amazonaws.regions.Regions; 16 | import com.amazonaws.services.glue.AWSGlue; 17 | import com.amazonaws.services.glue.AWSGlueClientBuilder; 18 | import com.amazonaws.services.glue.model.Database; 19 | import com.amazonaws.services.lambda.runtime.Context; 20 | import com.amazonaws.services.lambda.runtime.RequestHandler; 21 | import com.amazonaws.services.lambda.runtime.events.SNSEvent; 22 | import com.amazonaws.services.lambda.runtime.events.SNSEvent.MessageAttribute; 23 | import com.amazonaws.services.lambda.runtime.events.SNSEvent.SNSRecord; 24 | import com.amazonaws.services.sqs.AmazonSQS; 25 | import com.amazonaws.services.sqs.AmazonSQSClientBuilder; 26 | import com.google.gson.Gson; 27 | import com.google.gson.JsonSyntaxException; 28 | 29 | /** 30 | * This class has AWS Lambda Handler method. Upon invocation, it gets an SNS 31 | * Event from source SNS Topic, gets the message(s) from the event, parse the 32 | * message to database or table type and takes one of the following actions on 33 | * the target glue catalog: 34 | * 35 | * 1. Create a Database if it does not exist already 2. Create a Table if it 36 | * does not exist already 3. Update a Table if it exist already 37 | * 38 | * @author Ravi Itha, Amazon Web Services, Inc. 39 | * 40 | */ 41 | public class ImportDatabaseOrTable implements RequestHandler { 42 | 43 | public Object handleRequest(SNSEvent request, Context context) { 44 | 45 | String region = Optional.ofNullable(System.getenv("region")).orElse(Regions.US_EAST_1.getName()); 46 | String targetGlueCatalogId = Optional.ofNullable(System.getenv("target_glue_catalog_id")).orElse("1234567890"); 47 | boolean skipTableArchive = Boolean 48 | .parseBoolean(Optional.ofNullable(System.getenv("skip_archive")).orElse("true")); 49 | String ddbTblNameForDBStatusTracking = Optional.ofNullable(System.getenv("ddb_name_db_import_status")) 50 | .orElse("ddb_name_db_import_status"); 51 | String ddbTblNameForTableStatusTracking = Optional.ofNullable(System.getenv("ddb_name_table_import_status")) 52 | .orElse("ddb_name_table_import_status"); 53 | String sqsQueueURL = Optional.ofNullable(System.getenv("dlq_url_sqs")).orElse(""); 54 | String sqsQueueURLLargeTable = Optional.ofNullable(System.getenv("sqs_queue_url_large_tables")).orElse(""); 55 | 56 | // Print environment variables 57 | printEnvVariables(targetGlueCatalogId, skipTableArchive, ddbTblNameForDBStatusTracking, 58 | ddbTblNameForTableStatusTracking, sqsQueueURL, region, sqsQueueURLLargeTable); 59 | 60 | // Set client configuration 61 | ClientConfiguration cc = new ClientConfiguration(); 62 | cc.setMaxErrorRetry(10); 63 | 64 | // Create Objects for Glue and SQS 65 | AWSGlue glue = AWSGlueClientBuilder.standard().withRegion(region).withClientConfiguration(cc).build(); 66 | AmazonSQS sqs = AmazonSQSClientBuilder.standard().withRegion(region).withClientConfiguration(cc).build(); 67 | 68 | // Process records 69 | List snsRecods = request.getRecords(); 70 | processSNSEvent(context, snsRecods, glue, sqs, sqsQueueURL, sqsQueueURLLargeTable, targetGlueCatalogId, 71 | 
ddbTblNameForDBStatusTracking, ddbTblNameForTableStatusTracking, skipTableArchive, region); 72 | return "Success"; 73 | } 74 | 75 | /** 76 | * This method processes SNS event and has the business logic to import 77 | * Databases and Tables to Glue Catalog 78 | * @param context 79 | * @param snsRecods 80 | * @param glue 81 | * @param sqs 82 | * @param sqsQueueURL 83 | * @param sqsQueueURLLargeTable 84 | * @param targetGlueCatalogId 85 | * @param ddbTblNameForDBStatusTracking 86 | * @param ddbTblNameForTableStatusTracking 87 | * @param skipTableArchive 88 | * @param region 89 | */ 90 | public void processSNSEvent(Context context, List snsRecods, AWSGlue glue, AmazonSQS sqs, 91 | String sqsQueueURL, String sqsQueueURLLargeTable, String targetGlueCatalogId, 92 | String ddbTblNameForDBStatusTracking, String ddbTblNameForTableStatusTracking, boolean skipTableArchive, 93 | String region) { 94 | 95 | SQSUtil sqsUtil = new SQSUtil(); 96 | for (SNSRecord snsRecod : snsRecods) { 97 | boolean isDatabaseType = false; 98 | boolean isTableType = false; 99 | boolean isLargeTable = false; 100 | LargeTable largeTable = null; 101 | Database db = null; 102 | TableWithPartitions table = null; 103 | Gson gson = new Gson(); 104 | String message = snsRecod.getSNS().getMessage(); 105 | context.getLogger().log("SNS Message Payload: " + message); 106 | 107 | // Get message attributes from the SNS Payload 108 | Map msgAttributeMap = snsRecod.getSNS().getMessageAttributes(); 109 | MessageAttribute msgTypeAttr = msgAttributeMap.get("message_type"); 110 | MessageAttribute sourceCatalogIdAttr = msgAttributeMap.get("source_catalog_id"); 111 | MessageAttribute exportBatchIdAttr = msgAttributeMap.get("export_batch_id"); 112 | String sourceGlueCatalogId = sourceCatalogIdAttr.getValue(); 113 | String exportBatchId = exportBatchIdAttr.getValue(); 114 | context.getLogger().log("Message Type: " + msgTypeAttr.getValue()); 115 | context.getLogger().log("Source Catalog Id: " + sourceGlueCatalogId); 116 | 117 | // Serialize JSON String based on the message type 118 | try { 119 | if (msgTypeAttr.getValue().equalsIgnoreCase("database")) { 120 | db = gson.fromJson(message, Database.class); 121 | isDatabaseType = true; 122 | } else if (msgTypeAttr.getValue().equalsIgnoreCase("table")) { 123 | table = gson.fromJson(message, TableWithPartitions.class); 124 | isTableType = true; 125 | } else if (msgTypeAttr.getValue().equalsIgnoreCase("largeTable")) { 126 | largeTable = gson.fromJson(message, LargeTable.class); 127 | isLargeTable = true; 128 | } 129 | } catch (JsonSyntaxException e) { 130 | System.out.println("Cannot parse SNS message to Glue Database Type."); 131 | e.printStackTrace(); 132 | } 133 | 134 | // Execute the business logic based on the message type 135 | GDCUtil gdcUtil = new GDCUtil(); 136 | if (isDatabaseType) { 137 | gdcUtil.processDatabseSchema(glue, sqs, targetGlueCatalogId, db, message, sqsQueueURL, sourceGlueCatalogId, 138 | exportBatchId, ddbTblNameForDBStatusTracking); 139 | } else if (isTableType) { 140 | gdcUtil.processTableSchema(glue, sqs, targetGlueCatalogId, sourceGlueCatalogId, table, message, 141 | ddbTblNameForTableStatusTracking, sqsQueueURL, exportBatchId, skipTableArchive); 142 | } else if (isLargeTable) { 143 | sqsUtil.sendLargeTableSchemaToSQS(sqs, sqsQueueURLLargeTable, exportBatchId, sourceGlueCatalogId, 144 | message, largeTable); 145 | } 146 | } 147 | } 148 | 149 | /** 150 | * Print environment variables 151 | * @param targetGlueCatalogId 152 | * @param ddbTblNameForDBStatusTracking 153 | * @param 
ddbTblNameForTableStatusTracking 154 | * @param sqsURL 155 | */ 156 | public void printEnvVariables(String targetGlueCatalogId, boolean skipTableArchive, 157 | String ddbTblNameForDBStatusTracking, String ddbTblNameForTableStatusTracking, String sqsQueueURL, 158 | String region, String sqsQueueURLLargeTable) { 159 | System.out.println("Target Catalog Id: " + targetGlueCatalogId); 160 | System.out.println("Skip Table Archive: " + skipTableArchive); 161 | System.out.println("DynamoDB Table for DB Import Auditing: " + ddbTblNameForDBStatusTracking); 162 | System.out.println("DynamoDB Table for Table Import Auditing: " + ddbTblNameForTableStatusTracking); 163 | System.out.println("Dead Letter Queue URL: " + sqsQueueURL); 164 | System.out.println("Region: " + region); 165 | System.out.println("SQS Queue URL for Large Tables: " + sqsQueueURLLargeTable); 166 | } 167 | 168 | 169 | } 170 | -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/lambda/ImportLargeTable.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.lambda; 5 | 6 | import java.util.List; 7 | import java.util.Map.Entry; 8 | import java.util.Optional; 9 | 10 | import com.amazonaws.ClientConfiguration; 11 | import com.amazonaws.gdcreplication.util.DDBUtil; 12 | import com.amazonaws.gdcreplication.util.GlueUtil; 13 | import com.amazonaws.gdcreplication.util.LargeTable; 14 | import com.amazonaws.gdcreplication.util.S3Util; 15 | import com.amazonaws.gdcreplication.util.TableReplicationStatus; 16 | import com.amazonaws.regions.Regions; 17 | import com.amazonaws.services.glue.AWSGlue; 18 | import com.amazonaws.services.glue.AWSGlueClientBuilder; 19 | import com.amazonaws.services.glue.model.Partition; 20 | import com.amazonaws.services.lambda.runtime.Context; 21 | import com.amazonaws.services.lambda.runtime.RequestHandler; 22 | import com.amazonaws.services.lambda.runtime.events.SQSEvent; 23 | import com.amazonaws.services.lambda.runtime.events.SQSEvent.MessageAttribute; 24 | import com.amazonaws.services.lambda.runtime.events.SQSEvent.SQSMessage; 25 | import com.amazonaws.services.sqs.AmazonSQS; 26 | import com.amazonaws.services.sqs.AmazonSQSClientBuilder; 27 | import com.google.gson.Gson; 28 | import com.google.gson.JsonSyntaxException; 29 | 30 | public class ImportLargeTable implements RequestHandler { 31 | 32 | @Override 33 | public String handleRequest(SQSEvent event, Context context) { 34 | 35 | String region = Optional.ofNullable(System.getenv("region")).orElse(Regions.US_EAST_1.getName()); 36 | String targetGlueCatalogId = Optional.ofNullable(System.getenv("target_glue_catalog_id")).orElse("1234567890"); 37 | boolean skipTableArchive = Boolean 38 | .parseBoolean(Optional.ofNullable(System.getenv("skip_archive")).orElse("true")); 39 | String ddbTblNameForTableStatusTracking = Optional.ofNullable(System.getenv("ddb_name_table_import_status")) 40 | .orElse("ddb_name_table_import_status"); 41 | boolean recordProcessed = false; 42 | 43 | // Print environment variables 44 | printEnvVariables(targetGlueCatalogId, skipTableArchive, ddbTblNameForTableStatusTracking, region); 45 | 46 | // Set client configuration 47 | ClientConfiguration cc = new ClientConfiguration(); 48 | cc.setMaxErrorRetry(10); 49 | 50 | // Create Objects for Glue and SQS 51 | AWSGlue glue = 
AWSGlueClientBuilder.standard().withRegion(region).withClientConfiguration(cc).build(); 52 | AmazonSQS sqs = AmazonSQSClientBuilder.standard().withRegion(region).withClientConfiguration(cc).build(); 53 | 54 | // Iterate and process all the messages which are part of SQSEvent 55 | System.out.println("Number of messages in SQS Event: " + event.getRecords().size()); 56 | for (SQSMessage msg : event.getRecords()) { 57 | String ddl = new String(msg.getBody()); 58 | String exportBatchId = ""; 59 | String schemaType = ""; 60 | String sourceGlueCatalogId = ""; 61 | // Read Message Attributes 62 | for (Entry entry : msg.getMessageAttributes().entrySet()) { 63 | if ("ExportBatchId".equalsIgnoreCase(entry.getKey())) { 64 | exportBatchId = entry.getValue().getStringValue(); 65 | System.out.println("Export Batch Id: " + exportBatchId); 66 | } else if ("SourceGlueDataCatalogId".equalsIgnoreCase(entry.getKey())) { 67 | sourceGlueCatalogId = entry.getValue().getStringValue(); 68 | System.out.println("Source Glue Data Cagalog Id: " + sourceGlueCatalogId); 69 | } else if ("SchemaType".equalsIgnoreCase(entry.getKey())) { 70 | schemaType = entry.getValue().getStringValue(); 71 | System.out.println("Message Schema Type " + schemaType); 72 | } 73 | } 74 | if (schemaType.equalsIgnoreCase("largeTable")) { 75 | recordProcessed = processsRecord(context, glue, sqs, targetGlueCatalogId, ddbTblNameForTableStatusTracking, 76 | ddl, skipTableArchive, exportBatchId, sourceGlueCatalogId, region); 77 | } 78 | if (!recordProcessed) { 79 | System.out.printf("Input message '%s' could not be processed. This is an exception. It will be reprocessed again. \n", ddl); 80 | throw new RuntimeException(); 81 | } 82 | } 83 | return "Success"; 84 | } 85 | 86 | /** 87 | * Print environment variables 88 | * 89 | * @param targetGlueCatalogId 90 | * @param skipTableArchive 91 | * @param ddbTblNameForTableStatusTracking 92 | * @param region 93 | * @param sqsQueueURL 94 | */ 95 | public void printEnvVariables(String targetGlueCatalogId, boolean skipTableArchive, 96 | String ddbTblNameForTableStatusTracking, String region) { 97 | System.out.println("Target Catalog Id: " + targetGlueCatalogId); 98 | System.out.println("Skip Table Archive: " + skipTableArchive); 99 | System.out.println("DynamoDB Table for Table Import Auditing: " + ddbTblNameForTableStatusTracking); 100 | System.out.println("Region: " + region); 101 | } 102 | 103 | /** 104 | * This method processes a record from SQS 105 | * 106 | * @param context 107 | * @param glue 108 | * @param sqs 109 | * @param sqsQueueURL 110 | * @param targetGlueCatalogId 111 | * @param ddbTblNameForTableStatusTracking 112 | * @param message 113 | * @param skipTableArchive 114 | * @param exportBatchId 115 | * @param sourceGlueCatalogId 116 | * @param region 117 | */ 118 | public boolean processsRecord(Context context, AWSGlue glue, AmazonSQS sqs, 119 | String targetGlueCatalogId, String ddbTblNameForTableStatusTracking, String message, 120 | boolean skipTableArchive, String exportBatchId, String sourceGlueCatalogId, String region) { 121 | 122 | boolean recordProcessed = false; 123 | Gson gson = new Gson(); 124 | S3Util s3Util = new S3Util(); 125 | DDBUtil ddbUtil = new DDBUtil(); 126 | GlueUtil glueUtil = new GlueUtil(); 127 | 128 | LargeTable largeTable = null; 129 | TableReplicationStatus tableStatus = null; 130 | long importRunId = System.currentTimeMillis(); 131 | 132 | // Parse input message to LargeTable object 133 | try { 134 | largeTable = gson.fromJson(message, LargeTable.class); 135 | } catch 
(JsonSyntaxException e) { 136 | System.out.println("Cannot parse the SQS message to the LargeTable type."); 137 | e.printStackTrace(); 138 | } 139 | 140 | // Create or update Table. If the message could not be parsed, return so the caller retries it. 141 | if (Optional.ofNullable(largeTable).isPresent()) { 142 | tableStatus = glueUtil.createOrUpdateTable(glue, largeTable.getTable(), targetGlueCatalogId, 143 | skipTableArchive); 144 | tableStatus.setTableSchema(message); 145 | } else { return recordProcessed; } 146 | 147 | // Update table partitions 148 | if (!tableStatus.isError()) { 149 | // Get partitions from S3 150 | List<Partition> partitionListFromExport = s3Util.getPartitionsFromS3(region, largeTable.getS3BucketName(), 151 | largeTable.getS3ObjectKey()); 152 | 153 | // Get table partitions from Target Account 154 | List<Partition> partitionsB4Replication = glueUtil.getPartitions(glue, targetGlueCatalogId, 155 | largeTable.getTable().getDatabaseName(), largeTable.getTable().getName()); 156 | System.out.println("Number of partitions before replication: " + partitionsB4Replication.size()); 157 | 158 | // Add Partitions to the table if the export has Partitions 159 | if (tableStatus.isReplicated() && partitionListFromExport.size() > 0) { 160 | tableStatus.setExportHasPartitions(true); 161 | if (partitionsB4Replication.size() == 0) { 162 | System.out.println("Adding partitions based on the export."); 163 | boolean partitionsAdded = glueUtil.addPartitions(glue, partitionListFromExport, targetGlueCatalogId, 164 | largeTable.getTable().getDatabaseName(), largeTable.getTable().getName()); 165 | if (partitionsAdded) { 166 | tableStatus.setPartitionsReplicated(true); 167 | recordProcessed = true; 168 | } 169 | } else { 170 | System.out.println( 171 | "Target table has partitions. They will be deleted before adding partitions based on the Export."); 172 | // delete partitions in batch mode 173 | boolean partitionsDeleted = glueUtil.deletePartitions(glue, targetGlueCatalogId, 174 | largeTable.getTable().getDatabaseName(), largeTable.getTable().getName(), 175 | partitionsB4Replication); 176 | 177 | // Enable the below code for debugging purpose. Check number of table partitions after deletion 178 | // List<Partition> partitionsAfterDeletion = glueUtil.getPartitions(glue, targetGlueCatalogId, 179 | // largeTable.getTable().getDatabaseName(), largeTable.getTable().getName()); 180 | // System.out.println("Number of partitions after deletion: " + partitionsAfterDeletion.size()); 181 | 182 | // add partitions from S3 object 183 | boolean partitionsAdded = glueUtil.addPartitions(glue, partitionListFromExport, targetGlueCatalogId, 184 | largeTable.getTable().getDatabaseName(), largeTable.getTable().getName()); 185 | 186 | if (partitionsDeleted && partitionsAdded) { 187 | tableStatus.setPartitionsReplicated(true); 188 | recordProcessed = true; 189 | } 190 | // Enable the below code for debugging purpose. Check number of table partitions after addition 191 | // List<Partition> partitionsAfterAddition = glueUtil.getPartitions(glue, targetGlueCatalogId, 192 | // largeTable.getTable().getDatabaseName(), largeTable.getTable().getName()); 193 | // System.out.println("Number of partitions after addition: " + partitionsAfterAddition.size()); 194 | } 195 | 196 | } else if (tableStatus.isReplicated() && partitionListFromExport.size() == 0) { 197 | tableStatus.setExportHasPartitions(false); 198 | if (partitionsB4Replication.size() > 0) { 199 | // Export has no partitions but table already has some partitions. Those 200 | // partitions will be deleted in batch mode. 
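				// Design note (an inference from this class, not a documented guarantee): the export is treated as the source of truth, so the target table's partitions are replaced wholesale (delete all, then re-add) rather than diffed. GlueUtil presumably chunks these calls, since Glue's BatchDeletePartition and BatchCreatePartition APIs accept at most 25 and 100 partitions per request, respectively.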
201 | boolean partitionsDeleted = glueUtil.deletePartitions(glue, targetGlueCatalogId, 202 | largeTable.getTable().getDatabaseName(), largeTable.getTable().getName(), 203 | partitionsB4Replication); 204 | if (partitionsDeleted) { 205 | tableStatus.setPartitionsReplicated(true); 206 | recordProcessed = true; 207 | } 208 | } 209 | } 210 | } 211 | // If there is any error in creating/updating table then send it to DLQ 212 | else { 213 | System.out.println("Table replicated but partitions were not replicated. Message will be reprocessed again."); 214 | } 215 | 216 | // Track status in DynamoDB 217 | ddbUtil.trackTableImportStatus(tableStatus, sourceGlueCatalogId, targetGlueCatalogId, importRunId, 218 | exportBatchId, ddbTblNameForTableStatusTracking); 219 | System.out.printf( 220 | "Processing of Table shcema completed. Result: Table replicated: %b, Export has partitions: %b, " 221 | + "Partitions replicated: %b, error: %b \n", 222 | tableStatus.isReplicated(), tableStatus.isExportHasPartitions(), tableStatus.isPartitionsReplicated(), 223 | tableStatus.isError()); 224 | 225 | return recordProcessed; 226 | } 227 | } -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/util/DBReplicationStatus.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.util; 5 | 6 | public class DBReplicationStatus { 7 | 8 | private String dbName; 9 | private boolean created; 10 | private boolean error; 11 | 12 | public String getDbName() { 13 | return dbName; 14 | } 15 | public void setDbName(String dbName) { 16 | this.dbName = dbName; 17 | } 18 | public boolean isCreated() { 19 | return created; 20 | } 21 | public void setCreated(boolean created) { 22 | this.created = created; 23 | } 24 | public boolean isError() { 25 | return error; 26 | } 27 | public void setError(boolean error) { 28 | this.error = error; 29 | } 30 | 31 | } 32 | -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/util/DDBUtil.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.util; 5 | 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.Optional; 10 | 11 | import com.amazonaws.ClientConfiguration; 12 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDB; 13 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClientBuilder; 14 | import com.amazonaws.services.dynamodbv2.document.DynamoDB; 15 | import com.amazonaws.services.dynamodbv2.document.Item; 16 | import com.amazonaws.services.dynamodbv2.document.PutItemOutcome; 17 | import com.amazonaws.services.dynamodbv2.document.Table; 18 | import com.amazonaws.services.dynamodbv2.model.BatchWriteItemRequest; 19 | import com.amazonaws.services.dynamodbv2.model.BatchWriteItemResult; 20 | import com.amazonaws.services.dynamodbv2.model.ReturnConsumedCapacity; 21 | import com.amazonaws.services.dynamodbv2.model.WriteRequest; 22 | import com.google.common.collect.Lists; 23 | 24 | /** 25 | *
26 | * This is a utility class with methods to write status-tracking items 27 | * to a DynamoDB table. 28 | *
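 * A hedged sketch of one table-import item written by trackTableImportStatus() below (the attribute names are from the code; the values are hypothetical): 
 * 
 *   table_id      = "sales|raw_db"   -- "<table>|<database>", the hash key 
 *   import_run_id = 1566048629146    -- epoch milliseconds, the range key 
 *   table_created, table_updated, export_has_partitions, partitions_updated = booleans 
 * 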
29 | * 30 | * @author Ravi Itha, Amazon Web Services, Inc. 31 | * 32 | */ 33 | public class DDBUtil { 34 | 35 | /** 36 | * Method to track the status of Tables imported 37 | * @param tableStatus 38 | * @param sourceGlueCatalogId 39 | * @param targetGlueCatalogId 40 | * @param importRunId 41 | * @param ddbTblName 42 | * @return 43 | */ 44 | public boolean trackTableImportStatus(TableReplicationStatus tableStatus, String sourceGlueCatalogId, 45 | String targetGlueCatalogId, long importRunId, String exportBatchId, String ddbTblName) { 46 | boolean itemInserted = false; 47 | 48 | ClientConfiguration cc = new ClientConfiguration(); 49 | cc.setMaxErrorRetry(10); 50 | AmazonDynamoDB client = AmazonDynamoDBClientBuilder.standard().withClientConfiguration(cc).build(); 51 | DynamoDB dynamoDB = new DynamoDB(client); 52 | 53 | Table table = dynamoDB.getTable(ddbTblName); 54 | Item item = new Item().withPrimaryKey("table_id", tableStatus.getTableName().concat("|").concat(tableStatus.getDbName())) 55 | .withNumber("import_run_id", importRunId) 56 | .withString("export_batch_id", exportBatchId) 57 | .withString("table_name", tableStatus.getTableName()) 58 | .withString("database_name", tableStatus.getDbName()) 59 | .withString("table_schema", tableStatus.getTableSchema()) 60 | .withString("target_glue_catalog_id", targetGlueCatalogId) 61 | .withString("source_glue_catalog_id", sourceGlueCatalogId) 62 | .withBoolean("table_created", tableStatus.isCreated()) 63 | .withBoolean("table_updated", tableStatus.isUpdated()) 64 | .withBoolean("export_has_partitions", tableStatus.isExportHasPartitions()) 65 | .withBoolean("partitions_updated", tableStatus.isPartitionsReplicated()); 66 | // Write the item to the table 67 | try { 68 | PutItemOutcome outcome = table.putItem(item); 69 | int statusCode = outcome.getPutItemResult().getSdkHttpMetadata().getHttpStatusCode(); 70 | if (statusCode == 200) { 71 | itemInserted = true; 72 | System.out 73 | .println("Table item inserted to DynamoDB table. 
Table name: " + tableStatus.getTableName()); 74 | } 75 | } catch(Exception e) { 76 | e.printStackTrace(); 77 | System.out.println("Could not insert a Table import status to DynamoDB table: " + ddbTblName); 78 | } 79 | dynamoDB.shutdown(); 80 | return itemInserted; 81 | } 82 | 83 | /** 84 | * Method to track the status of Databases imported 85 | * @param sourceGlueCatalogId 86 | * @param targetGlueCatalogId 87 | * @param ddbTblName 88 | * @param databaseName 89 | * @param importRunId 90 | * @param isCreated 91 | * @return 92 | */ 93 | public boolean trackDatabaseImportStatus(String sourceGlueCatalogId, String targetGlueCatalogId, String ddbTblName, String databaseName, 94 | long importRunId, String exportBatchId, boolean isCreated) { 95 | boolean itemInserted = false; 96 | 97 | ClientConfiguration cc = new ClientConfiguration(); 98 | cc.setMaxErrorRetry(10); 99 | AmazonDynamoDB client = AmazonDynamoDBClientBuilder.standard().withClientConfiguration(cc).build(); 100 | DynamoDB dynamoDB = new DynamoDB(client); 101 | 102 | com.amazonaws.services.dynamodbv2.document.Table table = dynamoDB.getTable(ddbTblName); 103 | Item item = new Item().withPrimaryKey("db_id", databaseName).withNumber("import_run_id", importRunId) 104 | .withString("export_batch_id", exportBatchId).withString("target_glue_catalog_id", targetGlueCatalogId) 105 | .withString("source_glue_catalog_id", sourceGlueCatalogId).withBoolean("is_created", isCreated); 106 | // Write the item to the table 107 | try { 108 | PutItemOutcome outcome = table.putItem(item); 109 | int statusCode = outcome.getPutItemResult().getSdkHttpMetadata().getHttpStatusCode(); 110 | if (statusCode == 200) { 111 | itemInserted = true; 112 | System.out 113 | .println("Database item inserted to DynamoDB table. Database name: " + databaseName); 114 | } 115 | } catch(Exception e) { 116 | e.printStackTrace(); 117 | System.out.println("Could not insert a Database import status to DynamoDB table: " + ddbTblName); 118 | } 119 | dynamoDB.shutdown(); 120 | return itemInserted; 121 | } 122 | 123 | /** 124 | * Method to track the status of Tables exported 125 | * @param ddbTblName 126 | * @param glueDBName 127 | * @param glueTableName 128 | * @param glueTableSchema 129 | * @param snsMsgId 130 | * @param glueCatalogId 131 | * @param exportRunId 132 | * @param isExported 133 | * @return 134 | */ 135 | public boolean trackTableExportStatus(String ddbTblName, String glueDBName, String glueTableName, 136 | String glueTableSchema, String snsMsgId, String glueCatalogId, long exportRunId, String exportBatchId, 137 | boolean isExported, boolean isLargeTable, String bucketName, String objectKey) { 138 | 139 | boolean itemInserted = false; 140 | if (Optional.of(glueDBName).isPresent() && Optional.of(glueTableName).isPresent() 141 | && Optional.of(glueTableSchema).isPresent() && Optional.of(snsMsgId).isPresent()) { 142 | 143 | ClientConfiguration cc = new ClientConfiguration(); 144 | cc.setMaxErrorRetry(10); 145 | AmazonDynamoDB client = AmazonDynamoDBClientBuilder.standard().withClientConfiguration(cc).build(); 146 | DynamoDB dynamoDB = new DynamoDB(client); 147 | 148 | com.amazonaws.services.dynamodbv2.document.Table table = dynamoDB.getTable(ddbTblName); 149 | 150 | Item item = new Item().withPrimaryKey("table_id", glueTableName.concat("|").concat(glueDBName)) 151 | .withNumber("export_run_id", exportRunId).withString("export_batch_id", exportBatchId) 152 | .withString("source_glue_catalog_id", glueCatalogId).withString("table_schema", glueTableSchema) 153 | 
.withString("sns_msg_id", snsMsgId).withBoolean("is_exported", isExported) 154 | .withBoolean("is_large_table", isLargeTable); 155 | 156 | if(Optional.ofNullable(bucketName).isPresent() && Optional.ofNullable(objectKey).isPresent()) 157 | item.withString("s3_bucket_name", bucketName).withString("object_key", objectKey); 158 | 159 | // Write the item to the table 160 | try { 161 | PutItemOutcome outcome = table.putItem(item); 162 | int statusCode = outcome.getPutItemResult().getSdkHttpMetadata().getHttpStatusCode(); 163 | if (statusCode == 200) { 164 | itemInserted = true; 165 | System.out.println("Table item inserted to DynamoDB table. Table name: " + glueTableName); 166 | } 167 | } catch(Exception e) { 168 | e.printStackTrace(); 169 | System.out.println("Could not insert a Table export status to DynamoDB table: " + ddbTblName); 170 | } 171 | dynamoDB.shutdown(); 172 | } else { 173 | System.out.println("Not all the values present to insert Table item to "); 174 | } 175 | return itemInserted; 176 | } 177 | 178 | /** 179 | * Method to track the status of Databases exported 180 | * @param ddbTblName 181 | * @param glueDBName 182 | * @param glueDBSchema 183 | * @param snsMsgId 184 | * @param glueCatalogId 185 | * @param exportRunId 186 | * @param isExported 187 | * @return 188 | */ 189 | public boolean trackDatabaseExportStatus(String ddbTblName, String glueDBName, String glueDBSchema, String snsMsgId, 190 | String glueCatalogId, long exportRunId, String exportBatchId, boolean isExported) { 191 | boolean itemInserted = false; 192 | 193 | ClientConfiguration cc = new ClientConfiguration(); 194 | cc.setMaxErrorRetry(10); 195 | AmazonDynamoDB client = AmazonDynamoDBClientBuilder.standard().withClientConfiguration(cc).build(); 196 | DynamoDB dynamoDB = new DynamoDB(client); 197 | 198 | com.amazonaws.services.dynamodbv2.document.Table table = dynamoDB.getTable(ddbTblName); 199 | Item item = new Item().withPrimaryKey("db_id", glueDBName) 200 | .withNumber("export_run_id", exportRunId) 201 | .withString("export_batch_id", exportBatchId) 202 | .withString("source_glue_catalog_id", glueCatalogId) 203 | .withString("database_schema", glueDBSchema) 204 | .withString("sns_msg_id", snsMsgId) 205 | .withBoolean("is_exported", isExported); 206 | // Write the item to the table 207 | try { 208 | PutItemOutcome outcome = table.putItem(item); 209 | int statusCode = outcome.getPutItemResult().getSdkHttpMetadata().getHttpStatusCode(); 210 | if (statusCode == 200) { 211 | itemInserted = true; 212 | System.out.println("Status inserted to DynamoDB table for Glue Database: " + glueDBName); 213 | } 214 | } catch(Exception e) { 215 | e.printStackTrace(); 216 | System.out.println("Could not insert a Database export status to DynamoDB table: " + ddbTblName); 217 | } 218 | dynamoDB.shutdown(); 219 | return itemInserted; 220 | } 221 | 222 | /** 223 | * This method inserts multiple items to a DynamoDB table using Batch Write Item API 224 | * @param itemList 225 | * @param dynamoDBTblName 226 | */ 227 | public void insertIntoDynamoDB(List itemList, String dynamoDBTblName) { 228 | 229 | System.out.printf("Inserting %d items to DynamoDB using Batch API call. 
\n", itemList.size()); 230 | AmazonDynamoDB dynamoDB = AmazonDynamoDBClientBuilder.standard().build(); 231 | for (List miniBatch : Lists.partition(itemList, 25)) { 232 | Map> requestItems = new HashMap>(); 233 | requestItems.put(dynamoDBTblName, miniBatch); 234 | BatchWriteItemRequest batchWriteItemRequest = new BatchWriteItemRequest() 235 | .withReturnConsumedCapacity(ReturnConsumedCapacity.TOTAL) 236 | .withRequestItems(requestItems); 237 | BatchWriteItemResult result = dynamoDB.batchWriteItem(batchWriteItemRequest); 238 | while (result.getUnprocessedItems().size() > 0) { 239 | Map> unprocessedItems = result.getUnprocessedItems(); 240 | result = dynamoDB.batchWriteItem(unprocessedItems); 241 | } 242 | } 243 | dynamoDB.shutdown(); 244 | } 245 | 246 | } -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/util/GDCUtil.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.util; 5 | 6 | import java.util.List; 7 | import java.util.Optional; 8 | 9 | import com.amazonaws.services.glue.AWSGlue; 10 | import com.amazonaws.services.glue.model.Database; 11 | import com.amazonaws.services.glue.model.Partition; 12 | import com.amazonaws.services.glue.model.Table; 13 | import com.amazonaws.services.sqs.AmazonSQS; 14 | 15 | public class GDCUtil { 16 | 17 | /** 18 | * This method processes a Message that belongs to Table schema 19 | * @param glue 20 | * @param sqs 21 | * @param targetGlueCatalogId 22 | * @param sourceGlueCatalogId 23 | * @param tableWithPartitions 24 | * @param message 25 | * @param ddbTblNameForTableStatusTracking 26 | * @param sqsQueueURL 27 | * @param exportBatchId 28 | * @param skipTableArchive 29 | */ 30 | public void processTableSchema(AWSGlue glue, AmazonSQS sqs, String targetGlueCatalogId, String sourceGlueCatalogId, 31 | TableWithPartitions tableWithPartitions, String message, String ddbTblNameForTableStatusTracking, 32 | String sqsQueueURL, String exportBatchId, boolean skipTableArchive) { 33 | 34 | DDBUtil ddbUtil = new DDBUtil(); 35 | SQSUtil sqsUtil = new SQSUtil(); 36 | GlueUtil glueUtil = new GlueUtil(); 37 | long importRunId = System.currentTimeMillis(); 38 | 39 | // Get Table and its Partitions from Input JSON 40 | Table table = tableWithPartitions.getTable(); 41 | List partitionListFromExport = tableWithPartitions.getPartitionList(); 42 | 43 | // Create or update table 44 | TableReplicationStatus tableStatus = glueUtil.createOrUpdateTable(glue, table, targetGlueCatalogId, 45 | skipTableArchive); 46 | // If database not found then create one 47 | if (tableStatus.isDbNotFoundError()) { 48 | System.out.printf("Creating Database with name: '%s'. \n", table.getDatabaseName()); 49 | DBReplicationStatus dbStatus = glueUtil.createGlueDatabase(glue, targetGlueCatalogId, 50 | table.getDatabaseName(), 51 | "Database Imported from Glue Data Catalog of AWS Account Id: ".concat(sourceGlueCatalogId)); 52 | // Now, try to create / update table again. 
53 | if (dbStatus.isCreated()) { 54 | tableStatus = glueUtil.createOrUpdateTable(glue, tableWithPartitions.getTable(), targetGlueCatalogId, 55 | skipTableArchive); 56 | } 57 | } 58 | tableStatus.setTableSchema(message); 59 | 60 | // Update table partitions 61 | if (!tableStatus.isError()) { 62 | // Get table partitions from Target Account 63 | List partitionsB4Replication = glueUtil.getPartitions(glue, targetGlueCatalogId, 64 | table.getDatabaseName(), table.getName()); 65 | System.out.println("Number of partitions before replication: " + partitionsB4Replication.size()); 66 | 67 | // Add Partitions to the table if the export has Partitions 68 | if (partitionListFromExport.size() > 0) { 69 | tableStatus.setExportHasPartitions(true); 70 | if (partitionsB4Replication.size() == 0) { 71 | System.out.println("Adding partitions based on the export."); 72 | boolean partitionsAdded = glueUtil.addPartitions(glue, partitionListFromExport, targetGlueCatalogId, 73 | table.getDatabaseName(), table.getName()); 74 | if (partitionsAdded) 75 | tableStatus.setPartitionsReplicated(true); 76 | } else { 77 | System.out.println( 78 | "Table has partitions. They will be deleted first before adding partitions based on Export."); 79 | // delete partitions in batch mode 80 | boolean partitionsDeleted = glueUtil.deletePartitions(glue, targetGlueCatalogId, 81 | table.getDatabaseName(), table.getName(), partitionsB4Replication); 82 | 83 | // Enable the below code for debugging purpose. Check number of table partitions after deletion 84 | // List partitionsAfterDeletion = glueUtil.getPartitions(glue, targetGlueCatalogId, 85 | // table.getDatabaseName(), table.getName()); 86 | // System.out.println("Number of partitions after deletion: " + partitionsAfterDeletion.size()); 87 | 88 | // add partitions from S3 object 89 | boolean partitionsAdded = glueUtil.addPartitions(glue, partitionListFromExport, targetGlueCatalogId, 90 | table.getDatabaseName(), table.getName()); 91 | 92 | if (partitionsDeleted && partitionsAdded) 93 | tableStatus.setPartitionsReplicated(true); 94 | 95 | // Enable the below code for debugging purpose. Check number of table partitions after addition 96 | // List partitionsAfterAddition = glueUtil.getPartitions(glue, targetGlueCatalogId, 97 | // table.getDatabaseName(), table.getName()); 98 | // System.out.println("Number of partitions after addition: " + partitionsAfterAddition.size()); 99 | } 100 | } else if (partitionListFromExport.size() == 0) { 101 | tableStatus.setExportHasPartitions(false); 102 | if (partitionsB4Replication.size() > 0) { 103 | // Export has no partitions but table already has some partitions. Those 104 | // partitions will be deleted in batch mode. 105 | boolean partitionsDeleted = glueUtil.deletePartitions(glue, targetGlueCatalogId, 106 | table.getDatabaseName(), table.getName(), partitionsB4Replication); 107 | if (partitionsDeleted) 108 | tableStatus.setPartitionsReplicated(true); 109 | } 110 | } 111 | } 112 | // If there is any error in creating/updating table then send it to DLQ 113 | else { 114 | System.out.println("Error in creating/updating table in the Glue Data Catalog. 
It will be send to DLQ."); 115 | sqsUtil.sendTableSchemaToDeadLetterQueue(sqs, sqsQueueURL, tableStatus, exportBatchId, sourceGlueCatalogId); 116 | } 117 | // Track status in DynamoDB 118 | ddbUtil.trackTableImportStatus(tableStatus, sourceGlueCatalogId, targetGlueCatalogId, importRunId, 119 | exportBatchId, ddbTblNameForTableStatusTracking); 120 | System.out.printf( 121 | "Processing of Table shcema completed. Result: Table replicated: %b, Export has partitions: %b, " 122 | + "Partitions replicated: %b, error: %b \n", 123 | tableStatus.isReplicated(), tableStatus.isExportHasPartitions(), tableStatus.isPartitionsReplicated(), 124 | tableStatus.isError()); 125 | } 126 | 127 | /** 128 | * This method processes a Message that belongs to Database schema 129 | * @param glue 130 | * @param sqs 131 | * @param targetGlueCatalogId 132 | * @param db 133 | * @param message 134 | * @param sqsQueueURL 135 | * @param sourceGlueCatalogId 136 | * @param exportBatchId 137 | * @param ddbTblNameForDBStatusTracking 138 | */ 139 | public void processDatabseSchema(AWSGlue glue, AmazonSQS sqs, String targetGlueCatalogId, Database db, 140 | String message, String sqsQueueURL, String sourceGlueCatalogId, String exportBatchId, 141 | String ddbTblNameForDBStatusTracking) { 142 | 143 | DDBUtil ddbUtil = new DDBUtil(); 144 | GlueUtil glueUtil = new GlueUtil(); 145 | SQSUtil sqsUtil = new SQSUtil(); 146 | 147 | boolean isDBCreated = false; 148 | long importRunId = System.currentTimeMillis(); 149 | Database database = glueUtil.getDatabaseIfExist(glue, targetGlueCatalogId, db); 150 | boolean dbExist = Optional.ofNullable(database).isPresent(); 151 | if (!dbExist) { 152 | DBReplicationStatus dbStatus = glueUtil.createGlueDatabase(glue, targetGlueCatalogId, db); 153 | if (dbStatus.isError()) { 154 | System.out.println("Error in creating database in the Glue Data Catalog. It will be send to DLQ."); 155 | sqsUtil.sendDatabaseSchemaToDeadLetterQueue(sqs, sqsQueueURL, message, db.getName(), exportBatchId, 156 | sourceGlueCatalogId); 157 | } else 158 | isDBCreated = true; 159 | } else 160 | System.out.printf( 161 | "Database with name '%s' exist already in target Glue Data Catalog. No action will be taken. \n", 162 | database.getName()); 163 | // Track status in DynamoDB 164 | ddbUtil.trackDatabaseImportStatus(sourceGlueCatalogId, targetGlueCatalogId, ddbTblNameForDBStatusTracking, 165 | db.getName(), importRunId, exportBatchId, isDBCreated); 166 | System.out.printf("Processing of Database shcema completed. Result: DB already exist: %b, DB created: %b. \n", 167 | dbExist, isDBCreated); 168 | } 169 | } 170 | -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/util/GlueUtil.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.util; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.Optional; 9 | import java.util.concurrent.atomic.AtomicInteger; 10 | 11 | import com.amazonaws.services.glue.AWSGlue; 12 | import com.amazonaws.services.glue.model.BatchCreatePartitionRequest; 13 | import com.amazonaws.services.glue.model.BatchCreatePartitionResult; 14 | import com.amazonaws.services.glue.model.BatchDeletePartitionRequest; 15 | import com.amazonaws.services.glue.model.BatchDeletePartitionResult; 16 | import com.amazonaws.services.glue.model.CreateDatabaseRequest; 17 | import com.amazonaws.services.glue.model.CreateDatabaseResult; 18 | import com.amazonaws.services.glue.model.CreateTableRequest; 19 | import com.amazonaws.services.glue.model.CreateTableResult; 20 | import com.amazonaws.services.glue.model.Database; 21 | import com.amazonaws.services.glue.model.DatabaseInput; 22 | import com.amazonaws.services.glue.model.DeletePartitionRequest; 23 | import com.amazonaws.services.glue.model.DeletePartitionResult; 24 | import com.amazonaws.services.glue.model.EntityNotFoundException; 25 | import com.amazonaws.services.glue.model.GetDatabaseRequest; 26 | import com.amazonaws.services.glue.model.GetDatabaseResult; 27 | import com.amazonaws.services.glue.model.GetDatabasesRequest; 28 | import com.amazonaws.services.glue.model.GetDatabasesResult; 29 | import com.amazonaws.services.glue.model.GetPartitionsRequest; 30 | import com.amazonaws.services.glue.model.GetPartitionsResult; 31 | import com.amazonaws.services.glue.model.GetTableRequest; 32 | import com.amazonaws.services.glue.model.GetTableResult; 33 | import com.amazonaws.services.glue.model.GetTablesRequest; 34 | import com.amazonaws.services.glue.model.GetTablesResult; 35 | import com.amazonaws.services.glue.model.Partition; 36 | import com.amazonaws.services.glue.model.PartitionError; 37 | import com.amazonaws.services.glue.model.PartitionInput; 38 | import com.amazonaws.services.glue.model.PartitionValueList; 39 | import com.amazonaws.services.glue.model.StorageDescriptor; 40 | import com.amazonaws.services.glue.model.Table; 41 | import com.amazonaws.services.glue.model.TableInput; 42 | import com.amazonaws.services.glue.model.UpdateTableRequest; 43 | import com.amazonaws.services.glue.model.UpdateTableResult; 44 | import com.google.common.collect.Lists; 45 | 46 | /** 47 | * This is class has utility methods to work with AWS Glue Data Catalog 48 | * 49 | * @author Ravi Itha, Amazon Web Services, Inc. 50 | * 51 | */ 52 | public class GlueUtil { 53 | 54 | /** 55 | * This method checks if a Database exist with the given name in the Glue Data 56 | * Catalog 57 | * 58 | * @param glue 59 | * @param targetCatalogId 60 | * @param db 61 | * @return 62 | */ 63 | public Database getDatabaseIfExist(AWSGlue glue, String targetCatalogId, Database db) { 64 | Database database = null; 65 | GetDatabaseRequest getDatabaseRequest = new GetDatabaseRequest(); 66 | getDatabaseRequest.setCatalogId(targetCatalogId); 67 | getDatabaseRequest.setName(db.getName()); 68 | try { 69 | GetDatabaseResult getDatabaseResult = glue.getDatabase(getDatabaseRequest); 70 | database = getDatabaseResult.getDatabase(); 71 | } catch (EntityNotFoundException e) { 72 | System.out.printf("Database '%s' not found. 
\n", db.getName()); 73 | } 74 | return database; 75 | } 76 | 77 | /** 78 | * This method get all the databases from a given Glue Data Catalog 79 | * 80 | * @param glue 81 | * @param sourceGlueCatalogId 82 | * @return 83 | */ 84 | public List getDatabases(AWSGlue glue, String sourceGlueCatalogId) { 85 | List masterDBList = new ArrayList(); 86 | GetDatabasesRequest getDatabasesRequest = new GetDatabasesRequest(); 87 | getDatabasesRequest.setCatalogId(sourceGlueCatalogId); 88 | GetDatabasesResult getDatabasesResult = glue.getDatabases(getDatabasesRequest); 89 | List databaseList = getDatabasesResult.getDatabaseList(); 90 | masterDBList.addAll(databaseList); 91 | String databaseResultNextToken = getDatabasesResult.getNextToken(); 92 | if (Optional.ofNullable(databaseResultNextToken).isPresent()) { 93 | do { 94 | // create a new GetDatabasesRequest using next token. 95 | getDatabasesRequest = new GetDatabasesRequest(); 96 | getDatabasesRequest.setNextToken(databaseResultNextToken); 97 | getDatabasesResult = glue.getDatabases(getDatabasesRequest); 98 | databaseList = getDatabasesResult.getDatabaseList(); 99 | masterDBList.addAll(databaseList); 100 | databaseResultNextToken = getDatabasesResult.getNextToken(); 101 | } while (Optional.ofNullable(databaseResultNextToken).isPresent()); 102 | } 103 | System.out.println("Total number of databases fetched: " + masterDBList.size()); 104 | return masterDBList; 105 | } 106 | 107 | /** 108 | * This method creates a new Database in Glue Data Catalog 109 | * 110 | * @param glue 111 | * @param targetGlueCatalogId 112 | * @param db 113 | * @return 114 | */ 115 | 116 | public DBReplicationStatus createGlueDatabase(AWSGlue glue, String targetGlueCatalogId, String dbName, 117 | String dbDescription) { 118 | DBReplicationStatus dbStatus = new DBReplicationStatus(); 119 | CreateDatabaseRequest createDatabaseRequest = new CreateDatabaseRequest(); 120 | DatabaseInput databaseInput = new DatabaseInput(); 121 | databaseInput.setName(dbName); 122 | databaseInput.setDescription(dbDescription); 123 | createDatabaseRequest.setDatabaseInput(databaseInput); 124 | try { 125 | CreateDatabaseResult result = glue.createDatabase(createDatabaseRequest); 126 | int statusCode = result.getSdkHttpMetadata().getHttpStatusCode(); 127 | if (statusCode == 200) { 128 | System.out.printf("Database created successfully. Database name: '%s'. 
\n", dbName); 129 | dbStatus.setCreated(true); 130 | dbStatus.setError(false); 131 | } else 132 | System.out.println("Database could not be created"); 133 | } catch (Exception e) { 134 | e.printStackTrace(); 135 | dbStatus.setDbName(dbName); 136 | dbStatus.setError(true); 137 | System.out.println("Exception thrown while creating Glue Database"); 138 | } 139 | return dbStatus; 140 | } 141 | 142 | public DBReplicationStatus createGlueDatabase(AWSGlue glue, String targetGlueCatalogId, Database db) { 143 | DBReplicationStatus dbStatus = new DBReplicationStatus(); 144 | CreateDatabaseRequest createDatabaseRequest = new CreateDatabaseRequest(); 145 | DatabaseInput databaseInput = new DatabaseInput(); 146 | databaseInput.setName(db.getName()); 147 | databaseInput.setDescription(db.getDescription()); 148 | databaseInput.setLocationUri(db.getLocationUri()); 149 | databaseInput.setParameters(db.getParameters()); 150 | createDatabaseRequest.setDatabaseInput(databaseInput); 151 | try { 152 | CreateDatabaseResult result = glue.createDatabase(createDatabaseRequest); 153 | int statusCode = result.getSdkHttpMetadata().getHttpStatusCode(); 154 | if (statusCode == 200) { 155 | System.out.printf("Database created successfully. Database name: '%s'. \n", db.getName()); 156 | dbStatus.setCreated(true); 157 | dbStatus.setError(false); 158 | } else 159 | System.out.println("Database could not be created"); 160 | } catch (Exception e) { 161 | e.printStackTrace(); 162 | dbStatus.setDbName(db.getName()); 163 | dbStatus.setError(true); 164 | System.out.printf("Exception in creating Database with name: '%s'. \n", db.getName()); 165 | } 166 | return dbStatus; 167 | } 168 | 169 | /** 170 | * This method creates a TableInput object using Table object 171 | * 172 | * @param table 173 | * @return 174 | */ 175 | public TableInput createTableInput(Table table) { 176 | TableInput tableInput = new TableInput(); 177 | tableInput.setDescription(table.getDescription()); 178 | tableInput.setLastAccessTime(table.getLastAccessTime()); 179 | tableInput.setOwner(table.getOwner()); 180 | tableInput.setName(table.getName()); 181 | if (Optional.ofNullable(table.getStorageDescriptor()).isPresent()) { 182 | tableInput.setStorageDescriptor(table.getStorageDescriptor()); 183 | if (Optional.ofNullable(table.getStorageDescriptor().getParameters()).isPresent()) 184 | tableInput.setParameters(table.getStorageDescriptor().getParameters()); 185 | } 186 | tableInput.setPartitionKeys(table.getPartitionKeys()); 187 | tableInput.setTableType(table.getTableType()); 188 | tableInput.setViewExpandedText(table.getViewExpandedText()); 189 | tableInput.setViewOriginalText(table.getViewOriginalText()); 190 | tableInput.setParameters(table.getParameters()); 191 | return tableInput; 192 | } 193 | 194 | /** 195 | * This method gets all the tables for a given databases from Glue Data Catalog 196 | * 197 | * @param glue 198 | * @param glueCatalogId 199 | * @param databaseName 200 | * @return 201 | */ 202 | public List

<Table> getTables(AWSGlue glue, String glueCatalogId, String databaseName) { 203 | System.out.printf("Start - Fetching table list for Database %s \n", databaseName); 204 | List<Table> masterTableList = new ArrayList<Table>(); 205 | GetTablesRequest getTablesRequest = new GetTablesRequest(); 206 | getTablesRequest.setCatalogId(glueCatalogId); 207 | getTablesRequest.setDatabaseName(databaseName); 208 | GetTablesResult getTablesResult = glue.getTables(getTablesRequest); 209 | List<Table>
tableList = getTablesResult.getTableList(); 210 | masterTableList.addAll(tableList); 211 | String tableResultNextToken = getTablesResult.getNextToken(); 212 | if (Optional.ofNullable(tableResultNextToken).isPresent()) { 213 | do { 214 | // creating a new GetTablesResult using next token. 215 | getTablesRequest = new GetTablesRequest(); 216 | getTablesRequest.setNextToken(tableResultNextToken); 217 | getTablesRequest.setCatalogId(glueCatalogId); 218 | getTablesRequest.setDatabaseName(databaseName); 219 | getTablesResult = glue.getTables(getTablesRequest); 220 | tableList = getTablesResult.getTableList(); 221 | masterTableList.addAll(tableList); 222 | tableResultNextToken = getTablesResult.getNextToken(); 223 | } while (Optional.ofNullable(tableResultNextToken).isPresent()); 224 | } 225 | System.out.printf("Database '%s' has %d tables. \n", databaseName, masterTableList.size()); 226 | System.out.printf("End - Fetching table list for Database %s \n", databaseName); 227 | return masterTableList; 228 | } 229 | 230 | /** 231 | * This method gets a Table using the given name from Glue Data Catalog. If 232 | * there is no table exist with the provided name then it returns null. 233 | * 234 | * @param glue 235 | * @param glueCatalogId 236 | * @param databaseName 237 | * @param tableName 238 | * @return 239 | */ 240 | public Table getTable(AWSGlue glue, String glueCatalogId, String databaseName, String tableName) { 241 | Table table = null; 242 | GetTableRequest getTableRequest = new GetTableRequest(); 243 | getTableRequest.setDatabaseName(databaseName); 244 | getTableRequest.setName(tableName); 245 | getTableRequest.setCatalogId(glueCatalogId); 246 | try { 247 | GetTableResult tableResult = glue.getTable(getTableRequest); 248 | table = tableResult.getTable(); 249 | } catch (EntityNotFoundException e) { 250 | System.out.printf("Table '%s' not found. \n", tableName); 251 | } 252 | return table; 253 | } 254 | 255 | /** 256 | * This method creates or updates a Table in Glue Data Catalog 257 | * 258 | * @param glue 259 | * @param sourceTable 260 | * @param targetGlueCatalogId 261 | * @param skipTableArchive 262 | * @return 263 | */ 264 | public TableReplicationStatus createOrUpdateTable(AWSGlue glue, Table sourceTable, String targetGlueCatalogId, 265 | boolean skipTableArchive) { 266 | 267 | TableReplicationStatus tableStatus = new TableReplicationStatus(); 268 | tableStatus.setTableName(sourceTable.getName()); 269 | tableStatus.setDbName(sourceTable.getDatabaseName()); 270 | tableStatus.setReplicationTime(System.currentTimeMillis()); 271 | 272 | // Check if a table exist already 273 | GetTableRequest targetTableRequest = new GetTableRequest(); 274 | targetTableRequest.setCatalogId(targetGlueCatalogId); 275 | targetTableRequest.setDatabaseName(sourceTable.getDatabaseName()); 276 | targetTableRequest.setName(sourceTable.getName()); 277 | Table targetTable = null; 278 | try { 279 | GetTableResult targetTableResult = glue.getTable(targetTableRequest); 280 | targetTable = targetTableResult.getTable(); 281 | } catch (EntityNotFoundException e) { 282 | System.out.printf("Table '%s' not found. It will be created. \n", sourceTable.getName()); 283 | } catch (Exception e) { 284 | e.printStackTrace(); 285 | System.out.println("Exception in getting getTable"); 286 | } 287 | TableInput tableInput = createTableInput(sourceTable); 288 | 289 | // If table exist - update the table with the schema in the input message. 290 | if (Optional.ofNullable(targetTable).isPresent()) { 291 | System.out.println("Table exist. 
It will be updated"); 292 | UpdateTableRequest updateTableRequest = new UpdateTableRequest(); 293 | updateTableRequest.setTableInput(tableInput); 294 | updateTableRequest.setSkipArchive(skipTableArchive); 295 | updateTableRequest.setDatabaseName(sourceTable.getDatabaseName()); 296 | 297 | try { 298 | UpdateTableResult updateTableResult = glue.updateTable(updateTableRequest); 299 | int statusCode = updateTableResult.getSdkHttpMetadata().getHttpStatusCode(); 300 | if (statusCode == 200) { 301 | tableStatus.setUpdated(true); 302 | tableStatus.setReplicated(true); 303 | tableStatus.setError(false); 304 | System.out.printf("Table '%s' updated successfully. \n", sourceTable.getName()); 305 | } 306 | } catch (EntityNotFoundException e) { 307 | e.printStackTrace(); 308 | System.out.printf("Exception thrown while updating table '%s'. Reason: '%s' do not exist already. \n", 309 | sourceTable.getName(), sourceTable.getDatabaseName()); 310 | tableStatus.setReplicated(false); 311 | tableStatus.setDbNotFoundError(true); 312 | tableStatus.setError(true); 313 | } catch (Exception e) { 314 | e.printStackTrace(); 315 | System.out.printf("Exception thrown while updating table '%s'. \n", sourceTable.getName()); 316 | tableStatus.setReplicated(false); 317 | tableStatus.setError(true); 318 | } 319 | } 320 | // If the table do not exist - create a new table with the schema in the input 321 | // message. 322 | else { 323 | CreateTableRequest createTableRequest = new CreateTableRequest(); 324 | createTableRequest.setCatalogId(targetGlueCatalogId); 325 | createTableRequest.setDatabaseName(sourceTable.getDatabaseName()); 326 | createTableRequest.setTableInput(tableInput); 327 | try { 328 | CreateTableResult createTableResult = glue.createTable(createTableRequest); 329 | int statusCode = createTableResult.getSdkHttpMetadata().getHttpStatusCode(); 330 | if (statusCode == 200) { 331 | tableStatus.setCreated(true); 332 | tableStatus.setReplicated(true); 333 | tableStatus.setError(false); 334 | System.out.printf("Table '%s' created successfully. \n", sourceTable.getName()); 335 | } 336 | } catch (EntityNotFoundException e) { 337 | e.printStackTrace(); 338 | System.out.printf("Exception thrown while creating table '%s'. Reason: '%s' do not exist already. \n.", 339 | sourceTable.getName(), sourceTable.getDatabaseName()); 340 | tableStatus.setReplicated(false); 341 | tableStatus.setDbNotFoundError(true); 342 | } catch (Exception e) { 343 | e.printStackTrace(); 344 | System.out.printf("Exception thrown while creating table '%s' \n.", sourceTable.getName()); 345 | tableStatus.setReplicated(false); 346 | tableStatus.setError(true); 347 | } 348 | } 349 | return tableStatus; 350 | } 351 | 352 | /** 353 | * This method gets a list of partitions for a given table. 
354 | * 355 | * @param glue 356 | * @param catalogId 357 | * @param databaseName 358 | * @param tableName 359 | * @return 360 | */ 361 | public List getPartitions(AWSGlue glue, String catalogId, String databaseName, String tableName) { 362 | List masterPartitionList = new ArrayList(); 363 | GetPartitionsRequest getPartitionsRequest = new GetPartitionsRequest(); 364 | getPartitionsRequest.setDatabaseName(databaseName); 365 | getPartitionsRequest.setCatalogId(catalogId); 366 | getPartitionsRequest.setTableName(tableName); 367 | GetPartitionsResult getPartitionResult = glue.getPartitions(getPartitionsRequest); 368 | List partitionList = getPartitionResult.getPartitions(); 369 | masterPartitionList.addAll(partitionList); 370 | String partitionResultNextToken = getPartitionResult.getNextToken(); 371 | if (Optional.ofNullable(partitionResultNextToken).isPresent()) { 372 | do { 373 | // create a new GetPartitionsRequest using next token. 374 | getPartitionsRequest = new GetPartitionsRequest(); 375 | getPartitionsRequest.setDatabaseName(databaseName); 376 | getPartitionsRequest.setCatalogId(catalogId); 377 | getPartitionsRequest.setTableName(tableName); 378 | getPartitionsRequest.setNextToken(partitionResultNextToken); 379 | getPartitionResult = glue.getPartitions(getPartitionsRequest); 380 | partitionList = getPartitionResult.getPartitions(); 381 | masterPartitionList.addAll(partitionList); 382 | partitionResultNextToken = getPartitionResult.getNextToken(); 383 | } while (Optional.ofNullable(partitionResultNextToken).isPresent()); 384 | } 385 | return masterPartitionList; 386 | } 387 | 388 | /** 389 | * Add partitions in batch mode 390 | * @param glue 391 | * @param partitionsToAdd 392 | * @param catalogId 393 | * @param databaseName 394 | * @param tableName 395 | * @return 396 | */ 397 | public boolean addPartitions(AWSGlue glue, List partitionsToAdd, String catalogId, String databaseName, 398 | String tableName) { 399 | AtomicInteger numPartitionsAdded = new AtomicInteger(); 400 | boolean partitionsAdded = false; 401 | BatchCreatePartitionRequest batchCreatePartitionRequest = new BatchCreatePartitionRequest(); 402 | batchCreatePartitionRequest.setCatalogId(catalogId); 403 | batchCreatePartitionRequest.setDatabaseName(databaseName); 404 | batchCreatePartitionRequest.setTableName(tableName); 405 | 406 | List partitionInputList = new ArrayList(); 407 | for (Partition p : partitionsToAdd) { 408 | PartitionInput pi = new PartitionInput(); 409 | StorageDescriptor storageDescriptor = p.getStorageDescriptor(); 410 | pi.setStorageDescriptor(storageDescriptor); 411 | pi.setValues(p.getValues()); 412 | partitionInputList.add(pi); 413 | } 414 | System.out.println("Partition Input List Size: " + partitionInputList.size()); 415 | if(partitionInputList.size() > 100) 416 | System.out.println("The input has more than 100 partitions, it will be sliced into smaller lists with 100 partitions each."); 417 | 418 | List> listofSmallerLists = Lists.partition(partitionInputList, 100); 419 | for(List partInputList : listofSmallerLists) { 420 | batchCreatePartitionRequest.setPartitionInputList(partInputList); 421 | try { 422 | BatchCreatePartitionResult result = glue.batchCreatePartition(batchCreatePartitionRequest); 423 | int statusCode = result.getSdkHttpMetadata().getHttpStatusCode(); 424 | List partErrors = result.getErrors(); 425 | if (statusCode == 200 && partErrors.size() == 0) { 426 | System.out.printf("%d partitions were added to table '%s' of database '%s'. 
\n", partInputList.size(), 427 | tableName, databaseName); 428 | partitionsAdded = true; 429 | numPartitionsAdded.getAndAdd(partInputList.size()); 430 | System.out.printf("%d of %d partitions added so far. \n", numPartitionsAdded.get(), partitionInputList.size()); 431 | } else { 432 | System.out.printf("Not all partitions were added. Status Code: %d, Number of partition errors: %d \n", 433 | statusCode, partErrors.size()); 434 | for (PartitionError pe : partErrors) { 435 | System.out.println("Partition Error Message: " + pe.getErrorDetail().getErrorMessage()); 436 | List pv = pe.getPartitionValues(); 437 | for (String v : pv) { 438 | System.out.println("Partition error value: " + v); 439 | } 440 | } 441 | } 442 | } catch(Exception e) { 443 | e.printStackTrace(); 444 | System.out.printf("Exception in adding partitions. \n"); 445 | System.out.printf("%d of %d partitions added so far. \n", numPartitionsAdded.get(), partitionInputList.size()); 446 | // TODO - what to do when there are exceptions here? 447 | } 448 | } 449 | System.out.println("Total partitions added: " + numPartitionsAdded.get()); 450 | return partitionsAdded; 451 | } 452 | 453 | /** 454 | * Delete a single partition 455 | * @param glue 456 | * @param catalogId 457 | * @param databaseName 458 | * @param tableName 459 | * @param partition 460 | * @return 461 | */ 462 | public boolean deletePartition(AWSGlue glue, String catalogId, String databaseName, String tableName, 463 | Partition partition) { 464 | boolean partitionDeleted = false; 465 | DeletePartitionRequest deletePartitionRequest = new DeletePartitionRequest(); 466 | deletePartitionRequest.setCatalogId(catalogId); 467 | deletePartitionRequest.setDatabaseName(databaseName); 468 | deletePartitionRequest.setTableName(tableName); 469 | deletePartitionRequest.setPartitionValues(partition.getValues()); 470 | 471 | DeletePartitionResult result = glue.deletePartition(deletePartitionRequest); 472 | int statusCode = result.getSdkHttpMetadata().getHttpStatusCode(); 473 | if (statusCode == 200) { 474 | System.out.printf("Partition deleted from table '%s' of database '%s' \n", tableName, databaseName); 475 | partitionDeleted = true; 476 | } 477 | return partitionDeleted; 478 | } 479 | 480 | /** 481 | * Delete partitions using Batch mode 482 | * @param glue 483 | * @param catalogId 484 | * @param databaseName 485 | * @param tableName 486 | * @param partitionsToDelete 487 | * @return 488 | */ 489 | public boolean deletePartitions(AWSGlue glue, String catalogId, String databaseName, String tableName, 490 | List partitionsToDelete) { 491 | 492 | boolean partitionsDeleted = false; 493 | 494 | BatchDeletePartitionRequest batchDeletePartitionRequest = new BatchDeletePartitionRequest(); 495 | batchDeletePartitionRequest.setCatalogId(catalogId); 496 | batchDeletePartitionRequest.setDatabaseName(databaseName); 497 | batchDeletePartitionRequest.setTableName(tableName); 498 | 499 | // Prepare a List of PartitionValueList 500 | List listOfPartitionValueList = new ArrayList(); 501 | 502 | // For each Partition, get its values, add create a PartitionValueList, and add 503 | // them to List of PartitionValueList 504 | for (Partition p : partitionsToDelete) { 505 | PartitionValueList pvList = new PartitionValueList(); 506 | pvList.setValues(p.getValues()); 507 | listOfPartitionValueList.add(pvList); 508 | } 509 | 510 | System.out.println("Size of List of PartitionValueList: " + listOfPartitionValueList.size()); 511 | List> listofSmallerLists = Lists.partition(listOfPartitionValueList, 25); 512 | for 
(List smallerList : listofSmallerLists) { 513 | // Add List of PartitionValueList to BatchDeletePartitionRequest 514 | batchDeletePartitionRequest.setPartitionsToDelete(smallerList); 515 | try { 516 | BatchDeletePartitionResult batchDeletePartitionResult = glue 517 | .batchDeletePartition(batchDeletePartitionRequest); 518 | int statusCode = batchDeletePartitionResult.getSdkHttpMetadata().getHttpStatusCode(); 519 | List partErrors = batchDeletePartitionResult.getErrors(); 520 | if (statusCode == 200 && partErrors.size() == 0) { 521 | System.out.printf("%d partitions from table '%s' of database '%s' were deleted. \n", 522 | smallerList.size(), tableName, databaseName); 523 | partitionsDeleted = true; 524 | } else { 525 | System.out.printf( 526 | "Not all partitions were deleted. Status Code: %d, Number of partition errors: %d \n", 527 | statusCode, partErrors.size()); 528 | 529 | for (PartitionError pe : partErrors) { 530 | System.out.println("Partition Error Message: " + pe.getErrorDetail().getErrorMessage()); 531 | List pv = pe.getPartitionValues(); 532 | for (String v : pv) { 533 | System.out.println("Partition value: " + v); 534 | } 535 | } 536 | } 537 | } catch (Exception e) { 538 | System.out.println("Exception in deleting partitions."); 539 | e.printStackTrace(); 540 | } 541 | } 542 | return partitionsDeleted; 543 | } 544 | } 545 | -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/util/LargeTable.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.util; 5 | 6 | import com.amazonaws.services.glue.model.Table; 7 | 8 | /** 9 | * This is a POJO class for Glue Database Table 10 | * 11 | * @author Ravi Itha, Amazon Web Services, Inc. 12 | * 13 | */ 14 | public class LargeTable { 15 | 16 | private String catalogId; 17 | private boolean largeTable; 18 | private int numberOfPartitions; 19 | private Table table; 20 | private String s3ObjectKey; 21 | private String s3BucketName; 22 | 23 | public Table getTable() { 24 | return table; 25 | } 26 | 27 | public void setTable(Table table) { 28 | this.table = table; 29 | } 30 | 31 | public String getS3BucketName() { 32 | return s3BucketName; 33 | } 34 | 35 | public void setS3BucketName(String s3BucketName) { 36 | this.s3BucketName = s3BucketName; 37 | } 38 | 39 | public String getS3ObjectKey() { 40 | return s3ObjectKey; 41 | } 42 | 43 | public void setS3ObjectKey(String s3ObjectKey) { 44 | this.s3ObjectKey = s3ObjectKey; 45 | } 46 | 47 | public String getCatalogId() { 48 | return catalogId; 49 | } 50 | 51 | public void setCatalogId(String catalogId) { 52 | this.catalogId = catalogId; 53 | } 54 | 55 | public boolean isLargeTable() { 56 | return largeTable; 57 | } 58 | 59 | public void setLargeTable(boolean largeTable) { 60 | this.largeTable = largeTable; 61 | } 62 | 63 | public int getNumberOfPartitions() { 64 | return numberOfPartitions; 65 | } 66 | 67 | public void setNumberOfPartitions(int numberOfPartitions) { 68 | this.numberOfPartitions = numberOfPartitions; 69 | } 70 | } 71 | -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/util/S3Util.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.util; 5 | 6 | import java.io.BufferedReader; 7 | import java.io.ByteArrayInputStream; 8 | import java.io.File; 9 | import java.io.IOException; 10 | import java.io.InputStream; 11 | import java.io.InputStreamReader; 12 | import java.nio.charset.StandardCharsets; 13 | import java.util.ArrayList; 14 | import java.util.List; 15 | 16 | import com.amazonaws.AmazonServiceException; 17 | import com.amazonaws.SdkClientException; 18 | import com.amazonaws.auth.profile.ProfileCredentialsProvider; 19 | import com.amazonaws.services.glue.model.Partition; 20 | import com.amazonaws.services.s3.AmazonS3; 21 | import com.amazonaws.services.s3.AmazonS3ClientBuilder; 22 | import com.amazonaws.services.s3.model.GetObjectRequest; 23 | import com.amazonaws.services.s3.model.ObjectMetadata; 24 | import com.amazonaws.services.s3.model.PutObjectRequest; 25 | import com.amazonaws.services.s3.model.ResponseHeaderOverrides; 26 | import com.amazonaws.services.s3.model.S3Object; 27 | import com.google.gson.Gson; 28 | import com.google.gson.JsonSyntaxException; 29 | 30 | public class S3Util { 31 | 32 | 33 | /** 34 | * 35 | * Create an object in S3 with list of partitions. 36 | * @param region 37 | * @param bucket 38 | * @param objectKey 39 | * @param content 40 | * @return 41 | * 42 | * Wrote this method based on inputs from Zoran Ivanovic of AWS 43 | */ 44 | public boolean createS3Object(String region, String bucket, String objectKey, String content) { 45 | boolean objectCreated = false; 46 | AmazonS3 s3 = AmazonS3ClientBuilder.standard().withRegion(region).build(); 47 | 48 | byte[] contentBytes = content.getBytes(StandardCharsets.UTF_8); 49 | InputStream inputStream = new ByteArrayInputStream(contentBytes); 50 | 51 | ObjectMetadata metadata = new ObjectMetadata(); 52 | metadata.setContentLength(contentBytes.length); 53 | PutObjectRequest putObjectRequest = new PutObjectRequest(bucket, objectKey, inputStream, metadata); 54 | // send request to S3 to create an object with the content 55 | try { 56 | s3.putObject(putObjectRequest); 57 | objectCreated = true; 58 | System.out.println("Partition Object uploaded to S3. Object key: " + objectKey); 59 | } catch (AmazonServiceException e) { 60 | System.err.println(e.getErrorMessage()); 61 | } catch (Exception e) { 62 | e.printStackTrace(); 63 | } 64 | try { 65 | inputStream.close(); 66 | } catch(Exception e) { 67 | e.printStackTrace(); 68 | System.out.println("Exception thrown while closing InputStream."); 69 | } 70 | return objectCreated; 71 | } 72 | 73 | /** 74 | * Upload a file as an object to S3. 75 | * @param region 76 | * @param bucketName 77 | * @param objKeyName 78 | * @param localFilePath 79 | * @return 80 | * @throws IOException 81 | */ 82 | public boolean uploadObject(String region, String bucketName, String objKeyName, String localFilePath) 83 | throws IOException { 84 | 85 | System.out.println("Uploading file to S3."); 86 | boolean objectUploaded = false; 87 | AmazonS3 s3Client = AmazonS3ClientBuilder.standard().withRegion(region).build(); 88 | 89 | try { 90 | // Upload a text string as a new object. 91 | s3Client.putObject(bucketName, objKeyName, "Uploaded String Object"); 92 | // Upload a file as a new object with ContentType and title specified. 
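// Note: the putObject call above already wrote a placeholder string to the
// same key; the file upload below overwrites that object with the real file
// content and its metadata.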
93 | PutObjectRequest request = new PutObjectRequest(bucketName, objKeyName, new File(localFilePath)); 94 | ObjectMetadata metadata = new ObjectMetadata(); 95 | metadata.setContentType("plain/text"); 96 | metadata.addUserMetadata("x-amz-meta-title", "PartitionFile"); 97 | request.setMetadata(metadata); 98 | s3Client.putObject(request); 99 | objectUploaded = true; 100 | } catch (AmazonServiceException e) { 101 | // The call was transmitted successfully, but Amazon S3 couldn't process 102 | // it, so it returned an error response. 103 | e.printStackTrace(); 104 | } catch (SdkClientException e) { 105 | // Amazon S3 couldn't be contacted for a response, or the client 106 | // couldn't parse the response from Amazon S3. 107 | e.printStackTrace(); 108 | } 109 | 110 | return objectUploaded; 111 | } 112 | 113 | public boolean createObject(String region, String bucketName, String tableDDL, String stringObjKeyName) 114 | throws IOException { 115 | 116 | boolean objectCreated = false; 117 | 118 | try { 119 | AmazonS3 s3Client = AmazonS3ClientBuilder.standard().withRegion(region).build(); 120 | 121 | // Upload a text string as a new object. 122 | s3Client.putObject(bucketName, stringObjKeyName, tableDDL); 123 | objectCreated = true; 124 | 125 | } catch (AmazonServiceException e) { 126 | // The call was transmitted successfully, but Amazon S3 couldn't process 127 | // it, so it returned an error response. 128 | e.printStackTrace(); 129 | } catch (SdkClientException e) { 130 | // Amazon S3 couldn't be contacted for a response, or the client 131 | // couldn't parse the response from Amazon S3. 132 | e.printStackTrace(); 133 | } 134 | return objectCreated; 135 | } 136 | 137 | public void getObject(String region, String bucketName, String key) throws IOException { 138 | 139 | S3Object fullObject = null, objectPortion = null, headerOverrideObject = null; 140 | try { 141 | AmazonS3 s3Client = AmazonS3ClientBuilder.standard().withRegion(region) 142 | .withCredentials(new ProfileCredentialsProvider()).build(); 143 | 144 | // Get an object and print its contents. 145 | System.out.println("Downloading an object"); 146 | fullObject = s3Client.getObject(new GetObjectRequest(bucketName, key)); 147 | System.out.println("Content-Type: " + fullObject.getObjectMetadata().getContentType()); 148 | System.out.println("Content: "); 149 | displayTextInputStream(fullObject.getObjectContent()); 150 | 151 | // Get a range of bytes from an object and print the bytes. 152 | GetObjectRequest rangeObjectRequest = new GetObjectRequest(bucketName, key).withRange(0, 9); 153 | objectPortion = s3Client.getObject(rangeObjectRequest); 154 | System.out.println("Printing bytes retrieved."); 155 | 156 | displayTextInputStream(objectPortion.getObjectContent()); 157 | 158 | // Get an entire object, overriding the specified response headers, and print 159 | // the object's content. 160 | 161 | ResponseHeaderOverrides headerOverrides = new ResponseHeaderOverrides().withCacheControl("No-cache") 162 | .withContentDisposition("attachment; filename=example.txt"); 163 | GetObjectRequest getObjectRequestHeaderOverride = new GetObjectRequest(bucketName, key) 164 | .withResponseHeaders(headerOverrides); 165 | headerOverrideObject = s3Client.getObject(getObjectRequestHeaderOverride); 166 | displayTextInputStream(headerOverrideObject.getObjectContent()); 167 | } catch (AmazonServiceException e) { 168 | // The call was transmitted successfully, but Amazon S3 couldn't process 169 | // it, so it returned an error response. 
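// An AmazonServiceException also exposes the HTTP status code and the AWS
// request ID (e.getStatusCode(), e.getRequestId()), which can be logged here
// when deeper diagnostics are needed.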
170 | e.printStackTrace(); 171 | } catch (SdkClientException e) { 172 | // Amazon S3 couldn't be contacted for a response, or the client 173 | // couldn't parse the response from Amazon S3. 174 | e.printStackTrace(); 175 | } finally { 176 | // To ensure that the network connection doesn't remain open, close any open 177 | // input streams. 178 | if (fullObject != null) { 179 | fullObject.close(); 180 | } 181 | if (objectPortion != null) { 182 | objectPortion.close(); 183 | } 184 | if (headerOverrideObject != null) { 185 | headerOverrideObject.close(); 186 | } 187 | } 188 | 189 | } 190 | 191 | public static void displayTextInputStream(InputStream input) throws IOException { 192 | // Read the text input stream one line at a time and display each line. 193 | BufferedReader reader = new BufferedReader(new InputStreamReader(input)); 194 | String line = null; 195 | while ((line = reader.readLine()) != null) { 196 | System.out.println(line); 197 | } 198 | System.out.println(); 199 | } 200 | 201 | public List getPartitionsFromS3(String region, String bucket, String key) { 202 | 203 | String contentType = ""; 204 | Gson gson = new Gson(); 205 | S3Object fullObject = null; 206 | AmazonS3 s3 = AmazonS3ClientBuilder.standard().withRegion(region).build(); 207 | System.out.printf("Bucket Name: %s, Object Key: %s \n", bucket, key); 208 | 209 | try { 210 | fullObject = s3.getObject(new GetObjectRequest(bucket, key)); 211 | } catch (Exception e) { 212 | e.printStackTrace(); 213 | System.out.println("Exception thrown while reading object from S3"); 214 | } 215 | 216 | InputStream input = fullObject.getObjectContent(); 217 | contentType = fullObject.getObjectMetadata().getContentType(); 218 | System.out.println("CONTENT TYPE: " + contentType); 219 | 220 | // Read the text input stream one line at a time and display each line. 221 | List partitionList = new ArrayList(); 222 | 223 | BufferedReader reader = new BufferedReader(new InputStreamReader(input)); 224 | String line = null; 225 | try { 226 | while ((line = reader.readLine()) != null) { 227 | Partition partition = gson.fromJson(line, Partition.class); 228 | partitionList.add(partition); 229 | } 230 | } catch (JsonSyntaxException | IOException e) { 231 | System.out.println("Exception occured while reading partition information from S3 object."); 232 | e.printStackTrace(); 233 | } 234 | System.out.println("Number of partitions read from S3: " + partitionList.size()); 235 | return partitionList; 236 | } 237 | 238 | } 239 | -------------------------------------------------------------------------------- /src/main/java/com/amazonaws/gdcreplication/util/SNSUtil.java: -------------------------------------------------------------------------------- 1 | // Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package com.amazonaws.gdcreplication.util; 5 | 6 | import java.util.HashMap; 7 | import java.util.List; 8 | import java.util.Map; 9 | import java.util.concurrent.atomic.AtomicInteger; 10 | 11 | import com.amazonaws.services.glue.model.Database; 12 | import com.amazonaws.services.glue.model.Table; 13 | import com.amazonaws.services.sns.AmazonSNS; 14 | import com.amazonaws.services.sns.model.MessageAttributeValue; 15 | import com.amazonaws.services.sns.model.PublishRequest; 16 | import com.amazonaws.services.sns.model.PublishResult; 17 | import com.google.gson.Gson; 18 | 19 | /** 20 | * This is class has utility methods to work with Amazon SNS 21 | * @author Ravi Itha, Amazon Web Services, Inc. 
22 | * 23 | */ 24 | public class SNSUtil { 25 | 26 | 27 | public PublishResult publishLargeTableSchemaToSNS(AmazonSNS sns, String topicArn, String region, String bucketName, String message, 28 | String sourceGlueCatalogId, String exportBatchId, String messageType) { 29 | 30 | PublishResult publishResponse = null; 31 | 32 | PublishRequest publishRequest = new PublishRequest(topicArn, message); 33 | 34 | Map messageAttributes = new HashMap(); 35 | messageAttributes.put("source_catalog_id", createStringAttribute(sourceGlueCatalogId)); 36 | messageAttributes.put("message_type", createStringAttribute(messageType)); 37 | messageAttributes.put("export_batch_id", createStringAttribute(exportBatchId)); 38 | messageAttributes.put("bucket_name", createStringAttribute(bucketName)); 39 | messageAttributes.put("region_name", createStringAttribute(region)); 40 | publishRequest.setMessageAttributes(messageAttributes); 41 | try { 42 | publishResponse = sns.publish(publishRequest); 43 | } catch (Exception e) { 44 | e.printStackTrace(); 45 | System.out.println("Large Table message could not be published to SNS Topic. Topic ARN: " + topicArn); 46 | System.out.println("Message to be published: " + message); 47 | } 48 | return publishResponse; 49 | } 50 | 51 | /** 52 | * This method publishes one Table Schema (DDL) to SNS Topic 53 | * 54 | * @param sns 55 | * @param topicArn 56 | * @param databaseDDL 57 | * @param sourceGlueCatalogId 58 | * @return 59 | */ 60 | public PublishResult publishDatabaseSchemaToSNS(AmazonSNS sns, String topicArn, String databaseDDL, 61 | String sourceGlueCatalogId, String exportBatchId) { 62 | PublishResult publishResponse = null; 63 | PublishRequest publishRequest = new PublishRequest(topicArn, databaseDDL); 64 | Map messageAttributes = new HashMap(); 65 | messageAttributes.put("source_catalog_id", createStringAttribute(sourceGlueCatalogId)); 66 | messageAttributes.put("message_type", createStringAttribute("database")); 67 | messageAttributes.put("export_batch_id", createStringAttribute(exportBatchId)); 68 | publishRequest.setMessageAttributes(messageAttributes); 69 | try { 70 | publishResponse = sns.publish(publishRequest); 71 | } catch (Exception e) { 72 | System.out.println("Database schema could not be published to SNS Topic."); 73 | } 74 | return publishResponse; 75 | } 76 | 77 | /** 78 | * This method publishes all Database Schemas (DDL) to SNS Topic and tracks the 79 | * status in a DynamoDB table. 
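* Each published message carries source_catalog_id, message_type, and
* export_batch_id as SNS message attributes so that subscribers can filter
* messages and correlate them with a single export run. A minimal usage
* sketch, where dbList is a list of Glue Database objects fetched from the
* source catalog (the topic ARN, DynamoDB table name, and account ID are
* illustrative):
* <pre>
* SNSUtil snsUtil = new SNSUtil();
* int exported = snsUtil.publishDatabaseSchemasToSNS(
*     AmazonSNSClientBuilder.defaultClient(), dbList,
*     "arn:aws:sns:us-east-1:111111111111:GlueExportTopic",
*     new DDBUtil(), "ddb_export_database_status", "111111111111");
* </pre>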
80 | * 81 | * @param sns 82 | * @param dBList 83 | * @param snsTopicArn 84 | * @param ddbUtil 85 | * @param ddbTblName 86 | * @param sourceGlueCatalogId 87 | * @return 88 | */ 89 | public int publishDatabaseSchemasToSNS(AmazonSNS sns, List masterDBList, String snsTopicArn, 90 | DDBUtil ddbUtil, String ddbTblName, String sourceGlueCatalogId) { 91 | long exportRunId = System.currentTimeMillis(); 92 | String exportBatchId = Long.toString(exportRunId); 93 | AtomicInteger numberOfDatabasesExported = new AtomicInteger(); 94 | // Create Message Attributes 95 | MessageAttributeValue sourceCatalogIdMA = createStringAttribute(sourceGlueCatalogId); 96 | MessageAttributeValue msgTypeMA = createStringAttribute("database"); 97 | MessageAttributeValue exportBatchIdMA = createStringAttribute(exportBatchId); 98 | // Convert databases to JSON Messages and publish them to SNS Topic 99 | for (Database db : masterDBList) { 100 | // Convert Glue Database to JSON String 101 | Gson gson = new Gson(); 102 | String databaseDDL = gson.toJson(db); 103 | // Publish JSON String to Amazon SNS topic 104 | PublishRequest publishRequest = new PublishRequest(snsTopicArn, databaseDDL); 105 | Map messageAttributes = new HashMap(); 106 | messageAttributes.put("source_catalog_id", sourceCatalogIdMA); 107 | messageAttributes.put("message_type", msgTypeMA); 108 | messageAttributes.put("export_batch_id", exportBatchIdMA); 109 | publishRequest.setMessageAttributes(messageAttributes); 110 | try { 111 | PublishResult publishResponse = sns.publish(publishRequest); 112 | numberOfDatabasesExported.getAndIncrement(); 113 | System.out.printf("Schema for Database '%s' published to SNS Topic. Message_Id: %s. \n", 114 | db.getName(), publishResponse.getMessageId()); 115 | ddbUtil.trackDatabaseExportStatus(ddbTblName, db.getName(), databaseDDL, publishResponse.getMessageId(), 116 | sourceGlueCatalogId, exportRunId, exportBatchId, true); 117 | } catch (Exception e) { 118 | e.printStackTrace(); 119 | System.out.printf( 120 | "Schema for Database '%s' could not be published to SNS Topic. It will be audited in DynamoDB table. \n", 121 | db.getName()); 122 | ddbUtil.trackDatabaseExportStatus(ddbTblName, db.getName(), databaseDDL, "", sourceGlueCatalogId, 123 | exportRunId, exportBatchId, false); 124 | } 125 | } 126 | System.out.println("Number of databases exported to SNS: " + numberOfDatabasesExported.get()); 127 | return numberOfDatabasesExported.get(); 128 | } 129 | 130 | /** 131 | * This method publishes Table Schema (DDL) to SNS Topic 132 | * 133 | * @param sns 134 | * @param topicArn 135 | * @param tableDDL 136 | * @param sourceGlueCatalogId 137 | * @return 138 | */ 139 | public PublishResult publishTableSchemaToSNS(AmazonSNS sns, String topicArn, Table table, String tableDDL, 140 | String sourceGlueCatalogId, String exportBatchId) { 141 | PublishResult publishResponse = null; 142 | PublishRequest publishRequest = new PublishRequest(topicArn, tableDDL); 143 | Map messageAttributes = new HashMap(); 144 | messageAttributes.put("source_catalog_id", createStringAttribute(sourceGlueCatalogId)); 145 | messageAttributes.put("message_type", createStringAttribute("table")); 146 | messageAttributes.put("export_batch_id", createStringAttribute(exportBatchId)); 147 | publishRequest.setMessageAttributes(messageAttributes); 148 | try { 149 | publishResponse = sns.publish(publishRequest); 150 | System.out.printf("Table schema for Table '%s' of database '%s' published to SNS Topic. Message_Id: %s. 
--------------------------------------------------------------------------------
/src/main/java/com/amazonaws/gdcreplication/util/SQSUtil.java:
--------------------------------------------------------------------------------
// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT-0

package com.amazonaws.gdcreplication.util;

import java.util.HashMap;
import java.util.Map;

import com.amazonaws.services.sqs.AmazonSQS;
import com.amazonaws.services.sqs.model.MessageAttributeValue;
import com.amazonaws.services.sqs.model.SendMessageRequest;
import com.amazonaws.services.sqs.model.SendMessageResult;
import com.google.gson.Gson;

public class SQSUtil {

    public boolean sendTableSchemaToSQSQueue(AmazonSQS sqs, String queueUrl, LargeTable largeTable,
            String exportBatchId, String sourceGlueCatalogId) {

        Gson gson = new Gson();
        String tableInfo = gson.toJson(largeTable);
        System.out.println(tableInfo);

        int statusCode = 400;
        boolean messageSentToSQS = false;
        // SQS message attributes; "String.<label>" is the SQS convention for custom data type labels
        Map<String, MessageAttributeValue> messageAttributes = new HashMap<>();
        messageAttributes.put("ExportBatchId",
                new MessageAttributeValue().withDataType("String.ExportBatchId").withStringValue(exportBatchId));
        messageAttributes.put("SourceGlueDataCatalogId", new MessageAttributeValue()
                .withDataType("String.SourceGlueDataCatalogId").withStringValue(sourceGlueCatalogId));
        messageAttributes.put("SchemaType",
                new MessageAttributeValue().withDataType("String.SchemaType").withStringValue("largeTable"));

        SendMessageRequest req = new SendMessageRequest().withQueueUrl(queueUrl)
                .withMessageBody(tableInfo).withMessageAttributes(messageAttributes);

        try {
            SendMessageResult sendMsgRes = sqs.sendMessage(req);
            statusCode = sendMsgRes.getSdkHttpMetadata().getHttpStatusCode();
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Exception thrown while writing message to SQS. " + e.getLocalizedMessage());
        }
        if (statusCode == 200) {
            messageSentToSQS = true;
            System.out.printf("Table details for table '%s' of database '%s' sent to SQS. \n",
                    largeTable.getTable().getName(), largeTable.getTable().getDatabaseName());
        }
        return messageSentToSQS;
    }
    public void sendLargeTableSchemaToSQS(AmazonSQS sqs, String queueUrl,
            String exportBatchId, String sourceGlueCatalogId, String message, LargeTable largeTable) {

        int statusCode = 400;
        Map<String, MessageAttributeValue> messageAttributes = new HashMap<>();
        messageAttributes.put("ExportBatchId",
                new MessageAttributeValue().withDataType("String.ExportBatchId").withStringValue(exportBatchId));
        messageAttributes.put("SourceGlueDataCatalogId", new MessageAttributeValue()
                .withDataType("String.SourceGlueDataCatalogId").withStringValue(sourceGlueCatalogId));
        messageAttributes.put("SchemaType",
                new MessageAttributeValue().withDataType("String.SchemaType").withStringValue("largeTable"));

        SendMessageRequest req = new SendMessageRequest().withQueueUrl(queueUrl)
                .withMessageBody(message).withMessageAttributes(messageAttributes);

        try {
            SendMessageResult sendMsgRes = sqs.sendMessage(req);
            statusCode = sendMsgRes.getSdkHttpMetadata().getHttpStatusCode();
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Exception thrown while writing message to SQS. " + e.getLocalizedMessage());
        }
        if (statusCode == 200)
            System.out.printf("Large Table schema for table '%s' of database '%s' sent to SQS. \n",
                    largeTable.getTable().getName(), largeTable.getTable().getDatabaseName());
    }

    // Parks a table schema whose replication failed so it can be reprocessed later
    public void sendTableSchemaToDeadLetterQueue(AmazonSQS sqs, String queueUrl, TableReplicationStatus tableStatus,
            String exportBatchId, String sourceGlueCatalogId) {

        int statusCode = 400;
        Map<String, MessageAttributeValue> messageAttributes = new HashMap<>();
        messageAttributes.put("ExportBatchId",
                new MessageAttributeValue().withDataType("String.ExportBatchId").withStringValue(exportBatchId));
        messageAttributes.put("SourceGlueDataCatalogId", new MessageAttributeValue()
                .withDataType("String.SourceGlueDataCatalogId").withStringValue(sourceGlueCatalogId));
        messageAttributes.put("SchemaType",
                new MessageAttributeValue().withDataType("String.SchemaType").withStringValue("Table"));

        SendMessageRequest req = new SendMessageRequest().withQueueUrl(queueUrl)
                .withMessageBody(tableStatus.getTableSchema()).withMessageAttributes(messageAttributes);

        try {
            SendMessageResult sendMsgRes = sqs.sendMessage(req);
            statusCode = sendMsgRes.getSdkHttpMetadata().getHttpStatusCode();
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Exception thrown while writing message to SQS. " + e.getLocalizedMessage());
        }
        if (statusCode == 200)
            System.out.printf("Table schema for table '%s' of database '%s' sent to SQS. \n",
                    tableStatus.getTableName(), tableStatus.getDbName());
    }

    // Parks a database schema whose replication failed so it can be reprocessed later
    public void sendDatabaseSchemaToDeadLetterQueue(AmazonSQS sqs, String queueUrl, String databaseDDL,
            String databaseName, String exportBatchId, String sourceGlueCatalogId) {

        int statusCode = 400;
        Map<String, MessageAttributeValue> messageAttributes = new HashMap<>();
        messageAttributes.put("ExportBatchId",
                new MessageAttributeValue().withDataType("String.ExportBatchId").withStringValue(exportBatchId));
        messageAttributes.put("SourceGlueDataCatalogId", new MessageAttributeValue()
                .withDataType("String.SourceGlueDataCatalogId").withStringValue(sourceGlueCatalogId));
        messageAttributes.put("SchemaType",
                new MessageAttributeValue().withDataType("String.SchemaType").withStringValue("Database"));

        SendMessageRequest req = new SendMessageRequest().withQueueUrl(queueUrl).withMessageBody(databaseDDL)
                .withMessageAttributes(messageAttributes);

        try {
            SendMessageResult sendMsgRes = sqs.sendMessage(req);
            statusCode = sendMsgRes.getSdkHttpMetadata().getHttpStatusCode();
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("Exception thrown while writing message to SQS. " + e.getLocalizedMessage());
        }
        if (statusCode == 200)
            System.out.printf("Database schema for database '%s' sent to SQS. \n", databaseName);
    }

}
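// A minimal, hypothetical consumer sketch (standalone snippet, would live in its
// own file): it shows one way messages written by SQSUtil could be read back and
// acknowledged. The queue URL and class name are illustrative placeholders.
package com.amazonaws.gdcreplication.util;

import com.amazonaws.services.sqs.AmazonSQS;
import com.amazonaws.services.sqs.AmazonSQSClientBuilder;
import com.amazonaws.services.sqs.model.Message;
import com.amazonaws.services.sqs.model.MessageAttributeValue;
import com.amazonaws.services.sqs.model.ReceiveMessageRequest;

class LargeTableQueueReaderSketch {
    public static void main(String[] args) {
        String queueUrl = "https://sqs.us-east-1.amazonaws.com/111122223333/LargeTableSQSQueue"; // placeholder
        AmazonSQS sqs = AmazonSQSClientBuilder.defaultClient();
        // Ask for all message attributes so SchemaType etc. are returned with each message
        ReceiveMessageRequest req = new ReceiveMessageRequest().withQueueUrl(queueUrl)
                .withMessageAttributeNames("All").withMaxNumberOfMessages(10);
        for (Message m : sqs.receiveMessage(req).getMessages()) {
            MessageAttributeValue schemaType = m.getMessageAttributes().get("SchemaType");
            System.out.println((schemaType == null ? "unknown" : schemaType.getStringValue()) + ": " + m.getBody());
            sqs.deleteMessage(queueUrl, m.getReceiptHandle()); // delete only after successful processing
        }
    }
}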
--------------------------------------------------------------------------------
/src/main/java/com/amazonaws/gdcreplication/util/TableReplicationStatus.java:
--------------------------------------------------------------------------------
// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT-0

package com.amazonaws.gdcreplication.util;

/**
 *
 * This is a POJO class for Glue Database Table Replication Status
 *
 * @author Ravi Itha, Amazon Web Services, Inc.
 *
 */
public class TableReplicationStatus {

    private String dbName;
    private String tableName;
    private String replicationDay;
    private String tableSchema;
    private long replicationTime;
    private boolean created;
    private boolean updated;
    private boolean replicated;
    private boolean exportHasPartitions;
    private boolean partitionsReplicated;
    private boolean error;
    private boolean dbNotFoundError;

    public boolean isDbNotFoundError() {
        return dbNotFoundError;
    }
    public void setDbNotFoundError(boolean dbNotFoundError) {
        this.dbNotFoundError = dbNotFoundError;
    }
    public boolean isError() {
        return error;
    }
    public void setError(boolean error) {
        this.error = error;
    }
    public String getTableSchema() {
        return tableSchema;
    }
    public void setTableSchema(String tableSchema) {
        this.tableSchema = tableSchema;
    }
    public String getDbName() {
        return dbName;
    }
    public void setDbName(String dbName) {
        this.dbName = dbName;
    }
    public String getTableName() {
        return tableName;
    }
    public void setTableName(String tableName) {
        this.tableName = tableName;
    }
    public boolean isReplicated() {
        return replicated;
    }
    public void setReplicated(boolean replicated) {
        this.replicated = replicated;
    }
    public String getReplicationDay() {
        return replicationDay;
    }
    public void setReplicationDay(String replicationDay) {
        this.replicationDay = replicationDay;
    }
    public long getReplicationTime() {
        return replicationTime;
    }
    public void setReplicationTime(long replicationTime) {
        this.replicationTime = replicationTime;
    }
    public boolean isCreated() {
        return created;
    }
    public void setCreated(boolean created) {
        this.created = created;
    }
    public boolean isUpdated() {
        return updated;
    }
    public void setUpdated(boolean updated) {
        this.updated = updated;
    }
    public boolean isExportHasPartitions() {
        return exportHasPartitions;
    }
    public void setExportHasPartitions(boolean exportHasPartitions) {
        this.exportHasPartitions = exportHasPartitions;
    }
    public boolean isPartitionsReplicated() {
        return partitionsReplicated;
    }
    public void setPartitionsReplicated(boolean partitionsReplicated) {
        this.partitionsReplicated = partitionsReplicated;
    }

}
--------------------------------------------------------------------------------
/src/main/java/com/amazonaws/gdcreplication/util/TableWithPartitions.java:
--------------------------------------------------------------------------------
// Copyright 2019 Amazon.com, Inc. or its affiliates. All Rights Reserved.
// SPDX-License-Identifier: MIT-0
package com.amazonaws.gdcreplication.util;

import java.util.List;

import com.amazonaws.services.glue.model.Partition;
import com.amazonaws.services.glue.model.Table;

public class TableWithPartitions {

    private Table table;
    private List<Partition> partitionList;

    public Table getTable() {
        return table;
    }
    public void setTable(Table table) {
        this.table = table;
    }
    public List<Partition> getPartitionList() {
        return partitionList;
    }
    public void setPartitionList(List<Partition> partitionList) {
        this.partitionList = partitionList;
    }

}
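// A minimal, hypothetical round-trip sketch (standalone snippet, would live in its
// own file): the utility serializes POJOs like this one with Gson for messaging
// (see SQSUtil), and the table/partition values below are placeholders.
package com.amazonaws.gdcreplication.util;

import java.util.Collections;

import com.amazonaws.services.glue.model.Partition;
import com.amazonaws.services.glue.model.Table;
import com.google.gson.Gson;

class TableWithPartitionsRoundTripSketch {
    public static void main(String[] args) {
        Gson gson = new Gson();
        TableWithPartitions twp = new TableWithPartitions();
        twp.setTable(new Table().withName("data").withDatabaseName("database_in_account_a"));
        twp.setPartitionList(Collections.singletonList(new Partition().withValues("2019", "10", "18")));
        String json = gson.toJson(twp); // export side: POJO -> JSON message body
        TableWithPartitions parsed = gson.fromJson(json, TableWithPartitions.class); // import side: JSON -> POJO
        System.out.println(parsed.getTable().getName() + " has " + parsed.getPartitionList().size() + " partition(s)");
    }
}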
--------------------------------------------------------------------------------
/src/test/resources/Glue_Replication.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-glue-data-catalog-replication-utility/be40a89f45b0b0285c3cfcf806e638b701c69e03/src/test/resources/Glue_Replication.png
--------------------------------------------------------------------------------
/src/test/resources/Glue_Table_Anatomy.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws-samples/aws-glue-data-catalog-replication-utility/be40a89f45b0b0285c3cfcf806e638b701c69e03/src/test/resources/Glue_Table_Anatomy.png
--------------------------------------------------------------------------------
/src/test/resources/SNS_Cross_Account_Permissions.txt:
--------------------------------------------------------------------------------
## Sample commands necessary to set up cross-account permissions between AWS Lambda and an Amazon SNS Topic

Command 1: Run this command in Source AWS Account. Grant permission to Target AWS Account to subscribe to the topic:

aws sns add-permission --label lambda-access --aws-account-id TargetAccount \
  --topic-arn arn:aws:sns:us-east-1:SourceAccount:SchemaDistributionSNSTopic \
  --action-name Subscribe ListSubscriptionsByTopic Receive

Command 2: Run this command in Target AWS Account. Add Lambda permissions to allow invocation from the Amazon SNS Topic in Source Account:

aws lambda add-permission --function-name ImportLambda \
  --source-arn arn:aws:sns:us-east-1:SourceAccount:SchemaDistributionSNSTopic \
  --statement-id sns-x-account --action "lambda:InvokeFunction" \
  --principal sns.amazonaws.com

Command 3: Run this command in Target AWS Account. Subscribe the Lambda function to the Amazon SNS Topic in Source Account:

aws sns subscribe --protocol lambda \
  --topic-arn arn:aws:sns:us-east-1:SourceAccount:SchemaDistributionSNSTopic \
  --notification-endpoint arn:aws:lambda:us-east-1:TargetAccount:function:ImportLambda

For a more detailed explanation, refer to the AWS documentation at: https://docs.aws.amazon.com/lambda/latest/dg/with-sns-example.html#with-sns-create-x-account-permissions
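Optional verification (standard AWS CLI commands; the account and resource names are the same placeholders used above):

Run this command in Target AWS Account to confirm the subscription exists (ListSubscriptionsByTopic was granted by Command 1):

aws sns list-subscriptions-by-topic \
  --topic-arn arn:aws:sns:us-east-1:SourceAccount:SchemaDistributionSNSTopic

Run this command in Target AWS Account to inspect the resource policy added by Command 2:

aws lambda get-policy --function-name ImportLambda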
--------------------------------------------------------------------------------
/src/test/resources/sample_database_schema.json:
--------------------------------------------------------------------------------
{
  "name": "database_in_account_a",
  "createTime": "Sep 10, 2019 10:16:55 PM"
}
--------------------------------------------------------------------------------
/src/test/resources/sample_ddb_policy_source_and_target_accounts.json:
--------------------------------------------------------------------------------
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "VisualEditor0",
      "Effect": "Allow",
      "Action": [
        "dynamodb:BatchWriteItem",
        "dynamodb:PutItem"
      ],
      "Resource": "*"
    }
  ]
}
--------------------------------------------------------------------------------
/src/test/resources/sample_glue_policy_source_account.json:
--------------------------------------------------------------------------------
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "VisualEditor0",
      "Effect": "Allow",
      "Action": [
        "glue:GetDatabase",
        "glue:GetPartition",
        "glue:GetTableVersion",
        "glue:GetTables",
        "glue:GetTableVersions",
        "glue:GetPartitions",
        "glue:BatchDeleteTableVersion",
        "glue:BatchGetPartition",
        "glue:GetDatabases",
        "glue:GetTable"
      ],
      "Resource": "*"
    }
  ]
}
--------------------------------------------------------------------------------
/src/test/resources/sample_glue_policy_target_account.json:
--------------------------------------------------------------------------------
{
  "Version": "2012-10-17",
  "Statement": [
    {
      "Sid": "VisualEditor0",
      "Effect": "Allow",
      "Action": [
        "glue:SearchTables",
        "glue:BatchCreatePartition",
        "glue:GetDataCatalogEncryptionSettings",
        "glue:GetTableVersions",
        "glue:GetPartitions",
        "glue:BatchDeletePartition",
        "glue:DeleteTableVersion",
        "glue:UpdateTable",
        "glue:GetSecurityConfiguration",
        "glue:GetResourcePolicy",
        "glue:GetTableVersion",
        "glue:CreatePartition",
        "glue:UpdatePartition",
        "glue:UpdateDatabase",
        "glue:CreateTable",
        "glue:GetTables",
        "glue:BatchGetPartition",
        "glue:GetSecurityConfigurations",
        "glue:GetDatabases",
        "glue:GetTable",
        "glue:GetDatabase",
        "glue:GetPartition",
        "glue:CreateDatabase",
        "glue:BatchDeleteTableVersion",
        "glue:DeletePartition"
      ],
      "Resource": "*"
    }
  ]
}
--------------------------------------------------------------------------------
/src/test/resources/sample_large-table_message_payload.json:
--------------------------------------------------------------------------------
{
  "largeTable": true,
  "numberOfPartitions": 31,
  "s3ObjectKey": "2019-10-18_1571431520830_01234567890_database_in_account_a_data",
  "s3BucketName": "bucket_name",
  "table": {
    "name": "data",
    "databaseName": "database_in_account_a",
    "owner": "owner",
    "createTime": "Sep 25, 2019 6:31:08 PM",
"updateTime": "Sep 25, 2019 6:31:08 PM", 12 | "lastAccessTime": "Sep 25, 2019 6:31:08 PM", 13 | "retention": 0, 14 | "storageDescriptor": { 15 | "columns": [ 16 | { 17 | "name": "id", 18 | "type": "string" 19 | }, 20 | { 21 | "name": "type", 22 | "type": "string" 23 | }, 24 | { 25 | "name": "actor", 26 | "type": "struct\u003cid:int,login:string,display_login:string,gravatar_id:string,url:string,avatar_url:string\u003e" 27 | }, 28 | { 29 | "name": "repo", 30 | "type": "struct\u003cid:int,name:string,url:string\u003e" 31 | }, 32 | { 33 | "name": "payload", 34 | "type": "struct\u003cpush_id:int,size:int,distinct_size:int,ref:string,head:string,before:string,commits:array\u003cstruct\u003csha:string,author:struct\u003cname:string,email:string\u003e,message:string,distinct:boolean,url:string\u003e\u003e,ref_type:string,master_branch:string,description:string,pusher_type:string,action:string,number:int,pull_request:struct\u003curl:string,id:int,html_url:string,diff_url:string,patch_url:string,issue_url:string,number:int,state:string,locked:boolean,title:string,user:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,body:string,created_at:string,updated_at:string,closed_at:string,merged_at:string,merge_commit_sha:string,assignee:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,assignees:array\u003cstruct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e\u003e,milestone:struct\u003curl:string,html_url:string,labels_url:string,id:int,number:int,title:string,description:string,creator:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,open_issues:int,closed_issues:int,state:string,created_at:string,updated_at:string,due_on:string,closed_at:string\u003e,commits_url:string,review_comments_url:string,review_comment_url:string,comments_url:string,statuses_url:string,head:struct\u003clabel:string,ref:string,sha:string,user:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,repo:struct\u003cid:int,name:string,full_name:string,owner:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_u
rl:string,received_events_url:string,type:string,site_admin:boolean\u003e,private:boolean,html_url:string,description:string,fork:boolean,url:string,forks_url:string,keys_url:string,collaborators_url:string,teams_url:string,hooks_url:string,issue_events_url:string,events_url:string,assignees_url:string,branches_url:string,tags_url:string,blobs_url:string,git_tags_url:string,git_refs_url:string,trees_url:string,statuses_url:string,languages_url:string,stargazers_url:string,contributors_url:string,subscribers_url:string,subscription_url:string,commits_url:string,git_commits_url:string,comments_url:string,issue_comment_url:string,contents_url:string,compare_url:string,merges_url:string,archive_url:string,downloads_url:string,issues_url:string,pulls_url:string,milestones_url:string,notifications_url:string,labels_url:string,releases_url:string,deployments_url:string,created_at:string,updated_at:string,pushed_at:string,git_url:string,ssh_url:string,clone_url:string,svn_url:string,homepage:string,size:int,stargazers_count:int,watchers_count:int,language:string,has_issues:boolean,has_downloads:boolean,has_wiki:boolean,has_pages:boolean,forks_count:int,mirror_url:string,open_issues_count:int,forks:int,open_issues:int,watchers:int,default_branch:string\u003e\u003e,base:struct\u003clabel:string,ref:string,sha:string,user:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,repo:struct\u003cid:int,name:string,full_name:string,owner:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,private:boolean,html_url:string,description:string,fork:boolean,url:string,forks_url:string,keys_url:string,collaborators_url:string,teams_url:string,hooks_url:string,issue_events_url:string,events_url:string,assignees_url:string,branches_url:string,tags_url:string,blobs_url:string,git_tags_url:string,git_refs_url:string,trees_url:string,statuses_url:string,languages_url:string,stargazers_url:string,contributors_url:string,subscribers_url:string,subscription_url:string,commits_url:string,git_commits_url:string,comments_url:string,issue_comment_url:string,contents_url:string,compare_url:string,merges_url:string,archive_url:string,downloads_url:string,issues_url:string,pulls_url:string,milestones_url:string,notifications_url:string,labels_url:string,releases_url:string,deployments_url:string,created_at:string,updated_at:string,pushed_at:string,git_url:string,ssh_url:string,clone_url:string,svn_url:string,homepage:string,size:int,stargazers_count:int,watchers_count:int,language:string,has_issues:boolean,has_downloads:boolean,has_wiki:boolean,has_pages:boolean,forks_count:int,mirror_url:string,open_issues_count:int,forks:int,open_issues:int,watchers:int,default_branch:string\u003e\u003e,_links:struct\u003cself:struct\u003chref:string\u003e,html:struct\u003chref:string\u003e,issue:struct\u003chref:string\u003e,comments:struct\u003chref:string\u003e,review_comments:struct\u003chref:string\u003e,review_comment:struct\u003chref:string\u003e,commits:struct\u003chref:string\u003e,statuses:struct\u003ch
ref:string\u003e\u003e,merged:boolean,mergeable:boolean,mergeable_state:string,merged_by:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,comments:int,review_comments:int,maintainer_can_modify:boolean,commits:int,additions:int,deletions:int,changed_files:int\u003e,issue:struct\u003curl:string,repository_url:string,labels_url:string,comments_url:string,events_url:string,html_url:string,id:int,number:int,title:string,user:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,labels:array\u003cstruct\u003cid:int,url:string,name:string,color:string,default:boolean\u003e\u003e,state:string,locked:boolean,assignee:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,assignees:array\u003cstruct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e\u003e,milestone:struct\u003curl:string,html_url:string,labels_url:string,id:int,number:int,title:string,description:string,creator:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,open_issues:int,closed_issues:int,state:string,created_at:string,updated_at:string,due_on:string,closed_at:string\u003e,comments:int,created_at:string,updated_at:string,closed_at:string,body:string,pull_request:struct\u003curl:string,html_url:string,diff_url:string,patch_url:string\u003e\u003e,comment:struct\u003curl:string,html_url:string,issue_url:string,id:int,user:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,created_at:string,updated_at:string,body:string,pull_request_review_id:int,diff_hunk:string,path:string,position:int,original_position:int,commit_id:string,original_commit_id:string,pull_request_url:string,_links:struct\u003cself:struct\u003chref:string\u003e,html:struct\u003chref:string\u003e,pull_request:struct\u003chref:string\u003e\u003e,line:int\u003e,forkee:struct\u003cid:int,name:string,full_name:string,owner:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starre
d_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,private:boolean,html_url:string,description:string,fork:boolean,url:string,forks_url:string,keys_url:string,collaborators_url:string,teams_url:string,hooks_url:string,issue_events_url:string,events_url:string,assignees_url:string,branches_url:string,tags_url:string,blobs_url:string,git_tags_url:string,git_refs_url:string,trees_url:string,statuses_url:string,languages_url:string,stargazers_url:string,contributors_url:string,subscribers_url:string,subscription_url:string,commits_url:string,git_commits_url:string,comments_url:string,issue_comment_url:string,contents_url:string,compare_url:string,merges_url:string,archive_url:string,downloads_url:string,issues_url:string,pulls_url:string,milestones_url:string,notifications_url:string,labels_url:string,releases_url:string,deployments_url:string,created_at:string,updated_at:string,pushed_at:string,git_url:string,ssh_url:string,clone_url:string,svn_url:string,homepage:string,size:int,stargazers_count:int,watchers_count:int,language:string,has_issues:boolean,has_downloads:boolean,has_wiki:boolean,has_pages:boolean,forks_count:int,mirror_url:string,open_issues_count:int,forks:int,open_issues:int,watchers:int,default_branch:string,public:boolean\u003e,pages:array\u003cstruct\u003cpage_name:string,title:string,summary:string,action:string,sha:string,html_url:string\u003e\u003e,release:struct\u003curl:string,assets_url:string,upload_url:string,html_url:string,id:int,tag_name:string,target_commitish:string,name:string,draft:boolean,author:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,prerelease:boolean,created_at:string,published_at:string,assets:array\u003cstruct\u003curl:string,id:int,name:string,label:string,uploader:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,content_type:string,state:string,size:int,download_count:int,created_at:string,updated_at:string,browser_download_url:string\u003e\u003e,tarball_url:string,zipball_url:string,body:string\u003e,member:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e\u003e" 35 | }, 36 | { 37 | "name": "public", 38 | "type": "boolean" 39 | }, 40 | { 41 | "name": "created_at", 42 | "type": "string" 43 | }, 44 | { 45 | "name": "org", 46 | "type": "struct\u003cid:int,login:string,gravatar_id:string,url:string,avatar_url:string\u003e" 47 | } 48 | ], 49 | "location": "s3://aws-glue-datasets-us-east-1/examples/githubarchive/month/data/", 50 | "inputFormat": "org.apache.hadoop.mapred.TextInputFormat", 51 | "outputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", 52 | "compressed": true, 53 | "numberOfBuckets": -1, 54 | 
"serdeInfo": { 55 | "serializationLibrary": "org.openx.data.jsonserde.JsonSerDe", 56 | "parameters": { 57 | "paths": "actor,created_at,id,org,payload,public,repo,type" 58 | } 59 | }, 60 | "bucketColumns": [ 61 | ], 62 | "sortColumns": [ 63 | ], 64 | "parameters": { 65 | "sizeKey": "11026096613", 66 | "objectCount": "744", 67 | "UPDATED_BY_CRAWLER": "Github", 68 | "CrawlerSchemaSerializerVersion": "1.0", 69 | "recordCount": "4795154", 70 | "averageRecordSize": "2471", 71 | "CrawlerSchemaDeserializerVersion": "1.0", 72 | "compressionType": "gzip", 73 | "classification": "json", 74 | "typeOfData": "file" 75 | }, 76 | "storedAsSubDirectories": false 77 | }, 78 | "partitionKeys": [ 79 | { 80 | "name": "partition_0", 81 | "type": "string" 82 | }, 83 | { 84 | "name": "partition_1", 85 | "type": "string" 86 | }, 87 | { 88 | "name": "partition_2", 89 | "type": "string" 90 | } 91 | ], 92 | "tableType": "EXTERNAL_TABLE", 93 | "parameters": { 94 | "sizeKey": "11026096613", 95 | "objectCount": "744", 96 | "UPDATED_BY_CRAWLER": "Github", 97 | "CrawlerSchemaSerializerVersion": "1.0", 98 | "recordCount": "4795154", 99 | "averageRecordSize": "2471", 100 | "CrawlerSchemaDeserializerVersion": "1.0", 101 | "compressionType": "gzip", 102 | "classification": "json", 103 | "typeOfData": "file" 104 | }, 105 | "createdBy": "arn:aws:sts::01234567890:assumed-role/AWSGlueServiceRole-abc/AWS-Crawler" 106 | } 107 | } -------------------------------------------------------------------------------- /src/test/resources/sample_sns_policy_source_account.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "VisualEditor0", 6 | "Effect": "Allow", 7 | "Action": "sns:Publish", 8 | "Resource": "*" 9 | } 10 | ] 11 | } -------------------------------------------------------------------------------- /src/test/resources/sample_sqs_policy_source_and_target_accounts.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "VisualEditor0", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "sqs:DeleteMessage", 9 | "sqs:ListQueues", 10 | "sqs:GetQueueUrl", 11 | "sqs:ListDeadLetterSourceQueues", 12 | "sqs:ChangeMessageVisibility", 13 | "sqs:DeleteMessageBatch", 14 | "sqs:SendMessageBatch", 15 | "sqs:ReceiveMessage", 16 | "sqs:SendMessage", 17 | "sqs:GetQueueAttributes", 18 | "sqs:ListQueueTags" 19 | ], 20 | "Resource": "*" 21 | } 22 | ] 23 | } -------------------------------------------------------------------------------- /src/test/resources/sample_table_schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "data", 3 | "databaseName": "database_in_account_a", 4 | "owner": "owner", 5 | "createTime": "Sep 25, 2019 1:31:08 PM", 6 | "updateTime": "Sep 25, 2019 1:31:08 PM", 7 | "lastAccessTime": "Sep 25, 2019 1:31:08 PM", 8 | "retention": 0, 9 | "storageDescriptor": { 10 | "columns": [ 11 | { 12 | "name": "id", 13 | "type": "string" 14 | }, 15 | { 16 | "name": "type", 17 | "type": "string" 18 | }, 19 | { 20 | "name": "actor", 21 | "type": "struct\u003cid:int,login:string,display_login:string,gravatar_id:string,url:string,avatar_url:string\u003e" 22 | }, 23 | { 24 | "name": "repo", 25 | "type": "struct\u003cid:int,name:string,url:string\u003e" 26 | }, 27 | { 28 | "name": "payload", 29 | "type": 
"struct\u003cpush_id:int,size:int,distinct_size:int,ref:string,head:string,before:string,commits:array\u003cstruct\u003csha:string,author:struct\u003cname:string,email:string\u003e,message:string,distinct:boolean,url:string\u003e\u003e,ref_type:string,master_branch:string,description:string,pusher_type:string,action:string,number:int,pull_request:struct\u003curl:string,id:int,html_url:string,diff_url:string,patch_url:string,issue_url:string,number:int,state:string,locked:boolean,title:string,user:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,body:string,created_at:string,updated_at:string,closed_at:string,merged_at:string,merge_commit_sha:string,assignee:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,assignees:array\u003cstruct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e\u003e,milestone:struct\u003curl:string,html_url:string,labels_url:string,id:int,number:int,title:string,description:string,creator:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,open_issues:int,closed_issues:int,state:string,created_at:string,updated_at:string,due_on:string,closed_at:string\u003e,commits_url:string,review_comments_url:string,review_comment_url:string,comments_url:string,statuses_url:string,head:struct\u003clabel:string,ref:string,sha:string,user:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,repo:struct\u003cid:int,name:string,full_name:string,owner:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,private:boolean,html_url:string,description:string,fork:boolean,url:string,forks_url:string,keys_url:string,collaborators_url:string,teams_url:string,hooks_url:string,issue_events_url:string,events_url:string,assignees_url:string,branches_url:string,tags_url:string,blobs_url:string,git_tags_url:string,git_refs_url:string,trees_url:string,statuses_url:string,languages_url:string,stargazers_url:string,contributors_url:string,subscribers_url:string,subscription_url:string,commits_url:string,git_com
mits_url:string,comments_url:string,issue_comment_url:string,contents_url:string,compare_url:string,merges_url:string,archive_url:string,downloads_url:string,issues_url:string,pulls_url:string,milestones_url:string,notifications_url:string,labels_url:string,releases_url:string,deployments_url:string,created_at:string,updated_at:string,pushed_at:string,git_url:string,ssh_url:string,clone_url:string,svn_url:string,homepage:string,size:int,stargazers_count:int,watchers_count:int,language:string,has_issues:boolean,has_downloads:boolean,has_wiki:boolean,has_pages:boolean,forks_count:int,mirror_url:string,open_issues_count:int,forks:int,open_issues:int,watchers:int,default_branch:string\u003e\u003e,base:struct\u003clabel:string,ref:string,sha:string,user:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,repo:struct\u003cid:int,name:string,full_name:string,owner:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,private:boolean,html_url:string,description:string,fork:boolean,url:string,forks_url:string,keys_url:string,collaborators_url:string,teams_url:string,hooks_url:string,issue_events_url:string,events_url:string,assignees_url:string,branches_url:string,tags_url:string,blobs_url:string,git_tags_url:string,git_refs_url:string,trees_url:string,statuses_url:string,languages_url:string,stargazers_url:string,contributors_url:string,subscribers_url:string,subscription_url:string,commits_url:string,git_commits_url:string,comments_url:string,issue_comment_url:string,contents_url:string,compare_url:string,merges_url:string,archive_url:string,downloads_url:string,issues_url:string,pulls_url:string,milestones_url:string,notifications_url:string,labels_url:string,releases_url:string,deployments_url:string,created_at:string,updated_at:string,pushed_at:string,git_url:string,ssh_url:string,clone_url:string,svn_url:string,homepage:string,size:int,stargazers_count:int,watchers_count:int,language:string,has_issues:boolean,has_downloads:boolean,has_wiki:boolean,has_pages:boolean,forks_count:int,mirror_url:string,open_issues_count:int,forks:int,open_issues:int,watchers:int,default_branch:string\u003e\u003e,_links:struct\u003cself:struct\u003chref:string\u003e,html:struct\u003chref:string\u003e,issue:struct\u003chref:string\u003e,comments:struct\u003chref:string\u003e,review_comments:struct\u003chref:string\u003e,review_comment:struct\u003chref:string\u003e,commits:struct\u003chref:string\u003e,statuses:struct\u003chref:string\u003e\u003e,merged:boolean,mergeable:boolean,mergeable_state:string,merged_by:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,comments:int,review_comments:int,maintainer_can_modify:boolean,commits:int,additions:int,deletions:int,changed_files:int\u003e,issue:struct\u003curl:string,reposi
tory_url:string,labels_url:string,comments_url:string,events_url:string,html_url:string,id:int,number:int,title:string,user:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,labels:array\u003cstruct\u003cid:int,url:string,name:string,color:string,default:boolean\u003e\u003e,state:string,locked:boolean,assignee:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,assignees:array\u003cstruct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e\u003e,milestone:struct\u003curl:string,html_url:string,labels_url:string,id:int,number:int,title:string,description:string,creator:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,open_issues:int,closed_issues:int,state:string,created_at:string,updated_at:string,due_on:string,closed_at:string\u003e,comments:int,created_at:string,updated_at:string,closed_at:string,body:string,pull_request:struct\u003curl:string,html_url:string,diff_url:string,patch_url:string\u003e\u003e,comment:struct\u003curl:string,html_url:string,issue_url:string,id:int,user:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,created_at:string,updated_at:string,body:string,pull_request_review_id:int,diff_hunk:string,path:string,position:int,original_position:int,commit_id:string,original_commit_id:string,pull_request_url:string,_links:struct\u003cself:struct\u003chref:string\u003e,html:struct\u003chref:string\u003e,pull_request:struct\u003chref:string\u003e\u003e,line:int\u003e,forkee:struct\u003cid:int,name:string,full_name:string,owner:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,private:boolean,html_url:string,description:string,fork:boolean,url:string,forks_url:string,keys_url:string,collaborators_url:string,teams_url:string,hooks_url:string,issue_events_url:string,events_url:string,assignees_url:string,branches_url:string,tags_url:string,blobs_url:string,git_tags_url:string,git_refs_url:string,trees_url:string,statuses_url:string,languages_url:string,stargazers_url:string,contribut
ors_url:string,subscribers_url:string,subscription_url:string,commits_url:string,git_commits_url:string,comments_url:string,issue_comment_url:string,contents_url:string,compare_url:string,merges_url:string,archive_url:string,downloads_url:string,issues_url:string,pulls_url:string,milestones_url:string,notifications_url:string,labels_url:string,releases_url:string,deployments_url:string,created_at:string,updated_at:string,pushed_at:string,git_url:string,ssh_url:string,clone_url:string,svn_url:string,homepage:string,size:int,stargazers_count:int,watchers_count:int,language:string,has_issues:boolean,has_downloads:boolean,has_wiki:boolean,has_pages:boolean,forks_count:int,mirror_url:string,open_issues_count:int,forks:int,open_issues:int,watchers:int,default_branch:string,public:boolean\u003e,pages:array\u003cstruct\u003cpage_name:string,title:string,summary:string,action:string,sha:string,html_url:string\u003e\u003e,release:struct\u003curl:string,assets_url:string,upload_url:string,html_url:string,id:int,tag_name:string,target_commitish:string,name:string,draft:boolean,author:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,prerelease:boolean,created_at:string,published_at:string,assets:array\u003cstruct\u003curl:string,id:int,name:string,label:string,uploader:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e,content_type:string,state:string,size:int,download_count:int,created_at:string,updated_at:string,browser_download_url:string\u003e\u003e,tarball_url:string,zipball_url:string,body:string\u003e,member:struct\u003clogin:string,id:int,avatar_url:string,gravatar_id:string,url:string,html_url:string,followers_url:string,following_url:string,gists_url:string,starred_url:string,subscriptions_url:string,organizations_url:string,repos_url:string,events_url:string,received_events_url:string,type:string,site_admin:boolean\u003e\u003e" 30 | }, 31 | { 32 | "name": "public", 33 | "type": "boolean" 34 | }, 35 | { 36 | "name": "created_at", 37 | "type": "string" 38 | }, 39 | { 40 | "name": "org", 41 | "type": "struct\u003cid:int,login:string,gravatar_id:string,url:string,avatar_url:string\u003e" 42 | } 43 | ], 44 | "location": "s3://aws-glue-datasets-us-east-1/examples/githubarchive/month/data/", 45 | "inputFormat": "org.apache.hadoop.mapred.TextInputFormat", 46 | "outputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat", 47 | "compressed": true, 48 | "numberOfBuckets": -1, 49 | "serdeInfo": { 50 | "serializationLibrary": "org.openx.data.jsonserde.JsonSerDe", 51 | "parameters": { 52 | "paths": "actor,created_at,id,org,payload,public,repo,type" 53 | } 54 | }, 55 | "bucketColumns": [ 56 | ], 57 | "sortColumns": [ 58 | ], 59 | "parameters": { 60 | "sizeKey": "11026096613", 61 | "objectCount": "744", 62 | "UPDATED_BY_CRAWLER": "Github", 63 | "CrawlerSchemaSerializerVersion": "1.0", 64 | "recordCount": "4795154", 65 | "averageRecordSize": "2471", 66 | "CrawlerSchemaDeserializerVersion": "1.0", 67 | "compressionType": "gzip", 68 | "classification": 
"json", 69 | "typeOfData": "file" 70 | }, 71 | "storedAsSubDirectories": false 72 | }, 73 | "partitionKeys": [ 74 | { 75 | "name": "partition_0", 76 | "type": "string" 77 | }, 78 | { 79 | "name": "partition_1", 80 | "type": "string" 81 | }, 82 | { 83 | "name": "partition_2", 84 | "type": "string" 85 | } 86 | ], 87 | "tableType": "EXTERNAL_TABLE", 88 | "parameters": { 89 | "sizeKey": "11026096613", 90 | "objectCount": "744", 91 | "UPDATED_BY_CRAWLER": "Github", 92 | "CrawlerSchemaSerializerVersion": "1.0", 93 | "recordCount": "4795154", 94 | "averageRecordSize": "2471", 95 | "CrawlerSchemaDeserializerVersion": "1.0", 96 | "compressionType": "gzip", 97 | "classification": "json", 98 | "typeOfData": "file" 99 | }, 100 | "createdBy": "arn:aws:sts::0123456789:assumed-role/AWSGlueServiceRole-abc/AWS-Crawler" 101 | } 102 | --------------------------------------------------------------------------------