├── .gitignore ├── src └── main │ ├── resources │ ├── AWS_Glue_Table_versions_cleanup_utility.png │ ├── table_versions_cleanup_planner_lambda_sqs_policy.json │ ├── table_versions_cleanup_cloudwatch_logs_policy.json │ ├── table_versions_cleanup_planner_lambda_glue_policy.json │ ├── table_versions_cleanup_lambda_dynamodb_policy.json │ ├── table_versions_cleanup_lambda_glue_policy.json │ ├── table_versions_cleanup_lambda_sqs_policy.json │ └── AWS_Glue_Table_versions_cleanup_utility.drawio │ └── java │ └── software │ └── aws │ └── glue │ └── tableversions │ ├── utils │ ├── GlueTable.java │ ├── Test.java │ ├── TableVersionStatus.java │ ├── TestDDBUtil.java │ ├── SQSUtil.java │ ├── DDBUtil.java │ └── GlueUtil.java │ └── lambda │ ├── TableVersionsCleanupLambda.java │ └── TableVersionsCleanupPlannerLambda.java ├── CODE_OF_CONDUCT.md ├── .project ├── LICENSE ├── dependency-reduced-pom.xml ├── .classpath ├── CONTRIBUTING.md ├── pom.xml └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | .settings/ 3 | build/ 4 | target/classes/ -------------------------------------------------------------------------------- /src/main/resources/AWS_Glue_Table_versions_cleanup_utility.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/aws-glue-table-versions-cleanup-utility/HEAD/src/main/resources/AWS_Glue_Table_versions_cleanup_utility.png -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 
5 | -------------------------------------------------------------------------------- /src/main/resources/table_versions_cleanup_planner_lambda_sqs_policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "VisualEditor0", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "sqs:SendMessageBatch", 9 | "sqs:SendMessage" 10 | ], 11 | "Resource": "*" 12 | } 13 | ] 14 | } -------------------------------------------------------------------------------- /src/main/resources/table_versions_cleanup_cloudwatch_logs_policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "VisualEditor0", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "logs:CreateLogStream", 9 | "logs:CreateLogGroup", 10 | "logs:PutLogEvents" 11 | ], 12 | "Resource": "*" 13 | } 14 | ] 15 | } -------------------------------------------------------------------------------- /src/main/resources/table_versions_cleanup_planner_lambda_glue_policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "VisualEditor0", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "glue:GetDatabase", 9 | "glue:GetTables", 10 | "glue:GetDatabases", 11 | "glue:GetTable" 12 | ], 13 | "Resource": "*" 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /src/main/resources/table_versions_cleanup_lambda_dynamodb_policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "VisualEditor0", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "dynamodb:BatchWriteItem", 9 | "dynamodb:PutItem", 10 | "dynamodb:DeleteItem", 11 | "dynamodb:UpdateItem" 12 | ], 13 | "Resource": "*" 14 | } 15 | ] 16 | } 
-------------------------------------------------------------------------------- /src/main/resources/table_versions_cleanup_lambda_glue_policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "VisualEditor0", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "glue:GetTableVersion", 9 | "glue:GetTableVersions", 10 | "glue:DeleteTableVersion", 11 | "glue:BatchDeleteTableVersion" 12 | ], 13 | "Resource": "*" 14 | } 15 | ] 16 | } -------------------------------------------------------------------------------- /src/main/resources/table_versions_cleanup_lambda_sqs_policy.json: -------------------------------------------------------------------------------- 1 | { 2 | "Version": "2012-10-17", 3 | "Statement": [ 4 | { 5 | "Sid": "VisualEditor0", 6 | "Effect": "Allow", 7 | "Action": [ 8 | "sqs:DeleteMessage", 9 | "sqs:GetQueueUrl", 10 | "sqs:ListDeadLetterSourceQueues", 11 | "sqs:ReceiveMessage", 12 | "sqs:GetQueueAttributes" 13 | ], 14 | "Resource": "*" 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | GlueTableVersionCleanup_2 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /src/main/java/software/aws/glue/tableversions/utils/GlueTable.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
/**
 * Simple mutable holder that identifies one AWS Glue table by the name of its
 * database and the name of the table itself.
 *
 * The private field names and their declaration order are left untouched on
 * purpose: instances of this class are bound to JSON with Gson, which works
 * off the declared fields.
 */
public class GlueTable {

    private String databaseName;
    private String tableName;

    /**
     * @return the table name, or {@code null} when none has been assigned yet
     */
    public String getTableName() {
        return this.tableName;
    }

    /**
     * @param tableName the table name to store
     */
    public void setTableName(String tableName) {
        this.tableName = tableName;
    }

    /**
     * @return the database name, or {@code null} when none has been assigned yet
     */
    public String getDatabaseName() {
        return this.databaseName;
    }

    /**
     * @param databaseName the database name to store
     */
    public void setDatabaseName(String databaseName) {
        this.databaseName = databaseName;
    }
}
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 4 | software and associated documentation files (the "Software"), to deal in the Software 5 | without restriction, including without limitation the rights to use, copy, modify, 6 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 7 | permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. -------------------------------------------------------------------------------- /src/main/java/software/aws/glue/tableversions/utils/TableVersionStatus.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
/**
 * Simple mutable holder describing one version of an AWS Glue table: the
 * table and database it belongs to, the version id, and a flag telling
 * whether that version has been deleted.
 */
public class TableVersionStatus {

    private String tableName;
    private String databaseName;
    private String versionId;
    private boolean deleted;

    /**
     * @return {@code true} when the version is flagged as deleted; {@code false} by default
     */
    public boolean isDeleted() {
        return this.deleted;
    }

    /**
     * @param deleted new value of the deleted flag
     */
    public void setDeleted(boolean deleted) {
        this.deleted = deleted;
    }

    /**
     * @return the version id, or {@code null} when none has been assigned yet
     */
    public String getVersionId() {
        return this.versionId;
    }

    /**
     * @param versionId the version id to store
     */
    public void setVersionId(String versionId) {
        this.versionId = versionId;
    }

    /**
     * @return the database name, or {@code null} when none has been assigned yet
     */
    public String getDatabaseName() {
        return this.databaseName;
    }

    /**
     * @param databaseName the database name to store
     */
    public void setDatabaseName(String databaseName) {
        this.databaseName = databaseName;
    }

    /**
     * @return the table name, or {@code null} when none has been assigned yet
     */
    public String getTableName() {
        return this.tableName;
    }

    /**
     * @param tableName the table name to store
     */
    public void setTableName(String tableName) {
        this.tableName = tableName;
    }
}
numVersionsRetained = 10; 26 | int numDeletedVersions = 10; 27 | 28 | long executionBatchId = System.currentTimeMillis(); 29 | AmazonDynamoDB ddbClient = AmazonDynamoDBClientBuilder.standard().withRegion("us-east-1").build(); 30 | String notificationTime = new java.util.Date().toString(); 31 | 32 | ddbUtil.insertTableDetailsToDynamoDB(ddbClient, ddbTableName_1, hashKey_1, rangeKey_1, executionBatchId, 33 | databaseName, tableName, notificationTime); 34 | 35 | ddbUtil.insertCleanupStatusToDynamoDB(ddbClient, ddbTableName_2, hashKey_2, rangeKey_2, executionId, 36 | Long.toString(executionBatchId), databaseName, tableName, numTableVersionsB4Cleanup, 37 | numVersionsRetained, numDeletedVersions); 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/java/software/aws/glue/tableversions/utils/SQSUtil.java: -------------------------------------------------------------------------------- 1 | //Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | //SPDX-License-Identifier: MIT-0 3 | 4 | package software.aws.glue.tableversions.utils; 5 | 6 | import java.util.HashMap; 7 | import java.util.Map; 8 | 9 | import com.amazonaws.services.sqs.AmazonSQS; 10 | import com.amazonaws.services.sqs.model.MessageAttributeValue; 11 | import com.amazonaws.services.sqs.model.SendMessageRequest; 12 | import com.amazonaws.services.sqs.model.SendMessageResult; 13 | 14 | public class SQSUtil { 15 | 16 | /** 17 | * This method send a message to SQS queue. 
18 | * @param sqs 19 | * @param queueURI 20 | * @param message 21 | * @param executionBatchId 22 | * @param databaseName 23 | * @return 24 | */ 25 | public boolean sendTableSchemaToSQSQueue(AmazonSQS sqs, String queueURI, String message, long executionBatchId, 26 | String databaseName) { 27 | int statusCode = 400; 28 | boolean messageSentToSQS = false; 29 | Map messageAttributes = new HashMap<>(); 30 | messageAttributes.put("ExecutionBatchId", new MessageAttributeValue().withDataType("String.ExecutionBatchId") 31 | .withStringValue(Long.toString(executionBatchId))); 32 | SendMessageRequest req = new SendMessageRequest().withQueueUrl(queueURI).withMessageBody(message) 33 | .withMessageGroupId(databaseName).withMessageAttributes(messageAttributes); 34 | try { 35 | SendMessageResult sendMsgRes = sqs.sendMessage(req); 36 | statusCode = sendMsgRes.getSdkHttpMetadata().getHttpStatusCode(); 37 | } catch (Exception e) { 38 | e.printStackTrace(); 39 | System.out.println("Exception thrown while writing message to SQS. " + e.getLocalizedMessage()); 40 | } 41 | if (statusCode == 200) { 42 | messageSentToSQS = true; 43 | } else 44 | System.out.printf("Cannot write Table schema %s to SQS queue. 
\n", message); 45 | return messageSentToSQS; 46 | } 47 | 48 | } 49 | -------------------------------------------------------------------------------- /src/main/resources/AWS_Glue_Table_versions_cleanup_utility.drawio: -------------------------------------------------------------------------------- 1 | 7VrbcuI4EP0aHkPJF26PYEJmtpKdXGaTmaeUsITRjGw5shwgX7+SLWMbGxaSQLzZJVXBbrV1aZ0+rW7Tshx/ecFhOL9iCNOWCdCyZY1bpjmwu/K/EqxSQdfQAo8TlIqMXHBHXrAWAi2NCcJRSVEwRgUJy0KXBQF2RUkGOWeLstqM0fKoIfRwRXDnQlqVPhAk5qm03wG5/Asm3jwb2QC6xYeZshZEc4jYoiCyzluWwxkT6ZW/dDBVtsvscvP1/Pby/tcTv/1jSDvugHZ+Xp2lnU0OeWS9BI4D8equp+xPNBzyn07vr9AdTvrgGxlnXUdildkLI2k+fcu4mDOPBZCe59IRZ3GAsOoVyLtc55KxUAoNKfyFhVhpLMBYMCmaC5/q1hkLRDaEtq0adWO//mGx2cxZzF28S0/DVUDuYbHDFPmWSlfAzMeCr+RzHFMoyHN5dlCD0lvr5YaXF9r2B+xD5/99yExhv3EfkkeHnMNVQSFkJBBRoedrJZAKmubMnp32qFlOCjb2NO0x3+H11PbadNgRi8mXqx8L7z64/tZd3bxc3Geme4Y01stqmV0qbTOKQhiU4NB9ihXNJFt2psVD1b+AlLiyIVeRV576vsBqfQgKOIWRZGCzC32Ji1EwjcJUjabqk3S09KlMPOVvHf891wIDlEBnSpOF1E98w4EEXooy3jmOyIvqQ7uMBoXU7oxanbEahxIvkAJXYh5zKXjGXBAZTIa6wScIJc5H4RTTEXR/e4kbOowynoxrzZJPvXup7vCyLu7paeXRpuh42ymj6ie69zPQ7oLM5zSoz8z09jBXyn0lU2GzWaTAtcF5h3lEfTgClW08AQ9GkpPEUJ01pCBgAc5kE6Kmn3SLA5RpuBRGUQJUKdQqjaDTesVBE/jU7naOz6dmhU+/c+mhCZ6giKP/HEUYW840unvQVoGutE9GgxhiJ+4Lmzx8uGs5ZmsoFwsuoT9FsKUsJBUtQ+3h98RwJrjHPCIskDAADsUwiEN5dU2hNC6vQIPFgpIAO+usBOjtKmwjSD5S7nGICM7bNIfMJDcU1Meg4xi9hFs4+403lBGM5msSyzB1qSB0zSIi5MRl25QJwfwa0AlFdVVsJvhKac8wK3iTD0RhuroZWarBU7xjfv6MU9gn9DiHoXrAX3oqQ2zDRWS3aWLox1kcuOncRocRXAXmW1Fq2oMSRC2jn94v8nQuy+bmhUwuk73lrF6/gt5HHdbrsFdDIe8XZPr7Bplubbw7QTa1c94FknAoi9EDFO5cyhN4y+/bWBHD8T1/5BhWp3tSz99M845CAlhpPAri40eVY6AjskDmzY0hgUE1DvnwhalUZ7wKoM/G1ZTkTdCSbfa4Z06sQtuYcKz5V6KJK0tsYk9+7NGggr3iKaRxAKwFmzyaJXz11U3CjbxNr8paKLE9mh4RitlZdn22tcHJsFh7zLM+Kh4dI9vZXpvbo3Zknijq7JpkHSPc3dzJ/zcxjl8Xb+TfRM2kjhQmlm33zcNIYUtA+qykED1Faz6oya/2g+nefHDK2FSf9n1IVb/hVZTtfNHYIkrPKifnJymiWBUWa3oRZatfv75ustvhQdsEVr+0NWe63NWIwskuqG/UTSovBdYVlPq3BVteC4QQIRJ4aR0fhEtVgEl686VnkaAg36zs7yrOfMqSzEkSs23VmfcPfj3Qa1bwG3x47MsjWzH8G
eXwpyGWx74ttZzjxT7733CmzmptBdq6xQISdaj+ofiFRSKZjJtWdJ7XNJLzVfEVIsIUC8U3jCJV9S080LSwVgRDRt7vGOO2nGjWIc7od41yiMtOI82NcXZ9jAMX6t4EYyjUqwFHflHmvW8iVh9fOkMLjDqfL7540qCP6qcFj6425unSKxPsWW6x2xl7HEBL8jb/RVeKy/xncdb53w== -------------------------------------------------------------------------------- /dependency-reduced-pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4.0.0 4 | software.aws.glue 5 | glue-tableversions-cleanup 6 | aws-glue-table-versions-cleanup-utility 7 | 0.1 8 | 9 | 10 | 11 | maven-compiler-plugin 12 | 3.7.0 13 | 14 | 1.8 15 | 1.8 16 | 17 | 18 | 19 | maven-shade-plugin 20 | 3.1.0 21 | 22 | 23 | package 24 | 25 | shade 26 | 27 | 28 | 29 | 30 | software.aws.glue.tableversions.lambda.TableVersionsCleanupPlannerLambda 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | junit 42 | junit 43 | 4.12 44 | test 45 | 46 | 47 | hamcrest-core 48 | org.hamcrest 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | com.amazonaws 57 | aws-java-sdk-bom 58 | 1.11.873 59 | pom 60 | import 61 | 62 | 63 | 64 | 65 | 66 | -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 
5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 
38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /src/main/java/software/aws/glue/tableversions/utils/DDBUtil.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package software.aws.glue.tableversions.utils; 5 | 6 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDB; 7 | import com.amazonaws.services.dynamodbv2.document.DynamoDB; 8 | import com.amazonaws.services.dynamodbv2.document.Item; 9 | import com.amazonaws.services.dynamodbv2.document.PutItemOutcome; 10 | import com.amazonaws.services.dynamodbv2.document.Table; 11 | 12 | /** 13 | * This is a utility class with methods to write items to DynamoDB table. from / 14 | * to a DynamoDB table. 15 | * 16 | * @author Ravi Itha, Amazon Web Services, Inc. 17 | * 18 | */ 19 | public class DDBUtil { 20 | 21 | public boolean insertCleanupStatusToDynamoDB(AmazonDynamoDB ddbClient, String ddbTableName, String hashKey, 22 | String rangeKey, long executionId, String executionBatchId, String databaseName, String tableName, 23 | int numTableVersionsB4Cleanup, int numVersionsRetained, int numDeletedVersions) { 24 | 25 | boolean itemInserted = false; 26 | DynamoDB dynamoDB = new DynamoDB(ddbClient); 27 | Table table = dynamoDB.getTable(ddbTableName); 28 | Item item = new Item() 29 | .withPrimaryKey(hashKey, executionId) 30 | .withNumber(rangeKey, Long.parseLong(executionBatchId)) 31 | .withString("table_name", tableName) 32 | .withString("database_name", databaseName) 33 | .withNumber("number_of_versions_before_cleanup", numTableVersionsB4Cleanup) 34 | .withNumber("number_of_versions_retained", numVersionsRetained) 35 | .withNumber("number_of_versions_deleted", numDeletedVersions); 36 | // Write the item to the table 37 | PutItemOutcome outcome = table.putItem(item); 38 | int statusCode = outcome.getPutItemResult().getSdkHttpMetadata().getHttpStatusCode(); 39 | if (statusCode == 200) { 40 | itemInserted = true; 41 | System.out.println("Table version inserted to DynamoDB table: " + ddbTableName); 42 | } 43 | return itemInserted; 44 | } 45 | 46 | /** 47 | * Method to write Table version details to a DynamoDB table. 
48 | * 49 | * @param dynamoDBClient 50 | * @param dynamoDBTblName 51 | * @param primaryPartKey 52 | * @param primarySortKey 53 | * @param executionBatchId 54 | * @param databaseName 55 | * @param tableName 56 | * @param notificationTime 57 | * @return 58 | */ 59 | public boolean insertTableDetailsToDynamoDB(AmazonDynamoDB ddbClient, String ddbTableName, String hashKey, 60 | String rangeKey, long executionBatchId, String databaseName, String tableName, String messageSentTime) { 61 | 62 | boolean itemInserted = false; 63 | DynamoDB dynamoDB = new DynamoDB(ddbClient); 64 | Table table = dynamoDB.getTable(ddbTableName); 65 | Item item = new Item().withPrimaryKey(hashKey, executionBatchId) 66 | .withString(rangeKey, databaseName.concat("|").concat(tableName)).withString("table_name", tableName) 67 | .withString("database_name", databaseName).withString("message_sent_time", messageSentTime); 68 | // Write the item to the table 69 | PutItemOutcome outcome = table.putItem(item); 70 | int statusCode = outcome.getPutItemResult().getSdkHttpMetadata().getHttpStatusCode(); 71 | if (statusCode == 200) { 72 | itemInserted = true; 73 | System.out.println("Table version inserted to DynamoDB table: " + ddbTableName); 74 | } 75 | return itemInserted; 76 | } 77 | 78 | } -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | software.aws.glue 4 | glue-tableversions-cleanup 5 | 0.1 6 | aws-glue-table-versions-cleanup-utility 7 | 8 | 9 | 10 | 11 | org.apache.maven.plugins 12 | maven-compiler-plugin 13 | 3.7.0 14 | 15 | 1.8 16 | 1.8 17 | 18 | 19 | 20 | org.apache.maven.plugins 21 | maven-shade-plugin 22 | 3.1.0 23 | 24 | 25 | package 26 | 27 | shade 28 | 29 | 30 | 31 | 33 | software.aws.glue.tableversions.lambda.TableVersionsCleanupPlannerLambda 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | com.amazonaws 47 | aws-java-sdk-bom 48 | 
1.11.873 49 | pom 50 | import 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | com.amazonaws 60 | aws-java-sdk-glue 61 | 62 | 63 | 64 | 65 | com.amazonaws 66 | aws-java-sdk-sqs 67 | 68 | 69 | 70 | com.amazonaws 71 | aws-java-sdk-sts 72 | 73 | 74 | 75 | 76 | com.amazonaws 77 | aws-java-sdk-dynamodb 78 | 79 | 80 | 81 | 82 | junit 83 | junit 84 | 4.13.1 85 | test 86 | 87 | 88 | 89 | 90 | com.amazonaws 91 | aws-lambda-java-events 92 | 2.2.7 93 | 94 | 95 | com.amazonaws 96 | aws-lambda-java-core 97 | 1.2.0 98 | 99 | 100 | 101 | 102 | com.google.guava 103 | guava 104 | 30.0-jre 105 | 106 | 107 | 108 | 109 | com.google.code.gson 110 | gson 111 | 2.8.9 112 | 113 | 114 | 115 | -------------------------------------------------------------------------------- /src/main/java/software/aws/glue/tableversions/lambda/TableVersionsCleanupLambda.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | // SPDX-License-Identifier: MIT-0 3 | 4 | package software.aws.glue.tableversions.lambda; 5 | 6 | import java.util.ArrayList; 7 | import java.util.List; 8 | import java.util.Map.Entry; 9 | import java.util.Optional; 10 | 11 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDB; 12 | import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClientBuilder; 13 | import com.amazonaws.services.glue.AWSGlue; 14 | import com.amazonaws.services.glue.AWSGlueClientBuilder; 15 | import com.amazonaws.services.glue.model.TableVersion; 16 | import com.amazonaws.services.lambda.runtime.Context; 17 | import com.amazonaws.services.lambda.runtime.RequestHandler; 18 | import com.amazonaws.services.lambda.runtime.events.SQSEvent; 19 | import com.amazonaws.services.lambda.runtime.events.SQSEvent.MessageAttribute; 20 | import com.amazonaws.services.lambda.runtime.events.SQSEvent.SQSMessage; 21 | import com.google.common.primitives.Ints; 22 | import com.google.gson.Gson; 23 | 24 | import 
software.aws.glue.tableversions.utils.DDBUtil; 25 | import software.aws.glue.tableversions.utils.GlueTable; 26 | import software.aws.glue.tableversions.utils.GlueUtil; 27 | import software.aws.glue.tableversions.utils.TableVersionStatus; 28 | 29 | /** 30 | * This class has AWS Lambda Handler method. Upon invocation, it gets an event 31 | * from source SQS queue, gets the message(s). 32 | * 33 | * For each message, it takes the following actions: 1. Parse the message and 34 | * get Table name 2. Fetch list of table versions 3. Determine the list of table 35 | * versions to retains 4. Delete old table versions 5. Insert a record into 36 | * DynamoDB table with the statistics 37 | * 38 | * @author Ravi Itha, Amazon Web Services, Inc. 39 | * 40 | */ 41 | public class TableVersionsCleanupLambda implements RequestHandler { 42 | 43 | @Override 44 | public String handleRequest(SQSEvent event, Context context) { 45 | 46 | String region = Optional.ofNullable(System.getenv("region")).orElse("us-east-1"); 47 | String ddbTableName = Optional.ofNullable(System.getenv("ddb_table_name")) 48 | .orElse("glue_table_version_cleanup_statistics"); 49 | String hashKey = Optional.ofNullable(System.getenv("hash_key")).orElse("execution_id"); 50 | String rangeKey = Optional.ofNullable(System.getenv("range_key")).orElse("execution_batch_id"); 51 | int numberofVersionsToRetain = Ints 52 | .tryParse(Optional.ofNullable(System.getenv("number_of_versions_to_retain")).orElse("100")); 53 | 54 | System.out.println("Region: " + region); 55 | System.out.println("Number of table versions to retain: " + numberofVersionsToRetain); 56 | System.out.println("DynamoDB Table to track statistics: " + ddbTableName); 57 | 58 | AWSGlue glueClient = AWSGlueClientBuilder.standard().withRegion(region).build(); 59 | AmazonDynamoDB dynamoDBClient = AmazonDynamoDBClientBuilder.standard().withRegion(region).build(); 60 | 61 | if (numberofVersionsToRetain < 50) { 62 | throw new RuntimeException(); 63 | } else { 64 | 
System.out.println("Number of messages in SQS Event: " + event.getRecords().size()); 65 | List sqsMessages = event.getRecords(); 66 | processEvent(glueClient, dynamoDBClient, sqsMessages, numberofVersionsToRetain, ddbTableName, hashKey, 67 | rangeKey); 68 | } 69 | return "SNS event to Lambda processed successfully!"; 70 | } 71 | 72 | /** 73 | * This method processes SQS event 74 | * 75 | * @param glueClient 76 | * @param dynamoDBClient 77 | * @param sqsMessages 78 | * @param numberofVersionsToRetain 79 | * @param dynamoDBTableName 80 | * @param primaryPartKey 81 | * @param primarySortKey 82 | */ 83 | public void processEvent(AWSGlue glueClient, AmazonDynamoDB dynamoDBClient, List sqsMessages, 84 | int numberofVersionsToRetain, String dynamoDBTableName, String hashKey, String rangeKey) { 85 | 86 | DDBUtil ddbUtil = new DDBUtil(); 87 | GlueUtil glueUtil = new GlueUtil(); 88 | List tblVersionsNotDeletedMasterList = new ArrayList(); 89 | 90 | for (SQSMessage sqsMessage : sqsMessages) { 91 | long executionId = System.currentTimeMillis(); 92 | // get Execution Batch Id from Message Attributes 93 | String executionBatchId = ""; 94 | for (Entry entry : sqsMessage.getMessageAttributes().entrySet()) { 95 | if ("ExecutionBatchId".equalsIgnoreCase(entry.getKey())) { 96 | executionBatchId = entry.getValue().getStringValue(); 97 | System.out.println("Execution Batch Id: " + executionBatchId); 98 | } 99 | } 100 | 101 | // de-serialize SQS message to GlueTable 102 | Gson gson = new Gson(); 103 | String message = new String(sqsMessage.getBody()); 104 | GlueTable glueTable = gson.fromJson(message, GlueTable.class); 105 | System.out.printf("Process event for table '%s' under database '%s' \n", glueTable.getTableName(), 106 | glueTable.getDatabaseName()); 107 | 108 | // get table versions 109 | List tableVersionList = glueUtil.getTableVersions(glueClient, glueTable.getTableName(), 110 | glueTable.getDatabaseName()); 111 | 112 | if (tableVersionList.size() > numberofVersionsToRetain) { 
113 | // identify the versions that are older than numberofVersionsToRetain 114 | List> lists = glueUtil.determineOldVersions(tableVersionList, glueTable.getTableName(), 115 | glueTable.getDatabaseName(), numberofVersionsToRetain); 116 | List versionsToKeep = lists.get(0); 117 | List versionsToDelete = lists.get(1); 118 | 119 | System.out.printf("For table '%s', versions to be deleted: %d, versions to be retaind: %d \n", 120 | glueTable.getTableName(), versionsToDelete.size(), versionsToKeep.size()); 121 | 122 | // delete older versions 123 | List tblVersionsNotDeletedList = glueUtil.deleteTableVersions(glueClient, 124 | versionsToDelete, glueTable.getTableName(), glueTable.getDatabaseName()); 125 | 126 | int numTableVersionsB4Cleanup = tableVersionList.size(); 127 | int numDeletedVersions = versionsToDelete.size() - tblVersionsNotDeletedList.size(); 128 | 129 | boolean itemInserted = ddbUtil.insertCleanupStatusToDynamoDB(dynamoDBClient, dynamoDBTableName, hashKey, 130 | rangeKey, executionId, executionBatchId, glueTable.getDatabaseName(), glueTable.getTableName(), 131 | numTableVersionsB4Cleanup, versionsToKeep.size(), numDeletedVersions); 132 | 133 | if (tblVersionsNotDeletedList.size() == 0) 134 | System.out.printf("Older versions of table '%s' under database '%s' were deleted. \n", 135 | glueTable.getTableName(), glueTable.getDatabaseName()); 136 | else 137 | tblVersionsNotDeletedMasterList.addAll(tblVersionsNotDeletedList); 138 | } else { 139 | System.out.printf("Table '%s' does not have more than %d versions. Skipping. \n", 140 | glueTable.getTableName(), numberofVersionsToRetain); 141 | } 142 | } 143 | } 144 | } -------------------------------------------------------------------------------- /src/main/java/software/aws/glue/tableversions/lambda/TableVersionsCleanupPlannerLambda.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: MIT-0

package software.aws.glue.tableversions.lambda;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Date;
import java.util.List;
import java.util.Optional;
import java.util.StringTokenizer;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

import com.amazonaws.regions.Regions;
import com.amazonaws.services.dynamodbv2.AmazonDynamoDB;
import com.amazonaws.services.dynamodbv2.AmazonDynamoDBClientBuilder;
import com.amazonaws.services.glue.AWSGlue;
import com.amazonaws.services.glue.AWSGlueClientBuilder;
import com.amazonaws.services.glue.model.Database;
import com.amazonaws.services.glue.model.Table;
import com.amazonaws.services.lambda.runtime.Context;
import com.amazonaws.services.lambda.runtime.RequestHandler;
import com.amazonaws.services.securitytoken.AWSSecurityTokenService;
import com.amazonaws.services.securitytoken.AWSSecurityTokenServiceClientBuilder;
import com.amazonaws.services.securitytoken.model.GetCallerIdentityRequest;
import com.amazonaws.services.securitytoken.model.GetCallerIdentityResult;
import com.amazonaws.services.sqs.AmazonSQS;
import com.amazonaws.services.sqs.AmazonSQSClientBuilder;
import com.google.gson.Gson;

import software.aws.glue.tableversions.utils.DDBUtil;
import software.aws.glue.tableversions.utils.GlueTable;
import software.aws.glue.tableversions.utils.GlueUtil;
import software.aws.glue.tableversions.utils.SQSUtil;

/**
 * This class has the AWS Lambda Handler method. Upon invocation, it takes the
 * following actions: 1. it fetches databases from the Glue Catalog (either
 * the ones named in the configuration or all of them) 2. for each database,
 * it fetches all of its tables 3. for each table, it publishes the table and
 * database names to an SQS queue and records the table in DynamoDB.
 *
 * @author Ravi Itha, Amazon Web Services, Inc.
 *
 */
public class TableVersionsCleanupPlannerLambda implements RequestHandler<Object, String> {

    /**
     * Lambda entry point. Reads its configuration from environment variables,
     * resolves the list of databases, and sends one SQS message per table.
     *
     * @param input   the invocation payload; it is only logged
     * @param context Lambda context, used for logging
     * @return a fixed completion message
     */
    @Override
    public String handleRequest(Object input, Context context) {

        // Every setting falls back to a sample value when the environment
        // variable is not set. Note that the database list falls back to two
        // placeholder names, not to "all databases".
        String separator = envOrDefault("separator", "$");
        String region = envOrDefault("region", Regions.US_EAST_1.getName());
        String databaseNamesStringLiteral = envOrDefault("database_names_string_literal", "database_1$database_2");
        String sqsQueueURI = envOrDefault("sqs_queue_url",
                "https://sqs.us-east-1.amazonaws.com/1234567890/table_versions_cleanup_planner_queue.fifo");
        String ddbTableName = envOrDefault("ddb_table_name", "glue_table_version_cleanup_planner");
        String hashKey = envOrDefault("hash_key", "execution_batch_id");
        String rangeKey = envOrDefault("range_key", "database_name_table_name");

        long executionBatchId = System.currentTimeMillis();

        // The account id of the caller is used as the home Glue catalog id.
        AWSSecurityTokenService client = AWSSecurityTokenServiceClientBuilder.standard().build();
        GetCallerIdentityRequest request = new GetCallerIdentityRequest();
        GetCallerIdentityResult response = client.getCallerIdentity(request);
        String homeCatalogId = response.getAccount();
        context.getLogger().log("Catalog Id: " + homeCatalogId);

        context.getLogger().log("Input: " + input);
        printEnvVariables(sqsQueueURI, databaseNamesStringLiteral, separator, region, ddbTableName, hashKey, rangeKey);

        // Create objects for AWS Glue, Amazon SQS and Amazon DynamoDB
        AWSGlue glue = AWSGlueClientBuilder.standard().withRegion(region).build();
        AmazonSQS sqs = AmazonSQSClientBuilder.standard().withRegion(region).build();
        AmazonDynamoDB dynamoDBClient = AmazonDynamoDBClientBuilder.standard().withRegion(region).build();

        DDBUtil ddbUtil = new DDBUtil();
        SQSUtil sqsUtil = new SQSUtil();
        GlueUtil glueUtil = new GlueUtil();
        List<Database> databaseList = new ArrayList<Database>();
        AtomicInteger numberOfTablesExported = new AtomicInteger();

        // When the database names literal is empty, the cleanup process is
        // initiated for all databases. Otherwise it is initiated only for the
        // databases named in the separator-delimited literal.
        if (databaseNamesStringLiteral.isEmpty()) {
            databaseList = glueUtil.getDatabases(glue, homeCatalogId);
        } else {
            List<String> databaseNames = tokenizeStrings(databaseNamesStringLiteral, separator);
            for (String databaseName : databaseNames) {
                // getDatabase returns null for a database that does not exist
                Database database = glueUtil.getDatabase(glue, homeCatalogId, databaseName);
                if (database != null) {
                    databaseList.add(database);
                }
            }
        }

        // One Gson instance serves every table; it keeps no per-call state.
        Gson gson = new Gson();
        List<Table> tableList = glueUtil.getTables(glue, databaseList, homeCatalogId);
        for (Table table : tableList) {
            GlueTable tableMessage = new GlueTable();
            tableMessage.setDatabaseName(table.getDatabaseName());
            tableMessage.setTableName(table.getName());
            String message = gson.toJson(tableMessage);

            // Write a message to Amazon SQS queue and, only when that
            // succeeded, record the table in DynamoDB.
            boolean messageSentToSQS = sqsUtil.sendTableSchemaToSQSQueue(sqs, sqsQueueURI, message, executionBatchId,
                    table.getDatabaseName());
            if (messageSentToSQS) {
                String messageSentTime = new Date().toString();
                numberOfTablesExported.incrementAndGet();
                ddbUtil.insertTableDetailsToDynamoDB(dynamoDBClient, ddbTableName, hashKey, rangeKey, executionBatchId,
                        table.getDatabaseName(), table.getName(), messageSentTime);
            }
        }
        System.out.printf("Number of messages written to SQS Queue: %d \n", numberOfTablesExported.get());
        return "TableVersionsCleanupPlannerLambda completed successfully!";
    }

    /**
     * Reads an environment variable, falling back to a default when it is not
     * set.
     *
     * @param name         environment variable name
     * @param defaultValue value to use when the variable is not set
     * @return the variable's value, or {@code defaultValue}
     */
    private static String envOrDefault(String name, String defaultValue) {
        return Optional.ofNullable(System.getenv(name)).orElse(defaultValue);
    }

    /**
     * This method prints the effective configuration values.
     *
     * @param sqsQueueURI                URL of the SQS queue
     * @param databaseNamesStringLiteral separator-delimited database names
     * @param separator                  separator used in the database names
     *                                   literal
     * @param region                     AWS region
     * @param ddbTableName               DynamoDB table name
     * @param hashKey                    DynamoDB hash key name
     * @param rangeKey                   DynamoDB range key name
     */
    public static void printEnvVariables(String sqsQueueURI, String databaseNamesStringLiteral, String separator,
            String region, String ddbTableName, String hashKey, String rangeKey) {
        System.out.println("Region: " + region);
        System.out.println("SQS URL: " + sqsQueueURI);
        System.out.println("Separator: " + separator);
        // Previously this line printed the SQS URL a second time.
        System.out.println("Database names string literal: " + databaseNamesStringLiteral);
        System.out.println("DynamoDB table Name: " + ddbTableName);
        System.out.println("DynamoDB table - hash key: " + hashKey);
        System.out.println("DynamoDB table - range key: " + rangeKey);
    }

    /**
     * This method tokenizes a string using a provided separator. Because it is
     * based on {@link StringTokenizer}, every character of {@code separator}
     * acts as a delimiter and empty tokens are not returned.
     *
     * @param str       string to split
     * @param separator delimiter characters
     * @return the tokens in order of appearance
     */
    public static List<String> tokenizeStrings(String str, String separator) {
        List<String> tokenList = Collections.list(new StringTokenizer(str, separator)).stream()
                .map(token -> (String) token).collect(Collectors.toList());
        return tokenList;
    }
}
--------------------------------------------------------------------------------
/README.md:
-------------------------------------------------------------------------------- 1 | # AWS Glue Table versions cleanup utility 2 | 3 | AWS Glue has soft limits for **Number of table versions per table** and **Number of table versions per account**. For more details on the soft-limits, refer to [AWS Glue endpoints and quotas](https://docs.aws.amazon.com/general/latest/gr/glue.html). AWS Glue Table versions cleanup utility helps you delete old versions of Glue Tables. It is developed using the AWS Glue SDK for Java and is deployed as two AWS Lambda functions. It retains the X most recent versions of each Table and deletes the rest. Using this utility, you will be able to keep per-table and account level soft-limits under control. It can be scheduled using Amazon CloudWatch Events, e.g. once a month. 4 | 5 | This utility comes in two forms: 6 | 7 | 1. Java - use [main](https://github.com/aws-samples/aws-glue-table-versions-cleanup-utility/tree/main) branch 8 | 1. Python - use [main-python](https://github.com/aws-samples/aws-glue-table-versions-cleanup-utility/tree/main-python) branch 9 | 10 | Note: This utility safely ignores Databases and Tables that are resource linked from another AWS account to the AWS account this utility is deployed into. In other words, this utility cleans up old versions of a table ONLY when the table belongs to the account this utility is deployed in. Refer to [How Resource Links Work in Lake Formation](https://docs.aws.amazon.com/lake-formation/latest/dg/resource-links-about.html) for more details. 11 | 12 | --- 13 | 14 | ## Architecture 15 | 16 | The Architecture of this utility is shown in the diagram below 17 | ![Alt](./src/main/resources/AWS_Glue_Table_versions_cleanup_utility.png) 18 | 19 | --- 20 | 21 | ## Application Overview 22 | 23 | ### Pre-requisites 24 | 25 | 1. JDK 8 26 | 1. An IDE, e.g. 
[Eclipse](https://www.eclipse.org/) or [Spring Tools](https://spring.io/tools) or [IntelliJ IDEA](https://www.jetbrains.com/idea/) 27 | 1. [Apache Maven](https://maven.apache.org/) 28 | 1. Access to an AWS account 29 | 1. [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) 30 | 31 | --- 32 | 33 | ### AWS Service Requirements 34 | 35 | The following AWS services are required to deploy this utility: 36 | 37 | 1. 2 AWS Lambda functions 38 | 1. 2 IAM roles 39 | 1. 1 Amazon SQS Queue 40 | 1. 2 Amazon DynamoDB tables 41 | 1. 1 AWS CloudWatch Event Rule 42 | 1. 1 Amazon S3 bucket to upload AWS Lambda Function binary 43 | 44 | ### Lambda Functions Overview 45 | 46 | | Class | Overview | 47 | |---------- | -------- | 48 | | [TableVersionsCleanupPlannerLambda](./src/main/java/software/aws/glue/tableversions/lambda/TableVersionsCleanupPlannerLambda.java) | Lambda Function gets a list of tables for all databases and initiates the cleanup process.| 49 | | [TableVersionsCleanupLambda](./src/main/java/software/aws/glue/tableversions/lambda/TableVersionsCleanupLambda.java) | Lambda Function deletes old versions of a table. | 50 | 51 | --- 52 | 53 | ## Build Instructions 54 | 55 | 1. Clone this code repo to your Laptop / MacBook 56 | 1. This is a Maven project, so you can import it into your IDE. 57 | 1. Build a Jar file using one of the steps below: 58 | 1. Using standalone Maven, go to project home directory and run command ```mvn -X clean install``` 59 | 1. From Eclipse or STS, run command ```-X clean install```. Navigation: Project right click --> Run As --> Maven Build (Option 4) 60 | 1. This will generate a jar file ```glue-tableversions-cleanup-0.1.jar``` 61 | 1. Note: The size of the jar file is around 16 MB 62 | 63 | --- 64 | 65 | ## Deployment Instructions 66 | 67 | 1. Log onto AWS console, select S3, select a bucket you want to use. If you do not have a bucket already, create one 68 | 1. 
Create a folder named ```table_version_cleanup_lambda_jar``` 69 | 1. Open command prompt on your Laptop / MacBook 70 | 1. Upload Lambda function Jar file to S3 bucket 71 | 72 | ```bash 73 | aws s3 cp glue-tableversions-cleanup-0.1.jar s3://<bucket-name>/table_version_cleanup_lambda_jar/ 74 | ``` 75 | 76 | 1. Create an Amazon SQS queue with the below details: 77 | 1. name = ```table_versions_cleanup_planner_queue.fifo``` 78 | 1. Type = FIFO 79 | 1. Configuration: 80 | 1. Visibility timeout = 15 minutes 81 | 1. Message retention period = 4 Days 82 | 1. Delivery delay = 0 seconds 83 | 1. Content-based deduplication = enable 84 | 85 | 1. Create DynamoDB tables 86 | 87 | | Table | Schema | Capacity | 88 | |--------------|--------|-----------| 89 | | glue_table_version_cleanup_planner | Primary partition key - execution_batch_id (Number), Primary sort key - database_name_table_name (String) | Provisioned read capacity units = 5, Provisioned write capacity units = 10 | 90 | | glue_table_version_cleanup_statistics | Primary partition key - execution_id (Number), Primary sort key - execution_batch_id (Number) | Provisioned read capacity units = 5, Provisioned write capacity units = 10 | 91 | 92 | 1. Create IAM policies that are common to both Lambda functions 93 | 1. Amazon DynamoDB policy 94 | 1. name = ```table_versions_cleanup_lambda_dynamodb_policy``` 95 | 1. sample policy = [table_versions_cleanup_lambda_dynamodb_policy](./src/main/resources/table_versions_cleanup_lambda_dynamodb_policy.json) 96 | 1. Amazon CloudWatch policy 97 | 1. name = ```table_versions_cleanup_lambda_cloudwatch_policy``` 98 | 1. sample policy = [table_versions_cleanup_cloudwatch_logs_policy](./src/main/resources/table_versions_cleanup_cloudwatch_logs_policy.json) 99 | 100 | 1. Create IAM policies for **TableVersionsCleanupPlannerLambdaExecRole** 101 | 102 | 1. AWS Glue policy 103 | 1. name = ```table_versions_cleanup_planner_lambda_glue_policy``` 104 | 1. 
sample policy = [table_versions_cleanup_planner_lambda_glue_policy](./src/main/resources/table_versions_cleanup_planner_lambda_glue_policy.json) 105 | 1. Amazon SQS policy 106 | 1. name = ```table_versions_cleanup_planner_lambda_sqs_policy``` 107 | 1. sample policy = [table_versions_cleanup_planner_lambda_sqs_policy](./src/main/resources/table_versions_cleanup_planner_lambda_sqs_policy.json) 108 | 109 | 1. Create IAM policies for **TableVersionsCleanupLambdaExecRole** 110 | 111 | 1. AWS Glue policy 112 | 1. name = ```table_versions_cleanup_lambda_glue_policy``` 113 | 1. sample policy = [table_versions_cleanup_lambda_glue_policy](./src/main/resources/table_versions_cleanup_lambda_glue_policy.json) 114 | 1. Amazon SQS policy 115 | 1. name = ```table_versions_cleanup_lambda_sqs_policy``` 116 | 1. sample policy = [table_versions_cleanup_lambda_sqs_policy](./src/main/resources/table_versions_cleanup_lambda_sqs_policy.json) 117 | 118 | 1. Create an IAM role with name ```TableVersionsCleanupPlannerLambdaExecRole``` and attach below policies: 119 | 1. table_versions_cleanup_lambda_dynamodb_policy 120 | 1. table_versions_cleanup_lambda_cloudwatch_policy 121 | 1. table_versions_cleanup_planner_lambda_sqs_policy 122 | 1. table_versions_cleanup_planner_lambda_glue_policy 123 | 124 | 1. Create an IAM role with name ```TableVersionsCleanupLambdaExecRole``` and attach below policies: 125 | 1. table_versions_cleanup_lambda_sqs_policy 126 | 1. table_versions_cleanup_lambda_glue_policy 127 | 1. table_versions_cleanup_lambda_dynamodb_policy 128 | 1. table_versions_cleanup_lambda_cloudwatch_policy 129 | 130 | 1. Deploy **TableVersionsCleanupPlannerLambda** function 131 | 132 | 1. Runtime = Java 8 133 | 1. IAM Execution role = ```TableVersionsCleanupPlannerLambdaExecRole``` 134 | 1. Function package = ```s3://<bucket-name>/table_version_cleanup_lambda_jar/glue-tableversions-cleanup-0.1.jar``` 135 | 1. 
Lambda Handler = ```software.aws.glue.tableversions.lambda.TableVersionsCleanupPlannerLambda``` 136 | 1. Timeout = e.g. 15 minutes 137 | 1. Memory = e.g. 128 MB 138 | 1. Environment variable = as defined in the following table 139 | 140 | | Variable Name | E.g. Value | Description | 141 | |----------------|------------ | -------------| 142 | | database_names_string_literal | database_1$database_2$database_3 | database names string literal separated by a separator token | 143 | | separator | $ | The separator used in database_names_string_literal | 144 | | region | us-east-1 | AWS region used | 145 | | sqs_queue_url | ```https://sqs.us-east-1.amazonaws.com/<account-id>/table_versions_cleanup_planner_queue.fifo``` | SQS queue URL used | 146 | | ddb_table_name | glue_table_version_cleanup_planner | DynamoDB Table used | 147 | | hash_key | execution_batch_id | Primary partition key used | 148 | | range_key | database_name_table_name | Primary sort key used | 149 | 150 | 1. Deploy **TableVersionsCleanupLambda** function 151 | 152 | 1. Runtime = Java 8 153 | 1. IAM Execution role = ```TableVersionsCleanupLambdaExecRole``` 154 | 1. Function package = ```s3://<bucket-name>/table_version_cleanup_lambda_jar/glue-tableversions-cleanup-0.1.jar``` 155 | 1. Lambda Handler = ```software.aws.glue.tableversions.lambda.TableVersionsCleanupLambda``` 156 | 1. Timeout = e.g. 15 minutes 157 | 1. Memory = e.g. 192 MB 158 | 1. Environment variable = as defined in the following table 159 | 160 | | Variable Name | E.g. Variable Value | Description | 161 | |----------------|------------ | ------------------| 162 | | region | us-east-1 | AWS region used | 163 | | number_of_versions_to_retain | 100 | Number of most recent versions to retain per table | 164 | | ddb_table_name | glue_table_version_cleanup_statistics | DynamoDB Table used | 165 | | hash_key | execution_id | Primary partition key used | 166 | | range_key | execution_batch_id | Primary sort key used | 167 | 168 | 1. 
Add an SQS trigger and select ```table_versions_cleanup_planner_queue.fifo``` 169 | 170 | 1. Create a CloudWatch Event Rule and add **TableVersionsCleanupPlannerLambda** as its target. 171 | Refer the following AWS documentation for more details: 172 | 173 | 1. [Schedule Expressions for Rules](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/ScheduledEvents.html) 174 | 1. [Tutorial: Schedule AWS Lambda Functions Using CloudWatch Events](https://docs.aws.amazon.com/AmazonCloudWatch/latest/events/RunLambdaSchedule.html) 175 | 176 | --- 177 | 178 | ## Contributors 179 | 180 | 1. Ravi Itha, Senior Big Data Consultant, Amazon Web Services, Inc. 181 | 1. Phanee Gottumukkala, Associate Cloud Developer, Amazon Web Services, Inc. 182 | 1. Julia Kroll, Data & ML Engineer, Amazon Web Services, Inc. 183 | 184 | --- 185 | 186 | ## License Summary 187 | 188 | This sample code is made available under the MIT-0 license. See the LICENSE file. 189 | -------------------------------------------------------------------------------- /src/main/java/software/aws/glue/tableversions/utils/GlueUtil.java: -------------------------------------------------------------------------------- 1 | // Copyright 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved. 
// SPDX-License-Identifier: MIT-0

package software.aws.glue.tableversions.utils;

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Optional;
import java.util.stream.Collectors;

import com.amazonaws.services.glue.AWSGlue;
import com.amazonaws.services.glue.model.BatchDeleteTableVersionRequest;
import com.amazonaws.services.glue.model.BatchDeleteTableVersionResult;
import com.amazonaws.services.glue.model.Database;
import com.amazonaws.services.glue.model.EntityNotFoundException;
import com.amazonaws.services.glue.model.GetDatabaseRequest;
import com.amazonaws.services.glue.model.GetDatabaseResult;
import com.amazonaws.services.glue.model.GetDatabasesRequest;
import com.amazonaws.services.glue.model.GetDatabasesResult;
import com.amazonaws.services.glue.model.GetTableVersionsRequest;
import com.amazonaws.services.glue.model.GetTableVersionsResult;
import com.amazonaws.services.glue.model.GetTablesRequest;
import com.amazonaws.services.glue.model.GetTablesResult;
import com.amazonaws.services.glue.model.Table;
import com.amazonaws.services.glue.model.TableVersion;
import com.amazonaws.services.glue.model.TableVersionError;
import com.google.common.collect.Lists;

/**
 * Helper methods around the AWS Glue client: listing databases, tables and
 * table versions (following pagination tokens), and deleting table versions
 * in batches.
 */
public class GlueUtil {

    /** Number of version ids sent in one BatchDeleteTableVersion request. */
    private static final int DELETE_BATCH_SIZE = 100;

    /**
     * Get all versions of a table, following the pagination token until it is
     * exhausted.
     *
     * @param glueClient   Glue client
     * @param tableName    table whose versions are listed
     * @param databaseName database the table belongs to
     * @return every version returned by Glue, in the order Glue returned them
     */
    public List<TableVersion> getTableVersions(AWSGlue glueClient, String tableName, String databaseName) {

        List<TableVersion> masterTableVersionList = new ArrayList<TableVersion>();
        String nextToken = null;
        do {
            GetTableVersionsRequest getTableVersionsRequest = new GetTableVersionsRequest();
            getTableVersionsRequest.setTableName(tableName);
            getTableVersionsRequest.setDatabaseName(databaseName);
            // null on the first page, which is the same as not setting a token
            getTableVersionsRequest.setNextToken(nextToken);

            GetTableVersionsResult getTableVersionsResult = glueClient.getTableVersions(getTableVersionsRequest);
            masterTableVersionList.addAll(getTableVersionsResult.getTableVersions());
            nextToken = getTableVersionsResult.getNextToken();
        } while (nextToken != null);
        return masterTableVersionList;
    }

    /**
     * Get all tables of the given databases, skipping tables that are resource
     * links (tables whose target table is set).
     *
     * @param glue          Glue client
     * @param databaseList  databases whose tables are listed
     * @param homeCatalogId accepted for symmetry with the other methods; it is
     *                      not used by this method
     * @return all non-resource-link tables of all given databases
     */
    public List<Table> getTables(AWSGlue glue, List<Database> databaseList, String homeCatalogId) {

        List<Table> masterTableList = new ArrayList<Table>();
        // Iterate through all the databases
        for (Database db : databaseList) {
            String databaseName = db.getName();
            String nextToken = null;
            do {
                GetTablesRequest getTablesRequest = new GetTablesRequest();
                getTablesRequest.setDatabaseName(databaseName);
                getTablesRequest.setNextToken(nextToken);

                GetTablesResult getTablesResult = glue.getTables(getTablesRequest);
                for (Table table : getTablesResult.getTableList()) {
                    if (table.getTargetTable() == null) {
                        masterTableList.add(table);
                    } else {
                        System.out.printf("Table '%s' under database '%s' seems to have resource linked from AWS Account Id: '%s'. So, it will be skipped. \n",
                                table.getName(), table.getDatabaseName(), table.getTargetTable().getCatalogId());
                    }
                }
                nextToken = getTablesResult.getNextToken();
            } while (nextToken != null);
        }
        return masterTableList;
    }

    /**
     * This method gets an AWS Glue Database based on a provided name.
     *
     * @param glue          Glue client
     * @param homeCatalogId catalog id the database is looked up in
     * @param databaseName  name of the database
     * @return the database, or {@code null} when Glue reports that it does not
     *         exist
     */
    public Database getDatabase(AWSGlue glue, String homeCatalogId, String databaseName) {
        Database database = null;
        GetDatabaseRequest getDatabaseRequest = new GetDatabaseRequest();
        getDatabaseRequest.setName(databaseName);
        getDatabaseRequest.setCatalogId(homeCatalogId);
        try {
            GetDatabaseResult getDatabaseResult = glue.getDatabase(getDatabaseRequest);
            database = getDatabaseResult.getDatabase();
        } catch (EntityNotFoundException exception) {
            // A missing database is reported and skipped, not treated as fatal.
            System.out.printf(
                    "There is no database exist with name '%s' in AWS Account %s. It may be possible it is a resource linked from other database. "
                            + "Hence, it will be skipped from clean-up process. \n",
                    databaseName, homeCatalogId);
        }
        return database;
    }

    /**
     * Method to get all databases, skipping databases that are resource links
     * (databases whose target database is set).
     *
     * @param glue          Glue client
     * @param homeCatalogId accepted for symmetry with the other methods; it is
     *                      not used by this method
     * @return all non-resource-link databases
     */
    public List<Database> getDatabases(AWSGlue glue, String homeCatalogId) {
        List<Database> masterDatabaseList = new ArrayList<Database>();
        String nextToken = null;
        do {
            GetDatabasesRequest getDatabasesRequest = new GetDatabasesRequest();
            getDatabasesRequest.setNextToken(nextToken);

            GetDatabasesResult getDatabasesResult = glue.getDatabases(getDatabasesRequest);
            // filter databases that are resource linked
            for (Database database : getDatabasesResult.getDatabaseList()) {
                if (database.getTargetDatabase() == null) {
                    masterDatabaseList.add(database);
                } else {
                    System.out.printf("Database '%s' seems to have resource linked from AWS Account Id: '%s'. So, it will be skipped. \n",
                            database.getName(), database.getTargetDatabase().getCatalogId());
                }
            }
            nextToken = getDatabasesResult.getNextToken();
        } while (nextToken != null);
        return masterDatabaseList;
    }

    /**
     * Method to delete a list of table versions. The ids are sent to Glue in
     * batches of at most 100.
     *
     * @param glueClient             Glue client
     * @param listofVersionsToDelete version ids to delete
     * @param tableName              table the versions belong to
     * @param databaseName           database the table belongs to
     * @return one entry for every version Glue reported as not deleted; empty
     *         when every deletion succeeded
     */
    public List<TableVersionStatus> deleteTableVersions(AWSGlue glueClient, List<Integer> listofVersionsToDelete,
            String tableName, String databaseName) {

        List<TableVersionStatus> versionsNotDeleted = new ArrayList<TableVersionStatus>();

        // Lists.partition never yields an empty sub-list, so every iteration
        // has at least one id to delete.
        for (List<Integer> smallerList : Lists.partition(listofVersionsToDelete, DELETE_BATCH_SIZE)) {
            List<String> versionIdSmallerList = smallerList.stream().map(versionId -> Integer.toString(versionId))
                    .collect(Collectors.toList());

            BatchDeleteTableVersionRequest batchDeleteTableVersionRequest = new BatchDeleteTableVersionRequest();
            batchDeleteTableVersionRequest.setDatabaseName(databaseName);
            batchDeleteTableVersionRequest.setTableName(tableName);
            batchDeleteTableVersionRequest.setVersionIds(versionIdSmallerList);

            // execute batchDelete operation
            BatchDeleteTableVersionResult batchDeleteTableVersionResult = glueClient
                    .batchDeleteTableVersion(batchDeleteTableVersionRequest);

            // Collect the versions that were rejected so the caller can report them
            List<TableVersionError> tableVersionErrors = batchDeleteTableVersionResult.getErrors();
            if (tableVersionErrors.isEmpty()) {
                System.out.printf(
                        "Up to 100 table versions deleted successfully for table '%s' under database '%s' \n",
                        tableName, databaseName);
            } else {
                for (TableVersionError tvError : tableVersionErrors) {
                    TableVersionStatus tvStatus = new TableVersionStatus();
                    tvStatus.setDatabaseName(databaseName);
                    tvStatus.setDeleted(false);
                    tvStatus.setTableName(tvError.getTableName());
                    tvStatus.setVersionId(tvError.getVersionId());
                    versionsNotDeleted.add(tvStatus);
                }
            }
        }
        return versionsNotDeleted;
    }

    /**
     * Method to determine which table versions to keep and which to delete.
     * Version ids are parsed as integers and sorted in descending order; the
     * first {@code numberofVersionsToKeep} ids are kept and the rest are
     * marked for deletion.
     *
     * @param tableVersionList       all versions of the table
     * @param tableName              table name, used for logging only
     * @param databaseName           not used by this method
     * @param numberofVersionsToKeep how many of the highest version ids to
     *                               keep; values below 1 keep nothing, values
     *                               at or above the list size keep everything
     * @return a list of exactly two lists: index 0 holds the version ids to
     *         keep, index 1 the version ids to delete, both in descending
     *         order
     */
    public List<List<Integer>> determineOldVersions(List<TableVersion> tableVersionList, String tableName,
            String databaseName, int numberofVersionsToKeep) {

        List<Integer> versionIdList = new ArrayList<Integer>(tableVersionList.size());
        for (TableVersion tableVersion : tableVersionList) {
            versionIdList.add(Integer.parseInt(tableVersion.getVersionId()));
        }
        // sort the versions in descending order
        Collections.sort(versionIdList, Collections.reverseOrder());
        System.out.printf("%d table versions found for table: %s \n", versionIdList.size(), tableName);
        // An empty list has no latest version; skip the line instead of failing.
        if (!versionIdList.isEmpty()) {
            System.out.printf("%d is the current (latest) version of the table: %s \n", versionIdList.get(0),
                    tableName);
        }

        // Split by position: the head of the sorted list is retained, the tail
        // is deleted. Clamp the split point so out-of-range values are safe.
        int keepCount = Math.max(0, Math.min(numberofVersionsToKeep, versionIdList.size()));
        List<List<Integer>> lists = new ArrayList<List<Integer>>(2);
        lists.add(new ArrayList<Integer>(versionIdList.subList(0, keepCount)));
        lists.add(new ArrayList<Integer>(versionIdList.subList(keepCount, versionIdList.size())));
        return lists;
    }

}
--------------------------------------------------------------------------------