├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── hippolyte ├── __init__.py ├── aws_utils.py ├── config_util.py ├── dynamodb_backup.py ├── dynamodb_booster.py ├── monitor.py ├── multiple.template ├── pipeline_scheduler.py ├── pipeline_translator.py ├── project_config.py └── utils.py ├── requirements-dev.txt ├── requirements.txt ├── serverless.yml └── tests ├── __init__.py ├── resources └── test_backup_metadata.json ├── test.py ├── test_dynamodb_backup.py ├── test_dynamodb_booster.py ├── test_monitor.py └── test_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *.cover 46 | .hypothesis/ 47 | 48 | 49 | # Serverless directories 50 | .serverless 51 | .requirements 52 | 53 | # Pycharm 54 | .idea/ 55 | 56 | # Node.js 57 | node_modules -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | 3 | python: 4 | - "2.7" 5 | 6 | env: 7 | - AWS_DEFAULT_REGION=us-east-1 TRAVIS_NODE_VERSION=6.11.0 SERVERLESS_VERSION=1.16.1 SERVERLESS_PYTHONR_VERSION=2.3.3 8 | 9 | install: 10 | - rm -rf ~/.nvm && git clone https://github.com/creationix/nvm.git ~/.nvm && (cd ~/.nvm && git checkout `git describe --abbrev=0 --tags`) && source ~/.nvm/nvm.sh && nvm install $TRAVIS_NODE_VERSION 11 | - npm install serverless@$SERVERLESS_VERSION -g 12 | - npm install --save serverless-python-requirements@$SERVERLESS_PYTHONR_VERSION 13 | - pip install -r requirements-dev.txt 14 | 15 | script: 16 | - python tests/test.py 17 | - serverless deploy --region $AWS_DEFAULT_REGION --stage dev --email test@test.com --noDeploy 18 | - serverless deploy --region $AWS_DEFAULT_REGION --stage prod --email test@test.com --noDeploy 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hippolyte [![Build Status](https://travis-ci.org/ocadotechnology/hippolyte.svg?branch=master)](https://travis-ci.org/ocadotechnology/hippolyte) [![Gitter](https://img.shields.io/gitter/room/TechnologyAdvice/Stardust.svg)](https://gitter.im/ocado-hippolyte) 2 | _Project Discontinued: AWS released [native DynamoDB backups](https://aws.amazon.com/blogs/aws/new-for-amazon-dynamodb-global-tables-and-on-demand-backup/)._ 3 | 4 | Hippolyte is an at-scale, point-in-time backup solution for DynamoDB. It is designed to handle frequent, recurring backups of large numbers of tables, scale read throughput, and batch backup jobs together over multiple EMR clusters. 5 | 6 | ## Deployment 7 | Hippolyte is deployed with the [Serverless Framework](https://serverless.com/). We have tested it with Node.js 6.11 / Serverless 1.16.1 / serverless-python-requirements 2.3.3; these can be installed with `npm`. To start, run: 8 | ``` 9 | npm install serverless@1.16.1 10 | npm install --save serverless-python-requirements@2.3.3 11 | ``` 12 | To configure the project for your Amazon accounts, update `hippolyte/project_config.py` with the details of the account in which you intend to run the backup process. You will also need AWS credentials for creating all the dependent resources: 13 | * Lambda function 14 | * CloudWatch scheduled events 15 | * SNS topic 16 | 17 | To deploy the stack run 18 | `serverless deploy --region <region> --stage <stage> --email <email>` 19 | You can update `serverless.yml` to associate your credentials with stages if you intend to deploy multiple instances of the service. The email setting is optional and uses SNS to alert the provided address about any failed pipelines or tables. 20 | 21 | ## Motivation 22 | Since DynamoDB is a fully managed service and supports cross-region replication, you may wonder why you need to back up data in the first place. If you're running production applications on AWS then you probably already have a lot of confidence in the durability of data in services like DynamoDB or S3. 23 | 24 | Our motivation for building this was to protect against application or user error. No matter how durable Amazon's services are, they won't protect you from unintended updates and deletes. Historical snapshots of data and state also provide additional value, allowing you to restore to a separate table and compare against live data. 25 | 26 | ## Design 27 | We've chosen [Amazon Data Pipeline](https://aws.amazon.com/datapipeline/) as the tool to create, manage and run our backup tasks. Data Pipeline helps with orchestration, automatic retries for failed jobs and the potential to make use of SNS notifications for successful or failed EMR tasks. 28 | 29 | We also use AWS Lambda to schedule and monitor backup jobs. This function is responsible for dynamically generating Data Pipeline templates based on configuration and discovered tables, and for modifying table throughputs to reduce the duration of the backup job.
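Under the hood, the scheduling Lambda drives each backup through three Data Pipeline API calls: create a pipeline, upload the rendered definition, then activate it. The snippet below is a minimal, illustrative sketch of that lifecycle using boto3; the real logic lives in `hippolyte/aws_utils.py` and `hippolyte/pipeline_scheduler.py`, and the pipeline name, log bucket and lone `Default` object shown here are placeholders for the objects rendered from `multiple.template`.
```
import boto3
from uuid import uuid4

# Illustrative only: Hippolyte renders the full object list (DynamoDBDataNode,
# EmrActivity, S3DataNode, EmrCluster) from multiple.template; this sketch just
# shows the create -> put definition -> activate sequence with a placeholder
# Default object, pipeline name and log bucket.
client = boto3.client('datapipeline')

pipeline_id = client.create_pipeline(
    name='dynamodb-backup-example',   # the real code appends a timestamp
    uniqueId=str(uuid4())
)['pipelineId']

pipeline_objects = [
    {
        'id': 'Default',
        'name': 'Default',
        'fields': [
            {'key': 'scheduleType', 'stringValue': 'ONDEMAND'},
            {'key': 'role', 'stringValue': 'DataPipelineDefaultRole'},
            {'key': 'resourceRole', 'stringValue': 'DataPipelineDefaultResourceRole'},
            {'key': 'pipelineLogUri', 'stringValue': 's3://example-pipeline-log-bucket/'},
        ],
    },
    # ...per-table DynamoDBDataNode / EmrActivity / S3DataNode objects go here...
]

client.put_pipeline_definition(pipelineId=pipeline_id, pipelineObjects=pipeline_objects)
client.activate_pipeline(pipelineId=pipeline_id)
```
The monitoring invocation later lists and describes pipelines to find finished runs and deletes them, mirroring `DataPipelineUtil` in `hippolyte/aws_utils.py`.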
30 | 31 | ## Scaling 32 | Part of the job of our scheduling Lambda function is to assign DynamoDB tables as optimally as possible to the individual EMR clusters that will be created. Since new tables may be created each day and table sizes may grow significantly, this optimisation is performed each night during the scheduling step. By default, each data pipeline supports only 100 objects; this means each pipeline can hold at most 32 tables, because each table requires 3 Data Pipeline objects: 33 | 34 | * DynamoDBDataNode 35 | * EmrActivity 36 | * S3DataNode 37 | 38 | Beyond these, 2 further objects are needed: the pipeline configuration and an EmrCluster node, giving 32 * 3 + 2 = 98. In addition to this hard limit we also want every backup to run between 12:00 AM and 7:00 AM. We can work out how long each pipeline will take to complete by starting with some static values: 39 | 40 | * EMR cluster bootstrap (10 min) 41 | * EMR activity bootstrap (1 min) 42 | 43 | From there we calculate how long each table will take to back up with the following formula: 44 | 45 | $$$ 46 | Duration = \frac{Size}{RCU * ConsumedPercentage * 4096\ bytes/second} 47 | $$$ 48 | 49 | Where _Size_ is the table size in bytes, _RCU_ is the provisioned Read Capacity Units for a given table and _ConsumedPercentage_ is the proportion of this capacity the backup job will use. Since each EMR cluster runs its backup jobs sequentially and we have limits on both the number of tables and the length of time, we can pack each pipeline with tables until one of those two constraints is met. 50 | 51 | Additionally, some tables are too large to be backed up in a timely manner with their provisioned read capacity. Here we derive the ratio between the expected backup duration and the desired one, and increase the read capacity units by this ratio. We can also increase the percentage of provisioned throughput we consume while preserving the original amount needed by the application. Since we pay for clusters and capacity by the hour, it is rarely worth reducing the total expected duration below an hour. 52 | 53 | ## Restore 54 | The restore process is also performed with Data Pipeline. The target table needs to be created manually and has to have the same: 55 | 56 | * partition key 57 | * sort key 58 | * secondary indices 59 | 60 | as the original table. We also recommend setting the write capacity to 1000 for the duration of the restore. 61 | 62 | 63 | Note the subnet ID of your EMR-Subnet, as you'll need it later. 64 | Go to the Data Pipeline web console and create a new pipeline. In our example, the values would be: 65 | 66 | * Name: restore-test 67 | * Source: Build using a template (Import DynamoDB backup data from S3) 68 | * Input S3 folder: s3://hippolyte-eu-west-1-prod-backups/table_name/2017-02-22-00-10-39/ 69 | * Target DynamoDB table name: table_name 70 | * DynamoDB write throughput ratio: 1 (use full speed, as we are the only users now) 71 | * Region of DynamoDB table: eu-west-1 72 | * Schedule: Run (on pipeline activation) 73 | * Logging: Enabled (s3://hippolyte-eu-west-1-prod-backups/logs/) 74 | * IAM Roles: Default 75 | 76 | Do not click Activate yet, as at the time of writing that default template is missing some mandatory parameters. Instead click Edit in Architect.
78 | Now click EmrCluster on diagram and go to Add an optional field… find terminateAfter and set it to a value, high above estimated restore duration, like 3 Days. Add in the same place Subnet Id and set it to EMR-Subnet. Save it and click Activate. 79 | 80 | ## Need help with the setup? 81 | Please pm me: romek.rjm@gmail.com 82 | -------------------------------------------------------------------------------- /hippolyte/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = "roman.subik" 2 | -------------------------------------------------------------------------------- /hippolyte/aws_utils.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | import time 3 | import json 4 | from uuid import uuid4 5 | import boto3 6 | from botocore.exceptions import ClientError 7 | from retrying import retry 8 | import hippolyte.pipeline_translator as pipeline_translator 9 | from hippolyte.utils import chunks 10 | 11 | 12 | def retry_if_throttling_error(exception): 13 | if isinstance(exception, ClientError): 14 | return 'Throttling' in exception.message or 'limit exceeded' in exception.message 15 | 16 | return False 17 | 18 | 19 | class DataPipelineUtil(object): 20 | def __init__(self): 21 | self.client = boto3.client('datapipeline') 22 | 23 | @retry(retry_on_exception=retry_if_throttling_error, 24 | wait_exponential_multiplier=1000, 25 | stop_max_attempt_number=5) 26 | def create_pipeline(self, name=None): 27 | if not name: 28 | name = "dynamodb-backup-" + datetime.now().strftime("%Y-%m-%d-%H-%M-%S") 29 | 30 | return self.client.create_pipeline( 31 | name=name, 32 | uniqueId=str(uuid4()), 33 | description="Used to do automatic DynamoDB backups.", 34 | tags=[ 35 | { 36 | "key": "app", 37 | "value": "hippolyte-datapipeline" 38 | } 39 | ] 40 | ) 41 | 42 | @retry(retry_on_exception=retry_if_throttling_error, 43 | wait_exponential_multiplier=1000, 44 | stop_max_attempt_number=5) 45 | def put_pipeline_definition(self, pipeline_id, definition): 46 | return self.client.put_pipeline_definition( 47 | pipelineId=pipeline_id, 48 | pipelineObjects=pipeline_translator.definition_to_api_objects(definition), 49 | parameterObjects=pipeline_translator.definition_to_api_parameters(definition), 50 | parameterValues=pipeline_translator.definition_to_parameter_values(definition) 51 | ) 52 | 53 | @retry(retry_on_exception=retry_if_throttling_error, 54 | wait_exponential_multiplier=1000, 55 | stop_max_attempt_number=5) 56 | def activate_pipeline(self, pipeline_id, definition): 57 | return self.client.activate_pipeline( 58 | pipelineId=pipeline_id, 59 | parameterValues=pipeline_translator.definition_to_parameter_values(definition) 60 | ) 61 | 62 | @retry(retry_on_exception=retry_if_throttling_error, 63 | wait_exponential_multiplier=1000, 64 | stop_max_attempt_number=5) 65 | def list_pipelines(self): 66 | pipelines = [] 67 | paginator = self.client.get_paginator('list_pipelines') 68 | page_iterator = paginator.paginate(PaginationConfig={ 69 | 'MaxItems': 1000 70 | }) 71 | 72 | for page in page_iterator: 73 | pipelines += page.get("pipelineIdList", []) 74 | 75 | return pipelines 76 | 77 | @retry(retry_on_exception=retry_if_throttling_error, 78 | wait_exponential_multiplier=1000, 79 | stop_max_attempt_number=5) 80 | def describe_pipelines(self): 81 | pipeline_list = self.list_pipelines() 82 | pipeline_ids = map(lambda x: x['id'], pipeline_list) 83 | pipeline_ids_chunked = list(chunks(pipeline_ids, 25)) 84 | 
pipeline_descriptions = [] 85 | 86 | for pipeline_id in pipeline_ids_chunked: 87 | descriptions = self.client.describe_pipelines(pipelineIds=pipeline_id) 88 | pipeline_descriptions += descriptions['pipelineDescriptionList'] 89 | 90 | return pipeline_descriptions 91 | 92 | @retry(retry_on_exception=retry_if_throttling_error, 93 | wait_exponential_multiplier=1000, 94 | stop_max_attempt_number=5) 95 | def delete_pipeline(self, pipeline_id): 96 | self.client.delete_pipeline(pipelineId=pipeline_id) 97 | 98 | 99 | class DynamoDBUtil(object): 100 | def __init__(self): 101 | self.client = boto3.client('dynamodb') 102 | 103 | @retry(retry_on_exception=retry_if_throttling_error, 104 | wait_exponential_multiplier=1000, 105 | stop_max_attempt_number=5) 106 | def list_tables(self): 107 | tables = [] 108 | paginator = self.client.get_paginator('list_tables') 109 | page_iterator = paginator.paginate(PaginationConfig={ 110 | 'MaxItems': 10000, 111 | 'PageSize': 100 112 | }) 113 | 114 | for page in page_iterator: 115 | tables += page.get("TableNames", []) 116 | 117 | return tables 118 | 119 | @retry(retry_on_exception=retry_if_throttling_error, 120 | wait_exponential_multiplier=1000, 121 | stop_max_attempt_number=5) 122 | def describe_table(self, table_name): 123 | return self.client.describe_table(TableName=table_name) 124 | 125 | def describe_tables(self, table_names): 126 | table_descriptions = [] 127 | 128 | for table_name in table_names: 129 | table_descriptions.append(self.describe_table(table_name)) 130 | 131 | return table_descriptions 132 | 133 | @retry(retry_on_exception=retry_if_throttling_error, 134 | wait_exponential_multiplier=1000, 135 | stop_max_attempt_number=5) 136 | def describe_limits(self): 137 | return self.client.describe_limits() 138 | 139 | @retry(retry_on_exception=retry_if_throttling_error, 140 | wait_exponential_multiplier=1000, 141 | stop_max_attempt_number=5) 142 | def batch_write_items(self, table_name, items): 143 | table = self.client.Table(TableName=table_name) 144 | 145 | with table.batch_writer() as batch: 146 | for item in items: 147 | batch.put_item(Item=item) 148 | 149 | @retry(retry_on_exception=retry_if_throttling_error, 150 | wait_exponential_multiplier=1000, 151 | stop_max_attempt_number=5) 152 | def update_item(self, table_name, key, update_expression, expression_attribute_values): 153 | table = self.client.Table(TableName=table_name) 154 | table.update_item(Key=key, UpdateExpression=update_expression, 155 | ExpressionAttributeValues=expression_attribute_values) 156 | 157 | @retry(retry_on_exception=retry_if_throttling_error, 158 | wait_exponential_multiplier=1000, 159 | stop_max_attempt_number=5) 160 | def change_capacity_units(self, table_name, new_read_throughput=None, new_write_throughput=None): 161 | table_description = self.describe_table(table_name).get('Table', {}) 162 | 163 | throughput, requires_update = self._get_adjusted_throughput(table_description, 164 | new_read_throughput, new_write_throughput) 165 | 166 | if requires_update: 167 | self.client.update_table(TableName=table_name, ProvisionedThroughput=throughput) 168 | 169 | def _get_adjusted_throughput(self, table_description, new_read_throughput, new_write_throughput): 170 | current_throughput = table_description.get('ProvisionedThroughput') 171 | current_read_throughput = current_throughput.get('ReadCapacityUnits') 172 | current_write_throughput = current_throughput.get('WriteCapacityUnits') 173 | 174 | throughput = { 175 | 'ReadCapacityUnits': current_read_throughput, 176 | 'WriteCapacityUnits': 
current_write_throughput 177 | } 178 | 179 | requires_update = False 180 | 181 | if new_read_throughput and (current_read_throughput != new_read_throughput): 182 | throughput['ReadCapacityUnits'] = new_read_throughput 183 | requires_update = True 184 | 185 | if new_write_throughput and (current_write_throughput != new_write_throughput): 186 | throughput['WriteCapacityUnits'] = new_write_throughput 187 | requires_update = True 188 | 189 | return throughput, requires_update 190 | 191 | 192 | class S3Util(object): 193 | def __init__(self): 194 | self.client = boto3.client('s3') 195 | 196 | @retry(retry_on_exception=retry_if_throttling_error, 197 | wait_exponential_multiplier=1000, 198 | stop_max_attempt_number=5) 199 | def put_json(self, bucket, key, json_file): 200 | body = json.dumps(json_file, default=lambda o: str(o), sort_keys=True, indent=4) 201 | self.client.put_object(Bucket=bucket, Key=key, Body=body) 202 | 203 | @retry(retry_on_exception=retry_if_throttling_error, 204 | wait_exponential_multiplier=1000, 205 | stop_max_attempt_number=5) 206 | def get_json(self, bucket, key): 207 | obj = self.client.get_object(Bucket=bucket, Key=key) 208 | return json.loads(obj.get('Body').read().decode('utf-8')) 209 | 210 | @retry(retry_on_exception=retry_if_throttling_error, 211 | wait_exponential_multiplier=1000, 212 | stop_max_attempt_number=5) 213 | def list_objects(self, bucket, prefix): 214 | paginator = self.client.get_paginator('list_objects') 215 | contents = [] 216 | response = None 217 | 218 | for page in paginator.paginate(Bucket=bucket, Prefix=prefix): 219 | response = page 220 | contents += page.get('Contents', []) 221 | 222 | if response: 223 | response['Contents'] = contents 224 | 225 | return response 226 | 227 | @retry(retry_on_exception=retry_if_throttling_error, 228 | wait_exponential_multiplier=1000, 229 | stop_max_attempt_number=5) 230 | def object_exists(self, bucket, key): 231 | try: 232 | self.client.get_object(Bucket=bucket, Key=key) 233 | except ClientError as ce: 234 | if ce.response['Error']['Code'] == "404": 235 | return False 236 | 237 | return True 238 | 239 | 240 | class ApplicationAutoScalingUtil(object): 241 | def __init__(self): 242 | self.client = self._init_client() 243 | 244 | def _init_client(self): 245 | return boto3.client('application-autoscaling') 246 | 247 | @retry(retry_on_exception=retry_if_throttling_error, 248 | wait_exponential_multiplier=1000, 249 | stop_max_attempt_number=5) 250 | def describe_scalable_targets(self, service_namespace): 251 | paginator = self.client.get_paginator('describe_scalable_targets') 252 | targets = [] 253 | response = None 254 | 255 | for page in paginator.paginate(ServiceNamespace=service_namespace): 256 | response = page 257 | targets += page.get('ScalableTargets', []) 258 | time.sleep(3) 259 | 260 | if response: 261 | response['ScalableTargets'] = targets 262 | 263 | return response 264 | 265 | @retry(retry_on_exception=retry_if_throttling_error, 266 | wait_exponential_multiplier=1000, 267 | stop_max_attempt_number=5) 268 | def describe_scaling_policies(self, service_namespace): 269 | paginator = self.client.get_paginator('describe_scaling_policies') 270 | policies = [] 271 | response = None 272 | 273 | for page in paginator.paginate(ServiceNamespace=service_namespace): 274 | response = page 275 | policies += page.get('ScalingPolicies', []) 276 | 277 | if response: 278 | response['ScalingPolicies'] = policies 279 | 280 | return response 281 | 282 | @retry(retry_on_exception=retry_if_throttling_error, 283 | 
wait_exponential_multiplier=1000, 284 | stop_max_attempt_number=5) 285 | def delete_scaling_policy(self, policy_name, service_namespace, resource_id, scalable_dimension): 286 | self.client.delete_scaling_policy(PolicyName=policy_name, 287 | ServiceNamespace=service_namespace, 288 | ResourceId=resource_id, 289 | ScalableDimension=scalable_dimension) 290 | 291 | @retry(retry_on_exception=retry_if_throttling_error, 292 | wait_exponential_multiplier=1000, 293 | stop_max_attempt_number=5) 294 | def deregister_scalable_target(self, service_namespace, resource_id, scalable_dimension): 295 | self.client.deregister_scalable_target(ServiceNamespace=service_namespace, 296 | ResourceId=resource_id, 297 | ScalableDimension=scalable_dimension) 298 | 299 | @retry(retry_on_exception=retry_if_throttling_error, 300 | wait_exponential_multiplier=1000, 301 | stop_max_attempt_number=5) 302 | def put_scaling_policy(self, policy_name, service_namespace, resource_id, scalable_dimension, policy_type, 303 | target_scaling_policy_configuration): 304 | self.client.put_scaling_policy(PolicyName=policy_name, 305 | ServiceNamespace=service_namespace, 306 | ResourceId=resource_id, 307 | ScalableDimension=scalable_dimension, 308 | PolicyType=policy_type, 309 | TargetTrackingScalingPolicyConfiguration=target_scaling_policy_configuration) 310 | 311 | @retry(retry_on_exception=retry_if_throttling_error, 312 | wait_exponential_multiplier=1000, 313 | stop_max_attempt_number=5) 314 | def register_scalable_target(self, service_namespace, resource_id, scalable_dimension, 315 | min_capacity, max_capacity, role_arn): 316 | self.client.register_scalable_target(ServiceNamespace=service_namespace, 317 | ResourceId=resource_id, 318 | ScalableDimension=scalable_dimension, 319 | MinCapacity=min_capacity, 320 | MaxCapacity=max_capacity, 321 | RoleARN=role_arn) 322 | 323 | 324 | class SnsUtil(object): 325 | def __init__(self): 326 | self.client = boto3.client('sns') 327 | 328 | @retry(retry_on_exception=retry_if_throttling_error, 329 | wait_exponential_multiplier=1000, 330 | stop_max_attempt_number=5) 331 | def publish(self, sns_topic, subject, message): 332 | self.client.publish( 333 | TopicArn=sns_topic, 334 | Message=message, 335 | Subject=subject 336 | ) 337 | -------------------------------------------------------------------------------- /hippolyte/config_util.py: -------------------------------------------------------------------------------- 1 | __author__ = "roman.subik" 2 | 3 | from hippolyte.aws_utils import S3Util, DataPipelineUtil 4 | from hippolyte.utils import get_date_suffix 5 | import logging 6 | 7 | COMMON_PREFIX = 'backup_metadata' 8 | DONE_STATES = ["CANCELED", "CASCADE_FAILED", "FAILED", "FINISHED", "INACTIVE", "PAUSED", "SKIPPED", "TIMEDOUT"] 9 | 10 | logger = logging.getLogger() 11 | logger.setLevel(logging.INFO) 12 | 13 | 14 | class ConfigUtil(object): 15 | def __init__(self): 16 | self.s3_util = S3Util() 17 | self.data_pipeline_util = DataPipelineUtil() 18 | 19 | def save_configuration(self, pipeline_definitions, backup_bucket, table_descriptions, 20 | scaling_policies, scalable_targets): 21 | self.s3_util.put_json( 22 | backup_bucket, self._get_metadata_file_name(), 23 | { 24 | "Tables": table_descriptions, 25 | "Pipelines": pipeline_definitions, 26 | "ScalingPolicies": scaling_policies, 27 | "ScalableTargets": scalable_targets 28 | } 29 | ) 30 | 31 | def load_configuration(self, backup_bucket): 32 | contents = self.s3_util.list_objects( 33 | backup_bucket, COMMON_PREFIX 34 | ).get("Contents", []) 35 | 36 | 
contents = sorted(contents, key=lambda x: x['LastModified'], reverse=True) 37 | 38 | if contents: 39 | return self.s3_util.get_json( 40 | backup_bucket, contents[0].get('Key') 41 | ) 42 | else: 43 | return 44 | 45 | def _get_metadata_file_name(self): 46 | return '{}-{}'.format(COMMON_PREFIX, get_date_suffix()) 47 | 48 | def list_backed_up_tables(self, pipelines, backup_bucket): 49 | finished_pipelines = self.list_finished_pipelines(backup_bucket, pipelines) 50 | backed_up_tables = [] 51 | 52 | for pipeline in pipelines: 53 | if pipeline['pipeline_id'] in finished_pipelines: 54 | backed_up_tables += pipeline['backed_up_tables'] 55 | 56 | return backed_up_tables 57 | 58 | def list_finished_pipelines(self, backup_bucket=None, backup_pipelines=None): 59 | if not backup_pipelines: 60 | last_configuration = self.load_configuration(backup_bucket) 61 | 62 | if last_configuration: 63 | backup_pipelines = last_configuration['Pipelines'] 64 | 65 | if not backup_pipelines: 66 | logger.error("Couldn't find any backed up tables. Has your backup ran?") 67 | return [] 68 | 69 | backup_pipeline_names = map(lambda x: x['pipeline_id'], backup_pipelines) 70 | pipelines = self.data_pipeline_util.describe_pipelines() 71 | finished_pipelines = [] 72 | 73 | for pipeline in pipelines: 74 | fields = pipeline["fields"] 75 | pipeline_id = pipeline["pipelineId"] 76 | 77 | logger.info("Checking pipeline {}".format(str(pipeline_id))) 78 | 79 | if pipeline_id not in backup_pipeline_names: 80 | continue 81 | 82 | for field in fields: 83 | if field["key"] != "@pipelineState": 84 | continue 85 | if field["stringValue"] in DONE_STATES: 86 | logger.info("Pipeline {} state is in DONE_STATES.".format(str(pipeline_id))) 87 | logger.debug(str(pipeline)) 88 | finished_pipelines.append(pipeline_id) 89 | 90 | return finished_pipelines 91 | -------------------------------------------------------------------------------- /hippolyte/dynamodb_backup.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import logging 3 | import re 4 | 5 | from botocore.exceptions import ClientError 6 | from hippolyte.aws_utils import DataPipelineUtil, DynamoDBUtil 7 | from hippolyte.config_util import ConfigUtil 8 | from hippolyte.monitor import Monitor 9 | from hippolyte.pipeline_scheduler import Scheduler 10 | from hippolyte.dynamodb_booster import DynamoDbBooster 11 | from hippolyte.utils import MAX_DURATION_SINGLE_PIPELINE, INITIAL_READ_THROUGHPUT_PERCENT, list_tables_in_definition 12 | from hippolyte.project_config import ACCOUNT_CONFIGS 13 | 14 | logger = logging.getLogger() 15 | logger.setLevel(logging.INFO) 16 | 17 | 18 | def _extract_from_arn(arn, position): 19 | """ 20 | Helper Function to extract part of an ARN 21 | 22 | :param arn: Arn to extract from 23 | :param position: Position in Arn of interest 24 | :return: String containing value at requested position 25 | """ 26 | 27 | return re.findall("(.*?):", arn)[position] 28 | 29 | 30 | def get_table_descriptions(exclude_from_backup, always_backup): 31 | """ 32 | Decides which tables should be backed up, based on their names. 
33 | :param exclude_from_backup: list of regexp., matching tables will be skipped from backup 34 | :param always_backup: those tables will always be backed up, despite exclude_from_backup matching 35 | :return: list of table names to backup 36 | """ 37 | dynamo_db_util = DynamoDBUtil() 38 | table_names = dynamo_db_util.list_tables() 39 | tables_filtered = set() 40 | patterns = map(lambda x: re.compile(x), exclude_from_backup) 41 | 42 | for table_name in table_names: 43 | if table_name in always_backup or _not_excluded(table_name, patterns): 44 | tables_filtered.add(table_name) 45 | 46 | return dynamo_db_util.describe_tables(tables_filtered) 47 | 48 | 49 | def _not_excluded(table_name, patterns): 50 | should_be_added = True 51 | 52 | for pattern in patterns: 53 | if pattern.match(table_name): 54 | should_be_added = False 55 | break 56 | 57 | return should_be_added 58 | 59 | 60 | def get_account(context): 61 | return _extract_from_arn(context.invoked_function_arn, 4) 62 | 63 | 64 | def get_sns_endpoint(context): 65 | region = _extract_from_arn(context.invoked_function_arn, 3) 66 | return 'arn:aws:sns:{}:{}:hippolyte-backup-monitoringbackup'.format(region, get_account(context)) 67 | 68 | 69 | def detect_action(event): 70 | resources = event.get("resources", []) 71 | 72 | for resource in resources: 73 | if resource.endswith('monitor-dynamodb-backup'): 74 | return monitor 75 | 76 | return backup 77 | 78 | 79 | def backup(**kwargs): 80 | logger.info("Performing full DynamoDB backup task.") 81 | logger.info("Building pipeline definitions") 82 | scheduler = Scheduler(kwargs['table_descriptions'], 'multiple.template', kwargs['emr_subnet'], 83 | kwargs['region'], kwargs['backup_bucket'], kwargs['log_bucket']) 84 | pipeline_definitions = scheduler.build_pipeline_definitions() 85 | 86 | logger.info("Creating pipelines.") 87 | pipeline_descriptions = [] 88 | for definition in pipeline_definitions: 89 | created = True 90 | 91 | try: 92 | response = kwargs['pipeline_util'].create_pipeline() 93 | except ClientError as e: 94 | if e.message == 'LimitExceededException': 95 | logger.warn("Can't create more pipelines, as account limit exceeded. Details: {}" 96 | .format(e.message)) 97 | 98 | created = False 99 | logger.warn("Can't create more pipelines. 
Details: {}".format(e.message)) 100 | 101 | if created: 102 | pipeline_descriptions.append( 103 | { 104 | 'pipeline_id': response.get("pipelineId"), 105 | 'backed_up_tables': list_tables_in_definition(definition), 106 | 'definition': definition 107 | } 108 | ) 109 | 110 | logger.info("Updating throughputs, to meet Time Point Objective.") 111 | kwargs['dynamodb_booster'].boost_throughput(pipeline_descriptions, MAX_DURATION_SINGLE_PIPELINE) 112 | 113 | for description in pipeline_descriptions: 114 | pipeline_id = description["pipeline_id"] 115 | pipeline_definition = description["definition"] 116 | 117 | logger.info("Deploying pipeline definition to {}".format(pipeline_id)) 118 | kwargs['pipeline_util'].put_pipeline_definition(pipeline_id, pipeline_definition) 119 | 120 | logger.info("Activating pipeline: {}".format(pipeline_id)) 121 | kwargs['pipeline_util'].activate_pipeline(pipeline_id, pipeline_definition) 122 | 123 | logger.info("Finished dynamo db backup.") 124 | 125 | 126 | def monitor(**kwargs): 127 | logger.info("Performing monitoring only this time.") 128 | logger.info("Restoring original throughputs.") 129 | kwargs['dynamodb_booster'].restore_throughput() 130 | 131 | finished_pipelines = ConfigUtil().list_finished_pipelines(kwargs['backup_bucket']) 132 | for pipeline_id in finished_pipelines: 133 | logger.info("Deleting finished pipeline: {}".format(pipeline_id)) 134 | kwargs['pipeline_util'].delete_pipeline(pipeline_id) 135 | 136 | logger.info("Looking for failed backups.") 137 | monitor = Monitor(kwargs['account'], kwargs['log_bucket'], kwargs['backup_bucket'], kwargs['sns_endpoint']) 138 | monitor.notify_about_failures(finished_pipelines) 139 | 140 | 141 | def lambda_handler(event, context): 142 | account_id = get_account(context) 143 | 144 | if account_id not in ACCOUNT_CONFIGS: 145 | logger.error("Couldn't find configuration for {} in project_config.py.".format(account_id)) 146 | return 147 | 148 | account_config = ACCOUNT_CONFIGS[account_id] 149 | exclude_from_backup = account_config.get('exclude_from_backup', []) 150 | always_backup = account_config.get('always_backup', []) 151 | 152 | logger.info("Describing tables in the account.") 153 | table_descriptions = get_table_descriptions(exclude_from_backup, always_backup) 154 | 155 | action = detect_action(event) 156 | action(**{ 157 | 'table_descriptions': table_descriptions, 158 | 'pipeline_util': DataPipelineUtil(), 159 | 'dynamodb_booster': DynamoDbBooster(table_descriptions, 160 | account_config['backup_bucket'], 161 | INITIAL_READ_THROUGHPUT_PERCENT), 162 | 'account': account_id, 163 | 'log_bucket': account_config['log_bucket'], 164 | 'sns_endpoint': get_sns_endpoint(context), 165 | 'backup_bucket': account_config['backup_bucket'], 166 | 'emr_subnet': account_config['emr_subnet'], 167 | 'region': _extract_from_arn(context.invoked_function_arn, 3) 168 | }) 169 | 170 | # Uncomment to test monitor phase: 171 | # class Context(object): 172 | # def __init__(self): 173 | # self.invoked_function_arn = "a:b:c:eu-west-1:274670120741:e" 174 | # 175 | # lambda_handler({'resources': ['monitor-dynamodb-backup']}, Context()) 176 | 177 | # Uncomment to test backup phase 178 | # lambda_handler({}, Context()) 179 | -------------------------------------------------------------------------------- /hippolyte/dynamodb_booster.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | import logging 3 | from botocore.exceptions import ClientError 4 | from 
hippolyte.aws_utils import ApplicationAutoScalingUtil, DataPipelineUtil, DynamoDBUtil 5 | from hippolyte.config_util import ConfigUtil 6 | from hippolyte.utils import ACTIVITY_BOOTSTRAP_TIME, EMR_BOOTSTRAP_TIME, MAX_DURATION_SEC, \ 7 | MAX_ALLOWED_PROVISIONED_READ_THROUGHPUT, INITIAL_READ_THROUGHPUT_PERCENT, \ 8 | estimate_backup_duration, compute_required_throughput, get_first_element_in_the_list_with 9 | 10 | logger = logging.getLogger() 11 | logger.setLevel(logging.INFO) 12 | 13 | 14 | class DynamoDbBooster(object): 15 | def __init__(self, table_descriptions, backup_bucket, read_throughput_percent): 16 | self.table_descriptions = table_descriptions 17 | self.backup_bucket = backup_bucket 18 | self.read_throughput_percent = read_throughput_percent 19 | self.dynamo_db_util = DynamoDBUtil() 20 | self.config_util = ConfigUtil() 21 | self.data_pipeline_util = DataPipelineUtil() 22 | self.application_auto_scaling_util = ApplicationAutoScalingUtil() 23 | 24 | def boost_throughput(self, pipeline_descriptions, desired_backup_duration): 25 | scaling_policies = self.list_dynamodb_scaling_policies() 26 | scalable_targets = self.list_dynamodb_scalable_targets() 27 | self.config_util.save_configuration(pipeline_descriptions, self.backup_bucket, self.table_descriptions, 28 | scaling_policies, scalable_targets) 29 | self.disable_auto_scaling(scaling_policies, scalable_targets) 30 | 31 | limits = self.dynamo_db_util.describe_limits() 32 | total_increase = 0 33 | 34 | pipeline_definitions = map(lambda x: x['definition'], pipeline_descriptions) 35 | 36 | for nodes in pipeline_definitions: 37 | total_increase += self._boost_single_pipeline(nodes.get('objects'), desired_backup_duration, limits) 38 | 39 | logger.info("Total throughput increase: {}".format(total_increase)) 40 | 41 | def restore_throughput(self): 42 | last_configuration = self.config_util.load_configuration(self.backup_bucket) 43 | 44 | if not last_configuration: 45 | logger.error("Couldn't find configuration file. 
Stopping throughput restore process.") 46 | return 47 | 48 | self._restore_all_tables(last_configuration) 49 | 50 | self.reenable_auto_scaling(last_configuration) 51 | 52 | def _restore_all_tables(self, last_configuration): 53 | pipelines = last_configuration['Pipelines'] 54 | tables = last_configuration['Tables'] 55 | 56 | backed_up_tables = self.config_util.list_backed_up_tables(pipelines, self.backup_bucket) 57 | previous_table_state = filter(lambda x: 'TableArn' in x['Table'], tables) 58 | current_table_state = filter(lambda x: 'TableArn' in x['Table'], self.table_descriptions) 59 | 60 | logger.debug("Previous table state: {}".format(str(previous_table_state))) 61 | logger.debug("Current table state: {}".format(str(current_table_state))) 62 | 63 | for previous_state in previous_table_state: 64 | previous_name, previous_throughput = self._get_name_and_capacity(previous_state) 65 | 66 | if previous_name not in backed_up_tables: 67 | continue 68 | 69 | for current_state in current_table_state: 70 | current_name, current_throughput = self._get_name_and_capacity(current_state) 71 | logger.debug("current_name:{}, current_throughput:{}, previous_name:{}, previous_throughput:{}" 72 | .format(current_name, current_throughput, previous_name, previous_throughput)) 73 | 74 | if current_name == previous_name and current_throughput != previous_throughput: 75 | logger.info("Decreasing throughput of {} from {} to {}.".format( 76 | current_name, current_throughput, previous_throughput)) 77 | 78 | try: 79 | self.dynamo_db_util.change_capacity_units(current_name, previous_throughput) 80 | except ClientError as e: 81 | if 'decreased' in e.message: 82 | logger.error("Can't decrease throughput of {}, max number of decreases for 24h reached." 83 | .format(current_name)) 84 | else: 85 | logger.error("Can't decrease throughput of {}, reason: ".format(e.message)) 86 | 87 | def _boost_single_pipeline(self, nodes, desired_backup_duration, limits): 88 | dynamo_db_nodes = filter(lambda x: 'tableName' in x, nodes) 89 | bootstrap_duration = EMR_BOOTSTRAP_TIME + ACTIVITY_BOOTSTRAP_TIME * len(dynamo_db_nodes) 90 | max_backup_duration = MAX_DURATION_SEC - bootstrap_duration 91 | total_backup_duration = 0 92 | table_durations = [] 93 | total_increase = 0 94 | 95 | for node in dynamo_db_nodes: 96 | table_description = filter(lambda x: x.get('Table', {}).get('TableName') == node['tableName'], 97 | self.table_descriptions)[0] 98 | table_size = table_description.get('Table', {}).get('TableSizeBytes') 99 | read_capacity_units = table_description.get('Table', {}).get('ProvisionedThroughput', {}) \ 100 | .get('ReadCapacityUnits', {}) 101 | duration = estimate_backup_duration(self.read_throughput_percent, table_size, read_capacity_units) 102 | table_durations.append((node, table_description, read_capacity_units, duration)) 103 | total_backup_duration += duration 104 | 105 | if total_backup_duration <= max_backup_duration: 106 | return total_increase 107 | 108 | for node, description, read_capacity_units, duration in table_durations: 109 | target_duration = float(duration) * desired_backup_duration / total_backup_duration 110 | new_read_capacity_units, new_throughput_percent = compute_required_throughput( 111 | duration, target_duration, read_capacity_units, INITIAL_READ_THROUGHPUT_PERCENT) 112 | 113 | read_limit = min(MAX_ALLOWED_PROVISIONED_READ_THROUGHPUT, limits['TableMaxReadCapacityUnits']) 114 | 115 | if new_read_capacity_units > read_limit: 116 | logger.error("Can't meet RTO for {} as max table read capacity limit is {}, 
conntact aws support, " 117 | "to increase it. ".format(node['tableName'], read_limit)) 118 | new_read_capacity_units = read_limit 119 | 120 | logger.info("Increasing throughput of {} from {} to {}.".format( 121 | node['tableName'], read_capacity_units, new_read_capacity_units)) 122 | node['readThroughputPercent'] = str(new_throughput_percent) 123 | 124 | try: 125 | self.dynamo_db_util.change_capacity_units(node['tableName'], new_read_capacity_units) 126 | except ClientError as e: 127 | if e.message == 'LimitExceededException': 128 | logger.error("Can't meet RTO for {} as max account read capacity limit exceeded. Details: {}" 129 | .format(node['tableName'], e.message)) 130 | else: 131 | logger.error("Failed to increase table {} read capacity limit. Details: {}" 132 | .format(node['tableName'], e.message)) 133 | 134 | new_read_capacity_units = read_capacity_units 135 | 136 | total_increase += new_read_capacity_units - read_capacity_units 137 | 138 | return total_increase 139 | 140 | def _get_name_and_capacity(self, state): 141 | table = state.get('Table', {}) 142 | name = table.get('TableName', '') 143 | throughput = table.get('ProvisionedThroughput', {}).get('ReadCapacityUnits') 144 | 145 | return name, throughput 146 | 147 | def disable_auto_scaling(self, scaling_policies, scalable_targets): 148 | logger.info("Disabling autoscaling on backed up tables, for backup duration.") 149 | 150 | for table in self.table_descriptions: 151 | table_name = table.get('Table', {}).get('TableName') 152 | resource_id = "table/{}".format(table_name) 153 | 154 | read_scaling_policy = get_first_element_in_the_list_with(scaling_policies, 'ResourceId', resource_id) 155 | 156 | if read_scaling_policy: 157 | logger.info("Removing scaling policy: {}".format(read_scaling_policy['PolicyName'])) 158 | 159 | try: 160 | self.application_auto_scaling_util. \ 161 | delete_scaling_policy(read_scaling_policy['PolicyName'], "dynamodb", 162 | resource_id, "dynamodb:table:ReadCapacityUnits") 163 | except ClientError as e: 164 | if 'No scaling policy found for service namespace' in e.message: 165 | logger.warn("Can't delete scaling policy for: {}, as it does not exist".format(table_name)) 166 | else: 167 | logger.warn( 168 | "Can't delete scaling policy for: {}, error: {}".format(table_name, e.message)) 169 | 170 | read_scalable_target = get_first_element_in_the_list_with(scalable_targets, 'ResourceId', resource_id) 171 | 172 | if read_scalable_target: 173 | logger.info("Removing scalable target for: {}".format(resource_id)) 174 | try: 175 | self.application_auto_scaling_util. 
\ 176 | deregister_scalable_target("dynamodb", resource_id, "dynamodb:table:ReadCapacityUnits") 177 | except ClientError as e: 178 | if 'No scalable target found for service namespace' in e.message: 179 | logger.warn("Can't delete scalable target for: {}, as it does not exist".format(table_name)) 180 | else: 181 | logger.warn( 182 | "Can't delete scalable target for: {}, error: {}".format(table_name, e.message)) 183 | 184 | def reenable_auto_scaling(self, last_configuration): 185 | logger.info("Reenabling autoscaling tables after backup.") 186 | scalable_targets = last_configuration['ScalableTargets'] 187 | scaling_policies = last_configuration['ScalingPolicies'] 188 | 189 | for target in scalable_targets: 190 | logger.info("Adding scalable target for: {}".format(target['ResourceId'])) 191 | 192 | try: 193 | self.application_auto_scaling_util.register_scalable_target(target['ServiceNamespace'], 194 | target['ResourceId'], 195 | target['ScalableDimension'], 196 | target['MinCapacity'], 197 | target['MaxCapacity'], 198 | target['RoleARN']) 199 | except ClientError as e: 200 | if 'table does not exist' in e.message: 201 | logger.warn("Can't restore scalable target for: {}, table was deleted".format(target['ResourceId'])) 202 | else: 203 | logger.warn( 204 | "Can't restore scalable target for: {}, error: {}".format(target['ResourceId'], e.message)) 205 | 206 | for policy in scaling_policies: 207 | logger.info("Adding scaling policy: {}".format(policy['PolicyName'])) 208 | 209 | try: 210 | self.application_auto_scaling_util.put_scaling_policy(policy['PolicyName'], 211 | policy['ServiceNamespace'], 212 | policy['ResourceId'], 213 | policy['ScalableDimension'], 214 | policy['PolicyType'], 215 | policy[ 216 | 'TargetTrackingScalingPolicyConfiguration']) 217 | except ClientError as e: 218 | if 'table does not exist' in e.message: 219 | logger.warn("Can't restore scaling policy for: {}, table was deleted".format(target['ResourceId'])) 220 | else: 221 | logger.warn( 222 | "Can't restore scaling policy for: {}, error: {}".format(target['ResourceId'], e.message)) 223 | 224 | def list_dynamodb_scalable_targets(self): 225 | targets = self.application_auto_scaling_util \ 226 | .describe_scalable_targets("dynamodb").get('ScalableTargets', []) 227 | return self._only_return_rcu_dimension(targets) 228 | 229 | def list_dynamodb_scaling_policies(self): 230 | policies = self.application_auto_scaling_util \ 231 | .describe_scaling_policies("dynamodb").get('ScalingPolicies', []) 232 | return self._only_return_rcu_dimension(policies) 233 | 234 | def _only_return_rcu_dimension(self, _list): 235 | return filter(lambda x: x.get('ScalableDimension') == 'dynamodb:table:ReadCapacityUnits', _list) 236 | -------------------------------------------------------------------------------- /hippolyte/monitor.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from datetime import datetime 3 | import logging 4 | from hippolyte.aws_utils import S3Util, SnsUtil 5 | from hippolyte.config_util import ConfigUtil 6 | from hippolyte.utils import TIME_IN_BETWEEN_BACKUPS 7 | 8 | logger = logging.getLogger() 9 | logger.setLevel(logging.INFO) 10 | 11 | 12 | class Monitor(object): 13 | def __init__(self, account, log_bucket, backup_bucket, sns_endpoint): 14 | self.account = account 15 | self.log_bucket = log_bucket 16 | self.backup_bucket = backup_bucket 17 | self.sns_endpoint = sns_endpoint 18 | self.config_util = ConfigUtil() 19 | self.s3_util = S3Util() 20 | 
self.sns_util = SnsUtil() 21 | 22 | def notify_about_failures(self, pipelines): 23 | configuration = self.config_util.load_configuration(self.backup_bucket) 24 | 25 | if not configuration: 26 | logger.info("Couldn't find configuration file. Stopping throughput restore process, sending email.") 27 | 28 | email_body = all_failed_backup_email_template.format( 29 | account=self.account, 30 | log_bucket = self.log_bucket 31 | ) 32 | self.send_notification_email(email_body) 33 | 34 | pipeline_failed_tables = {} 35 | 36 | for pipeline_id in pipelines: 37 | finished_pipeline = filter(lambda x: x['pipeline_id'] == pipeline_id, configuration['Pipelines']) 38 | failed_tables = [] 39 | 40 | if finished_pipeline: 41 | failed_tables = self.extract_failed_tables(finished_pipeline[0]) 42 | 43 | if failed_tables: 44 | pipeline_failed_tables[finished_pipeline[0]['pipeline_id']] = failed_tables 45 | 46 | if pipeline_failed_tables: 47 | logger.info('Some tables were not backed up properly: {}'.format(str(pipeline_failed_tables))) 48 | logger.info('Sending sns notification about failures.') 49 | 50 | email_body = failed_table_backup_email_template.format( 51 | account=self.account, 52 | description=create_description(pipeline_failed_tables), 53 | log_bucket = self.log_bucket 54 | ) 55 | self.send_notification_email(email_body) 56 | 57 | def extract_failed_tables(self, pipeline): 58 | objects = pipeline.get('definition', {'objects': []}).get('objects', []) 59 | s3_attributes = filter(lambda x: 'directoryPath' in x, objects) 60 | failed_tables = [] 61 | 62 | for s3_attribute in s3_attributes: 63 | 64 | protocol, _, bucket, table_name, timestamp = s3_attribute['directoryPath'].split('/') 65 | 66 | backup_archive = self.s3_util.list_objects( 67 | bucket, table_name 68 | ).get("Contents", []) 69 | 70 | backup_archive = sorted(backup_archive, key=lambda x: x['LastModified'], reverse=True) 71 | 72 | if not backup_archive: 73 | failed_tables.append(table_name) 74 | continue 75 | 76 | success_flag = get_first_success_flag(backup_archive) 77 | 78 | if not success_flag: 79 | failed_tables.append(table_name) 80 | continue 81 | 82 | if not is_backup_from_current_batch(success_flag): 83 | failed_tables.append(table_name) 84 | continue 85 | 86 | return failed_tables 87 | 88 | def send_notification_email(self, email_body): 89 | email_subject = email_subject_template.format(account=self.account) 90 | self.sns_util.publish(self.sns_endpoint, email_subject, email_body) 91 | 92 | 93 | def get_first_success_flag(backup_dir_contents): 94 | for content in backup_dir_contents: 95 | if content['Key'].endswith('_SUCCESS'): 96 | return content 97 | 98 | return None 99 | 100 | 101 | failed_table_backup_email_template = """ 102 | Hello 103 | 104 | You have been notified, as some of tables in {account} account were not backed up in last 24h. 105 | Please find details below: 106 | 107 | Pipeline Id: Failed tables 108 | 109 | {description} 110 | 111 | Please check logs in: {log_bucket} for details. 112 | 113 | Best regards, 114 | Hippolyte 115 | """ 116 | 117 | email_subject_template = "Failed to backup DynamoDB tables in {account} account." 118 | 119 | all_failed_backup_email_template = """ 120 | Hello 121 | 122 | You have been notified, as DynamoDB backup failed completely in {account}. 123 | I couldn't even find a backup_metadata* file in {log_bucket}. 
124 | 125 | Best regards, 126 | Hippolyte 127 | """ 128 | 129 | 130 | def is_backup_from_current_batch(backup_dir): 131 | last_modified = backup_dir['LastModified'] 132 | return (datetime.now(tz=last_modified.tzinfo) - last_modified).total_seconds() <= TIME_IN_BETWEEN_BACKUPS 133 | 134 | 135 | def create_description(pipeline_failed_tables): 136 | table = "" 137 | for pipeline_id in pipeline_failed_tables: 138 | table += "{}: {}\n".format(pipeline_id, ",".join(pipeline_failed_tables[pipeline_id])) 139 | 140 | return table 141 | -------------------------------------------------------------------------------- /hippolyte/multiple.template: -------------------------------------------------------------------------------- 1 | { 2 | "objects": [ 3 | { 4 | "subnetId": "{{subnetId}}", 5 | "bootstrapAction": "s3://{{region}}.elasticmapreduce/bootstrap-actions/configure-hadoop, {{clusterMemory}}", 6 | "name": "EmrClusterForBackup", 7 | "coreInstanceCount": "{{coreInstanceCount}}", 8 | "coreInstanceType": "{{coreInstanceType}}", 9 | "amiVersion": "3.9.0", 10 | "masterInstanceType": "{{masterInstanceType}}", 11 | "id": "EmrClusterForBackup", 12 | "region": "{{region}}", 13 | "type": "EmrCluster", 14 | "terminateAfter": "{{terminateAfter}}" 15 | }, 16 | { 17 | "failureAndRerunMode": "CASCADE", 18 | "resourceRole": "DataPipelineDefaultResourceRole", 19 | "role": "DataPipelineDefaultRole", 20 | "pipelineLogUri": "s3://{{s3PipelineLogBucket}}/", 21 | "scheduleType": "ONDEMAND", 22 | "name": "Default", 23 | "id": "Default" 24 | }, 25 | {{#backups}} 26 | { 27 | "readThroughputPercent": "{{dbSourceTableReadThroughputPercent}}", 28 | "name": "{{dbSourceTableName}}", 29 | "id": "{{dbSourceTableId}}", 30 | "type": "DynamoDBDataNode", 31 | "tableName": "{{dynamoDBTableName}}" 32 | }, 33 | { 34 | "output": { 35 | "ref": "{{s3BackupLocationId}}" 36 | }, 37 | "input": { 38 | "ref": "{{dbSourceTableId}}" 39 | }, 40 | "maximumRetries": "{{tableBackupActivityMaximumRetries}}", 41 | "name": "{{tableBackupActivityName}}", 42 | "step": "s3://dynamodb-emr-{{region}}/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{output.directoryPath},#{input.tableName},#{input.readThroughputPercent}", 43 | "id": "{{tableBackupActivityId}}", 44 | "runsOn": { 45 | "ref": "EmrClusterForBackup" 46 | }, 47 | "type": "EmrActivity", 48 | "resizeClusterBeforeRunning": "false" 49 | }, 50 | { 51 | "directoryPath": "s3://{{s3BackupBucket}}/{{dynamoDBTableName}}/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}", 52 | "name": "{{s3BackupLocationName}}", 53 | "id": "{{s3BackupLocationId}}", 54 | "type": "S3DataNode" 55 | }{{#comma}},{{/comma}} 56 | {{/backups}} 57 | ], 58 | "parameters": [ 59 | ], 60 | "values": { 61 | } 62 | } -------------------------------------------------------------------------------- /hippolyte/pipeline_scheduler.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | 3 | import json 4 | import logging 5 | import math 6 | import os 7 | import pystache 8 | 9 | from hippolyte.utils import EMR_BOOTSTRAP_TIME, MAX_DURATION_SEC, ACTIVITY_BOOTSTRAP_TIME, \ 10 | MAX_TABLES_PER_PIPELINE, INITIAL_READ_THROUGHPUT_PERCENT, estimate_backup_duration, get_date_suffix 11 | 12 | logger = logging.getLogger() 13 | logger.setLevel(logging.INFO) 14 | 15 | 16 | CLUSTER_CONFIGS = [ 17 | { 18 | 'masterInstanceType': 'm1.medium', 19 | 'coreInstanceType': 'm1.medium', 20 | 'coreInstanceCount': 1, 21 | 'clusterMemory': 
'--yarn-key-value,yarn.nodemanager.resource.memory-mb=2048,' 22 | '--yarn-key-value,yarn.scheduler.maximum-allocation-mb=2048,' 23 | '--yarn-key-value,yarn.scheduler.minimum-allocation-mb=256,' 24 | '--yarn-key-value,yarn.app.mapreduce.am.resource.mb=1024,' 25 | '--mapred-key-value,mapreduce.map.memory.mb=768,' 26 | '--mapred-key-value,mapreduce.map.java.opts=-Xmx512M,' 27 | '--mapred-key-value,mapreduce.reduce.memory.mb=1024,' 28 | '--mapred-key-value,mapreduce.reduce.java.opts=-Xmx768m,' 29 | '--mapred-key-value,mapreduce.map.speculative=false', 30 | 'maxTotalDynamoDbSizeBytes': 597688320 # 570MB 31 | }, 32 | { 33 | 'masterInstanceType': 'm3.xlarge', 34 | 'coreInstanceType': 'm3.xlarge', 35 | 'coreInstanceCount': 1, 36 | 'clusterMemory': '--yarn-key-value,yarn.nodemanager.resource.memory-mb=11520,' 37 | '--yarn-key-value,yarn.scheduler.maximum-allocation-mb=11520,' 38 | '--yarn-key-value,yarn.scheduler.minimum-allocation-mb=1440,' 39 | '--yarn-key-value,yarn.app.mapreduce.am.resource.mb=2880,' 40 | '--mapred-key-value,mapreduce.map.memory.mb=5760,' 41 | '--mapred-key-value,mapreduce.map.java.opts=-Xmx4608M,' 42 | '--mapred-key-value,mapreduce.reduce.memory.mb=2880,' 43 | '--mapred-key-value,mapreduce.reduce.java.opts=-Xmx2304m,' 44 | '--mapred-key-value,mapreduce.map.speculative=false', 45 | 'maxTotalDynamoDbSizeBytes': 1099511627776000 # 1PB 46 | } 47 | ] 48 | 49 | 50 | class Scheduler(object): 51 | def __init__(self, table_descriptions, template_file, subnet_id, region, 52 | s3_backup_bucket, s3_pipeline_log_bucket, max_retries=2, 53 | read_throughput_percent=INITIAL_READ_THROUGHPUT_PERCENT): 54 | """ 55 | :param table_descriptions: descriptions, as returned from DynamoDBUtil.describe_tables() 56 | :param template_file: path to template file 57 | :param read_throughput_percent: how much read throughput should be used for backing up, ex. 0.5 - 50% 58 | :param subnet_id: EMR subnet 59 | :param region: 60 | :param s3_backup_bucket: S3 location, where backup files go 61 | :param s3_pipeline_log_bucket: S3 location, where pipeline logs go 62 | :param max_retries: how many times to retry pipeline execution on error, before giving up 63 | :return: 64 | """ 65 | self.table_descriptions = table_descriptions 66 | self.template_file = template_file 67 | self.read_throughput_percent = read_throughput_percent 68 | self.subnet_id = subnet_id 69 | self.region = region 70 | self.s3_backup_bucket = s3_backup_bucket 71 | self.s3_pipeline_log_bucket = s3_pipeline_log_bucket 72 | self.max_retries = max_retries 73 | self.s3_log_location = '{}/logs/{}'.format(s3_pipeline_log_bucket, get_date_suffix()) 74 | self.terminate_after = int(math.ceil(MAX_DURATION_SEC / 3600.0)) + 1 75 | 76 | def build_pipeline_definitions(self): 77 | """ 78 | Creates list of pipeline definitions, which could be use to populate data pipelines. 79 | Does it by combining template with parameter list. 80 | :return: list of pipeline definitions 81 | """ 82 | data_pipelines = [] 83 | 84 | template = self.read_template() 85 | dp_parameters = self.build_parameters() 86 | 87 | for parameters in dp_parameters: 88 | data_pipelines.append(json.loads(pystache.render(template, parameters))) 89 | 90 | return data_pipelines 91 | 92 | def read_template(self): 93 | template_file = os.path.join(os.path.dirname(__file__), self.template_file) 94 | with open(template_file, "r") as f: 95 | return f.read() 96 | 97 | def build_parameters(self): 98 | """ 99 | Builds list of parameters, describing dynamo db backup process on a single data pipeline. 
100 |         Performs scheduling by deciding which tables to assign to each pipeline, so that all pipelines
101 |         finish in roughly the same amount of time.
102 |         :return: list of parameter dicts, one per data pipeline
103 |         """
104 |         data_pipeline_parameters = []
105 |         total_duration = EMR_BOOTSTRAP_TIME
106 |         backups = []
107 |         table_counter = 0
108 |         table_index = 0
109 |         total_table_size = 0
110 |         table_backup_durations = self.build_table_backup_durations()
111 | 
112 |         for table_name, backup_duration, table_size_bytes in table_backup_durations:
113 |             total_duration += backup_duration
114 | 
115 |             backups.append(self.create_backup_parameters(table_counter, table_name))
116 |             table_counter += 1
117 |             total_table_size += table_size_bytes
118 | 
119 |             if not self.should_add_more_tables(table_index, total_duration,
120 |                                                table_backup_durations, backups):
121 |                 backups = self.normalize_backup_parameters(backups)
122 | 
123 |                 data_pipeline_parameters.append(self.create_pipeline_parameters(backups, total_table_size))
124 | 
125 |                 logger.info('Total estimated duration of pipeline execution: {}'.format(total_duration))
126 | 
127 |                 total_duration = EMR_BOOTSTRAP_TIME
128 |                 backups = []
129 |                 table_counter = 0
130 |                 total_table_size = 0
131 | 
132 |             table_index += 1
133 | 
134 |         return data_pipeline_parameters
135 | 
136 |     def build_table_backup_durations(self):
137 |         """
138 |         Assigns an estimated backup duration to every DynamoDB table description in the account.
139 |         Tables with 0 size will not be backed up.
140 |         :return: list of (table name, estimated duration, size in bytes) tuples, sorted by ascending estimated backup duration
141 |         """
142 |         table_backup_duration = []
143 |         for description in self.table_descriptions:
144 |             table = description['Table']
145 |             duration = self.estimate_duration(table)
146 | 
147 |             if table['TableSizeBytes']:
148 |                 table_backup_duration.append((table['TableName'], duration, table['TableSizeBytes']))
149 |             else:
150 |                 logger.info("Skipping {} as it appears to be empty.".format(table["TableName"]))
151 | 
152 |         return sorted(table_backup_duration, key=lambda x: x[1])
153 | 
154 |     def create_pipeline_parameters(self, backups, total_table_size):
155 |         """
156 |         :param backups: list of elements, as returned from create_backup_parameters
157 |         :param total_table_size: total size in bytes of all tables assigned to this pipeline
158 |         :return: dict of parameters used to render a single pipeline definition (Default node, EmrClusterForBackup node and the per-table backups)
159 |         """
160 |         cluster_config = None
161 |         # Pick the smallest cluster configuration that can handle the combined table size.
162 |         for config in CLUSTER_CONFIGS:
163 |             if total_table_size < config['maxTotalDynamoDbSizeBytes']:
164 |                 cluster_config = config
165 |                 break
166 | 
167 |         return {
168 |             'subnetId': '{}'.format(self.subnet_id),
169 |             'coreInstanceCount': cluster_config['coreInstanceCount'],
170 |             'coreInstanceType': cluster_config['coreInstanceType'],
171 |             'masterInstanceType': cluster_config['masterInstanceType'],
172 |             'clusterMemory': cluster_config['clusterMemory'],
173 |             'region': '{}'.format(self.region),
174 |             'terminateAfter': '{} Hour'.format(self.terminate_after),
175 |             's3BackupBucket': '{}'.format(self.s3_backup_bucket),
176 |             's3PipelineLogBucket': '{}'.format(self.s3_log_location),
177 |             'backups': backups
178 |         }
179 | 
180 |     def create_backup_parameters(self, table_counter, table_name):
181 |         """
182 |         :param table_counter: index of the table within this pipeline, used to derive unique node names and ids
183 |         :param table_name: name of the DynamoDB table to back up
184 |         :return: dict of parameters needed for backing up a single DynamoDB table.
185 | """ 186 | return {'dbSourceTableReadThroughputPercent': '{}'.format(self.read_throughput_percent), 187 | 'dbSourceTableName': 'DDBSourceTable{}'.format(table_counter), 188 | 'dbSourceTableId': 'DDBSourceTable{}'.format(table_counter), 189 | 'dynamoDBTableName': table_name, 190 | 's3BackupLocationId': 'S3BackupLocation{}'.format(table_counter), 191 | 's3BackupLocationName': 'S3BackupLocation{}'.format(table_counter), 192 | 'tableBackupActivityMaximumRetries': '{}'.format(self.max_retries), 193 | 'tableBackupActivityName': 'TableBackupActivity{}'.format(table_counter), 194 | 'tableBackupActivityId': 'TableBackupActivity{}'.format(table_counter), 195 | 'region': '{}'.format(self.region), 196 | 'comma': True} 197 | 198 | def normalize_backup_parameters(self, backups): 199 | if backups: 200 | backups[-1]['comma'] = False 201 | 202 | return backups 203 | 204 | def should_add_more_tables(self, table_index, total_duration, table_backup_durations, backups): 205 | """ 206 | Checks whether or not more tables should be backed up on current data pipeline 207 | :param table_index: 208 | :param total_duration: 209 | :param table_backup_durations: 210 | :param backups: 211 | :return: 212 | """ 213 | add_more_tables = True 214 | 215 | if table_index + 1 < len(table_backup_durations): 216 | if total_duration + table_backup_durations[table_index + 1][1] >= MAX_DURATION_SEC: 217 | add_more_tables = False 218 | 219 | if len(backups) >= MAX_TABLES_PER_PIPELINE: 220 | add_more_tables = False 221 | else: 222 | add_more_tables = False 223 | 224 | return add_more_tables 225 | 226 | def estimate_duration(self, data): 227 | """ 228 | Gives rough estimate, on how long backing up dynamo db table will take. 229 | :param data: dynamic dynamo db table definition 230 | :return: Estimated time in seconds. 231 | """ 232 | table_size_bytes = data.get('TableSizeBytes', 0) 233 | read_capacity_units = data['ProvisionedThroughput']['ReadCapacityUnits'] 234 | 235 | return estimate_backup_duration(self.read_throughput_percent, table_size_bytes, 236 | read_capacity_units) + ACTIVITY_BOOTSTRAP_TIME 237 | -------------------------------------------------------------------------------- /hippolyte/pipeline_translator.py: -------------------------------------------------------------------------------- 1 | import json 2 | from copy import deepcopy 3 | 4 | # Copyright 2014 Amazon.com, Inc. or its affiliates. All Rights Reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"). You 7 | # may not use this file except in compliance with the License. A copy of 8 | # the License is located at 9 | # 10 | # http://aws.amazon.com/apache2.0/ 11 | # 12 | # or in the "license" file accompanying this file. This file is 13 | # distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF 14 | # ANY KIND, either express or implied. See the License for the specific 15 | # language governing permissions and limitations under the License. 16 | 17 | 18 | class PipelineDefinitionError(Exception): 19 | def __init__(self, msg): 20 | full_msg = ( 21 | "Error in pipeline definition: %s\n" % msg) 22 | super(PipelineDefinitionError, self).__init__(full_msg) 23 | self.msg = msg 24 | 25 | 26 | def definition_to_api_objects(definition): 27 | definition_copy = deepcopy(definition) 28 | if 'objects' not in definition_copy: 29 | raise PipelineDefinitionError('Missing "objects" key') 30 | api_elements = [] 31 | # To convert to the structure expected by the service, 32 | # we convert the existing structure to a list of dictionaries. 
33 | # Each dictionary has a 'fields', 'id', and 'name' key. 34 | for element in definition_copy['objects']: 35 | try: 36 | element_id = element.pop('id') 37 | except KeyError: 38 | raise PipelineDefinitionError('Missing "id" key of element: %s' % 39 | json.dumps(element)) 40 | api_object = {'id': element_id} 41 | # If a name is provided, then we use that for the name, 42 | # otherwise the id is used for the name. 43 | name = element.pop('name', element_id) 44 | api_object['name'] = name 45 | # Now we need the field list. Each element in the field list is a dict 46 | # with a 'key', 'stringValue'|'refValue' 47 | fields = [] 48 | for key, value in sorted(element.items()): 49 | fields.extend(_parse_each_field(key, value)) 50 | api_object['fields'] = fields 51 | api_elements.append(api_object) 52 | return api_elements 53 | 54 | 55 | def definition_to_api_parameters(definition): 56 | definition_copy = deepcopy(definition) 57 | if 'parameters' not in definition_copy: 58 | return None 59 | parameter_objects = [] 60 | for element in definition_copy['parameters']: 61 | try: 62 | parameter_id = element.pop('id') 63 | except KeyError: 64 | raise PipelineDefinitionError('Missing "id" key of parameter: %s' % 65 | json.dumps(element)) 66 | parameter_object = {'id': parameter_id} 67 | # Now we need the attribute list. Each element in the attribute list 68 | # is a dict with a 'key', 'stringValue' 69 | attributes = [] 70 | for key, value in sorted(element.items()): 71 | attributes.extend(_parse_each_field(key, value)) 72 | parameter_object['attributes'] = attributes 73 | parameter_objects.append(parameter_object) 74 | return parameter_objects 75 | 76 | 77 | def definition_to_parameter_values(definition): 78 | definition_copy = deepcopy(definition) 79 | if 'values' not in definition_copy: 80 | return None 81 | parameter_values = [] 82 | for key in definition_copy['values']: 83 | parameter_values.extend( 84 | _convert_single_parameter_value(key, definition_copy['values'][key])) 85 | 86 | return parameter_values 87 | 88 | 89 | def _parse_each_field(key, value): 90 | values = [] 91 | if isinstance(value, list): 92 | for item in value: 93 | values.append(_convert_single_field(key, item)) 94 | else: 95 | values.append(_convert_single_field(key, value)) 96 | return values 97 | 98 | 99 | def _convert_single_field(key, value): 100 | field = {'key': key} 101 | if isinstance(value, dict) and list(value.keys()) == ['ref']: 102 | field['refValue'] = value['ref'] 103 | else: 104 | field['stringValue'] = value 105 | return field 106 | 107 | 108 | def _convert_single_parameter_value(key, values): 109 | parameter_values = [] 110 | if isinstance(values, list): 111 | for each_value in values: 112 | parameter_value = {'id': key, 'stringValue': each_value} 113 | parameter_values.append(parameter_value) 114 | else: 115 | parameter_value = {'id': key, 'stringValue': values} 116 | parameter_values.append(parameter_value) 117 | return parameter_values 118 | -------------------------------------------------------------------------------- /hippolyte/project_config.py: -------------------------------------------------------------------------------- 1 | ACCOUNT_CONFIGS = { 2 | '123456789100': { 3 | 'name': 'example-account', 4 | 'emr_subnet': 'example-subnet-id', 5 | 'log_bucket': 'hippolyte-eu-west-1-prod-backups', 6 | 'backup_bucket': 'hippolyte-eu-west-1-prod-backups', 7 | 'exclude_from_backup': [ 8 | 'example-table-*' 9 | ], 10 | 'always_backup': [ 11 | 'this-is-not-an-example-table-1' 12 | ] 13 | } 14 | } 15 | 
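Note on ACCOUNT_CONFIGS: the exclude_from_backup and always_backup lists above are consumed by hippolyte/dynamodb_backup.py (get_table_descriptions), which is not reproduced in this dump. Judging by tests/test_dynamodb_backup.py, the exclusions are applied as regular expressions and always_backup overrides any exclusion. The snippet below is a minimal sketch of that selection logic under those assumptions; select_tables_for_backup is a hypothetical helper name, not part of the codebase.

import re


def select_tables_for_backup(table_names, exclude_from_backup, always_backup):
    """Keep a table if it is explicitly whitelisted, or if it matches no exclusion pattern."""
    exclude_patterns = [re.compile(pattern) for pattern in exclude_from_backup]

    return [name for name in table_names
            if name in always_backup
            or not any(pattern.match(name) for pattern in exclude_patterns)]


# Mirrors the expectations in tests/test_dynamodb_backup.py: '.*flux.*' excludes
# 'prd-shd-euw1-flux-table1', while a table listed in always_backup is kept even
# when its name matches an exclusion pattern.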
-------------------------------------------------------------------------------- /hippolyte/utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | MAX_TABLES_PER_PIPELINE = 32 4 | READ_BLOCK_SIZE_BYTES = 4096 5 | MAX_ALLOWED_PROVISIONED_READ_THROUGHPUT = 1000 6 | MAX_MSG_BULK_READ = 100 7 | MAX_BULK_READ_SIZE_BYTES = 16777216 8 | MAX_DURATION_SEC = 12 * 3600 9 | MAX_DURATION_SINGLE_PIPELINE = 3300 #less than 1h 10 | ACTIVITY_BOOTSTRAP_TIME = 60 11 | EMR_BOOTSTRAP_TIME = 600 12 | INITIAL_READ_THROUGHPUT_PERCENT = 0.5 13 | TIME_IN_BETWEEN_BACKUPS = 86400 14 | 15 | 16 | def estimate_backup_duration(read_throughput_percent, table_size_bytes, read_capacity_units): 17 | """ 18 | Gives rough estimate, on how long backing up dynamo db table will take. 19 | :param table_size_bytes: 20 | :param read_capacity_units 21 | :return: Estimated time in seconds. 22 | """ 23 | read_bytes_per_second = read_capacity_units * read_throughput_percent * READ_BLOCK_SIZE_BYTES 24 | 25 | return table_size_bytes / read_bytes_per_second 26 | 27 | 28 | def compute_required_throughput(estimated_duration, target_duration, read_capacity_units, read_throughput_percent): 29 | """ 30 | :param estimated_duration: estimated duration using current: read_capacity_units, read_throughput_percent 31 | :param target_duration: how long should backup take 32 | :param read_capacity_units: current provisioned read capacity 33 | :param read_throughput_percent: current backup read throughput as % of total read throughput 34 | :return: new read throughput with new read throughput percent 35 | """ 36 | ratio = estimated_duration / float(target_duration) 37 | new_read_capacity_units = read_capacity_units * (ratio + 1) 38 | new_read_throughput_percent = float(read_capacity_units) / new_read_capacity_units 39 | 40 | new_read_throughput_percent = 1 - max(new_read_throughput_percent, 0.01) 41 | 42 | return int(round(new_read_capacity_units)), round(new_read_throughput_percent, 2) 43 | 44 | 45 | def get_date_suffix(): 46 | return datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") 47 | 48 | 49 | def chunks(l, n): 50 | for i in range(0, len(l), n): 51 | yield l[i:i + n] 52 | 53 | 54 | def list_tables_in_definition(pipeline_definition): 55 | nodes = pipeline_definition.get('objects') 56 | table_nodes = filter(lambda x: 'tableName' in x, nodes) 57 | 58 | return map(lambda x: x['tableName'], table_nodes) 59 | 60 | 61 | def get_first_element_in_the_list_with(l, key, value): 62 | element = filter(lambda x: x[key] == value, l) 63 | 64 | if element: 65 | return element[0] 66 | 67 | return None 68 | 69 | -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | boto3==1.4.4 2 | pystache==0.5.4 3 | retrying==1.3.3 4 | mock==2.0.0 5 | moto==1.0.1 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pystache==0.5.4 2 | retrying==1.3.3 -------------------------------------------------------------------------------- /serverless.yml: -------------------------------------------------------------------------------- 1 | service: hippolyte 2 | 3 | provider: 4 | name: aws 5 | runtime: python2.7 6 | role: LambdaRole 7 | 8 | # If you have multiple accounts then it's recommended to use multiple profiles to store AWS credentials 9 | # The 'custom' 
sections for the stage/profiles key value map. 10 | region: ${opt:region, self:custom.defaultRegion} 11 | stage: ${opt:stage, self:custom.defaultStage} 12 | profile: ${self:custom.profiles.${self:provider.stage}} 13 | 14 | plugins: 15 | - serverless-python-requirements 16 | 17 | custom: 18 | defaultStage: dev 19 | defaultRegion: us-east-1 20 | profiles: 21 | # These should correspond to the account credentails stored in ~/.aws/credentials 22 | dev: devProfile 23 | prod: prodProfile 24 | 25 | 26 | functions: 27 | backup: 28 | handler: hippolyte.dynamodb_backup.lambda_handler 29 | events: 30 | # The lambda function depends on the names of these events to determine in which mode to run 31 | - schedule: 32 | name: hippolyte-${self:provider.stage}-backup-event 33 | rate: cron(10 0 * * ? *) 34 | - schedule: 35 | name: hippolyte-${self:provider.stage}-monitor-dynamodb-backup 36 | rate: cron(15 1-10 * * ? *) 37 | 38 | resources: 39 | Resources: 40 | LambdaRole: 41 | Type: "AWS::IAM::Role" 42 | Properties: 43 | AssumeRolePolicyDocument: 44 | Version: "2012-10-17" 45 | Statement: 46 | - Effect: Allow 47 | Principal: 48 | Service: 49 | - lambda.amazonaws.com 50 | Action: "sts:AssumeRole" 51 | ManagedPolicyArns: 52 | - "arn:aws:iam::aws:policy/AmazonDynamoDBFullAccesswithDataPipeline" 53 | Policies: 54 | - PolicyName: LambdaPolicy 55 | PolicyDocument: 56 | Version: "2012-10-17" 57 | Statement: 58 | - Effect: Allow 59 | Action: 60 | - logs:CreateLogGroup 61 | - logs:CreateLogStream 62 | - logs:PutLogEvents 63 | Resource: 64 | - Fn::Sub: "arn:aws:logs:${self:provider.region}:*:log-group:/aws/lambda/*:*:*" 65 | - Effect: "Allow" 66 | Action: 67 | - "sns:Publish" 68 | Resource: {"Ref": "EmailNotificationTopic"} 69 | BackupBucket: 70 | Type: "AWS::S3::Bucket" 71 | Properties: 72 | BucketName: hippolyte-${self:provider.region}-${self:provider.stage}-backups 73 | EmailNotificationTopic: 74 | Type: "AWS::SNS::Topic" 75 | Properties: 76 | TopicName: hippolyte-backup-monitoring 77 | Subscription: 78 | - Protocol: email 79 | Endpoint: ${opt:email, self:custom.defaultEmail} 80 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ocadotechnology/hippolyte/a8f95f7a7de7a8499c89029106941ee0ea3a62d7/tests/__init__.py -------------------------------------------------------------------------------- /tests/resources/test_backup_metadata.json: -------------------------------------------------------------------------------- 1 | { 2 | "Pipelines": [ 3 | { 4 | "backed_up_tables": [ 5 | "prd-shd-euw1-scotty_audit-actions" 6 | ], 7 | "definition": { 8 | "objects": [ 9 | { 10 | "amiVersion": "3.9.0", 11 | "bootstrapAction": "s3://eu-west-1.elasticmapreduce/bootstrap-actions/configure-hadoop, --yarn-key-value,yarn.nodemanager.resource.memory-mb=11520,--yarn-key-value,yarn.scheduler.maximum-allocation-mb=11520,--yarn-key-value,yarn.scheduler.minimum-allocation-mb=1440,--yarn-key-value,yarn.app.mapreduce.am.resource.mb=2880,--mapred-key-value,mapreduce.map.memory.mb=5760,--mapred-key-value,mapreduce.map.java.opts=-Xmx4608M,--mapred-key-value,mapreduce.reduce.memory.mb=2880,--mapred-key-value,mapreduce.reduce.java.opts=-Xmx2304m,--mapred-key-value,mapreduce.map.speculative=false", 12 | "coreInstanceCount": "1", 13 | "coreInstanceType": "m3.xlarge", 14 | "id": "EmrClusterForBackup", 15 | "masterInstanceType": "m3.xlarge", 16 | "name": "EmrClusterForBackup", 17 | 
"region": "eu-west-1", 18 | "subnetId": "subnet-9f2395c6", 19 | "terminateAfter": "5 Hour", 20 | "type": "EmrCluster" 21 | }, 22 | { 23 | "failureAndRerunMode": "CASCADE", 24 | "id": "Default", 25 | "name": "Default", 26 | "pipelineLogUri": "s3://euw1-dynamodb-backups-prd-480503113116/logs/2017-06-22-13-53-48/", 27 | "resourceRole": "DataPipelineDefaultResourceRole", 28 | "role": "DataPipelineDefaultRole", 29 | "scheduleType": "ONDEMAND" 30 | }, 31 | { 32 | "id": "DDBSourceTable5", 33 | "name": "DDBSourceTable5", 34 | "readThroughputPercent": "0.5", 35 | "tableName": "prd-shd-euw1-scotty_audit-actions", 36 | "type": "DynamoDBDataNode" 37 | }, 38 | { 39 | "id": "TableBackupActivity5", 40 | "input": { 41 | "ref": "DDBSourceTable5" 42 | }, 43 | "maximumRetries": "2", 44 | "name": "TableBackupActivity5", 45 | "output": { 46 | "ref": "S3BackupLocation5" 47 | }, 48 | "resizeClusterBeforeRunning": "false", 49 | "runsOn": { 50 | "ref": "EmrClusterForBackup" 51 | }, 52 | "step": "s3://dynamodb-emr-eu-west-1/emr-ddb-storage-handler/2.1.0/emr-ddb-2.1.0.jar,org.apache.hadoop.dynamodb.tools.DynamoDbExport,#{output.directoryPath},#{input.tableName},#{input.readThroughputPercent}", 53 | "type": "EmrActivity" 54 | }, 55 | { 56 | "directoryPath": "s3://euw1-dynamodb-backups-prd-480503113116/prd-shd-euw1-scotty_audit-actions/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}", 57 | "id": "S3BackupLocation5", 58 | "name": "S3BackupLocation5", 59 | "type": "S3DataNode" 60 | } 61 | ], 62 | "parameters": [], 63 | "values": {} 64 | }, 65 | "pipeline_id": "df-0024453ANW0OBWGL7YE" 66 | } 67 | ], 68 | "ScalableTargets": [ 69 | { 70 | "CreationTime": "2017-06-26 15:21:59.485000+01:00", 71 | "MaxCapacity": 20, 72 | "MinCapacity": 1, 73 | "ResourceId": "table/prd-shd-euw1-scotty_audit-actions", 74 | "RoleARN": "arn:aws:iam::480503113116:role/service-role/DynamoDBAutoscaleRole", 75 | "ScalableDimension": "dynamodb:table:ReadCapacityUnits", 76 | "ServiceNamespace": "dynamodb" 77 | } 78 | ], 79 | "ScalingPolicies": [ 80 | { 81 | "Alarms": [ 82 | { 83 | "AlarmARN": "arn:aws:cloudwatch:eu-west-1:480503113116:alarm:TargetTracking-table/prd-shd-euw1-scotty_audit-actions-AlarmHigh-13a0215d-5b1f-4ddc-ad7e-5d58ba296a66", 84 | "AlarmName": "TargetTracking-table/prd-shd-euw1-scotty_audit-actions-AlarmHigh-13a0215d-5b1f-4ddc-ad7e-5d58ba296a66" 85 | }, 86 | { 87 | "AlarmARN": "arn:aws:cloudwatch:eu-west-1:480503113116:alarm:TargetTracking-table/prd-shd-euw1-scotty_audit-actions-AlarmLow-ea7fbdd6-1aa4-4e9c-9ab9-66d8d7183e22", 88 | "AlarmName": "TargetTracking-table/prd-shd-euw1-scotty_audit-actions-AlarmLow-ea7fbdd6-1aa4-4e9c-9ab9-66d8d7183e22" 89 | }, 90 | { 91 | "AlarmARN": "arn:aws:cloudwatch:eu-west-1:480503113116:alarm:TargetTracking-table/prd-shd-euw1-scotty_audit-actions-ProvisionedCapacityHigh-3acc8c06-62cd-4c1e-95e4-6f22a911a2ef", 92 | "AlarmName": "TargetTracking-table/prd-shd-euw1-scotty_audit-actions-ProvisionedCapacityHigh-3acc8c06-62cd-4c1e-95e4-6f22a911a2ef" 93 | }, 94 | { 95 | "AlarmARN": "arn:aws:cloudwatch:eu-west-1:480503113116:alarm:TargetTracking-table/prd-shd-euw1-scotty_audit-actions-ProvisionedCapacityLow-a8f80034-8f75-4860-b2a3-2fada5cce22d", 96 | "AlarmName": "TargetTracking-table/prd-shd-euw1-scotty_audit-actions-ProvisionedCapacityLow-a8f80034-8f75-4860-b2a3-2fada5cce22d" 97 | } 98 | ], 99 | "CreationTime": "2017-06-26 15:21:59.571000+01:00", 100 | "PolicyARN": 
"arn:aws:autoscaling:eu-west-1:480503113116:scalingPolicy:ddb81ffd-483d-4b37-8e38-1440e5d7d37d:resource/dynamodb/table/prd-shd-euw1-scotty_audit-actions:policyName/DynamoDBReadCapacityUtilization:table/prd-shd-euw1-scotty_audit-actions", 101 | "PolicyName": "DynamoDBReadCapacityUtilization:table/prd-shd-euw1-scotty_audit-actions", 102 | "PolicyType": "TargetTrackingScaling", 103 | "ResourceId": "table/prd-shd-euw1-scotty_audit-actions", 104 | "ScalableDimension": "dynamodb:table:ReadCapacityUnits", 105 | "ServiceNamespace": "dynamodb", 106 | "TargetTrackingScalingPolicyConfiguration": { 107 | "PredefinedMetricSpecification": { 108 | "PredefinedMetricType": "DynamoDBReadCapacityUtilization" 109 | }, 110 | "TargetValue": 70.0 111 | } 112 | } 113 | ], 114 | "Tables": [ 115 | { 116 | "ResponseMetadata": { 117 | "HTTPHeaders": { 118 | "content-length": "1202", 119 | "content-type": "application/x-amz-json-1.0", 120 | "date": "Thu, 22 Jun 2017 13:53:47 GMT", 121 | "x-amz-crc32": "3477012252", 122 | "x-amzn-requestid": "O02UK4SOAUB6CP49O5US1K1O63VV4KQNSO5AEMVJF66Q9ASUAAJG" 123 | }, 124 | "HTTPStatusCode": 200, 125 | "RequestId": "O02UK4SOAUB6CP49O5US1K1O63VV4KQNSO5AEMVJF66Q9ASUAAJG" 126 | }, 127 | "Table": { 128 | "AttributeDefinitions": [ 129 | { 130 | "AttributeName": "id", 131 | "AttributeType": "S" 132 | }, 133 | { 134 | "AttributeName": "startDate", 135 | "AttributeType": "S" 136 | }, 137 | { 138 | "AttributeName": "startTime", 139 | "AttributeType": "S" 140 | } 141 | ], 142 | "CreationDateTime": "2015-06-15 16:12:43.471000+00:00", 143 | "GlobalSecondaryIndexes": [ 144 | { 145 | "IndexArn": "arn:aws:dynamodb:eu-west-1:480503113116:table/prd-shd-euw1-scotty_audit-actions/index/StartDateIndex", 146 | "IndexName": "StartDateIndex", 147 | "IndexSizeBytes": 4010436256, 148 | "IndexStatus": "ACTIVE", 149 | "ItemCount": 2091475, 150 | "KeySchema": [ 151 | { 152 | "AttributeName": "startDate", 153 | "KeyType": "HASH" 154 | }, 155 | { 156 | "AttributeName": "startTime", 157 | "KeyType": "RANGE" 158 | } 159 | ], 160 | "Projection": { 161 | "ProjectionType": "ALL" 162 | }, 163 | "ProvisionedThroughput": { 164 | "NumberOfDecreasesToday": 0, 165 | "ReadCapacityUnits": 100, 166 | "WriteCapacityUnits": 50 167 | } 168 | } 169 | ], 170 | "ItemCount": 2091475, 171 | "KeySchema": [ 172 | { 173 | "AttributeName": "id", 174 | "KeyType": "HASH" 175 | } 176 | ], 177 | "ProvisionedThroughput": { 178 | "LastDecreaseDateTime": "2017-06-21 09:37:33.736000+00:00", 179 | "LastIncreaseDateTime": "2017-04-26 00:10:44.103000+00:00", 180 | "NumberOfDecreasesToday": 0, 181 | "ReadCapacityUnits": 350, 182 | "WriteCapacityUnits": 50 183 | }, 184 | "TableArn": "arn:aws:dynamodb:eu-west-1:480503113116:table/prd-shd-euw1-scotty_audit-actions", 185 | "TableName": "prd-shd-euw1-scotty_audit-actions", 186 | "TableSizeBytes": 4010436256, 187 | "TableStatus": "ACTIVE" 188 | } 189 | } 190 | ] 191 | } -------------------------------------------------------------------------------- /tests/test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | 4 | 5 | if __name__ == '__main__': 6 | runner = unittest.TextTestRunner() 7 | suite = unittest.TestLoader().discover('.') 8 | exit_code = not runner.run(suite).wasSuccessful() 9 | sys.exit(exit_code) 10 | -------------------------------------------------------------------------------- /tests/test_dynamodb_backup.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import 
boto3 3 | import json 4 | import sys 5 | import os 6 | from moto import mock_dynamodb2 7 | 8 | sys.path.append(os.path.join(os.getcwd() + '/../code')) 9 | 10 | from hippolyte.dynamodb_backup import get_table_descriptions 11 | from test_utils import create_test_table, load_backup_metadata 12 | 13 | 14 | class TestDynamoDbBackup(unittest.TestCase): 15 | @mock_dynamodb2 16 | def test_get_table_descriptions(self): 17 | dynamodb_client = boto3.client('dynamodb', region_name='eu-west-1') 18 | 19 | backup_metadata = load_backup_metadata() 20 | table_descriptions = json.loads(backup_metadata)['Tables'] 21 | 22 | always_backup = ['prd-mol-euw1-fluxcapacitor-fluxcapacitor-alertdefinition', 23 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-alertdefinitionhistory', 24 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-dashboard', 25 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-eventtype', 26 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-fluxcontext', 27 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-kinesisstream', 28 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-metricdefinition', 29 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-metricdefinitionhistory', 30 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-pagerdutyServiceIntegration', 31 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-snssubscription', 32 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-system', 33 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-systemaccessrule', 34 | 'prd-mol-euw1-fluxcapacitor-fluxcapacitor-useraccessrule' 35 | ] 36 | 37 | for table_name in always_backup + [ 38 | 'prd-cymes-euw1-commsflux--alertdefinition', 39 | 'prd-cymes-euw1-fluxcapacitor-prd-cymes-euw1-fluxcapacitor-alert', 40 | 'prd-cymes-euw1-fluxcapacitor-prd-cymes-euw1-storepick-flux-events-192.168.1.39-kinesis-consumer', 41 | 'prd-shd-euw1-flux-table1', 42 | 'prd-shd-euw1-smth-flux', 43 | 'prd-mol-euw1-storepickreporting-ContainerReport-snapshots', 44 | 'prd-mol-euw1-will-be-backed-up', 45 | 'prd-mol-euw1-will-be-backed-up-snapshots' 46 | ]: 47 | create_test_table(dynamodb_client, table_name, table_descriptions[0]['Table']) 48 | 49 | exclude_from_backup = [ 50 | '.*flux.*', 51 | '.*storepickreporting-.*-snapshots' 52 | ] 53 | 54 | table_descriptions = get_table_descriptions(exclude_from_backup, always_backup) 55 | included_tables = map(lambda x: x['Table']['TableName'], table_descriptions) 56 | included_tables.sort() 57 | 58 | expected_tables = always_backup + [ 59 | 'prd-mol-euw1-will-be-backed-up', 60 | 'prd-mol-euw1-will-be-backed-up-snapshots' 61 | ] 62 | expected_tables.sort() 63 | 64 | self.assertListEqual(included_tables, expected_tables) 65 | -------------------------------------------------------------------------------- /tests/test_dynamodb_booster.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import boto3 3 | import json 4 | import sys 5 | import os 6 | from botocore.exceptions import ClientError 7 | from moto import mock_s3, mock_datapipeline, mock_dynamodb2 8 | from mock import patch, Mock 9 | 10 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 11 | 12 | import hippolyte.dynamodb_booster 13 | import hippolyte.aws_utils 14 | from test_utils import create_test_table, load_backup_metadata 15 | 16 | TABLE_NAME = 'prd-shd-euw1-scotty_audit-actions' 17 | 18 | 19 | def create_backup_metadata(s3_client, bucket, key, body): 20 | s3_client.create_bucket(Bucket=bucket) 21 | s3_client.put_object(Bucket=bucket, Key=key, Body=body) 22 | 23 | 24 | def get_old_rcu_and_boost(table_descriptions, 
new_read_capacity): 25 | old = table_descriptions[0]['Table']['ProvisionedThroughput']['ReadCapacityUnits'] 26 | table_descriptions[0]['Table']['ProvisionedThroughput']['ReadCapacityUnits'] = new_read_capacity 27 | 28 | return old 29 | 30 | 31 | class FakeApplicationAutoscalingClient(): 32 | def __init__(self): 33 | self.scalable_targets = [] 34 | self.scaling_policies = [] 35 | 36 | def get_paginator(self, paginator_name): 37 | _list = [] 38 | _key = '' 39 | if paginator_name == 'describe_scalable_targets': 40 | _list = self.scalable_targets 41 | _key = 'ScalableTargets' 42 | elif paginator_name == 'describe_scaling_policies': 43 | _list = self.scaling_policies 44 | _key = 'ScalingPolicies' 45 | 46 | paginator = Mock() 47 | paginator.paginate = Mock(return_value=[{ 48 | _key: _list 49 | }]) 50 | 51 | return paginator 52 | 53 | def delete_scaling_policy(self, PolicyName, ServiceNamespace, ResourceId, ScalableDimension): 54 | before_delete = len(self.scaling_policies) 55 | self.scaling_policies = filter( 56 | lambda x: x['PolicyName'] != PolicyName or x['ScalableDimension'] != ScalableDimension, 57 | self.scaling_policies) 58 | 59 | if len(self.scaling_policies) == before_delete: 60 | raise ClientError( 61 | { 62 | 'Error': 63 | { 64 | 'Code': 'ObjectNotFoundException', 65 | 'Message': 'No scaling policy found for service namespace: dynamodb, resource ID: {},' 66 | ' scalable dimension: {}: ObjectNotFoundException'.format(ResourceId, 67 | ScalableDimension) 68 | } 69 | }, 70 | 'DeleteScalingPolicy' 71 | ) 72 | 73 | def deregister_scalable_target(self, ServiceNamespace, ResourceId, ScalableDimension): 74 | before_delete = len(self.scalable_targets) 75 | self.scalable_targets = filter( 76 | lambda x: x['ResourceId'] != ResourceId or x['ScalableDimension'] != ScalableDimension, 77 | self.scalable_targets) 78 | 79 | if len(self.scalable_targets) == before_delete: 80 | raise ClientError( 81 | { 82 | 'Error': 83 | { 84 | 'Code': 'ObjectNotFoundException', 85 | 'Message': 'No scalable target found for service namespace: dynamodb, resource ID: {},' 86 | ' scalable dimension: {}: ObjectNotFoundException'.format(ResourceId, 87 | ScalableDimension) 88 | } 89 | }, 90 | 'DeregisterScalableTarget' 91 | ) 92 | 93 | def put_scaling_policy(self, PolicyName, ServiceNamespace, ResourceId, ScalableDimension, 94 | PolicyType, TargetTrackingScalingPolicyConfiguration): 95 | self.scaling_policies.append({ 96 | "PolicyName": PolicyName, 97 | "ServiceNamespace": ServiceNamespace, 98 | "ResourceId": ResourceId, 99 | "ScalableDimension": ScalableDimension, 100 | "PolicyType": PolicyType, 101 | "TargetTrackingScalingPolicyConfiguration": TargetTrackingScalingPolicyConfiguration 102 | }) 103 | 104 | def register_scalable_target(self, ServiceNamespace, ResourceId, ScalableDimension, 105 | MinCapacity, MaxCapacity, RoleARN): 106 | self.scalable_targets.append({ 107 | "ServiceNamespace": ServiceNamespace, 108 | "ResourceId": ResourceId, 109 | "ScalableDimension": ScalableDimension, 110 | "MinCapacity": MinCapacity, 111 | "MaxCapacity": MaxCapacity, 112 | "RoleARN": RoleARN 113 | }) 114 | 115 | 116 | class TestDynamoDbBooster(unittest.TestCase): 117 | @mock_dynamodb2 118 | @mock_datapipeline 119 | @mock_s3 120 | @patch("hippolyte.config_util.ConfigUtil.list_backed_up_tables", return_value=TABLE_NAME) 121 | @patch("hippolyte.aws_utils.ApplicationAutoScalingUtil._init_client", 122 | return_value=FakeApplicationAutoscalingClient()) 123 | def test_restore_throughput(self, config_mock, autoscaling_mock): 124 | dynamodb_client = 
boto3.client('dynamodb', region_name='eu-west-1') 125 | s3_client = boto3.client('s3') 126 | 127 | backup_metadata = load_backup_metadata() 128 | table_descriptions = json.loads(backup_metadata)['Tables'] 129 | create_test_table(dynamodb_client, TABLE_NAME, table_descriptions[0]['Table']) 130 | old_rcu = get_old_rcu_and_boost(table_descriptions, 1000) 131 | 132 | bucket = 'euw1-dynamodb-backups-prd-480503113116' 133 | booster = hippolyte.dynamodb_booster.DynamoDbBooster(table_descriptions, bucket, 0.5) 134 | create_backup_metadata(s3_client, bucket, 'backup_metadata-2099-06-06-00-00-01', backup_metadata) 135 | 136 | booster.restore_throughput() 137 | 138 | table = dynamodb_client.describe_table(TableName=TABLE_NAME) 139 | self.assertEqual(table['Table']['ProvisionedThroughput']['ReadCapacityUnits'], old_rcu) 140 | 141 | @mock_dynamodb2 142 | @mock_datapipeline 143 | @mock_s3 144 | @patch("hippolyte.aws_utils.ApplicationAutoScalingUtil._init_client", 145 | return_value=FakeApplicationAutoscalingClient()) 146 | def test_autoscaling_support(self, autoscaling_mock): 147 | backup_metadata = load_backup_metadata() 148 | backup_metadata_dict = json.loads(backup_metadata) 149 | table_descriptions = backup_metadata_dict['Tables'] 150 | scaling_policies = backup_metadata_dict['ScalingPolicies'] 151 | scalable_targets = backup_metadata_dict['ScalableTargets'] 152 | 153 | booster = hippolyte.dynamodb_booster.DynamoDbBooster(table_descriptions, 'foo', 0.5) 154 | autoscaling_util = booster.application_auto_scaling_util 155 | 156 | for policy in scaling_policies: 157 | autoscaling_util.put_scaling_policy(policy['PolicyName'], 158 | policy['ServiceNamespace'], 159 | policy['ResourceId'], 160 | policy['ScalableDimension'], 161 | policy['PolicyType'], 162 | policy['TargetTrackingScalingPolicyConfiguration']) 163 | 164 | for target in scalable_targets: 165 | autoscaling_util.register_scalable_target(target['ServiceNamespace'], 166 | target['ResourceId'], 167 | target['ScalableDimension'], 168 | target['MinCapacity'], 169 | target['MaxCapacity'], 170 | target['RoleARN']) 171 | 172 | scaling_policies_before = autoscaling_util.describe_scaling_policies("dynamodb").get('ScalingPolicies') 173 | scalable_targets_before = autoscaling_util.describe_scalable_targets("dynamodb").get('ScalableTargets') 174 | 175 | booster.disable_auto_scaling(scaling_policies, scalable_targets) 176 | 177 | self.assertFalse(autoscaling_util.describe_scaling_policies("dynamodb").get('ScalingPolicies')) 178 | self.assertFalse(autoscaling_util.describe_scalable_targets("dynamodb").get('ScalableTargets')) 179 | 180 | booster.reenable_auto_scaling(backup_metadata_dict) 181 | 182 | scaling_policies_after = autoscaling_util.describe_scaling_policies("dynamodb").get('ScalingPolicies') 183 | scalable_targets_after = autoscaling_util.describe_scalable_targets("dynamodb").get('ScalableTargets') 184 | 185 | self.assertListEqual(scaling_policies_before, scaling_policies_after) 186 | self.assertListEqual(scalable_targets_before, scalable_targets_after) 187 | 188 | @mock_dynamodb2 189 | @mock_datapipeline 190 | @mock_s3 191 | @patch('hippolyte.dynamodb_booster.logger') 192 | @patch("hippolyte.aws_utils.ApplicationAutoScalingUtil._init_client", 193 | return_value=FakeApplicationAutoscalingClient()) 194 | def test_disable_autoscaling_warns_on_missing_resources(self, logger_mock, autoscaling_mock): 195 | backup_metadata = load_backup_metadata() 196 | backup_metadata_dict = json.loads(backup_metadata) 197 | table_descriptions = 
backup_metadata_dict['Tables'] 198 | scaling_policies = backup_metadata_dict['ScalingPolicies'] 199 | scalable_targets = backup_metadata_dict['ScalableTargets'] 200 | 201 | hippolyte.dynamodb_booster.logger = logger_mock 202 | booster = hippolyte.dynamodb_booster.DynamoDbBooster(table_descriptions, 'foo', 0.5) 203 | autoscaling_util = booster.application_auto_scaling_util 204 | 205 | for policy in scaling_policies: 206 | autoscaling_util.put_scaling_policy(policy['PolicyName'], 207 | policy['ServiceNamespace'], 208 | policy['ResourceId'], 209 | 'NonExisting', 210 | policy['PolicyType'], 211 | policy['TargetTrackingScalingPolicyConfiguration']) 212 | 213 | for target in scalable_targets: 214 | autoscaling_util.register_scalable_target(target['ServiceNamespace'], 215 | target['ResourceId'], 216 | 'NonExisting', 217 | target['MinCapacity'], 218 | target['MaxCapacity'], 219 | target['RoleARN']) 220 | 221 | booster.disable_auto_scaling(scaling_policies, scalable_targets) 222 | self.assertEqual(logger_mock.warn.call_count, 2) 223 | -------------------------------------------------------------------------------- /tests/test_monitor.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import boto3 3 | import sys 4 | import os 5 | from moto import mock_s3, mock_datapipeline, mock_sns 6 | from datetime import datetime, timedelta 7 | 8 | sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) 9 | 10 | from hippolyte.monitor import Monitor, is_backup_from_current_batch 11 | 12 | 13 | class TestESMonitor(unittest.TestCase): 14 | @mock_sns 15 | @mock_datapipeline 16 | @mock_s3 17 | def test_success_on_large_number_of_backup_files(self): 18 | bucket = 'euw1-dynamodb-backups-prd-480503113116' 19 | log_bucket = 'euw1-infrastructure_logs-prd-480503113116' 20 | s3 = boto3.client('s3', region_name='eu-west-1') 21 | s3.create_bucket(Bucket=bucket) 22 | 23 | for day in range(1, 30): 24 | key = 'prd-shd-euw1-scotty_audit-events/2099-05-{}-00-10-38/'.format(str(day).zfill(2)) 25 | for file_name in range(0, 100): 26 | s3.put_object(Bucket=bucket, Key=key + str(file_name), Body='') 27 | 28 | s3.put_object(Bucket=bucket, Key=key + '_SUCCESS', Body='') 29 | 30 | dummy_pipeline = { 31 | "definition": { 32 | "objects": [ 33 | { 34 | "directoryPath": "s3://euw1-dynamodb-backups-prd-480503113116/prd-shd-euw1-scotty_audit-events/#{format(@scheduledStartTime, 'YYYY-MM-dd-HH-mm-ss')}" 35 | } 36 | ] 37 | } 38 | } 39 | 40 | monitor = Monitor('480503113116', log_bucket, bucket, 'dummy_sns') 41 | failed_tables = monitor.extract_failed_tables(dummy_pipeline) 42 | 43 | self.assertFalse(failed_tables) 44 | 45 | def test_is_backup_from_current_batch_success(self): 46 | last_modified = datetime.utcnow() 47 | self.assertTrue(is_backup_from_current_batch({'LastModified': last_modified})) 48 | 49 | def test_is_backup_from_current_batch_failure(self): 50 | last_modified = datetime.utcnow() - timedelta(hours=26) 51 | self.assertFalse(is_backup_from_current_batch({'LastModified': last_modified})) -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | from copy import deepcopy 3 | 4 | 5 | def create_test_table(dynamodb_client, table_name, table): 6 | _table = deepcopy(table) 7 | del _table['ProvisionedThroughput']["LastIncreaseDateTime"] 8 | del _table['ProvisionedThroughput']["LastDecreaseDateTime"] 9 | del 
_table['ProvisionedThroughput']["NumberOfDecreasesToday"]
10 | 
11 |     dynamodb_client.create_table(TableName=table_name,
12 |                                  AttributeDefinitions=_table['AttributeDefinitions'],
13 |                                  KeySchema=_table['KeySchema'],
14 |                                  ProvisionedThroughput=_table['ProvisionedThroughput']
15 |                                  )
16 | 
17 | 
18 | def load_backup_metadata():
19 |     metadata_file = os.path.join(os.path.dirname(__file__), 'resources/test_backup_metadata.json')
20 |     with open(metadata_file) as f:
21 |         return f.read()
--------------------------------------------------------------------------------
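Usage note: the two helpers in tests/test_utils.py are meant to be combined with moto, as tests/test_dynamodb_backup.py and tests/test_dynamodb_booster.py do. The sketch below shows that minimal pattern; the test class and table name are illustrative only, not part of the test suite.

import json
import unittest

import boto3
from moto import mock_dynamodb2

from test_utils import create_test_table, load_backup_metadata


class TestBackupFixture(unittest.TestCase):
    @mock_dynamodb2
    def test_table_created_from_fixture(self):
        dynamodb_client = boto3.client('dynamodb', region_name='eu-west-1')

        # The fixture file carries full describe_table payloads under 'Tables'.
        table_description = json.loads(load_backup_metadata())['Tables'][0]['Table']
        create_test_table(dynamodb_client, 'example-table', table_description)

        table = dynamodb_client.describe_table(TableName='example-table')
        self.assertEqual(table['Table']['TableName'], 'example-table')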