├── .gitignore ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── Excel └── DynamoDB+Cost+Template.xlsx ├── LICENSE ├── NOTICE ├── README.md ├── archived ├── README.md └── table_class_evaluator │ ├── README.md │ ├── requirements.txt │ └── table_class_evaluator.py ├── capacity-mode-evaluator ├── .gitignore ├── README.md ├── capacity_reco.py ├── requirements.txt ├── src │ ├── cost_estimates.py │ ├── dynamodb.py │ ├── frontend.py │ ├── getmetrics.py │ ├── metrics_estimates.py │ └── pricing.py └── static │ └── analysis_summary.csv ├── ddb-migration ├── .npmignore ├── README.md ├── bin │ └── ddb-migration.ts ├── cdk.json ├── documentation │ ├── DDBPlaybookUpdated.png │ └── migration-playbook.md ├── glue-scripts │ ├── direct-migration.py │ └── large-migration.py ├── jest.config.js ├── lambda │ ├── enable-trigger │ │ └── index.py │ ├── setup-check │ │ └── index.py │ ├── stream-processor │ │ └── index.py │ └── write-cdc │ │ └── index.py ├── lib │ ├── account-check │ │ └── index.py │ └── ddb-migration-stack.ts ├── package-lock.json ├── package.json ├── test │ └── ddb-migration.test.ts └── tsconfig.json ├── ddb_cost_tool ├── README.MD ├── __init__.py ├── config │ ├── __init__.py │ └── metrics.json ├── ddb_cost_tool.py ├── ddb_table.py ├── documentation │ ├── Report_view.png │ ├── Table_report_view.png │ ├── Tables_view.png │ ├── ddb-cost-tool-upload.gif │ └── running_script.gif ├── metrics.py ├── region.py └── requirements.txt ├── ddbtools ├── __init__.py ├── constants.py ├── mysql_s3.py ├── pricing.py ├── table.py └── util.py ├── item_size_calculator ├── README.md ├── index.min.js ├── package-lock.json └── package.json ├── metrics-collector ├── README.md ├── documentation │ └── metrics.gif ├── metric_config.json ├── metrics_collector │ ├── __init__.py │ ├── collector.py │ ├── logger_config.py │ ├── metrics.py │ ├── storage.py │ └── utilization_example.py ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── tests │ └── __init__.py ├── reco ├── .gitignore ├── CHANGELOG.md ├── README.md ├── requirements.txt ├── src │ ├── ddb_rc_reco │ │ ├── __init__.py │ │ ├── config.py │ │ └── reco.py │ └── ddbr.py ├── static │ ├── Reserved Capacity-Page-1.png │ └── Reserved Capacity.drawio └── test │ ├── README.md │ ├── test_functional.py │ └── test_unit.py ├── table_class_optimizer ├── DDB_TableClassReco.sql └── README.md └── table_tagger.py /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .vscode 3 | .tool-versions 4 | 5 | ### Python ### 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # Environments 12 | .venv 13 | .poetry -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. 
Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 
60 | -------------------------------------------------------------------------------- /Excel/DynamoDB+Cost+Template.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/Excel/DynamoDB+Cost+Template.xlsx -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /archived/README.md: -------------------------------------------------------------------------------- 1 | # 🗃️ Archived DynamoDB Tools and Code 2 | 3 | This folder contains deprecated tools or code that are no longer actively supported. These resources are kept for historical reference, but should be used with caution as they may not reflect the latest best practices or be compatible with current AWS services and SDKs. 4 | 5 | ## 🕰️ Blast from the Past 6 | 7 | The artifacts stored in this folder represent earlier stages of the Amazon DynamoDB ecosystem. While these samples may no longer be the preferred approach, they can still offer valuable insights into the evolution of DynamoDB and cloud-native development. 8 | 9 | ## 🔍 Explore with Caution 10 | 11 | Before utilizing any of the code or configurations in this folder, please be aware that they may: 12 | 13 | - Rely on outdated AWS services or SDK versions 14 | - Contain security vulnerabilities or antipatterns 15 | - Lack comprehensive documentation and support 16 | 17 | ## 🌱 Nurturing the Future 18 | 19 | While these archived resources are preserved for reference, we encourage you to focus your efforts on the actively maintained examples and solutions in the rest of this DynamoDB repository. The community is continuously working to expand and improve the available DynamoDB content to better serve your needs. 20 | 21 | 22 | ## Archived resources. 23 | 24 | - October 2024: [Python script to help choose the right DynamoDB table class](./table_class_evaluator/README.md) 25 | -------------------------------------------------------------------------------- /archived/table_class_evaluator/README.md: -------------------------------------------------------------------------------- 1 | ## Table Class Evaluator [ARCHIVED] 2 | This tool was archived in October 2024 in favor of [table class optimizer](/table_class_optimizer/README.md). **This archived tool does not provide accurate recommendations because it relies on instantaneous throughput, not historical data like the current, recommended tool.** 3 | 4 | ### Overview 5 | 6 | Amazon DynamoDB supports two [table classes](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/HowItWorks.TableClasses.html): 7 | 8 | - Standard: The default for new tables, this table class balances storage costs and provisioned throughput. 9 | 10 | - Standard Infrequent Access (Standard-IA): This table class offers lower storage pricing and higher throughput pricing compared to the Standard table class. The Standard-IA table class is a good fit for tables where data is not queried frequently, and can be a good choice for tables using the Standard table class where storage costs exceed 50% of total throughput costs.
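For quick context on what the evaluator compares, a table's current class can be checked (and, if desired, changed) directly from the command line. This is an illustrative sketch, not part of the tool: the table name and region are placeholders, and DynamoDB limits how often a table's class may be switched.

```bash
# Check which table class a table currently uses (a missing summary means STANDARD).
aws dynamodb describe-table \
  --table-name my-table \
  --region us-east-1 \
  --query "Table.TableClassSummary.TableClass"

# Switch the table to the Standard-IA class (or back to STANDARD) with a single update.
aws dynamodb update-table \
  --table-name my-table \
  --region us-east-1 \
  --table-class STANDARD_INFREQUENT_ACCESS
```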
11 | 12 | The Table Class Evaluator tool evaluates one or more tables in an AWS region for suitability for the Infrequent Access table class. The tool accomplishes this by calculating costs for both table classes for the following cost dimensions: 13 | 14 | - AWS Region 15 | - Table storage utilization 16 | - Instantaneous provisioned throughput 17 | - Global Tables replicated writes 18 | - Global Secondary Indexes (GSIs) 19 | 20 | The tool will return recommendations for tables that may benefit from a change in table class. 21 | 22 | ### Limitations 23 | 24 | The Table Class Evaluator tool has the following limitations: 25 | 26 | - Estimated costs are calculated from the current (instantaneous) provisioned throughput. If the provisioned capacity of the table being evaluated changes frequently due to Auto Scaling activity, the recommendation could be incorrect. 27 | - Tables using On-Demand pricing are not supported. 28 | - Local Secondary Index costs are not calculated. 29 | 30 | ### Using the Table Class Evaluator tool 31 | 32 | The Table Class Evaluator is a command-line tool written in Python 3, and requires the AWS Python SDK (Boto3) >= 1.23.18. You can find instructions for installing the AWS Python SDK at https://aws.amazon.com/sdk-for-python/. The tool can be run directly from the cloned repository without installation. 33 | 34 | The tool is invoked from the command line like so: 35 | 36 | ```console 37 | user@host$ python3 table_class_evaluator.py --help 38 | usage: table_class_evaluator.py [-h] [--estimates-only] [--region REGION] [--table-name TABLE_NAME] [--profile PROFILE] 39 | 40 | Recommend Amazon DynamoDB table class changes to optimize costs. 41 | 42 | optional arguments: 43 | -h, --help show this help message and exit 44 | --estimates-only print table cost estimates instead of change recommendations 45 | --region REGION evaluate tables in REGION (default: us-east-1) 46 | --table-name TABLE_NAME 47 | evaluate TABLE_NAME (defaults to all tables in region) 48 | --profile PROFILE set a custom profile name to perform the operation under 49 | ``` 50 | 51 | With no arguments, the tool will evaluate costs for all tables in the default region (us-east-1), and return a list of JSON objects, each containing details for a change recommendation: 52 | 53 | ```console 54 | user@host$ python3 table_class_evaluator.py 55 | [{ 56 | "recommendation_type": "CHANGE_TABLE_CLASS", 57 | "recommended_table_class": "STANDARD_INFREQUENT_ACCESS", 58 | "estimated_monthly_savings": "1.35", 59 | "estimate_detail": { 60 | "table_name": "test", 61 | "pricing_data": { 62 | "billing_mode": "PROVISIONED", 63 | "size_in_gb": "13.61", 64 | "provisioned_rcus": 5, 65 | "provisioned_wcus": 5, 66 | "table_arn": "arn:aws:dynamodb:us-east-1:123456789012:table/test", 67 | "table_class": "STANDARD" 68 | }, 69 | "table_mo_costs": { 70 | "std_storage_cost": "3.40", 71 | "std_mo_rcu_cost": "0.47", 72 | "std_mo_wcu_cost": "2.37", 73 | "std_mo_total_cost": "6.25", 74 | "ia_mo_storage_cost": "1.36", 75 | "ia_mo_rcu_cost": "0.58", 76 | "ia_mo_wcu_cost": "2.96", 77 | "ia_mo_total_cost": "4.90", 78 | "total_std_mo_costs": "6.25", 79 | "total_ia_mo_costs": "4.90" 80 | } 81 | } 82 | }] 83 | ``` 84 | 85 | If cost calculations don't reveal any change recommendations, the tool returns an empty list: 86 | 87 | ```console 88 | user@host$ python3 table_class_evaluator.py 89 | [] 90 | ``` -------------------------------------------------------------------------------- /archived/table_class_evaluator/requirements.txt: 
-------------------------------------------------------------------------------- 1 | botocore >= 1.23.18 2 | boto3 >= 1.23.18 -------------------------------------------------------------------------------- /archived/table_class_evaluator/table_class_evaluator.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import argparse 4 | import json 5 | import logging 6 | import sys 7 | 8 | import boto3 9 | 10 | from ddbtools import constants 11 | from ddbtools.table import TableUtility 12 | from ddbtools.util import DecimalEncoder 13 | 14 | 15 | class DynamoDBTableClassCalculator(object): 16 | """Calculate pricing for all table classes, and make optimization recommendations 17 | for tables that may save money by using a different table class. 18 | 19 | Note: This tool assumes a table is not overprovisioned when calculating costs.""" 20 | def __init__(self, args: argparse.Namespace): 21 | self.args = args 22 | 23 | # Versions of boto3 older than 1.20.18 will still run, but don't support the table class attribute in 24 | # the result of describe_table, which would result in assuming all tables used the Standard table class. 25 | # Check the Boto version after import to avoid this situation. 26 | boto_version_elements = boto3.__version__.split('.') 27 | major_version = int(boto_version_elements[0]) 28 | minor_version = int(boto_version_elements[1]) 29 | patch_version = int(boto_version_elements[2]) 30 | 31 | if ((major_version < 1) or 32 | (major_version == 1 and minor_version < 20) or 33 | (major_version == 1 and minor_version == 20 and patch_version < 18)): 34 | message = f"Error: Boto3 >= 1.20.18 required. See https://aws.amazon.com/sdk-for-python/ for more." 35 | print(message) 36 | exit(0) 37 | 38 | self.table_utility = TableUtility(region_name=self.args.region, profile_name=self.args.profile) 39 | 40 | 41 | # Setup logging 42 | log_level = logging.INFO 43 | 44 | root_logger = logging.getLogger() 45 | root_logger.setLevel(log_level) 46 | 47 | root_handler = logging.StreamHandler(sys.stdout) 48 | root_handler.setLevel(log_level) 49 | formatter = logging.Formatter('%(asctime)s: %(message)s') 50 | root_handler.setFormatter(formatter) 51 | root_logger.addHandler(root_handler) 52 | 53 | 54 | def run(self): 55 | """Main program entry point""" 56 | table_names = [] 57 | 58 | try: 59 | if self.args.table_name is not None: 60 | table_names = [self.args.table_name] 61 | else: 62 | table_names = self.table_utility.get_table_names() 63 | 64 | table_cost_estimates = self.table_utility.estimate_table_costs_for_region(table_names, self.args.region) 65 | 66 | if not table_cost_estimates: 67 | print("No table cost results returned.") 68 | exit(0) 69 | 70 | if self.args.estimates_only: 71 | print(json.dumps(table_cost_estimates, cls=DecimalEncoder, indent=2)) 72 | exit(0) 73 | 74 | recommendations = [] 75 | 76 | # evaluate tables costs for storage classes 77 | for table_estimate in table_cost_estimates: 78 | table_pricing_data = table_estimate[constants.PRICING_DATA] 79 | 80 | # skip on-demand tables 81 | if table_pricing_data[constants.BILLING_MODE] == constants.ON_DEMAND_BILLING: 82 | continue 83 | 84 | table_class = table_pricing_data[constants.TABLE_CLASS] 85 | monthly_cost_estimates = table_estimate[constants.ESTIMATED_MONTHLY_COSTS] 86 | 87 | ia_cost_differential = (monthly_cost_estimates[constants.IA_MO_TOTAL_COST] 88 | - monthly_cost_estimates[constants.STD_MO_TOTAL_COST]) 89 | 90 | if ia_cost_differential < 0: 91 | if table_class == 
constants.STD_TABLE_CLASS: 92 | recommendation = {constants.RECOMMENDATION_TYPE: constants.TABLE_CLASS_CHANGE_RECOMMENDATION, 93 | constants.RECOMMENDED_TABLE_CLASS: constants.IA_TABLE_CLASS, 94 | constants.ESTIMATED_MO_SAVINGS: abs(ia_cost_differential), 95 | constants.ESTIMATE_DETAIL: table_estimate} 96 | recommendations.append(recommendation) 97 | 98 | elif ia_cost_differential > 0: 99 | if table_class == constants.IA_TABLE_CLASS: 100 | recommendation = {constants.RECOMMENDATION_TYPE: constants.TABLE_CLASS_CHANGE_RECOMMENDATION, 101 | constants.RECOMMENDED_TABLE_CLASS: constants.STD_TABLE_CLASS, 102 | constants.ESTIMATED_MO_SAVINGS: ia_cost_differential, 103 | constants.ESTIMATE_DETAIL: table_estimate} 104 | recommendations.append(recommendation) 105 | 106 | 107 | output = json.dumps(recommendations, cls=DecimalEncoder, indent=2) 108 | print(output) 109 | exit(0) 110 | except Exception as e: 111 | print(f"Table evaluation failed: {e}") 112 | import traceback 113 | traceback.print_exc() 114 | exit(0) 115 | 116 | 117 | def main(): 118 | parser = argparse.ArgumentParser(description='Recommend Amazon DynamoDB table class changes to optimize costs.') 119 | 120 | parser.add_argument( 121 | '--estimates-only', required=False, action='store_true', 122 | help='print table cost estimates instead of change recommendations') 123 | 124 | parser.add_argument( 125 | '--region', required=False, type=str, default='us-east-1', 126 | help='evaluate tables in REGION (default: us-east-1)') 127 | 128 | parser.add_argument( 129 | '--table-name', required=False, type=str, 130 | help='evaluate TABLE_NAME (defaults to all tables in region)') 131 | 132 | parser.add_argument( 133 | '--profile', required=False, type=str, default=None, 134 | help='set a custom profile name to perform the operation under') 135 | 136 | args = parser.parse_args() 137 | calculator = DynamoDBTableClassCalculator(args) 138 | calculator.run() 139 | 140 | if __name__ == "__main__": 141 | main() 142 | -------------------------------------------------------------------------------- /capacity-mode-evaluator/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | output/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | cdk*.json, 29 | cdk.out/ 30 | requirements*.txt, 31 | source.bat, 32 | python/__pycache__, 33 | tests 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .nox/ 49 | .coverage 50 | .coverage.* 51 | .cache 52 | nosetests.xml 53 | coverage.xml 54 | *.cover 55 | *.py,cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | cover/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | .pybuilder/ 82 | target/ 83 | 84 | # Jupyter Notebook 85 | .ipynb_checkpoints 86 | 87 | # IPython 88 | profile_default/ 89 | ipython_config.py 90 | 91 | # pyenv 92 | # For a library or package, you might want to ignore these files since the code is 93 | # intended to run in multiple environments; otherwise, check them in: 94 | # .python-version 95 | 96 | # pipenv 97 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 98 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 99 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 100 | # install all needed dependencies. 101 | #Pipfile.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/#use-with-ide 116 | .pdm.toml 117 | 118 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 119 | __pypackages__/ 120 | 121 | # Celery stuff 122 | celerybeat-schedule 123 | celerybeat.pid 124 | 125 | # SageMath parsed files 126 | *.sage.py 127 | 128 | # Environments 129 | .env 130 | .venv 131 | env/ 132 | venv/ 133 | ENV/ 134 | env.bak/ 135 | venv.bak/ 136 | 137 | # Spyder project settings 138 | .spyderproject 139 | .spyproject 140 | 141 | # Rope project settings 142 | .ropeproject 143 | 144 | # mkdocs documentation 145 | /site 146 | 147 | # mypy 148 | .mypy_cache/ 149 | .dmypy.json 150 | dmypy.json 151 | 152 | # Pyre type checker 153 | .pyre/ 154 | 155 | # pytype static type analyzer 156 | .pytype/ 157 | 158 | # Cython debug symbols 159 | cython_debug/ 160 | 161 | # PyCharm 162 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 163 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 164 | # and can be added to the global gitignore or merged into this file. For a more nuclear 165 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
166 | #.idea/ 167 | .DS_Store 168 | -------------------------------------------------------------------------------- /capacity-mode-evaluator/capacity_reco.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | import subprocess 4 | from datetime import datetime 5 | from src.dynamodb import DDBScalingInfo 6 | from src.getmetrics import get_metrics 7 | from src.cost_estimates import recommendation_summary 8 | import pandas as pd 9 | import pytz 10 | import os 11 | from tqdm.contrib.concurrent import thread_map 12 | 13 | # Initialize logging 14 | logging.basicConfig(level=logging.INFO) 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | def setup_output_directory(output_dir): 19 | try: 20 | if not os.path.exists(output_dir): 21 | os.makedirs(output_dir) 22 | except Exception as e: 23 | logger.error(f"Error creating directory {output_dir}: {str(e)}") 24 | 25 | 26 | def get_params(args): 27 | params = {} 28 | params['dynamodb_tablename'] = args.dynamodb_tablename 29 | params['dynamodb_read_utilization'] = args.dynamodb_read_utilization 30 | params['dynamodb_write_utilization'] = args.dynamodb_write_utilization 31 | params['dynamodb_minimum_write_unit'] = args.dynamodb_minimum_write_unit 32 | params['dynamodb_maximum_write_unit'] = args.dynamodb_maximum_write_unit 33 | params['dynamodb_minimum_read_unit'] = args.dynamodb_minimum_read_unit 34 | params['dynamodb_maximum_read_unit'] = args.dynamodb_maximum_read_unit 35 | params['number_of_days_look_back'] = args.number_of_days_look_back 36 | params['max_concurrent_tasks'] = args.max_concurrent_tasks 37 | params['show_dashboard'] = args.show_dashboard 38 | 39 | now = datetime.utcnow() 40 | midnight = datetime(now.year, now.month, now.day, 0, 0, 0, tzinfo=pytz.UTC) 41 | params['cloudwatch_metric_end_datatime'] = midnight.strftime( 42 | '%Y-%m-%d %H:%M:%S') 43 | 44 | return params 45 | 46 | 47 | def process_table(args): 48 | table_name, params, debug, filename, dynamodb_table_info = args 49 | current_params = params.copy() 50 | current_params['dynamodb_tablename'] = table_name 51 | 52 | try: 53 | result = get_metrics(current_params) 54 | 55 | metric_df = result[0] 56 | estimate_df = result[1] 57 | 58 | summary_result = recommendation_summary( 59 | current_params, metric_df, estimate_df, dynamodb_table_info) 60 | 61 | if debug: 62 | filename_metrics = os.path.join(output_path, f'metrics_{table_name}.csv') 63 | filename_estimate = os.path.join(output_path, f'metrics_estimate_{table_name}.csv') 64 | filename_cost_estimate = os.path.join(output_path, f'cost_estimate_{table_name}.csv') 65 | metric_df.to_csv(filename_metrics, index=False) 66 | estimate_df.to_csv(filename_estimate, index=False) 67 | summary_result[1].to_csv(filename_cost_estimate, index=False) 68 | 69 | with open(filename, 'a') as analysis_summary: 70 | summary_result[0].to_csv(analysis_summary, index=False, header=not os.path.exists(filename)) 71 | 72 | return summary_result[0] 73 | except Exception as e: 74 | logger.error(f"Error processing table {table_name}: {str(e)}") 75 | return None 76 | 77 | 78 | def process_dynamodb_table(dynamodb_table_info: pd.DataFrame, params: dict, output_path: str, debug: bool) -> pd.DataFrame: 79 | filename = os.path.join(output_path, f'analysis_summary{timestamp}.csv') 80 | with open(filename, 'w') as analysis_summary: 81 | 
analysis_summary.write('base_table_name,index_name,class,metric_name,est_provisioned_cost,current_provisioned_cost,ondemand_cost,recommended_mode,current_mode,status,savings_pct,current_cost,recommended_cost,number_of_days,current_min_capacity,simulated_min_capacity,current_max_capacity,simulated_max_capacity,current_target_utilization,simulated_target_utilizatio,autoscaling_enabled,Note\n') 82 | unique_tables = dynamodb_table_info['base_table_name'].unique() 83 | 84 | args_list = [(table_name, params, debug, filename, dynamodb_table_info) for table_name in unique_tables] 85 | 86 | results = thread_map(process_table, args_list, total=len(args_list), desc="Processing Tables", max_workers=params['max_concurrent_tasks']) 87 | 88 | # Filter out None values and concatenate the valid results 89 | valid_results = [result for result in results if result is not None] 90 | concatenated_summary_result = pd.concat(valid_results) 91 | 92 | return concatenated_summary_result 93 | 94 | 95 | if __name__ == '__main__': 96 | parser = argparse.ArgumentParser( 97 | description='Process DynamoDB table metrics.') 98 | 99 | parser.add_argument('--debug', action='store_true', 100 | help='Save metrics and estimates as CSV files in debug mode') 101 | parser.add_argument('--dynamodb-tablename', type=str, 102 | default=None, help='DynamoDB table name') 103 | parser.add_argument('--dynamodb-read-utilization', 104 | type=int, default=70, help='DynamoDB read utilization') 105 | parser.add_argument('--dynamodb-write-utilization', 106 | type=int, default=70, help='DynamoDB write utilization') 107 | parser.add_argument('--dynamodb-minimum-write-unit', 108 | type=int, default=1, help='DynamoDB minimum write unit') 109 | parser.add_argument('--dynamodb-maximum-write-unit', 110 | type=int, default=80000, help='DynamoDB maximum write unit') 111 | parser.add_argument('--dynamodb-minimum-read-unit', 112 | type=int, default=1, help='DynamoDB minimum read unit') 113 | parser.add_argument('--dynamodb-maximum-read-unit', 114 | type=int, default=80000, help='DynamoDB maximum read unit') 115 | parser.add_argument('--number-of-days-look-back', type=int, 116 | default=14, help='Number of days to look back') 117 | parser.add_argument('--max-concurrent-tasks', type=int, 118 | default=5, help='Maximum number of tasks to run concurrently') 119 | parser.add_argument('--show-dashboard', action='store_true', help='Display results in a GUI for simple visualization') 120 | args = parser.parse_args() 121 | 122 | timestamp = datetime.now().strftime("%Y%m%d-%H%M%S") 123 | output_path = "output" 124 | setup_output_directory(output_path) 125 | logger.info(f"Output directory: {output_path}") 126 | params = get_params(args) 127 | logger.info(f"Parameters: {params}") 128 | DDBinfo = DDBScalingInfo() 129 | dynamo_tables_result = DDBinfo.get_all_dynamodb_autoscaling_settings_with_indexes( 130 | params['dynamodb_tablename'], params['max_concurrent_tasks']) 131 | 132 | dynamo_tables_result.to_csv( 133 | os.path.join(output_path, 'dynamodb_table_info.csv'), index=False) 134 | 135 | process_dynamodb_result = process_dynamodb_table( 136 | dynamo_tables_result, params, output_path, args.debug) 137 | if params['show_dashboard']: 138 | subprocess.run(["streamlit", "run", "./src/frontend.py"]) 139 | -------------------------------------------------------------------------------- /capacity-mode-evaluator/requirements.txt: -------------------------------------------------------------------------------- 1 | futures==2.2.0 2 | numpy==1.24.4 3 | pandas==2.2.1 4 | 
pytz==2021.1 5 | tqdm 6 | boto3 7 | botocore 8 | streamlit -------------------------------------------------------------------------------- /capacity-mode-evaluator/src/dynamodb.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from concurrent.futures import ThreadPoolExecutor 3 | import numpy as np 4 | from tqdm import tqdm 5 | import boto3 6 | import logging 7 | 8 | logging.basicConfig(level=logging.INFO) 9 | logger = logging.getLogger(__name__) 10 | 11 | 12 | class DDBScalingInfo: 13 | def __init__(self): 14 | self.dynamodb_client = boto3.client('dynamodb') 15 | self.app_autoscaling = boto3.client('application-autoscaling') 16 | 17 | def get_dynamodb_autoscaling_settings(self, base_table_name: str, table_storage_class: str, index_name: str = None): 18 | 19 | app_autoscaling = self.app_autoscaling 20 | 21 | resource_id = f"table/{base_table_name}" 22 | if index_name: 23 | resource_id = f"{resource_id}/index/{index_name}" 24 | # Get the current autoscaling settings for the table 25 | response = app_autoscaling.describe_scalable_targets( 26 | ResourceIds=[resource_id], ServiceNamespace='dynamodb') 27 | autoscaling_settings = response['ScalableTargets'] 28 | scalable_targets = response.get('ScalableTargets', []) 29 | if not scalable_targets: 30 | return [[base_table_name, index_name, table_storage_class, None, None, None, None, 'False', 'Provisioned']] 31 | data = [] 32 | for setting in autoscaling_settings: 33 | policy_response = app_autoscaling.describe_scaling_policies( 34 | ServiceNamespace='dynamodb', 35 | ResourceId=setting['ResourceId'], 36 | ScalableDimension=setting['ScalableDimension'] 37 | ) 38 | try: 39 | policy = policy_response['ScalingPolicies'][0]["TargetTrackingScalingPolicyConfiguration"] 40 | 41 | data.append([ 42 | base_table_name, 43 | index_name, 44 | table_storage_class, 45 | setting['ScalableDimension'], 46 | setting['MinCapacity'], 47 | setting['MaxCapacity'], 48 | policy['TargetValue'], 49 | 'True', 50 | 'Provisioned' 51 | ]) 52 | except: 53 | data.append([ 54 | base_table_name, 55 | index_name, 56 | table_storage_class, 57 | None, 58 | None, 59 | None, 60 | None, 61 | 'policy_missing', 62 | 'Provisioned' 63 | ]) 64 | return data 65 | 66 | def _process_table(self, name): 67 | try: 68 | desc_table = self.dynamodb_client.describe_table(TableName=name) 69 | table_data = desc_table.get('Table', {}) 70 | table_storage_class = table_data.get('TableClassSummary', {}).get('TableClass', 'STANDARD') 71 | global_indexes = table_data.get('GlobalSecondaryIndexes', []) 72 | billing_mode = table_data.get('BillingModeSummary', {}).get('BillingMode', 'PROVISIONED') 73 | 74 | # Initialize DataFrame columns 75 | columns = ['base_table_name', 'index_name', 'class', 'metric_name', 'min_capacity', 'max_capacity', 'target_utilization', 'autoscaling_enabled', 'throughput_mode'] 76 | result_data = [] 77 | 78 | if billing_mode == 'PAY_PER_REQUEST': 79 | result_data.append([name, None, table_storage_class, None, None, None, None, None, 'Ondemand']) 80 | for index in global_indexes: 81 | result_data.append([name, index['IndexName'], table_storage_class, None, None, None, None, None, 'Ondemand']) 82 | else: 83 | table_settings = self.get_dynamodb_autoscaling_settings(name, table_storage_class) 84 | if table_settings: 85 | result_data.extend(table_settings) 86 | 87 | for index in global_indexes: 88 | index_settings = self.get_dynamodb_autoscaling_settings(name, table_storage_class, index_name=index['IndexName']) 89 | if 
index_settings: 90 | result_data.extend(index_settings) 91 | result_df = pd.DataFrame(result_data, columns=columns) 92 | return result_df 93 | except Exception as e: 94 | logger.error(f"Error processing table {name}: {e}") 95 | return pd.DataFrame(columns=columns) 96 | 97 | def get_all_dynamodb_autoscaling_settings_with_indexes(self, table_name: str, max_concurrent_tasks: int) -> pd.DataFrame: 98 | 99 | dynamodb_client = self.dynamodb_client 100 | 101 | # Get a list of all DynamoDB tables 102 | table_names = [] 103 | last_evaluated_table_name = None 104 | if not table_name: 105 | while last_evaluated_table_name != '': 106 | params = {} 107 | if last_evaluated_table_name: 108 | params['ExclusiveStartTableName'] = last_evaluated_table_name 109 | response = dynamodb_client.list_tables(**params) 110 | table_names += response['TableNames'] 111 | last_evaluated_table_name = response.get( 112 | 'LastEvaluatedTableName', '') 113 | 114 | else: 115 | table_names = [table_name] 116 | 117 | settings_list = [] 118 | if len(table_names) != 0: 119 | # Create a thread pool to execute _process_table() for each table in parallel 120 | with ThreadPoolExecutor(max_workers=max_concurrent_tasks) as executor: 121 | futures = [executor.submit(self._process_table, name) 122 | for name in table_names] 123 | progress_bar = tqdm(total=len(table_names), 124 | desc=f"Getting DynamoDB Tables info ...") 125 | 126 | settings_list = [] 127 | for future in futures: 128 | progress_bar.update(1) 129 | try: 130 | result = future.result() 131 | if result is not None: 132 | settings_list.append(result) 133 | except Exception as e: 134 | logger.error(f"Error processing table: {e}") 135 | progress_bar.close() 136 | if len(settings_list) > 0: 137 | settings = pd.concat(settings_list, axis=0) 138 | settings['index_name'] = settings.apply(lambda x: x['base_table_name'] if pd.isnull( 139 | x['index_name']) else x['base_table_name'] + ':' + x['index_name'], axis=1) 140 | if settings['metric_name'].notnull().any(): 141 | settings['metric_name'] = settings['metric_name'].replace( 142 | {'dynamodb:table:ReadCapacityUnits': 'ProvisionedReadCapacityUnits', 'dynamodb:index:ReadCapacityUnits': 'ProvisionedReadCapacityUnits'}, regex=True) 143 | settings['metric_name'] = settings['metric_name'].replace( 144 | {'dynamodb:table:WriteCapacityUnits': 'ProvisionedWriteCapacityUnits', 'dynamodb:index:WriteCapacityUnits': 'ProvisionedWriteCapacityUnits'}, regex=True) 145 | else: 146 | settings = pd.DataFrame() 147 | return settings 148 | else: 149 | logger.info("No DynamoDB tables found in this region") 150 | raise ValueError("No DynamoDB tables found in this region") 151 | -------------------------------------------------------------------------------- /capacity-mode-evaluator/src/frontend.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import pandas as pd 3 | import glob 4 | import os 5 | from datetime import datetime 6 | 7 | # Dynamically locate the latest CSV file in ./output/ 8 | def get_latest_csv(): 9 | csv_files = glob.glob("./output/analysis_summary*.csv") 10 | if not csv_files: 11 | st.error("No CSV files found in the output directory.") 12 | st.stop() 13 | return max(csv_files, key=os.path.getctime) 14 | 15 | file_path = get_latest_csv() 16 | df = pd.read_csv(file_path) 17 | 18 | st.set_page_config(layout="wide") # Use full screen width 19 | st.title("DynamoDB Cost & Scaling Analysis") 20 | 21 | # Add a toggle button to filter between Optimized and Not Optimized tables 22 | 
status_filter = st.radio("Filter Tables by Status:", ("All", "Optimized", "Not Optimized")) 23 | if status_filter != "All": 24 | df = df[df['status'] == status_filter] 25 | 26 | # Group tables and their indexes 27 | table_groups = df.groupby('base_table_name') 28 | 29 | # Summary statistics 30 | st.subheader("Summary Statistics") 31 | col1, col2, col3 = st.columns(3) 32 | col1.metric(label="Total Current Cost", value=f"${df['current_cost'].sum():,.2f}") 33 | col2.metric(label="Total Recommended Cost", value=f"${df['recommended_cost'].sum():,.2f}") 34 | col3.metric(label="Average Savings %", value=f"{df['savings_pct'].mean():.2f}%") 35 | 36 | # Display recommendations in a component style layout 37 | st.subheader("Table Recommendations") 38 | for table, data in table_groups: 39 | highlight = "🔴" if data.iloc[0]['status'] == "Not Optimized" else "🟢" 40 | with st.expander(f"{highlight} Table: {table}"): 41 | st.write(f"**Status:** {data.iloc[0]['status']}") 42 | st.write(f"**Current Mode:** {data.iloc[0]['current_mode']}") 43 | st.write(f"**Recommended Mode:** {data.iloc[0]['recommended_mode']}") 44 | st.write(f"**Current Cost:** ${data.iloc[0]['current_cost']:,.2f}") 45 | st.write(f"**Recommended Cost:** ${data.iloc[0]['recommended_cost']:,.2f}") 46 | st.write(f"**Savings %:** {data.iloc[0]['savings_pct']:.2f}%") 47 | st.write(f"**Autoscaling Enabled:** {data.iloc[0]['autoscaling_enabled']}") 48 | 49 | indexes = data.dropna(subset=['index_name']) 50 | if not indexes.empty: 51 | st.subheader("Global Secondary Indexes") 52 | index_list = indexes.to_dict(orient='records') 53 | 54 | cols = st.columns(3) # Create 3 columns for stacking indexes 55 | for i, index in enumerate(index_list): 56 | with cols[i % 3]: 57 | index_highlight = "🔴" if index['status'] == "Not Optimized" else "🟢" 58 | st.markdown(f"{index_highlight} **{index['index_name']}**") 59 | st.write(f"**Status:** {index['status']}") 60 | st.write(f"**Current Mode:** {index['current_mode']}") 61 | st.write(f"**Recommended Mode:** {index['recommended_mode']}") 62 | st.write(f"**Current Cost:** ${index['current_cost']:,.2f}") 63 | st.write(f"**Recommended Cost:** ${index['recommended_cost']:,.2f}") 64 | st.write(f"**Savings %:** {index['savings_pct']:.2f}%") 65 | st.write(f"**Autoscaling Enabled:** {index['autoscaling_enabled']}") -------------------------------------------------------------------------------- /capacity-mode-evaluator/src/getmetrics.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | from datetime import datetime, timedelta 3 | from queue import Queue 4 | import boto3 5 | import src.metrics_estimates as estimates 6 | import pandas as pd 7 | from tqdm.contrib.concurrent import thread_map 8 | import logging 9 | 10 | 11 | logging.basicConfig(level=logging.INFO) 12 | logger = logging.getLogger(__name__) 13 | 14 | def list_metrics(tablename: str) -> list: 15 | cw = boto3.client('cloudwatch') 16 | metrics_list = [] 17 | 18 | paginator = cw.get_paginator('list_metrics') 19 | 20 | if not tablename: 21 | operation_parameters = {'Namespace': 'AWS/DynamoDB'} 22 | else: 23 | operation_parameters = {'Namespace': 'AWS/DynamoDB', 24 | 'Dimensions': [{'Name': 'TableName', 'Value': tablename}]} 25 | 26 | for response in paginator.paginate(**operation_parameters): 27 | metrics_list.extend(response['Metrics']) 28 | 29 | return metrics_list 30 | 31 | 32 | def process_results(metrics_list, metric, metric_result_queue, estimate_result_queue, read_utilization, write_utilization, read_min, 
write_min, read_max, write_max): 33 | 34 | metrics_result = [] 35 | for result in metrics_list['MetricDataResults']: 36 | 37 | try: 38 | name = str(metric[0]['Value']) + ":" + str(metric[1]['Value']) 39 | except: 40 | name = str(metric[0]['Value']) 41 | metric_list = list(zip(result['Timestamps'], result['Values'])) 42 | tmdf = pd.DataFrame(metric_list, columns=['timestamp', 'unit']) 43 | tmdf['unit'] = tmdf['unit'].astype(float) 44 | tmdf['timestamp'] = pd.to_datetime(tmdf['timestamp'], unit='ms') 45 | tmdf['name'] = name 46 | tmdf['metric_name'] = result['Label'] 47 | tmdf = tmdf[['metric_name', 'timestamp', 'name', 'unit']] 48 | metrics_result.append(tmdf) 49 | metric_result_queue.put(tmdf) 50 | metrics_result = pd.concat(metrics_result) 51 | estimate_units = estimates.estimate( 52 | metrics_result, read_utilization, write_utilization, read_min, write_min, read_max, write_max) 53 | 54 | estimate_result_queue.put(estimate_units) 55 | 56 | 57 | def fetch_metric_data(metric, start_time, end_time, consumed_period, provisioned_period): 58 | cw = boto3.client('cloudwatch') 59 | 60 | if metric['MetricName'] == 'ProvisionedWriteCapacityUnits': 61 | result = cw.get_metric_data(MetricDataQueries=[ 62 | { 63 | 'Id': 'provisioned_rcu', 64 | 'MetricStat': { 65 | 'Metric': { 66 | 'Namespace': 'AWS/DynamoDB', 67 | 'MetricName': 'ProvisionedReadCapacityUnits', 68 | 'Dimensions': metric['Dimensions'] 69 | }, 70 | 'Period': provisioned_period, 71 | 'Stat': 'Average' 72 | }, 73 | }, 74 | { 75 | 'Id': 'provisioned_wcu', 76 | 'MetricStat': { 77 | 'Metric': { 78 | 'Namespace': 'AWS/DynamoDB', 79 | 'MetricName': 'ProvisionedWriteCapacityUnits', 80 | 'Dimensions': metric['Dimensions'] 81 | }, 82 | 'Period': provisioned_period, 83 | 'Stat': 'Average' 84 | } 85 | } 86 | ], StartTime=start_time, EndTime=end_time) 87 | return (result, metric['Dimensions']) 88 | 89 | elif metric['MetricName'] == 'ConsumedReadCapacityUnits': 90 | result = cw.get_metric_data(MetricDataQueries=[ 91 | { 92 | 'Id': 'consumed_rcu', 93 | 'MetricStat': { 94 | 'Metric': { 95 | 'Namespace': 'AWS/DynamoDB', 96 | 'MetricName': 'ConsumedReadCapacityUnits', 97 | 'Dimensions': metric['Dimensions'] 98 | }, 99 | 'Period': consumed_period, 100 | 'Stat': 'Sum' 101 | }, 102 | }, 103 | { 104 | 'Id': 'consumed_wcu', 105 | 'MetricStat': { 106 | 'Metric': { 107 | 'Namespace': 'AWS/DynamoDB', 108 | 'MetricName': 'ConsumedWriteCapacityUnits', 109 | 'Dimensions': metric['Dimensions'] 110 | }, 111 | 'Period': consumed_period, 112 | 'Stat': 'Sum' 113 | } 114 | } 115 | ], StartTime=start_time, EndTime=end_time) 116 | return (result, metric['Dimensions']) 117 | 118 | return None 119 | 120 | 121 | def get_table_metrics(metrics, start_time, end_time, consumed_period, provisioned_period, read_utilization, write_utilization, read_min, write_min, read_max, write_max, max_concurrent_tasks,dynamodb_tablename): 122 | metric_result_queue = Queue() 123 | estimate_result_queue = Queue() 124 | metric_data_list = thread_map(lambda metric: fetch_metric_data(metric, start_time, end_time, consumed_period, provisioned_period), 125 | metrics, max_workers=max_concurrent_tasks, desc="Fetching CloudWatch metrics for: " + dynamodb_tablename) 126 | 127 | metric_data_list = [ 128 | result for result in metric_data_list if result is not None] 129 | 130 | thread_map(lambda result: process_results(result[0], result[1], metric_result_queue, estimate_result_queue, read_utilization, write_utilization, read_min, write_min, read_max, write_max), 131 | metric_data_list, 
max_workers=max_concurrent_tasks, desc="Estimating DynamoDB table provisioned metrics for: " + dynamodb_tablename) 132 | 133 | processed_metric = [] 134 | processed_estimate = [] 135 | while not metric_result_queue.empty(): 136 | processed_metric.append(metric_result_queue.get()) 137 | while not estimate_result_queue.empty(): 138 | processed_estimate.append(estimate_result_queue.get()) 139 | if all(df.empty for df in processed_metric): 140 | logger.info("No metrics were retrieved from CloudWatch.") 141 | else: 142 | metric_df = pd.concat(processed_metric, ignore_index=True) 143 | estimate_df = pd.concat(processed_estimate, ignore_index=True) 144 | return [metric_df, estimate_df] 145 | 146 | 147 | def get_metrics(params): 148 | 149 | provisioned_period = 3600 150 | consumed_period = 60 151 | read_min = params['dynamodb_minimum_read_unit'] 152 | write_min = params['dynamodb_minimum_write_unit'] 153 | read_max = params['dynamodb_maximum_read_unit'] 154 | write_max = params['dynamodb_maximum_write_unit'] 155 | read_utilization = params['dynamodb_read_utilization'] 156 | write_utilization = params['dynamodb_write_utilization'] 157 | dynamodb_tablename = params['dynamodb_tablename'] 158 | interval = params['number_of_days_look_back'] 159 | now = params['cloudwatch_metric_end_datatime'] 160 | now = datetime.strptime(now, '%Y-%m-%d %H:%M:%S') 161 | end_time = now 162 | start_time = end_time - timedelta(days=interval) 163 | end_time = end_time.strftime('%Y-%m-%dT%H:%M:%SZ') 164 | start_time = start_time.strftime('%Y-%m-%dT%H:%M:%SZ') 165 | max_concurrent_tasks = params['max_concurrent_tasks'] 166 | 167 | metrics = list_metrics(dynamodb_tablename) 168 | result = get_table_metrics(metrics, start_time, end_time, consumed_period, 169 | provisioned_period, read_utilization, write_utilization, read_min, write_min, read_max, write_max, max_concurrent_tasks,dynamodb_tablename) 170 | 171 | return result 172 | -------------------------------------------------------------------------------- /capacity-mode-evaluator/src/metrics_estimates.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | from datetime import datetime, timedelta, date 3 | 4 | 5 | def max_a(i, j): 6 | return i if i > j else j 7 | 8 | 9 | def min_a(i, j): 10 | return j if i > j else i 11 | 12 | 13 | def decrease(L): 14 | return any(x > y for x, y in zip(L, L[1:])) 15 | 16 | 17 | def estimate_units(read, write, read_utilization, write_utilization, read_min, write_min, read_max, write_max): 18 | # columns [metric_name,timestamp,name,units,unitps,estunit] 19 | if len(read) <= len(write): 20 | smallest_list = read 21 | else: 22 | smallest_list = write 23 | final_read_cu = [] 24 | 25 | # Scale-in threshold = 20% percent to prevent small fluctuations in capacity usage from triggering unnecessary scale-ins. 
26 | scale_in_threshold = 1.20 27 | count = 0 28 | last_change = "read" 29 | final_write_cu = [] 30 | prev_read = read[0] 31 | prev_write = write[0] 32 | final_write_cu += [prev_write] 33 | final_read_cu += [prev_read] 34 | prev_read[5] = min(max((prev_read[4] / read_utilization) 35 | * 100, read_min), read_max) 36 | prev_write[5] = min(max((prev_write[4] / write_utilization) 37 | * 100, write_min), write_max) 38 | for i in range(1, len(smallest_list)): 39 | current_read = read[i] 40 | current_write = write[i] 41 | 42 | date_time_obj = current_read[1].to_pydatetime() 43 | midnight = date_time_obj.replace(hour=0, minute=0, second=0) 44 | if date_time_obj == midnight: 45 | count = 0 46 | 47 | # compare with prev val 48 | 49 | if i <= 2: 50 | current_read[5] = prev_read[5] 51 | current_write[5] = prev_write[5] 52 | final_read_cu += [current_read] 53 | final_write_cu += [current_write] 54 | continue 55 | # creating a list with last 2 records. 56 | last2_read = [v[4] for v in list(read[i - 2: i])] 57 | last2_write = [v[4] for v in list(write[i - 2: i])] 58 | 59 | last2_max_read = max(last2_read) 60 | last2_max_write = max(last2_write) 61 | last2_min_read = min(last2_read) 62 | last2_min_write = min(last2_write) 63 | max_vread = min(max_a((last2_min_read / read_utilization) 64 | * 100, prev_read[5]), read_max) 65 | 66 | max_vwrite = min(max_a((last2_min_write / write_utilization) 67 | * 100, prev_write[5]), write_max) 68 | # scale out based on last 2 min Units. 69 | 70 | if current_read[0] == 'ConsumedReadCapacityUnits': 71 | if max_vread == (last2_min_read / read_utilization) * 100: 72 | 73 | current_read[5] = (last2_max_read / read_utilization) * 100 74 | 75 | else: 76 | 77 | current_read[5] = max_vread 78 | 79 | if current_write[0] == 'ConsumedWriteCapacityUnits': 80 | if max_vwrite == (last2_min_write / write_utilization) * 100: 81 | 82 | current_write[5] = (last2_max_write / write_utilization) * 100 83 | else: 84 | 85 | current_write[5] = max_vwrite 86 | 87 | if i <= 14: 88 | prev_read = current_read 89 | final_read_cu += [current_read] 90 | prev_write = current_write 91 | final_write_cu += [current_write] 92 | continue 93 | # Create list from last 15 Consumed Read Units 94 | last15_read = [v[4] for v in list(read[i - 15: i])] 95 | last15_read2 = [v[5] for v in list(read[i - 15: i])] 96 | last15_max_read = max(last15_read) 97 | # Create list from last 15 Consumed Write Units 98 | last15_write = [v[4] for v in list(write[i - 15: i])] 99 | last15_write2 = [v[5] for v in list(write[i - 15: i])] 100 | last15_max_write = max(last15_write) 101 | # Scale-in based on last 15 Consumed Units 102 | # First 4 scale-in operation can happen anytime during the a day, there after every once an hour 103 | if count < 4: 104 | if not decrease(last15_read2): 105 | if prev_read[5] > (max(min_a( 106 | (last15_max_read / read_utilization) * 100, current_read[5]), read_min) * scale_in_threshold): 107 | current_read[5] = max(min_a( 108 | (last15_max_read / read_utilization) * 100, current_read[5]), read_min) 109 | if prev_read[5] > current_read[5]: 110 | 111 | count += 1 112 | 113 | if not decrease(last15_write2): 114 | if prev_write[5] > (max(min_a( 115 | (last15_max_write / write_utilization) * 100, current_write[5]), write_min) * scale_in_threshold): 116 | current_write[5] = max(min_a( 117 | (last15_max_write / write_utilization) * 100, current_write[5]), write_min) 118 | if prev_write[5] > current_write[5]: 119 | count += 1 120 | 121 | else: 122 | if i >= 60: 123 | # Create list from last 60 Consumed Units 124 
| last60_read = [v[5] for v in list(read[i - 60: i])] 125 | last60_write = [v[5] for v in list(write[i - 60: i])] 126 | # if Table has not scale in in past 60 minutes then scale in 127 | if not decrease(last60_read) and not decrease(last60_write): 128 | if prev_read[5] > (max( 129 | min_a((last15_max_read / read_utilization) * 100, current_read[5]), read_min) * scale_in_threshold) and prev_write[5] > (max(min_a((last15_max_write / write_utilization) * 100, current_write[5]), write_min) * scale_in_threshold): 130 | if last_change == "write": 131 | current_read[5] = max( 132 | min_a((last15_max_read / read_utilization) * 100, current_read[5]), read_min) 133 | last_change = "read" 134 | else: 135 | current_write[5] = max( 136 | min_a((last15_max_write / write_utilization) * 100, current_write[5]), write_min) 137 | last_change = "write" 138 | else: 139 | if prev_read[5] > (max( 140 | min_a((last15_max_read / read_utilization) * 100, current_read[5]), read_min) * scale_in_threshold): 141 | current_read[5] = max( 142 | min_a((last15_max_read / read_utilization) * 100, current_read[5]), read_min) 143 | 144 | if prev_write[5] > (max 145 | (min_a((last15_max_write / write_utilization) * 100, current_write[5]), write_min) * scale_in_threshold): 146 | current_write[5] = max( 147 | min_a((last15_max_write / write_utilization) * 100, current_write[5]), write_min) 148 | 149 | else: 150 | pass 151 | 152 | prev_read = current_read 153 | prev_write = current_write 154 | final_read_cu += [current_read] 155 | final_write_cu += [current_write] 156 | final_list = final_write_cu + final_read_cu 157 | return final_list 158 | 159 | 160 | def estimate(df, read_utilization, write_utilization, read_min, write_min, read_max, write_max): 161 | 162 | df['unitps'] = df['unit'] / 60 163 | df['estunit'] = 5 164 | 165 | name = df['name'].unique() 166 | final_cu = [] 167 | for table_name in name: 168 | 169 | rcu = df.query( 170 | "metric_name == 'ConsumedReadCapacityUnits' and name == @table_name") 171 | wcu = df.query( 172 | "metric_name == 'ConsumedWriteCapacityUnits' and name == @table_name") 173 | rcu = ((rcu.sort_values(by='timestamp', ascending=True) 174 | ).reset_index(drop=True)).values.tolist() 175 | wcu = ((wcu.sort_values(by='timestamp', ascending=True) 176 | ).reset_index(drop=True)).values.tolist() 177 | if len(rcu) > 0 and len(wcu) > 0: 178 | final_cu += estimate_units(rcu, wcu, 179 | read_utilization, write_utilization, read_min, write_min, read_max, write_max) 180 | if len(final_cu) > 0: 181 | final_df = pd.DataFrame(final_cu) 182 | final_df.columns = ['metric_name', 'timestamp', 183 | 'name', 'unit', 'unitps', 'estunit'] 184 | return final_df 185 | else: 186 | return None 187 | -------------------------------------------------------------------------------- /capacity-mode-evaluator/src/pricing.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | 4 | from decimal import Decimal 5 | 6 | 7 | class PricingUtility(object): 8 | def __init__(self, region_name, profile_name=None): 9 | 10 | closest_api_region = 'us-east-1' 11 | 12 | AMERICAN_REGIONS = ['us-east-1', 'us-east-2', 13 | 'us-west-1', 'us-west-2', 14 | 'us-gov-west-1', 'us-gov-west-2', 15 | 'ca-central-1', 'sa-east-1'] 16 | # the pricing API is only available in us-east-1 and ap-south-1 17 | # pick the closest endpoint to the supplied region 18 | if region_name not in AMERICAN_REGIONS: 19 | closest_api_region = 'ap-south-1' 20 | 21 | self.session = 
boto3.session.Session(profile_name=profile_name) 22 | self.pricing_client = self.session.client( 23 | 'pricing', region_name=closest_api_region) 24 | 25 | def get_provisioned_capacity_pricing(self, region_code: str) -> dict: 26 | """Get DynamoDB provisioned capacity pricing for a given region.""" 27 | throughput_pricing = {} 28 | 29 | response = self.pricing_client.get_products( 30 | ServiceCode='AmazonDynamoDB', 31 | Filters=[{'Type': 'TERM_MATCH', 32 | 'Field': 'productFamily', 33 | 'Value': 'Provisioned IOPS'}, 34 | {'Type': 'TERM_MATCH', 35 | 'Field': 'regionCode', 36 | 'Value': region_code} 37 | ], 38 | FormatVersion='aws_v1', 39 | MaxResults=100 40 | ) 41 | price_list = response['PriceList'] 42 | 43 | for entry in price_list: 44 | product = json.loads(entry) 45 | product_group = product['product']['attributes']['group'] 46 | offer = product['terms']['OnDemand'].popitem() 47 | offer_terms = offer[1] 48 | price_dimensions = offer_terms['priceDimensions'] 49 | 50 | for price_dimension_code in price_dimensions: 51 | price_terms = price_dimensions[price_dimension_code] 52 | price_per_unit = price_terms['pricePerUnit']['USD'] 53 | price = Decimal(price_per_unit) 54 | 55 | # Regions with free tier pricing will have an initial entry set to zero; skip this 56 | if price != 0: 57 | if product_group == 'DDB-ReadUnits': 58 | throughput_pricing['std_rcu_pricing'] = price 59 | elif product_group == 'DDB-WriteUnits': 60 | throughput_pricing['std_wcu_pricing'] = price 61 | elif product_group == 'DDB-ReadUnitsIA': 62 | throughput_pricing['ia_rcu_pricing'] = price 63 | elif product_group == 'DDB-WriteUnitsIA': 64 | throughput_pricing['ia_wcu_pricing'] = price 65 | 66 | return throughput_pricing 67 | 68 | def get_on_demand_capacity_pricing(self, region_code: str) -> dict: 69 | """Get DynamoDB On-demand capacity pricing for a given region.""" 70 | throughput_pricing = {} 71 | 72 | response = self.pricing_client.get_products( 73 | ServiceCode='AmazonDynamoDB', 74 | Filters=[{'Type': 'TERM_MATCH', 75 | 'Field': 'productFamily', 76 | 'Value': 'Amazon DynamoDB PayPerRequest Throughput'}, 77 | {'Type': 'TERM_MATCH', 78 | 'Field': 'regionCode', 79 | 'Value': region_code} 80 | ], 81 | FormatVersion='aws_v1', 82 | MaxResults=100 83 | ) 84 | price_list = response['PriceList'] 85 | 86 | for entry in price_list: 87 | product = json.loads(entry) 88 | product_group = product['product']['attributes']['group'] 89 | offer = product['terms']['OnDemand'].popitem() 90 | offer_terms = offer[1] 91 | price_dimensions = offer_terms['priceDimensions'] 92 | 93 | for price_dimension_code in price_dimensions: 94 | price_terms = price_dimensions[price_dimension_code] 95 | price_per_unit = price_terms['pricePerUnit']['USD'] 96 | price = Decimal(price_per_unit) 97 | 98 | # Regions with free tier pricing will have an initial entry set to zero; skip this 99 | if price != 0: 100 | if product_group == 'DDB-ReadUnits': 101 | throughput_pricing['std_rcu_pricing'] = price 102 | elif product_group == 'DDB-WriteUnits': 103 | throughput_pricing['std_wcu_pricing'] = price 104 | elif product_group == 'DDB-ReadUnitsIA': 105 | throughput_pricing['ia_rcu_pricing'] = price 106 | elif product_group == 'DDB-WriteUnitsIA': 107 | throughput_pricing['ia_wcu_pricing'] = price 108 | 109 | return throughput_pricing 110 | -------------------------------------------------------------------------------- /ddb-migration/.npmignore: -------------------------------------------------------------------------------- 1 | *.ts 2 | !*.d.ts 3 | 4 | # CDK asset staging 
directory 5 | .cdk.staging 6 | cdk.out 7 | -------------------------------------------------------------------------------- /ddb-migration/README.md: -------------------------------------------------------------------------------- 1 | # 🔄 DynamoDB Migration Project 2 | 3 | This sample development project presents the content explained in the [DynamoDB migration playbook](./documentation/migration-playbook.md). 4 | ⚠️ This project is not intended to be deployed in production as-is, and it is your responsibility to complete and run proper testing in lower environments. ⚠️ 5 | 6 | ## 🚀 Getting Started 7 | 8 | To execute this CDK project, you'll need to provide the `sourceTableArn` and the `destinationTableArn` DynamoDB table ARNs. For the source table, you must enable DynamoDB Streams with the [`NEW_AND_OLD_IMAGES`](https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_StreamSpecification.html) view type for this solution to work properly. 9 | 10 | 📋 Before you begin, ensure your shell has credentials for the account you're operating in through your AWS profile or your environment variables. 11 | 12 | Copy and paste your table ARNs into the shell variables while setting the correct AWS region to deploy the resources: 13 | 14 | ```bash 15 | # Change to match your region. This should be the same region as the source table. 16 | export AWS_DEFAULT_REGION=us-east-1 17 | # The source table should be in the same account as the deployed resources. 18 | export SOURCE_TABLE_ARN=arn:aws:dynamodb:us-east-1:111122223333:table/my-source-table 19 | # If destination table is in a different account, follow the 'Cross-account access' section 20 | export DEST_TABLE_ARN=arn:aws:dynamodb:us-east-1:111122223333:table/my-source-table-migrated 21 | ``` 22 | 23 | ## 🍎 Setup for macOS (for brand new users) 24 | 25 | If this is your first time running any CDK application and you happen to be on macOS, follow these instructions first to install dependencies. 26 | 27 | **Prerequisites:** 28 | 29 | - You should have [brew](https://brew.sh/) installed. 30 | - You should set `AWS_DEFAULT_REGION` to the region you want to operate in. 31 | 32 | Install the following packages with brew to begin: 33 | 34 | ```bash 35 | brew install typescript 36 | brew install aws-cdk 37 | brew install node 38 | ``` 39 | 40 | With npm, install aws-cdk-lib to get started: 41 | 42 | ```bash 43 | npm i aws-cdk-lib 44 | ``` 45 | 46 | Then you need to bootstrap CDK: 47 | 48 | ```bash 49 | cdk bootstrap -c sourceTableArn=$SOURCE_TABLE_ARN -c destinationTableArn=$DEST_TABLE_ARN 50 | ``` 51 | 52 | ### 🚀 Deployment 53 | 54 | The following command generates the synthesized CloudFormation template; you can use this output to explore the stack that will be created. 55 | 56 | ```bash 57 | cdk synth -c sourceTableArn=$SOURCE_TABLE_ARN -c destinationTableArn=$DEST_TABLE_ARN 58 | ``` 59 | 60 | Once you are ready to deploy the solution, execute the following command. Remember that the source table needs to have Amazon DynamoDB Streams enabled. 61 | 62 | ```bash 63 | 64 | cdk deploy -c sourceTableArn=$SOURCE_TABLE_ARN -c destinationTableArn=$DEST_TABLE_ARN 65 | ... 66 | ... 
67 | DdbMigration-source-table-To-destination-table | 33/33 | 11:45:37 AM | CREATE_COMPLETE | AWS::CloudFormation::Stack | DdbMigration-source-table-To-destination-table 68 | 69 | ✅ DdbMigration-source-table-To-destination-table 70 | 71 | ✨ Deployment time: 81.71s 72 | 73 | Outputs: 74 | DdbMigration-source-table-To-destination-table.DestinationTablePolicy = { 75 | "Version": "2012-10-17", 76 | "Statement": [ 77 | { 78 | "Effect": "Allow", 79 | "Action": [ 80 | "dynamodb:PutItem", 81 | "dynamodb:BatchWriteItem", 82 | "dynamodb:DescribeTable" 83 | ], 84 | "Principal": { 85 | "AWS": [ 86 | "arn:aws:iam::1111222223333:role/DdbMigration-source-table-To-de-GlueJobRoleF1B69418-DiAybmqsORAy", 87 | "arn:aws:iam::1111222223333:role/DdbMigration-source-table-WriteCdcLambdaServiceRole-Zl7IcNJaFqsM" 88 | ] 89 | }, 90 | "Resource": "arn:aws:dynamodb:us-east-1:1111222223333:table/destination-table" 91 | } 92 | ] 93 | } 94 | DdbMigration-source-table-To-destination-table.DirectMigrationJobName = DirectMigrationJob-l63F97BtqXQq 95 | DdbMigration-source-table-To-destination-table.LargeMigrationJobName = LargeMigrationJob-VSYJGsC9ISBz 96 | DdbMigration-source-table-To-destination-table.MigrationBucketName = ddbmigration-source-table--migrationbucket1234567 97 | DdbMigration-source-table-To-destination-table.StateMachineArn = arn:aws:states:us-east-1:1111222223333:stateMachine:DdbMigration-source-table-To-destination-table 98 | Stack ARN: 99 | arn:aws:cloudformation:us-east-1:1111222223333:stack/DdbMigration-source-table-To-destination-table/eeab6590-eaf2-1111-a0a0-0affea56b8cb 100 | 101 | ✨ Total time: 86.88s 102 | ``` 103 | 104 | 1. Run the `cdk deploy` command and note the output of the IAM policy in the Output _DestinationTablePolicy_. 105 | 2. Copy the IAM policy into your clipboard. This is a resource-based policy that must be applied to the destination DynamoDB table in your destination account. If you do not see this output, the script has determined the source and destination are the same AWS account. You can also see this output on the CloudFormation stack in the AWS Management Console. 106 | 3. Follow [these developer documentation instructions](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/rbac-attach-resource-based-policy.html) to add the policy to your destination table, preferably using the AWS Management Console. 107 | 4. Execute the code in AWS Step Functions console as normal. The Lambda function WriteCdc will write cross-account into your table with the permissions granted by this resource-based policy 108 | 109 | ### 🌐 Cross-account access 110 | 111 | The resources for the migration as well as the source DynamoDB table must be in the same account, but the destination table can be in any account. However, you must update the destination table's permissions with a [resource-based policy](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/access-control-resource-based.html) created and placed into the CloudFormation stack Outputs section in order for the replication to work. 112 | 113 | > 📝 **Note:** _If you attempt to deploy this into an AWS account different from the source table's account, the CloudFormation stack creation will fail with an error due to a custom resource we made to validate account ids called AccountValidationCustomResource. 
If you make this mistake, you must run the clean-up step to destroy the stack._ 114 | 115 | ### 🧹 Clean-up 116 | 117 | To destroy the resources (when the migration is completed and you decide to cut-over), execute: 118 | 119 | ```bash 120 | cdk destroy -c sourceTableArn=$SOURCE_TABLE_ARN -c destinationTableArn=$DEST_TABLE_ARN 121 | ``` 122 | 123 | ### ⚠️ Current Limitations 124 | 125 | - The current deployment assumes the deployed resources and the source table are in the same account. The destination table can be in a separate account. 126 | - You must change the table ARNs and re-run `cdk deploy` for every table combination. You can't re-use the same stack for different table combinations. 127 | 128 | ## 👏 Acknowledgements 129 | 130 | We would like to extend our heartfelt thanks to the major collaborators who made significant contributions to this project: 131 | 132 | - 🌟 [Esteban Serna](https://github.com/tebanieo) 133 | - 🌟 [Sean Shriver](https://github.com/switch180) 134 | - 🌟 [John Terhune](https://github.com/terhunej) 135 | 136 | Their expertise, dedication, and hard work have been instrumental in the development and success of this DynamoDB Migration Project. We are truly grateful for their valuable input and collaborative spirit. 137 | 138 | ## 🤝 Contributing 139 | 140 | We welcome contributions from the community! Whether you're fixing bugs, improving documentation, or proposing new features, your efforts are greatly appreciated. Here are some ways you can contribute: 141 | 142 | ### 🚀 Feature Enhancements 143 | 144 | - Help us increase the current throughput limitation of 9K TPS 145 | - Implement new migration strategies or optimizations 146 | - Add support for additional AWS services or integrations 147 | 148 | ### 🧪 Testing 149 | 150 | - Improve and expand our unit test coverage 151 | - Develop integration tests 152 | - Perform thorough testing in various scenarios and environments 153 | 154 | ### 📚 Documentation 155 | 156 | - Improve existing documentation for clarity and completeness 157 | - Add examples, tutorials, or use cases 158 | - Translate documentation to other languages 159 | 160 | ### 🧹 Code Quality 161 | 162 | - Implement or improve linting configurations 163 | - Refactor code for better readability and maintainability 164 | - Optimize performance in existing codebase 165 | 166 | ### 🐛 Bug Hunting 167 | 168 | - Identify and report bugs 169 | - Provide detailed reproduction steps for issues 170 | - Submit pull requests with bug fixes 171 | 172 | ### 💡 Ideas and Discussions 173 | 174 | - Propose new features or improvements 175 | - Participate in discussions about the project's direction 176 | - Share your use cases and how the project could better support them 177 | 178 | To get started: 179 | 180 | 1. Please create an [issue first](https://github.com/awslabs/amazon-dynamodb-tools/issues/new) to discuss your contribution. 181 | 2. Fork the repository 182 | 3. Create a new branch for your contribution 183 | 4. Make your changes 184 | 5. Submit a pull request with a clear description of your changes 185 | 186 | Please ensure your code adheres to our coding standards and includes appropriate tests and documentation. 187 | 188 | We look forward to your contributions and are excited to see how together we can improve this DynamoDB Migration Project! 
189 | 190 | #### 🛠 Original Readme - Useful commands 191 | 192 | - `npm run build` compile typescript to js 193 | - `npm run watch` watch for changes and compile 194 | - `npm run test` perform the jest unit tests 195 | - `npx cdk deploy` deploy this stack to your default AWS account/region 196 | - `npx cdk diff` compare deployed stack with current state 197 | - `npx cdk synth` emits the synthesized CloudFormation template 198 | -------------------------------------------------------------------------------- /ddb-migration/bin/ddb-migration.ts: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env node 2 | import "source-map-support/register"; 3 | import * as cdk from "aws-cdk-lib"; 4 | import { DdbMigrationStack } from "../lib/ddb-migration-stack"; 5 | 6 | const app = new cdk.App(); 7 | 8 | let sourceTableArn = app.node.tryGetContext("sourceTableArn"); 9 | let destinationTableArn = app.node.tryGetContext("destinationTableArn"); 10 | 11 | if (!sourceTableArn) { 12 | throw new Error( 13 | 'Context parameter "sourceTableArn" is required. Use -c sourceTableArn=' 14 | ); 15 | } 16 | if (!destinationTableArn) { 17 | throw new Error( 18 | 'Context parameter "destinationTableArn" is required. Use -c destinationTableArn=' 19 | ); 20 | } 21 | // Extract table names from ARNs 22 | const getTableNameFromArn = (arn: string) => { 23 | const parts = arn.split(":"); 24 | return parts[parts.length - 1].split("/")[1]; 25 | }; 26 | 27 | let sourceTableName = getTableNameFromArn(sourceTableArn); 28 | let destinationTableName = getTableNameFromArn(destinationTableArn); 29 | 30 | // Sanitize the table name 31 | sourceTableName = sourceTableName.replace(/^[^A-Za-z]+/, ""); 32 | sourceTableName = sourceTableName.replace(/[^A-Za-z0-9-]/g, "-"); 33 | destinationTableName = destinationTableName.replace(/^[^A-Za-z]+/, ""); 34 | destinationTableName = destinationTableName.replace(/[^A-Za-z0-9-]/g, "-"); 35 | 36 | // Validate the table name against the regex 37 | const regex = /^[A-Za-z][A-Za-z0-9-]*$/; 38 | if (!regex.test(sourceTableName) && !regex.test(destinationTableName)) { 39 | throw new Error( 40 | "Sanitized table name does not match the required pattern /^[A-Za-z][A-Za-z0-9-]*$/" 41 | ); 42 | } 43 | 44 | // Generate a unique stack name 45 | const stackName = `DdbMigration-${sourceTableName}-To-${destinationTableName}`; 46 | 47 | new DdbMigrationStack(app, stackName, { 48 | sourceTableArn, 49 | sourceTableName: getTableNameFromArn(sourceTableArn), 50 | destinationTableArn, 51 | destinationTableName: getTableNameFromArn(destinationTableArn), 52 | env: { 53 | account: process.env.CDK_DEFAULT_ACCOUNT, 54 | region: process.env.CDK_DEFAULT_REGION, 55 | }, 56 | }); 57 | -------------------------------------------------------------------------------- /ddb-migration/cdk.json: -------------------------------------------------------------------------------- 1 | { 2 | "app": "npx ts-node --prefer-ts-exts bin/ddb-migration.ts", 3 | "watch": { 4 | "include": [ 5 | "**" 6 | ], 7 | "exclude": [ 8 | "README.md", 9 | "cdk*.json", 10 | "**/*.d.ts", 11 | "**/*.js", 12 | "tsconfig.json", 13 | "package*.json", 14 | "yarn.lock", 15 | "node_modules", 16 | "test" 17 | ] 18 | }, 19 | "context": { 20 | "@aws-cdk/aws-lambda:recognizeLayerVersion": true, 21 | "@aws-cdk/core:checkSecretUsage": true, 22 | "@aws-cdk/core:target-partitions": [ 23 | "aws", 24 | "aws-cn" 25 | ], 26 | "@aws-cdk-containers/ecs-service-extensions:enableDefaultLogDriver": true, 27 | 
"@aws-cdk/aws-ec2:uniqueImdsv2TemplateName": true, 28 | "@aws-cdk/aws-ecs:arnFormatIncludesClusterName": true, 29 | "@aws-cdk/aws-iam:minimizePolicies": true, 30 | "@aws-cdk/core:validateSnapshotRemovalPolicy": true, 31 | "@aws-cdk/aws-codepipeline:crossAccountKeyAliasStackSafeResourceName": true, 32 | "@aws-cdk/aws-s3:createDefaultLoggingPolicy": true, 33 | "@aws-cdk/aws-sns-subscriptions:restrictSqsDescryption": true, 34 | "@aws-cdk/aws-apigateway:disableCloudWatchRole": true, 35 | "@aws-cdk/core:enablePartitionLiterals": true, 36 | "@aws-cdk/aws-events:eventsTargetQueueSameAccount": true, 37 | "@aws-cdk/aws-ecs:disableExplicitDeploymentControllerForCircuitBreaker": true, 38 | "@aws-cdk/aws-iam:importedRoleStackSafeDefaultPolicyName": true, 39 | "@aws-cdk/aws-s3:serverAccessLogsUseBucketPolicy": true, 40 | "@aws-cdk/aws-route53-patters:useCertificate": true, 41 | "@aws-cdk/customresources:installLatestAwsSdkDefault": false, 42 | "@aws-cdk/aws-rds:databaseProxyUniqueResourceName": true, 43 | "@aws-cdk/aws-codedeploy:removeAlarmsFromDeploymentGroup": true, 44 | "@aws-cdk/aws-apigateway:authorizerChangeDeploymentLogicalId": true, 45 | "@aws-cdk/aws-ec2:launchTemplateDefaultUserData": true, 46 | "@aws-cdk/aws-secretsmanager:useAttachedSecretResourcePolicyForSecretTargetAttachments": true, 47 | "@aws-cdk/aws-redshift:columnId": true, 48 | "@aws-cdk/aws-stepfunctions-tasks:enableEmrServicePolicyV2": true, 49 | "@aws-cdk/aws-ec2:restrictDefaultSecurityGroup": true, 50 | "@aws-cdk/aws-apigateway:requestValidatorUniqueId": true, 51 | "@aws-cdk/aws-kms:aliasNameRef": true, 52 | "@aws-cdk/aws-autoscaling:generateLaunchTemplateInsteadOfLaunchConfig": true, 53 | "@aws-cdk/core:includePrefixInUniqueNameGeneration": true, 54 | "@aws-cdk/aws-efs:denyAnonymousAccess": true, 55 | "@aws-cdk/aws-opensearchservice:enableOpensearchMultiAzWithStandby": true, 56 | "@aws-cdk/aws-lambda-nodejs:useLatestRuntimeVersion": true, 57 | "@aws-cdk/aws-efs:mountTargetOrderInsensitiveLogicalId": true, 58 | "@aws-cdk/aws-rds:auroraClusterChangeScopeOfInstanceParameterGroupWithEachParameters": true, 59 | "@aws-cdk/aws-appsync:useArnForSourceApiAssociationIdentifier": true, 60 | "@aws-cdk/aws-rds:preventRenderingDeprecatedCredentials": true, 61 | "@aws-cdk/aws-codepipeline-actions:useNewDefaultBranchForCodeCommitSource": true, 62 | "@aws-cdk/aws-cloudwatch-actions:changeLambdaPermissionLogicalIdForLambdaAction": true, 63 | "@aws-cdk/aws-codepipeline:crossAccountKeysDefaultValueToFalse": true, 64 | "@aws-cdk/aws-codepipeline:defaultPipelineTypeToV2": true, 65 | "@aws-cdk/aws-kms:reduceCrossAccountRegionPolicyScope": true, 66 | "@aws-cdk/aws-eks:nodegroupNameAttribute": true, 67 | "@aws-cdk/aws-ec2:ebsDefaultGp3Volume": true, 68 | "@aws-cdk/aws-ecs:removeDefaultDeploymentAlarm": true, 69 | "@aws-cdk/custom-resources:logApiResponseDataPropertyTrueDefault": false, 70 | "@aws-cdk/aws-s3:keepNotificationInImportedBucket": false 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /ddb-migration/documentation/DDBPlaybookUpdated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/ddb-migration/documentation/DDBPlaybookUpdated.png -------------------------------------------------------------------------------- /ddb-migration/documentation/migration-playbook.md: -------------------------------------------------------------------------------- 1 | # 
Migration Playbook 2 | 3 | Today, there is no automatic migration mechanism that allows you to migrate an [Amazon DynamoDB](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Introduction.html) table from an Amplify project to infrastructure managed by the [Cloud Development Kit - CDK](https://docs.aws.amazon.com/cdk/v2/guide/home.html). To avoid production impact, it is recommended to create a parallel environment managed by the CDK, where the table's data will be synchronized across environments. 4 | 5 | As of the creation date of this playbook, there is no automated tool that restores a DynamoDB table into a CloudFormation- or CDK-managed resource. When you restore to a new table using PITR (point-in-time recovery), import data from S3, or restore a snapshot, you will need to restore the resource to a new DynamoDB table and afterwards import the existing resource into your CDK or CloudFormation stack. 6 | 7 | ## Assumptions 8 | 9 | - The infrastructure (tables) will be created outside the scope of this playbook/solution. You must have two tables: your source table with all the information, and your destination table, empty but with the same primary key and indexes. 10 | - The DynamoDB tables and indexes will remain the same; no primary key modification. 11 | - There will be no access pattern modification and the data will remain as is. 12 | - This procedure requires the table(s) to exist before it is executed. 13 | - An external synchronization mechanism will be created to migrate each of the DynamoDB tables; this mechanism will not be controlled by the Amplify CLI, nor by the new CDK stack that holds the new infrastructure. 14 | - All the source DynamoDB tables have point-in-time recovery [(PITR)](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Point-in-time-recovery.html) and DynamoDB Streams enabled. 15 | 16 | ## High-level architecture design 17 | 18 | The synchronization process is executed per table and consists of two steps: an initial data load and [change data capture](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/streamsmain.html) to keep the tables in sync. Once the initial data load is completed, CDC should remain enabled until the migration is completed and there is no more traffic on the source table. 19 | 20 | The entire process is orchestrated through AWS Step Functions via a state machine. All resources are created through CDK: 21 | 22 | - AWS Step Functions: The workflow service tracking the migration phase, and assisting with monitoring and re-driving any failures 23 | - DynamoDB built-in features: 24 | - DynamoDB Export to S3 API and DynamoDB Scan API: The DynamoDB [`ExportTableToPointInTime`](https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_ExportTableToPointInTime.html) API is used to export tables to S3 that are greater than 80GB in size. For tables that are smaller, we use the [`Scan`](https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_Scan.html) API to read from DynamoDB directly 25 | - [DynamoDB Streams](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.html): providing a 24-hour CDC stream, we use Streams to capture changes to the source table to replicate to the destination, via SQS FIFO 26 | - [Amazon SQS FIFO queue](https://docs.aws.amazon.com/AWSSimpleQueueService/latest/SQSDeveloperGuide/sqs-fifo-queues.html): We create an SQS FIFO queue to store changes from the DynamoDB Stream. 
An AWS Lambda function on the DynamoDB stream puts changes into the SQS FIFO queue for eventual replication once the import is complete 27 | - [AWS Glue ETL](https://aws.amazon.com/glue/): We have two Python Glue scripts and pick which to use based on whether the table size is above or below 80GB: 28 | - For small tables (and for the MVP) we use direct migration, and the `DirectMigrationJob` uses the DynamoDB Scan API with [`BatchWriteItem`](https://docs.aws.amazon.com/amazondynamodb/latest/APIReference/API_BatchWriteItem.html) to insert items into the destination DynamoDB table. 29 | - For large tables, we use `LargeMigrationJob` to export the table to S3 and then use the same BatchWriteItem API to insert the items into the destination DynamoDB table 30 | - [AWS Lambda](https://aws.amazon.com/pm/lambda/) (each function listed in approximate execution order in the Step Functions state machine): 31 | - [`SetupCheck`](../lambda/setup-check/index.py): This Lambda function ensures DynamoDB Streams and, for large tables, PITR are enabled 32 | - [`StreamProcessor`](../lambda/stream-processor/index.py): This reads records from the DynamoDB stream and writes them to the SQS FIFO queue. Once the import to the destination table is complete, the EnableTrigger function is executed to connect the SQS FIFO queue to the WriteCdc function and begin continuous replication 33 | - [`EnableTrigger`](../lambda/enable-trigger/index.py): This Lambda connects the SQS FIFO queue to the WriteCdc function at the end of the import 34 | - [`WriteCdc`](../lambda/write-cdc/index.py): This is the final Lambda function, which writes changes from the FIFO queue to the destination DynamoDB table 35 | 36 | For background, there are two options for CDC: Amazon DynamoDB Streams or Kinesis Data Streams (KDS). This [video](https://youtu.be/UgG17Wh2y0g?si=mcBLljJ_1_YmtGst) compares the two. The choice of solution will depend on your data velocity and size; however, DynamoDB Streams guarantees that every change appears exactly once and is the option used by this playbook. 37 | 38 | > ℹ️ NOTE: DynamoDB Streams has a retention period of 24 hours, but we copy changes into SQS FIFO to get 4 days of retention out of the box. This gives us a theoretical total of 5 days to do the bulk copy from the source to the destination table 39 | 40 | ![Architecture Diagram](./DDBPlaybookUpdated.png) 41 | 42 | This is the list of the high-level steps this solution automates with AWS Step Functions to migrate information from one table to another with near zero downtime: 43 | 44 | 1. Ensure the Source Table has Amazon DynamoDB Streams enabled (a minimal pre-flight check is sketched after this list). 45 | 1. Before starting the migration process, it is required to capture all the writes from the DDB table and send them over to a FIFO queue. 46 | 2. IF the table is > 80GB, enable PITR as well 47 | 2. Read the data from DynamoDB: 48 | 1. IF the table is > 80GB, Export the DynamoDB table to S3 49 | 2. IF it is < 80GB, Scan the DynamoDB table (read the data from the data plane API) 50 | 3. Use Glue to write the items to the new DynamoDB table. 51 | 1. The items are shuffled between the read and write to limit the number of throttles on write due to an internal hashing system inherent to DynamoDB 52 | 4. Enable a Lambda trigger from the FIFO queue to a Lambda function to write the CDC data. Make sure to filter the information based on the export time from step 2. 53 | 5. Cut-over your application. 54 | 6. Delete the resources created (FIFO queue, CDC Lambda) and decommission the old stack. 
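To make step 1 concrete, below is a minimal pre-flight sketch that mirrors the checks the [`SetupCheck`](../lambda/setup-check/index.py) Lambda performs before the state machine proceeds. The standalone-script framing and the `my-source-table` name are illustrative assumptions; the 80GB threshold and the `NEW_AND_OLD_IMAGES` requirement come from this playbook.

```python
import boto3

# Hypothetical table name, for illustration only.
SOURCE_TABLE_NAME = "my-source-table"
LARGE_TABLE_THRESHOLD_BYTES = 80 * 1024 * 1024 * 1024  # 80GB, per this playbook

dynamodb = boto3.client("dynamodb")
table = dynamodb.describe_table(TableName=SOURCE_TABLE_NAME)["Table"]

# Step 1: the source table must stream NEW_AND_OLD_IMAGES.
stream_spec = table.get("StreamSpecification", {})
if not stream_spec.get("StreamEnabled") or stream_spec.get("StreamViewType") != "NEW_AND_OLD_IMAGES":
    raise SystemExit("Enable DynamoDB Streams with NEW_AND_OLD_IMAGES on the source table first.")

# Step 1.2: large tables use the export path, which requires PITR.
if table["TableSizeBytes"] > LARGE_TABLE_THRESHOLD_BYTES:
    backups = dynamodb.describe_continuous_backups(TableName=SOURCE_TABLE_NAME)
    pitr_status = backups["ContinuousBackupsDescription"]["PointInTimeRecoveryDescription"]["PointInTimeRecoveryStatus"]
    if pitr_status != "ENABLED":
        raise SystemExit("Enable PITR on the source table before running the large-table migration.")

print("Source table meets the playbook prerequisites.")
```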
55 | -------------------------------------------------------------------------------- /ddb-migration/glue-scripts/direct-migration.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.context import SparkContext 3 | from awsglue.context import GlueContext 4 | from awsglue.job import Job 5 | from awsglue.utils import getResolvedOptions 6 | from awsglue.dynamicframe import DynamicFrame 7 | from pyspark.sql import functions as F 8 | 9 | 10 | # Get job parameters 11 | args = getResolvedOptions(sys.argv, [ 12 | 'JOB_NAME', 13 | 'source_table_arn', 14 | 'destination_table_arn' 15 | ]) 16 | 17 | 18 | # Initialize Glue context 19 | sc = SparkContext() 20 | glueContext = GlueContext(sc) 21 | spark = glueContext.spark_session 22 | TASKS_PER_EXECUTOR = int(spark.sparkContext.getConf().get("spark.executor.cores")) 23 | NUM_EXECUTORS = int(spark.sparkContext.getConf().get("spark.executor.instances")) 24 | SPLITS_STR = str(NUM_EXECUTORS * TASKS_PER_EXECUTOR) 25 | job = Job(glueContext) 26 | job.init(args['JOB_NAME'], args) 27 | 28 | 29 | try: 30 | # Read from source DynamoDB table using native connector 31 | source_dyf = glueContext.create_dynamic_frame.from_options( 32 | connection_type="dynamodb", 33 | connection_options={ 34 | "dynamodb.input.tableName": args['source_table_arn'], 35 | "dynamodb.throughput.read.percent": "1.0", 36 | "dynamodb.splits": SPLITS_STR 37 | } 38 | ) 39 | records_count = source_dyf.count() 40 | print(f"Total records to migrate: {records_count}") 41 | print("Shuffling.") 42 | # Convert to DataFrame for more control 43 | df = source_dyf.toDF() 44 | 45 | # Calculate number of partitions based on data size and available resources 46 | num_partitions = max(200, records_count // 100000) # Minimum 200 partitions, or 1 partition per 100,000 records 47 | 48 | print(f"Shuffling data into {num_partitions} partitions") 49 | 50 | # Add a random UUID and use it for partitioning 51 | df_shuffled = df.withColumn("random_id", F.rand()) \ 52 | .repartition(num_partitions, "random_id") \ 53 | .drop("random_id") 54 | 55 | # Convert back to DynamicFrame 56 | shuffled_dyf = DynamicFrame.fromDF(df_shuffled, glueContext, "shuffled_dyf") 57 | 58 | # Write to destination table 59 | glueContext.write_dynamic_frame_from_options( 60 | frame=shuffled_dyf, 61 | connection_type="dynamodb", 62 | connection_options={ 63 | "dynamodb.output.tableName": args['destination_table_arn'], 64 | "dynamodb.throughput.write.percent": "1.0" 65 | } 66 | ) 67 | 68 | # Print some statistics 69 | print(f"Records processed: {records_count}") 70 | 71 | except Exception as e: 72 | print(f"Error during migration: {str(e)}") 73 | raise e 74 | 75 | finally: 76 | # Commit the job 77 | job.commit() -------------------------------------------------------------------------------- /ddb-migration/glue-scripts/large-migration.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pyspark.context import SparkContext 3 | from awsglue.context import GlueContext 4 | from awsglue.job import Job 5 | from awsglue.utils import getResolvedOptions 6 | from awsglue.dynamicframe import DynamicFrame 7 | from pyspark.sql import functions as F 8 | 9 | # Get job parameters 10 | args = getResolvedOptions(sys.argv, [ 11 | 'JOB_NAME', 12 | 'source_table_arn', 13 | 'destination_table', 14 | 'migration_bucket', 15 | 'account_id' 16 | ]) 17 | 18 | # Initialize Glue context 19 | sc = SparkContext() 20 | glueContext = GlueContext(sc) 21 | spark = 
glueContext.spark_session 22 | job = Job(glueContext) 23 | job.init(args['JOB_NAME'], args) 24 | 25 | try: 26 | print(f"Starting large table migration from {args['source_table_arn']}") 27 | 28 | # Use the export connector for reading the source table 29 | source_dyf = glueContext.create_dynamic_frame.from_options( 30 | connection_type="dynamodb", 31 | connection_options={ 32 | "dynamodb.export": "ddb", 33 | "dynamodb.tableArn": args['source_table_arn'], 34 | "dynamodb.s3.bucket": args['migration_bucket'], 35 | "dynamodb.s3.prefix": "export/", 36 | "dynamodb.s3.bucketOwner": args['account_id'], 37 | "dynamodb.simplifyDDBJson": True 38 | } 39 | ) 40 | 41 | # Print schema and count for logging 42 | print("Source table schema:") 43 | source_dyf.printSchema() 44 | records_count = source_dyf.count() 45 | print(f"Total records to migrate: {records_count}") 46 | print("Shuffling.") 47 | 48 | # Convert DynamicFrame to DataFrame, shuffle, and convert back to DynamicFrame 49 | df = source_dyf.toDF() 50 | df = df.orderBy(F.rand()) 51 | shuffled_dyf = DynamicFrame.fromDF(df, glueContext, "shuffled_dyf") 52 | 53 | # Write to destination table with controlled throughput 54 | print(f"Writing to destination table: {args['destination_table']}") 55 | glueContext.write_dynamic_frame_from_options( 56 | frame=shuffled_dyf, 57 | connection_type="dynamodb", 58 | connection_options={ 59 | "dynamodb.output.tableName": args['destination_table'], 60 | "dynamodb.throughput.write.percent": "0.9", # Higher performance 61 | "dynamodb.output.retry": "3" # Retry failed writes 62 | } 63 | ) 64 | 65 | print(f"Migration completed. {records_count} records processed.") 66 | 67 | except Exception as e: 68 | print(f"Error during migration: {str(e)}") 69 | raise e 70 | 71 | finally: 72 | # Clean up S3 export if needed (you might want to keep it for verification) 73 | try: 74 | import boto3 75 | s3 = boto3.client('s3') 76 | # List and delete exported files 77 | paginator = s3.get_paginator('list_objects_v2') 78 | for page in paginator.paginate( 79 | Bucket=args['migration_bucket'], 80 | Prefix='export/' 81 | ): 82 | if 'Contents' in page: 83 | for obj in page['Contents']: 84 | s3.delete_object( 85 | Bucket=args['migration_bucket'], 86 | Key=obj['Key'] 87 | ) 88 | print("Cleaned up S3 export files") 89 | except Exception as cleanup_error: 90 | print(f"Warning: Error during cleanup: {str(cleanup_error)}") 91 | 92 | # Commit the job 93 | job.commit() -------------------------------------------------------------------------------- /ddb-migration/jest.config.js: -------------------------------------------------------------------------------- 1 | module.exports = { 2 | testEnvironment: 'node', 3 | roots: ['/test'], 4 | testMatch: ['**/*.test.ts'], 5 | transform: { 6 | '^.+\\.tsx?$': 'ts-jest' 7 | } 8 | }; 9 | -------------------------------------------------------------------------------- /ddb-migration/lambda/enable-trigger/index.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import os 3 | import json 4 | import re 5 | from botocore.exceptions import ClientError 6 | 7 | def handler(event, context): 8 | lambda_client = boto3.client('lambda') 9 | 10 | fifo_queue_arn = os.environ['FIFO_QUEUE_ARN'] 11 | write_cdc_function_name = os.environ['WRITE_CDC_FUNCTION_NAME'] 12 | migration_time = event['migrationTime'] 13 | 14 | try: 15 | response = lambda_client.create_event_source_mapping( 16 | EventSourceArn=fifo_queue_arn, 17 | FunctionName=write_cdc_function_name, 18 | BatchSize=10, 19 
| FilterCriteria={ 20 | 'Filters': [ 21 | { 22 | 'Pattern': json.dumps({ 23 | 'timestamp': [{'numeric': ['>=', migration_time]}] 24 | }) 25 | } 26 | ] 27 | } 28 | ) 29 | return { 30 | 'EventSourceMappingId': response['UUID'] 31 | } 32 | except ClientError as e: 33 | if e.response['Error']['Code'] == 'ResourceConflictException': 34 | # Extract UUID from the exception message 35 | uuid_match = re.search(r'UUID\s+([0-9a-f-]+)', str(e)) 36 | if uuid_match: 37 | existing_uuid = uuid_match.group(1) 38 | return { 39 | 'EventSourceMappingId': existing_uuid, 40 | 'Message': 'Event source mapping already exists' 41 | } 42 | else: 43 | raise Exception("Event source mapping already exists. In addition, UUID for the mapping not found in the exception message") 44 | else: 45 | # If it's a different exception, re-raise it 46 | raise 47 | 48 | 49 | -------------------------------------------------------------------------------- /ddb-migration/lambda/setup-check/index.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import time 3 | import os 4 | import re 5 | from botocore.exceptions import ClientError 6 | 7 | def handler(event, context): 8 | dynamodb = boto3.client('dynamodb') 9 | lambda_client = boto3.client('lambda') 10 | table_arn = event['tableArn'] 11 | table_name = table_arn.split('/')[1] 12 | processor_function_name = os.environ['STREAM_PROCESSOR_FUNCTION_NAME'] 13 | 14 | try: 15 | # Get table details and size 16 | table = dynamodb.describe_table(TableName=table_name) 17 | size_bytes = table['Table']['TableSizeBytes'] 18 | is_large = size_bytes > 80 * 1024 * 1024 * 1024 # 80GB threshold 19 | 20 | # Check if streams are enabled 21 | stream_specification = table['Table'].get('StreamSpecification', {}) 22 | streams_enabled = stream_specification.get('StreamEnabled', False) 23 | stream_view_type = stream_specification.get('StreamViewType', False) 24 | 25 | # Check PITR status 26 | pitr_description = dynamodb.describe_continuous_backups(TableName=table_name) 27 | pitr_status = pitr_description['ContinuousBackupsDescription']['PointInTimeRecoveryDescription']['PointInTimeRecoveryStatus'] 28 | pitr_enabled = pitr_status == 'ENABLED' 29 | 30 | ex_message = list() 31 | if not streams_enabled: 32 | ex_message.append("DynamoDB Streams") 33 | elif stream_view_type != "NEW_AND_OLD_IMAGES": 34 | ex_message.append("DynamoDB Streams (NEW_AND_OLD_IMAGES view type)") 35 | if is_large and not pitr_enabled: 36 | ex_message.append("DynamoDB PITR") 37 | 38 | if len(ex_message): 39 | raise Exception(f"{' and '.join(ex_message)} is NOT enabled on the source table.") 40 | 41 | stream_arn = table['Table']['LatestStreamArn'] 42 | 43 | # Enable trigger for stream processor 44 | try: 45 | response = lambda_client.create_event_source_mapping( 46 | EventSourceArn=stream_arn, 47 | FunctionName=processor_function_name, 48 | StartingPosition='LATEST' 49 | ) 50 | event_source_mapping_uuid = response['UUID'] 51 | except ClientError as e: 52 | if e.response['Error']['Code'] == 'ResourceConflictException': 53 | print(f"Event source mapping already exists for stream {stream_arn} and function {processor_function_name}. 
Continuing execution.") 54 | error_message = str(e) 55 | uuid_match = re.search(r'UUID ([0-9a-f-]+)', error_message) 56 | if uuid_match: 57 | event_source_mapping_uuid = uuid_match.group(1) 58 | print(f"Extracted existing event source mapping UUID: {event_source_mapping_uuid}") 59 | else: 60 | raise e 61 | 62 | # Capture current timestamp for export 63 | export_time = int(time.time()) 64 | 65 | return { 66 | 'isLargeTable': is_large, 67 | 'tableName': table_name, 68 | 'exportTime': export_time, 69 | 'tableSize': size_bytes, 70 | 'streamsEnabled': True, 71 | 'streamArn': stream_arn, 72 | 'eventSourceMappingId': event_source_mapping_uuid 73 | } 74 | except Exception as e: 75 | print(f"Error in setup check for table {table_name}: {str(e)}") 76 | raise e -------------------------------------------------------------------------------- /ddb-migration/lambda/stream-processor/index.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import os 4 | import hashlib 5 | from datetime import datetime 6 | from botocore.exceptions import ClientError 7 | import logging 8 | 9 | # Set up logging 10 | logger = logging.getLogger() 11 | logger.setLevel(logging.INFO) 12 | 13 | sqs = boto3.client("sqs") 14 | 15 | QUEUE_URL = os.environ["SQS_FIFO_QUEUE_URL"] 16 | DLQ_URL = os.environ["SQS_DLQ_URL"] 17 | 18 | MAX_RETRIES = 3 19 | MAX_DLQ_RETRIES = 3 20 | 21 | 22 | def send_message_to_fifo(message): 23 | retry_count = 0 24 | while retry_count < MAX_RETRIES: 25 | try: 26 | logger.info( 27 | f"Attempting to send message to FIFO queue: {json.dumps(message)}" 28 | ) 29 | response = sqs.send_message( 30 | QueueUrl=QUEUE_URL, 31 | MessageBody=message["MessageBody"], 32 | MessageDeduplicationId=message["MessageDeduplicationId"], 33 | MessageGroupId=message["MessageGroupId"], 34 | ) 35 | logger.info( 36 | f"Message sent successfully to FIFO queue. Response: {json.dumps(response)}" 37 | ) 38 | return True 39 | except ClientError as e: 40 | retry_count += 1 41 | logger.warning( 42 | f"Failed to send message to FIFO queue. Attempt {retry_count} of {MAX_RETRIES}. Error: {str(e)}" 43 | ) 44 | 45 | logger.error(f"Failed to send message to FIFO queue after {MAX_RETRIES} attempts") 46 | return False 47 | 48 | 49 | def send_message_to_dlq(message): 50 | retry_count = 0 51 | while retry_count < MAX_DLQ_RETRIES: 52 | try: 53 | logger.info(f"Attempting to send message to DLQ: {json.dumps(message)}") 54 | response = sqs.send_message( 55 | QueueUrl=DLQ_URL, 56 | MessageBody=json.dumps( 57 | { 58 | "original_message": message, 59 | "error": "Failed to send to FIFO queue after multiple retries", 60 | } 61 | ), 62 | ) 63 | logger.info( 64 | f"Message sent successfully to DLQ. Response: {json.dumps(response)}" 65 | ) 66 | return True 67 | except ClientError as e: 68 | retry_count += 1 69 | logger.warning( 70 | f"Failed to send message to DLQ. Attempt {retry_count} of {MAX_DLQ_RETRIES}. 
Error: {str(e)}" 71 | ) 72 | 73 | logger.error(f"Failed to send message to DLQ after {MAX_DLQ_RETRIES} attempts") 74 | return False 75 | 76 | 77 | def handler(event, context): 78 | logger.info(f"Received event: {json.dumps(event)}") 79 | 80 | for record in event["Records"]: 81 | if record["eventName"] in ["INSERT", "MODIFY", "DELETE"]: 82 | item = { 83 | k: list(v.values())[0] 84 | for k, v in record["dynamodb"] 85 | .get("NewImage" if record["eventName"] != "DELETE" else "OldImage", {}) 86 | .items() 87 | } 88 | timestamp = record["dynamodb"]["ApproximateCreationDateTime"] 89 | 90 | message_id = generate_unique_id(item, timestamp) 91 | 92 | message = { 93 | "id": message_id, 94 | "data": item, 95 | "event_type": record["eventName"], 96 | "timestamp": datetime.fromtimestamp(timestamp).isoformat(), 97 | } 98 | 99 | fifo_message = { 100 | "MessageBody": json.dumps(message), 101 | "MessageDeduplicationId": message_id, 102 | "MessageGroupId": item.get("pk", "default"), 103 | } 104 | 105 | success = send_message_to_fifo(fifo_message) 106 | if success: 107 | logger.info(f"Successfully sent message {message_id} to FIFO queue") 108 | else: 109 | logger.error( 110 | f"Failed to send message {message_id} to FIFO queue. Attempting to send to DLQ." 111 | ) 112 | dlq_success = send_message_to_dlq(fifo_message) 113 | if dlq_success: 114 | logger.info(f"Successfully sent message {message_id} to DLQ") 115 | else: 116 | logger.error( 117 | f"Failed to send message {message_id} to both FIFO queue and DLQ" 118 | ) 119 | 120 | 121 | def generate_unique_id(item, timestamp): 122 | unique_string = json.dumps(item, sort_keys=True) + str(timestamp) 123 | return hashlib.md5(unique_string.encode()).hexdigest() 124 | -------------------------------------------------------------------------------- /ddb-migration/lambda/write-cdc/index.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | from botocore.exceptions import ClientError 4 | import os 5 | 6 | sqs = boto3.client("sqs") 7 | dynamodb = boto3.resource("dynamodb") 8 | 9 | SQS_QUEUE_URL = os.environ["SQS_FIFO_QUEUE_URL"] 10 | DYNAMODB_TABLE_NAME = os.environ["DESTINATION_TABLE_NAME"] 11 | DLQ_URL = os.environ["SQS_DLQ_URL"] 12 | MAX_BATCH_SIZE = 25 13 | MAX_RETRIES = 3 14 | 15 | 16 | def handler(event, context): 17 | table = dynamodb.Table(DYNAMODB_TABLE_NAME) 18 | processed_count = 0 19 | failed_messages = [] 20 | 21 | try: 22 | items_to_write = [] 23 | for record in event["Records"]: 24 | message = json.loads(record["body"]) 25 | 26 | item = message["data"] 27 | # item["event_type"] = message["event_type"] 28 | # item["timestamp"] = message["timestamp"] 29 | # item["sqs_message_id"] = record["messageId"] 30 | 31 | items_to_write.append(item) 32 | 33 | if len(items_to_write) == MAX_BATCH_SIZE: 34 | unprocessed = batch_write_to_dynamodb(items_to_write) 35 | process_results(unprocessed, items_to_write, failed_messages) 36 | processed_count += len(items_to_write) - len(unprocessed) 37 | items_to_write = [] 38 | 39 | # Write any remaining items 40 | if items_to_write: 41 | unprocessed = batch_write_to_dynamodb(items_to_write) 42 | process_results(unprocessed, items_to_write, failed_messages) 43 | processed_count += len(items_to_write) - len(unprocessed) 44 | 45 | # Handle failed messages 46 | if failed_messages: 47 | send_to_dlq_with_retry(failed_messages) 48 | 49 | # Delete successfully processed messages 50 | delete_processed_messages(event["Records"], failed_messages) 51 | 52 | return { 53 | 
"statusCode": 200, 54 | "body": json.dumps( 55 | f"Successfully processed {processed_count} messages, Failed: {len(failed_messages)}" 56 | ), 57 | } 58 | except Exception as e: 59 | print(f"Error processing messages: {str(e)}") 60 | return { 61 | "statusCode": 500, 62 | "body": json.dumps(f"Error processing messages: {str(e)}"), 63 | } 64 | 65 | 66 | def batch_write_to_dynamodb(items): 67 | try: 68 | response = dynamodb.batch_write_item( 69 | RequestItems={ 70 | DYNAMODB_TABLE_NAME: [{"PutRequest": {"Item": item}} for item in items] 71 | } 72 | ) 73 | unprocessed = response.get("UnprocessedItems", {}).get(DYNAMODB_TABLE_NAME, []) 74 | return [item["PutRequest"]["Item"] for item in unprocessed] 75 | except ClientError as e: 76 | print(f"Error batch writing to DynamoDB: {str(e)}") 77 | return items # Consider all items as failed if there's a client error 78 | 79 | 80 | def process_results(unprocessed_items, all_items, failed_messages): 81 | for item in unprocessed_items: 82 | failed_messages.append( 83 | { 84 | "Id": item.get("id", "unknown"), 85 | "MessageBody": json.dumps( 86 | { 87 | "data": item, 88 | "event_type": item.get("event_type"), 89 | "timestamp": item.get("timestamp"), 90 | } 91 | ), 92 | "ReceiptHandle": next( 93 | ( 94 | record["receiptHandle"] 95 | for record in event["Records"] 96 | if record["messageId"] == item["sqs_message_id"] 97 | ), 98 | None, 99 | ), 100 | } 101 | ) 102 | 103 | 104 | def send_to_dlq_with_retry(messages): 105 | for message in messages: 106 | retry_count = 0 107 | while retry_count < MAX_RETRIES: 108 | try: 109 | sqs.send_message( 110 | QueueUrl=DLQ_URL, 111 | MessageBody=json.dumps( 112 | { 113 | "original_message": message, 114 | "error": "Failed to write to DynamoDB", 115 | } 116 | ), 117 | ) 118 | print(f"Message {message['Id']} sent to DLQ") 119 | break 120 | except Exception as e: 121 | retry_count += 1 122 | print( 123 | f"Attempt {retry_count} failed to send message {message['Id']} to DLQ: {str(e)}" 124 | ) 125 | 126 | if retry_count == MAX_RETRIES: 127 | print( 128 | f"CRITICAL: Failed to send message {message['Id']} to DLQ after {MAX_RETRIES} attempts" 129 | ) 130 | 131 | 132 | def delete_processed_messages(all_messages, failed_messages): 133 | failed_message_ids = {msg["Id"] for msg in failed_messages} 134 | messages_to_delete = [ 135 | {"Id": msg["messageId"], "ReceiptHandle": msg["receiptHandle"]} 136 | for msg in all_messages 137 | if msg["messageId"] not in failed_message_ids 138 | ] 139 | 140 | if messages_to_delete: 141 | sqs.delete_message_batch(QueueUrl=SQS_QUEUE_URL, Entries=messages_to_delete) 142 | -------------------------------------------------------------------------------- /ddb-migration/lib/account-check/index.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import urllib3 4 | 5 | http = urllib3.PoolManager() 6 | 7 | def send_response(event, context, status, reason=None): 8 | response_body = { 9 | "Status": status, 10 | "Reason": reason or "", 11 | "PhysicalResourceId": context.log_stream_name, 12 | "StackId": event["StackId"], 13 | "RequestId": event["RequestId"], 14 | "LogicalResourceId": event["LogicalResourceId"], 15 | } 16 | 17 | url = event["ResponseURL"] 18 | encoded_body = json.dumps(response_body).encode("utf-8") 19 | headers = {"Content-Type": ""} 20 | 21 | try: 22 | http.request("PUT", url, body=encoded_body, headers=headers) 23 | except Exception as e: 24 | print(f"Failed to send response: {str(e)}") 25 | 26 | 27 | def lambda_handler(event, 
context): 28 | print(f"Received event: {json.dumps(event)}") 29 | 30 | try: 31 | if event["RequestType"] in ["Create", "Update"]: 32 | source_account_id = event["ResourceProperties"]["SourceAccountId"] 33 | current_account_id = boto3.client("sts").get_caller_identity()["Account"] 34 | 35 | if source_account_id != current_account_id: 36 | raise ValueError( 37 | f"The account ID of the DynamoDB source table {source_account_id} does not match current account ID {current_account_id}." 38 | ) 39 | 40 | # Respond success 41 | send_response(event, context, "SUCCESS") 42 | 43 | except Exception as e: 44 | print(f"Error: {str(e)}") 45 | send_response(event, context, "FAILED", str(e)) -------------------------------------------------------------------------------- /ddb-migration/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ddb-migration", 3 | "version": "0.1.0", 4 | "bin": { 5 | "ddb-migration": "bin/ddb-migration.js" 6 | }, 7 | "scripts": { 8 | "build": "tsc", 9 | "watch": "tsc -w", 10 | "test": "jest", 11 | "cdk": "cdk" 12 | }, 13 | "devDependencies": { 14 | "@aws-sdk/client-dynamodb": "^3.682.0", 15 | "@aws-sdk/client-glue": "^3.684.0", 16 | "@aws-sdk/client-s3": "^3.685.0", 17 | "@types/jest": "^29.5.12", 18 | "@types/node": "22.5.4", 19 | "aws-cdk": "2.158.0", 20 | "jest": "^29.7.0", 21 | "ts-jest": "^29.2.5", 22 | "ts-node": "^10.9.2", 23 | "typescript": "~5.6.2" 24 | }, 25 | "dependencies": { 26 | "aws-cdk-lib": "^2.177.0", 27 | "constructs": "^10.4.2", 28 | "source-map-support": "^0.5.21" 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /ddb-migration/test/ddb-migration.test.ts: -------------------------------------------------------------------------------- 1 | // import * as cdk from 'aws-cdk-lib'; 2 | // import { Template } from 'aws-cdk-lib/assertions'; 3 | // import * as DdbMigration from '../lib/ddb-migration-stack'; 4 | 5 | // example test. 
To run these tests, uncomment this file along with the 6 | // example resource in lib/ddb-migration-stack.ts 7 | test('SQS Queue Created', () => { 8 | // const app = new cdk.App(); 9 | // // WHEN 10 | // const stack = new DdbMigration.DdbMigrationStack(app, 'MyTestStack'); 11 | // // THEN 12 | // const template = Template.fromStack(stack); 13 | 14 | // template.hasResourceProperties('AWS::SQS::Queue', { 15 | // VisibilityTimeout: 300 16 | // }); 17 | }); 18 | -------------------------------------------------------------------------------- /ddb-migration/tsconfig.json: -------------------------------------------------------------------------------- 1 | { 2 | "compilerOptions": { 3 | "target": "ES2020", 4 | "module": "commonjs", 5 | "lib": [ 6 | "es2020", 7 | "dom" 8 | ], 9 | "declaration": true, 10 | "strict": true, 11 | "noImplicitAny": true, 12 | "strictNullChecks": true, 13 | "noImplicitThis": true, 14 | "alwaysStrict": true, 15 | "noUnusedLocals": false, 16 | "noUnusedParameters": false, 17 | "noImplicitReturns": true, 18 | "noFallthroughCasesInSwitch": false, 19 | "inlineSourceMap": true, 20 | "inlineSources": true, 21 | "experimentalDecorators": true, 22 | "strictPropertyInitialization": false, 23 | "typeRoots": [ 24 | "./node_modules/@types" 25 | ] 26 | }, 27 | "exclude": [ 28 | "node_modules", 29 | "cdk.out" 30 | ] 31 | } 32 | -------------------------------------------------------------------------------- /ddb_cost_tool/README.MD: -------------------------------------------------------------------------------- 1 | # DynamoDB Cost tool metrics and metadata retrieval 2 | 3 | This script will help you obtain the table metadata and metrics for the tables in your current account. 4 | 5 | With the `--regions` parameter you can specify the regions from which you want to obtain the information. If you specify `all`, this script will collect information for all the AWS Cloud regions available (that you have access to). 6 | 7 | ![Execution](./documentation/running_script.gif) 8 | 9 | This is one sample execution. 
10 | 11 | ```shell 12 | ❯ cd cost_tool 13 | ❯ python ddb_cost_tool.py --regions us-east-1 us-east-2 14 | us-east-1 15 | Collecting DynamoDB tables metadata in us-east-1: 16 | Obtaining information for table chalice-cost-sandbox-costtableCCA192A6-1WBAKAMYOYROL 17 | Obtaining information for table describe_sample_table 18 | Obtaining information for table my_handler_table 19 | Obtaining information for table my_table 20 | Obtaining information for table sample-org-table 21 | Obtaining information for Continuous Backups for table chalice-cost-sandbox-costtableCCA192A6-1WBAKAMYOYROL 22 | Obtaining information for Continuous Backups for table my_handler_table 23 | Obtaining information for Continuous Backups for table my_table 24 | Obtaining information for Continuous Backups for table sample-org-table 25 | Obtaining information for Continuous Backups for table describe_sample_table 26 | us-east-2 27 | Collecting DynamoDB tables metadata in us-east-2: 28 | Obtaining information for table describe_sample_table 29 | Obtaining information for Continuous Backups for table describe_sample_table 30 | Get metrics for 1 and 5 minutes 31 | Working with table_id 7618f4f8-2fdf-4ca5-8dcf-a18c4f474e8a 32 | Working with table_id 7442fbd5-0f2b-431a-badb-e2de080e342e 33 | Working with table_id acbc28f5-0a22-4ad4-9038-e43da98dd5lsb6 34 | Working with table_id 9f38ae29-ed3a-446c-b377-9cf3129680f1 35 | Working with table_id 7abf7966-3dde-41b8-bc42-4b77339ab4af 36 | Working with table_id 31f7719d-1a1e-4cd2-9eb3-ae9d79a41a1f 37 | Finished processing table 7618f4f8-2fdf-4ca5-8dcf-a18c4f474e8a 38 | Finished processing table 7abf7966-3dde-41b8-bc42-4b77339ab4af 39 | Finished processing table 9f38ae29-ed3a-446c-b377-9cf3129680f1 40 | Finished processing table acbc28f5-0a22-4ad4-9038-e43da98dd5b6 41 | Finished processing table 7442fbd5-0f2b-431a-badb-e2de080e342e 42 | Finished processing table 31f7719d-1a1e-4cd2-9eb3-ae9d79a41a1f 43 | 44 | ❯ ls ./output 45 | 31f7719d-1a1e-4cd2-9eb3-ae9d79a41a1f.tar.gz 46 | 7442fbd5-0f2b-431a-badb-e2de080e342e.tar.gz 47 | 7618f4f8-2fdf-4ca5-8dcf-a18c4f474e8a.tar.gz 48 | 7abf7966-3dde-41b8-bc42-4b77339ab4af.tar.gz 49 | 9f38ae29-ed3a-446c-b377-9cf3129680f1.tar.gz 50 | acbc28f5-0a22-4ad4-9038-e43da98dd5b6.tar.gz 51 | ``` 52 | 53 | ## Required permissions 54 | 55 | The role that you assume to execute this script must provide access to capture table metadata and CloudWatch metrics. You can always use the `"*"` value in the resource fields below; however, it is recommended to follow least-privilege access. You can create a role like the one in the example below to provide access to only the tables from `us-east-1` and `us-east-2`: 56 | 57 | ```JSON 58 | { 59 | "Version": "2012-10-17", 60 | "Statement": [ 61 | { 62 | "Sid": "DDBPermissions", 63 | "Effect": "Allow", 64 | "Action": [ 65 | "dynamodb:DescribeTable", 66 | "dynamodb:ListTables", 67 | "dynamodb:DescribeContinuousBackups", 68 | "dynamodb:ListTagsOfResource" 69 | ], 70 | "Resource": ["arn:aws:dynamodb:us-east-1::table/*", "arn:aws:dynamodb:us-east-2::table/*" ] 71 | }, 72 | { 73 | "Sid": "CloudWatchPermissions", 74 | "Effect": "Allow", 75 | "Action": "cloudwatch:GetMetricData", 76 | "Resource": "*" 77 | } 78 | ] 79 | } 80 | 81 | ``` 82 | 83 | ## Results 84 | 85 | The `cost_tool/output` folder will contain the compressed results of this script, which can be directly uploaded to the [DynamoDB Cost Portal](https://bit.ly/3JzReos). 
86 | 87 | You will need to enter the username `ddb-cost-tool` and the password `DDB-cost-tool-2024`. This portal will allow you to upload the metadata files so the cost optimization tool can provide the analysis. 88 | 89 | ![Upload Results](./documentation/ddb-cost-tool-upload.gif) 90 | 91 | After submitting your credentials, generate a new report. The current version includes 3 fields (`user_id`, `customer_name`, and `aws_alias`) that are utilized for report search. You may input any desired values for these fields, as they are solely for identification purposes when accessing the report results. Please provide the generated `report_id` to your AWS account SA or TAM, and ask them to retrieve the results in the [DynamoDB Cost Optimization report page](https://tiny.amazon.com/12cqro6jh). Please note that only AWS resources have access to this page. 92 | 93 | When the analysis is complete, your AWS representative can access the report: 94 | 95 | ![Report_View](./documentation/Report_view.png) 96 | 97 | Once you select the report ID, the uploaded tables will be displayed. 98 | 99 | ![Tables_view](./documentation/Tables_view.png) 100 | 101 | Clicking on the tables will provide you with in-depth information about each one. 102 | 103 | ![Table_report_view](./documentation/Table_report_view.png) 104 | 105 | If you have any issue, feedback, or comment, please open an issue and tag the cost optimization tool or the script. 106 | 107 | Thanks! 108 | -------------------------------------------------------------------------------- /ddb_cost_tool/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/ddb_cost_tool/__init__.py -------------------------------------------------------------------------------- /ddb_cost_tool/config/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/ddb_cost_tool/config/__init__.py -------------------------------------------------------------------------------- /ddb_cost_tool/config/metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "dimensionMetrics": [ 3 | { "metric_name": "ConsumedReadCapacityUnits", "stat": "Sum" }, 4 | { "metric_name": "ConsumedWriteCapacityUnits", "stat": "Sum" }, 5 | { "metric_name": "ProvisionedReadCapacityUnits", "stat": "Average" }, 6 | { "metric_name": "ProvisionedWriteCapacityUnits", "stat": "Average" }, 7 | { "metric_name": "ReadThrottleEvents", "stat": "Sum" }, 8 | { "metric_name": "WriteThrottleEvents", "stat": "Sum" } 9 | ] 10 | } 11 | -------------------------------------------------------------------------------- /ddb_cost_tool/ddb_cost_tool.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | SPDX-License-Identifier: MIT-0 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 5 | software and associated documentation files (the "Software"), to deal in the Software 6 | without restriction, including without limitation the rights to use, copy, modify, 7 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so. 
9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | DynamoDB Cost Optimization Tool. 17 | 18 | """ 19 | import argparse 20 | import logging 21 | import os 22 | import shutil 23 | import sys 24 | from multiprocessing import Pool 25 | 26 | import region 27 | 28 | logging.getLogger().setLevel(logging.INFO) 29 | logger = logging.getLogger("cost-optimization-tool") 30 | log = logging.StreamHandler() 31 | logger.addHandler(log) 32 | 33 | REGIONS = [ 34 | "all", # All Regions" 35 | "us-east-2", # US East (Ohio) 36 | "us-east-1", # US East (N. Virginia) 37 | "us-west-1", # US West (N. California) 38 | "us-west-2", # US West (Oregon) 39 | "af-south-1", # Africa (Cape Town) 40 | "ap-east-1", # Asia Pacific (Hong Kong) 41 | "ap-south-2", # Asia Pacific (Hyderabad) 42 | "ap-southeast-3", # Asia Pacific (Jakarta) 43 | "ap-southeast-4", # Asia Pacific (Melbourne) 44 | "ap-south-1", # Asia Pacific (Mumbai) 45 | "ap-northeast-3", # Asia Pacific (Osaka) 46 | "ap-northeast-2", # Asia Pacific (Seoul) 47 | "ap-southeast-1", # Asia Pacific (Singapore) 48 | "ap-southeast-2", # Asia Pacific (Sydney) 49 | "ap-northeast-1", # Asia Pacific (Tokyo) 50 | "ca-central-1", # Canada (Central) 51 | "ca-west-1", # Canada West (Calgary) 52 | "eu-central-1", # Europe (Frankfurt) 53 | "eu-west-1", # Europe (Ireland) 54 | "eu-west-2", # Europe (London) 55 | "eu-south-1", # Europe (Milan) 56 | "eu-west-3", # Europe (Paris) 57 | "eu-south-2", # Europe (Spain) 58 | "eu-north-1", # Europe (Stockholm) 59 | "eu-central-2", # Europe (Zurich) 60 | "il-central-1", # Israel (Tel Aviv) 61 | "me-south-1", # Middle East (Bahrain) 62 | "me-central-1", # Middle East (UAE) 63 | "sa-east-1", # South America (São Paulo) 64 | "us-gov-east-1", # AWS GovCloud (US-East) 65 | "us-gov-west-1", # AWS GovCloud (US-West) 66 | ] 67 | 68 | 69 | def main(): 70 | """Main function that will run when the script is run 71 | 72 | Raises: 73 | argparse.ArgumentError: validates if the region provided is valid 74 | 75 | Returns: 76 | [list]: List of regions provided by the user 77 | """ 78 | parser = argparse.ArgumentParser(description="DynamoDB Cost Optimization Tool") 79 | parser.add_argument("--regions", nargs="+", help="Provide an array of values") 80 | try: 81 | args = parser.parse_args() 82 | 83 | regions = args.regions 84 | print(regions) 85 | if "all" in regions: 86 | REGIONS.pop(0) 87 | return REGIONS 88 | else: 89 | tmp = [i for i in regions if i in REGIONS] 90 | if len(tmp) == len(regions): 91 | return regions 92 | else: 93 | raise argparse.ArgumentError( 94 | None, "You have one or more invalid region names" 95 | ) 96 | except argparse.ArgumentError as e: 97 | logger.error(e) 98 | sys.exit(1) 99 | 100 | 101 | def get_local_files(tables): 102 | """Core logic that runs the capture metrics method in parallel. 103 | 104 | Args: 105 | tables (dict): Table metadata as result of describe table and other describe* api call. 
106 | """ 107 | logger.info("Get metrics for 1 and 5 minutes") 108 | pool = Pool() # Defaults to max CPUs available 109 | results = pool.map(region.capture_metrics, tables) 110 | pool.close() # Prevents any more tasks being submitted to the pool 111 | pool.join() # Waits for the worker process to exit, you need to call close() or terminate() before using join 112 | return results 113 | 114 | 115 | def clean_env(): 116 | """Removes the files in the output folder path""" 117 | try: 118 | output_path = "./output" 119 | shutil.rmtree(output_path) 120 | print(f"The directory '{output_path}' has been successfully deleted.") 121 | os.mkdir(output_path) 122 | except FileNotFoundError: 123 | print(f"The directory '{output_path}' does not exist.") 124 | except Exception as e: 125 | print(f"An error occurred while deleting the directory: {e}") 126 | 127 | 128 | def get_ddb_table_metrics(region_name): 129 | """Obtains DynamoDB tables and its metadata for later use in the calculations 130 | 131 | Returns: 132 | list: An array containing all the describe table information for all the tables 133 | """ 134 | logger.info("Collecting DynamoDB tables metadata in {0}:".format(region_name)) 135 | local_tables = region.get_local_tables(region_name) 136 | fn_arguments = [(i, region_name) for i in local_tables] 137 | 138 | with Pool() as pool: 139 | return pool.starmap(region.get_ddb_base_object, fn_arguments) 140 | 141 | 142 | if __name__ == "__main__": 143 | region_names = main() 144 | table_metadata = [] 145 | clean_env() 146 | for region_name in region_names: 147 | print(region_name) 148 | table_metadata.extend(get_ddb_table_metrics(region_name)) 149 | get_local_files(table_metadata) 150 | 151 | sys.exit(0) 152 | -------------------------------------------------------------------------------- /ddb_cost_tool/ddb_table.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | SPDX-License-Identifier: MIT-0 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 5 | software and associated documentation files (the "Software"), to deal in the Software 6 | without restriction, including without limitation the rights to use, copy, modify, 7 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so. 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
15 | 16 | ddb_table.py Contains helper functions for the ddb_cost_tool.py 17 | 18 | """ 19 | 20 | 21 | def prettify_describe_json(describe_ddb: dict) -> dict: 22 | prov_throughput = describe_ddb.get("ProvisionedThroughput") 23 | if prov_throughput: 24 | describe_ddb["throughput"] = {} 25 | describe_ddb["throughput"]["rcu"] = prov_throughput["ReadCapacityUnits"] 26 | describe_ddb["throughput"]["wcu"] = prov_throughput["WriteCapacityUnits"] 27 | 28 | billing_mode = describe_ddb.get("BillingModeSummary") 29 | if billing_mode: 30 | describe_ddb["billing_mode"] = ( 31 | "on_demand" 32 | if billing_mode["BillingMode"] == "PAY_PER_REQUEST" 33 | else "provisioned" 34 | ) 35 | else: 36 | describe_ddb["billing_mode"] = "provisioned" 37 | 38 | stream_spec = describe_ddb.get("StreamSpecification") 39 | describe_ddb["stream_spec"] = {} 40 | if stream_spec: 41 | describe_ddb["stream_spec"]["stream_enabled"] = stream_spec["StreamEnabled"] 42 | describe_ddb["stream_spec"]["stream_view_type"] = stream_spec["StreamViewType"] 43 | describe_ddb["stream_spec"]["stream_arn"] = describe_ddb["LatestStreamArn"] 44 | else: 45 | describe_ddb["stream_spec"]["stream_enabled"] = False 46 | return describe_ddb 47 | 48 | 49 | def get_metric_dimensions(table): 50 | table_name = table["TableName"] 51 | base_dimension = {"Name": "TableName", "Value": f"{table_name}"} 52 | global_secondary_indexes = table.get("GlobalSecondaryIndexes", []) 53 | local_secondary_indexes = table.get("LocalSecondaryIndexes", []) 54 | table_dimensions = [[base_dimension]] 55 | if global_secondary_indexes: 56 | for gsi in global_secondary_indexes: 57 | if gsi: 58 | table_dimensions.append( 59 | [ 60 | base_dimension, 61 | { 62 | "Name": "GlobalSecondaryIndexName", 63 | "Value": gsi["IndexName"], 64 | }, 65 | ] 66 | ) 67 | 68 | if local_secondary_indexes: 69 | for lsi in local_secondary_indexes: 70 | if lsi: 71 | table_dimensions.append( 72 | [ 73 | base_dimension, 74 | { 75 | "Name": "LocalSecondaryIndexName", 76 | "Value": lsi["IndexName"], 77 | }, 78 | ] 79 | ) 80 | return table_dimensions 81 | -------------------------------------------------------------------------------- /ddb_cost_tool/documentation/Report_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/ddb_cost_tool/documentation/Report_view.png -------------------------------------------------------------------------------- /ddb_cost_tool/documentation/Table_report_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/ddb_cost_tool/documentation/Table_report_view.png -------------------------------------------------------------------------------- /ddb_cost_tool/documentation/Tables_view.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/ddb_cost_tool/documentation/Tables_view.png -------------------------------------------------------------------------------- /ddb_cost_tool/documentation/ddb-cost-tool-upload.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/ddb_cost_tool/documentation/ddb-cost-tool-upload.gif 
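*Illustrative note (not a file in this repository):* the `get_metric_dimensions` helper in `ddb_cost_tool/ddb_table.py` above returns one CloudWatch dimension set for the base table plus one per GSI/LSI, which `ddb_cost_tool/metrics.py` then expands into `get_metric_data` queries. Below is a minimal sketch of its output; the table definition is hypothetical.

```python
from ddb_table import get_metric_dimensions  # assumes the ddb_cost_tool directory is on the path

# Hypothetical shape of describe_table()["Table"] for a table with one GSI.
table = {
    "TableName": "orders",
    "GlobalSecondaryIndexes": [{"IndexName": "by_customer"}],
}

# One dimension set for the base table, plus one per secondary index.
for dimensions in get_metric_dimensions(table):
    print(dimensions)
# [{'Name': 'TableName', 'Value': 'orders'}]
# [{'Name': 'TableName', 'Value': 'orders'}, {'Name': 'GlobalSecondaryIndexName', 'Value': 'by_customer'}]
```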
-------------------------------------------------------------------------------- /ddb_cost_tool/documentation/running_script.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/ddb_cost_tool/documentation/running_script.gif -------------------------------------------------------------------------------- /ddb_cost_tool/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | SPDX-License-Identifier: MIT-0 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 5 | software and associated documentation files (the "Software"), to deal in the Software 6 | without restriction, including without limitation the rights to use, copy, modify, 7 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so. 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | """ 17 | import json 18 | import logging 19 | import os 20 | from datetime import datetime, timedelta 21 | 22 | import boto3 23 | import pandas as pd 24 | 25 | session = boto3.Session() 26 | cw_client = session.client("cloudwatch") 27 | 28 | logging.getLogger().setLevel(logging.INFO) 29 | logger = logging.getLogger("metrics") 30 | log = logging.StreamHandler() 31 | logger.addHandler(log) 32 | 33 | METRICS_FILE = "config/metrics.json" 34 | 35 | 36 | def format_metric_query( # pylint: disable=dangerous-default-value 37 | metrics: dict, # pylint: disable=redefined-outer-name 38 | dimension: dict, 39 | periods: list = [60, 300], # pylint: disable=dangerous-default-value 40 | ) -> dict: 41 | """Helper function that formats the metrics as required by CloudWatch 42 | 43 | Args: 44 | metrics (dict): metrics dict 45 | dimension (dict): dimension dict 46 | periods (list, optional): List of periods. Defaults to [300, 3600]. 47 | 48 | Returns: 49 | dict: The JSON required by CW. 
50 | """ 51 | metric_data_query = [] 52 | for period in periods: 53 | for metric in metrics: 54 | metric_data_query.append( 55 | { 56 | "Id": metric["metric_name"].lower(), 57 | "MetricStat": { 58 | "Metric": { 59 | "Namespace": "AWS/DynamoDB", 60 | "MetricName": metric["metric_name"], 61 | "Dimensions": dimension, 62 | }, 63 | "Period": period, 64 | "Stat": metric["stat"], 65 | }, 66 | "Label": metric["metric_name"], 67 | "ReturnData": True, 68 | } 69 | ) 70 | return metric_data_query 71 | 72 | 73 | def get_metrics_file(): 74 | with open(METRICS_FILE, "r") as jsonfile: 75 | data = json.load(jsonfile) 76 | 77 | return data["dimensionMetrics"] 78 | 79 | 80 | def get_local_metric_data( # pylint: disable=too-many-arguments, inconsistent-return-statements 81 | metric_data_query: dict, 82 | period: int, 83 | client: object = cw_client, 84 | ) -> pd.DataFrame: # pylint: disable=too-many-arguments 85 | """Captures the table metrics and returns them as a dataframe ready to staore in S3, 86 | Json format that can be imported later. 87 | 88 | Args: 89 | get_local_metric_data (dict): Provides the dimesion that will be sent to CW 90 | period (int): period 60 or 300 91 | client (object, optional): CloudWatch Client. Defaults to cw_client. 92 | 93 | Returns: 94 | pd.DataFrame: The dataframe object with the cloudwatch metrics 95 | """ 96 | try: 97 | results = {} 98 | metric_data_query = format_metric_query( 99 | get_metrics_file(), metric_data_query, [period] 100 | ) 101 | start_date, end_date = get_start_end_date(period) 102 | logger.debug("Getting metric data from %s, to %s", start_date, end_date) 103 | logger.debug("Metric Data Query: %s", metric_data_query) 104 | logger.debug("Period: %s", period) 105 | response = client.get_metric_data( 106 | MetricDataQueries=metric_data_query, 107 | StartTime=start_date, 108 | EndTime=end_date, 109 | ) 110 | # print(response["MetricDataResults"]) 111 | for metric in response["MetricDataResults"]: 112 | results[metric["Label"]] = {"Timestamps": [], "Values": []} 113 | results[metric["Label"]]["Values"] += metric["Values"] 114 | results[metric["Label"]]["Timestamps"] += metric["Timestamps"] 115 | while "NextToken" in response: 116 | response = client.get_metric_data( 117 | MetricDataQueries=metric_data_query, 118 | StartTime=start_date, 119 | EndTime=end_date, 120 | NextToken=response["NextToken"], 121 | ) 122 | for metric in response["MetricDataResults"]: 123 | results[metric["Label"]]["Values"] += metric["Values"] 124 | results[metric["Label"]]["Timestamps"] += metric["Timestamps"] 125 | 126 | time_series_pd = [] 127 | for res, data in results.items(): 128 | time_series_pd.append( 129 | pd.Series( 130 | data["Values"], 131 | name=res, 132 | dtype="float64", 133 | index=data["Timestamps"], 134 | ) 135 | ) 136 | 137 | result = pd.concat([i for i in time_series_pd], axis=1) 138 | # result.index = pd.to_datetime(result.index) 139 | # https://github.com/pandas-dev/pandas/issues/39537 140 | # result.index = pd.to_datetime(result.index).tz_convert("UTC") 141 | result = result.fillna(0) 142 | 143 | if result.empty: 144 | return_value = None 145 | else: 146 | return_value = result.to_json(orient="table") 147 | # return_value = result.to_json() 148 | return return_value 149 | except client.exceptions.InvalidParameterValueException as exception: 150 | logger.exception(exception) 151 | # To Do 152 | # pass 153 | except client.exceptions.InternalServiceFault as exception: 154 | logger.exception(exception) 155 | # To Do 156 | # pass 157 | 158 | 159 | def 
get_start_end_date(period: int) -> str: 160 | """Obtain the start and end date for the API calls, 161 | for 1 minute, return the last 15 days, and for 5 minutes, return the last 63. 162 | 163 | Args: 164 | period (int): 60 | 300 165 | 166 | Returns: 167 | str: Start and end date in isoformat, start_date, end_date 168 | """ 169 | # TODO - Update this to 15 days for 1 minute 170 | base = {60: 15, 300: 63} 171 | now = datetime.now() 172 | 173 | def round_down_to_nearest_multiple(value, multiple): 174 | """Round down to the nearest multiple of a number""" 175 | return value - (value % multiple) 176 | 177 | # Round down to the nearest minute based on the period (in multiples of 5) 178 | # This improves CW response times. 179 | rounded_minute = round_down_to_nearest_multiple(now.minute, int(period / 60)) 180 | end_date = now.replace(minute=rounded_minute, second=0, microsecond=0) 181 | 182 | time_delta = timedelta(days=base[period]) 183 | start_date = end_date - time_delta 184 | 185 | return start_date.isoformat(), end_date.isoformat() 186 | -------------------------------------------------------------------------------- /ddb_cost_tool/requirements.txt: -------------------------------------------------------------------------------- 1 | # statsmodels==0.14.0 2 | pandas==1.5.0 3 | # numpy==1.23.3 4 | aws-lambda-powertools==1.30.0 -------------------------------------------------------------------------------- /ddbtools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/ddbtools/__init__.py -------------------------------------------------------------------------------- /ddbtools/constants.py: -------------------------------------------------------------------------------- 1 | """bill calculation constants""" 2 | GB_IN_BYTES = 1024*1024*1024 3 | HOURS_IN_MONTH = 730 4 | 5 | # table data constants 6 | ESTIMATED_MONTHLY_COSTS = 'table_mo_costs' 7 | GSI_MONTHLY_COSTS = 'gsi_mo_costs' 8 | PRICING_DATA = 'pricing_data' 9 | REPLICAS = 'replicas' 10 | TABLE_CLASS = 'table_class' 11 | TABLE_NAME = 'table_name' 12 | 13 | # table pricing data 14 | BILLING_MODE = 'billing_mode' 15 | GSIS = 'global_secondary_indexes' 16 | INDEX_NAME = 'index_name' 17 | INDEX_ARN = 'index_arn' 18 | IA_RCU_PRICING = 'ia_rcu_pricing' 19 | IA_TABLE_CLASS = 'STANDARD_INFREQUENT_ACCESS' 20 | IA_WCU_PRICING = 'ia_wcu_pricing' 21 | PROVISIONED_RCUS = 'provisioned_rcus' 22 | PROVISIONED_WCUS = 'provisioned_wcus' 23 | REPLICATED_IA_WCU_PRICING = 'replicated_ia_wcu_pricing' 24 | REPLICATED_STD_WCU_PRICING = 'replicated_std_wcu_pricing' 25 | SIZE_IN_GB = 'size_in_gb' 26 | STD_RCU_PRICING = 'std_rcu_pricing' 27 | STD_TABLE_CLASS = 'STANDARD' 28 | STD_WCU_PRICING = 'std_wcu_pricing' 29 | TABLE_ARN = 'table_arn' 30 | 31 | # Pricing API constants 32 | DDB_RESOURCE_CODE = 'AmazonDynamoDB' 33 | PROVISIONED_BILLING = 'PROVISIONED' 34 | ON_DEMAND_BILLING = 'PAY_PER_REQUEST' 35 | STD_VOLUME_TYPE = 'Amazon DynamoDB - Indexed DataStore' 36 | IA_VOLUME_TYPE = 'Amazon DynamoDB - Indexed DataStore - IA' 37 | 38 | # calculated table costs 39 | IA_MO_COST_DIFFERENCE = 'ia_mo_cost_difference' 40 | IA_MO_STORAGE_COST = 'ia_mo_storage_cost' 41 | IA_MO_RCU_COST = 'ia_mo_rcu_cost' 42 | IA_MO_WCU_COST = 'ia_mo_wcu_cost' 43 | IA_MO_TOTAL_COST = 'ia_mo_total_cost' 44 | STD_MO_STORAGE_COST = 'std_storage_cost' 45 | STD_MO_RCU_COST = 'std_mo_rcu_cost' 46 | STD_MO_WCU_COST = 'std_mo_wcu_cost' 47 | STD_MO_TOTAL_COST = 
'std_mo_total_cost' 48 | STD_MO_STORAGE_FACTOR = 'std_mo_storage_factor' 49 | 50 | # recommendation constants 51 | ESTIMATED_MO_SAVINGS = 'estimated_monthly_savings' 52 | RECOMMENDATION_TYPE = 'recommendation_type' 53 | RECOMMENDED_TABLE_CLASS = 'recommended_table_class' 54 | TABLE_CLASS_CHANGE_RECOMMENDATION = 'CHANGE_TABLE_CLASS' 55 | ESTIMATE_DETAIL = 'estimate_detail' 56 | TOTAL_IA_MO_COSTS = 'total_ia_mo_costs' 57 | TOTAL_STD_MO_COSTS = 'total_std_mo_costs' 58 | 59 | # region constants 60 | AMERICAN_REGIONS = ['us-east-1', 'us-east-2', 61 | 'us-west-1', 'us-west-2', 62 | 'us-gov-west-1', 'us-gov-west-2', 63 | 'ca-central-1', 'sa-east-1'] 64 | -------------------------------------------------------------------------------- /ddbtools/mysql_s3.py: -------------------------------------------------------------------------------- 1 | import mysql.connector 2 | import decimal 3 | import datetime 4 | import boto3 5 | 6 | sql = "SELECT * FROM customer_features.v_features WHERE record_type = 'events'" 7 | s3_bucket = 's3-import-demo' 8 | s3_path = 'demo/' 9 | region = 'us-east-2' 10 | items_per_file = 5 11 | 12 | 13 | def main(): 14 | mydb = mysql.connector.connect( 15 | host="my-endpoint-host.us-east-1.rds.amazonaws.com", 16 | user="admin", 17 | password="mriA6p5M7eH" 18 | ) 19 | 20 | cur = mydb.cursor(buffered=True, dictionary=True) 21 | 22 | cur.execute(sql) 23 | 24 | res = cur.fetchall() 25 | rowcount = 0 26 | filetext = '' 27 | for row in res: 28 | if rowcount % items_per_file == 0 and rowcount > 0: 29 | write_s3(s3_bucket, s3_path, f'data_upto_{rowcount}.json', filetext) 30 | filetext = '' 31 | rowcount += 1 32 | rowtext = '{"Item":{' 33 | for key in row: 34 | if row[key] is not None: 35 | rowtext += parse_attr(key, row[key]) + ',' 36 | rowtext = rowtext[:-1] + '}}' 37 | 38 | filetext += rowtext + '\n' 39 | 40 | write_s3(s3_bucket, s3_path, f'data_upto_{rowcount}.json', filetext) 41 | 42 | 43 | def write_s3(bucket, path, objname, obj): 44 | client = boto3.client('s3', region_name=region) 45 | fullpath = path + objname 46 | res = client.put_object( 47 | Body=obj, 48 | Bucket=bucket, 49 | Key=fullpath, 50 | ACL='public-read') 51 | 52 | print(f'HTTP {res["ResponseMetadata"]["HTTPStatusCode"]} for S3 object s3://{bucket}/{path}{objname}') 53 | 54 | return 'ok' 55 | 56 | 57 | def parse_attr(key, value): 58 | rtype = 'S' 59 | rvalue = '' 60 | if isinstance(value, int): 61 | rvalue = str(value) 62 | rtype = 'N' 63 | 64 | elif isinstance(value, decimal.Decimal): 65 | rvalue = str(value) 66 | rtype = 'N' 67 | 68 | elif isinstance(value, datetime.datetime): 69 | rvalue = str(value) 70 | rtype = 'S' 71 | 72 | else: 73 | rvalue = value 74 | rtype = 'S' 75 | 76 | return '"' + key + '":{"' + rtype + '":"' + rvalue + '"}' 77 | 78 | 79 | if __name__ == "__main__": 80 | main() 81 | -------------------------------------------------------------------------------- /ddbtools/pricing.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | 4 | from ddbtools import constants 5 | from decimal import Decimal 6 | 7 | class PricingUtility(object): 8 | def __init__(self, region_name, profile_name=None): 9 | closest_api_region = 'us-east-1' 10 | 11 | # the pricing API is only available in us-east-1 and ap-south-1 12 | # pick the closest endpoint to the supplied region 13 | if region_name not in constants.AMERICAN_REGIONS: 14 | closest_api_region = 'ap-south-1' 15 | 16 | self.session = boto3.session.Session(profile_name=profile_name) 17 | 
self.pricing_client = self.session.client('pricing', region_name=closest_api_region) 18 | 19 | def get_replicated_write_pricing(self, region_code: str) -> dict: 20 | """Get DynamoDB replicated write (for global tables) pricing for a given region.""" 21 | replicated_writes_pricing = {} 22 | 23 | response = self.pricing_client.get_products( 24 | ServiceCode='AmazonDynamoDB', 25 | Filters=[{'Type': 'TERM_MATCH', 26 | 'Field': 'productFamily', 27 | 'Value': 'DDB-Operation-ReplicatedWrite'}, 28 | {'Type': 'TERM_MATCH', 29 | 'Field': 'regionCode', 30 | 'Value': region_code} 31 | ], 32 | FormatVersion='aws_v1', 33 | MaxResults=100 34 | ) 35 | price_list = response['PriceList'] 36 | 37 | for entry in price_list: 38 | product = json.loads(entry) 39 | product_group = product['product']['attributes']['group'] 40 | offer = product['terms']['OnDemand'].popitem() 41 | offer_terms = offer[1] 42 | price_dimensions = offer_terms['priceDimensions'] 43 | 44 | for price_dimension_code in price_dimensions: 45 | price_terms = price_dimensions[price_dimension_code] 46 | price_per_unit = price_terms['pricePerUnit']['USD'] 47 | price = Decimal(price_per_unit) 48 | 49 | # Regions with free tier pricing will have an initial entry set to zero; skip this 50 | if price != 0: 51 | if product_group == 'DDB-ReplicatedWriteUnits': 52 | replicated_writes_pricing[constants.REPLICATED_STD_WCU_PRICING] = price 53 | elif product_group == 'DDB-ReplicatedWriteUnitsIA': 54 | replicated_writes_pricing[constants.REPLICATED_IA_WCU_PRICING] = price 55 | 56 | return replicated_writes_pricing 57 | 58 | 59 | def get_storage_pricing(self, region_code: str) -> dict: 60 | """Get pricing for all DynamoDB storage classes in this region.""" 61 | storage_pricing = {} 62 | storage_pricing[constants.STD_VOLUME_TYPE] = self.get_storage_class_pricing(region_code, 63 | constants.STD_VOLUME_TYPE) 64 | storage_pricing[constants.IA_VOLUME_TYPE] = self.get_storage_class_pricing(region_code, 65 | constants.IA_VOLUME_TYPE) 66 | return storage_pricing 67 | 68 | 69 | def get_storage_class_pricing(self, region_code: str, volume_type: str) -> Decimal: 70 | """Get table class pricing by looking for a specific volume type in the specified region.""" 71 | response = self.pricing_client.get_products( 72 | ServiceCode=constants.DDB_RESOURCE_CODE, 73 | Filters=[{'Type': 'TERM_MATCH', 74 | 'Field': 'volumeType', 75 | 'Value': volume_type}, 76 | {'Type': 'TERM_MATCH', 77 | 'Field': 'regionCode', 78 | 'Value': region_code} 79 | ], 80 | FormatVersion='aws_v1', 81 | MaxResults=1 82 | ) 83 | 84 | price_list = response['PriceList'] 85 | product = json.loads(price_list[0]) 86 | offer = product['terms']['OnDemand'].popitem() 87 | offer_terms = offer[1] 88 | price_dimensions = offer_terms['priceDimensions'] 89 | 90 | for price_dimension_code in price_dimensions: 91 | price_terms = price_dimensions[price_dimension_code] 92 | price_per_unit = price_terms['pricePerUnit']['USD'] 93 | storage_pricing = Decimal(price_per_unit) 94 | 95 | # Regions with free tier pricing will have an initial entry set to zero; skip this 96 | if storage_pricing != 0: 97 | return storage_pricing 98 | 99 | return None 100 | 101 | 102 | def get_provisioned_capacity_pricing(self, region_code: str) -> dict: 103 | """Get DynamoDB provisioned capacity pricing for a given region.""" 104 | throughput_pricing = {} 105 | 106 | response = self.pricing_client.get_products( 107 | ServiceCode='AmazonDynamoDB', 108 | Filters=[{'Type': 'TERM_MATCH', 109 | 'Field': 'productFamily', 110 | 'Value': 'Provisioned IOPS'}, 
111 | {'Type': 'TERM_MATCH', 112 | 'Field': 'regionCode', 113 | 'Value': region_code} 114 | ], 115 | FormatVersion='aws_v1', 116 | MaxResults=100 117 | ) 118 | price_list = response['PriceList'] 119 | 120 | for entry in price_list: 121 | product = json.loads(entry) 122 | product_group = product['product']['attributes']['group'] 123 | offer = product['terms']['OnDemand'].popitem() 124 | offer_terms = offer[1] 125 | price_dimensions = offer_terms['priceDimensions'] 126 | 127 | for price_dimension_code in price_dimensions: 128 | price_terms = price_dimensions[price_dimension_code] 129 | price_per_unit = price_terms['pricePerUnit']['USD'] 130 | price = Decimal(price_per_unit) 131 | 132 | # Regions with free tier pricing will have an initial entry set to zero; skip this 133 | if price != 0: 134 | if product_group == 'DDB-ReadUnits': 135 | throughput_pricing[constants.STD_RCU_PRICING] = price 136 | elif product_group == 'DDB-WriteUnits': 137 | throughput_pricing[constants.STD_WCU_PRICING] = price 138 | elif product_group == 'DDB-ReadUnitsIA': 139 | throughput_pricing[constants.IA_RCU_PRICING] = price 140 | elif product_group == 'DDB-WriteUnitsIA': 141 | throughput_pricing[constants.IA_WCU_PRICING] = price 142 | 143 | return throughput_pricing -------------------------------------------------------------------------------- /ddbtools/util.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import json 3 | 4 | from ddbtools import constants 5 | from decimal import Decimal 6 | 7 | class DecimalEncoder(json.JSONEncoder): 8 | """Convert the Decimal type to a string for display in JSON data""" 9 | def default(self, obj): 10 | if isinstance(obj, Decimal): 11 | return f"{obj:.2f}" 12 | return json.JSONEncoder.default(self, obj) 13 | 14 | -------------------------------------------------------------------------------- /item_size_calculator/README.md: -------------------------------------------------------------------------------- 1 | # DynamoDB-ItemSizeCalculator 2 | 3 | > Utility tool to calculate the size of DynamoDB items. 4 | 5 | [![NPM Version][npm-image]][npm-url] 6 | [![Downloads Stats][npm-downloads]][npm-url] 7 | 8 | Utility tool to gain item size information in Bytes for DynamoDB JSON items. This allows us to understand capacity consumption and ensure items are under the 400KB DynamoDB item size limit. 9 | 10 | DynamoDB SDKs cater for both DDB-JSON and Native JSON. This package can be used to calculate both. 
By default, it uses DDB-JSON but you can alter methods to take Native JSON by passing boolean value `true` as a parameter to the method: 11 | 12 | ```js 13 | CalculateSize(item, true) 14 | ``` 15 | 16 | ![DynamoDB Icon](https://www.cdata.com/blog/articles/20191018-dynamodb-performance-0.png) 17 | 18 | ## Installation 19 | 20 | OS X & Linux: 21 | 22 | ```sh 23 | npm install ddb-calc --save 24 | ``` 25 | 26 | ## Usage example 27 | 28 | ### **Require** 29 | 30 | ```js 31 | const CALC = require('ddb-calc') 32 | ``` 33 | 34 | ### **Sample DynamoDB JSON item** 35 | 36 | ```js 37 | const item = { 38 | "Id": { 39 | "N": "101" 40 | }, 41 | "Title": { 42 | "S": "Book 101 Title" 43 | }, 44 | "ISBN": { 45 | "S": "111-1111111111" 46 | }, 47 | "Authors": { 48 | "L": [ 49 | { 50 | "S": "Author1" 51 | } 52 | ] 53 | }, 54 | "Price": { 55 | "N": "2" 56 | }, 57 | "Dimensions": { 58 | "S": "8.5 x 11.0 x 0.5" 59 | }, 60 | "PageCount": { 61 | "N": "500" 62 | }, 63 | "InPublication": { 64 | "BOOL": true 65 | }, 66 | "ProductCategory": { 67 | "S": "Book" 68 | } 69 | } 70 | ``` 71 | 72 | ### **Calculate Size** 73 | 74 | ```js 75 | const size = CALC.CalculateSize(item); 76 | ``` 77 | 78 | ```js 79 | { 80 | rcu: 1, 81 | wcu: 1, 82 | size: 137 // in Bytes 83 | } 84 | ``` 85 | 86 | ### **Understand if an item is under the 400KB limit** 87 | 88 | ```js 89 | const isValid = CALC.IsUnderLimit(item); 90 | ``` 91 | 92 | ### **Sample Native JSON item** 93 | 94 | ```js 95 | const item = { 96 | "Id": 101, 97 | "Title": "Book 101 Title", 98 | "ISBN": "111-1111111111", 99 | "Authors": [ 100 | "Author1" 101 | ], 102 | "Price": 2, 103 | "Dimensions": "8.5 x 11.0 x 0.5", 104 | "PageCount": 500, 105 | "InPublication": true, 106 | "ProductCategory": "Book" 107 | } 108 | ``` 109 | 110 | ### **Calculate Size** 111 | 112 | ```js 113 | const size = CALC.CalculateSize(item, true); 114 | ``` 115 | 116 | ```js 117 | { 118 | rcu: 1, 119 | wcu: 1, 120 | size: 137 // in Bytes 121 | } 122 | ``` 123 | 124 | ### **Understand if an item is under the 400KB limit** 125 | 126 | ```js 127 | const isValid = CALC.IsUnderLimit(item, true); 128 | ``` 129 | 130 | ## Release History 131 | 132 | * 0.0.4 133 | * Alter: Native JSON now supported by bool value: `CalculateSizeJson(item, true)` 134 | * 0.0.3 135 | * ADD: Added native JSON functions `CalculateSizeJson()` and `IsUnderLimitJson()` 136 | * 0.0.2 137 | * ADD: Added `marshalling` capability for native JSON 138 | * 0.0.1 139 | * The first proper release 140 | * ADD: Added `isUnderLimit()` function 141 | * 0.0.0 142 | * Work in progress 143 | 144 | ## Contributing 145 | 146 | 1. Fork it () 147 | 2. Create your feature branch (`git checkout -b feature/fooBar`) 148 | 3. Commit your changes (`git commit -am 'Add some fooBar'`) 149 | 4. Push to the branch (`git push origin feature/fooBar`) 150 | 5. 
Create a new Pull Request 151 | 152 | 153 | [npm-image]: https://img.shields.io/npm/v/ddb-calc.svg?style=flat-square 154 | [npm-url]: https://npmjs.org/package/ddb-calc 155 | [npm-downloads]: https://img.shields.io/npm/dm/ddb-calc.svg?style=flat-square 156 | -------------------------------------------------------------------------------- /item_size_calculator/index.min.js: -------------------------------------------------------------------------------- 1 | const utf8=require("utf8"),Decimal=require("decimal.js"),{marshall:e}=require("@aws-sdk/util-dynamodb"),NESTED_OVERHEAD=1,MAP_LIST_OVERHEAD=3;function sizeInBytes(e){if(!e)return 0;var r=0;return Object.keys(e).forEach(t=>{e.hasOwnProperty(t)&&(r+=utf8.encode(t).length,r+=attributeSizeBytes(e[t]))}),r}function attributeSizeBytes(e){if(!e)return 0;if(e.hasOwnProperty("S"))return utf8.encode(e.S).length;if(e.hasOwnProperty("N"))return numberSizeBytes(e.N);if(e.hasOwnProperty("B"))return atob(e.B).length;if(e.hasOwnProperty("BOOL")||e.hasOwnProperty("NULL"))return 1;if(e.hasOwnProperty("SS")){for(var r=0,t=0;t{e.M.hasOwnProperty(t)&&(r+=utf8.encode(t).length,r+=attributeSizeBytes(e.M[t]),r+=1)}),r}if(e.hasOwnProperty("L")){for(var r=3,t=0;t21&&(n=21),n}function measure(e){if(-1!==e.indexOf(".")){var r=e.split("."),t=r[0],n=r[1];return"0"===t&&(t="",n=zeros(n,!0)),t.length%2!=0&&(t="Z"+t),n.length%2!=0&&(n+="Z"),measure(t+n)}return Math.ceil((e=zeros(e,!0,!0)).length/2)}function zeros(e,r,t){for(;r;){var n=e.replace(/^(0{2})/,"");if(n.length==e.length)break;e=n}for(;t;){var n=e.replace(/(0{2})$/,"");if(n.length==e.length)break;e=n}return e}exports.CalculateSize=(r,t)=>{if(t)try{r=e(r)}catch(n){throw n}var i=sizeInBytes(r),a=Math.ceil(i/4096),u=Math.ceil(i/1024);let s={rcu:a,wcu:u,size:i};return s},exports.IsUnderLimit=(r,t)=>{if(t)try{r=e(r)}catch(n){throw n}let i=this.CalculateSize(r);return i.size<4e5}; -------------------------------------------------------------------------------- /item_size_calculator/package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ddb-calc", 3 | "version": "1.0.2", 4 | "lockfileVersion": 1, 5 | "requires": true, 6 | "dependencies": { 7 | "@aws-sdk/util-dynamodb": { 8 | "version": "3.17.0", 9 | "resolved": "https://registry.npmjs.org/@aws-sdk/util-dynamodb/-/util-dynamodb-3.17.0.tgz", 10 | "integrity": "sha512-B/N8/6fa8nCvMXKBjNpynk5c2/D9uCu+tk2Pjvnd++yQU5jf1YRavLZVP1i7UQMKPQKJkJH9NeEWgFKc/K/WvQ==", 11 | "requires": { 12 | "tslib": "^2.0.0" 13 | } 14 | }, 15 | "decimal.js": { 16 | "version": "10.2.1", 17 | "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.2.1.tgz", 18 | "integrity": "sha512-KaL7+6Fw6i5A2XSnsbhm/6B+NuEA7TZ4vqxnd5tXz9sbKtrN9Srj8ab4vKVdK8YAqZO9P1kg45Y6YLoduPf+kw==" 19 | }, 20 | "tslib": { 21 | "version": "2.2.0", 22 | "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.2.0.tgz", 23 | "integrity": "sha512-gS9GVHRU+RGn5KQM2rllAlR3dU6m7AcpJKdtH8gFvQiC4Otgk98XnmMU+nZenHt/+VhnBPWwgrJsyrdcw6i23w==" 24 | }, 25 | "utf8": { 26 | "version": "3.0.0", 27 | "resolved": "https://registry.npmjs.org/utf8/-/utf8-3.0.0.tgz", 28 | "integrity": "sha512-E8VjFIQ/TyQgp+TZfS6l8yp/xWppSAHzidGiRrqe4bK4XP9pTRyKFgGJpO3SN7zdX4DeomTrwaseCHovfpFcqQ==" 29 | } 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /item_size_calculator/package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "ddb-calc", 3 | "version": "1.0.4", 4 | "description": 
"Calculate the size of DynamoDB item in DynamoDB JSON", 5 | "main": "index.min.js", 6 | "scripts": { 7 | "test": "echo \"Error: no test specified\" && exit 1" 8 | }, 9 | "repository": { 10 | "type": "git", 11 | "url": "git+https://github.com/LeeroyHannigan/DynamoDB-ItemSizeCalculator.git" 12 | }, 13 | "keywords": [ 14 | "dynamodb", 15 | "item", 16 | "size", 17 | "calculator", 18 | "limit" 19 | ], 20 | "author": "Lee Hannigan", 21 | "license": "ISC", 22 | "bugs": { 23 | "url": "https://github.com/LeeroyHannigan/DynamoDB-ItemSizeCalculator/issues" 24 | }, 25 | "homepage": "https://github.com/LeeroyHannigan/DynamoDB-ItemSizeCalculator#readme", 26 | "dependencies": { 27 | "@aws-sdk/util-dynamodb": "^3.17.0", 28 | "decimal.js": "^10.2.1", 29 | "utf8": "^3.0.0" 30 | }, 31 | "devDependencies": {} 32 | } 33 | -------------------------------------------------------------------------------- /metrics-collector/README.md: -------------------------------------------------------------------------------- 1 | # DynamoDB Metrics Collector 2 | 3 | ## 📊 Uncover Hidden Insights in Your DynamoDB Tables 4 | 5 | Are your DynamoDB tables optimized for performance and cost? The DynamoDB Metrics Collector is here to help you find out! This tool scans your AWS regions, identifies provisioned DynamoDB tables, and provides detailed utilization metrics to help you optimize your database infrastructure. 6 | 7 | ![DynamoDB Metrics Collector Demo](./documentation/metrics.gif) 8 | 9 | ## 🚀 Features 10 | 11 | - 🌎 **Multi-Region Support**: Scans all your AWS regions automatically 12 | - 🔍 **Smart Table Detection**: Identifies provisioned DynamoDB tables 13 | - 📈 **Comprehensive Metrics**: Collects read and write utilization data 14 | - 💡 **Utilization Insights**: Highlights tables with low utilization (below 45%) 15 | - 📊 **CSV Exports**: Generates easy-to-analyze CSV reports 16 | 17 | ## 🛠 Installation 18 | 19 | This project is built with Poetry for dependency management. To get started: 20 | 21 | 1. Clone the repository: 22 | 23 | `git clone https://github.com/awslabs/amazon-dynamodb-tools.git cd metrics-collector` 24 | 25 | 26 | 2. Install dependencies with Poetry: 27 | `poetry install` 28 | 29 | 3. Alternatively, you can use pip with the provided `requirements.txt`: 30 | `pip install -r requirements.txt` 31 | 32 | The install might take a couple of minutes because of the dependencies. 33 | 34 | ## 🏃‍♂️ Usage 35 | 36 | Run the metrics collector with a single command: 37 | 38 | ```bash 39 | python -m metrics_collector.utilization_example --start-time 2025-02-19 40 | 41 | Options: 42 | 43 | --start-time: Specify the start time for metric collection (ISO8601 format) 44 | --end-time: Specify the end time (defaults to current time if not provided) 45 | --config: Path to a custom configuration file 46 | --output: Custom name for the output CSV file 47 | ``` 48 | 49 | ## 📊 Output 50 | 51 | The tool generates two CSV files: 52 | 53 | ```bash 54 | dynamodb_utilization_YYYYMMDD_HHMMSS.csv: Lists tables with utilization below 45% 55 | dynamodb_raw_metrics_YYYYMMDD_HHMMSS.csv: Contains raw metric data for all tables 56 | ``` 57 | 58 | ### 🖥 Sample Output 59 | 60 | ``` 61 | ❯ pip install -r requirements.txt 62 | Collecting aioboto3==13.4.0 63 | Downloading aioboto3-13.4.0-py3-none-any.whl (34 kB) 64 | Collecting aiobotocore==2.18.0 65 | Downloading aiobotocore-2.18.0-py3-none-any.whl (77 kB) 66 | ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 77.6/77.6 kB 3.5 MB/s eta 0:00:00 67 | ... 
68 | Successfully installed aioboto3-13.4.0 aiobotocore-2.18.0 aiofiles-24.1.0 aiohappyeyeballs-2.4.6 aiohttp-3.11.12 aioitertools-0.12.0 aiosignal-1.3.2 async-timeout-5.0.1 asyncio-3.4.3 attrs-25.1.0 boto3-1.36.1 botocore-1.36.1 colorama-0.4.6 frozenlist-1.5.0 idna-3.10 multidict-6.1.0 propcache-0.2.1 python-dateutil-2.9.0.post0 s3transfer-0.11.2 simpleeval-1.0.3 six-1.17.0 tqdm-4.67.1 typing-extensions-4.12.2 urllib3-2.3.0 wrapt-1.17.2 yarl-1.18.3 69 | 70 | ❯ python -m metrics_collector.utilization_example --start-time 2025-02-19 71 | 2025-02-19T15:40:04.456749 - INFO - Initializing DynamoDBMetricsCollector 72 | 2025-02-19T15:40:04.456851 - INFO - Collecting metrics from 2025-02-19 00:00:00+00:00 to 2025-02-19 23:59:59.999999+00:00 73 | 2025-02-19T15:40:04.456873 - INFO - Fetching all AWS regions... 74 | 2025-02-19T15:40:05.461387 - INFO - Found 17 regions. 75 | 2025-02-19T15:40:05.461463 - INFO - Identifying provisioned tables in each region... 76 | Scanning regions: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 17/17 [00:13<00:00, 1.23it/s] 77 | 2025-02-19T15:40:19.284727 - INFO - Found 14 provisioned tables across all regions. 78 | 2025-02-19T15:40:19.285119 - INFO - Collecting metrics for provisioned tables... 79 | Collecting metrics: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 14/14 [00:04<00:00, 3.46it/s] 80 | 2025-02-19T15:40:23.335240 - INFO - Metrics collected and stored successfully. 81 | 2025-02-19T15:40:23.335333 - INFO - Found 13 tables with utilization below 45% 82 | 2025-02-19T15:40:23.335774 - INFO - Tables with low utilization are written to dynamodb_utilization_20250219_154023.csv 83 | 2025-02-19T15:40:23.349258 - INFO - Raw metrics data written to dynamodb_raw_metrics_20250219_154023.csv 84 | 2025-02-19T15:40:23.361457 - INFO - Raw metrics data written to dynamodb_raw_metrics_20250219_154023.csv 85 | ``` 86 | 87 | ## 🔧 Configuration 88 | 89 | The metrics collection is driven by a configuration file named `metrics_config.json`. This file allows you to customize which metrics are collected and how they are calculated. 90 | 91 | ### Sample `metrics_config.json`: 92 | 93 | ```json 94 | { 95 | "metrics": [ 96 | { 97 | "Id": "consumed_read", 98 | "MetricName": "ConsumedReadCapacityUnits", 99 | "Stat": "Sum" 100 | }, 101 | { 102 | "Id": "provisioned_read", 103 | "MetricName": "ProvisionedReadCapacityUnits", 104 | "Stat": "Average" 105 | }, 106 | { 107 | "Id": "consumed_write", 108 | "MetricName": "ConsumedWriteCapacityUnits", 109 | "Stat": "Sum" 110 | }, 111 | { 112 | "Id": "provisioned_write", 113 | "MetricName": "ProvisionedWriteCapacityUnits", 114 | "Stat": "Average" 115 | } 116 | ], 117 | "period": 300, 118 | "calculations": [ 119 | { 120 | "id": "read_utilization", 121 | "formula": "(consumed_read / period) / provisioned_read", 122 | "required_vars": ["consumed_read", "provisioned_read"] 123 | }, 124 | { 125 | "id": "write_utilization", 126 | "formula": "(consumed_write / period) / provisioned_write", 127 | "required_vars": ["consumed_write", "provisioned_write"] 128 | } 129 | ] 130 | } 131 | ``` 132 | 133 | ### Configuration Breakdown: 134 | 135 | 1. **metrics**: An array of metrics to collect from CloudWatch. 136 | - `Id`: A unique identifier for the metric. 137 | - `MetricName`: The name of the CloudWatch metric. 138 | - `Stat`: The statistic to retrieve (e.g., "Sum", "Average"). 139 | 140 | 2. 
**period**: The time period (in seconds) for each data point. 141 | 142 | 3. **calculations**: An array of custom calculations to perform on the collected metrics. 143 | - `id`: A unique identifier for the calculation. 144 | - `formula`: The mathematical formula to calculate the metric. 145 | - `required_vars`: Variables required for the calculation. 146 | 147 | ### Customizing Metrics 148 | 149 | You can modify this file to collect different metrics or perform custom calculations: 150 | 151 | 1. To add a new metric: 152 | - Append to the `metrics` array with the appropriate CloudWatch metric details. 153 | 154 | 2. To create a new calculation: 155 | - Add to the `calculations` array with your custom formula. 156 | 157 | This configuration flexibility allows you to tailor the metrics collection to your specific needs and focus on the DynamoDB performance aspects most relevant to your use case. A short worked example of how the utilization formulas are evaluated is included at the end of this README. 158 | 159 | 160 | ## 📈 Visualize Your Data 161 | 162 | A companion project that simplifies metric visualization is in the making. Stay tuned for future updates! 163 | 164 | Once it is available, the DynamoDB Metrics Visualizer companion project will turn your CSV data into interactive dashboards and charts. 165 | 166 | For now, the report presents its data in CSV, for example: 167 | 168 | | Region | Table Name | Read Utilization | Write Utilization | 169 | |--------|------------|------------------|-------------------| 170 | | us-east-1 | Table-Acccount-A | 0.00 | 0.00 | 171 | | us-east-1 | my_handler_table | 0.00 | 0.00 | 172 | | us-east-1 | my_table | 0.00 | 0.00 | 173 | | us-east-1 | vpc-test-table-01 | 0.00 | 0.06 | 174 | | us-east-1 | vpc-test-table-02 | 0.00 | 0.12 | 175 | | us-east-1 | vpc-test-table-03 | 0.00 | 0.18 | 176 | | us-east-1 | vpc-test-table-04 | 0.00 | 0.24 | 177 | | us-east-1 | vpc-test-table-05 | 0.00 | 0.30 | 178 | | us-east-1 | vpc-test-table-06 | 0.00 | 0.36 | 179 | | us-east-1 | vpc-test-table-07 | 0.00 | 0.41 | 180 | | us-east-1 | vpc-test-table-09 | 0.00 | 0.00 | 181 | | us-east-1 | vpc-test-table-10 | 0.00 | 0.00 | 182 | | ap-southeast-2 | ddbeventstable-StreamsSampleDDBTable-5W08OVKQE1PN | 0.00 | 0.00 | 183 | 184 | ## Compatibility 185 | 186 | This project has been tested with: 187 | 188 | - Python 3.10.6 189 | - Python 3.13.1 190 | 191 | ## 🤝 Contributing 192 | 193 | We welcome contributions! Please see our [Contributing Guide](../CONTRIBUTING.md) for more details. 194 | 195 | ## 📜 License 196 | 197 | This project is licensed under the MIT License - see the [LICENSE](../LICENSE) file for details. 198 | 199 | Built with ❤️ by DynamoDB Specialist Solutions Architects.
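## 🧮 Appendix: Evaluating a utilization formula

The `read_utilization` and `write_utilization` entries in the configuration above are plain arithmetic expressions evaluated against each CloudWatch data point using the `simpleeval` package, the same mechanism `metrics_collector/metrics.py` uses. The sketch below is illustrative only and not part of the package; the metric values are made-up sample numbers.

```python
# Minimal sketch (assumptions: 5-minute period, hypothetical data point values).
from simpleeval import simple_eval

period = 300  # seconds, from the "period" key in the configuration file

# One CloudWatch data point for one table: 12,000 read units consumed over the
# 5-minute period (Sum statistic), 100 RCU provisioned (Average statistic).
datapoint = {"consumed_read": 12000.0, "provisioned_read": 100.0}

formula = "(consumed_read / period) / provisioned_read"
read_utilization = simple_eval(formula, names={**datapoint, "period": period})

print(f"read_utilization = {read_utilization:.2f}")  # 0.40 -> roughly 40% average utilization
```

Because the consumed metrics use the `Sum` statistic, dividing by `period` converts them to an average per-second consumption rate, which is then compared against the provisioned capacity for that interval.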
-------------------------------------------------------------------------------- /metrics-collector/documentation/metrics.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/metrics-collector/documentation/metrics.gif -------------------------------------------------------------------------------- /metrics-collector/metric_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "metrics": [ 3 | { 4 | "Id": "consumed_read", 5 | "MetricName": "ConsumedReadCapacityUnits", 6 | "Stat": "Sum" 7 | }, 8 | { 9 | "Id": "provisioned_read", 10 | "MetricName": "ProvisionedReadCapacityUnits", 11 | "Stat": "Average" 12 | }, 13 | { 14 | "Id": "consumed_write", 15 | "MetricName": "ConsumedWriteCapacityUnits", 16 | "Stat": "Sum" 17 | }, 18 | { 19 | "Id": "provisioned_write", 20 | "MetricName": "ProvisionedWriteCapacityUnits", 21 | "Stat": "Average" 22 | } 23 | ], 24 | "period": 300, 25 | "calculations": [ 26 | { 27 | "id": "read_utilization", 28 | "formula": "(consumed_read / period) / provisioned_read", 29 | "required_vars": ["consumed_read", "provisioned_read"] 30 | }, 31 | { 32 | "id": "write_utilization", 33 | "formula": "(consumed_write / period) / provisioned_write", 34 | "required_vars": ["consumed_write", "provisioned_write"] 35 | } 36 | ] 37 | } 38 | -------------------------------------------------------------------------------- /metrics-collector/metrics_collector/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /metrics-collector/metrics_collector/collector.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | SPDX-License-Identifier: MIT-0 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 5 | software and associated documentation files (the "Software"), to deal in the Software 6 | without restriction, including without limitation the rights to use, copy, modify, 7 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so. 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | DynamoDB metrics collector 17 | """ 18 | 19 | import asyncio 20 | import aioboto3 21 | from tqdm import tqdm 22 | from tqdm.asyncio import tqdm_asyncio 23 | from metrics_collector.metrics import get_metrics_for_table 24 | from metrics_collector.logger_config import setup_logger 25 | 26 | logger = setup_logger(__name__) 27 | 28 | MAX_CONCURRENT_REGIONS = 10 29 | MAX_CONCURRENT_TABLE_CHECKS = 1000 30 | 31 | 32 | class DynamoDBMetricsCollector: 33 | """ 34 | A class to collect and analyze DynamoDB table metrics across all AWS regions. 
35 | """ 36 | 37 | def __init__(self, config): 38 | self.session = aioboto3.Session() 39 | self.table_semaphore = asyncio.Semaphore(MAX_CONCURRENT_TABLE_CHECKS) 40 | self.region_semaphore = asyncio.Semaphore(MAX_CONCURRENT_REGIONS) 41 | self.config = config 42 | logger.info("Initializing DynamoDBMetricsCollector") 43 | 44 | async def get_all_regions(self): 45 | """ 46 | Fetch all available AWS regions. 47 | 48 | Returns: 49 | list: A list of AWS region names. 50 | """ 51 | async with self.session.client("ec2") as ec2: 52 | response = await ec2.describe_regions() 53 | return [region["RegionName"] for region in response["Regions"]] 54 | 55 | async def get_tables_in_region(self, region): 56 | """ 57 | Retrieve all DynamoDB tables in a specific region. 58 | 59 | Args: 60 | region (str): The AWS region name. 61 | 62 | Returns: 63 | list: A list of table names in the specified region. 64 | """ 65 | async with self.session.client("dynamodb", region_name=region) as dynamodb: 66 | tables = [] 67 | paginator = dynamodb.get_paginator("list_tables") 68 | try: 69 | async for page in paginator.paginate(): 70 | tables.extend(page.get("TableNames", [])) 71 | return tables 72 | except Exception as e: 73 | logger.error(f"Error retrieving tables for region {region}: {str(e)}") 74 | return [] 75 | 76 | async def get_table_billing_mode(self, region, table_name): 77 | """ 78 | Determine the billing mode of a specific DynamoDB table. 79 | 80 | Args: 81 | region (str): The AWS region name. 82 | table_name (str): The name of the DynamoDB table. 83 | 84 | Returns: 85 | tuple: A tuple containing the table name and its billing mode. 86 | """ 87 | async with self.table_semaphore: 88 | async with self.session.client("dynamodb", region_name=region) as dynamodb: 89 | try: 90 | response = await dynamodb.describe_table(TableName=table_name) 91 | return table_name, response["Table"].get("BillingModeSummary", {}).get("BillingMode", "PROVISIONED") 92 | except Exception as e: 93 | logger.error(f"Error getting billing mode for table {table_name} in region {region}: {str(e)}") 94 | return table_name, None 95 | 96 | async def get_provisioned_tables(self, region): 97 | """ 98 | Retrieve all provisioned tables in a specific region. 99 | 100 | Args: 101 | region (str): The AWS region name. 102 | 103 | Returns: 104 | list: A list of provisioned table names in the specified region. 105 | """ 106 | tables = await self.get_tables_in_region(region) 107 | tasks = [self.get_table_billing_mode(region, table) for table in tables] 108 | results = await asyncio.gather(*tasks) 109 | return [table for table, billing_mode in results if billing_mode == "PROVISIONED"] 110 | 111 | async def get_tables_and_metrics(self, region, start_time, end_time): 112 | """ 113 | Retrieve provisioned tables and their metrics for a specific region. 114 | 115 | Args: 116 | region (str): The AWS region name. 117 | start_time (datetime): The start time for metric collection. 118 | end_time (datetime): The end time for metric collection. 119 | 120 | Returns: 121 | tuple: A tuple containing the region, provisioned tables, and their metrics. 
122 | """ 123 | async with self.region_semaphore: 124 | provisioned_tables = await self.get_provisioned_tables(region) 125 | tasks = [ 126 | get_metrics_for_table(self.session, table, region, start_time, end_time, self.config) 127 | for table in provisioned_tables 128 | ] 129 | table_metrics = await asyncio.gather(*tasks) 130 | return region, provisioned_tables, table_metrics 131 | 132 | async def collect_all_metrics(self, start_time, end_time): 133 | """ 134 | Collect metrics for all provisioned DynamoDB tables across all regions. 135 | 136 | Args: 137 | start_time (datetime): The start time for metric collection. 138 | end_time (datetime): The end time for metric collection. 139 | 140 | Returns: 141 | tuple: A tuple containing all metrics and low utilization tables. 142 | """ 143 | all_metrics = {} 144 | low_utilization_tables = {} 145 | 146 | logger.info("Fetching all AWS regions...") 147 | regions = await self.get_all_regions() 148 | logger.info(f"Found {len(regions)} regions.") 149 | 150 | logger.info("Identifying provisioned tables in each region...") 151 | total_provisioned_tables = 0 152 | async for region in tqdm_asyncio(regions, desc="Scanning regions"): 153 | provisioned_tables = await self.get_provisioned_tables(region) 154 | total_provisioned_tables += len(provisioned_tables) 155 | 156 | logger.info(f"Found {total_provisioned_tables} provisioned tables across all regions.") 157 | 158 | logger.info("Collecting metrics for provisioned tables...") 159 | region_tasks = [self.get_tables_and_metrics(region, start_time, end_time) for region in regions] 160 | 161 | with tqdm(total=total_provisioned_tables, desc="Collecting metrics") as pbar: 162 | for future in asyncio.as_completed(region_tasks): 163 | region, tables, table_metrics = await future 164 | all_metrics[region] = {} 165 | low_utilization_tables[region] = [] 166 | 167 | for table, metrics in zip(tables, table_metrics): 168 | if metrics: 169 | all_metrics[region][table] = metrics 170 | avg_read_util = self.calculate_average_utilization(metrics, "read_utilization") 171 | avg_write_util = self.calculate_average_utilization(metrics, "write_utilization") 172 | 173 | if self.is_low_utilization(avg_read_util, avg_write_util): 174 | low_utilization_tables[region].append((table, avg_read_util, avg_write_util)) 175 | pbar.update(1) 176 | 177 | return all_metrics, low_utilization_tables 178 | 179 | @staticmethod 180 | def calculate_average_utilization(metrics, utilization_type): 181 | """ 182 | Calculate the average utilization for a specific metric type. 183 | 184 | Args: 185 | metrics (list): List of metric dictionaries. 186 | utilization_type (str): The type of utilization to calculate ('read_utilization' or 'write_utilization'). 187 | 188 | Returns: 189 | float or None: The average utilization, or None if no valid metrics are found. 190 | """ 191 | valid_metrics = [m[utilization_type] for m in metrics if m[utilization_type] is not None] 192 | return sum(valid_metrics) / len(valid_metrics) if valid_metrics else None 193 | 194 | @staticmethod 195 | def is_low_utilization(read_util, write_util, threshold=0.45): 196 | """ 197 | Determine if a table has low utilization based on read and write utilization. 198 | 199 | Args: 200 | read_util (float or None): The read utilization value. 201 | write_util (float or None): The write utilization value. 202 | threshold (float): The utilization threshold for considering low utilization (default: 0.45). 203 | 204 | Returns: 205 | bool: True if the table has low utilization, False otherwise. 
206 | """ 207 | return (read_util is None or 0 <= read_util <= threshold) and (write_util is None or 0 <= write_util <= threshold) 208 | -------------------------------------------------------------------------------- /metrics-collector/metrics_collector/logger_config.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from datetime import datetime 3 | 4 | 5 | class IsoTimeFormatter(logging.Formatter): 6 | def formatTime(self, record, datefmt=None): 7 | return datetime.fromtimestamp(record.created).isoformat() 8 | 9 | 10 | def setup_logger(name=None): 11 | logger = logging.getLogger(name) 12 | logger.setLevel(logging.INFO) 13 | 14 | # Remove any existing handlers 15 | for handler in logger.handlers[:]: 16 | logger.removeHandler(handler) 17 | 18 | handler = logging.StreamHandler() 19 | formatter = IsoTimeFormatter("%(asctime)s - %(levelname)s - %(message)s") 20 | handler.setFormatter(formatter) 21 | logger.addHandler(handler) 22 | 23 | return logger 24 | -------------------------------------------------------------------------------- /metrics-collector/metrics_collector/metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | SPDX-License-Identifier: MIT-0 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 5 | software and associated documentation files (the "Software"), to deal in the Software 6 | without restriction, including without limitation the rights to use, copy, modify, 7 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so. 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | DynamoDB Metrics Retrieval Module 17 | 18 | Note: This module is designed to be used asynchronously with aioboto3. 19 | """ 20 | 21 | from simpleeval import simple_eval 22 | from metrics_collector.logger_config import setup_logger 23 | 24 | logger = setup_logger(__name__) 25 | 26 | 27 | async def get_metrics_for_table(session, table_name, region, start_time, end_time, config): 28 | """ 29 | Retrieve and process CloudWatch metrics for a specific DynamoDB table. 30 | 31 | This function fetches metrics from CloudWatch for the specified table and time range, 32 | then processes and calculates additional metrics based on the provided configuration. 33 | 34 | Args: 35 | session (aioboto3.Session): An aioboto3 session object. 36 | table_name (str): The name of the DynamoDB table. 37 | region (str): The AWS region of the table. 38 | start_time (datetime): The start time for the metric query. 39 | end_time (datetime): The end time for the metric query. 40 | config (dict): A configuration dictionary containing metric and calculation definitions. 41 | 42 | Returns: 43 | list: A list of dictionaries, each containing metric data for a specific timestamp, 44 | sorted by timestamp. 45 | 46 | Raises: 47 | Exception: If there's an error fetching or processing the metrics. 
48 | """ 49 | async with session.client("cloudwatch", region_name=region) as cloudwatch: 50 | try: 51 | metric_data_queries = [ 52 | { 53 | "Id": metric["Id"], 54 | "MetricStat": { 55 | "Metric": { 56 | "Namespace": "AWS/DynamoDB", 57 | "MetricName": metric["MetricName"], 58 | "Dimensions": [{"Name": "TableName", "Value": table_name}], 59 | }, 60 | "Period": config["period"], 61 | "Stat": metric["Stat"], 62 | }, 63 | } 64 | for metric in config["metrics"] 65 | ] 66 | 67 | all_results = [] 68 | next_token = None 69 | 70 | while True: 71 | if next_token: 72 | response = await cloudwatch.get_metric_data( 73 | MetricDataQueries=metric_data_queries, 74 | StartTime=start_time, 75 | EndTime=end_time, 76 | NextToken=next_token, 77 | ) 78 | else: 79 | response = await cloudwatch.get_metric_data( 80 | MetricDataQueries=metric_data_queries, 81 | StartTime=start_time, 82 | EndTime=end_time, 83 | ) 84 | 85 | metric_data_dict = {result["Id"]: result for result in response["MetricDataResults"]} 86 | 87 | # Find the metric with the most data points to use as a base 88 | base_metric = max(metric_data_dict.values(), key=lambda x: len(x["Timestamps"])) 89 | 90 | for i, timestamp in enumerate(base_metric["Timestamps"]): 91 | result = {"Timestamp": timestamp} 92 | for metric in config["metrics"]: 93 | metric_id = metric["Id"] 94 | result[metric_id] = ( 95 | metric_data_dict[metric_id]["Values"][i] 96 | if i < len(metric_data_dict[metric_id]["Values"]) 97 | else None 98 | ) 99 | 100 | # Perform calculations 101 | for calc in config["calculations"]: 102 | try: 103 | # Check if all required values are not None before performing calculation 104 | if all(result.get(var) is not None for var in calc.get("required_vars", [])): 105 | result[calc["id"]] = simple_eval( 106 | calc["formula"], 107 | names={**result, "period": config["period"]}, 108 | ) 109 | else: 110 | result[calc["id"]] = None 111 | except Exception as e: 112 | logger.error(f"Error in calculation {calc['id']}: {str(e)}") 113 | result[calc["id"]] = None 114 | 115 | all_results.append(result) 116 | 117 | next_token = response.get("NextToken") 118 | if not next_token: 119 | break 120 | 121 | return sorted(all_results, key=lambda x: x["Timestamp"]) 122 | 123 | except Exception as e: 124 | logger.error(f"Error fetching metrics for table {table_name} in region {region}: {str(e)}") 125 | return [] 126 | -------------------------------------------------------------------------------- /metrics-collector/metrics_collector/storage.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 3 | SPDX-License-Identifier: MIT-0 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 5 | software and associated documentation files (the "Software"), to deal in the Software 6 | without restriction, including without limitation the rights to use, copy, modify, 7 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so. 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | """ 16 | 17 | # Experimental! 18 | import json 19 | import os 20 | import numpy as np 21 | from datetime import datetime 22 | from metrics_collector.logger_config import setup_logger 23 | 24 | logger = setup_logger(__name__) 25 | 26 | 27 | class DateTimeEncoder(json.JSONEncoder): 28 | def default(self, obj): 29 | if isinstance(obj, datetime): 30 | return obj.isoformat() 31 | return super().default(obj) 32 | 33 | 34 | class MetricsStorage: 35 | def __init__(self, storage_type="disk", base_path="metrics"): 36 | self.storage_type = storage_type 37 | self.base_path = base_path 38 | self.memory_storage = {} 39 | logger.info("Initializing MetricsStorage") 40 | 41 | async def store(self, metrics): 42 | if self.storage_type == "disk": 43 | os.makedirs(self.base_path, exist_ok=True) 44 | for region, region_metrics in metrics.items(): 45 | region_path = os.path.join(self.base_path, region) 46 | os.makedirs(region_path, exist_ok=True) 47 | for table, table_metrics in region_metrics.items(): 48 | table_file = os.path.join(region_path, f"{table}.npy") 49 | self._store_table_metrics(table_metrics, table_file) 50 | elif self.storage_type == "memory": 51 | self.memory_storage = metrics 52 | 53 | def _store_table_metrics(self, table_metrics, file_path): 54 | if not table_metrics: 55 | print(f"No metrics data for {file_path}") 56 | return 57 | 58 | # Determine the metrics available in the data 59 | available_metrics = set() 60 | for entry in table_metrics: 61 | available_metrics.update(entry["Metrics"].keys()) 62 | 63 | # Create the dtype based on available metrics 64 | dtype = [("timestamp", "datetime64[ns]")] 65 | for metric in available_metrics: 66 | dtype.extend([(f"{metric}_Average", "f8"), (f"{metric}_Maximum", "f8")]) 67 | 68 | data = [] 69 | for entry in table_metrics: 70 | row = [np.datetime64(entry["Timestamp"])] 71 | for metric in available_metrics: 72 | values = entry["Metrics"].get(metric, {"Average": 0, "Maximum": 0}) 73 | row.extend([values["Average"], values["Maximum"]]) 74 | data.append(tuple(row)) 75 | 76 | try: 77 | arr = np.array(data, dtype=dtype) 78 | np.save(file_path, arr) 79 | print(f"Saved metrics to {file_path}") 80 | except Exception as e: 81 | print(f"Error saving metrics to {file_path}: {str(e)}") 82 | print(f"Data shape: {len(data)} rows, {len(dtype)} columns") 83 | print(f"First row: {data[0] if data else 'No data'}") 84 | print(f"dtype: {dtype}") 85 | 86 | async def retrieve(self, region, table): 87 | if self.storage_type == "disk": 88 | file_path = os.path.join(self.base_path, region, f"{table}.npy") 89 | if os.path.exists(file_path): 90 | return np.load(file_path) 91 | elif self.storage_type == "memory": 92 | return self.memory_storage.get(region, {}).get(table) 93 | return None 94 | -------------------------------------------------------------------------------- /metrics-collector/metrics_collector/utilization_example.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
3 | SPDX-License-Identifier: MIT-0 4 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 5 | software and associated documentation files (the "Software"), to deal in the Software 6 | without restriction, including without limitation the rights to use, copy, modify, 7 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to 8 | permit persons to whom the Software is furnished to do so. 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, 10 | INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A 11 | PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT 12 | HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION 13 | OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 14 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | DynamoDB Utilization Metrics Collector 17 | 18 | """ 19 | 20 | import asyncio 21 | import argparse 22 | import csv 23 | import json 24 | from datetime import datetime, timezone, timedelta 25 | from metrics_collector.collector import DynamoDBMetricsCollector 26 | 27 | # from metrics_collector.storage import MetricsStorage 28 | from metrics_collector.logger_config import setup_logger 29 | 30 | logger = setup_logger(__name__) 31 | 32 | 33 | def parse_iso8601(date_string): 34 | """ 35 | Parse an ISO8601 formatted date string to a datetime object with UTC timezone. 36 | 37 | Args: 38 | date_string (str): A date string in ISO8601 format. 39 | 40 | Returns: 41 | datetime: A datetime object representing the input date string, with UTC timezone. 42 | """ 43 | return datetime.fromisoformat(date_string).replace(tzinfo=timezone.utc) 44 | 45 | 46 | def write_csv(data, header, output_file): 47 | """ 48 | Write data to a CSV file with the given header. 49 | 50 | Args: 51 | data (List[List]): The data to write to the CSV file. 52 | header (List[str]): The header row for the CSV file. 53 | output_file (str): The path to the output CSV file. 54 | 55 | Returns: 56 | str: The path to the written CSV file. 57 | """ 58 | with open(output_file, "w", newline="") as csvfile: 59 | csvwriter = csv.writer(csvfile) 60 | csvwriter.writerow(header) 61 | csvwriter.writerows(data) 62 | return output_file 63 | 64 | 65 | def write_raw_metrics_csv(all_metrics, config, output_file=None): 66 | """ 67 | Generate a CSV file with raw metrics data for all tables. 68 | 69 | Args: 70 | all_metrics (Dict): A dictionary containing all collected metrics. 71 | config (Dict): The configuration dictionary containing metric definitions. 72 | output_file (str, optional): The path to the output CSV file. If None, a default name is generated. 73 | 74 | Returns: 75 | str: The path to the written CSV file. 
76 | """ 77 | if output_file is None: 78 | output_file = f"dynamodb_raw_metrics_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" 79 | 80 | header = ["Timestamp", "Region", "Table Name"] 81 | header.extend(f"{metric['MetricName']} ({metric['Stat']})" for metric in config["metrics"]) 82 | header.extend(calc["id"] for calc in config["calculations"]) 83 | 84 | data = [] 85 | for region, tables in all_metrics.items(): 86 | for table, metrics in tables.items(): 87 | for metric in metrics: 88 | row = [metric["Timestamp"], region, table] 89 | row.extend(metric.get(m["Id"], "N/A") for m in config["metrics"]) 90 | row.extend(metric.get(calc["id"], "N/A") for calc in config["calculations"]) 91 | data.append(row) 92 | 93 | return write_csv(data, header, output_file) 94 | 95 | 96 | def write_utilization_csv(low_utilization_tables, output_file=None): 97 | """ 98 | Generate a CSV file with low utilization table data. 99 | 100 | Args: 101 | low_utilization_tables (Dict): A dictionary containing tables with low utilization. 102 | output_file (str, optional): The path to the output CSV file. If None, a default name is generated. 103 | 104 | Returns: 105 | str: The path to the written CSV file. 106 | """ 107 | if output_file is None: 108 | output_file = f"dynamodb_utilization_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv" 109 | 110 | header = ["Region", "Table Name", "Read Utilization", "Write Utilization"] 111 | data = [ 112 | [region, table, f"{read_util:.2f}", f"{write_util:.2f}"] 113 | for region, tables in low_utilization_tables.items() 114 | for table, read_util, write_util in tables 115 | ] 116 | 117 | return write_csv(data, header, output_file) 118 | 119 | 120 | async def main(): 121 | """ 122 | The main function that orchestrates the metric collection and report generation process. 123 | 124 | This function: 125 | 1. Parses command-line arguments 126 | 2. Loads the configuration file 127 | 3. Initializes the DynamoDBMetricsCollector 128 | 4. Determines the time range for metric collection 129 | 5. Collects metrics for all tables 130 | 6. 
Generates CSV reports for low utilization tables and raw metrics 131 | """ 132 | parser = argparse.ArgumentParser(description="Collect DynamoDB metrics") 133 | parser.add_argument("--start-time", type=str, help="Start time in ISO8601 format") 134 | parser.add_argument("--end-time", type=str, help="End time in ISO8601 format") 135 | parser.add_argument( 136 | "--storage", 137 | choices=["disk", "memory"], 138 | default="disk", 139 | help="Storage type (default: disk)", 140 | ) 141 | parser.add_argument( 142 | "--output", 143 | type=str, 144 | help="Output CSV file name (default: dynamodb_utilization_YYYYMMDD_HHMMSS.csv)", 145 | ) 146 | parser.add_argument( 147 | "--config", 148 | type=str, 149 | default="metric_config.json", 150 | help="Path to the metric configuration JSON file", 151 | ) 152 | args = parser.parse_args() 153 | 154 | with open(args.config, "r") as config_file: 155 | config = json.load(config_file) 156 | 157 | collector = DynamoDBMetricsCollector(config) 158 | # Experimental 159 | # storage = MetricsStorage(storage_type=args.storage, base_path="metrics_data") 160 | 161 | now = datetime.now(timezone.utc) 162 | start_time = ( 163 | parse_iso8601(args.start_time) 164 | if args.start_time 165 | else (now - timedelta(days=1)).replace(hour=0, minute=0, second=0, microsecond=0) 166 | ) 167 | end_time = ( 168 | parse_iso8601(args.end_time) if args.end_time else now.replace(hour=23, minute=59, second=59, microsecond=999999) 169 | ) 170 | 171 | logger.info(f"Collecting metrics from {start_time} to {end_time}") 172 | 173 | all_metrics, low_utilization_tables = await collector.collect_all_metrics(start_time, end_time) 174 | 175 | logger.info("Metrics collected and stored successfully.") 176 | 177 | total_tables = sum(len(tables) for tables in low_utilization_tables.values()) 178 | logger.info(f"Found {total_tables} tables with utilization below 45%") 179 | 180 | csv_file = write_utilization_csv(low_utilization_tables, args.output) 181 | logger.info(f"Tables with low utilization are written to {csv_file}") 182 | 183 | raw_csv_file = write_raw_metrics_csv(all_metrics, config) 184 | logger.info(f"Raw metrics data written to {raw_csv_file}") 185 | 186 | 187 | if __name__ == "__main__": 188 | asyncio.run(main()) 189 | -------------------------------------------------------------------------------- /metrics-collector/pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "metrics-collector" 3 | version = "0.1.0" 4 | description = "" 5 | authors = [ 6 | {name = "Esteban",email = "estserna@amazon.com"} 7 | ] 8 | readme = "README.md" 9 | requires-python = ">=3.10,<4.0" 10 | dependencies = [ 11 | "aioboto3 (>=13.4.0,<14.0.0)", 12 | "asyncio (>=3.4.3,<4.0.0)", 13 | "tqdm (>=4.67.1,<5.0.0)", 14 | "simpleeval (>=1.0.3,<2.0.0)", 15 | ] 16 | 17 | 18 | [build-system] 19 | requires = ["poetry-core>=2.0.0,<3.0.0"] 20 | build-backend = "poetry.core.masonry.api" 21 | 22 | [tool.poetry.requires-plugins] 23 | poetry-plugin-export = ">=1.8" 24 | 25 | [tool.poetry] 26 | name = "metrics-collector" 27 | -------------------------------------------------------------------------------- /metrics-collector/requirements.txt: -------------------------------------------------------------------------------- 1 | aioboto3==13.4.0 2 | aiobotocore==2.18.0 3 | aiofiles==24.1.0 4 | aiohappyeyeballs==2.4.6 5 | aiohttp==3.11.12 6 | 
aioitertools==0.12.0 7 | aiosignal==1.3.2 8 | async-timeout==5.0.1 9 | asyncio==3.4.3 10 | attrs==25.1.0 11 | boto3==1.36.1 12 | botocore==1.36.1 13 | colorama==0.4.6 14 | frozenlist==1.5.0 15 | idna==3.10 16 | jmespath==1.0.1 17 | multidict==6.1.0 18 | propcache==0.2.1 19 | python-dateutil==2.9.0.post0 20 | s3transfer==0.11.2 21 | simpleeval==1.0.3 22 | six==1.17.0 23 | tqdm==4.67.1 24 | typing-extensions==4.12.2 25 | urllib3==2.3.0 26 | wrapt==1.17.2 27 | yarl==1.18.3 28 | -------------------------------------------------------------------------------- /metrics-collector/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/metrics-collector/tests/__init__.py -------------------------------------------------------------------------------- /reco/.gitignore: -------------------------------------------------------------------------------- 1 | *.zip 2 | *.gz 3 | # Created by https://www.gitignore.io/api/macos,python 4 | # Edit at https://www.gitignore.io/?templates=macos,python 5 | 6 | ### macOS ### 7 | # General 8 | .DS_Store 9 | .AppleDouble 10 | .LSOverride 11 | *.xlsx 12 | 13 | # Icon must end with two \r 14 | Icon 15 | 16 | # Thumbnails 17 | ._* 18 | 19 | # Files that might appear in the root of a volume 20 | .DocumentRevisions-V100 21 | .fseventsd 22 | .Spotlight-V100 23 | .TemporaryItems 24 | .Trashes 25 | .VolumeIcon.icns 26 | .com.apple.timemachine.donotpresent 27 | 28 | # Directories potentially created on remote AFP share 29 | .AppleDB 30 | .AppleDesktop 31 | Network Trash Folder 32 | Temporary Items 33 | .apdisk 34 | 35 | ### Python ### 36 | # Byte-compiled / optimized / DLL files 37 | __pycache__/ 38 | *.py[cod] 39 | *$py.class 40 | 41 | # C extensions 42 | *.so 43 | 44 | # Distribution / packaging 45 | .Python 46 | build/ 47 | develop-eggs/ 48 | dist/ 49 | downloads/ 50 | eggs/ 51 | .eggs/ 52 | lib/ 53 | lib64/ 54 | parts/ 55 | sdist/ 56 | var/ 57 | wheels/ 58 | pip-wheel-metadata/ 59 | share/python-wheels/ 60 | *.egg-info/ 61 | .installed.cfg 62 | *.egg 63 | MANIFEST 64 | 65 | # PyInstaller 66 | # Usually these files are written by a python script from a template 67 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 68 | *.manifest 69 | *.spec 70 | 71 | # Installer logs 72 | pip-log.txt 73 | pip-delete-this-directory.txt 74 | 75 | # Unit test / coverage reports 76 | htmlcov/ 77 | .tox/ 78 | .nox/ 79 | .coverage 80 | .coverage.* 81 | .cache 82 | nosetests.xml 83 | coverage.xml 84 | *.cover 85 | .hypothesis/ 86 | .pytest_cache/ 87 | 88 | # Translations 89 | *.mo 90 | *.pot 91 | 92 | # Django stuff: 93 | *.log 94 | local_settings.py 95 | db.sqlite3 96 | 97 | # Flask stuff: 98 | instance/ 99 | .webassets-cache 100 | 101 | # Scrapy stuff: 102 | .scrapy 103 | 104 | # Sphinx documentation 105 | docs/_build/ 106 | 107 | # PyBuilder 108 | target/ 109 | 110 | # Jupyter Notebook 111 | .ipynb_checkpoints 112 | 113 | # IPython 114 | profile_default/ 115 | ipython_config.py 116 | 117 | # pyenv 118 | .python-version 119 | 120 | # pipenv 121 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 122 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 123 | # having no cross-platform support, pipenv may install dependencies that don’t work, or not 124 | # install all needed dependencies. 
125 | #Pipfile.lock 126 | 127 | # celery beat schedule file 128 | celerybeat-schedule 129 | 130 | # SageMath parsed files 131 | *.sage.py 132 | 133 | # Environments 134 | .env 135 | .venv 136 | env/ 137 | venv/ 138 | ENV/ 139 | env.bak/ 140 | venv.bak/ 141 | 142 | # Spyder project settings 143 | .spyderproject 144 | .spyproject 145 | 146 | # Rope project settings 147 | .ropeproject 148 | 149 | # mkdocs documentation 150 | /site 151 | 152 | # mypy 153 | .mypy_cache/ 154 | .dmypy.json 155 | dmypy.json 156 | 157 | # Pyre type checker 158 | .pyre/ 159 | 160 | 161 | # End of https://www.gitignore.io/api/macos,python 162 | -------------------------------------------------------------------------------- /reco/CHANGELOG.md: -------------------------------------------------------------------------------- 1 | ### 2024-11-20 - Version 1.1.4 - switch180 2 | * Updated pricing to reflect new rates for GT rWCU [even though we don't use this pricing data](https://aws.amazon.com/blogs/database/new-amazon-dynamodb-lowers-pricing-for-on-demand-throughput-and-global-tables/) 3 | 4 | ### 2024-07-25 - Version 1.1.3 - switch180 5 | * Added pricing for TLV and DXB 6 | * Updated pricing for BOM, CPT, BAH 7 | 8 | ### 2024-03-20 - Version 1.1.2 - switch180 9 | * Removed CMH 3yr RI because it does not appear on [public pricing page](https://aws.amazon.com/dynamodb/pricing/provisioned/). 10 | * Fixed off-by-one error in day calculation 11 | 12 | ### 2023-02-24 - Version 1.1.1 - switch180 13 | * Added three regions' 3-year RI rates to the configuration. 14 | 15 | ### 2022-12-21 - Version 1.1.0 - switch180 16 | * BUG: Fix bug that includes S-IA capacity units in reservation recommendation. This created potentially incorrect recommendations in regions where customers have tables in the Standard Infrequent Access table class 17 | 18 | ### 2022-10-01 - Version 1.0.0 - switch180 19 | * Initial release on GitHub 20 | -------------------------------------------------------------------------------- /reco/requirements.txt: -------------------------------------------------------------------------------- 1 | numpy==1.23.1 2 | pytest==7.1.2 3 | -------------------------------------------------------------------------------- /reco/src/ddb_rc_reco/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/reco/src/ddb_rc_reco/__init__.py -------------------------------------------------------------------------------- /reco/src/ddbr.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import csv 4 | import argparse 5 | import logging 6 | from zipfile import ZipFile 7 | from tempfile import NamedTemporaryFile 8 | from datetime import timedelta 9 | 10 | from ddb_rc_reco.reco import generate_hours_for_regions, make_a_wish 11 | from ddb_rc_reco.reco import refresh_csv_index, process_csv, get_range_time, dt_format 12 | from ddb_rc_reco.reco import parse_dt, print_dt, open_file_read, region_list 13 | from ddb_rc_reco.reco import generate_reco_tables, output_table, output_csv 14 | from ddb_rc_reco.config import ideal_sql, version 15 | ''' 16 | 17 | DDBR CLI for DynamoDB RC recommendations 18 | 19 | ''' 20 | 21 | logging.getLogger().setLevel(20) 22 | logger = logging.getLogger('ddbr') 23 | log = logging.StreamHandler() 24 | logger.addHandler(log) 25 | 26 | 27 | 28 | 29 | def main(): 30 | arg_modes = ['reco'] 31 | main_parser = 
argparse.ArgumentParser(description='Tool to simulate RC purchases') 32 | sub_parser = main_parser.add_subparsers(help=None) 33 | 34 | reco_parser = sub_parser.add_parser(arg_modes[0], help='Generate recommendation using AWS Usage / CUR data.') 35 | output_options = ['plain', 'csv', 'dict', 'all'] 36 | rc_terms = ['1', '3', 'all'] 37 | file_data_type = ['cur', 'dli', 'dli_rt'] 38 | reco_parser.add_argument('--athena-sql', action='store_true', help='generate Athena SQL and exit') 39 | reco_parser.add_argument('--debug', action='store_true', help='Turn up log level') 40 | reco_parser.add_argument('--file-name', type=str, help='File name or path to file where usage data resides.') 41 | reco_parser.add_argument('--term', choices=rc_terms, help='RC term length to consider.') 42 | reco_parser.add_argument('--file-type', choices=file_data_type, help='CUR Athena query (cur) or DBR file (dli*). Detailed line items (dli). DLI with resources and tags (dli_rt)', default=file_data_type[0]) 43 | reco_parser.add_argument('--output', type=str, help='Format of text to be displayed', choices=output_options) 44 | reco_parser.add_argument('--start-time', help="Start time with leading zeroes in format --start-time \"{}\"".format(dt_format.replace('%', '%%')), type=parse_dt) 45 | reco_parser.add_argument('--end-time', help="End time with leading zeroes in format --end-time \"{}\"".format(dt_format.replace('%', '%%')), type=parse_dt) 46 | reco_parser.add_argument('--package', help="Should output be ZIP'd into a user-deliverable format. Provide the package ZIP suffix", type=str) 47 | reco_parser.add_argument('--version', action='store_true', help='Print version and exit.') 48 | # TODO reco_parser.add_argument('--region', type=str, choices=pricing.keys(), help='Airport code for region to process') 49 | 50 | args = main_parser.parse_args() 51 | if args.debug is True: 52 | logging.getLogger().setLevel(10) 53 | if args.version is True: 54 | print("reco v{}".format(version)) 55 | elif args.athena_sql is True: 56 | print(ideal_sql) 57 | elif args.file_name: 58 | csv_loc = args.file_name 59 | terms = [args.term] 60 | if any(term in terms for term in rc_terms): 61 | if rc_terms[2] in terms: 62 | terms = [rc_terms[0], rc_terms[1]] 63 | else: 64 | logger.info("Defaulting to 1 year RC term.") 65 | terms = [rc_terms[0]] # default to 1 yr 66 | start_time = None 67 | end_time = None 68 | found_regions = None 69 | region_hours = None 70 | 71 | refresh_csv_index(file_data_type.index(args.file_type)) 72 | 73 | try: 74 | with open_file_read(csv_loc) as csvfile: 75 | row_reader = csv.reader(csvfile, delimiter=',', quotechar='"') 76 | try: 77 | start_time, end_time = get_range_time(row_reader) 78 | if args.start_time: 79 | if args.start_time < start_time: 80 | raise RuntimeError("The start time of {} is not within the dataset start time of {}".format(args.start_time, start_time)) 81 | start_time = args.start_time 82 | if args.end_time: 83 | if args.end_time > end_time: 84 | raise RuntimeError("The end time of {} is not within the dataset end time of {}".format(args.end_time, end_time)) 85 | end_time = args.end_time 86 | logger.info("Recommendation will be generated between {} and {} based on source data.".format(print_dt(start_time), print_dt(end_time))) 87 | except UnboundLocalError as err: 88 | logger.error("The input data file did not match the format we expected. 
Verify the file-type, and make sure the data has CapacityUnit-Hrs usage types with a cost.") 89 | raise err 90 | days_at_target_time = (end_time + timedelta(hours=1) - start_time).days 91 | if days_at_target_time < 28 or days_at_target_time > 31: 92 | logger.warning("WARNING: The selected start and end times in file have greater or fewer days than a normal month. As a result, the 'monthly' summaries in the output will not reflect the true monthly cost.") 93 | with open_file_read(csv_loc) as csvfile: 94 | row_reader = csv.reader(csvfile, delimiter=',', quotechar='"') 95 | found_regions = region_list(row_reader) 96 | region_hours = generate_hours_for_regions(start_time, end_time, found_regions) 97 | logger.info("Recommendation will be generated for {} region(s)".format(len(region_hours))) 98 | with open_file_read(csv_loc) as csvfile: 99 | csv_iterator = csv.reader(csvfile, delimiter=',', quotechar='"') 100 | next(csv_iterator) # dump header 101 | logger.info("Loading CSV into memory. Please wait.") 102 | process_csv(csv_iterator, region_hours) 103 | logger.info("Generating recommendations.") 104 | should_package = args.package is not None 105 | ntf_package = dict() 106 | for term in terms: 107 | wish = make_a_wish(region_hours, term) 108 | reco_table = generate_reco_tables(region_hours, wish) 109 | def shunt_to_file(file_name, output_method, *input): 110 | stdout = sys.stdout 111 | if sys.platform == 'win32': 112 | ntf = NamedTemporaryFile(delete=False, mode='w') 113 | else: 114 | ntf = NamedTemporaryFile(delete=True, mode='w') 115 | sys.stdout = ntf 116 | #print("HELLO WORLD") 117 | output_method(*input) 118 | sys.stdout = stdout 119 | ntf.flush() 120 | ntf_package[file_name] = ntf 121 | if args.output in [output_options[1], output_options[3]]: 122 | output_csv(reco_table) 123 | if should_package: 124 | shunt_to_file('rc-{}-{}-year.csv'.format(args.package, term), output_csv, reco_table) 125 | if args.output in [output_options[2], output_options[3]]: 126 | print(reco_table) 127 | if should_package: 128 | #Do not output python dictionary to file 129 | pass 130 | if args.output in [output_options[0], output_options[3]]: 131 | output_table(reco_table) 132 | if should_package: 133 | shunt_to_file('rc-{}-{}-year.txt'.format(args.package, term), output_table, reco_table) 134 | logger.debug(should_package) 135 | if should_package and len(ntf_package): 136 | with ZipFile('recommendations-{}.zip'.format(args.package), 'w') as reco_zip: 137 | folder_fmt = '%Y-%m-%d' 138 | folder_prefix = "{} to {}".format(start_time.strftime(folder_fmt), end_time.strftime(folder_fmt)) 139 | for file_name, ntf in ntf_package.items(): 140 | output_file_name = os.path.join(args.package, folder_prefix, file_name) 141 | logger.debug("Writing file {} to zip at location '{}'".format(ntf.name, output_file_name)) 142 | reco_zip.write(ntf.name, arcname=output_file_name) 143 | ntf.close() 144 | if sys.platform == 'win32': 145 | os.unlink(ntf.name) 146 | 147 | except FileNotFoundError as err: 148 | logger.error("error: rec'd filename '{}' but this file name is not found in our base path '{}'.".format(args.file_name, os.getcwd())) 149 | else: 150 | raise RuntimeError('Missing valid arguments. 
Run -h for more information on options.') 151 | if __name__ == '__main__': 152 | main() 153 | -------------------------------------------------------------------------------- /reco/static/Reserved Capacity-Page-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/awslabs/amazon-dynamodb-tools/894ddf3ba87318aa06271e277e09cf436d8b2c92/reco/static/Reserved Capacity-Page-1.png -------------------------------------------------------------------------------- /reco/static/Reserved Capacity.drawio: -------------------------------------------------------------------------------- 1 | 7Vtdc9o4FP01zOw+xONv4DEmyW5n20422W7bJ0bYArw1liPLCeTX9wrJn7KBLZh0kpKZBMmWLN1z7tG9kjOwJqv1HxQlyw8kwNHA1IP1wLoamKZhjkfwh9dsRI2bVyxoGMibyor78BnLSl3WZmGA09qNjJCIhUm90idxjH1Wq0OUkqf6bXMS1Z+aoAVWKu59FKm1n8OALUXtyNHL+j9xuFjmTzZ0eWWF8ptlRbpEAXmqVFnXA2tCCWHi22o9wRE3Xm4X0e6m42oxMIpjdkiDf/Tn2/gDsR9nNvPX6J2F/7q6MF3RzSOKMjljOVq2yU1ASRYHmPdiDCwPUV+iBBBZXoDSZXEtZZR8wxMSEbptas3nOnzgyjyMorw+JjE09xYUBSGMvVHNO7xFjGEaQyWQRbeLnnMEwHaeHDemDK87LWIUdgaCYrLCjG7gFtkgB1FSs+DcUwm0k8O5rIBsjmQlkuRaFF2X9ocvEoL/Acd4Pxo4AH7KIqFsSRYkRtF1WeuVeHHLl/e8JySRQP2HGdtIGFHGCFQt2SqSV8Uz+YN2GxZcEdEFZrv41Q4AxRFi4WO9/zZryqa3JIQnF8AZbgO5JiApyaiPZauqTzQ6Msd7OhITVDraglvM58fxzvGtAP5EQwaCZ+qTT3dcNxFDA9ONwMjejMK3Bf+2hPlFG4Ub4AqsDiWKwgX3Ix+Qw+BlHneYEOTtUl5YhUEgWIPT8BnNtl1x3iR8zlsrON7AuYKaCM1w5CH/22LLsJqb888up5TyLPsvRbHKqW6PUAkk8bqwRTdH8unCrLcg83mKewHcUPC+/HzPwSYpHw+KA/j9KeWLkqnf4QS8V0GZZCwKY5DZfMXbCiwplRQQgZ8bPqwulW1RaglhVamrHRVCr1dY9J5T4pakIQtJjU1Nms0IY2TVRsgKWZ+WQP37BPl8ok8QT8h5VdebbVnaYrvkwO1hvICSxUtLlPArq/WChyMaekptLaEkyHz2zudD9BIqvtTv8cH+U7D+NOO2n1Jh+ZOsMsOGxNi2ssqMWhYZw+hrkVE1JxcYbtwa19yHjOQXLtItDJdwgzFK1uXFXJQuV+gZLGvq95aqWF7mf+MuJeph3OJZ4mIvFG8SeTg0DFehfiXq2MvugsRNdjO+sO6kdpXGZoPG/JEoTcRE5+Gaj0PIL6bXj1iosNHF7pkw7EmoWhAx56rTEhENNXes0tXtja26wo6fKgQSocaOCVgHhkr2kaHScVZWFyafYiQiER6DzFCK30K8UUnN2jzkQtfABYyalxjHxSD9Bx3WXsHP0+5c0i1V3ScRyYIbQiGv5WqoCPw9A6SO0ve+NbhFPi0t5cOewtepzyc4LyaoLCBDZzzZtd6cQH9t29BGwxq5bL0tKdXyvL0qwXZfEpxHIhUCgTYEeY7yM6vzCyWotvljCeqpPN5W04w8OLtkSxyjX0nFGZIKJEx9Gmkw67rw8mmE0ZZH/MxasDdSc3rRAteuhwuWdeY9JlUMaMaVgO+jv4WQzujQ+BPvIdk1lJ16+x53lFR4X8INAQK6+VItfOWdaU5evFrLzkVpUy3dYhqCHTi7ROU6ZF+2zXVHFiu9QansjBc2lUKzq1PLwwEhhdWPjJgNGWmGe4fueVsjU7Mtvfy4tX4Nu0HdvuVJzU4m28xTJhRvQJ86Mv8y5TQMd3icSG1aG/SnSuboVyzQ4nvOvrygw9kuKUWbym2Sud3PaWzk5VlklyY078/HVbJCjOC0HFFPOx8yTN/EsZbwj06f1zW7Hk4cGaP07/BOSxRyikOFryQ7dFOpY8tcvHShbDBVN4baTgSiRhaqUO3YnalGTuzo/Kcl3VylPsIaTC2LsZalmE45UZNOTh6eU7p63eutsbrXNDI0XWr5Wbb7h3t5dMBeJcU+KRKcw8izF9UKXw4EuJVWDdR13bu+Mds2GZ4zijWfBHgKTXDHidApSGCYml4Xm2InsXbsc0YSWOqG47FqAlKdG72szN9PUo8y9x3K82OR3yaf7n6HFmjF4Y9naSJ6OIhv/a5pPDSSjnIimhj2SBs5NZoYI1ehiWW27EGZJ+DJl2fz4eOd/TB8cL+a/268v2kctrxN8TFbzTDFHDAvA8+NU3Wiu1+Ta31Q21aXPADJETYqyM86YYdnhUmKO/Ylq2xoHgg3hMOHz+7YpDrlbut160Jjv9EaK2CfFevuQ6zC2uYrg0B9dUTV5bNi0LbV1MDAemUYNPfd22Kks2LQHWgX5rZfGQZO80Vd54W1qO2l6QYGzivDQMkVRi/sB92JQmFuV31L4ZWjYlq9oQLF8n8VxK5B+R8f1vV3ldFPD4IgFADwT8OxDWUsO5vlanXJrTMTEjb0OcRlffp0YMa61InHj8d7/EEkrYe9Ya08ARcaxZgPiGxRHEeUJOMwycPJmlIHlVHcJy1wUU/hEXvtFRddkGgBtFVtiCU0jShtYMwYuIdpN9Bh15ZV4gsuJdPfelXcSqcJxYvnQlVy7hxhv1KzOdlDJxmH+weRDJHUAFgX1UMq9PR487vkB4p1kbVdLQkuzrg59puVK7b7Z8v7CkY09tfSY7AcbZwEH0yyFw== -------------------------------------------------------------------------------- /reco/test/README.md: -------------------------------------------------------------------------------- 1 | Note: Test source files, zip and gzip, are not included in this repo and therefore the tests cannot be run locally to verify function. 
2 | -------------------------------------------------------------------------------- /reco/test/test_functional.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from unittest import TestCase 3 | from functools import lru_cache 4 | from ddb_rc_reco.reco import get_range_time, region_list, make_a_wish, generate_reco_tables, open_file_read 5 | from ddb_rc_reco.reco import get_region_for_usage_type, process_csv, generate_hours_for_regions, parse_dt, test_file_loc 6 | 7 | 8 | class test_full(TestCase): 9 | test_csv_apn1 = 'test/APN1.csv' 10 | def setUp(self): 11 | pass 12 | #TODO add functional test for GZIP 13 | def test_reco_tables(self): 14 | region_hours = _get_region_hours() 15 | wish = make_a_wish(region_hours, 1) 16 | reco_table = generate_reco_tables(region_hours, wish) 17 | stats = reco_table['NRT']['rcu']['stats'] 18 | self.assertEqual(stats['min'], 8843.0) 19 | self.assertEqual(stats['median'], 48226.0) 20 | self.assertEqual(stats['max'], 97725.0) 21 | self.assertEqual(stats['average'], 49451.81) 22 | self.assertEqual(stats['std_dev'], 15766.74) 23 | self.assertEqual(stats['sum'], 36792145.0) 24 | def test_make_a_wish(self): 25 | region_hours = _get_region_hours() 26 | wish = make_a_wish(region_hours, 1) 27 | 28 | self.assertEqual(wish['NRT']['rcu']['sim_result'], 49800) 29 | self.assertEqual(wish['CMH']['rcu']['sim_result'], 0) 30 | self.assertEqual(wish['IAD']['rcu']['sim_result'], 647700) 31 | self.assertEqual(len(wish.keys()), 16) 32 | 33 | @lru_cache(maxsize=10) 34 | def _get_region_hours(csv_loc=test_file_loc): 35 | start_time = None 36 | end_time = None 37 | found_regions = None 38 | region_hours = None 39 | import csv 40 | with open_file_read(csv_loc) as csvfile: 41 | row_reader = csv.reader(csvfile, delimiter=',', quotechar='"') 42 | start_time, end_time = get_range_time(row_reader) 43 | with open_file_read(csv_loc) as csvfile: 44 | row_reader = csv.reader(csvfile, delimiter=',', quotechar='"') 45 | found_regions = region_list(row_reader) 46 | region_hours = generate_hours_for_regions(start_time, end_time, found_regions) 47 | with open_file_read(csv_loc) as csvfile: 48 | csv_iterator = csv.reader(csvfile, delimiter=',', quotechar='"') 49 | next(csv_iterator) # dump header 50 | process_csv(csv_iterator, region_hours) 51 | return region_hours 52 | -------------------------------------------------------------------------------- /table_tagger.py: -------------------------------------------------------------------------------- 1 | #!/bin/env python 2 | 3 | import argparse 4 | import json 5 | import logging 6 | import sys 7 | 8 | from ddbtools import constants 9 | from ddbtools.table import TableUtility 10 | 11 | 12 | class DynamoDBEponymousTagger(object): 13 | """Iterate through all DynamoDB tables in a region, 14 | and tag each table with its own name if not already tagged.""" 15 | def __init__(self, args: argparse.Namespace): 16 | self.args = args 17 | self.table_utility = TableUtility(region_name=self.args.region, profile_name=self.args.profile) 18 | 19 | # Setup logging 20 | log_level = logging.INFO 21 | root_logger = logging.getLogger() 22 | root_logger.setLevel(log_level) 23 | 24 | root_handler = logging.StreamHandler(sys.stdout) 25 | root_handler.setLevel(log_level) 26 | formatter = logging.Formatter('%(asctime)s: %(message)s') 27 | root_handler.setFormatter(formatter) 28 | root_logger.addHandler(root_handler) 29 | 30 | 31 | def eponymously_tag_all_tables(self, tag_name:str, table_names:list, dry_run:bool) -> list: 32 
| """Tag all tables in the region with their own name if not already tagged.""" 33 | tagged_tables = [] 34 | 35 | for table_name in table_names: 36 | table_arn = self.table_utility.get_table_arn(table_name) 37 | table_tags = self.table_utility.get_table_tags(table_arn=table_arn) 38 | 39 | if tag_name not in table_tags: 40 | name_tag = [{'Key': tag_name, 'Value': table_name}] 41 | 42 | if not self.args.dry_run: 43 | self.table_utility.add_tags_to_table(table_arn, name_tag) 44 | logging.info(f"table_tagger: Tagged {table_arn} with Key: {tag_name}, Value: {table_name}") 45 | 46 | tag_info = {'table_arn': table_arn, 47 | 'tag_key': tag_name, 48 | 'tag_value': table_name} 49 | tagged_tables.append(tag_info) 50 | 51 | return tagged_tables 52 | 53 | 54 | def run(self): 55 | """Main program entry point""" 56 | table_names = [] 57 | 58 | try: 59 | if self.args.table_name is not None: 60 | table_names = [self.args.table_name] 61 | else: 62 | table_names = self.table_utility.get_table_names() 63 | 64 | tagged_tables = self.eponymously_tag_all_tables(self.args.tag_name, 65 | table_names, 66 | self.args.dry_run) 67 | output = json.dumps(tagged_tables, indent=2) 68 | print(output) 69 | 70 | exit(0) 71 | 72 | except Exception as e: 73 | print(f"Table tagging failed: {e}") 74 | exit(0) 75 | 76 | 77 | def main(): 78 | parser = argparse.ArgumentParser(description='Tag all DynamoDB tables in a region with their own name.') 79 | 80 | parser.add_argument( 81 | '--dry-run', required=False, action='store_true', help='output results but do not actually tag tables') 82 | 83 | parser.add_argument( 84 | '--region', required=False, type=str, default='us-east-1', help='tag tables in REGION (default: us-east-1)') 85 | 86 | parser.add_argument( 87 | '--table-name', required=False, type=str, help='tag only TABLE_NAME (defaults to all tables in region)') 88 | 89 | parser.add_argument( 90 | '--tag-name', required=False, type=str, default='table_name', help='tag table with tag TAG_NAME (default is "table_name")') 91 | 92 | parser.add_argument( 93 | '--profile', required=False, type=str, default=None, help='set a custom profile name to perform the operation under') 94 | 95 | args = parser.parse_args() 96 | calculator = DynamoDBEponymousTagger(args) 97 | calculator.run() 98 | 99 | if __name__ == "__main__": 100 | main() 101 | --------------------------------------------------------------------------------