├── .DS_Store ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── IAM_Permissions.pdf ├── LICENSE ├── README-v1.md ├── README.md ├── cloudformation-template └── redshift-node-config-comparison.yaml ├── configurations ├── .DS_Store ├── RedshiftConfigTestingLambda.py ├── RedshiftConfigTestingLambda.py.zip ├── RedshiftConfigTestingStepFunction.json ├── extract_bootstrap.sh ├── gather_comparison_stats.sql ├── parameter_group_config.json ├── performance_test_bootstrap.sh ├── populate_comparison_results.sql ├── redshift-performance-test.py └── replay_bootstrap.sh ├── images ├── .DS_Store ├── architecture.png ├── redshift-clusters.png └── statemachine.png ├── serverless-v2 ├── cfn │ └── redshift_node_config_compare_v2.yaml ├── configurations │ ├── RedshiftConfigTestingLambda.py │ ├── RedshiftConfigTestingLambda.py.zip │ ├── RedshiftConfigTestingStepFunction.json │ ├── create_external_schema.py │ ├── extract_bootstrap.sh │ ├── gather_comparison_stats.sql │ ├── gather_comparison_stats_serverless.sql │ ├── parameter_group_config.json │ ├── performance_test_bootstrap.sh │ ├── populate_comparison_results.sql │ ├── redshift-performance-test.py │ └── replay_bootstrap.sh ├── images │ ├── architecure-serverless.png │ ├── batch-cw-log-group.png │ ├── redshift-clusters-provisioned.png │ ├── redshift-clusters-serverless.png │ ├── redshift-clusters.png │ ├── statemachine-log.png │ └── statemachine.png └── user_config.json ├── test-cases ├── ddl.sql ├── parameter_group_config.json ├── source-wlm.json ├── test_queries.sql ├── user_config.json └── wlm-concurrency-scaling.json └── user_config.json /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/.DS_Store -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. 
Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional document on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to contribute on. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /IAM_Permissions.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/IAM_Permissions.pdf -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of 4 | this software and associated documentation files (the "Software"), to deal in 5 | the Software without restriction, including without limitation the rights to 6 | use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of 7 | the Software, and to permit persons to whom the Software is furnished to do so. 8 | 9 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 10 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS 11 | FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR 12 | COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER 13 | IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 14 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 15 | 16 | -------------------------------------------------------------------------------- /README-v1.md: -------------------------------------------------------------------------------- 1 | # Amazon Redshift Node Configuration Comparison utility 2 | 3 | Amazon Redshift Node Configuration Comparison utility answers a very common question on which instance type and number of nodes should we choose for your workload on Amazon Redshift. You can use this utility to find the right configuration for your cluster based on your query performance expectation for sequential or concurrently running queries. If you are already using Amazon Redshift, you may also run your past workloads using [Amazon Redshift Simple Replay utility](https://github.com/awslabs/amazon-redshift-utils/tree/master/src/SimpleReplay) to evaluate performance metrics for different Amazon Redshift configurations to meet your needs. It helps you find the best configuration for your Amazon Redshift cluster based on price performance expectation. 4 | 5 | ## Solution Overview 6 | 7 | The solution uses [AWS Step Functions](https://aws.amazon.com/step-functions/), [AWS Lambda](https://aws.amazon.com/lambda/) and [AWS Batch](https://aws.amazon.com/batch/) to run an end-to-end automated orchestration to find the best [Amazon Redshift](https://aws.amazon.com/redshift/) configuration based on your price/performance requirements. [AWS CloudFormation template](https://aws.amazon.com/cloudformation/) is used to deploy and run this solution in your AWS Account. Along with other resources, this template also creates an [Amazon S3](https://aws.amazon.com/s3/) bucket to store all data and metadata related to this process. 8 | 9 | ![Architecture Diagram](images/architecture.png) 10 | 11 | You need to create a JSON file to provide your input configurations for your test: 12 | 13 | 1. Amazon Redshift Cluster Configurations 14 | 2. DDL and Load Script (Optional) 15 | 3. Redshift Snapshot Identifier (Optional) 16 | 4. SQL Script to conduct sequential and concurrency test (Optional) 17 | 5. Amazon Redshift Audit Log location and simple replay time window (Optional) 18 | 19 | You need to store this file in an existing Amazon S3 bucket and then use [this AWS CloudFormation template](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=redshift-node-config-comparison&templateURL=https://s3-us-west-2.amazonaws.com/redshift-simple-replay-ra3/cfn/redshift-node-config-comparison.yaml) to deploy this solution, which will also initiate an iteration of this test by invoking an Amazon Step Functions state machine in your AWS account. 
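For example, a minimal way to stage the input file with the AWS CLI (the bucket and key names below are illustrative, not resources created by this solution) is:

```sh
# Upload the input configuration JSON to an existing S3 bucket that you own
aws s3 cp ./user_config.json s3://node-config-compare-bucket/user_config.json

# Confirm the object exists; this S3 URI is what you reference when deploying the CloudFormation template
aws s3 ls s3://node-config-compare-bucket/user_config.json
```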
20 | 21 | ## Prerequisites 22 | 23 | This solution uses [AWS CloudFormation](https://aws.amazon.com/cloudformation/) to automatically provision all the required resources in your AWS accounts. It uses AWS Lake Formation to manage access on the AWS Glue catalog which stores the performance comparison stats. If you haven't used AWS Lakeformation before , you need to add yourself as Data Lake Administrator, please follow the instructions here on [Setting up AWS Lake Formation](https://docs.aws.amazon.com/lake-formation/latest/dg/getting-started-setup.html#create-data-lake-admin). For more information, see [Getting started with AWS CloudFormation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/GettingStarted.html). 24 | 25 | If you are already running Amazon Redshift workload in production, you may like to use this solution to replay your past workload leveraging [Amazon Redshift Simple Replay Utility](https://github.com/awslabs/amazon-redshift-utils/tree/master/src/SimpleReplay). As a prerequisite to use simple replay utility, you need to enable [audit logging](https://docs.aws.amazon.com/redshift/latest/mgmt/db-auditing.html#db-auditing-enable-logging) and [user-activity logging](https://docs.aws.amazon.com/redshift/latest/mgmt/db-auditing.html#db-auditing-user-activity-log) in your Amazon Redshift cluster. 26 | 27 | ## Example Use Case 28 | 29 | As an example, you may assume you have an existing Amazon Redshift cluster with 2 nodes of DC2.8XLarge instances. You would like to evaluate moving this cluster to RA3.4XLarge instances with two and four nodes. For that, you would like to run five test queries sequentially as well as in five parallel sessions in all these clusters. You would also like to replay one hour past workload in these clusters and compare their performance. 30 | 31 | For your RA3.4XLarge four node configuration, you would also like to test your workload performance with [concurrency scaling](https://docs.aws.amazon.com/redshift/latest/dg/concurrency-scaling.html) enabled in that cluster, which could help improve concurrent workloads with consistently fast query performance. 32 | 33 | At the end of this test, you would like to compare various metrics like total, average, median and maximum time taken for these four cluster configurations: 34 | 35 | | **node type** | **number of nodes** | **option** | 36 | | --- | --- | --- | 37 | | dc2.8xlarge | 2 | concurrency scaling disabled | 38 | | ra3.4xlarge | 2 | concurrency scaling disabled | 39 | | ra3.4xlarge | 4 | concurrency scaling disabled | 40 | | ra3.4xlarge | 4 | concurrency scaling enabled | 41 | 42 | To perform this test using [Amazon Redshift node configuration comparison utility](https://github.com/aws-samples/amazon-redshift-config-compare), you would like to provide these configurations in a [JSON file](https://github.com/aws-samples/amazon-redshift-config-compare/blob/main/user_config.json) and store it in an Amazon S3 bucket. You may then use [AWS CloudFormation Template](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=redshift-node-config-comparison&templateURL=https://s3-us-west-2.amazonaws.com/redshift-simple-replay-ra3/cfn/redshift-node-config-comparison.yaml) to deploy this utility, which would perform the end-to-end performance testing in all above clusters in parallel and produce a price/performance evaluation summary. Based on that summary, you would be easily deciding which configuration works best for you. 
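If you want the test clusters to reuse your existing cluster's settings, one way to capture your current parameter group as JSON (a sketch; the parameter group name and bucket are illustrative) is the describe-cluster-parameters command referenced in the table below:

```sh
# Export the source cluster's parameter group to JSON
aws redshift describe-cluster-parameters \
    --parameter-group-name your-custom-param-group \
    --output json > pg_config.json

# Stage it in the same S3 bucket that holds the rest of your test configuration
aws s3 cp ./pg_config.json s3://node-config-compare-bucket/pg_config.json
```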
43 | 44 | ## Input JSON File 45 | 46 | You need to provide a configuration JSON file to use this solution. Below are the input parameters for this JSON file. 47 | 48 | | **JSON Parameter** | **Valid Values** | **Description** | 49 | | --- | --- | --- | 50 | | SNAPSHOT\_ID | N/A, Redshift Snapshot Identifier | Input Snapshot Identifier, if you would like to create new redshift clusters by restoring from a snapshot. If you are using this solution in a different AWS account, please make sure to share yur Amazon Redshift cluster snapshot with this account. Please read the [documentation](https://aws.amazon.com/premiumsupport/knowledge-center/account-transfer-redshift/) for more. Input N/A if not applicable | 51 | | SNAPSHOT\_ACCOUNT\_ID | N/A,AWS Account ID | AWS Account ID where above snapshot was created. Input N/A if not applicable | 52 | | PARAMETER\_GROUP\_CONFIG\_S3\_PATH | N/A,Amazon S3 URI | If you may use a custom parameter group for this testing, please input its S3 URI. You may get this JSON by running this command in AWS Command Line interface: "aws redshift describe-cluster-parameters --parameter-group-name your-custom-param-group --output json" Input N/A if not applicable | 53 | | DDL\_AND\_COPY\_SCRIPT\_S3\_PATH | N/A,Amazon S3 URI | If you may create tables and load data on them before performing the testing, please input its S3 URI. InputN/A if not applicable | 54 | | SQL\_SCRIPT\_S3\_PATH | N/A,Amazon S3 URI | If you may run performance testing of your queries, input S3 URI of your script consisting of all your SQL commands. These commands should be deliminated by semicolon (;). InputN/A if not applicable | 55 | | NUMBER\_OF\_PARALLEL\_SESSIONS\_LIST | N/A,Amazon S3 URI | Input comma separated numbers to denote number of parallel sessions in which you would like to run above script | 56 | | SIMPLE\_REPLAY\_LOG\_LOCATION | N/A,Amazon S3 URI | If you are already running Amazon Redshift workload and your cluster has audit logging enabled. Please input the S3 URI of your Redshift Audit Logging location. If you are using this solution in a different AWS account, please make sure to copy these logs from your source clusters' audit logging bucket to an Amazon S3 bucket in this account. | 57 | | SIMPLE\_REPLAY\_EXTRACT\_START\_TIME | N/A,Amazon S3 URI | If using simple-replay in this testing to replay your past workload, input the start time of that workload in ISO-8601 format (e.g. 2021-01-26T21:41:16+00:00) | 58 | | SIMPLE\_REPLAY\_EXTRACT\_END\_TIME | N/A, Amazon S3 URI | If using simple-replay in this testing to replay your past workload, input the end time of that workload in ISO-8601 format (e.g. 2021-01-26T21:41:16+00:00) | 59 | | SIMPLE\_REPLAY\_EXTRACT\_OVERWRITE\_S3\_PATH | N/A,Amazon S3 URI | If using simple-replay and you may like to use a custom extract.yaml file, please input its S3 URI | 60 | | SIMPLE\_REPLAY\_OVERWRITE\_S3\_PATH | N/A,Amazon S3 URI | If using simple-replay and you may like to use a custom replay.yaml file, please input its S3 URI | 61 | | AUTO\_PAUSE | true,false | Input true if you may like to automatically pause all except first Amazon Redshift clusters created for this testing | 62 | | DATABASE\_NAME | N/A,Redshift database name | Specify the primary database name of your Redshift cluster. If you’re using Simple Replay, provide the database name for which you want to replay the workload. 
Amazon Redshift automatically creates a default database named dev, which may not be your primary database| 63 | | CONFIGURATIONS | JSON Array with parameters NODE\_TYPE, NUMBER\_OF\_NODES, WLM\_CONFIG\_S3\_PATH | Input a JSON Array mentioning your Amazon Redshift cluster configurations, for which you may like to perform this testing. Below are the parameters for this: | 64 | | | | | 65 | | NODE\_TYPE | ra3.xlplus, ra3.4xlarge, ra3.16xlarge, dc2.large, dc2.8xlarge, ds2.xlarge, ds2.8xlarge | Input Amazon Redshift Cluster Node Type for which, you would like to run this testing. | 66 | | NUMBER\_OF\_NODES | a number between 1 and 128 | Input number of nodes for your Amazon Redshift Cluster | 67 | | WLM\_CONFIG\_S3\_PATH | N/A,Amazon S3 URI | If you may like to use custom workload management settings if different Amazon Redshift clusters, please provide the S3 URI for that. | 68 | 69 | Here is a sample configuration JSON file, used to implement this example use-case: 70 | 71 | ```json 72 | { 73 | "SNAPSHOT_ID": "redshift-cluster-manual-snapshot", 74 | "SNAPSHOT_ACCOUNT_ID": "123456789012", 75 | 76 | "PARAMETER_GROUP_CONFIG_S3_PATH": "s3://node-config-compare-bucket/pg_config.json", 77 | 78 | "DDL_AND_COPY_SCRIPT_S3_PATH": "s3://node-config-compare-bucket/ddl.sql", 79 | "SQL_SCRIPT_S3_PATH":"s3://node-config-compare-bucket/test_queries.sql", 80 | "NUMBER_OF_PARALLEL_SESSIONS_LIST": "1,5,10", 81 | 82 | "SIMPLE_REPLAY_LOG_LOCATION":"s3://redshift-logging-xxxxxxxx/RSLogs/", 83 | "SIMPLE_REPLAY_EXTRACT_START_TIME":"2021-08-28T11:15:00+00:00", 84 | "SIMPLE_REPLAY_EXTRACT_END_TIME":"2021-08-28T12:00:00+00:00", 85 | 86 | "SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH":"N/A", 87 | "SIMPLE_REPLAY_OVERWRITE_S3_PATH":"N/A", 88 | 89 | "AUTO_PAUSE": true, 90 | "DATABASE_NAME": "database_name", 91 | 92 | "CONFIGURATIONS": [ 93 | { 94 | "NODE_TYPE": "dc2.8xlarge", 95 | "NUMBER_OF_NODES": "2", 96 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/source-wlm.json" 97 | }, 98 | { 99 | "NODE_TYPE": "ra3.4xlarge", 100 | "NUMBER_OF_NODES": "2", 101 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/source-wlm.json" 102 | }, 103 | { 104 | "NODE_TYPE": "ra3.4xlarge", 105 | "NUMBER_OF_NODES": "4", 106 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/source-wlm.json" 107 | }, 108 | { 109 | "NODE_TYPE": "ra3.4xlarge", 110 | "NUMBER_OF_NODES": "4", 111 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/wlm-concurrency-scaling.json" 112 | } 113 | ] 114 | } 115 | ``` 116 | 117 | **Please Note:** Make sure to use same Amazon S3 bucket to store all your configurations for this testing. For example, we used Amazon S3 bucket node-config-compare-bucket to store all configuration scripts. After populating all parameters in this JSON file, please save this JSON file in the same Amazon S3 bucket in your AWS Account. 118 | 119 | ## Deployment using AWS CloudFormation 120 | 121 | Once the configuration JSON file is saved in an Amazon S3 bucket, you may use [this AWS CloudFormation template](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=redshift-node-config-comparison&templateURL=https://s3-us-west-2.amazonaws.com/redshift-simple-replay-ra3/cfn/redshift-node-config-comparison.yaml) to deploy this solution, which will also initiate an iteration of this test. This template provisions the required AWS Resources except the Amazon Redshift clusters, which gets created in the subsequent step by an AWS Step Functions state machine. 
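If you prefer the AWS CLI to the console link above, a deployment sketch looks like the following (the parameter values are illustrative and only a subset of parameters is shown; see the table below for the full list, and note that the stack creates IAM roles, so you will likely need to acknowledge IAM capabilities):

```sh
aws cloudformation create-stack \
    --stack-name redshift-node-config-comparison \
    --template-url https://s3-us-west-2.amazonaws.com/redshift-simple-replay-ra3/cfn/redshift-node-config-comparison.yaml \
    --capabilities CAPABILITY_IAM CAPABILITY_NAMED_IAM \
    --parameters \
        ParameterKey=ConfigJsonS3Path,ParameterValue=s3://node-config-compare-bucket/user_config.json \
        ParameterKey=ClusterIdentifierPrefix,ParameterValue=rs \
        ParameterKey=OnPremisesCIDR,ParameterValue=10.156.87.45/32
```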
This template requires you to provide the following parameters: 122 | 123 | | **CloudFormation Parameter** | **Valid Values** | **Description** | 124 | | --- | --- | --- | 125 | | ConfigJsonS3Path | Amazon S3 URI | Input S3 URI where you stored your JSON Configuration File from the previous step. The template would grant access on this Amazon S3 bucket to appropriate AWS resources created by this solution. | 126 | | ClusterIdentifierPrefix | Prefix of Amazon Redshift cluster identifiers | Input a valid string like rs, to be used as prefix of your Amazon Redshift cluster identifiers, created by this solution | 127 | | PreExistingS3BucketToGrantRedshiftAccess | N/A,Amazon S3 URI | If using Redshift Simple Replay, please input Redshift Audit Logging Bucket Name here so that it can grant appropriate permissions to the AWS Resources. You may also add an existing Amazon S3 bucket in same AWS Region, which can be accessed by Redshift. Input N/A if not applicable | 128 | | GrantS3ReadOnlyAccessToRedshift | Yes,No | If you’re using Simple Replay in the same AWS account as the source Amazon Redshift cluster, enter Yes for this parameter, which grants AmazonS3ReadOnlyAccess to the new Amazon Redshift clusters to replay copy statements within the account. Otherwise, enter No so you can’t replay copy statements if running on a different AWS account without manually configuring it. 129 | | SourceRedshiftClusterKMSKeyARN | N/A, AWS KMS Key ARN | [AWS Key Management Service (KMS) ](https://aws.amazon.com/kms/)Key ARN (Amazon Resource Name) if your source Redshift cluster is encrypted (available on the stack Outputs tab). You need to run extract and replay in the same account, if your source cluster is encrypted. 130 | | OnPremisesCIDR | CIDR Notation | The IP range (CIDR notation) for your existing infrastructure to access the target and replica clusters from a SQL client. If unsure, enter your corporate desktop's CIDR address. For instance, if your desktop's IP address is 10.156.87.45, enter10.156.87.45/32. 131 | | VPC | VPC ID | An existing [Amazon Virtual Private Cloud](https://aws.amazon.com/vpc/) (Amazon VPC) where you want to deploy the clusters and EC2 instances. 132 | | SubnetId | Subnet ID| An existing subnet within the VPC in which you deploy the Amazon Redshift clusters and AWS Batch compute environment. 133 | | UseAWSLakeFormationForGlueCatalog | No,Yes | Default value is No ,Select Yes if AWS Lake Formation is enabled for the account and manages access for Glue catalog 134 | 135 | 136 | ## Orchestration with AWS Step Functions State Machine 137 | 138 | This solution uses AWS Step Functions state machine to orchestrate the end-to-end workflow. The state machine performs the following steps to evaluate price performance of your Amazon Redshift workload: 139 | 140 | 1. First, it reads the configuration JSON file you provided and creates parallel steps work on different Amazon Redshift cluster configurations in parallel. 141 | 2. For each of these steps, it starts by creating new Amazon Redshift clusters based on the configurations you provided in the input JSON file. 142 | 3. If you have provided a valid SQL\_SCRIPT\_S3\_PATH parameter value in the input JSON file, it runs performance testing on each of these new Amazon Redshift clusters in parallel. It runs these iterations concurrently based on the input parameter NUMBER\_OF\_PARALLEL\_SESSIONS\_LIST 143 | 4. 
If you have provided a valid SIMPLE\_REPLAY\_LOG\_LOCATION parameter value in the input JSON file, it runs the extract and replay steps of the [Amazon Redshift Simple Replay Utility](https://github.com/awslabs/amazon-redshift-utils/tree/master/src/SimpleReplay) to replay your past workload in these clusters. It replays the workload from SIMPLE\_REPLAY\_EXTRACT\_START\_TIME until SIMPLE\_REPLAY\_EXTRACT\_END\_TIME, as specified in the input JSON file. 144 | 5. Then it [unloads](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html) the statistics of this testing from each of these clusters to an Amazon S3 bucket, which was created by the CloudFormation stack in the previous step. 145 | 6. If you have provided the value true for the parameter AUTO\_PAUSE in the input JSON file, it pauses all the Amazon Redshift clusters except the first one. 146 | 7. When the above steps are completed for all new Amazon Redshift clusters created as part of this process, it runs an [AWS Glue Crawler](https://docs.aws.amazon.com/glue/latest/dg/add-crawler.html) to create tables in the [AWS Glue Data Catalog](https://docs.aws.amazon.com/glue/latest/dg/populate-data-catalog.html) to facilitate comparing the performance of these Amazon Redshift clusters from the unloaded statistics. 147 | 8. As the last step, it unloads the comparison results to the Amazon S3 bucket for your future reference. 148 | 149 | This state machine runs automatically when the CloudFormation stack is deployed in your account. Subsequently, you may re-upload your input parameter JSON file with different parameter values and rerun this state machine from the [AWS Console](https://console.aws.amazon.com/states/home). The following diagram shows this AWS Step Functions state machine workflow: 150 | 151 | ![Step Function](images/statemachine.png) 152 | 153 | 154 | For the example use case, the Amazon Redshift clusters below were created as part of this state machine run, which automatically paused all clusters except the first one: 155 | 156 | ![Redshift Clusters](images/redshift-clusters.png) 157 | 158 | ## Performance Evaluation 159 | 160 | This solution creates an external schema redshift\_config\_comparison and three external tables comparison\_stats, cluster\_config and pricing in that schema to read the raw data created by this solution in an Amazon S3 bucket. Based on these external tables, it creates the views redshift\_config\_comparison\_results, redshift\_config\_comparison\_raw and redshift\_config\_comparison\_pricing in the public schema of your Amazon Redshift clusters to compare their price-performance metrics. 161 | 162 | **REDSHIFT\_CONFIG\_COMPARISON\_RESULTS:** 163 | 164 | This view provides the aggregated comparison summary of your Amazon Redshift clusters. The Test Type column in this view indicates whether the test replayed your past workload using the simple replay utility or ran a concurrency test that executes your queries in parallel at different concurrency levels. 165 | 166 | It provides the raw value and a percentage for metrics like total, mean, median and maximum query times, as well as percentile-75, percentile-90 and percentile-95, to show how your Amazon Redshift clusters perform compared to the worst-performing cluster for each of these test types. 
For example, below was the outcome of your example use-case: 167 | 168 | ```sql 169 | select * from public.redshift_config_comparison_results; 170 | ``` 171 | 172 | | **test type** | **cluster identifier** | **total query time seconds** | **improved total query time** | **mean query time seconds** | **improved mean query time** | **median query time seconds** | **improved median query time** | 173 | | --- | --- | --- | --- | --- | --- | --- | --- | 174 | | simple-replay | rs-dc2-8xlarge-2 | 20.1 | 75% | 1 | 76% | 0.801 | 46% | 175 | | simple-replay | rs-ra3-4xlarge-2 | 35.26 | 0% | 1.76 | 0% | 1.063 | 10% | 176 | | simple-replay | rs-ra3-4xlarge-4 | 19.58 | 80% | 0.98 | 80% | 0.681 | 72% | 177 | | simple-replay | rs-ra3-4xlarge-4-cs | 20.16 | 75% | 1.01 | 74% | 0.716 | 63% | 178 | | concurrency-1 | rs-dc2-8xlarge-2 | 3.46 | 4% | 0.69 | 4% | 0.576 | 0% | 179 | | concurrency-1 | rs-ra3-4xlarge-2 | 3.61 | 0% | 0.72 | 0% | 0.571 | 1% | 180 | | concurrency-1 | rs-ra3-4xlarge-4 | 2.67 | 35% | 0.53 | 36% | 0.445 | 29% | 181 | | concurrency-1 | rs-ra3-4xlarge-4-cs | 1.45 | 149% | 0.29 | 148% | 0.275 | 109% | 182 | | concurrency-5 | rs-dc2-8xlarge-2 | 22.74 | 102% | 0.91 | 102% | 0.615 | 135% | 183 | | concurrency-5 | rs-ra3-4xlarge-2 | 45.88 | 0% | 1.84 | 0% | 1.443 | 0% | 184 | | concurrency-5 | rs-ra3-4xlarge-4 | 22.75 | 102% | 0.91 | 102% | 0.808 | 79% | 185 | | concurrency-5 | rs-ra3-4xlarge-4-cs | 21.19 | 117% | 0.9 | 104% | 0.884 | 63% | 186 | | concurrency-10 | rs-dc2-8xlarge-2 | 127.46 | 39% | 2.55 | 38% | 1.797 | 81% | 187 | | concurrency-10 | rs-ra3-4xlarge-2 | 176.62 | 0% | 3.53 | 0% | 3.256 | 0% | 188 | | concurrency-10 | rs-ra3-4xlarge-4 | 92.18 | 92% | 1.84 | 92% | 1.661 | 96% | 189 | | concurrency-10 | rs-ra3-4xlarge-4-cs | 88.16 | 100% | 1.76 | 101% | 1.59 | 105% | 190 | 191 | 192 | Based on above results, you may observe that four nodes of RA3.4XLarge with concurrency scaling enabled was the best performing configuration in this testing. 193 | 194 | **REDSHIFT\_CONFIG\_COMPARISON\_RAW:** 195 | 196 | This view provides the query level comparison summary of your Amazon Redshift clusters. 
197 | 198 | ```sql 199 | select * from public.redshift_config_comparison_raw; 200 | ``` 201 | 202 | | **query hash** | **cluster identifier** | **exec time seconds** | **total query time seconds** | **compile time seconds** | **queue time seconds** | **cc scaling** | **userid** | **query** | 203 | | --- | --- | --- | --- | --- | --- | --- | --- | --- | 204 | | 0531f3b54885afb | rs-dc2-8xlarge-2 | 2 | 7 | 5 | 0 | 0 | 100 | 623 | 205 | | 0531f3b54885afb | rs-ra3-4xlarge-2 | 4 | 5 | 0 | 0 | 0 | 100 | 718 | 206 | | 0531f3b54885afb | rs-ra3-4xlarge-4 | 2 | 5 | 3 | 0 | 0 | 100 | 727 | 207 | | 0531f3b54885afb | rs-ra3-4xlarge-4-cs | 2 | 5 | 3 | 0 | 0 | 100 | 735 | 208 | | 10ef3990f05c9f8 | rs-dc2-8xlarge-2 | 0 | 0 | 0 | 0 | 0 | 100 | 547 | 209 | | 10ef3990f05c9f8 | rs-ra3-4xlarge-2 | 0 | 0 | 0 | 0 | 0 | 100 | 644 | 210 | | 10ef3990f05c9f8 | rs-ra3-4xlarge-4 | 0 | 0 | 0 | 0 | 0 | 100 | 659 | 211 | | 10ef3990f05c9f8 | rs-ra3-4xlarge-4-cs | 0 | 0 | 0 | 0 | 0 | 100 | 661 | 212 | | 27dcd325d97f079 | rs-dc2-8xlarge-2 | 1 | 1 | 0 | 0 | 0 | 100 | 646 | 213 | | 27dcd325d97f079 | rs-ra3-4xlarge-2 | 3 | 4 | 0 | 0 | 0 | 100 | 743 | 214 | 215 | **REDSHIFT\_CONFIG\_COMPARISON\_PRICING** 216 | 217 | This view provides the public pricing information for your Amazon Redshift cluster configurations based on data available in [AWS Price List API](https://docs.aws.amazon.com/awsaccountbilling/latest/aboutv2/using-ppslong.html) as below: 218 | 219 | ```sql 220 | select * from public.redshift_config_comparison_pricing; 221 | ``` 222 | | **node**** type **|** number of nodes **|** pricing ****options** | **your cluster yearly compute cost** | **per compute node yearly cost** | 223 | | --- | --- | --- | --- | --- | 224 | | dc2.8xlarge | 2 | On-Demand | $84,096 | $42,048 | 225 | | dc2.8xlarge | 2 | Reserved-1yr-All Upfront | $55,280 | $27,640 | 226 | | dc2.8xlarge | 2 | Reserved-1yr-No Upfront | $66,576 | $33,288 | 227 | | dc2.8xlarge | 2 | Reserved-3yr-All Upfront | $26,312 | $13,156 | 228 | | ra3.4xlarge | 2 | On-Demand | $57,114 | $28,557 | 229 | | ra3.4xlarge | 2 | Reserved-1yr-All Upfront | $37,696 | $18,848 | 230 | | ra3.4xlarge | 2 | Reserved-1yr-No Upfront | $39,980 | $19,990 | 231 | | ra3.4xlarge | 2 | Reserved-3yr-All Upfront | $21,418 | $10,709 | 232 | | ra3.4xlarge | 2 | Reserved-3yr-No Upfront | $24,844 | $12,422 | 233 | | ra3.4xlarge | 4 | On-Demand | $114,228 | $28,557 | 234 | | ra3.4xlarge | 4 | Reserved-1yr-All Upfront | $75,392 | $18,848 | 235 | | ra3.4xlarge | 4 | Reserved-1yr-No Upfront | $79,960 | $19,990 | 236 | | ra3.4xlarge | 4 | Reserved-3yr-All Upfront | $42,836 | $10,709 | 237 | | ra3.4xlarge | 4 | Reserved-3yr-No Upfront | $49,688 | $12,422 | 238 | 239 | 240 | 241 | ## Security 242 | 243 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 244 | 245 | ## License 246 | 247 | This library is licensed under the MIT-0 License. See the LICENSE file. 248 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Amazon Redshift Node Configuration Comparison utility 2 | 3 | ### ** Note: This readme walks you through the latest version of this utility which now supports Redshift Serverless to test your workload for performance.If you want to either explore different Redshift Serverless configurations or combination of Redshift Provisioned and Serverless configurations based on your workload, please follow instructions in this readme. 
If you are still using the previous version which only supports provisioned clusters, please refer to this [readme](https://github.com/aws-samples/amazon-redshift-config-compare/blob/main/README-v1.md) ** 4 | 5 | Amazon Redshift Node Configuration Comparison utility answers a very common question on which instance type and number of nodes should we choose for your workload on Amazon Redshift. You can use this utility to find the right datawarehouse configuration for your workload based on your query performance expectation for sequential or concurrently running queries. If you are already using Amazon Redshift, you may also run your past workloads using [Amazon Redshift Simple Replay utility](https://github.com/awslabs/amazon-redshift-utils/tree/master/src/SimpleReplay) to evaluate performance metrics for different Amazon Redshift configurations to meet your needs. It helps you find the best Amazon Redshift datawarehouse configuration based on your price performance expectation. 6 | 7 | ## Solution Overview 8 | 9 | The solution uses [AWS Step Functions](https://aws.amazon.com/step-functions/), [AWS Lambda](https://aws.amazon.com/lambda/) and [AWS Batch](https://aws.amazon.com/batch/) to run an end-to-end automated orchestration to find the best [Amazon Redshift](https://aws.amazon.com/redshift/) configuration based on your price/performance requirements. [AWS CloudFormation template](https://aws.amazon.com/cloudformation/) is used to deploy and run this solution in your AWS Account. Along with other resources, this template also creates an [Amazon S3](https://aws.amazon.com/s3/) bucket to store all data and metadata related to this process. 10 | 11 | ![Architecture Diagram](https://github.com/aws-samples/amazon-redshift-config-compare/blob/main/serverless-v2/images/architecure-serverless.png) 12 | 13 | You need to create a JSON file to provide your input configurations for your test: 14 | 15 | 1. Amazon Redshift provisioned clusters and Serverless workgroups configurations 16 | 2. DDL and Load Script (Optional) 17 | 3. Redshift Snapshot Identifier (Optional) 18 | 4. SQL Script to conduct sequential and concurrency test (Optional) 19 | 5. Amazon Redshift Audit Log location and simple replay time window (Optional) 20 | 21 | You need to store this file in an existing Amazon S3 bucket and then use [this AWS CloudFormation template](https://amazon-redshift-node-config-compare.s3.amazonaws.com/cfn/redshift_node_config_compare_v2.yaml) to deploy this solution, which will also initiate an iteration of this test by invoking an Amazon Step Functions state machine in your AWS account. 22 | 23 | ## Prerequisites 24 | 25 | This solution uses [AWS CloudFormation](https://aws.amazon.com/cloudformation/) to automatically provision all the required resources in your AWS accounts. It uses AWS Lake Formation to manage access on the AWS Glue catalog which stores the performance comparison stats. If you haven't used AWS Lakeformation before , you need to add yourself as Data Lake Administrator, please follow the instructions here on [Setting up AWS Lake Formation](https://docs.aws.amazon.com/lake-formation/latest/dg/getting-started-setup.html#create-data-lake-admin). For more information, see [Getting started with AWS CloudFormation](https://docs.aws.amazon.com/AWSCloudFormation/latest/UserGuide/GettingStarted.html). 
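Before deploying, you can check whether your IAM principal is already registered as a data lake administrator with a read-only call like the one below; if you are not listed, add yourself by following the console steps in the guide linked above:

```sh
# Lists the principals currently registered as Lake Formation data lake administrators
aws lakeformation get-data-lake-settings --query 'DataLakeSettings.DataLakeAdmins'
```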
26 | 27 | If you are already running Amazon Redshift workload in production, you may like to use this solution to replay your past workload leveraging [Amazon Redshift Simple Replay Utility](https://github.com/awslabs/amazon-redshift-utils/tree/master/src/SimpleReplay). As a prerequisite to use simple replay utility, you need to enable [audit logging](https://docs.aws.amazon.com/redshift/latest/mgmt/db-auditing.html#db-auditing-enable-logging) and [user-activity logging](https://docs.aws.amazon.com/redshift/latest/mgmt/db-auditing.html#db-auditing-user-activity-log) in your Amazon Redshift provisioned clusters and serverless workgroups. 28 | 29 | If you are going to replay your workload into Serverless workgroup then make sure that you must have at least three subnets, and they must span across three Availability Zones. You can review the considerations when using Amazon Redshift Serverless [here](https://docs.aws.amazon.com/redshift/latest/mgmt/serverless-known-issues.html). 30 | 31 | You need to provide at least one subnet in the same VPC (where you have Redshift provisioned clusters or serverless workgroups) which has access to the internet to download the ECR container image. 32 | 33 | ## Example Use Case 34 | 35 | As an example, you may assume you have an existing Amazon Redshift cluster with 2 nodes of DC2.8XLarge instances. You would like to evaluate moving this cluster to RA3.4XLarge instances with four nodes and Redshift Serverless with base RPU 64 and 128. You would replay one hour past workload in these Redshift datawareshouse configurations and workgroups and compare their performance. 36 | 37 | For your RA3.4XLarge four node configuration, you would also like to test your workload performance with [concurrency scaling](https://docs.aws.amazon.com/redshift/latest/dg/concurrency-scaling.html) enabled in that provisioned cluster, which could help improve concurrent workloads with consistently fast query performance. 38 | 39 | At the end of this test, you would like to compare various metrics like total, average, median and maximum time taken for these five Amazon Redshift datawarehouse configurations: 40 | 41 | | **node type** | **number of nodes/Base RPU** | **option** | 42 | | --- | --- | --- | 43 | | dc2.8xlarge | 2 | default auto wlm | 44 | | ra3.4xlarge | 4 | default auto wlm | 45 | | ra3.4xlarge | 4 | concurrency scaling enabled | 46 | | Redshift Serverless | 64 | auto scaling | 47 | | Redshift Serverless | 128 | auto scaling | 48 | 49 | To perform this test using [Amazon Redshift node configuration comparison utility](https://github.com/aws-samples/amazon-redshift-config-compare), you would like to provide these configurations in a [JSON file](https://github.com/aws-samples/amazon-redshift-config-compare/blob/main/serverless-v2/user_config.json) and store it in an Amazon S3 bucket. You may then use [AWS CloudFormation Template](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=redshift-node-config-comparison&templateURL=https://amazon-redshift-node-config-compare.s3.amazonaws.com/cfn/redshift_node_config_compare_v2.yaml) to deploy this utility, which would perform the end-to-end performance testing in all above configurations in parallel and produce a price/performance evaluation summary. Based on that summary, you would be easily deciding which configuration works best for you. 50 | 51 | ## Input JSON File 52 | 53 | You need to provide a configuration JSON file to use this solution. Below are the input parameters for this JSON file. 
54 | 55 | | **JSON Parameter** | **Valid Values** | **Description** | 56 | | --- | --- | --- | 57 | | SNAPSHOT\_ID | N/A, Redshift Snapshot Identifier | Input Snapshot Identifier, if you would like to create new Amazon Redshift provisioned clusters and Serverless workgroups by restoring from a snapshot. If you are using this solution in a different AWS account, please make sure to share your Amazon Redshift provisioned cluster snapshot with this account. Please read the [documentation](https://aws.amazon.com/premiumsupport/knowledge-center/account-transfer-redshift/) for more. Input N/A if not applicable | 58 | | SNAPSHOT\_ACCOUNT\_ID | N/A,AWS Account ID | AWS Account ID where above snapshot was created. Input N/A if not applicable | 59 | | PARAMETER\_GROUP\_CONFIG\_S3\_PATH | N/A,Amazon S3 URI | If you may use a custom parameter group for this testing, please input its S3 URI. You may get this JSON by running this command in AWS Command Line interface: "aws redshift describe-cluster-parameters --parameter-group-name your-custom-param-group --output json" Input N/A if not applicable | 60 | | DDL\_AND\_COPY\_SCRIPT\_S3\_PATH | N/A,Amazon S3 URI | If you may create tables and load data on them before performing the testing, please input its S3 URI. InputN/A if not applicable | 61 | | SQL\_SCRIPT\_S3\_PATH | N/A,Amazon S3 URI | If you may run performance testing of your queries, input S3 URI of your script consisting of all your SQL commands. These commands should be deliminated by semicolon (;). InputN/A if not applicable | 62 | | NUMBER\_OF\_PARALLEL\_SESSIONS\_LIST | N/A | Input comma separated numbers to denote number of parallel sessions in which you would like to run above script | 63 | | SIMPLE\_REPLAY\_LOG\_LOCATION | N/A,Amazon S3 URI | If you are already running Amazon Redshift workload and your provisioned cluster has audit logging enabled. Please input the S3 URI of your Redshift Audit Logging location. If you are using this solution in a different AWS account, please make sure to copy these logs from your source clusters' audit logging bucket to an Amazon S3 bucket in this account. | 64 | | SIMPLE\_REPLAY\_EXTRACT\_START\_TIME | N/A,Amazon S3 URI | If using simple-replay in this testing to replay your past workload, input the start time of that workload in ISO-8601 format (e.g. 2021-01-26T21:41:16+00:00) | 65 | | SIMPLE\_REPLAY\_EXTRACT\_END\_TIME | N/A, Amazon S3 URI | If using simple-replay in this testing to replay your past workload, input the end time of that workload in ISO-8601 format (e.g. 2021-01-26T21:41:16+00:00) | 66 | | SIMPLE\_REPLAY\_EXTRACT\_OVERWRITE\_S3\_PATH | N/A,Amazon S3 URI | If using simple-replay and you may like to use a custom extract.yaml file, please input its S3 URI | 67 | | SIMPLE\_REPLAY\_OVERWRITE\_S3\_PATH | N/A,Amazon S3 URI | If using simple-replay and you may like to use a custom replay.yaml file, please input its S3 URI | 68 | | AUTO\_PAUSE | true,false | Input true if you would like to automatically pause all Amazon Redshift provisioned clusters after completion of the step function | 69 | | DATABASE\_NAME | N/A,Redshift database name | Specify the primary database name of your Redshift endpoint. If you’re using Simple Replay, provide the database name for which you want to replay the workload. 
Amazon Redshift automatically creates a default database named dev, which may not be your primary database| 70 | | CONFIGURATIONS | JSON Array with parameters NODE\_TYPE, NUMBER\_OF\_NODES, WLM\_CONFIG\_S3\_PATH | Input a JSON Array mentioning your Amazon Redshift provisioned cluster and/or Serverless workgroups configurations, for which you may like to perform this testing. Below are the parameters for this: | 71 | | | | | 72 | | NODE\_TYPE | ra3.xlplus, ra3.4xlarge, ra3.16xlarge, dc2.large, dc2.8xlarge, ds2.xlarge, ds2.8xlarge | Input Amazon Redshift provisioned cluster Node Type for which, you would like to run this testing. This is applicable only for provisioned cluster. | 73 | | NUMBER\_OF\_NODES | a number between 1 and 128 | Input number of nodes for your Amazon Redshift provisioned cluster. This is applicable only for provisioned cluster. | 74 | | WLM\_CONFIG\_S3\_PATH | N/A,Amazon S3 URI | If you may like to use custom workload management settings if different Amazon Redshift provisioned clusters, please provide the S3 URI for that. This is applicable only for provisioned cluster. | 75 | | TYPE | Provisioned, Serverless | Input Redshift datawarehouse type for which you would like to replay your workload | 76 | | MAINTENANCE_TRACK | N/A, Trailing, Current | Amazon Redshift version against which you would like to replay your workload. This is applicable only for provisioned cluster. | 77 | | BASE_RPU | Base capacity setting from 32 RPUs to 512 RPUs | This setting specifies the base data warehouse capacity of your Amazon Redshift serverless workgroup. This is applicable only for Serverless workgroup. | 78 | 79 | 80 | Here is a sample configuration JSON file, used to implement this example use-case: 81 | 82 | ```json 83 | { 84 | "SNAPSHOT_ID": "redshift-cluster-manual-snapshot", 85 | "SNAPSHOT_ACCOUNT_ID": "123456789012", 86 | 87 | "PARAMETER_GROUP_CONFIG_S3_PATH": "s3://node-config-compare-bucket/pg_config.json", 88 | 89 | "DDL_AND_COPY_SCRIPT_S3_PATH": "s3://node-config-compare-bucket/ddl.sql", 90 | "SQL_SCRIPT_S3_PATH":"s3://node-config-compare-bucket/test_queries.sql", 91 | "NUMBER_OF_PARALLEL_SESSIONS_LIST": "1", 92 | 93 | "SIMPLE_REPLAY_LOG_LOCATION":"s3://redshift-logging-xxxxxxxx/RSLogs/", 94 | "SIMPLE_REPLAY_EXTRACT_START_TIME":"2021-08-28T11:15:00+00:00", 95 | "SIMPLE_REPLAY_EXTRACT_END_TIME":"2021-08-28T12:00:00+00:00", 96 | 97 | "SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH":"N/A", 98 | "SIMPLE_REPLAY_OVERWRITE_S3_PATH":"N/A", 99 | 100 | "AUTO_PAUSE": true, 101 | "DATABASE_NAME": "database_name", 102 | 103 | "CONFIGURATIONS": [ 104 | { 105 | "TYPE": "Provisioned", 106 | "NODE_TYPE": "dc2.8xlarge", 107 | "NUMBER_OF_NODES": "2", 108 | "WLM_CONFIG_S3_PATH": "N/A" 109 | }, 110 | { 111 | "TYPE": "Provisioned", 112 | "NODE_TYPE": "ra3.4xlarge", 113 | "NUMBER_OF_NODES": "4", 114 | "WLM_CONFIG_S3_PATH": "N/A" 115 | }, 116 | { 117 | "TYPE": "Provisioned", 118 | "NODE_TYPE": "ra3.4xlarge", 119 | "NUMBER_OF_NODES": "4", 120 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/wlmconfig.json" 121 | }, 122 | { 123 | "TYPE": "Serverless", 124 | "BASE_RPU": "64" 125 | }, 126 | { 127 | "TYPE": "Serverless", 128 | "BASE_RPU": "128" 129 | } 130 | ] 131 | } 132 | ``` 133 | 134 | **Please Note:** Make sure to use same Amazon S3 bucket to store all your configurations for this testing. For example, we used Amazon S3 bucket node-config-compare-bucket to store all configuration scripts. 
After populating all parameters in this JSON file, please save this JSON file in the same Amazon S3 bucket in your AWS Account. 135 | 136 | ## Deployment using AWS CloudFormation 137 | 138 | Once the configuration JSON file is saved in an Amazon S3 bucket, you may use [this AWS CloudFormation template](https://console.aws.amazon.com/cloudformation/home?#/stacks/new?stackName=redshift-node-config-comparison&templateURL=https://amazon-redshift-node-config-compare.s3.amazonaws.com/cfn/redshift_node_config_compare_v2.yaml) to deploy this solution.This template provisions the required AWS Resources except the Amazon Redshift provisioned clusters and/or Serverless workgroups, which gets created in the subsequent step by an AWS Step Functions state machine. This template requires you to provide the following parameters: 139 | 140 | | **CloudFormation Parameter** | **Valid Values** | **Description** | 141 | | --- | --- | --- | 142 | | ConfigJsonS3Path | Amazon S3 URI | Input S3 URI where you stored your JSON Configuration File from the previous step. The template would grant access on this Amazon S3 bucket to appropriate AWS resources created by this solution. | 143 | | ClusterIdentifierPrefix | Prefix of Amazon Redshift endpoints ( **only lowercase is supported** ) | Input a valid string like rs, to be used as prefix of your Amazon Redshift provisioned clusters, namespaces & workgroups | 144 | | AuditLoggingS3Bucket | N/A,Amazon S3 URI | If using Redshift Simple Replay, please input Redshift Audit Logging Bucket Name here so that it can grant appropriate permissions to the AWS Resources. You may also add an existing Amazon S3 bucket in same AWS Region, which can be accessed by Redshift. Input N/A if not applicable | 145 | | GrantS3ReadOnlyAccessToRedshift | Yes,No | If you’re using Simple Replay in the same AWS account as the source Amazon Redshift provisioned cluster, enter Yes for this parameter, which grants AmazonS3ReadOnlyAccess to the new Amazon Redshift provisioned clusters and Serverless workgroups to replay copy statements within the account. Otherwise, enter No so you can’t replay copy statements if running on a different AWS account without manually configuring it. 146 | | SourceRedshiftClusterKMSKeyARN | N/A, AWS KMS Key ARN | [AWS Key Management Service (KMS) ](https://aws.amazon.com/kms/)Key ARN (Amazon Resource Name) if your source Redshift provisioned cluster is encrypted (available on the stack Outputs tab). You need to run extract and replay in the same account, if your source provisoned cluster is encrypted. 147 | | OnPremisesCIDR | CIDR Notation | The IP range (CIDR notation) for your existing infrastructure to access the target and replica provisioned clusters and Serverless workgroups from a SQL client. If unsure, enter your corporate desktop's CIDR address. For instance, if your desktop's IP address is 10.156.87.45, enter10.156.87.45/32. 148 | | VPC | VPC ID | An existing [Amazon Virtual Private Cloud](https://aws.amazon.com/vpc/) (Amazon VPC) where you want to deploy the provisioned clusters, Serverless workgroups and EC2 instances. 149 | | RedshiftSubnetId | Subnet ID | You can provide upto 3 subnets within the same VPC to deploy the Amazon Redshift provisioned clusters and Serverless workgroups. 150 | | AWSBatchSubnetId | Subnet ID | Provide 1 existing subnet (subnet should have route to the internet) within the VPC in which you deploy AWS Batch compute environment. 
151 | | AWSECRContainerImage | N/A, Amazon Elastic Container Registry | Default value is N/A. Provide a container image if you would like to use your own private image that is already available. 152 | | UseAWSLakeFormationForGlueCatalog | No,Yes | Default value is No. Select Yes if AWS Lake Formation is enabled for the account and manages access for the Glue catalog 153 | | NotificationEmail | N/A, Email address | Default value is N/A. Provide one email address if you would like to receive Step Functions status notifications 154 | 155 | ## Orchestration with AWS Step Functions State Machine 156 | 157 | This solution uses an AWS Step Functions state machine to orchestrate the end-to-end workflow. The state machine performs the following steps to evaluate the price performance of your Amazon Redshift workload: 158 | 159 | 1. First, it validates the inputs provided in the user configuration file and checks whether audit logging is enabled. 160 | 2. If you have provided a valid SIMPLE\_REPLAY\_LOG\_LOCATION parameter value in the input JSON file, it runs the extract step of the [Amazon Redshift Simple Replay Utility](https://github.com/awslabs/amazon-redshift-utils/tree/master/src/SimpleReplay) and generates the SQL files needed to replay your past workload on the Amazon Redshift provisioned clusters and/or Serverless workgroups defined in the input JSON file. It replays the workload from SIMPLE\_REPLAY\_EXTRACT\_START\_TIME until SIMPLE\_REPLAY\_EXTRACT\_END\_TIME, as specified in the input JSON file. 161 | 3. It reads the configuration JSON file you provided and creates parallel branches that work on the different Amazon Redshift data warehouse configurations in parallel. 162 | 4. For each of these branches, it starts by creating new Amazon Redshift provisioned clusters and/or Serverless workgroups based on the configurations you provided in the input JSON file. 163 | 5. If you have provided a valid SQL\_SCRIPT\_S3\_PATH parameter value in the input JSON file, it runs performance testing on each of these new Amazon Redshift provisioned clusters and/or Serverless workgroups in parallel. It runs these iterations concurrently based on the input parameter NUMBER\_OF\_PARALLEL\_SESSIONS\_LIST. 164 | 6. It replays the workload extracted in Step 2 on each of the Redshift data warehouse configurations in parallel. 165 | 7. Then it [unloads](https://docs.aws.amazon.com/redshift/latest/dg/r_UNLOAD.html) the statistics of this testing from each of these Redshift data warehouse configurations to an Amazon S3 bucket, which was created by the CloudFormation stack in the previous step. 166 | 8. If the AUTO\_PAUSE parameter in the input JSON file is true, it pauses all the Amazon Redshift provisioned clusters. 167 | 9. When the above steps are completed for all new Amazon Redshift clusters and serverless workgroups created as part of this process, it runs an [AWS Glue Crawler](https://docs.aws.amazon.com/glue/latest/dg/add-crawler.html) to create tables in the [AWS Glue Data Catalog](https://docs.aws.amazon.com/glue/latest/dg/populate-data-catalog.html) to facilitate comparing the performance of these Amazon Redshift clusters from the unloaded statistics. 168 | 10. As the last step, it unloads the comparison results to the Amazon S3 bucket for your future reference. 169 | 170 | You need to start a new execution of the state machine after the CloudFormation stack is deployed in your account. 
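You can start it from the [AWS Console](https://console.aws.amazon.com/states/home) or with a CLI call along these lines (the state machine ARN below is a placeholder; use the ARN of the state machine created by your stack):

```sh
aws stepfunctions start-execution \
    --state-machine-arn arn:aws:states:us-east-1:123456789012:stateMachine:RedshiftConfigTestingStepFunction
```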
Subsequently, you may re-upload your input parameter JSON file to try changing different parameter values ( for e.g adding new Redshift datawarehouse configuration ) and then rerun this state machine from the [AWS Console](https://console.aws.amazon.com/states/home). Following diagram shows this AWS Step Functions State Machine workflow: 171 | 172 | ![Step Function](https://github.com/aws-samples/amazon-redshift-config-compare/blob/main/serverless-v2/images/statemachine.png) 173 | 174 | 175 | For the example use-case, below Amazon Redshift provisioned clusters and serverless workgroups got created as part of the state machine execution. 176 | 177 | ![Redshift Clusters](https://github.com/aws-samples/amazon-redshift-config-compare/blob/main/serverless-v2/images/redshift-clusters-provisioned.png) 178 | ![Redshift Clusters](https://github.com/aws-samples/amazon-redshift-config-compare/blob/main/serverless-v2/images/redshift-clusters-serverless.png) 179 | 180 | ## Performance Evaluation 181 | 182 | This solution creates an external schema redshift\_config\_comparison and creates three external tables comparison\_stats, cluster\_config and pricingin that schema to read the raw data created by this solution in an Amazon S3 bucket. Based on these external tables, it creates the views redshift\_config\_comparison\_results, redshift\_config\_comparison\_raw and redshift\_config\_comparison\_aggregrate in public schema of your Amazon Redshift clusters and serverless workgroups to compare the price-performance metrics across the different Redshift datawarehouse configurations. 183 | 184 | **REDSHIFT\_CONFIG\_COMPARISON\_RESULTS:** 185 | 186 | This view provides the aggregated comparison summary of your Amazon Redshift provisioned clusters and serverless workgroups. Test Type column in this view indicates that the test type was to replay your past workload using simple replay utility. 187 | 188 | It provides the raw value and a percentage number for metrices like total, mean, median, max query times, percentile-75, percentile-90 and percentile-95 to show how your Amazon Redshift clusters and serverless workgroups are performing against each other. 
189 | 190 | For example, below was the outcome of your example use-case: ( ** **please note - the results below are for illustration purposes only based on our internal sample workloads, your test results might vary based on your workload** ** ) 191 | 192 | ```sql 193 | SELECT test_type,cluster_identifier,total_query_time_seconds,improvement_total_query_time,pct75_query_time_seconds,pct95_query_time_seconds 194 | FROM public.redshift_config_comparison_results 195 | order by total_query_time_seconds; 196 | ``` 197 | 198 | | **test_type** | **cluster_identifier** | **total_query_time_seconds** | **improvement_total_query_time** | **pct75_query_time_seconds** | **pct95_query_time_seconds** | 199 | | --- | --- | --- | --- | --- | --- | 200 | | simple-replay | workgroup-ncc-128 | 120.51 | 237% | 0.006 | 3.313 201 | | simple-replay | ncc-ra3-4xlarge-4wlmconfig | 228.64 | 78% | 1.301 | 4.017 202 | | simple-replay | ncc-ra3-4xlarge-4 | 252.2 | 61% | 1.276 | 4.665 203 | | simple-replay | workgroup-ncc-64 | 269.71 | 51% | 1.937 | 5.677 204 | | simple-replay | ncc-dc2-8xlarge-2 | 406.2 | 0% | 1.922 | 9.295 205 | 206 | Based on above results, you may observe that Redshift serverless with 128 RPUs was the best performing configuration across all the Redshift configurations , ra3.4xlarge 4 nodes with concurreny scaling enabled was the best among the provisioned Redshift clusters. 207 | 208 | **REDSHIFT\_CONFIG\_COMPARISON\_RAW:** 209 | 210 | This view provides the query level comparison summary of your Amazon Redshift clusters and/or Serverless workgroups. 211 | 212 | ```sql 213 | select query_hash,cluster_identifier,exec_time_seconds,elasped_time_seconds,queue_time_seconds,user_id,query_id 214 | from public.redshift_config_comparison_raw; 215 | ``` 216 | 217 | | **query_hash** | **cluster_identifier** | **exec_time_seconds** | **elasped_time_seconds** | **queue_time_seconds** | **user_id** | **query_id** | 218 | | --- | --- | --- | --- | --- | --- | --- | 219 | | 0531f3b54885afb | ncc-dc2-8xlarge-2 | 5 | 6 | 0 | 100 | 623 | 220 | | 0531f3b54885afb | ncc-ra3-4xlarge-4 | 4 | 5 | 0 | 100 | 727 | 221 | | 0531f3b54885afb | ncc-ra3-4xlarge-4wlmconfig | 3 | 3 | 0 | 100 | 735 | 222 | | 0531f3b54885afb | workgroup-ncc-64 | 2 | 3 | 0 | 100 | 718 | 223 | | 0531f3b54885afb | workgroup-ncc-128 | 1 | 1 | 0 | 100 | 718 | 224 | 225 | ## Access permissions and security 226 | To deploy this solution, you need administrator access on the AWS accounts where you plan to deploy the AWS CloudFormation resources for this solution. 227 | 228 | User deploying the AWS CloudFormation stack needs full permission on these services: 229 | 230 | AWS IAM, Amazon ECR, AWS Batch, AWS Lambda, Amazon CloudWatch, AWS Glue, Amazon S3, AWS StepFunction, Amazon Redshift, AWS Secrets Manager, Amazon EC2 – SecurityGroup, AWS LakeFormation (if Selected Yes for the CloudFormation parameter UseAWSLakeFormationForGlueCatalog) 231 | 232 | The CloudFormation template provisions all the required resources using security best practices based on the principle of least privileges and hosts all resources within your account VPC. Access to the Amazon Redshift clusters is controlled with the CloudFormation template parameter OnPremisesCIDR, which you need to provide to allow on-premises users to connect to the new clusters using their SQL clients on the Amazon Redshift port. 
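If you are unsure which CIDR to supply for OnPremisesCIDR, a quick way to get your workstation's current public IP in /32 form (assuming it has outbound internet access) is:

```sh
# Prints your current public IP in CIDR notation, for example 10.156.87.45/32
echo "$(curl -s https://checkip.amazonaws.com)/32"
```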
233 | 234 | Access permissions for all the resources are controlled using AWS Identity and Access Management (IAM) roles granting appropriate permissions to Amazon Redshift, AWS Lambda, AWS Step Functions, AWS Glue, and AWS Batch. Read and write access privileges are granted to the Amazon Redshift clusters and AWS Batch jobs on the S3 bucket created by the CloudFormation template so that they can read and update data and metadata configurations from that bucket. Read and write access privileges are also granted on the S3 bucket where the user configuration JSON file is uploaded. AWS Batch requires internet access in order to pull images from the Amazon ECR public repository. AWS Lake Formation is optionally used to manage access control on the AWS Glue catalog tables created for performance evaluation, based on the UseAWSLakeFormationForGlueCatalog parameter in the CloudFormation template. 235 | 236 | You can find the list of IAM permissions used by the utility [here](https://github.com/aws-samples/amazon-redshift-config-compare/blob/main/IAM_Permissions.pdf). 237 | 238 | ## Troubleshooting 239 | 240 | AWS Batch jobs can fail with the error **CannotPullContainerError** if the subnet doesn’t have a route to the internet to pull the container image. Refer to [this](https://aws.amazon.com/premiumsupport/knowledge-center/ecs-fargate-pull-container-error/) KB article to resolve the issue. 241 | 242 | There might be some rare instances in which failures occur in the state machine running this solution. To troubleshoot, refer to its logs, along with logs from the AWS Batch jobs in [Amazon CloudWatch Logs](https://docs.aws.amazon.com/AmazonCloudWatch/latest/logs/WhatIsCloudWatchLogs.html). To view the AWS Batch logs, navigate to the [Amazon CloudWatch](https://aws.amazon.com/cloudwatch/) console and choose **Logs** in the navigation pane. Find the log group named **`<`Your CloudFormation Stack Name`>`/log** and choose the latest log streams. 243 | 244 | ![Cloudwatch Console](https://github.com/aws-samples/amazon-redshift-config-compare/blob/main/serverless-v2/images/batch-cw-log-group.png) 245 | 246 | To view the Step Functions logs, navigate to the state machine’s latest run on the Step Functions console and choose CloudWatch Logs for the failed Step Functions step. 247 | 248 | ![State Machine Console](https://github.com/aws-samples/amazon-redshift-config-compare/blob/main/serverless-v2/images/statemachine-log.png) 249 | 250 | After you fix the issue, you can restart the state machine by choosing New execution. 251 | 252 | ## Clean up 253 | 254 | Running this template in your AWS account may have some cost implications because it provisions new Amazon Redshift provisioned clusters, serverless namespaces and workgroups. Once you are done with the evaluation and you don’t plan to run this test in the future, you should delete the CloudFormation stack. It deletes all the resources it created, except the following, which need to be deleted manually: 255 | 256 | 1. Amazon Redshift clusters, serverless workgroups and namespaces 257 | 2. Amazon S3 bucket created by the CloudFormation stack 258 | 259 | ## Security 260 | 261 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 262 | 263 | ## License 264 | 265 | This library is licensed under the MIT-0 License. See the LICENSE file.
266 | -------------------------------------------------------------------------------- /configurations/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/configurations/.DS_Store -------------------------------------------------------------------------------- /configurations/RedshiftConfigTestingLambda.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | import time 3 | import traceback 4 | import botocore.exceptions as be 5 | import json 6 | import re 7 | import os 8 | 9 | def handler(event, context): 10 | print(event) 11 | action = event['Input'].get('action') 12 | 13 | user_config = get_json_config_from_s3(os.environ['USER_CONFIG_JSON_S3_PATH']) 14 | system_config = get_json_config_from_s3(os.environ['SYSTEM_CONFIG_JSON_S3_PATH']) 15 | cluster_identifier_prefix = os.environ['CLUSTER_IDENTIFIER_PREFIX'] 16 | 17 | what_if_timestamp = event['Input'].get('what_if_timestamp') 18 | cluster_identifier = event['Input'].get('cluster_identifier') 19 | sql_id = event['Input'].get('sql_id') 20 | job_id = event['Input'].get('job_id') 21 | redshift_cluster_configuration = event['Input'].get('redshift_cluster_configuration') 22 | redshift_cluster_index = event['Input'].get('redshift_cluster_index') 23 | 24 | try: 25 | 26 | client = boto3.client('redshift') 27 | if user_config.get('DATABASE_NAME') == 'N/A' or user_config.get('DATABASE_NAME') is None : 28 | database_name = system_config.get('DATABASE_NAME') 29 | print("Database name from system_config") 30 | else: 31 | database_name = user_config.get('DATABASE_NAME') 32 | print("Database name from user_config") 33 | print("Database name {}". 
format(database_name)) 34 | if action == "initiate": 35 | what_if_timestamp = time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime(time.time())) 36 | res = {'status': what_if_timestamp} 37 | elif action == "run_extract": 38 | res = { 39 | 'job_id': run_extract( 40 | what_if_timestamp=what_if_timestamp, 41 | simple_replay_log_location=user_config.get('SIMPLE_REPLAY_LOG_LOCATION'), 42 | simple_replay_extract_start_time=user_config.get('SIMPLE_REPLAY_EXTRACT_START_TIME'), 43 | simple_replay_extract_end_time=user_config.get('SIMPLE_REPLAY_EXTRACT_END_TIME'), 44 | simple_replay_extract_overwrite_s3_path=user_config.get('SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH'), 45 | bucket_name=system_config.get('S3_BUCKET_NAME'), 46 | redshift_user_name=system_config.get('MASTER_USER_NAME'), 47 | extract_prefix=system_config.get('EXTRACT_PREFIX'), 48 | script_prefix=system_config.get('SCRIPT_PREFIX'), 49 | extract_bootstrap_script=system_config.get('EXTRACT_BOOTSTRAP_SCRIPT'), 50 | job_definition=system_config.get('JOB_DEFINITION'), 51 | job_queue=system_config.get('JOB_QUEUE') 52 | )} 53 | elif action == "batch_job_status": 54 | res = {'status': batch_job_status(job_id=job_id)} 55 | elif action == "get_redshift_configurations": 56 | res = {'status': user_config.get('CONFIGURATIONS')} 57 | elif action == "get_cluster_identifier": 58 | res = {'status': get_cluster_identifier(client, user_config, redshift_cluster_configuration, 59 | cluster_identifier_prefix)} 60 | elif action == "cluster_status": 61 | res = {'status': cluster_status(client, cluster_identifier)} 62 | elif action == "create_parameter_group": 63 | res = {'status': create_parameter_group(client, cluster_identifier)} 64 | elif action == "update_parameter_group": 65 | 66 | if user_config.get('PARAMETER_GROUP_CONFIG_S3_PATH') is None or user_config.get( 67 | 'PARAMETER_GROUP_CONFIG_S3_PATH') == "N/A": 68 | parameter_group = system_config.get('PARAMETER_GROUP_CONFIG') 69 | else: 70 | parameter_group = user_config.get('PARAMETER_GROUP_CONFIG_S3_PATH') 71 | 72 | res = {'status': update_parameter_group(client, cluster_identifier, parameter_group)} 73 | 74 | elif action == "create_cluster": 75 | res = { 76 | 'status': create_cluster(client, 77 | cluster_identifier, 78 | user_config.get('SNAPSHOT_ID'), 79 | system_config.get('REDSHIFT_IAM_ROLE'), 80 | cluster_identifier, 81 | system_config.get('SUBNET_GROUP'), 82 | system_config.get('SECURITY_GROUP_ID'), 83 | user_config.get('SNAPSHOT_ACCOUNT_ID'), 84 | redshift_cluster_configuration.get('NODE_TYPE'), 85 | redshift_cluster_configuration.get('NUMBER_OF_NODES'), 86 | master_user_name=system_config.get('MASTER_USER_NAME'), 87 | database_name=database_name, 88 | secrets_manager_arn=system_config.get('SECRETS_MANAGER_ARN'), 89 | port=int(system_config.get('PORT')), 90 | publicly_accessible=(system_config.get('PUBLICLY_ACCESSIBLE')=="true") 91 | )} 92 | 93 | elif action == "classic_resize_cluster": 94 | res = {'status': classic_resize_cluster(client, cluster_identifier, 95 | redshift_cluster_configuration.get('NODE_TYPE'), 96 | redshift_cluster_configuration.get('NUMBER_OF_NODES'))} 97 | elif action == "resume_cluster": 98 | client.resume_cluster(ClusterIdentifier=cluster_identifier) 99 | res = {'status': 'initiated'} 100 | elif action == "pause_cluster": 101 | res = { 102 | 'status': pause_cluster(client=client, 103 | cluster_identifier=cluster_identifier, 104 | redshift_cluster_index=redshift_cluster_index, 105 | auto_pause=user_config.get('AUTO_PAUSE'))} 106 | elif action == "update_wlm_config": 107 | res = 
{'status': update_wlm_config(client, cluster_identifier, 108 | redshift_cluster_configuration.get('WLM_CONFIG_S3_PATH'))} 109 | 110 | ## Added to check for clusters in pending reboot after wlm change ## 111 | elif action == "check_pending_reboot_status": 112 | res = {'status': check_pending_reboot_status(client, cluster_identifier) } 113 | 114 | elif action == "run_ddl_and_copy_script": 115 | res = { 116 | 'sql_id': run_sql_script_from_s3(script_s3_path=user_config.get('DDL_AND_COPY_SCRIPT_S3_PATH'), 117 | action=action, 118 | cluster_identifier=cluster_identifier, 119 | redshift_iam_role=system_config.get('REDSHIFT_IAM_ROLE'), 120 | bucket_name=system_config.get('S3_BUCKET_NAME'), 121 | db=database_name, 122 | user=system_config.get('MASTER_USER_NAME'))} 123 | elif action == "run_redshift_performance_test": 124 | res = { 125 | 'job_id': run_redshift_performance_test( 126 | client=client, 127 | cluster_identifier=cluster_identifier, 128 | bucket_name=system_config.get('S3_BUCKET_NAME'), 129 | performance_test_bootstrap_script=system_config.get('PERFORMANCE_TEST_BOOTSTRAP_SCRIPT'), 130 | performance_test_python_script=system_config.get('PERFORMANCE_TEST_PYTHON_SCRIPT'), 131 | sql_script_s3_path=user_config.get('SQL_SCRIPT_S3_PATH'), 132 | number_of_parallel_sessions_list=user_config.get('NUMBER_OF_PARALLEL_SESSIONS_LIST'), 133 | job_definition=system_config.get('JOB_DEFINITION'), 134 | job_queue=system_config.get('JOB_QUEUE'), 135 | redshift_iam_role=system_config.get('REDSHIFT_IAM_ROLE'), 136 | redshift_user_name=system_config.get('MASTER_USER_NAME'), 137 | db=database_name, 138 | disable_result_cache=system_config.get('DISABLE_RESULT_CACHE'), 139 | default_output_limit=system_config.get('DEFAULT_OUTPUT_LIMIT'), 140 | max_number_of_queries=system_config.get('MAX_NUMBER_OF_QUERIES'), 141 | max_parallel_sessions=system_config.get('MAX_PARALLEL_SESSIONS'), 142 | query_label_prefix=system_config.get('QUERY_LABEL_PREFIX') 143 | 144 | )} 145 | elif action == "run_replay": 146 | res = { 147 | 'job_id': run_replay( 148 | client=client, 149 | what_if_timestamp=what_if_timestamp, 150 | cluster_identifier=cluster_identifier, 151 | extract_s3_path='s3://' + system_config.get('S3_BUCKET_NAME') + '/' + system_config.get( 152 | 'EXTRACT_PREFIX') + '/' + what_if_timestamp + '/', 153 | simple_replay_overwrite_s3_path=user_config.get('SIMPLE_REPLAY_OVERWRITE_S3_PATH'), 154 | simple_replay_log_location=user_config.get('SIMPLE_REPLAY_LOG_LOCATION'), 155 | bucket_name=system_config.get('S3_BUCKET_NAME'), 156 | redshift_user_name=system_config.get('MASTER_USER_NAME'), 157 | redshift_iam_role=system_config.get('REDSHIFT_IAM_ROLE'), 158 | db=database_name, 159 | extract_prefix=system_config.get('EXTRACT_PREFIX'), 160 | replay_prefix=system_config.get('REPLAY_PREFIX'), 161 | script_prefix=system_config.get('SCRIPT_PREFIX'), 162 | snapshot_account_id=user_config.get('SNAPSHOT_ACCOUNT_ID'), 163 | replay_bootstrap_script=system_config.get('REPLAY_BOOTSTRAP_SCRIPT'), 164 | job_definition=system_config.get('JOB_DEFINITION'), 165 | job_queue=system_config.get('JOB_QUEUE') 166 | )} 167 | elif action == "gather_comparison_stats": 168 | res = {'sql_id': gather_comparison_stats(script_s3_path=system_config.get('GATHER_COMPARISON_STATS_SCRIPT'), 169 | action=action, 170 | cluster_identifier=cluster_identifier, 171 | redshift_iam_role=system_config.get('REDSHIFT_IAM_ROLE'), 172 | bucket_name=system_config.get('S3_BUCKET_NAME'), 173 | db=database_name, 174 | user=system_config.get('MASTER_USER_NAME'), 175 | run_type='sync', 
176 | what_if_timestamp=what_if_timestamp, 177 | comparison_stats_s3_path=system_config.get( 178 | 'COMPARISON_STATS_S3_PATH'), 179 | external_schema_script=system_config.get('EXTERNAL_SCHEMA_SCRIPT'), 180 | query_label_prefix=system_config.get('QUERY_LABEL_PREFIX'), 181 | node_type=redshift_cluster_configuration.get('NODE_TYPE'), 182 | number_of_nodes=redshift_cluster_configuration.get('NUMBER_OF_NODES'), 183 | region=system_config.get('REGION'), 184 | cluster_config_s3_path=system_config.get('CLUSTER_CONFIG_S3_PATH')) 185 | } 186 | elif action == "populate_comparison_results": 187 | res = { 188 | 'sql_id': populate_comparison_results( 189 | script_s3_path=system_config.get('POPULATE_COMPARISON_RESULTS_SCRIPT'), 190 | action=action, 191 | cluster_identifier=cluster_identifier, 192 | redshift_iam_role=system_config.get('REDSHIFT_IAM_ROLE'), 193 | bucket_name=system_config.get('S3_BUCKET_NAME'), 194 | db=database_name, 195 | user=system_config.get('MASTER_USER_NAME'), 196 | what_if_timestamp=what_if_timestamp, 197 | raw_comparison_results_s3_path=system_config.get('RAW_COMPARISON_RESULTS_S3_PATH'), 198 | comparison_results_s3_path=system_config.get('COMPARISON_RESULTS_S3_PATH')) 199 | } 200 | 201 | elif action == "sql_status": 202 | res = {'status': sql_status(sql_id)} 203 | elif action == "run_glue_crawler": 204 | res = {'status': run_glue_crawler(system_config.get('CRAWLER_NAME'))} 205 | else: 206 | raise ValueError("Invalid Task: " + action) 207 | except Exception as e: 208 | print(e) 209 | print(traceback.format_exc()) 210 | raise 211 | print(res) 212 | return res 213 | 214 | 215 | def populate_comparison_results(script_s3_path, action, cluster_identifier, redshift_iam_role, bucket_name, db, user, 216 | what_if_timestamp, raw_comparison_results_s3_path, comparison_results_s3_path): 217 | return run_sql_script_from_s3(script_s3_path=script_s3_path, 218 | action=action, 219 | cluster_identifier=cluster_identifier, 220 | redshift_iam_role=redshift_iam_role, 221 | bucket_name=bucket_name, 222 | db=db, 223 | user=user, 224 | what_if_timestamp=what_if_timestamp, 225 | raw_comparison_results_s3_path=raw_comparison_results_s3_path, 226 | comparison_results_s3_path=comparison_results_s3_path) 227 | 228 | 229 | 230 | def s3_put(script_s3_path, object): 231 | bucket, key = script_s3_path.replace("s3://", "").split("/", 1) 232 | boto3.client('s3').put_object(Bucket=bucket, Key=key, Body=object) 233 | 234 | def gather_comparison_stats(script_s3_path, action, cluster_identifier, redshift_iam_role, bucket_name, db, 235 | user, run_type, what_if_timestamp, comparison_stats_s3_path, external_schema_script, 236 | query_label_prefix,node_type,number_of_nodes,region,cluster_config_s3_path): 237 | config=f'cluster_identifier,node_type,number_of_nodes,region\n{cluster_identifier},{node_type},{number_of_nodes},{region}' 238 | s3_put(cluster_config_s3_path+'/'+cluster_identifier+'.csv', config) 239 | try: 240 | run_sql(clusterid=cluster_identifier, 241 | db=db, 242 | user=user, 243 | script=external_schema_script, 244 | with_event=False, 245 | run_type='sync') 246 | except Exception as e: 247 | if "already exists" not in str(e): 248 | raise 249 | return run_sql_script_from_s3(script_s3_path=script_s3_path, 250 | action=action, 251 | cluster_identifier=cluster_identifier, 252 | redshift_iam_role=redshift_iam_role, 253 | bucket_name=bucket_name, 254 | db=db, 255 | user=user, 256 | run_type=run_type, 257 | what_if_timestamp=what_if_timestamp, 258 | comparison_stats_s3_path=comparison_stats_s3_path, 259 | 
query_label_prefix=query_label_prefix) 260 | 261 | 262 | def pause_cluster(client, cluster_identifier, redshift_cluster_index, auto_pause): 263 | if auto_pause and redshift_cluster_index > 0: 264 | try: 265 | client.pause_cluster(ClusterIdentifier=cluster_identifier) 266 | except be.ClientError as e: 267 | if e.response['Error']['Code'] == 'InvalidClusterState': 268 | print(e.response['Error']['Code']) 269 | else: 270 | raise 271 | return "initiated" 272 | else: 273 | return "auto_pause config is false" 274 | 275 | def cluster_status(client, clusterid): 276 | try: 277 | desc = client.describe_clusters(ClusterIdentifier=clusterid)['Clusters'][0] 278 | if isinstance(desc, dict): 279 | status = desc.get('ClusterStatus') + desc.get('ClusterAvailabilityStatus') + ( 280 | desc.get('RestoreStatus').get('Status') if desc.get('RestoreStatus') else "") 281 | else: 282 | status = 'Unavailable' 283 | except be.ClientError as e: 284 | msg = e.response['Error']['Code'] 285 | if msg == 'ClusterNotFound': 286 | status = 'nonExistent' 287 | else: 288 | raise 289 | return status 290 | 291 | 292 | 293 | def get_cluster_identifier(client, config, redshift_configurations, cluster_identifier_prefix): 294 | if redshift_configurations.get('USER_FRIENDLY_NAME_SUFFIX') is None or redshift_configurations.get( 295 | 'USER_FRIENDLY_NAME_SUFFIX') == 'N/A': 296 | if redshift_configurations.get('WLM_CONFIG_S3_PATH') is None or redshift_configurations.get( 297 | 'WLM_CONFIG_S3_PATH') == 'N/A': 298 | wlm_name = "" 299 | else: 300 | wlm_name = redshift_configurations.get('WLM_CONFIG_S3_PATH').replace("s3://", "").replace("/", "").replace( 301 | ".json", "") 302 | wlm_name = re.sub('[^A-Za-z0-9]+', '', wlm_name) 303 | cluster_suffix = redshift_configurations.get('NODE_TYPE') + "-" + redshift_configurations.get( 304 | 'NUMBER_OF_NODES') + wlm_name 305 | cluster_suffix = cluster_suffix.replace(".", "-") 306 | else: 307 | cluster_suffix = redshift_configurations.get('USER_FRIENDLY_NAME_SUFFIX') 308 | return (cluster_identifier_prefix + "-" + cluster_suffix).lower()[0:63] 309 | 310 | 311 | def update_wlm_config(client, cluster_identifier, wlm_config_s3_path): 312 | if wlm_config_s3_path is None or wlm_config_s3_path == "N/A": 313 | return "N/A" 314 | else: 315 | wlm_config = get_json_config_from_s3(wlm_config_s3_path) 316 | print("Changing {} parameter group wlm : {}".format(cluster_identifier, wlm_config)) 317 | client.modify_cluster_parameter_group( 318 | ParameterGroupName=cluster_identifier, 319 | Parameters=[ 320 | { 321 | 'ParameterName': 'wlm_json_configuration', 322 | 'ParameterValue': json.dumps(wlm_config), 323 | 'ApplyType': 'dynamic', 324 | 'IsModifiable': True 325 | }, 326 | ]) 327 | return "initiated" 328 | 329 | ## Added to check for clusters in pending reboot after wlm change ## 330 | def check_pending_reboot_status(client,cluster_identifier): 331 | try: 332 | cluster_desc = client.describe_clusters(ClusterIdentifier=cluster_identifier)['Clusters'][0] 333 | desc_paramgroup_status = cluster_desc['ClusterParameterGroups'][0]['ParameterApplyStatus'] 334 | status = cluster_desc.get('ClusterStatus') + cluster_desc.get('ClusterAvailabilityStatus') + desc_paramgroup_status 335 | if desc_paramgroup_status == 'pending-reboot': 336 | print('Cluster {} needs to be rebooted to apply the WLM config changes'.format(cluster_identifier)) 337 | client.reboot_cluster(ClusterIdentifier=cluster_identifier) 338 | except Exception as err: 339 | print(err) 340 | status = 'availableAvailablein-sync' 341 | return status 342 | 343 | def 
get_json_config_from_s3(script_s3_path): 344 | bucket, key = script_s3_path.replace("s3://", "").split("/", 1) 345 | obj = boto3.client('s3').get_object(Bucket=bucket, Key=key) 346 | return json.loads(obj['Body'].read().decode('utf-8')) 347 | 348 | 349 | def create_parameter_group(client, parameter_group_name): 350 | try: 351 | client.create_cluster_parameter_group( 352 | ParameterGroupName=parameter_group_name, 353 | ParameterGroupFamily='redshift-1.0', 354 | Description='redshift cluster parameter group' 355 | ) 356 | except be.ClientError as e: 357 | if e.response['Error']['Code'] == 'ClusterParameterGroupAlreadyExists': 358 | print(e.response['Error']['Code']) 359 | else: 360 | raise 361 | return 'initiated' 362 | 363 | 364 | def parameter_group_status(client, parameter_group_name): 365 | parameter_group = client.describe_cluster_parameters(ParameterGroupName=parameter_group_name) 366 | return parameter_group 367 | 368 | 369 | def update_parameter_group(client, parameter_group_name, parameter_group_config_s3_path): 370 | target_parameter_group = client.describe_cluster_parameters(ParameterGroupName=parameter_group_name)["Parameters"] 371 | target_parameters = {} 372 | for i in target_parameter_group: 373 | target_parameters[i['ParameterName']] = i['ParameterValue'] 374 | source_parameter_group = get_json_config_from_s3(parameter_group_config_s3_path)["Parameters"] 375 | modified_parameter_group = [] 376 | for i in source_parameter_group: 377 | source_parameter_value = i['ParameterValue'].replace(" ", "") 378 | target_parameter_value = target_parameters[i['ParameterName']].replace(" ", "") 379 | if source_parameter_value != target_parameter_value: 380 | modified_parameter_group.append(i) 381 | if modified_parameter_group: 382 | client.modify_cluster_parameter_group( 383 | ParameterGroupName=parameter_group_name, 384 | Parameters=modified_parameter_group) 385 | return "Initiated" 386 | 387 | 388 | def classic_resize_cluster(client, clusterid, node_type, number_of_nodes): 389 | client.resize_cluster(ClusterIdentifier=clusterid, NodeType=node_type, NumberOfNodes=int(number_of_nodes), 390 | ClusterType='single-node' if int(number_of_nodes) == 1 else 'multi-node', Classic=True) 391 | return "Initiated" 392 | 393 | 394 | def create_cluster(client, cluster_identifier, snapshot_id, redshift_iam_role, parameter_group_name, subnet_group, 395 | security_group_id, snapshot_account_id, node_type, number_of_nodes, master_user_name, database_name, 396 | port, publicly_accessible, secrets_manager_arn): 397 | try: 398 | if snapshot_id is None or snapshot_id == "N/A": 399 | master_user_secret = json.loads( 400 | boto3.client('secretsmanager').get_secret_value(SecretId=secrets_manager_arn).get('SecretString')) 401 | master_user_password = master_user_secret.get('password') 402 | client.create_cluster(DBName=database_name, 403 | ClusterIdentifier=cluster_identifier, 404 | ClusterType='single-node' if int(number_of_nodes) == 1 else 'multi-node', 405 | NodeType=node_type, 406 | MasterUsername=master_user_name, 407 | MasterUserPassword=master_user_password, 408 | VpcSecurityGroupIds=[security_group_id], 409 | ClusterSubnetGroupName=subnet_group, 410 | ClusterParameterGroupName=parameter_group_name, 411 | Port=port, 412 | NumberOfNodes=int(number_of_nodes), 413 | PubliclyAccessible=publicly_accessible, 414 | IamRoles=[redshift_iam_role]) 415 | else: 416 | if snapshot_account_id is None or snapshot_account_id == "N/A": 417 | snapshot_account_id = boto3.client('sts').get_caller_identity()['Account'] 418 | 419 | 
client.restore_from_cluster_snapshot(NumberOfNodes=int(number_of_nodes), 420 | NodeType=node_type, 421 | ClusterIdentifier=cluster_identifier, 422 | SnapshotIdentifier=snapshot_id, 423 | OwnerAccount=snapshot_account_id, 424 | Port=port, 425 | ClusterSubnetGroupName=subnet_group, 426 | PubliclyAccessible=publicly_accessible, 427 | ClusterParameterGroupName=parameter_group_name, 428 | VpcSecurityGroupIds=[security_group_id], 429 | IamRoles=[redshift_iam_role]) 430 | status = 'Initiated' 431 | except be.ClientError as e: 432 | msg = e.response['Error']['Code'] 433 | if msg == 'ClusterAlreadyExists': 434 | status = msg 435 | elif msg == 'InvalidParameterValue': 436 | source_node_type, source_number_of_nodes = get_source_cluster_config(client, snapshot_id) 437 | client.restore_from_cluster_snapshot(NumberOfNodes=source_number_of_nodes, 438 | NodeType=source_node_type, 439 | ClusterIdentifier=cluster_identifier, 440 | SnapshotIdentifier=snapshot_id, 441 | OwnerAccount=snapshot_account_id, 442 | Port=port, 443 | ClusterSubnetGroupName=subnet_group, 444 | PubliclyAccessible=publicly_accessible, 445 | ClusterParameterGroupName=parameter_group_name, 446 | VpcSecurityGroupIds=[security_group_id], 447 | IamRoles=[redshift_iam_role]) 448 | status = 'NeedClassicResize' 449 | else: 450 | raise 451 | return status 452 | 453 | 454 | def get_source_cluster_config(client, snapshot_id): 455 | resp = client.describe_cluster_snapshots(SnapshotIdentifier=snapshot_id) 456 | node_type = resp['Snapshots'][0]['NodeType'] 457 | number_of_nodes = resp['Snapshots'][0]['NumberOfNodes'] 458 | return (node_type, number_of_nodes) 459 | 460 | 461 | def run_sql_script_from_s3(script_s3_path, action, cluster_identifier, redshift_iam_role, bucket_name, db, 462 | user, run_type='async', result_cache='true', with_event=False, what_if_timestamp=None, 463 | comparison_stats_s3_path=None, comparison_results_s3_path=None, 464 | raw_comparison_results_s3_path=None, query_label_prefix=None): 465 | if script_s3_path is None or script_s3_path == "N/A": 466 | return "N/A" 467 | else: 468 | bucket, key = script_s3_path.replace("s3://", "").split("/", 1) 469 | obj = boto3.client('s3').get_object(Bucket=bucket, Key=key) 470 | script = obj['Body'].read().decode('utf-8') 471 | script = script.format(redshift_iam_role=redshift_iam_role, 472 | bucket_name=bucket_name, 473 | cluster_identifier=cluster_identifier, 474 | what_if_timestamp=what_if_timestamp, 475 | comparison_stats_s3_path=comparison_stats_s3_path, 476 | comparison_results_s3_path=comparison_results_s3_path, 477 | raw_comparison_results_s3_path=raw_comparison_results_s3_path, 478 | query_label_prefix=query_label_prefix) 479 | query_group_statement = "set query_group to '" + action + "';\n" 480 | result_cache_statement = "set enable_result_cache_for_session to " + result_cache + "; \n" 481 | script = query_group_statement + result_cache_statement + script 482 | sql_id = run_sql(cluster_identifier, db, user, script, with_event, run_type) 483 | return sql_id 484 | 485 | 486 | def run_sql(clusterid, db, user, script, with_event, run_type): 487 | res = boto3.client("redshift-data").execute_statement(Database=db, DbUser=user, Sql=script, 488 | ClusterIdentifier=clusterid, WithEvent=with_event) 489 | query_id = res["Id"] 490 | done = False 491 | while not done: 492 | status = sql_status(query_id) 493 | if run_type == 'async': 494 | break 495 | elif status == "FINISHED": 496 | break 497 | return query_id 498 | 499 | 500 | 501 | 502 | def sql_status(query_id): 503 | if query_id == "N/A": 
504 | return "FINISHED" 505 | res = boto3.client("redshift-data").describe_statement(Id=query_id) 506 | status = res["Status"] 507 | if status == "FAILED": 508 | print(res) 509 | raise Exception(res["Error"]) 510 | return status.strip('"') 511 | 512 | 513 | def run_redshift_performance_test(client, cluster_identifier, bucket_name, performance_test_bootstrap_script, 514 | performance_test_python_script, 515 | sql_script_s3_path, number_of_parallel_sessions_list, job_definition, job_queue, 516 | redshift_iam_role, redshift_user_name, db, 517 | disable_result_cache, default_output_limit, max_number_of_queries, 518 | max_parallel_sessions, query_label_prefix): 519 | if sql_script_s3_path is None or sql_script_s3_path == "N/A": 520 | return "N/A" 521 | else: 522 | desc = client.describe_clusters(ClusterIdentifier=cluster_identifier)['Clusters'][0] 523 | cluster_endpoint = desc.get('Endpoint').get('Address') + ":" + str(desc.get('Endpoint').get('Port')) + "/" + db 524 | response = boto3.client('batch').submit_job(jobName='AmazonRedshiftPerformanceTesting', 525 | jobQueue=job_queue, 526 | jobDefinition=job_definition, 527 | containerOverrides={ 528 | "command": ["sh", "-c", 529 | "yum install -y awscli; aws s3 cp $BOOTSTRAP_SCRIPT ./bootstrap.sh; sh ./bootstrap.sh"], 530 | "environment": [ 531 | {"name": "BOOTSTRAP_SCRIPT", 532 | "value": performance_test_bootstrap_script}, 533 | {"name": "BUCKET_NAME", "value": bucket_name}, 534 | {"name": "PYTHON_SCRIPT", 535 | "value": performance_test_python_script}, 536 | {"name": "REDSHIFT_CLUSTER_ENDPOINT", 537 | "value": cluster_endpoint}, 538 | {"name": "REDSHIFT_IAM_ROLE", "value": redshift_iam_role}, 539 | {"name": "REDSHIFT_USER_NAME", "value": redshift_user_name}, 540 | {"name": "SQL_SCRIPT_S3_PATH", "value": sql_script_s3_path}, 541 | {"name": "NUMBER_OF_PARALLEL_SESSIONS_LIST", 542 | "value": number_of_parallel_sessions_list}, 543 | {"name": "DISABLE_RESULT_CACHE", 544 | "value": disable_result_cache}, 545 | {"name": "DEFAULT_OUTPUT_LIMIT", 546 | "value": default_output_limit}, 547 | {"name": "MAX_NUMBER_OF_QUERIES", 548 | "value": max_number_of_queries}, 549 | {"name": "MAX_PARALLEL_SESSIONS", 550 | "value": max_parallel_sessions}, 551 | {"name": "QUERY_LABEL_PREFIX", "value": query_label_prefix} 552 | ] 553 | }) 554 | return response['jobId'] 555 | 556 | 557 | def get_workload_location(extract_s3_path): 558 | bucket, key = extract_s3_path.replace("s3://", "").split("/", 1) 559 | response = boto3.client('s3').list_objects_v2(Bucket=bucket, Prefix=key) 560 | if response.get('Contents'): 561 | key = response.get('Contents')[0].get('Key').split('/')[2] 562 | return extract_s3_path + key 563 | else: 564 | return None 565 | 566 | 567 | def run_extract(what_if_timestamp, simple_replay_log_location, 568 | simple_replay_extract_start_time, simple_replay_extract_end_time, 569 | simple_replay_extract_overwrite_s3_path, 570 | bucket_name, redshift_user_name, 571 | extract_prefix, script_prefix, extract_bootstrap_script, job_definition, job_queue): 572 | if simple_replay_log_location is None or simple_replay_log_location == "N/A": 573 | return "N/A" 574 | else: 575 | if simple_replay_extract_overwrite_s3_path is None: 576 | simple_replay_extract_overwrite_s3_path="N/A" 577 | 578 | response = boto3.client('batch').submit_job(jobName='AmazonRedshiftExtract', 579 | jobQueue=job_queue, 580 | jobDefinition=job_definition, 581 | containerOverrides={ 582 | "command": ["sh", "-c", 583 | "yum install -y awscli; aws s3 cp $BOOTSTRAP_SCRIPT ./bootstrap.sh; sh 
./bootstrap.sh"], 584 | "environment": [ 585 | {"name": "BOOTSTRAP_SCRIPT", 586 | "value": extract_bootstrap_script}, 587 | {"name": "BUCKET_NAME", "value": bucket_name}, 588 | {"name": "SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH", 589 | "value": simple_replay_extract_overwrite_s3_path}, 590 | {"name": "SIMPLE_REPLAY_LOG_LOCATION", 591 | "value": simple_replay_log_location}, 592 | {"name": "REDSHIFT_USER_NAME", "value": redshift_user_name}, 593 | {"name": "WHAT_IF_TIMESTAMP", "value": what_if_timestamp}, 594 | {"name": "SIMPLE_REPLAY_EXTRACT_START_TIME", 595 | "value": simple_replay_extract_start_time}, 596 | {"name": "SIMPLE_REPLAY_EXTRACT_END_TIME", 597 | "value": simple_replay_extract_end_time}, 598 | {"name": "EXTRACT_PREFIX", "value": extract_prefix}, 599 | {"name": "SCRIPT_PREFIX", "value": script_prefix} 600 | ] 601 | }) 602 | return response['jobId'] 603 | 604 | 605 | def run_replay(client, what_if_timestamp, cluster_identifier, extract_s3_path, simple_replay_log_location, 606 | simple_replay_overwrite_s3_path, bucket_name, redshift_user_name, 607 | redshift_iam_role, db, extract_prefix, replay_prefix,script_prefix, snapshot_account_id, 608 | replay_bootstrap_script, job_definition, job_queue): 609 | if simple_replay_log_location is None or simple_replay_log_location == "N/A": 610 | return "N/A" 611 | else: 612 | if simple_replay_overwrite_s3_path is None: 613 | simple_replay_overwrite_s3_path="N/A" 614 | desc = client.describe_clusters(ClusterIdentifier=cluster_identifier)['Clusters'][0] 615 | cluster_endpoint = desc.get('Endpoint').get('Address') + ":" + str(desc.get('Endpoint').get('Port')) + "/" + db 616 | workload_location = get_workload_location(extract_s3_path) 617 | response = boto3.client('batch').submit_job(jobName='AmazonRedshiftReplay', 618 | jobQueue=job_queue, 619 | jobDefinition=job_definition, 620 | containerOverrides={ 621 | "command": ["sh", "-c", 622 | "yum install -y awscli; aws s3 cp $BOOTSTRAP_SCRIPT ./bootstrap.sh; sh ./bootstrap.sh"], 623 | "environment": [ 624 | {"name": "BOOTSTRAP_SCRIPT", 625 | "value": replay_bootstrap_script}, 626 | {"name": "WHAT_IF_TIMESTAMP", "value": what_if_timestamp}, 627 | {"name": "CLUSTER_IDENTIFIER", "value": cluster_identifier}, 628 | {"name": "CLUSTER_ENDPOINT", "value": cluster_endpoint}, 629 | {"name": "WORKLOAD_LOCATION", "value": workload_location}, 630 | {"name": "SIMPLE_REPLAY_OVERWRITE_S3_PATH", 631 | "value": simple_replay_overwrite_s3_path}, 632 | {"name": "SIMPLE_REPLAY_LOG_LOCATION", 633 | "value": simple_replay_log_location}, 634 | {"name": "BUCKET_NAME", "value": bucket_name}, 635 | {"name": "REDSHIFT_USER_NAME", "value": redshift_user_name}, 636 | {"name": "REDSHIFT_IAM_ROLE", "value": redshift_iam_role}, 637 | {"name": "EXTRACT_PREFIX", "value": extract_prefix}, 638 | {"name": "REPLAY_PREFIX", "value": replay_prefix}, 639 | {"name": "SCRIPT_PREFIX", "value": script_prefix}, 640 | {"name": "SNAPSHOT_ACCOUNT_ID", "value": snapshot_account_id} 641 | ] 642 | }) 643 | 644 | return response['jobId'] 645 | 646 | 647 | def batch_job_status(job_id, extract_s3_path=None): 648 | if job_id == "N/A": 649 | return "FINISHED" 650 | else: 651 | job_stats = boto3.client('batch').describe_jobs(jobs=[job_id]).get('jobs')[0] 652 | if job_stats.get('status') == "FAILED": 653 | raise Exception('Error:' + str(job_stats)) 654 | elif job_stats.get('status') == "SUCCEEDED": 655 | return "FINISHED" 656 | else: 657 | return job_stats.get('status') 658 | 659 | def run_glue_crawler(crawler_name): 660 | try: 661 | response = 
boto3.client('glue').start_crawler(Name=crawler_name) 662 | return "initiated" 663 | except be.ClientError as e: 664 | raise Exception("run_glue_crawler: " + e.__str__()) 665 | except Exception as e: 666 | raise Exception("run_glue_crawler: " + e.__str__()) 667 | -------------------------------------------------------------------------------- /configurations/RedshiftConfigTestingLambda.py.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/configurations/RedshiftConfigTestingLambda.py.zip -------------------------------------------------------------------------------- /configurations/RedshiftConfigTestingStepFunction.json: -------------------------------------------------------------------------------- 1 | { 2 | "Comment": "Step function to run Redshift What If Analysis", 3 | "StartAt": "initiate", 4 | "States": { 5 | "initiate": { 6 | "Comment": "Invoke lambda function", 7 | "Type": "Task", 8 | "Resource": "arn:aws:states:::lambda:invoke", 9 | "TimeoutSeconds": 300, 10 | "HeartbeatSeconds": 60, 11 | "ResultPath": "$.what_if_timestamp", 12 | "ResultSelector": { 13 | "output.$": "$.Payload" 14 | }, 15 | "Parameters": { 16 | "FunctionName": "${FunctionArn}", 17 | "Payload": { 18 | "Input": { 19 | "action": "initiate" 20 | } 21 | } 22 | }, 23 | "Next": "run_extract" 24 | }, 25 | "run_extract": { 26 | "Comment": "Invoke lambda function", 27 | "Type": "Task", 28 | "Resource": "arn:aws:states:::lambda:invoke", 29 | "TimeoutSeconds": 300, 30 | "HeartbeatSeconds": 60, 31 | "ResultPath": "$.extract_job_id", 32 | "ResultSelector": { 33 | "output.$": "$.Payload" 34 | }, 35 | "Parameters": { 36 | "FunctionName": "${FunctionArn}", 37 | "Payload": { 38 | "Input": { 39 | "action": "run_extract", 40 | "what_if_timestamp.$": "$.what_if_timestamp.output.status" 41 | } 42 | } 43 | }, 44 | "Next": "get_redshift_configurations" 45 | }, 46 | "get_redshift_configurations": { 47 | "Comment": "Invoke lambda function", 48 | "Type": "Task", 49 | "Resource": "arn:aws:states:::lambda:invoke", 50 | "TimeoutSeconds": 300, 51 | "HeartbeatSeconds": 60, 52 | "ResultPath": "$.redshift_configurations", 53 | "ResultSelector": { 54 | "output.$": "$.Payload" 55 | }, 56 | "Parameters": { 57 | "FunctionName": "${FunctionArn}", 58 | "Payload": { 59 | "Input": { 60 | "action": "get_redshift_configurations" 61 | } 62 | } 63 | }, 64 | "Next": "run_redshift_config_testing" 65 | }, 66 | "run_redshift_config_testing": { 67 | "Type": "Map", 68 | "ItemsPath": "$.redshift_configurations.output.status", 69 | "ResultPath": "$.redshift_configurations.output.status", 70 | "Parameters": { 71 | "redshift_cluster_configuration.$": "$$.Map.Item.Value", 72 | "redshift_cluster_index.$": "$$.Map.Item.Index", 73 | "what_if_timestamp.$": "$.what_if_timestamp", 74 | "extract_job_id.$": "$.extract_job_id" 75 | }, 76 | "Iterator": { 77 | "StartAt": "get_cluster_identifier", 78 | "States": { 79 | "get_cluster_identifier": { 80 | "Comment": "Invoke lambda function", 81 | "Type": "Task", 82 | "Resource": "arn:aws:states:::lambda:invoke", 83 | "TimeoutSeconds": 300, 84 | "HeartbeatSeconds": 60, 85 | "ResultPath": "$.cluster_identifier", 86 | "ResultSelector": { 87 | "output.$": "$.Payload" 88 | }, 89 | "Parameters": { 90 | "FunctionName": "${FunctionArn}", 91 | "Payload": { 92 | "Input": { 93 | "action": "get_cluster_identifier", 94 | "cluster_identifier_prefix": "${ClusterIdentifierPrefix}", 95 | 
"redshift_cluster_configuration.$": "$.redshift_cluster_configuration" 96 | } 97 | } 98 | }, 99 | "Next": "cluster_status_wait" 100 | }, 101 | "cluster_status_wait": { 102 | "Comment": "Wait 2 minutes before status check", 103 | "Type": "Wait", 104 | "Seconds": 120, 105 | "Next": "check_cluster_status" 106 | }, 107 | "check_cluster_status": { 108 | "Comment": "Invoke lambda function", 109 | "Type": "Task", 110 | "Resource": "arn:aws:states:::lambda:invoke", 111 | "TimeoutSeconds": 300, 112 | "HeartbeatSeconds": 60, 113 | "ResultPath": "$.step_output", 114 | "ResultSelector": { 115 | "output.$": "$.Payload" 116 | }, 117 | "Parameters": { 118 | "FunctionName": "${FunctionArn}", 119 | "Payload": { 120 | "Input": { 121 | "action": "cluster_status", 122 | "cluster_identifier.$": "$.cluster_identifier.output.status" 123 | } 124 | } 125 | }, 126 | "Next": "cluster_status_action" 127 | }, 128 | "cluster_status_action": { 129 | "Comment": "check if previous step is complete", 130 | "Type": "Choice", 131 | "Choices": [ 132 | { 133 | "And": [ 134 | { 135 | "Variable": "$.create_cluster", 136 | "IsPresent": true 137 | }, 138 | { 139 | "Variable": "$.create_cluster.output.status", 140 | "StringEquals": "NeedClassicResize" 141 | }, 142 | { 143 | "Variable": "$.step_output.output.status", 144 | "StringEquals": "availableAvailablecompleted" 145 | } 146 | ], 147 | "Next": "classic_resize_cluster" 148 | }, 149 | { 150 | "And": [ 151 | { 152 | "Variable": "$.create_cluster", 153 | "IsPresent": true 154 | }, 155 | { 156 | "Variable": "$.create_cluster.output.status", 157 | "StringEquals": "NeedClassicResize" 158 | }, 159 | { 160 | "Variable": "$.step_output.output.status", 161 | "StringEquals": "nonExistent" 162 | } 163 | ], 164 | "Next": "cluster_status_wait" 165 | }, 166 | { 167 | "Variable": "$.step_output.output.status", 168 | "StringEquals": "availableAvailablecompleted", 169 | "Next": "update_wlm_config" 170 | }, 171 | { 172 | "Variable": "$.step_output.output.status", 173 | "StringEquals": "availableAvailable", 174 | "Next": "update_wlm_config" 175 | }, 176 | { 177 | "Variable": "$.step_output.output.status", 178 | "StringEquals": "nonExistent", 179 | "Next": "create_parameter_group" 180 | }, 181 | { 182 | "Variable": "$.step_output.output.status", 183 | "StringEquals": "pausedPaused", 184 | "Next": "resume_cluster" 185 | }, 186 | { 187 | "Variable": "$.step_output.output.status", 188 | "StringEquals": "pausedPausedcompleted", 189 | "Next": "resume_cluster" 190 | } 191 | ], 192 | "Default": "cluster_status_wait" 193 | }, 194 | "create_parameter_group": { 195 | "Comment": "sync target redshift parameter group with source cluster", 196 | "Type": "Task", 197 | "Resource": "arn:aws:states:::lambda:invoke", 198 | "TimeoutSeconds": 300, 199 | "HeartbeatSeconds": 60, 200 | "ResultPath": "$.step_output", 201 | "ResultSelector": { 202 | "output.$": "$.Payload" 203 | }, 204 | "Parameters": { 205 | "FunctionName": "${FunctionArn}", 206 | "Payload": { 207 | "Input": { 208 | "action": "create_parameter_group", 209 | "cluster_identifier.$": "$.cluster_identifier.output.status" 210 | } 211 | } 212 | }, 213 | "Next": "update_parameter_group" 214 | }, 215 | "update_parameter_group": { 216 | "Comment": "sync target redshift parameter group with source cluster", 217 | "Type": "Task", 218 | "Resource": "arn:aws:states:::lambda:invoke", 219 | "TimeoutSeconds": 300, 220 | "HeartbeatSeconds": 60, 221 | "ResultPath": "$.step_output", 222 | "ResultSelector": { 223 | "output.$": "$.Payload" 224 | }, 225 | "Parameters": { 226 | 
"FunctionName": "${FunctionArn}", 227 | "Payload": { 228 | "Input": { 229 | "action": "update_parameter_group", 230 | "cluster_identifier.$": "$.cluster_identifier.output.status" 231 | } 232 | } 233 | }, 234 | "Next": "create_cluster" 235 | }, 236 | "create_cluster": { 237 | "Comment": "Invoke lambda function", 238 | "Type": "Task", 239 | "Resource": "arn:aws:states:::lambda:invoke", 240 | "TimeoutSeconds": 300, 241 | "HeartbeatSeconds": 60, 242 | "ResultPath": "$.create_cluster", 243 | "ResultSelector": { 244 | "output.$": "$.Payload" 245 | }, 246 | "Parameters": { 247 | "FunctionName": "${FunctionArn}", 248 | "Payload": { 249 | "Input": { 250 | "action": "create_cluster", 251 | "cluster_identifier.$": "$.cluster_identifier.output.status", 252 | "redshift_cluster_configuration.$": "$.redshift_cluster_configuration" 253 | } 254 | } 255 | }, 256 | "Next": "cluster_status_wait" 257 | }, 258 | "classic_resize_cluster": { 259 | "Comment": "Invoke lambda function", 260 | "Type": "Task", 261 | "Resource": "arn:aws:states:::lambda:invoke", 262 | "TimeoutSeconds": 300, 263 | "HeartbeatSeconds": 60, 264 | "ResultPath": "$.create_cluster", 265 | "ResultSelector": { 266 | "output.$": "$.Payload" 267 | }, 268 | "Parameters": { 269 | "FunctionName": "${FunctionArn}", 270 | "Payload": { 271 | "Input": { 272 | "action": "classic_resize_cluster", 273 | "cluster_identifier.$": "$.cluster_identifier.output.status", 274 | "redshift_cluster_configuration.$": "$.redshift_cluster_configuration" 275 | } 276 | } 277 | }, 278 | "Next": "cluster_status_wait" 279 | }, 280 | "resume_cluster": { 281 | "Comment": "Invoke lambda function", 282 | "Type": "Task", 283 | "Resource": "arn:aws:states:::lambda:invoke", 284 | "TimeoutSeconds": 300, 285 | "HeartbeatSeconds": 60, 286 | "ResultPath": "$.step_output", 287 | "ResultSelector": { 288 | "output.$": "$.Payload" 289 | }, 290 | "Parameters": { 291 | "FunctionName": "${FunctionArn}", 292 | "Payload": { 293 | "Input": { 294 | "action": "resume_cluster", 295 | "cluster_identifier.$": "$.cluster_identifier.output.status" 296 | } 297 | } 298 | }, 299 | "Next": "cluster_status_wait" 300 | }, 301 | "update_wlm_config": { 302 | "Comment": "Invoke lambda function", 303 | "Type": "Task", 304 | "Resource": "arn:aws:states:::lambda:invoke", 305 | "TimeoutSeconds": 300, 306 | "HeartbeatSeconds": 60, 307 | "ResultPath": "$.step_output", 308 | "ResultSelector": { 309 | "output.$": "$.Payload" 310 | }, 311 | "Parameters": { 312 | "FunctionName": "${FunctionArn}", 313 | "Payload": { 314 | "Input": { 315 | "action": "update_wlm_config", 316 | "cluster_identifier.$": "$.cluster_identifier.output.status", 317 | "redshift_cluster_configuration.$": "$.redshift_cluster_configuration" 318 | } 319 | } 320 | }, 321 | "Next": "update_wlm_wait" 322 | }, 323 | "update_wlm_wait": { 324 | "Comment": "Wait before next", 325 | "Type": "Wait", 326 | "Seconds": 120, 327 | "Next": "check_pending_reboot_status" 328 | }, 329 | "check_pending_reboot_status": { 330 | "Comment": "check if clsuter is in pending reboot", 331 | "Type": "Task", 332 | "Resource": "arn:aws:states:::lambda:invoke", 333 | "TimeoutSeconds": 300, 334 | "HeartbeatSeconds": 60, 335 | "ResultPath": "$.result", 336 | "ResultSelector": { 337 | "output.$": "$.Payload" 338 | }, 339 | "Parameters": { 340 | "FunctionName": "${FunctionArn}", 341 | "Payload": { 342 | "Input": { 343 | "action": "check_pending_reboot_status", 344 | "cluster_identifier.$": "$.cluster_identifier.output.status" 345 | } 346 | } 347 | }, 348 | "Next": 
"cluster_reboot_status_check" 349 | }, 350 | "cluster_reboot_status_check": { 351 | "Type": "Choice", 352 | "Choices": [ 353 | { 354 | "Variable": "$.result.output.status", 355 | "StringEquals": "availableAvailablein-sync", 356 | "Next": "run_ddl_and_copy_script" 357 | } 358 | ], 359 | "Default": "cluster_reboot_wait" 360 | }, 361 | "cluster_reboot_wait": { 362 | "Comment": "Wait for cluster to be available after reboot", 363 | "Type": "Wait", 364 | "Seconds": 60, 365 | "Next": "check_pending_reboot_status" 366 | }, 367 | "run_ddl_and_copy_script": { 368 | "Comment": "Invoke lambda function", 369 | "Type": "Task", 370 | "Resource": "arn:aws:states:::lambda:invoke", 371 | "TimeoutSeconds": 300, 372 | "HeartbeatSeconds": 60, 373 | "ResultPath": "$.sql_output", 374 | "ResultSelector": { 375 | "output.$": "$.Payload" 376 | }, 377 | "Parameters": { 378 | "FunctionName": "${FunctionArn}", 379 | "Payload": { 380 | "Input": { 381 | "action": "run_ddl_and_copy_script", 382 | "cluster_identifier.$": "$.cluster_identifier.output.status" 383 | } 384 | } 385 | }, 386 | "Next": "ddl_and_copy_script_wait" 387 | }, 388 | "ddl_and_copy_script_wait": { 389 | "Comment": "Wait before status check", 390 | "Type": "Wait", 391 | "Seconds": 120, 392 | "Next": "ddl_and_copy_script_status_check" 393 | }, 394 | "ddl_and_copy_script_status_check": { 395 | "Comment": "Check Task Status", 396 | "Type": "Task", 397 | "Resource": "arn:aws:states:::lambda:invoke", 398 | "TimeoutSeconds": 300, 399 | "HeartbeatSeconds": 60, 400 | "ResultPath": "$.step_output", 401 | "ResultSelector": { 402 | "output.$": "$.Payload" 403 | }, 404 | "Parameters": { 405 | "FunctionName": "${FunctionArn}", 406 | "Payload": { 407 | "Input": { 408 | "action": "sql_status", 409 | "sql_id.$": "$.sql_output.output.sql_id" 410 | } 411 | } 412 | }, 413 | "Next": "is_ddl_and_copy_script_complete" 414 | }, 415 | "is_ddl_and_copy_script_complete": { 416 | "Comment": "check if previous step is complete", 417 | "Type": "Choice", 418 | "Choices": [ 419 | { 420 | "Variable": "$.step_output.output.status", 421 | "StringEquals": "FINISHED", 422 | "Next": "run_redshift_performance_test" 423 | } 424 | ], 425 | "Default": "ddl_and_copy_script_wait" 426 | }, 427 | "run_redshift_performance_test": { 428 | "Comment": "Invoke lambda function", 429 | "Type": "Task", 430 | "Resource": "arn:aws:states:::lambda:invoke", 431 | "TimeoutSeconds": 300, 432 | "HeartbeatSeconds": 60, 433 | "ResultPath": "$.batch_output", 434 | "ResultSelector": { 435 | "output.$": "$.Payload" 436 | }, 437 | "Parameters": { 438 | "FunctionName": "${FunctionArn}", 439 | "Payload": { 440 | "Input": { 441 | "action": "run_redshift_performance_test", 442 | "cluster_identifier.$": "$.cluster_identifier.output.status" 443 | } 444 | } 445 | }, 446 | "Next": "run_redshift_performance_test_wait" 447 | }, 448 | "run_redshift_performance_test_wait": { 449 | "Comment": "Wait before status check", 450 | "Type": "Wait", 451 | "Seconds": 60, 452 | "Next": "redshift_performance_test_status_check" 453 | }, 454 | "redshift_performance_test_status_check": { 455 | "Comment": "Check Task Status", 456 | "Type": "Task", 457 | "Resource": "arn:aws:states:::lambda:invoke", 458 | "TimeoutSeconds": 300, 459 | "HeartbeatSeconds": 60, 460 | "ResultPath": "$.step_output", 461 | "ResultSelector": { 462 | "output.$": "$.Payload" 463 | }, 464 | "Parameters": { 465 | "FunctionName": "${FunctionArn}", 466 | "Payload": { 467 | "Input": { 468 | "action": "batch_job_status", 469 | "job_id.$": "$.batch_output.output.job_id", 470 | 
"what_if_timestamp.$": "$.what_if_timestamp.output.status" 471 | } 472 | } 473 | }, 474 | "Next": "is_run_redshift_performance_test_complete" 475 | }, 476 | "is_run_redshift_performance_test_complete": { 477 | "Comment": "check if previous step is complete", 478 | "Type": "Choice", 479 | "Choices": [ 480 | { 481 | "Variable": "$.step_output.output.status", 482 | "StringEquals": "FINISHED", 483 | "Next": "extract_status_check" 484 | } 485 | ], 486 | "Default": "run_redshift_performance_test_wait" 487 | }, 488 | "extract_wait": { 489 | "Comment": "Wait before status check", 490 | "Type": "Wait", 491 | "Seconds": 60, 492 | "Next": "extract_status_check" 493 | }, 494 | "extract_status_check": { 495 | "Comment": "Check Task Status", 496 | "Type": "Task", 497 | "Resource": "arn:aws:states:::lambda:invoke", 498 | "TimeoutSeconds": 300, 499 | "HeartbeatSeconds": 60, 500 | "ResultPath": "$.step_output", 501 | "ResultSelector": { 502 | "output.$": "$.Payload" 503 | }, 504 | "Parameters": { 505 | "FunctionName": "${FunctionArn}", 506 | "Payload": { 507 | "Input": { 508 | "action": "batch_job_status", 509 | "job_id.$": "$.extract_job_id.output.job_id" 510 | } 511 | } 512 | }, 513 | "Next": "is_extract_complete" 514 | }, 515 | "is_extract_complete": { 516 | "Comment": "check if previous step is complete", 517 | "Type": "Choice", 518 | "Choices": [ 519 | { 520 | "Variable": "$.step_output.output.status", 521 | "StringEquals": "FINISHED", 522 | "Next": "run_replay" 523 | } 524 | ], 525 | "Default": "extract_wait" 526 | }, 527 | "run_replay": { 528 | "Comment": "Invoke lambda function", 529 | "Type": "Task", 530 | "Resource": "arn:aws:states:::lambda:invoke", 531 | "TimeoutSeconds": 300, 532 | "HeartbeatSeconds": 60, 533 | "ResultPath": "$.batch_output", 534 | "ResultSelector": { 535 | "output.$": "$.Payload" 536 | }, 537 | "Parameters": { 538 | "FunctionName": "${FunctionArn}", 539 | "Payload": { 540 | "Input": { 541 | "action": "run_replay", 542 | "cluster_identifier.$": "$.cluster_identifier.output.status", 543 | "what_if_timestamp.$": "$.what_if_timestamp.output.status" 544 | } 545 | } 546 | }, 547 | "Next": "replay_wait" 548 | }, 549 | "replay_wait": { 550 | "Comment": "Wait before status check", 551 | "Type": "Wait", 552 | "Seconds": 300, 553 | "Next": "replay_status_check" 554 | }, 555 | "replay_status_check": { 556 | "Comment": "Check Task Status", 557 | "Type": "Task", 558 | "Resource": "arn:aws:states:::lambda:invoke", 559 | "TimeoutSeconds": 300, 560 | "HeartbeatSeconds": 60, 561 | "ResultPath": "$.step_output", 562 | "ResultSelector": { 563 | "output.$": "$.Payload" 564 | }, 565 | "Parameters": { 566 | "FunctionName": "${FunctionArn}", 567 | "Payload": { 568 | "Input": { 569 | "action": "batch_job_status", 570 | "job_id.$": "$.batch_output.output.job_id", 571 | "what_if_timestamp.$": "$.what_if_timestamp.output.status" 572 | } 573 | } 574 | }, 575 | "Next": "is_replay_complete" 576 | }, 577 | "is_replay_complete": { 578 | "Comment": "check if previous step is complete", 579 | "Type": "Choice", 580 | "Choices": [ 581 | { 582 | "Variable": "$.step_output.output.status", 583 | "StringEquals": "FINISHED", 584 | "Next": "gather_comparison_stats" 585 | } 586 | ], 587 | "Default": "replay_wait" 588 | }, 589 | "gather_comparison_stats": { 590 | "Comment": "Invoke lambda function", 591 | "Type": "Task", 592 | "Resource": "arn:aws:states:::lambda:invoke", 593 | "TimeoutSeconds": 300, 594 | "HeartbeatSeconds": 60, 595 | "ResultPath": "$.step_output", 596 | "ResultSelector": { 597 | "output.$": 
"$.Payload" 598 | }, 599 | "Parameters": { 600 | "FunctionName": "${FunctionArn}", 601 | "Payload": { 602 | "Input": { 603 | "action": "gather_comparison_stats", 604 | "cluster_identifier.$": "$.cluster_identifier.output.status", 605 | "what_if_timestamp.$": "$.what_if_timestamp.output.status", 606 | "redshift_cluster_configuration.$": "$.redshift_cluster_configuration" 607 | } 608 | } 609 | }, 610 | "Next": "gather_comparison_stats_wait" 611 | }, 612 | "gather_comparison_stats_wait": { 613 | "Comment": "Wait before status check", 614 | "Type": "Wait", 615 | "Seconds": 60, 616 | "Next": "pause_cluster" 617 | }, 618 | "pause_cluster": { 619 | "Comment": "Invoke lambda function", 620 | "Type": "Task", 621 | "Resource": "arn:aws:states:::lambda:invoke", 622 | "TimeoutSeconds": 300, 623 | "HeartbeatSeconds": 60, 624 | "ResultPath": "$.step_output", 625 | "ResultSelector": { 626 | "output.$": "$.Payload" 627 | }, 628 | "Parameters": { 629 | "FunctionName": "${FunctionArn}", 630 | "Payload": { 631 | "Input": { 632 | "action": "pause_cluster", 633 | "cluster_identifier.$": "$.cluster_identifier.output.status", 634 | "redshift_cluster_index.$": "$.redshift_cluster_index" 635 | } 636 | } 637 | }, 638 | "End": true 639 | } 640 | } 641 | }, 642 | "Next": "testrun_complete" 643 | }, 644 | 645 | "testrun_complete": { 646 | "Comment": "testrun complete", 647 | "Type": "Pass", 648 | "Parameters": { 649 | "redshift_configurations.$": "$.redshift_configurations" 650 | }, 651 | "Next": "run_glue_crawler" 652 | }, 653 | "run_glue_crawler": { 654 | "Comment": "Invoke lambda function", 655 | "Type": "Task", 656 | "Resource": "arn:aws:states:::lambda:invoke", 657 | "TimeoutSeconds": 300, 658 | "HeartbeatSeconds": 60, 659 | "ResultPath": "$.step_output", 660 | "ResultSelector": { 661 | "output.$": "$.Payload" 662 | }, 663 | "Parameters": { 664 | "FunctionName": "${FunctionArn}", 665 | "Payload": { 666 | "Input": { 667 | "action": "run_glue_crawler" 668 | } 669 | } 670 | }, 671 | "Next": "crawler_wait" 672 | }, 673 | "crawler_wait": { 674 | "Comment": "Wait before populate results", 675 | "Type": "Wait", 676 | "Seconds": 300, 677 | "Next": "populate_comparison_results" 678 | }, 679 | "populate_comparison_results": { 680 | "Comment": "Invoke lambda function", 681 | "Type": "Task", 682 | "Resource": "arn:aws:states:::lambda:invoke", 683 | "TimeoutSeconds": 300, 684 | "HeartbeatSeconds": 60, 685 | "ResultPath": "$.step_output", 686 | "ResultSelector": { 687 | "output.$": "$.Payload" 688 | }, 689 | "Parameters": { 690 | "FunctionName": "${FunctionArn}", 691 | "Payload": { 692 | "Input": { 693 | "action": "populate_comparison_results", 694 | "cluster_identifier.$": "$.redshift_configurations.output.status[0].cluster_identifier.output.status", 695 | "what_if_timestamp.$": "$.redshift_configurations.output.status[0].what_if_timestamp.output.status" 696 | } 697 | } 698 | }, 699 | "End": true 700 | } 701 | } 702 | } 703 | -------------------------------------------------------------------------------- /configurations/extract_bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "bucket_name: $BUCKET_NAME" 4 | echo "simple_replay_extract_overwrite_s3_path: $SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH" 5 | echo "simple_replay_log_location: $SIMPLE_REPLAY_LOG_LOCATION" 6 | echo "redshift_user_name: $REDSHIFT_USER_NAME" 7 | echo "what_if_timestamp: $WHAT_IF_TIMESTAMP" 8 | echo "simple_replay_extract_start_time: $SIMPLE_REPLAY_EXTRACT_START_TIME" 9 | echo 
"simple_replay_extract_end_time: $SIMPLE_REPLAY_EXTRACT_END_TIME" 10 | echo "extract_prefix: $EXTRACT_PREFIX" 11 | echo "script_prefix: $SCRIPT_PREFIX" 12 | 13 | yum update -y 14 | yum -y install git 15 | yum -y install python3 16 | yum -y install python3-pip 17 | yum -y install aws-cfn-bootstrap 18 | yum -y install gcc gcc-c++ python3 python3-devel unixODBC unixODBC-devel 19 | mkdir /amazonutils 20 | cd /amazonutils 21 | git clone https://github.com/awslabs/amazon-redshift-utils.git 22 | pip3 install -r /amazonutils/amazon-redshift-utils/src/SimpleReplay/requirements.txt 23 | cd /amazonutils/amazon-redshift-utils/src/SimpleReplay 24 | if [[ "$SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH" != "N/A" ]]; then 25 | aws s3 cp $SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH replay.yaml 26 | fi 27 | WORKLOAD_LOCATION="s3://${BUCKET_NAME}/${EXTRACT_PREFIX}/${WHAT_IF_TIMESTAMP}" 28 | sed -i "s#master_username: \".*\"#master_username: \"$REDSHIFT_USER_NAME\"#g" extract.yaml 29 | sed -i "s#log_location: \".*\"#log_location: \"$SIMPLE_REPLAY_LOG_LOCATION\"#g" extract.yaml 30 | sed -i "s#workload_location: \".*\"#workload_location: \"$WORKLOAD_LOCATION\"#g" extract.yaml 31 | sed -i "s#start_time: \".*\"#start_time: \"$SIMPLE_REPLAY_EXTRACT_START_TIME\"#g" extract.yaml 32 | sed -i "s#end_time: \".*\"#end_time: \"$SIMPLE_REPLAY_EXTRACT_END_TIME\"#g" extract.yaml 33 | aws s3 cp extract.yaml s3://$BUCKET_NAME/$SCRIPT_PREFIX/ 34 | # 35 | # run extract process 36 | # 37 | python3 extract.py extract.yaml 38 | # 39 | # upload metadata 40 | # 41 | # output=$(aws s3 ls s3://$BUCKET_NAME/$EXTRACT_PREFIX/$WHAT_IF_TIMESTAMP/ | awk '{print $2}') 42 | # echo "output: $output" 43 | # extract_output=${output::-1} 44 | # echo "{\"timestamp\": \"$WHAT_IF_TIMESTAMP\", \"extract_output\": \"$extract_output\"}" > $EXTRACT_PREFIX.json 45 | # 46 | # aws s3 cp $EXTRACT_PREFIX.json s3://$BUCKET_NAME/$EXTRACT_PREFIX/ -------------------------------------------------------------------------------- /configurations/gather_comparison_stats.sql: -------------------------------------------------------------------------------- 1 | 2 | drop view if exists public.redshift_detailed_query_stats cascade; 3 | 4 | CREATE OR replace VIEW public.redshift_detailed_query_stats 5 | AS 6 | WITH queries 7 | AS (SELECT query, 8 | Listagg(DISTINCT schemaname 9 | ||'.' 
10 | ||table_name, ',') 11 | within GROUP(ORDER BY table_name) tables_scanned 12 | FROM (WITH scan_delete_insert 13 | AS (SELECT 'scan' query_type, 14 | query, 15 | Lpad(segment, 3, '0') segment, 16 | tbl 17 | FROM stl_scan 18 | WHERE userid > 1 19 | AND perm_table_name != 'Internal Worktable' 20 | AND tbl <> 0 21 | UNION ALL 22 | SELECT 'delete' query_type, 23 | query, 24 | Lpad(segment, 3, '0') segment, 25 | tbl 26 | FROM stl_delete 27 | WHERE userid > 1 28 | AND tbl <> 0 29 | UNION ALL 30 | SELECT 'insert' query_type, 31 | query, 32 | Lpad(segment, 3, '0') segment, 33 | tbl 34 | FROM stl_insert 35 | WHERE userid > 1 36 | AND tbl <> 0) 37 | SELECT sdi.query_type, 38 | sdi.query, 39 | sdi.segment, 40 | sdi.tbl, 41 | Trim(n.nspname) AS schemaname, 42 | Trim(c.relname) table_name 43 | FROM scan_delete_insert sdi 44 | join pg_class c 45 | ON c.oid = sdi.tbl 46 | join pg_namespace n 47 | ON n.oid = c.relnamespace) 48 | GROUP BY query), 49 | compiles 50 | AS (SELECT query, 51 | SUM(Datediff (microsecond, q.starttime, q.endtime)) 52 | compile_time 53 | FROM svl_compile q 54 | WHERE COMPILE = 1 55 | GROUP BY query) 56 | SELECT nvl(s.name,'Result Cache') as queue, 57 | Trim(u.usename) AS username, 58 | CASE 59 | WHEN q.concurrency_scaling_status = 1 THEN 1 60 | ELSE 0 61 | END AS cc_scaling, 62 | q.aborted, 63 | w.total_queue_time queue_time, 64 | Nvl(ct.compile_time, 0) compile_time, 65 | w.total_exec_time - Nvl(ct.compile_time, 0) exec_time, 66 | Datediff(microsecond, q.starttime, q.endtime) AS total_query_time, 67 | q.userid, 68 | q.query, 69 | q.label query_label, 70 | q.xid, 71 | q.pid, 72 | w.service_class, 73 | q.starttime, 74 | q.endtime, 75 | tables_scanned, 76 | Trim(q.querytxt) querytxt, 77 | SHA2(q.querytxt,256) query_hash 78 | FROM stl_query q 79 | join stl_wlm_query w USING (userid, query) 80 | join pg_user u 81 | ON u.usesysid = q.userid 82 | left join stv_wlm_service_class_config s 83 | ON w.service_class = s.service_class 84 | left outer join queries 85 | ON queries.query = q.query 86 | left outer join compiles ct 87 | ON w.query = ct.query 88 | WHERE q.userid > 1 89 | AND w.service_class > 5; 90 | 91 | 92 | drop view if exists public.redshift_config_comparison_raw cascade; 93 | 94 | 95 | create or replace view public.redshift_config_comparison_raw as 96 | SELECT 97 | partition_1 cluster_identifier, 98 | SHA2(split_part(querytxt,'}',2),256) as query_hash, 99 | round( exec_time / ( 1000 * 1000 ) ,2) exec_time_seconds, 100 | round( total_query_time / ( 1000 * 1000 ) ,2) total_query_time_seconds, 101 | round( compile_time / ( 1000 * 1000 ) ,2) compile_time_seconds, 102 | round( queue_time / ( 1000 * 1000 ) ,2) queue_time_seconds, 103 | queue, 104 | username, 105 | cc_scaling, 106 | aborted, 107 | queue_time, 108 | compile_time, 109 | exec_time, 110 | total_query_time, 111 | userid, 112 | query, 113 | trim(query_label::varchar) query_label, 114 | xid, 115 | pid, 116 | service_class, 117 | starttime, 118 | endtime, 119 | tables_scanned, 120 | querytxt , 121 | partition_0::varchar analysis_timestamp 122 | FROM redshift_config_comparison.comparison_stats q 123 | where partition_0 = (SELECT 124 | MAX(partition_0) 125 | FROM 126 | redshift_config_comparison.comparison_stats) 127 | --AND (query_label like '{query_label_prefix}%' or q.querytxt LIKE '%Replay source file%') 128 | AND q.querytxt LIKE '%replay_start%' 129 | with no schema binding; 130 | 131 | 132 | 133 | drop view if exists public.redshift_config_comparison_aggregate cascade; 134 | 135 | 136 | create or replace view 
public.redshift_config_comparison_aggregate as 137 | SELECT case when query_label like '{query_label_prefix}%' then query_label else 'simple-replay' end test_type 138 | ,cluster_identifier 139 | ,queue 140 | ,username 141 | , ROUND(SUM(r.total_query_time::NUMERIC) / ( 1000 * 1000 ) ,2) total_query_time_seconds 142 | , ROUND(AVG(r.total_query_time::NUMERIC) / ( 1000 * 1000 ) ,2) mean_query_time_seconds 143 | , ROUND(( PERCENTILE_CONT(0.50) WITHIN GROUP( ORDER BY total_query_time) )::NUMERIC / ( 1000 * 1000 ),3) AS median_query_time_seconds 144 | , ROUND(MAX(total_query_time)::NUMERIC / ( 1000 * 1000 ),2) max_query_time_seconds 145 | , ROUND(( PERCENTILE_CONT(0.75) WITHIN GROUP( ORDER BY total_query_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct75_query_time_seconds 146 | , ROUND(( PERCENTILE_CONT(0.90) WITHIN GROUP( ORDER BY total_query_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct90_query_time_seconds 147 | , ROUND(( PERCENTILE_CONT(0.95) WITHIN GROUP( ORDER BY total_query_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct95_query_time_seconds 148 | ,SUM(cc_scaling) count_cc_scaling 149 | ,SUM(aborted) count_aborted 150 | ,SUM(1) count_queries 151 | , analysis_timestamp 152 | FROM public.redshift_config_comparison_raw r 153 | group by test_type, 154 | cluster_identifier, 155 | queue, 156 | username, 157 | analysis_timestamp 158 | with no schema binding 159 | ; 160 | 161 | 162 | drop view if exists public.redshift_config_comparison_results cascade; 163 | 164 | create or replace view public.redshift_config_comparison_results as 165 | with agg_data as (SELECT test_type, 166 | cluster_identifier, 167 | queue, 168 | username, 169 | total_query_time_seconds, 170 | max(total_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following) max_total_query_time_seconds, 171 | mean_query_time_seconds, 172 | max(mean_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_mean_query_time_seconds, 173 | median_query_time_seconds, 174 | max(median_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_median_query_time_seconds, 175 | max_query_time_seconds, 176 | max(max_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_max_query_time_seconds, 177 | pct75_query_time_seconds, 178 | max(pct75_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct75_query_time_seconds, 179 | pct90_query_time_seconds, 180 | max(pct90_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct90_query_time_seconds, 181 | pct95_query_time_seconds, 182 | max(pct95_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct95_query_time_seconds, 183 | count_cc_scaling, 184 | count_aborted, 185 | count_queries, 186 | analysis_timestamp 187 | FROM public.redshift_config_comparison_aggregate) 188 | select test_type, 189 | cluster_identifier, 190 | queue, 191 | username, 192 | total_query_time_seconds, 193 | round(((max_total_query_time_seconds-total_query_time_seconds)/case when total_query_time_seconds=0 then 1 else total_query_time_seconds end)*100)||'%' improvement_total_query_time, 194 | mean_query_time_seconds, 195 | round(((max_mean_query_time_seconds-mean_query_time_seconds)/case when 
mean_query_time_seconds=0 then 1 else mean_query_time_seconds end)*100)||'%' improvement_mean_query_time, 196 | median_query_time_seconds, 197 | round(((max_median_query_time_seconds-median_query_time_seconds)/case when median_query_time_seconds=0 then 1 else median_query_time_seconds end)*100)||'%' improvement_median_query_time, 198 | max_query_time_seconds, 199 | round(((max_max_query_time_seconds-max_query_time_seconds)/case when max_query_time_seconds=0 then 1 else max_query_time_seconds end)*100)||'%' improvement_max_query_time, 200 | pct75_query_time_seconds, 201 | round(((max_pct75_query_time_seconds-pct75_query_time_seconds)/case when pct75_query_time_seconds=0 then 1 else pct75_query_time_seconds end)*100)||'%' improvement_pct75_query_time, 202 | pct90_query_time_seconds, 203 | round(((max_pct90_query_time_seconds-pct90_query_time_seconds)/case when pct90_query_time_seconds=0 then 1 else pct90_query_time_seconds end)*100)||'%' improvement_pct90_query_time, 204 | pct95_query_time_seconds, 205 | round(((max_pct95_query_time_seconds-pct95_query_time_seconds)/case when pct95_query_time_seconds=0 then 1 else pct95_query_time_seconds end)*100)||'%' improvement_pct95_query_time, 206 | count_cc_scaling, 207 | count_aborted, 208 | count_queries, 209 | analysis_timestamp 210 | from agg_data 211 | with no schema binding; 212 | 213 | 214 | 215 | drop view if exists public.redshift_pricing cascade; 216 | 217 | create or replace view public.redshift_pricing as 218 | select case 219 | when termtype='OnDemand' then priceperunit*365*24 220 | when purchaseoption='No Upfront' and leasecontractlength='1yr' then priceperunit*365*24 221 | when purchaseoption='All Upfront' and leasecontractlength='1yr' and unit='Quantity' then priceperunit 222 | when purchaseoption='No Upfront' and leasecontractlength='3yr' then priceperunit*365*24 223 | when purchaseoption='All Upfront' and leasecontractlength='3yr' and unit='Quantity' then priceperunit/3 224 | end::bigint per_compute_node_yearly_cost, 225 | unit,location, "instance type" instance_type, termtype,leasecontractlength,purchaseoption,unit 226 | FROM redshift_config_comparison.pricing where "product family"='Compute Instance' 227 | and nvl(purchaseoption,'OnDemand') in ('OnDemand','All Upfront','No Upfront') 228 | and priceperunit>0 229 | with no schema binding; 230 | 231 | 232 | drop view if exists public.redshift_config_comparison_pricing cascade; 233 | 234 | create or replace view public.redshift_config_comparison_pricing as 235 | SELECT distinct 236 | c.node_type 237 | , c.number_of_nodes 238 | , NVL(p.termtype||'-'||p.leasecontractlength||'-'||p.purchaseoption,'On-Demand') options 239 | , p.per_compute_node_yearly_cost * c.number_of_nodes your_cluster_yearly_compute_cost 240 | , p.per_compute_node_yearly_cost 241 | FROM 242 | public.redshift_pricing p, 243 | redshift_config_comparison.cluster_config c 244 | WHERE p.instance_type = c.node_type 245 | AND p.location = c.REGION 246 | with no schema binding; 247 | 248 | 249 | 250 | unload ($$ 251 | select * from public.redshift_detailed_query_stats where starttime > to_timestamp('{what_if_timestamp}','YYYY-MM-DD-HH24-MI-SS') 252 | $$) to '{comparison_stats_s3_path}/{what_if_timestamp}/{cluster_identifier}/' 253 | FORMAT AS PARQUET ALLOWOVERWRITE iam_role '{redshift_iam_role}'; 254 | -------------------------------------------------------------------------------- /configurations/parameter_group_config.json: -------------------------------------------------------------------------------- 1 | { 2 | 
"Parameters": [{ 3 | "ParameterName": "auto_analyze", 4 | "ParameterValue": "true", 5 | "Description": "Use auto analyze", 6 | "Source": "user", 7 | "DataType": "boolean", 8 | "AllowedValues": "true,false", 9 | "ApplyType": "static", 10 | "IsModifiable": true 11 | }, { 12 | "ParameterName": "datestyle", 13 | "ParameterValue": "ISO, MDY", 14 | "Description": "Sets the display format for date and time values.", 15 | "Source": "engine-default", 16 | "DataType": "string", 17 | "ApplyType": "static", 18 | "IsModifiable": true 19 | }, { 20 | "ParameterName": "enable_user_activity_logging", 21 | "ParameterValue": "true", 22 | "Description": "parameter for audit logging purpose", 23 | "Source": "user", 24 | "DataType": "boolean", 25 | "AllowedValues": "true,false", 26 | "ApplyType": "static", 27 | "IsModifiable": true 28 | }, { 29 | "ParameterName": "extra_float_digits", 30 | "ParameterValue": "0", 31 | "Description": "Sets the number of digits displayed for floating-point values", 32 | "Source": "engine-default", 33 | "DataType": "integer", 34 | "AllowedValues": "-15-2", 35 | "ApplyType": "static", 36 | "IsModifiable": true 37 | }, { 38 | "ParameterName": "max_concurrency_scaling_clusters", 39 | "ParameterValue": "1", 40 | "Description": "The maximum concurrency scaling clusters can be used.", 41 | "Source": "engine-default", 42 | "DataType": "integer", 43 | "AllowedValues": "0-10", 44 | "ApplyType": "static", 45 | "IsModifiable": true 46 | }, { 47 | "ParameterName": "max_cursor_result_set_size", 48 | "ParameterValue": "default", 49 | "Description": "Sets the max cursor result set size", 50 | "Source": "engine-default", 51 | "DataType": "integer", 52 | "AllowedValues": "0-14400000", 53 | "ApplyType": "static", 54 | "IsModifiable": true 55 | }, { 56 | "ParameterName": "query_group", 57 | "ParameterValue": "default", 58 | "Description": "This parameter applies a user-defined label to a group of queries that are run during the same session..", 59 | "Source": "engine-default", 60 | "DataType": "string", 61 | "ApplyType": "static", 62 | "IsModifiable": true 63 | }, { 64 | "ParameterName": "require_ssl", 65 | "ParameterValue": "true", 66 | "Description": "require ssl for all databaseconnections", 67 | "Source": "user", 68 | "DataType": "boolean", 69 | "AllowedValues": "true,false", 70 | "ApplyType": "static", 71 | "IsModifiable": true 72 | }, { 73 | "ParameterName": "search_path", 74 | "ParameterValue": "$user, public", 75 | "Description": "Sets the schema search order for names that are not schema-qualified.", 76 | "Source": "engine-default", 77 | "DataType": "string", 78 | "ApplyType": "static", 79 | "IsModifiable": true 80 | }, { 81 | "ParameterName": "statement_timeout", 82 | "ParameterValue": "86400000", 83 | "Description": "Aborts any statement that takes over the specified number of milliseconds.", 84 | "Source": "user", 85 | "DataType": "integer", 86 | "AllowedValues": "0,100-2147483647", 87 | "ApplyType": "static", 88 | "IsModifiable": true 89 | }, { 90 | "ParameterName": "use_fips_ssl", 91 | "ParameterValue": "false", 92 | "Description": "Use fips ssl library", 93 | "Source": "engine-default", 94 | "DataType": "boolean", 95 | "AllowedValues": "true,false", 96 | "ApplyType": "static", 97 | "IsModifiable": true 98 | }, { 99 | "ParameterName": "wlm_json_configuration", 100 | "ParameterValue": "[ { \"query_group\" : [ ],\"query_group_wild_card\" : 0,\"user_group\" : [ ],\"user_group_wild_card\" : 0,\"concurrency_scaling\" : \"off\",\"rules\" : [ { \"rule_name\" : \"DiskSpilling\", \"predicate\" : 
[ { \"metric_name\" : \"query_temp_blocks_to_disk\", \"operator\" : \">\", \"value\" : 100000 } ], \"action\" : \"log\"}, { \"rule_name\" : \"QueryRunningMoreThan30min\", \"predicate\" : [ { \"metric_name\" : \"query_execution_time\", \"operator\" : \">\", \"value\" : 1800 } ], \"action\" : \"log\"} ],\"priority\" : \"normal\",\"queue_type\" : \"auto\",\"auto_wlm\" : true }, {\"short_query_queue\" : true } ]", 101 | "Description": "wlm json configuration", 102 | "Source": "user", 103 | "DataType": "string", 104 | "ApplyType": "static", 105 | "IsModifiable": true 106 | }], 107 | "ResponseMetadata": { 108 | "RequestId": "fed328c9-52c8-4bba-a255-71b8c5cddded", 109 | "HTTPStatusCode": 200, 110 | "HTTPHeaders": { 111 | "x-amzn-requestid": "fed328c9-52c8-4bba-a255-71b8c5cddded", 112 | "content-type": "text/xml", 113 | "content-length": "6168", 114 | "vary": "accept-encoding", 115 | "date": "Wed, 31 Mar 2021 20:29:02 GMT" 116 | }, 117 | "RetryAttempts": 0 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /configurations/performance_test_bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # This script bootstraps a base amazonlinux image to run the Redshift 3 | # Node Config concurrency test. 4 | # 1. Install the AWS CLI, Python3, and necessary Python libraries. 5 | # 2. Copy Python program source for concurrency test 6 | # 3. Execute that Python program 7 | # We expect all configuration to be defined as environment variables 8 | # for the Batch job. 9 | 10 | set -eu 11 | 12 | yum install -y awscli python3 13 | pip3 install boto3 psycopg2-binary pandas sqlalchemy 14 | 15 | aws s3 cp "$PYTHON_SCRIPT" ./script.py 16 | 17 | # This Python program requires these environment variables to be set: 18 | # `$SQL_SCRIPT_S3_PATH`, `$REDSHIFT_CLUSTER_ENDPOINT`, 19 | # `$REDSHIFT_IAM_ROLE`, `$BUCKET_NAME`, `$REDSHIFT_USER_NAME` 20 | python3 ./script.py 21 | -------------------------------------------------------------------------------- /configurations/populate_comparison_results.sql: -------------------------------------------------------------------------------- 1 | unload ($$ 2 | select * from public.redshift_config_comparison_raw 3 | $$) to '{raw_comparison_results_s3_path}/{what_if_timestamp}/' 4 | FORMAT AS CSV HEADER ALLOWOVERWRITE iam_role '{redshift_iam_role}'; 5 | 6 | 7 | unload ($$ 8 | select * from public.redshift_config_comparison_results 9 | $$) to '{comparison_results_s3_path}/{what_if_timestamp}/' 10 | parallel off FORMAT AS CSV HEADER ALLOWOVERWRITE iam_role '{redshift_iam_role}'; 11 | -------------------------------------------------------------------------------- /configurations/redshift-performance-test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import psycopg2 4 | import time 5 | import pandas 6 | from sqlalchemy import create_engine 7 | from sqlalchemy import text 8 | from concurrent.futures import ThreadPoolExecutor 9 | from concurrent.futures import as_completed 10 | from urllib.parse import quote_plus as urlquote 11 | import urllib 12 | import re 13 | import os 14 | 15 | SQL_SCRIPT_S3_PATH=os.environ['SQL_SCRIPT_S3_PATH'] 16 | REDSHIFT_CLUSTER_ENDPOINT=os.environ['REDSHIFT_CLUSTER_ENDPOINT'] 17 | REDSHIFT_IAM_ROLE=os.environ['REDSHIFT_IAM_ROLE'] 18 | BUCKET_NAME=os.environ['SQL_SCRIPT_S3_PATH'] 19 | REDSHIFT_USER_NAME=os.environ['REDSHIFT_USER_NAME'] 20 | 
NUMBER_OF_PARALLEL_SESSIONS_LIST=os.environ['NUMBER_OF_PARALLEL_SESSIONS_LIST'] 21 | DISABLE_RESULT_CACHE=os.environ['DISABLE_RESULT_CACHE'] 22 | DEFAULT_OUTPUT_LIMIT=os.environ['DEFAULT_OUTPUT_LIMIT'] 23 | MAX_NUMBER_OF_QUERIES=os.environ['MAX_NUMBER_OF_QUERIES'] 24 | MAX_PARALLEL_SESSIONS=os.environ['MAX_PARALLEL_SESSIONS'] 25 | QUERY_LABEL_PREFIX=os.environ['QUERY_LABEL_PREFIX'] 26 | 27 | 28 | def connect_to_redshift(host,username): 29 | client = boto3.client('redshift') 30 | cluster_creds = client.get_cluster_credentials(DbUser=username, 31 | DbName=REDSHIFT_CLUSTER_ENDPOINT.split('/')[1], 32 | ClusterIdentifier=REDSHIFT_CLUSTER_ENDPOINT.split('.')[0]) 33 | 34 | 35 | connection_string='postgresql://'+ urlquote(cluster_creds['DbUser']) + ':'+ urlquote(cluster_creds['DbPassword']) + '@'+ REDSHIFT_CLUSTER_ENDPOINT 36 | return create_engine(connection_string,pool_size=0, max_overflow=-1) 37 | 38 | def get_json_config_from_s3(script_s3_path): 39 | bucket, key = script_s3_path.replace("s3://", "").split("/", 1) 40 | obj = boto3.client('s3').get_object(Bucket=bucket, Key=key) 41 | return json.loads(obj['Body'].read().decode('utf-8')) 42 | 43 | 44 | def get_sql_scripts_from_s3(): 45 | 46 | bucket, key = SQL_SCRIPT_S3_PATH.replace("s3://", "").split("/", 1) 47 | obj = boto3.client('s3').get_object(Bucket=bucket, Key=key) 48 | script = obj['Body'].read().decode('utf-8') 49 | script = script.format(redshift_iam_role=REDSHIFT_IAM_ROLE, bucket_name=BUCKET_NAME) 50 | split_scripts = script.split(';')[:-1] 51 | if len(split_scripts)>int(MAX_NUMBER_OF_QUERIES): 52 | split_scripts=split_scripts[0:int(MAX_NUMBER_OF_QUERIES)] 53 | return split_scripts 54 | 55 | 56 | def get_sql(engine, number_of_parallel_sessions): 57 | sql_script = "" 58 | 59 | pattern = re.compile(r'limit[\s|\t|\n]+[\d]+[\s]*$', re.IGNORECASE) 60 | for query in get_sql_scripts_from_s3(): 61 | if not re.search(pattern, query): 62 | query += " limit " + DEFAULT_OUTPUT_LIMIT 63 | sql_script+=query + ";\n" 64 | 65 | if DISABLE_RESULT_CACHE=='true': 66 | sql_script = "set enable_result_cache_for_session to false;\n" + sql_script 67 | 68 | sql_script = "set query_group to '" + QUERY_LABEL_PREFIX + str(number_of_parallel_sessions) + "';\n" + sql_script 69 | 70 | df = pandas.read_sql(text(sql_script), engine) 71 | return df 72 | 73 | 74 | 75 | def run_concurrency_test(number_of_parallel_sessions): 76 | engine=connect_to_redshift(REDSHIFT_CLUSTER_ENDPOINT,REDSHIFT_USER_NAME) 77 | start_time = time.time() 78 | try: 79 | with ThreadPoolExecutor(max_workers=number_of_parallel_sessions) as executor: 80 | futures = [] 81 | for _ in range(number_of_parallel_sessions): 82 | futures.append(executor.submit( 83 | get_sql, engine, number_of_parallel_sessions)) 84 | for future in as_completed(futures): 85 | rs = future.result() 86 | 87 | except Exception as e: 88 | raise e 89 | elapsed_time_in_secs = (time.time() - start_time) 90 | print("--- %s seconds ---" % elapsed_time_in_secs) 91 | return elapsed_time_in_secs 92 | 93 | print(f'script:{SQL_SCRIPT_S3_PATH}, cluster:{REDSHIFT_CLUSTER_ENDPOINT},role:{REDSHIFT_IAM_ROLE},bucket:{BUCKET_NAME},user:{REDSHIFT_USER_NAME},sessions:{NUMBER_OF_PARALLEL_SESSIONS_LIST}') 94 | for sessions in NUMBER_OF_PARALLEL_SESSIONS_LIST.split(','): 95 | number_of_parallel_sessions=int(sessions) 96 | if number_of_parallel_sessions <= int(MAX_PARALLEL_SESSIONS): 97 | print(f'running {number_of_parallel_sessions} parallel threads ..') 98 | run_concurrency_test(number_of_parallel_sessions) 99 | else: 100 | print(f'parallel sessions 
{number_of_parallel_sessions} exceeds maximum allowed {MAX_PARALLEL_SESSIONS} ..') 101 | -------------------------------------------------------------------------------- /configurations/replay_bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "bucket_name: $BUCKET_NAME" 4 | echo "simple_replay_overwrite_s3_path: $SIMPLE_REPLAY_OVERWRITE_S3_PATH" 5 | echo "redshift_user_name: $REDSHIFT_USER_NAME" 6 | echo "what_if_timestamp: $WHAT_IF_TIMESTAMP" 7 | echo "extract_prefix: $EXTRACT_PREFIX" 8 | echo "replay_prefix: $REPLAY_PREFIX" 9 | echo "script_prefix: $SCRIPT_PREFIX" 10 | echo "redshift_iam_role: $REDSHIFT_IAM_ROLE" 11 | echo "workload_location: $WORKLOAD_LOCATION" 12 | echo "cluster_endpoint: $CLUSTER_ENDPOINT" 13 | echo "cluster_identifier: $CLUSTER_IDENTIFIER" 14 | echo "snapshot_account_id: $SNAPSHOT_ACCOUNT_ID" 15 | account_id=`aws sts get-caller-identity --query Account --output text` 16 | echo "account_id: $account_id" 17 | TARGET_CLUSTER_REGION=$(echo $CLUSTER_ENDPOINT | cut -f3 -d'.') 18 | ##region = os.environ['AWS_REGION'] 19 | 20 | yum update -y 21 | yum -y install git 22 | yum -y install python3 23 | yum -y install python3-pip 24 | yum -y install aws-cfn-bootstrap 25 | yum -y install gcc gcc-c++ python3 python3-devel unixODBC unixODBC-devel 26 | mkdir /amazonutils 27 | cd /amazonutils 28 | git clone https://github.com/awslabs/amazon-redshift-utils.git 29 | pip3 install -r /amazonutils/amazon-redshift-utils/src/SimpleReplay/requirements.txt 30 | # 31 | # configure extract replay metadata 32 | # 33 | cd /amazonutils/amazon-redshift-utils/src/SimpleReplay 34 | if [[ "$SIMPLE_REPLAY_OVERWRITE_S3_PATH" != "N/A" ]]; then 35 | aws s3 cp $SIMPLE_REPLAY_OVERWRITE_S3_PATH replay.yaml 36 | fi 37 | 38 | sed -i "s#master_username: \".*\"#master_username: \"$REDSHIFT_USER_NAME\"#g" replay.yaml 39 | 40 | ## start - commented to avoid running any UNLOAD commands as part of replay and unload of system tables ## 41 | 42 | #sed -i "s#execute_unload_statements: \"false\"#execute_unload_statements: \"true\"#g" replay.yaml 43 | #sed -i "s#unload_iam_role: \".*\"#unload_iam_role: \"$REDSHIFT_IAM_ROLE\"#g" replay.yaml 44 | #sed -i "s#target_cluster_system_table_unload_iam_role: \".*\"#target_cluster_system_table_unload_iam_role: \"$REDSHIFT_IAM_ROLE\"#g" replay.yaml 45 | 46 | ## end - commented to avoid running any UNLOAD commands as part of replay and unload of system tables ## 47 | 48 | sed -i "s#workload_location: \".*\"#workload_location: \"$WORKLOAD_LOCATION\"#g" replay.yaml 49 | sed -i "s#target_cluster_endpoint: \".*\"#target_cluster_endpoint: \"$CLUSTER_ENDPOINT\"#g" replay.yaml 50 | sed -i "s#replay_output: \".*\"#replay_output: \"s3://$BUCKET_NAME/$REPLAY_PREFIX/$WHAT_IF_TIMESTAMP/$CLUSTER_IDENTIFIER\"#g" replay.yaml 51 | sed -i "s#target_cluster_region: \".*\"#target_cluster_region: \"$TARGET_CLUSTER_REGION\"#g" replay.yaml 52 | 53 | 54 | if [[ "$account_id" == "$SNAPSHOT_ACCOUNT_ID" ]]; then 55 | sed -i "s#execute_copy_statements: \"false\"#execute_copy_statements: \"true\"#g" replay.yaml 56 | aws s3 cp $WORKLOAD_LOCATION/copy_replacements.csv . 
|| true 57 | sed -z -i "s#,,\n#,,$REDSHIFT_IAM_ROLE\n#g" copy_replacements.csv || true 58 | aws s3 cp copy_replacements.csv $WORKLOAD_LOCATION/copy_replacements.csv || true 59 | fi 60 | aws s3 cp replay.yaml s3://$BUCKET_NAME/$SCRIPT_PREFIX/ 61 | python3 replay.py replay.yaml 62 | -------------------------------------------------------------------------------- /images/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/images/.DS_Store -------------------------------------------------------------------------------- /images/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/images/architecture.png -------------------------------------------------------------------------------- /images/redshift-clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/images/redshift-clusters.png -------------------------------------------------------------------------------- /images/statemachine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/images/statemachine.png -------------------------------------------------------------------------------- /serverless-v2/configurations/RedshiftConfigTestingLambda.py.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/serverless-v2/configurations/RedshiftConfigTestingLambda.py.zip -------------------------------------------------------------------------------- /serverless-v2/configurations/create_external_schema.py: -------------------------------------------------------------------------------- 1 | import redshift_connector 2 | import boto3 3 | import yaml 4 | import json 5 | 6 | rs_client = boto3.client('redshift') 7 | with open('replay.yaml','r') as fr: 8 | config_read = yaml.safe_load(fr) 9 | target_cluster_endpoint = config_read['target_cluster_endpoint'] 10 | cluster_endpoint_split = target_cluster_endpoint.split(".") 11 | workgroup_id = cluster_endpoint_split[0] 12 | db_host = target_cluster_endpoint.split(":")[0] 13 | db_port = cluster_endpoint_split[5].split("/")[0][4:] 14 | db_name = cluster_endpoint_split[5].split("/")[1] 15 | db_username = config_read['master_username'] 16 | serverless_cluster_id = f"redshift-serverless-{workgroup_id}" 17 | with open('system_config.json','r') as jr: 18 | json_data = json.load(jr) 19 | script = json_data['EXTERNAL_SCHEMA_SCRIPT'] 20 | try: 21 | response = rs_client.get_cluster_credentials( 22 | DbUser=db_username, ClusterIdentifier=serverless_cluster_id, AutoCreate=False, 23 | DurationSeconds=3600 24 | ) 25 | except rs_client.exceptions.ClientError as e: 26 | if e.response['Error']['Code'] == 'ExpiredToken': 27 | print(f"Error retrieving credentials for {serverless_cluster_id}: IAM credentials have expired.") 28 | exit(-1) 29 | elif e.response['Error']['Code'] == 'ResourceNotFoundException': 30 | print(f"Serverless endpoint could not be found " 31 | 
f"RedshiftServerless:GetCredentials. {e}") 32 | exit(-1) 33 | else: 34 | print(f"Got exception retrieving credentials ({e.response['Error']['Code']})") 35 | raise e 36 | db_user = response['DbUser'] 37 | db_password = response['DbPassword'] 38 | try: 39 | conn = redshift_connector.connect( 40 | host=db_host, 41 | database=db_name, 42 | user=db_user, 43 | password=db_password 44 | ) 45 | cursor = conn.cursor() 46 | conn.autocommit = True 47 | cursor.execute(script) 48 | print(f"Executed script.{script}") 49 | except Exception as err: 50 | if "already exists" not in str(err): 51 | print(f"Got exception while executing script {err}") 52 | raise 53 | -------------------------------------------------------------------------------- /serverless-v2/configurations/extract_bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "bucket_name: $BUCKET_NAME" 4 | echo "simple_replay_extract_overwrite_s3_path: $SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH" 5 | echo "simple_replay_log_location: $SIMPLE_REPLAY_LOG_LOCATION" 6 | echo "redshift_user_name: $REDSHIFT_USER_NAME" 7 | echo "what_if_timestamp: $WHAT_IF_TIMESTAMP" 8 | echo "simple_replay_extract_start_time: $SIMPLE_REPLAY_EXTRACT_START_TIME" 9 | echo "simple_replay_extract_end_time: $SIMPLE_REPLAY_EXTRACT_END_TIME" 10 | echo "extract_prefix: $EXTRACT_PREFIX" 11 | echo "script_prefix: $SCRIPT_PREFIX" 12 | 13 | yum update -y 14 | yum -y install git 15 | yum -y install python3 16 | yum -y install python3-pip 17 | yum -y install aws-cfn-bootstrap 18 | yum -y install gcc gcc-c++ python3 python3-devel unixODBC unixODBC-devel 19 | mkdir /amazonutils 20 | cd /amazonutils 21 | git clone https://github.com/awslabs/amazon-redshift-utils.git 22 | pip3 install -r /amazonutils/amazon-redshift-utils/src/SimpleReplay/requirements.txt 23 | cd /amazonutils/amazon-redshift-utils/src/SimpleReplay 24 | if [[ "$SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH" != "N/A" ]]; then 25 | aws s3 cp $SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH replay.yaml 26 | fi 27 | WORKLOAD_LOCATION="s3://${BUCKET_NAME}/${EXTRACT_PREFIX}/${WHAT_IF_TIMESTAMP}" 28 | sed -i "s#master_username: \".*\"#master_username: \"$REDSHIFT_USER_NAME\"#g" extract.yaml 29 | sed -i "s#log_location: \".*\"#log_location: \"$SIMPLE_REPLAY_LOG_LOCATION\"#g" extract.yaml 30 | sed -i "s#workload_location: \".*\"#workload_location: \"$WORKLOAD_LOCATION\"#g" extract.yaml 31 | sed -i "s#start_time: \".*\"#start_time: \"$SIMPLE_REPLAY_EXTRACT_START_TIME\"#g" extract.yaml 32 | sed -i "s#end_time: \".*\"#end_time: \"$SIMPLE_REPLAY_EXTRACT_END_TIME\"#g" extract.yaml 33 | aws s3 cp extract.yaml s3://$BUCKET_NAME/$SCRIPT_PREFIX/ 34 | # 35 | # run extract process 36 | # 37 | python3 extract.py extract.yaml 38 | # 39 | # upload metadata 40 | # 41 | # output=$(aws s3 ls s3://$BUCKET_NAME/$EXTRACT_PREFIX/$WHAT_IF_TIMESTAMP/ | awk '{print $2}') 42 | # echo "output: $output" 43 | # extract_output=${output::-1} 44 | # echo "{\"timestamp\": \"$WHAT_IF_TIMESTAMP\", \"extract_output\": \"$extract_output\"}" > $EXTRACT_PREFIX.json 45 | # 46 | # aws s3 cp $EXTRACT_PREFIX.json s3://$BUCKET_NAME/$EXTRACT_PREFIX/ -------------------------------------------------------------------------------- /serverless-v2/configurations/gather_comparison_stats.sql: -------------------------------------------------------------------------------- 1 | 2 | drop view if exists public.redshift_detailed_query_stats cascade; 3 | 4 | CREATE OR replace VIEW public.redshift_detailed_query_stats 5 | AS 6 | 
WITH queries 7 | AS (SELECT query, 8 | Listagg(DISTINCT schemaname 9 | ||'.' 10 | ||table_name, ',') 11 | within GROUP(ORDER BY table_name) tables_scanned 12 | FROM (WITH scan_delete_insert 13 | AS (SELECT 'scan' query_type, 14 | query, 15 | Lpad(segment, 3, '0') segment, 16 | tbl 17 | FROM stl_scan 18 | WHERE userid > 1 19 | AND perm_table_name != 'Internal Worktable' 20 | AND tbl <> 0 21 | UNION ALL 22 | SELECT 'delete' query_type, 23 | query, 24 | Lpad(segment, 3, '0') segment, 25 | tbl 26 | FROM stl_delete 27 | WHERE userid > 1 28 | AND tbl <> 0 29 | UNION ALL 30 | SELECT 'insert' query_type, 31 | query, 32 | Lpad(segment, 3, '0') segment, 33 | tbl 34 | FROM stl_insert 35 | WHERE userid > 1 36 | AND tbl <> 0) 37 | SELECT sdi.query_type, 38 | sdi.query, 39 | sdi.segment, 40 | sdi.tbl, 41 | Trim(n.nspname) AS schemaname, 42 | Trim(c.relname) table_name 43 | FROM scan_delete_insert sdi 44 | join pg_class c 45 | ON c.oid = sdi.tbl 46 | join pg_namespace n 47 | ON n.oid = c.relnamespace) 48 | GROUP BY query), 49 | compiles 50 | AS (SELECT query, 51 | SUM(Datediff (microsecond, q.starttime, q.endtime)) 52 | compile_time 53 | FROM svl_compile q 54 | WHERE COMPILE = 1 55 | GROUP BY query) 56 | SELECT nvl(s.name,'Result Cache') as queue, 57 | Trim(u.usename) AS username, 58 | CASE 59 | WHEN q.concurrency_scaling_status = 1 THEN 1 60 | ELSE 0 61 | END AS cc_scaling, 62 | q.aborted, 63 | w.total_queue_time queue_time, 64 | Nvl(ct.compile_time, 0) compile_time, 65 | w.total_exec_time - Nvl(ct.compile_time, 0) exec_time, 66 | Datediff(microsecond, q.starttime, q.endtime) AS total_query_time, 67 | q.userid, 68 | q.query, 69 | q.label query_label, 70 | q.xid, 71 | q.pid, 72 | w.service_class, 73 | q.starttime, 74 | q.endtime, 75 | tables_scanned, 76 | Trim(q.querytxt) querytxt, 77 | SHA2(q.querytxt,256) query_hash 78 | FROM stl_query q 79 | left join stl_wlm_query w USING (userid, query) 80 | join pg_user u 81 | ON u.usesysid = q.userid 82 | left join stv_wlm_service_class_config s 83 | ON w.service_class = s.service_class 84 | left outer join queries 85 | ON queries.query = q.query 86 | left outer join compiles ct 87 | ON w.query = ct.query 88 | WHERE q.userid > 1 89 | AND w.service_class > 5; 90 | 91 | 92 | drop view if exists public.redshift_config_comparison_raw cascade; 93 | 94 | 95 | create or replace view public.redshift_config_comparison_raw as 96 | SELECT 97 | partition_1 cluster_identifier, 98 | SHA2(split_part(query_text,'}}',2),256) query_hash, 99 | round( exec_time / ( 1000 * 1000 ) ,2) exec_time_seconds, 100 | round( total_query_time / ( 1000 * 1000 ) ,2) total_query_time_seconds, 101 | round( compile_time / ( 1000 * 1000 ) ,2) compile_time_seconds, 102 | round( queue_time / ( 1000 * 1000 ) ,2) queue_time_seconds, 103 | queue, 104 | username, 105 | cc_scaling, 106 | aborted, 107 | queue_time, 108 | compile_time, 109 | exec_time, 110 | total_query_time, 111 | userid, 112 | query, 113 | trim(query_label::varchar) query_label, 114 | xid, 115 | pid, 116 | service_class, 117 | starttime, 118 | endtime, 119 | tables_scanned, 120 | querytxt , 121 | partition_0::varchar analysis_timestamp 122 | FROM redshift_config_comparison.comparison_stats_provisioned q 123 | where partition_0 = (SELECT 124 | MAX(partition_0) 125 | FROM 126 | redshift_config_comparison.comparison_stats_provisioned) 127 | --AND (query_label like '{query_label_prefix}%' or q.querytxt LIKE '%Replay source file%') 128 | AND q.querytxt LIKE '%replay_start%' 129 | with no schema binding; 130 | 131 | 132 | 133 | drop view if exists 
public.redshift_config_comparison_aggregate cascade; 134 | 135 | 136 | create or replace view public.redshift_config_comparison_aggregate as 137 | SELECT case when query_label like '{query_label_prefix}%' then query_label else 'simple-replay' end test_type 138 | ,cluster_identifier 139 | ,queue 140 | ,username 141 | , ROUND(SUM(r.total_query_time::NUMERIC) / ( 1000 * 1000 ) ,2) total_query_time_seconds 142 | , ROUND(AVG(r.total_query_time::NUMERIC) / ( 1000 * 1000 ) ,2) mean_query_time_seconds 143 | , ROUND(( PERCENTILE_CONT(0.50) WITHIN GROUP( ORDER BY total_query_time) )::NUMERIC / ( 1000 * 1000 ),3) AS median_query_time_seconds 144 | , ROUND(MAX(total_query_time)::NUMERIC / ( 1000 * 1000 ),2) max_query_time_seconds 145 | , ROUND(( PERCENTILE_CONT(0.75) WITHIN GROUP( ORDER BY total_query_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct75_query_time_seconds 146 | , ROUND(( PERCENTILE_CONT(0.90) WITHIN GROUP( ORDER BY total_query_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct90_query_time_seconds 147 | , ROUND(( PERCENTILE_CONT(0.95) WITHIN GROUP( ORDER BY total_query_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct95_query_time_seconds 148 | ,SUM(cc_scaling) count_cc_scaling 149 | ,SUM(aborted) count_aborted 150 | ,SUM(1) count_queries 151 | , analysis_timestamp 152 | FROM public.redshift_config_comparison_raw r 153 | group by test_type, 154 | cluster_identifier, 155 | queue, 156 | username, 157 | analysis_timestamp 158 | with no schema binding 159 | ; 160 | 161 | 162 | drop view if exists public.redshift_config_comparison_results cascade; 163 | 164 | create or replace view public.redshift_config_comparison_results as 165 | with agg_data as (SELECT test_type, 166 | cluster_identifier, 167 | queue, 168 | username, 169 | total_query_time_seconds, 170 | max(total_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following) max_total_query_time_seconds, 171 | mean_query_time_seconds, 172 | max(mean_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_mean_query_time_seconds, 173 | median_query_time_seconds, 174 | max(median_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_median_query_time_seconds, 175 | max_query_time_seconds, 176 | max(max_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_max_query_time_seconds, 177 | pct75_query_time_seconds, 178 | max(pct75_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct75_query_time_seconds, 179 | pct90_query_time_seconds, 180 | max(pct90_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct90_query_time_seconds, 181 | pct95_query_time_seconds, 182 | max(pct95_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct95_query_time_seconds, 183 | count_cc_scaling, 184 | count_aborted, 185 | count_queries, 186 | analysis_timestamp 187 | FROM public.redshift_config_comparison_aggregate) 188 | select test_type, 189 | cluster_identifier, 190 | queue, 191 | username, 192 | total_query_time_seconds, 193 | round(((max_total_query_time_seconds-total_query_time_seconds)/case when total_query_time_seconds=0 then 1 else total_query_time_seconds end)*100)||'%' improvement_total_query_time, 194 | 
mean_query_time_seconds, 195 | round(((max_mean_query_time_seconds-mean_query_time_seconds)/case when mean_query_time_seconds=0 then 1 else mean_query_time_seconds end)*100)||'%' improvement_mean_query_time, 196 | median_query_time_seconds, 197 | round(((max_median_query_time_seconds-median_query_time_seconds)/case when median_query_time_seconds=0 then 1 else median_query_time_seconds end)*100)||'%' improvement_median_query_time, 198 | max_query_time_seconds, 199 | round(((max_max_query_time_seconds-max_query_time_seconds)/case when max_query_time_seconds=0 then 1 else max_query_time_seconds end)*100)||'%' improvement_max_query_time, 200 | pct75_query_time_seconds, 201 | round(((max_pct75_query_time_seconds-pct75_query_time_seconds)/case when pct75_query_time_seconds=0 then 1 else pct75_query_time_seconds end)*100)||'%' improvement_pct75_query_time, 202 | pct90_query_time_seconds, 203 | round(((max_pct90_query_time_seconds-pct90_query_time_seconds)/case when pct90_query_time_seconds=0 then 1 else pct90_query_time_seconds end)*100)||'%' improvement_pct90_query_time, 204 | pct95_query_time_seconds, 205 | round(((max_pct95_query_time_seconds-pct95_query_time_seconds)/case when pct95_query_time_seconds=0 then 1 else pct95_query_time_seconds end)*100)||'%' improvement_pct95_query_time, 206 | count_cc_scaling, 207 | count_aborted, 208 | count_queries, 209 | analysis_timestamp 210 | from agg_data 211 | with no schema binding; 212 | 213 | 214 | 215 | drop view if exists public.redshift_pricing cascade; 216 | 217 | create or replace view public.redshift_pricing as 218 | select case 219 | when termtype='OnDemand' then priceperunit*365*24 220 | when purchaseoption='No Upfront' and leasecontractlength='1yr' then priceperunit*365*24 221 | when purchaseoption='All Upfront' and leasecontractlength='1yr' and unit='Quantity' then priceperunit 222 | when purchaseoption='No Upfront' and leasecontractlength='3yr' then priceperunit*365*24 223 | when purchaseoption='All Upfront' and leasecontractlength='3yr' and unit='Quantity' then priceperunit/3 224 | end::bigint per_compute_node_yearly_cost, 225 | unit,location, "instance type" instance_type, termtype,leasecontractlength,purchaseoption,unit 226 | FROM redshift_config_comparison.pricing where "product family"='Compute Instance' 227 | and nvl(purchaseoption,'OnDemand') in ('OnDemand','All Upfront','No Upfront') 228 | and priceperunit>0 229 | with no schema binding; 230 | 231 | 232 | drop view if exists public.redshift_config_comparison_pricing cascade; 233 | 234 | create or replace view public.redshift_config_comparison_pricing as 235 | SELECT distinct 236 | c.node_type 237 | , c.number_of_nodes 238 | , NVL(p.termtype||'-'||p.leasecontractlength||'-'||p.purchaseoption,'On-Demand') options 239 | , p.per_compute_node_yearly_cost * c.number_of_nodes your_cluster_yearly_compute_cost 240 | , p.per_compute_node_yearly_cost 241 | FROM 242 | public.redshift_pricing p, 243 | redshift_config_comparison.cluster_config c 244 | WHERE p.instance_type = c.node_type 245 | AND p.location = c.REGION 246 | with no schema binding; 247 | 248 | 249 | 250 | unload ($$ 251 | select * from public.redshift_detailed_query_stats where starttime > to_timestamp('{what_if_timestamp}','YYYY-MM-DD-HH24-MI-SS') 252 | $$) to '{comparison_stats_s3_path}/{what_if_timestamp}/{cluster_identifier}/' 253 | FORMAT AS PARQUET ALLOWOVERWRITE iam_role '{redshift_iam_role}'; 254 | -------------------------------------------------------------------------------- 
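--
-- Note on the unload statement above: the curly-brace tokens ({what_if_timestamp},
-- {comparison_stats_s3_path}, {cluster_identifier}, {redshift_iam_role}, {query_label_prefix})
-- are template placeholders that are substituted at run time before the SQL is submitted to the
-- target endpoint. A minimal sketch of the substituted unload, using illustrative values only
-- (the bucket, timestamp, cluster name, and role ARN below are assumptions, not values taken
-- from this repository):
--
-- unload ($$
-- select * from public.redshift_detailed_query_stats
-- where starttime > to_timestamp('2021-08-28-11-15-00','YYYY-MM-DD-HH24-MI-SS')
-- $$) to 's3://node-config-compare-bucket/comparison_stats/2021-08-28-11-15-00/ra3-4xlarge-2/'
-- FORMAT AS PARQUET ALLOWOVERWRITE iam_role 'arn:aws:iam::123456789012:role/<redshift-iam-role>';
--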
/serverless-v2/configurations/gather_comparison_stats_serverless.sql: -------------------------------------------------------------------------------- 1 | unload ($$ 2 | select a.*,Trim(u.usename) as username from sys_query_history a , pg_user u 3 | where a.user_id = u.usesysid 4 | and a.start_time > to_timestamp('{what_if_timestamp}','YYYY-MM-DD-HH24-MI-SS') 5 | $$) to '{comparison_stats_s3_path}/{what_if_timestamp}/{cluster_identifier}/' 6 | FORMAT AS PARQUET ALLOWOVERWRITE iam_role '{redshift_iam_role}'; 7 | 8 | 9 | create or replace view public.redshift_config_comparison_raw as 10 | SELECT 11 | partition_1 cluster_identifier, 12 | SHA2(split_part(query_text,'}}',2),256) as query_hash, 13 | round( execution_time / ( 1000 * 1000 ) ,2) exec_time_seconds, 14 | round( elapsed_time / ( 1000 * 1000 ) ,2) elasped_time_seconds, 15 | round( queue_time / ( 1000 * 1000 ) ,2) queue_time_seconds, 16 | user_id, 17 | username, 18 | database_name, 19 | queue_time, 20 | execution_time, 21 | elapsed_time, 22 | query_id, 23 | trim(query_label::varchar) query_label, 24 | transaction_id, 25 | session_id, 26 | result_cache_hit, 27 | start_time, 28 | end_time, 29 | error_message, 30 | query_text , 31 | status, 32 | partition_0::varchar analysis_timestamp 33 | FROM redshift_config_comparison.comparison_stats q 34 | where partition_0 = (SELECT 35 | MAX(partition_0) 36 | FROM 37 | redshift_config_comparison.comparison_stats) 38 | --AND (query_label like '{query_label_prefix}%' or q.querytxt LIKE '%Replay source file%') 39 | AND q.query_text LIKE '%replay_start%' 40 | with no schema binding; 41 | 42 | 43 | create or replace view public.redshift_config_comparison_aggregate as 44 | SELECT case when query_label like '{query_label_prefix}%' then query_label else 'simple-replay' end test_type 45 | ,cluster_identifier 46 | ,username 47 | , ROUND(SUM(r.elapsed_time::NUMERIC) / ( 1000 * 1000 ) ,2) total_query_time_seconds 48 | , ROUND(AVG(r.elapsed_time::NUMERIC) / ( 1000 * 1000 ) ,2) mean_query_time_seconds 49 | ,Percentile_cont(1.0) within group(ORDER BY elapsed_time) AS max_query_time_seconds 50 | , ROUND(( PERCENTILE_CONT(0.50) WITHIN GROUP( ORDER BY elapsed_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct50_query_time_seconds 51 | , ROUND(( PERCENTILE_CONT(0.75) WITHIN GROUP( ORDER BY elapsed_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct75_query_time_seconds 52 | , ROUND(( PERCENTILE_CONT(0.90) WITHIN GROUP( ORDER BY elapsed_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct90_query_time_seconds 53 | , ROUND(( PERCENTILE_CONT(0.95) WITHIN GROUP( ORDER BY elapsed_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct95_query_time_seconds 54 | ,ROUND(( PERCENTILE_CONT(0.99) WITHIN GROUP( ORDER BY elapsed_time) )::NUMERIC / ( 1000 * 1000 ),3) AS pct99_query_time_seconds 55 | ,SUM(1) count_queries 56 | , analysis_timestamp 57 | FROM public.redshift_config_comparison_raw r 58 | group by test_type, 59 | cluster_identifier, 60 | username, 61 | analysis_timestamp 62 | with no schema binding 63 | ; 64 | 65 | drop view if exists public.redshift_config_comparison_results cascade; 66 | 67 | create or replace view public.redshift_config_comparison_results as 68 | with agg_data as (SELECT test_type, 69 | cluster_identifier, 70 | username, 71 | total_query_time_seconds, 72 | max(total_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following) max_total_query_time_seconds, 73 | mean_query_time_seconds, 74 | max(mean_query_time_seconds) over (partition by test_type,username rows between unbounded 
preceding and unbounded following ) max_mean_query_time_seconds, 75 | pct50_query_time_seconds, 76 | max(pct50_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct50_query_time_seconds, 77 | max_query_time_seconds, 78 | max(max_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_max_query_time_seconds, 79 | pct75_query_time_seconds, 80 | max(pct75_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct75_query_time_seconds, 81 | pct90_query_time_seconds, 82 | max(pct90_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct90_query_time_seconds, 83 | pct95_query_time_seconds, 84 | max(pct95_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct95_query_time_seconds, 85 | pct99_query_time_seconds, 86 | max(pct99_query_time_seconds) over (partition by test_type,username rows between unbounded preceding and unbounded following ) max_pct99_query_time_seconds, 87 | count_queries, 88 | analysis_timestamp 89 | FROM public.redshift_config_comparison_aggregate) 90 | select test_type, 91 | cluster_identifier, 92 | username, 93 | total_query_time_seconds, 94 | round(((max_total_query_time_seconds-total_query_time_seconds)/case when total_query_time_seconds=0 then 1 else total_query_time_seconds end)*100)||'%' improvement_total_query_time, 95 | mean_query_time_seconds, 96 | round(((max_mean_query_time_seconds-mean_query_time_seconds)/case when mean_query_time_seconds=0 then 1 else mean_query_time_seconds end)*100)||'%' improvement_mean_query_time, 97 | pct50_query_time_seconds, 98 | round(((max_pct50_query_time_seconds-pct50_query_time_seconds)/case when pct50_query_time_seconds=0 then 1 else pct50_query_time_seconds end)*100)||'%' improvement_pct50_query_time, 99 | max_query_time_seconds, 100 | round(((max_max_query_time_seconds-max_query_time_seconds)/case when max_query_time_seconds=0 then 1 else max_query_time_seconds end)*100)||'%' improvement_max_query_time, 101 | pct75_query_time_seconds, 102 | round(((max_pct75_query_time_seconds-pct75_query_time_seconds)/case when pct75_query_time_seconds=0 then 1 else pct75_query_time_seconds end)*100)||'%' improvement_pct75_query_time, 103 | pct90_query_time_seconds, 104 | round(((max_pct90_query_time_seconds-pct90_query_time_seconds)/case when pct90_query_time_seconds=0 then 1 else pct90_query_time_seconds end)*100)||'%' improvement_pct90_query_time, 105 | pct95_query_time_seconds, 106 | round(((max_pct95_query_time_seconds-pct95_query_time_seconds)/case when pct95_query_time_seconds=0 then 1 else pct95_query_time_seconds end)*100)||'%' improvement_pct95_query_time, 107 | pct99_query_time_seconds, 108 | round(((max_pct99_query_time_seconds-pct99_query_time_seconds)/case when pct99_query_time_seconds=0 then 1 else pct99_query_time_seconds end)*100)||'%' improvement_pct99_query_time, 109 | count_queries, 110 | analysis_timestamp 111 | from agg_data 112 | with no schema binding; -------------------------------------------------------------------------------- /serverless-v2/configurations/parameter_group_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "Parameters": [{ 3 | "ParameterName": "auto_analyze", 4 | "ParameterValue": "true", 5 | "Description": "Use auto 
analyze", 6 | "Source": "user", 7 | "DataType": "boolean", 8 | "AllowedValues": "true,false", 9 | "ApplyType": "static", 10 | "IsModifiable": true 11 | }, { 12 | "ParameterName": "datestyle", 13 | "ParameterValue": "ISO, MDY", 14 | "Description": "Sets the display format for date and time values.", 15 | "Source": "engine-default", 16 | "DataType": "string", 17 | "ApplyType": "static", 18 | "IsModifiable": true 19 | }, { 20 | "ParameterName": "enable_user_activity_logging", 21 | "ParameterValue": "true", 22 | "Description": "parameter for audit logging purpose", 23 | "Source": "user", 24 | "DataType": "boolean", 25 | "AllowedValues": "true,false", 26 | "ApplyType": "static", 27 | "IsModifiable": true 28 | }, { 29 | "ParameterName": "extra_float_digits", 30 | "ParameterValue": "0", 31 | "Description": "Sets the number of digits displayed for floating-point values", 32 | "Source": "engine-default", 33 | "DataType": "integer", 34 | "AllowedValues": "-15-2", 35 | "ApplyType": "static", 36 | "IsModifiable": true 37 | }, { 38 | "ParameterName": "max_concurrency_scaling_clusters", 39 | "ParameterValue": "1", 40 | "Description": "The maximum concurrency scaling clusters can be used.", 41 | "Source": "engine-default", 42 | "DataType": "integer", 43 | "AllowedValues": "0-10", 44 | "ApplyType": "static", 45 | "IsModifiable": true 46 | }, { 47 | "ParameterName": "max_cursor_result_set_size", 48 | "ParameterValue": "default", 49 | "Description": "Sets the max cursor result set size", 50 | "Source": "engine-default", 51 | "DataType": "integer", 52 | "AllowedValues": "0-14400000", 53 | "ApplyType": "static", 54 | "IsModifiable": true 55 | }, { 56 | "ParameterName": "query_group", 57 | "ParameterValue": "default", 58 | "Description": "This parameter applies a user-defined label to a group of queries that are run during the same session..", 59 | "Source": "engine-default", 60 | "DataType": "string", 61 | "ApplyType": "static", 62 | "IsModifiable": true 63 | }, { 64 | "ParameterName": "require_ssl", 65 | "ParameterValue": "true", 66 | "Description": "require ssl for all databaseconnections", 67 | "Source": "user", 68 | "DataType": "boolean", 69 | "AllowedValues": "true,false", 70 | "ApplyType": "static", 71 | "IsModifiable": true 72 | }, { 73 | "ParameterName": "search_path", 74 | "ParameterValue": "$user, public", 75 | "Description": "Sets the schema search order for names that are not schema-qualified.", 76 | "Source": "engine-default", 77 | "DataType": "string", 78 | "ApplyType": "static", 79 | "IsModifiable": true 80 | }, { 81 | "ParameterName": "statement_timeout", 82 | "ParameterValue": "86400000", 83 | "Description": "Aborts any statement that takes over the specified number of milliseconds.", 84 | "Source": "user", 85 | "DataType": "integer", 86 | "AllowedValues": "0,100-2147483647", 87 | "ApplyType": "static", 88 | "IsModifiable": true 89 | }, { 90 | "ParameterName": "use_fips_ssl", 91 | "ParameterValue": "false", 92 | "Description": "Use fips ssl library", 93 | "Source": "engine-default", 94 | "DataType": "boolean", 95 | "AllowedValues": "true,false", 96 | "ApplyType": "static", 97 | "IsModifiable": true 98 | }, { 99 | "ParameterName": "wlm_json_configuration", 100 | "ParameterValue": "[ { \"query_group\" : [ ],\"query_group_wild_card\" : 0,\"user_group\" : [ ],\"user_group_wild_card\" : 0,\"concurrency_scaling\" : \"off\",\"rules\" : [ { \"rule_name\" : \"DiskSpilling\", \"predicate\" : [ { \"metric_name\" : \"query_temp_blocks_to_disk\", \"operator\" : \">\", \"value\" : 100000 } ], \"action\" : 
\"log\"}, { \"rule_name\" : \"QueryRunningMoreThan30min\", \"predicate\" : [ { \"metric_name\" : \"query_execution_time\", \"operator\" : \">\", \"value\" : 1800 } ], \"action\" : \"log\"} ],\"priority\" : \"normal\",\"queue_type\" : \"auto\",\"auto_wlm\" : true }, {\"short_query_queue\" : true } ]", 101 | "Description": "wlm json configuration", 102 | "Source": "user", 103 | "DataType": "string", 104 | "ApplyType": "static", 105 | "IsModifiable": true 106 | }], 107 | "ResponseMetadata": { 108 | "RequestId": "fed328c9-52c8-4bba-a255-71b8c5cddded", 109 | "HTTPStatusCode": 200, 110 | "HTTPHeaders": { 111 | "x-amzn-requestid": "fed328c9-52c8-4bba-a255-71b8c5cddded", 112 | "content-type": "text/xml", 113 | "content-length": "6168", 114 | "vary": "accept-encoding", 115 | "date": "Wed, 31 Mar 2021 20:29:02 GMT" 116 | }, 117 | "RetryAttempts": 0 118 | } 119 | } 120 | -------------------------------------------------------------------------------- /serverless-v2/configurations/performance_test_bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | # This script bootstraps a base amazonlinux image to run the Redshift 3 | # Node Config concurrency test. 4 | # 1. Install the AWS CLI, Python3, and necessary Python libraries. 5 | # 2. Copy Python program source for concurrency test 6 | # 3. Execute that Python program 7 | # We expect all configuration to be defined as environment variables 8 | # for the Batch job. 9 | 10 | set -eu 11 | 12 | yum install -y awscli python3 13 | pip3 install boto3 psycopg2-binary pandas sqlalchemy 14 | 15 | aws s3 cp "$PYTHON_SCRIPT" ./script.py 16 | 17 | # This Python program requires these environment variables to be set: 18 | # `$SQL_SCRIPT_S3_PATH`, `$REDSHIFT_CLUSTER_ENDPOINT`, 19 | # `$REDSHIFT_IAM_ROLE`, `$BUCKET_NAME`, `$REDSHIFT_USER_NAME` 20 | python3 ./script.py 21 | -------------------------------------------------------------------------------- /serverless-v2/configurations/populate_comparison_results.sql: -------------------------------------------------------------------------------- 1 | unload ($$ 2 | select * from public.redshift_config_comparison_raw 3 | $$) to '{raw_comparison_results_s3_path}/{what_if_timestamp}/' 4 | FORMAT AS CSV HEADER ALLOWOVERWRITE iam_role '{redshift_iam_role}'; 5 | 6 | 7 | unload ($$ 8 | select * from public.redshift_config_comparison_results 9 | $$) to '{comparison_results_s3_path}/{what_if_timestamp}/' 10 | parallel off FORMAT AS CSV HEADER ALLOWOVERWRITE iam_role '{redshift_iam_role}'; 11 | -------------------------------------------------------------------------------- /serverless-v2/configurations/redshift-performance-test.py: -------------------------------------------------------------------------------- 1 | import json 2 | import boto3 3 | import psycopg2 4 | import time 5 | import pandas 6 | from sqlalchemy import create_engine 7 | from sqlalchemy import text 8 | from concurrent.futures import ThreadPoolExecutor 9 | from concurrent.futures import as_completed 10 | from urllib.parse import quote_plus as urlquote 11 | import urllib 12 | import re 13 | import os 14 | 15 | SQL_SCRIPT_S3_PATH=os.environ['SQL_SCRIPT_S3_PATH'] 16 | REDSHIFT_CLUSTER_ENDPOINT=os.environ['REDSHIFT_CLUSTER_ENDPOINT'] 17 | REDSHIFT_IAM_ROLE=os.environ['REDSHIFT_IAM_ROLE'] 18 | BUCKET_NAME=os.environ['SQL_SCRIPT_S3_PATH'] 19 | REDSHIFT_USER_NAME=os.environ['REDSHIFT_USER_NAME'] 20 | NUMBER_OF_PARALLEL_SESSIONS_LIST=os.environ['NUMBER_OF_PARALLEL_SESSIONS_LIST'] 21 | 
DISABLE_RESULT_CACHE=os.environ['DISABLE_RESULT_CACHE'] 22 | DEFAULT_OUTPUT_LIMIT=os.environ['DEFAULT_OUTPUT_LIMIT'] 23 | MAX_NUMBER_OF_QUERIES=os.environ['MAX_NUMBER_OF_QUERIES'] 24 | MAX_PARALLEL_SESSIONS=os.environ['MAX_PARALLEL_SESSIONS'] 25 | QUERY_LABEL_PREFIX=os.environ['QUERY_LABEL_PREFIX'] 26 | 27 | 28 | def connect_to_redshift(host,username): 29 | client = boto3.client('redshift') 30 | cluster_creds = client.get_cluster_credentials(DbUser=username, 31 | DbName=REDSHIFT_CLUSTER_ENDPOINT.split('/')[1], 32 | ClusterIdentifier=REDSHIFT_CLUSTER_ENDPOINT.split('.')[0]) 33 | 34 | 35 | connection_string='postgresql://'+ urlquote(cluster_creds['DbUser']) + ':'+ urlquote(cluster_creds['DbPassword']) + '@'+ REDSHIFT_CLUSTER_ENDPOINT 36 | return create_engine(connection_string,pool_size=0, max_overflow=-1) 37 | 38 | def get_json_config_from_s3(script_s3_path): 39 | bucket, key = script_s3_path.replace("s3://", "").split("/", 1) 40 | obj = boto3.client('s3').get_object(Bucket=bucket, Key=key) 41 | return json.loads(obj['Body'].read().decode('utf-8')) 42 | 43 | 44 | def get_sql_scripts_from_s3(): 45 | 46 | bucket, key = SQL_SCRIPT_S3_PATH.replace("s3://", "").split("/", 1) 47 | obj = boto3.client('s3').get_object(Bucket=bucket, Key=key) 48 | script = obj['Body'].read().decode('utf-8') 49 | script = script.format(redshift_iam_role=REDSHIFT_IAM_ROLE, bucket_name=BUCKET_NAME) 50 | split_scripts = script.split(';')[:-1] 51 | if len(split_scripts)>int(MAX_NUMBER_OF_QUERIES): 52 | split_scripts=split_scripts[0:int(MAX_NUMBER_OF_QUERIES)] 53 | return split_scripts 54 | 55 | 56 | def get_sql(engine, number_of_parallel_sessions): 57 | sql_script = "" 58 | 59 | pattern = re.compile(r'limit[\s|\t|\n]+[\d]+[\s]*$', re.IGNORECASE) 60 | for query in get_sql_scripts_from_s3(): 61 | if not re.search(pattern, query): 62 | query += " limit " + DEFAULT_OUTPUT_LIMIT 63 | sql_script+=query + ";\n" 64 | 65 | if DISABLE_RESULT_CACHE=='true': 66 | sql_script = "set enable_result_cache_for_session to false;\n" + sql_script 67 | 68 | sql_script = "set query_group to '" + QUERY_LABEL_PREFIX + str(number_of_parallel_sessions) + "';\n" + sql_script 69 | 70 | df = pandas.read_sql(text(sql_script), engine) 71 | return df 72 | 73 | 74 | 75 | def run_concurrency_test(number_of_parallel_sessions): 76 | engine=connect_to_redshift(REDSHIFT_CLUSTER_ENDPOINT,REDSHIFT_USER_NAME) 77 | start_time = time.time() 78 | try: 79 | with ThreadPoolExecutor(max_workers=number_of_parallel_sessions) as executor: 80 | futures = [] 81 | for _ in range(number_of_parallel_sessions): 82 | futures.append(executor.submit( 83 | get_sql, engine, number_of_parallel_sessions)) 84 | for future in as_completed(futures): 85 | rs = future.result() 86 | 87 | except Exception as e: 88 | raise e 89 | elapsed_time_in_secs = (time.time() - start_time) 90 | print("--- %s seconds ---" % elapsed_time_in_secs) 91 | return elapsed_time_in_secs 92 | 93 | print(f'script:{SQL_SCRIPT_S3_PATH}, cluster:{REDSHIFT_CLUSTER_ENDPOINT},role:{REDSHIFT_IAM_ROLE},bucket:{BUCKET_NAME},user:{REDSHIFT_USER_NAME},sessions:{NUMBER_OF_PARALLEL_SESSIONS_LIST}') 94 | for sessions in NUMBER_OF_PARALLEL_SESSIONS_LIST.split(','): 95 | number_of_parallel_sessions=int(sessions) 96 | if number_of_parallel_sessions <= int(MAX_PARALLEL_SESSIONS): 97 | print(f'running {number_of_parallel_sessions} parallel threads ..') 98 | run_concurrency_test(number_of_parallel_sessions) 99 | else: 100 | print(f'parallel sessions {number_of_parallel_sessions} exceeds maximum allowed {MAX_PARALLEL_SESSIONS} ..') 
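# A minimal usage sketch (the values below are illustrative assumptions, not defaults shipped
# with this repository): the concurrency test is driven entirely by the environment variables
# read via os.environ at the top of this script, e.g.
#
#   export SQL_SCRIPT_S3_PATH=s3://node-config-compare-bucket/test_queries.sql
#   export REDSHIFT_CLUSTER_ENDPOINT=<endpoint>:5439/<database>
#   export REDSHIFT_IAM_ROLE=<redshift-iam-role-arn>
#   export REDSHIFT_USER_NAME=<user>
#   export NUMBER_OF_PARALLEL_SESSIONS_LIST=1,5,10
#   export DISABLE_RESULT_CACHE=true
#   export DEFAULT_OUTPUT_LIMIT=1000
#   export MAX_NUMBER_OF_QUERIES=100
#   export MAX_PARALLEL_SESSIONS=30
#   export QUERY_LABEL_PREFIX=concurrency-
#   python3 redshift-performance-test.py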
101 | -------------------------------------------------------------------------------- /serverless-v2/configurations/replay_bootstrap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -e 3 | echo "bucket_name: $BUCKET_NAME" 4 | echo "simple_replay_overwrite_s3_path: $SIMPLE_REPLAY_OVERWRITE_S3_PATH" 5 | echo "redshift_user_name: $REDSHIFT_USER_NAME" 6 | echo "what_if_timestamp: $WHAT_IF_TIMESTAMP" 7 | echo "extract_prefix: $EXTRACT_PREFIX" 8 | echo "replay_prefix: $REPLAY_PREFIX" 9 | echo "script_prefix: $SCRIPT_PREFIX" 10 | echo "redshift_iam_role: $REDSHIFT_IAM_ROLE" 11 | echo "workload_location: $WORKLOAD_LOCATION" 12 | echo "cluster_endpoint: $CLUSTER_ENDPOINT" 13 | echo "cluster_identifier: $CLUSTER_IDENTIFIER" 14 | echo "snapshot_account_id: $SNAPSHOT_ACCOUNT_ID" 15 | account_id=`aws sts get-caller-identity --query Account --output text` 16 | echo "account_id: $account_id" 17 | source_tag="NodeConfig" 18 | echo "source_tag: $source_tag" 19 | echo "endpoint_type: $ENDPOINT_TYPE" 20 | TARGET_CLUSTER_REGION=$(echo $CLUSTER_ENDPOINT | cut -f3 -d'.') 21 | ##region = os.environ['AWS_REGION'] 22 | yum update -y 23 | yum -y install git 24 | yum -y install python3 25 | yum -y install python3-pip 26 | yum -y install aws-cfn-bootstrap 27 | yum -y install gcc gcc-c++ python3 python3-devel unixODBC unixODBC-devel 28 | mkdir /amazonutils 29 | cd /amazonutils 30 | git clone https://github.com/awslabs/amazon-redshift-utils.git 31 | pip3 install -r /amazonutils/amazon-redshift-utils/src/SimpleReplay/requirements.txt 32 | # 33 | # configure extract replay metadata 34 | # 35 | cd /amazonutils/amazon-redshift-utils/src/SimpleReplay 36 | if [[ "$SIMPLE_REPLAY_OVERWRITE_S3_PATH" != "N/A" ]]; then 37 | aws s3 cp $SIMPLE_REPLAY_OVERWRITE_S3_PATH replay.yaml 38 | fi 39 | 40 | sed -i "s#source_tag: \".*\"#source_tag: \"$source_tag\"#g" replay.yaml 41 | sed -i "s#master_username: \".*\"#master_username: \"$REDSHIFT_USER_NAME\"#g" replay.yaml 42 | #sed -i "s#execute_unload_statements: \"false\"#execute_unload_statements: \"true\"#g" replay.yaml 43 | #sed -i "s#unload_iam_role: \".*\"#unload_iam_role: \"$REDSHIFT_IAM_ROLE\"#g" replay.yaml 44 | sed -i "s#workload_location: \".*\"#workload_location: \"$WORKLOAD_LOCATION\"#g" replay.yaml 45 | sed -i "s#target_cluster_endpoint: \".*\"#target_cluster_endpoint: \"$CLUSTER_ENDPOINT\"#g" replay.yaml 46 | sed -i "s#replay_output: \".*\"#replay_output: \"s3://$BUCKET_NAME/$REPLAY_PREFIX/$WHAT_IF_TIMESTAMP/$CLUSTER_IDENTIFIER\"#g" replay.yaml 47 | sed -i "s#target_cluster_region: \".*\"#target_cluster_region: \"$TARGET_CLUSTER_REGION\"#g" replay.yaml 48 | if [[ "$account_id" == "$SNAPSHOT_ACCOUNT_ID" ]]; then 49 | sed -i "s#execute_copy_statements: \"false\"#execute_copy_statements: \"true\"#g" replay.yaml 50 | aws s3 cp $WORKLOAD_LOCATION/copy_replacements.csv . || true 51 | sed -z -i "s#,,\n#,,$REDSHIFT_IAM_ROLE\n#g" copy_replacements.csv || true 52 | aws s3 cp copy_replacements.csv $WORKLOAD_LOCATION/copy_replacements.csv || true 53 | fi 54 | aws s3 cp replay.yaml s3://$BUCKET_NAME/$SCRIPT_PREFIX/replay_$CLUSTER_IDENTIFIER.yaml 55 | python3 replay.py replay.yaml 56 | if [[ $ENDPOINT_TYPE == 'SERVERLESS' ]]; then 57 | aws s3 cp s3://$BUCKET_NAME/$SCRIPT_PREFIX/system_config.json . 58 | aws s3 cp s3://$BUCKET_NAME/$SCRIPT_PREFIX/create_external_schema.py . 
59 | python3 create_external_schema.py 60 | fi -------------------------------------------------------------------------------- /serverless-v2/images/architecure-serverless.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/serverless-v2/images/architecure-serverless.png -------------------------------------------------------------------------------- /serverless-v2/images/batch-cw-log-group.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/serverless-v2/images/batch-cw-log-group.png -------------------------------------------------------------------------------- /serverless-v2/images/redshift-clusters-provisioned.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/serverless-v2/images/redshift-clusters-provisioned.png -------------------------------------------------------------------------------- /serverless-v2/images/redshift-clusters-serverless.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/serverless-v2/images/redshift-clusters-serverless.png -------------------------------------------------------------------------------- /serverless-v2/images/redshift-clusters.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/serverless-v2/images/redshift-clusters.png -------------------------------------------------------------------------------- /serverless-v2/images/statemachine-log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/serverless-v2/images/statemachine-log.png -------------------------------------------------------------------------------- /serverless-v2/images/statemachine.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aws-samples/amazon-redshift-config-compare/23138cfdb68d939e5f25dcbaaf2452ff0861d56f/serverless-v2/images/statemachine.png -------------------------------------------------------------------------------- /serverless-v2/user_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "SNAPSHOT_ID": "redshift-cluster-manual-snapshot", 3 | "SNAPSHOT_ACCOUNT_ID": "123456789012", 4 | 5 | "PARAMETER_GROUP_CONFIG_S3_PATH": "s3://node-config-compare-bucket/pg_config.json", 6 | 7 | "DDL_AND_COPY_SCRIPT_S3_PATH": "s3://node-config-compare-bucket/ddl.sql", 8 | "SQL_SCRIPT_S3_PATH":"s3://node-config-compare-bucket/test_queries.sql", 9 | "NUMBER_OF_PARALLEL_SESSIONS_LIST": "1", 10 | 11 | "SIMPLE_REPLAY_LOG_LOCATION":"s3://redshift-logging-xxxxxxxx/RSLogs/", 12 | "SIMPLE_REPLAY_EXTRACT_START_TIME":"2021-08-28T11:15:00+00:00", 13 | "SIMPLE_REPLAY_EXTRACT_END_TIME":"2021-08-28T12:00:00+00:00", 14 | 15 | "SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH":"N/A", 16 | 
"SIMPLE_REPLAY_OVERWRITE_S3_PATH":"N/A", 17 | 18 | "AUTO_PAUSE": true, 19 | "DATABASE_NAME": "database_name", 20 | 21 | "CONFIGURATIONS": [ 22 | { 23 | "TYPE": "Provisioned", 24 | "NODE_TYPE": "dc2.8xlarge", 25 | "NUMBER_OF_NODES": "2", 26 | "WLM_CONFIG_S3_PATH": "N/A" 27 | }, 28 | { 29 | "TYPE": "Provisioned", 30 | "NODE_TYPE": "ra3.4xlarge", 31 | "NUMBER_OF_NODES": "4", 32 | "WLM_CONFIG_S3_PATH": "N/A" 33 | }, 34 | { 35 | "TYPE": "Provisioned", 36 | "NODE_TYPE": "ra3.4xlarge", 37 | "NUMBER_OF_NODES": "4", 38 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/wlmconfig.json" 39 | }, 40 | { 41 | "TYPE": "Serverless", 42 | "BASE_RPU": "64" 43 | }, 44 | { 45 | "TYPE": "Serverless", 46 | "BASE_RPU": "128" 47 | } 48 | ] 49 | } 50 | 51 | -------------------------------------------------------------------------------- /test-cases/ddl.sql: -------------------------------------------------------------------------------- 1 | create table if not exists example_table 2 | (id INTEGER IDENTITY(1, 1) NOT NULL, column_value varchar(10), insert_timestamp timestamp default sysdate); 3 | 4 | insert into example_table (column_value) values('data'); 5 | -------------------------------------------------------------------------------- /test-cases/parameter_group_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "Parameters": [ 3 | { 4 | "ParameterName": "auto_analyze", 5 | "ParameterValue": "true", 6 | "Description": "Use auto analyze", 7 | "Source": "engine-default", 8 | "DataType": "boolean", 9 | "AllowedValues": "true,false", 10 | "ApplyType": "static", 11 | "IsModifiable": true 12 | }, 13 | { 14 | "ParameterName": "datestyle", 15 | "ParameterValue": "ISO, MDY", 16 | "Description": "Sets the display format for date and time values.", 17 | "Source": "engine-default", 18 | "DataType": "string", 19 | "ApplyType": "static", 20 | "IsModifiable": true 21 | }, 22 | { 23 | "ParameterName": "enable_case_sensitive_identifier", 24 | "ParameterValue": "false", 25 | "Description": "Preserve case sensitivity for database identifiers such as table or column names in parser", 26 | "Source": "engine-default", 27 | "DataType": "boolean", 28 | "AllowedValues": "true,false", 29 | "ApplyType": "static", 30 | "IsModifiable": true 31 | }, 32 | { 33 | "ParameterName": "enable_user_activity_logging", 34 | "ParameterValue": "false", 35 | "Description": "parameter for audit logging purpose", 36 | "Source": "user", 37 | "DataType": "boolean", 38 | "AllowedValues": "true,false", 39 | "ApplyType": "static", 40 | "IsModifiable": true 41 | }, 42 | { 43 | "ParameterName": "extra_float_digits", 44 | "ParameterValue": "0", 45 | "Description": "Sets the number of digits displayed for floating-point values", 46 | "Source": "engine-default", 47 | "DataType": "integer", 48 | "AllowedValues": "-15-2", 49 | "ApplyType": "static", 50 | "IsModifiable": true 51 | }, 52 | { 53 | "ParameterName": "max_concurrency_scaling_clusters", 54 | "ParameterValue": "2", 55 | "Description": "The maximum concurrency scaling clusters can be used.", 56 | "Source": "user", 57 | "DataType": "integer", 58 | "AllowedValues": "0-10", 59 | "ApplyType": "static", 60 | "IsModifiable": true 61 | }, 62 | { 63 | "ParameterName": "max_cursor_result_set_size", 64 | "ParameterValue": "default", 65 | "Description": "Sets the max cursor result set size", 66 | "Source": "engine-default", 67 | "DataType": "integer", 68 | "AllowedValues": "0-14400000", 69 | "ApplyType": "static", 70 | "IsModifiable": true 71 | }, 72 | { 73 | 
"ParameterName": "query_group", 74 | "ParameterValue": "default", 75 | "Description": "This parameter applies a user-defined label to a group of queries that are run during the same session..", 76 | "Source": "engine-default", 77 | "DataType": "string", 78 | "ApplyType": "static", 79 | "IsModifiable": true 80 | }, 81 | { 82 | "ParameterName": "require_ssl", 83 | "ParameterValue": "true", 84 | "Description": "require ssl for all databaseconnections", 85 | "Source": "user", 86 | "DataType": "boolean", 87 | "AllowedValues": "true,false", 88 | "ApplyType": "static", 89 | "IsModifiable": true 90 | }, 91 | { 92 | "ParameterName": "search_path", 93 | "ParameterValue": "$user, public", 94 | "Description": "Sets the schema search order for names that are not schema-qualified.", 95 | "Source": "engine-default", 96 | "DataType": "string", 97 | "ApplyType": "static", 98 | "IsModifiable": true 99 | }, 100 | { 101 | "ParameterName": "statement_timeout", 102 | "ParameterValue": "0", 103 | "Description": "Aborts any statement that takes over the specified number of milliseconds.", 104 | "Source": "engine-default", 105 | "DataType": "integer", 106 | "AllowedValues": "0,100-2147483647", 107 | "ApplyType": "static", 108 | "IsModifiable": true 109 | }, 110 | { 111 | "ParameterName": "use_fips_ssl", 112 | "ParameterValue": "false", 113 | "Description": "Use fips ssl library", 114 | "Source": "engine-default", 115 | "DataType": "boolean", 116 | "AllowedValues": "true,false", 117 | "ApplyType": "static", 118 | "IsModifiable": true 119 | }, 120 | { 121 | "ParameterName": "wlm_json_configuration", 122 | "ParameterValue": "[{\"auto_wlm\":true}]", 123 | "Description": "wlm json configuration", 124 | "Source": "engine-default", 125 | "DataType": "string", 126 | "ApplyType": "static", 127 | "IsModifiable": true 128 | } 129 | ] 130 | } 131 | -------------------------------------------------------------------------------- /test-cases/source-wlm.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query_group": [], 4 | "query_group_wild_card": 0, 5 | "user_group": [], 6 | "user_group_wild_card": 0, 7 | "concurrency_scaling": "off", 8 | "rules": [ 9 | { 10 | "rule_name": "DiskSpilling", 11 | "predicate": [ 12 | { 13 | "metric_name": "query_temp_blocks_to_disk", 14 | "operator": ">", 15 | "value": 100000 16 | } 17 | ], 18 | "action": "log" 19 | }, 20 | { 21 | "rule_name": "QueryRunningMoreThan30min", 22 | "predicate": [ 23 | { 24 | "metric_name": "query_execution_time", 25 | "operator": ">", 26 | "value": 1800 27 | } 28 | ], 29 | "action": "log" 30 | } 31 | ], 32 | "priority": "normal", 33 | "queue_type": "auto", 34 | "auto_wlm": true 35 | }, 36 | { 37 | "short_query_queue": true 38 | } 39 | ] 40 | -------------------------------------------------------------------------------- /test-cases/test_queries.sql: -------------------------------------------------------------------------------- 1 | --first_query 2 | 3 | SELECT 4 | s_acctbal 5 | , s_name 6 | , n_name 7 | , p_partkey 8 | , p_mfgr 9 | , s_address 10 | , s_phone 11 | , s_comment 12 | FROM 13 | part, 14 | supplier, 15 | partsupp, 16 | nation, 17 | REGION 18 | WHERE p_partkey = ps_partkey 19 | AND s_suppkey = ps_suppkey 20 | AND p_size = 34 21 | AND p_type LIKE '%COPPER' 22 | AND s_nationkey = n_nationkey 23 | AND n_regionkey = r_regionkey 24 | AND r_name = 'MIDDLE EAST' 25 | AND ps_supplycost = (SELECT 26 | MIN(ps_supplycost) 27 | FROM 28 | partsupp, 29 | supplier, 30 | nation, 31 | REGION 32 | WHERE p_partkey = 
ps_partkey 33 | AND s_suppkey = ps_suppkey 34 | AND s_nationkey = n_nationkey 35 | AND n_regionkey = r_regionkey 36 | AND r_name = 'MIDDLE EAST') 37 | ORDER BY 38 | s_acctbal DESC 39 | , n_name 40 | , s_name 41 | , p_partkey ; 42 | 43 | --second_query 44 | 45 | SELECT 46 | ps_partkey 47 | , SUM(ps_supplycost * ps_availqty) AS value 48 | FROM 49 | partsupp, 50 | supplier, 51 | nation 52 | WHERE ps_suppkey = s_suppkey 53 | AND s_nationkey = n_nationkey 54 | AND n_name = 'SAUDI ARABIA' 55 | GROUP BY 56 | ps_partkey 57 | HAVING 58 | SUM(ps_supplycost * ps_availqty) > (SELECT 59 | SUM(ps_supplycost * ps_availqty) * 0.0000000333 60 | FROM 61 | partsupp, 62 | supplier, 63 | nation 64 | WHERE ps_suppkey = s_suppkey 65 | AND s_nationkey = n_nationkey 66 | AND n_name = 'SAUDI ARABIA') 67 | ORDER BY 68 | value DESC ; 69 | 70 | --third_query 71 | 72 | SELECT 73 | p_brand 74 | , p_type 75 | , p_size 76 | , COUNT(DISTINCT ps_suppkey) AS supplier_cnt 77 | FROM 78 | partsupp, 79 | part 80 | WHERE p_partkey = ps_partkey 81 | AND p_brand <> 'Brand#23' 82 | AND p_type NOT LIKE 'MEDIUM ANODIZED%' 83 | AND p_size IN (1, 32, 33, 46, 7, 42, 21, 40) 84 | AND ps_suppkey NOT IN (SELECT 85 | s_suppkey 86 | FROM 87 | supplier 88 | WHERE s_comment LIKE '%Customer%Complaints%') 89 | GROUP BY 90 | p_brand 91 | , p_type 92 | , p_size 93 | ORDER BY 94 | supplier_cnt DESC 95 | , p_brand 96 | , p_type 97 | , p_size ; 98 | 99 | 100 | --fourth_query 101 | 102 | SELECT r_name,count(1) number_of_supplies 103 | FROM 104 | part, 105 | partsupp, 106 | supplier, 107 | nation, 108 | REGION 109 | WHERE p_partkey = ps_partkey 110 | AND s_suppkey = ps_suppkey 111 | AND s_nationkey = n_nationkey 112 | AND n_regionkey = r_regionkey 113 | group by 1 114 | order by 1; 115 | 116 | 117 | --fifth_query 118 | 119 | SELECT 120 | n_name 121 | , COUNT(1) total_count 122 | FROM 123 | supplier, 124 | nation 125 | WHERE s_suppkey IN (SELECT 126 | ps_suppkey 127 | FROM 128 | partsupp 129 | WHERE ps_partkey IN (SELECT 130 | p_partkey 131 | FROM 132 | part 133 | WHERE p_name LIKE 'olive%') 134 | AND ps_availqty > 1) 135 | AND s_nationkey = n_nationkey 136 | GROUP BY 137 | 1 138 | ORDER BY 139 | 1; 140 | -------------------------------------------------------------------------------- /test-cases/user_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "SNAPSHOT_ID": "redshift-cluster-manual-snapshot", 3 | "SNAPSHOT_ACCOUNT_ID": "123456789012", 4 | 5 | "PARAMETER_GROUP_CONFIG_S3_PATH": "s3://your-node-config-compare-bucket/test-cases/parameter_group_config.json", 6 | 7 | "DDL_AND_COPY_SCRIPT_S3_PATH": "s3://your-node-config-compare-bucket/test-cases/ddl.sql", 8 | "SQL_SCRIPT_S3_PATH":"s3://your-node-config-compare-bucket/test-cases/test_queries.sql", 9 | "NUMBER_OF_PARALLEL_SESSIONS_LIST": "1,5,10", 10 | 11 | "SIMPLE_REPLAY_LOG_LOCATION":"s3://redshift-logging/RedshiftAuditLogs/", 12 | "SIMPLE_REPLAY_EXTRACT_START_TIME":"2021-08-28T11:15:00+00:00", 13 | "SIMPLE_REPLAY_EXTRACT_END_TIME":"2021-08-28T12:00:00+00:00", 14 | 15 | "SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH":"N/A", 16 | "SIMPLE_REPLAY_OVERWRITE_S3_PATH":"N/A", 17 | 18 | "AUTO_PAUSE": true, 19 | "DATABASE_NAME": "database_name", 20 | 21 | "CONFIGURATIONS": [ 22 | { 23 | "NODE_TYPE": "dc2.8xlarge", 24 | "NUMBER_OF_NODES": "2", 25 | "WLM_CONFIG_S3_PATH": "s3://your-node-config-compare-bucket/test-cases/source-wlm.json" 26 | }, 27 | { 28 | "NODE_TYPE": "ra3.4xlarge", 29 | "NUMBER_OF_NODES": "2", 30 | "WLM_CONFIG_S3_PATH": 
"s3://your-node-config-compare-bucket/test-cases/source-wlm.json" 31 | }, 32 | { 33 | "NODE_TYPE": "ra3.4xlarge", 34 | "NUMBER_OF_NODES": "4", 35 | "WLM_CONFIG_S3_PATH": "s3://your-node-config-compare-bucket/test-cases/source-wlm.json" 36 | }, 37 | { 38 | "NODE_TYPE": "ra3.4xlarge", 39 | "NUMBER_OF_NODES": "4", 40 | "WLM_CONFIG_S3_PATH": "s3://your-node-config-compare-bucket/test-cases/wlm-concurrency-scaling.json" 41 | } 42 | ] 43 | } 44 | -------------------------------------------------------------------------------- /test-cases/wlm-concurrency-scaling.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "query_group": [], 4 | "query_group_wild_card": 0, 5 | "user_group": [], 6 | "user_group_wild_card": 0, 7 | "concurrency_scaling": "auto", 8 | "rules": [ 9 | { 10 | "rule_name": "DiskSpilling", 11 | "predicate": [ 12 | { 13 | "metric_name": "query_temp_blocks_to_disk", 14 | "operator": ">", 15 | "value": 100000 16 | } 17 | ], 18 | "action": "log" 19 | }, 20 | { 21 | "rule_name": "QueryRunningMoreThan30min", 22 | "predicate": [ 23 | { 24 | "metric_name": "query_execution_time", 25 | "operator": ">", 26 | "value": 1800 27 | } 28 | ], 29 | "action": "log" 30 | } 31 | ], 32 | "priority": "normal", 33 | "queue_type": "auto", 34 | "auto_wlm": true 35 | }, 36 | { 37 | "short_query_queue": true 38 | } 39 | ] 40 | -------------------------------------------------------------------------------- /user_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "SNAPSHOT_ID": "redshift-cluster-manual-snapshot", 3 | "SNAPSHOT_ACCOUNT_ID": "123456789012", 4 | 5 | "PARAMETER_GROUP_CONFIG_S3_PATH": "s3://node-config-compare-bucket/pg_config.json", 6 | 7 | "DDL_AND_COPY_SCRIPT_S3_PATH": "s3://node-config-compare-bucket/ddl.sql", 8 | "SQL_SCRIPT_S3_PATH":"s3://node-config-compare-bucket/test_queries.sql", 9 | "NUMBER_OF_PARALLEL_SESSIONS_LIST": "1,5,10", 10 | 11 | "SIMPLE_REPLAY_LOG_LOCATION":"s3://redshift-logging-xxxxxxxx/RSLogs/", 12 | "SIMPLE_REPLAY_EXTRACT_START_TIME":"2021-08-28T11:15:00+00:00", 13 | "SIMPLE_REPLAY_EXTRACT_END_TIME":"2021-08-28T12:00:00+00:00", 14 | 15 | "SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH":"N/A", 16 | "SIMPLE_REPLAY_OVERWRITE_S3_PATH":"N/A", 17 | 18 | "AUTO_PAUSE": true, 19 | "DATABASE_NAME": "database_name", 20 | 21 | "CONFIGURATIONS": [ 22 | { 23 | "NODE_TYPE": "dc2.8xlarge", 24 | "NUMBER_OF_NODES": "2", 25 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/source-wlm.json" 26 | }, 27 | { 28 | "NODE_TYPE": "ra3.4xlarge", 29 | "NUMBER_OF_NODES": "2", 30 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/source-wlm.json" 31 | }, 32 | { 33 | "NODE_TYPE": "ra3.4xlarge", 34 | "NUMBER_OF_NODES": "4", 35 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/source-wlm.json" 36 | }, 37 | { 38 | "NODE_TYPE": "ra3.4xlarge", 39 | "NUMBER_OF_NODES": "4", 40 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/wlm-concurrency-scaling.json" 41 | } 42 | ] 43 | } --------------------------------------------------------------------------------