"
2 | source_cluster_endpoint:
3 | #Provide region. eg: "us-east-2"
4 | region:
5 | #Provide redshift username. eg: "awsuser"
6 | redshift_user :
7 |
#Provide the start time and end time of workload to be replicated. eg: "2020-07-24T09:31:00+00:00"
9 | start_time :
10 | end_time :
11 |
12 | #Provide the S3 bucket location where you want the replicator to store cloned objects. eg: "s3://mybucket/myworkload"
13 | target_s3_location:
14 |
15 | # Set the amount of logging - either INFO or DEBUG
16 | log_level:
--------------------------------------------------------------------------------
/config/extract.yaml:
--------------------------------------------------------------------------------
1 | # Required. Where to save the extracted workload. Either S3 location or local directory.
2 | workload_location: "s3://mybucketname/myworkload"
3 |
4 | # Optional. Providing this enables automatic log retrieval from S3 and system
5 | # table information retrieval (which allows query start and end times to be
6 | # extracted, rather than just record times)
7 | source_cluster_endpoint: ""
8 |
9 | # Required only if source_cluster_endpoint is given.
10 | master_username: "awsuser"
11 |
12 | #Required for generating copy_replacements and if log location for cloudwatch logs is specified
13 | region: ""
14 |
15 |
16 | # Required. Start and end time of the workload to be extracted, e.g. 2020-06-14T21:41:16+00:00
17 | start_time: ""
18 | end_time: ""
19 |
20 | #Replacement s3 location and IAM roles for copy files
21 | replacement_copy_location: ""
22 | replacement_iam_location: ""
23 |
24 |
25 | # Required only if extraction using ODBC is preferred and installed. Otherwise, Python driver is used.
26 | odbc_driver: ""
27 |
28 | # Leave blank to automatically retrieve audit logs from the source cluster.
29 | # You can specify a local location or S3 location to load the audit logs from
30 | # another location.
31 | log_location: ""
32 |
33 | # Location of the SQL file containing queries to unload system tables
34 | unload_system_table_queries: "core/replay/unload_system_tables.sql"
35 |
36 | # Should be a S3 location. If unspecified, system tables will not be unloaded
37 | source_cluster_system_table_unload_location: ""
38 |
39 | # If an IAM role is provided, UNLOAD will occur. If this is blank, UNLOAD of system tables will not occur.
40 | source_cluster_system_table_unload_iam_role: ""
41 |
42 | ##
43 | ## The settings below probably don't need to be modified for a typical run
44 | ##
45 |
46 | # Set the amount of logging
47 | log_level: info
48 |
49 | # Number of logfiles to maintain
50 | backup_count: 1
51 |
52 | #Provide the schemas list for spectrum to avoid modification during Replay in format ['schema_name']
53 | external_schemas:
--------------------------------------------------------------------------------
/config/replay.yaml:
--------------------------------------------------------------------------------
1 | # Optional - Custom identifier for this replay run
2 | tag: ""
3 |
4 | # Directory location of extracted workload, relative to current directory
5 | workload_location: ""
6 |
7 | # Endpoint and username of target cluster to replay queries on
8 | target_cluster_endpoint: ""
9 | target_cluster_region: ""
10 | master_username: ""
11 |
12 | # NLB or NAT endpoint for Simple Replay to connect to. This NLB or NAT should have connectivity to target_cluster_endpoint
13 | nlb_nat_dns: ""
14 |
15 | # Required only for playback using ODBC (pyodbc)
16 | odbc_driver: ""
17 |
18 | # If original driver isn't supported (e.g. JDBC), use this driver. "psql" or
19 | # "odbc" are the only valid values.
20 | default_interface: "psql"
21 |
22 | # Optional - Leaving it empty defers to connections.json. "all on" preserves
23 | # time between transactions. "all off" disregards time between transactions,
24 | # executing them as a batch.
25 | time_interval_between_transactions: ""
26 |
27 | # Optional - Leaving it empty defers to connections.json. "all on" preserves
28 | # time between queries. "all off" disregards time between queries, executing
29 | # them as a batch.
30 | time_interval_between_queries: ""
31 |
32 | # Should COPY statements be executed?
33 | execute_copy_statements: "false"
34 |
35 | # Should UNLOAD statements be executed?
36 | execute_unload_statements: "false"
37 |
38 | # Optional - Where the UNLOADs and system table unload goes.
39 | replay_output: ""
40 |
41 | # Optional - Where the analysis data and summary report will be uploaded. Example: s3://bucket_name/path
42 | analysis_output: ""
43 |
44 | # Optional - Leaving this blank means UNLOADs will not be replayed. IAM role for UNLOADs to be performed with.
45 | unload_iam_role: ""
46 |
47 | # Optional - Leaving this blank means analysis will not be run. IAM role for analysis needs UNLOAD access.
48 | analysis_iam_role: ""
49 |
50 | # Location of the SQL file containing queries to unload system tables
51 | unload_system_table_queries: "core/replay/unload_system_tables.sql"
52 |
53 | # IAM role to UNLOAD system tables from source cluster to S3 location for later
54 | # analysis
55 | target_cluster_system_table_unload_iam_role: ""
56 |
57 | # Include filters will work as "db AND user AND pid". Exclude filters will work as "db OR user OR pid".
58 | # In case of multiple values for any specific filter, please enclose each in single quotes
59 | filters:
60 | include:
61 | database_name: ['*']
62 | username: ['*']
63 | pid: ['*']
64 | exclude:
65 | database_name: []
66 | username: []
67 | pid: []
68 |
69 | ##
70 | ## The settings below probably don't need to be modified for a typical run
71 | ##
72 |
73 | # Set the amount of logging
74 | log_level: "DEBUG"
75 |
76 | # number of proceses to use to parallelize the work. If omitted or null, uses
77 | # one process per cpu - 1
78 | num_workers: ~
79 |
80 | # output warnings if connections are not within this number of seconds from
81 | # their expected time.
82 | connection_tolerance_sec: 300
83 |
84 | # Number of TestDrive logfiles to maintain
85 | backup_count: 1
86 |
87 | # Should we discard the returned data
88 | drop_return: true
89 |
90 | # Should connections in the replay be throttled
91 | limit_concurrent_connections: ~
92 |
93 | # Should multistatement SQL be split
94 | split_multi: true
95 |
96 | # In case of Serverless, set up a secret to store admin username and password. Specify the name of the secret below
97 | # Note: This admin username maps to the username specified as `master_username` in this file. This will be updated to `admin_username` in a future release.
98 | secret_name: ""
99 |
--------------------------------------------------------------------------------
/config/user_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "SNAPSHOT_ID": "redshift-cluster-manual-snapshot",
3 | "SNAPSHOT_ACCOUNT_ID": "123456789012",
4 | "PARAMETER_GROUP_CONFIG_S3_PATH": "s3://node-config-compare-bucket/pg_config.json",
5 | "DDL_AND_COPY_SCRIPT_S3_PATH": "s3://node-config-compare-bucket/ddl.sql",
6 | "SQL_SCRIPT_S3_PATH": "s3://node-config-compare-bucket/test_queries.sql",
7 | "NUMBER_OF_PARALLEL_SESSIONS_LIST": "1",
8 | "SIMPLE_REPLAY_LOG_LOCATION": "s3://redshift-logging-xxxxxxxx/RSLogs/",
9 | "SIMPLE_REPLAY_EXTRACT_START_TIME": "2021-08-28T11:15:00+00:00",
10 | "SIMPLE_REPLAY_EXTRACT_END_TIME": "2021-08-28T12:00:00+00:00",
11 | "SIMPLE_REPLAY_UNLOAD_STATEMENTS": "false",
12 | "SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH": "N/A",
13 | "SIMPLE_REPLAY_OVERWRITE_S3_PATH": "N/A",
14 | "AUTO_PAUSE": true,
15 | "DATABASE_NAME": "database_name",
16 | "CONFIGURATIONS": [
17 | {
18 | "TYPE": "Provisioned",
19 | "NODE_TYPE": "dc2.8xlarge",
20 | "NUMBER_OF_NODES": "2",
21 | "WLM_CONFIG_S3_PATH": "N/A"
22 | },
23 | {
24 | "TYPE": "Provisioned",
25 | "NODE_TYPE": "ra3.4xlarge",
26 | "NUMBER_OF_NODES": "4",
27 | "WLM_CONFIG_S3_PATH": "N/A"
28 | },
29 | {
30 | "TYPE": "Provisioned",
31 | "NODE_TYPE": "ra3.4xlarge",
32 | "NUMBER_OF_NODES": "4",
33 | "WLM_CONFIG_S3_PATH": "s3://node-config-compare-bucket/wlmconfig.json"
34 | },
35 | {
36 | "TYPE": "Serverless",
37 | "BASE_RPU": "64"
38 | },
39 | {
40 | "TYPE": "Serverless",
41 | "BASE_RPU": "128"
42 | }
43 | ]
44 | }
--------------------------------------------------------------------------------
/core/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/core/__init__.py
--------------------------------------------------------------------------------
/core/extract/__init__.py:
--------------------------------------------------------------------------------
from pkgutil import extend_path

# Allow this package's modules to be split across multiple directories on
# sys.path (pkgutil-style namespace package behavior).
__path__ = extend_path(__path__, __name__)
4 |
--------------------------------------------------------------------------------
/core/extract/cloudwatch_extractor.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import tempfile
3 | import gzip
4 | import sys
5 |
6 | import common.aws_service as aws_service_helper
7 | from core.extract.extract_parser import parse_log
8 |
9 | logger = logging.getLogger("WorkloadReplicatorLogger")
10 |
11 |
class CloudwatchExtractor:
    """Extracts Redshift audit logs (connection + user activity) from CloudWatch Logs."""

    # Extraction config dict (parsed from extract.yaml); assigned in __init__.
    config = None

    def __init__(self, config):
        self.config = config

    def get_extract_from_cloudwatch(self, start_time, end_time):
        """Locate the relevant CloudWatch log group(s) and parse their logs.

        Prefers source_cluster_endpoint (region and endpoint name are parsed
        out of the hostname); otherwise falls back to an explicit
        log_location + region. Exits the process if neither is configured.

        :param start_time: start of the workload window to extract
        :param end_time: end of the workload window to extract
        :return: tuple (connections, logs, databases, last_connections) from
                 _read_cloudwatch_logs, or [] if no branch produced logs
        """
        cloudwatch_logs = []
        if self.config.get("source_cluster_endpoint"):
            logger.info(
                f"Extracting logs from source cluster endpoint: {self.config['source_cluster_endpoint']}"
            )
            source_cluster_endpoint = self.config.get("source_cluster_endpoint")
            # Endpoint hostname assumed shaped like <name>.<hash>.<region>.… — TODO confirm
            region = source_cluster_endpoint.split(".")[2]
            endpoint = source_cluster_endpoint.split(".")[0]
            response = aws_service_helper.cw_describe_log_groups(region=region)
            cloudwatch_logs = self._read_cloudwatch_logs(
                response, endpoint, start_time, end_time, region
            )
        elif self.config.get("log_location"):
            logger.info(f"Extracting logs for {self.config['log_location']}")
            response = aws_service_helper.cw_describe_log_groups(
                log_group_name=self.config.get("log_location"),
                region=self.config.get("region"),
            )
            for log_group in response["logGroups"]:
                log_group_name = log_group["logGroupName"]
                response_stream = aws_service_helper.cw_describe_log_streams(
                    log_group_name, self.config.get("region")
                )
                # Assumes the first listed log stream is the one to extract — TODO confirm
                endpoint = response_stream["logStreams"][0]["logStreamName"]
                # NOTE(review): each iteration passes the full describe-log-groups
                # response and overwrites cloudwatch_logs, so only the last log
                # group's result survives and all groups are re-scanned per
                # iteration — verify this is intended.
                cloudwatch_logs = self._read_cloudwatch_logs(
                    response, endpoint, start_time, end_time, self.config.get("region")
                )
        else:
            logger.error(
                "For Cloudwatch Log Extraction, one of source_cluster_endpoint or log_location must be provided"
            )
            sys.exit(-1)
        return cloudwatch_logs

    def _read_cloudwatch_logs(self, response, endpoint, start_time, end_time, region):
        """Fetch and parse logs from every stream named `endpoint` in the given groups.

        :param response: cw_describe_log_groups response containing "logGroups"
        :param endpoint: log stream name to match within each log group
        :param start_time: start of the extraction window
        :param end_time: end of the extraction window
        :param region: AWS region used for the CloudWatch calls
        :return: (connections, logs, databases, last_connections) accumulated by parse_log
        """
        connections = {}
        last_connections = {}
        logs = {}
        databases = set()
        for log_group in response["logGroups"]:
            log_group_name = log_group["logGroupName"]
            stream_batch = aws_service_helper.cw_describe_log_streams(
                log_group_name=log_group_name, region=region
            )["logStreams"]
            for stream in stream_batch:
                stream_name = stream["logStreamName"]
                if endpoint == stream_name:
                    logger.info(
                        f"Extracting for log group: {log_group_name} between time {start_time} and {end_time}"
                    )

                    log_list = aws_service_helper.cw_get_paginated_logs(
                        log_group_name,
                        stream["logStreamName"],
                        start_time,
                        end_time,
                        region,
                    )
                    # Classify the audit log type from the group name; anything
                    # else cannot be parsed and is skipped with a warning.
                    if "useractivitylog" in log_group_name:
                        log_type = "useractivitylog"
                    elif "connectionlog" in log_group_name:
                        log_type = "connectionlog"
                    else:
                        logger.warning(
                            f"Unsupported log file {log_group_name}, cannot determine type"
                        )
                        continue

                    # Round-trip the fetched lines through a gzip file because
                    # parse_log consumes gzipped file objects.
                    with tempfile.TemporaryDirectory(suffix="TestDrive") as tempdir:
                        with gzip.open(f"{tempdir}/{log_type}.gz", "wt") as gzip_file:
                            gzip_file.write("\n".join(log_list))

                        if log_type == "connectionlog":
                            logger.info("Parsing connection logs...")
                            with gzip.open(f"{tempdir}/connectionlog.gz", "r") as gzip_file:
                                parse_log(
                                    gzip_file,
                                    "connectionlog.gz",
                                    connections,
                                    last_connections,
                                    logs,
                                    databases,
                                    start_time,
                                    end_time,
                                )
                        if log_type == "useractivitylog":
                            logger.info("Parsing user activity logs...")
                            with gzip.open(f"{tempdir}/useractivitylog.gz", "r") as gzip_file:
                                parse_log(
                                    gzip_file,
                                    "useractivitylog.gz",
                                    connections,
                                    last_connections,
                                    logs,
                                    databases,
                                    start_time,
                                    end_time,
                                )

        return connections, logs, databases, last_connections
119 |
--------------------------------------------------------------------------------
/core/extract/extract.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import sys
3 | import hashlib
4 | import datetime
5 | import yaml
6 | import os
7 | import zipfile
8 | import time
9 | import re
10 | import common.config as config_helper
11 | import common.log as log_helper
12 | from common import aws_service as aws_service_helper
13 | from common.util import cluster_dict, db_connect
14 | import core.extract.extractor as extractor
15 |
16 | logger = logging.getLogger("WorkloadReplicatorLogger")
17 |
# Fullmatch pattern for a Redshift Serverless workgroup endpoint, e.g.
# default.123456789012.us-east-1.redshift-serverless.amazonaws.com:5439/dev
serverless_cluster_endpoint_pattern = (
    r"(.+)\.(.+)\.(.+).redshift-serverless(-dev)?\.amazonaws\.com:[0-9]{4,5}\/(.)+"
)


def is_serverless(config):
    """Return True when the configured source endpoint is a Redshift Serverless workgroup."""
    endpoint = config["source_cluster_endpoint"]
    return re.fullmatch(serverless_cluster_endpoint_pattern, endpoint) is not None
27 |
28 |
def main():
    """CLI entry point for workload extraction.

    Parses and validates the extract config, builds a unique extract id,
    initializes logging, optionally opens a tracked connection to the source
    cluster, runs the extraction, saves the results to workload_location,
    and uploads the extract logs when the target is S3.
    """

    extract_start_time = time.time()

    # Parse config file
    config = config_helper.get_config_file_from_args()
    config_helper.validate_config_file_for_extract(config)

    # UID for extract logs: ISO timestamp + cluster id (or log location) +
    # optional tag + a 5-char sha1 of the timestamp, keeping log dirs unique.
    extract_start_timestamp = datetime.datetime.now(tz=datetime.timezone.utc)
    id_hash = hashlib.sha1(
        extract_start_timestamp.isoformat().encode("UTF-8")
    ).hexdigest()[:5]
    if config.get("source_cluster_endpoint", "") != "":
        cluster = cluster_dict(config["source_cluster_endpoint"])
        if config.get("tag", "") != "":
            extract_id = f'{extract_start_timestamp.isoformat()}_{cluster.get("id")}_{config["tag"]}_{id_hash}'
        else:
            extract_id = (
                f'{extract_start_timestamp.isoformat()}_{cluster.get("id")}_{id_hash}'
            )
    else:
        # No endpoint configured: identify the extract by its log location instead.
        log_location = config.get("log_location")
        if config.get("tag", "") != "":
            extract_id = f'{extract_start_timestamp.isoformat()}_{log_location}_{config["tag"]}_{id_hash}'
        else:
            extract_id = (
                f"{extract_start_timestamp.isoformat()}_{log_location}_{id_hash}"
            )

    # Setup Logging
    level = logging.getLevelName(config.get("log_level", "INFO").upper())
    log_helper.init_logging(
        "extract.log",
        dir=f"core/logs/extract/extract_log-{extract_id}",
        level=level,
        preamble=yaml.dump(config),
        # NOTE(review): default of 2 differs from the config template's 1.
        backup_count=config.get("backup_count", 2),
        script_type="extract",
        log_id=extract_id,
    )
    log_helper.log_version()

    e = extractor.Extractor(config)
    if not e.load_driver():
        sys.exit("Failed to load driver")

    # setting application name for tracking
    if config.get("source_cluster_endpoint"):
        application = "WorkloadReplicator-Extract"

        # Serverless endpoints need the workgroup prefixed cluster id.
        if is_serverless(config):
            host = f'redshift-serverless-{config.get("source_cluster_endpoint").split(".")[0]}'
        else:
            host = config.get("source_cluster_endpoint").split(".")[0]
        # Port is the digit run between ":" and "/" in the endpoint string.
        port = int(config.get("source_cluster_endpoint").split(":")[-1]
                   .split("/")[0])
        DbUser = config.get("master_username")
        DbName = config.get("source_cluster_endpoint").split("/")[-1]
        region = config.get("region")
        endpoint = config.get('source_cluster_endpoint').split(":")[0]

        response = aws_service_helper.redshift_get_cluster_credentials(
            user=DbUser,
            database_name=DbName,
            cluster_id=host,
            region=region)
        # NOTE(review): the connection handle is discarded — presumably opened
        # only so the app_name registers in the cluster's logs; confirm intent.
        db_connect(host=endpoint,
                   port=port,
                   database=DbName,
                   password=response['DbPassword'],
                   username=response['DbUser'], app_name=application)

    # Run extract job
    (
        extraction_name,
        start_time,
        end_time,
        log_location,
    ) = e.get_parameters_for_log_extraction()
    (connections, audit_logs, databases, last_connections) = e.get_extract(
        log_location, start_time, end_time
    )

    e.validate_log_result(connections, audit_logs)
    e.retrieve_cluster_endpoint_info(extraction_name)

    e.save_logs(
        audit_logs,
        last_connections,
        config["workload_location"] + "/" + extraction_name,
        connections,
        start_time,
        end_time,
    )

    # save the extract logs to S3 (only when workload_location is an S3 URI)
    output_directory = f'{config["workload_location"]+ "/" + extraction_name}'
    if output_directory.startswith("s3://"):
        # Split "s3://bucket/prefix" into bucket and prefix.
        output_s3_location = output_directory[5:].partition("/")
        bucket_name = output_s3_location[0]
        output_prefix = output_s3_location[2]
        object_key = "extract_logs.zip"
        zip_file_name = f"extract_logs.zip"
        logger.info(f"Uploading extract logs to {bucket_name}/{output_prefix}")
        # NOTE(review): "dir" shadows the builtin; kept as-is.
        dir = f"core/logs/extract/extract_log-{extract_id}"
        # Zip the whole extract log directory, then upload the archive.
        with zipfile.ZipFile(zip_file_name, "w", zipfile.ZIP_DEFLATED) as zip_object:
            for folder_name, sub_folders, file_names in os.walk(dir):
                for filename in file_names:
                    file_path = os.path.join(folder_name, filename)
                    zip_object.write(file_path)
        with open(zip_file_name, "rb") as f:
            aws_service_helper.s3_put_object(
                f, bucket_name, f"{output_prefix}/{object_key}"
            )

    total_extract_time = str(datetime.timedelta(seconds=(time.time() - extract_start_time)))
    logger.info(f"Extract completed in {total_extract_time}")

if __name__ == "__main__":
    main()
150 |
--------------------------------------------------------------------------------
/core/extract/local_extractor.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import logging
3 | import os
4 | from tqdm import tqdm
5 | from core.extract import extract_parser
6 |
7 | logger = logging.getLogger("WorkloadReplicatorLogger")
8 |
9 |
class LocalExtractor:
    """Extracts and parses Redshift audit logs from a local directory of gzip files."""

    # When truthy, tqdm progress output is suppressed and progress is logged instead.
    disable_progress_bar = None
    bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}{postfix}]"

    def __init__(self, config):
        self.config = config
        # Fix: honor the config flag (consistent with S3Extractor); previously
        # the class attribute was never populated, so the bar could not be disabled.
        self.disable_progress_bar = config.get("disable_progress_bar")

    def get_extract_locally(self, log_directory_path, start_time, end_time):
        """Parse every gzipped audit log file found under log_directory_path.

        :param log_directory_path: directory containing the *.gz audit log files
        :param start_time: inclusive start of the workload window
        :param end_time: inclusive end of the workload window
        :return: (connections, logs, databases, last_connections) accumulated by parse_log
        """
        connections = {}
        last_connections = {}
        logs = {}
        databases = set()

        # Process files in deterministic (sorted) order.
        log_directory = sorted(os.listdir(log_directory_path))

        for filename in tqdm(
            log_directory,
            disable=self.disable_progress_bar,
            unit="files",
            desc="Files processed",
            bar_format=self.bar_format,
        ):
            if self.disable_progress_bar:
                # Fix: log the actual filename; previously logged the literal
                # placeholder "(unknown)" via a placeholder-free f-string.
                logger.info(f"Processing {filename}")

            file_path = log_directory_path + "/" + filename
            # "start_node" files are text with a legacy encoding; others are
            # opened in binary mode and decoded downstream by the parser.
            if "start_node" in filename:
                open_kwargs = {"mode": "rt", "encoding": "ISO-8859-1"}
            else:
                open_kwargs = {"mode": "r"}

            # Fix: context manager guarantees the handle is closed even if
            # parse_log raises (the original leaked the handle on error).
            with gzip.open(file_path, **open_kwargs) as log_file:
                extract_parser.parse_log(
                    log_file,
                    filename,
                    connections,
                    last_connections,
                    logs,
                    databases,
                    start_time,
                    end_time,
                )

        return connections, logs, databases, last_connections
62 |
--------------------------------------------------------------------------------
/core/extract/s3_extractor.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import logging
3 | import common.aws_service as aws_service_helper
4 | from tqdm import tqdm
5 | from core.util.log_validation import get_logs_in_range
6 | from core.extract.extract_parser import parse_log
7 |
8 | logger = logging.getLogger("WorkloadReplicatorLogger")
9 |
10 |
class S3Extractor:
    """Extracts and parses Redshift audit logs (connection + user activity) from S3."""

    # When truthy, tqdm progress output is suppressed.
    disable_progress_bar = None
    bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}{postfix}]"

    def __init__(self, config):
        self.disable_progress_bar = config.get("disable_progress_bar")

    def get_extract_from_s3(self, log_bucket, log_prefix, start_time, end_time):
        """
        Getting logs from s3 and passing it to _get_s3_audit_logs()

        :param log_bucket: S3 bucket containing the audit logs
        :param log_prefix: key prefix under which the logs live
        :param start_time: inclusive start of the workload window
        :param end_time: inclusive end of the workload window
        :return: (connections, logs, databases, last_connections)
        """
        connections = {}
        logs = {}
        last_connections = {}
        databases = set()
        bucket_objects = aws_service_helper.sync_s3_get_bucket_contents(log_bucket, log_prefix)

        # Partition the listing by audit log type, based on the object filename.
        s3_connection_logs = []
        s3_user_activity_logs = []
        for log in bucket_objects:
            filename = log["Key"].split("/")[-1]
            if "connectionlog" in filename:
                s3_connection_logs.append(log)
            elif "useractivitylog" in filename:
                s3_user_activity_logs.append(log)

        logger.info("Parsing connection logs")
        self._get_s3_audit_logs(
            log_bucket,
            log_prefix,
            start_time,
            end_time,
            s3_connection_logs,
            connections,
            logs,
            databases,
            last_connections,
        )
        logger.info("Parsing user activity logs")
        self._get_s3_audit_logs(
            log_bucket,
            log_prefix,
            start_time,
            end_time,
            s3_user_activity_logs,
            connections,
            logs,
            databases,
            last_connections,
        )
        return connections, logs, databases, last_connections

    def _get_s3_audit_logs(
        self,
        log_bucket,
        log_prefix,
        start_time,
        end_time,
        audit_objects,
        connections,
        logs,
        databases,
        last_connections,
    ):
        """
        Getting audit logs from S3 for the cluster via get_logs_in_range and calling
        parse_log() on each one. Results accumulate into the connections/logs/
        databases/last_connections arguments, which are mutated in place.

        :param log_bucket:
        :param log_prefix:
        :param start_time:
        :param end_time:
        :param audit_objects:
        :param connections:
        :param logs:
        :param databases:
        :param last_connections:
        :return:
        """
        log_filenames = get_logs_in_range(audit_objects, start_time, end_time)

        logger.info(f"Processing {len(log_filenames)} files")

        for filename in tqdm(
            log_filenames,
            disable=self.disable_progress_bar,
            unit="files",
            desc="Files processed",
            bar_format=self.bar_format,
        ):
            log_object = aws_service_helper.s3_get_object(log_bucket, filename)
            log_file = gzip.GzipFile(fileobj=log_object.get()["Body"])

            parse_log(
                log_file,
                filename,
                connections,
                last_connections,
                logs,
                databases,
                start_time,
                end_time,
            )

        # Fix: the original unconditionally indexed audit_objects[len-1] here,
        # which raised IndexError when no audit objects were listed; it also
        # carried two dead index variables, now removed.
        if audit_objects:
            logger.debug(
                f'First audit log in start_time range: {audit_objects[-1]["Key"].split("/")[-1]}'
            )
        return connections, logs, databases, last_connections
128 |
--------------------------------------------------------------------------------
/core/replay/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/core/replay/__init__.py
--------------------------------------------------------------------------------
/core/replay/copy_replacements_parser.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import logging
3 | import sys
4 |
5 | import common.aws_service as aws_service_helper
6 |
logger = logging.getLogger("WorkloadReplicatorLogger")


def parse_copy_replacements(workload_directory):
    """Load the COPY replacement mapping from <workload_directory>/copy_replacements.csv.

    The directory may be an s3:// URI or a local path. Each CSV row maps an
    original data location to [replacement location, IAM role]; the header
    row is skipped.

    :param workload_directory: workload root (local path or s3:// URI)
    :return: dict of original location -> [replacement location, IAM role]
    """
    copy_replacements = {}
    replacements_path = workload_directory.rstrip("/") + "/copy_replacements.csv"

    if replacements_path.startswith("s3://"):
        # Split "s3://bucket/key" into its bucket and key components.
        bucket_name, _, prefix = replacements_path[5:].partition("/")
        s3_object = aws_service_helper.s3_client_get_object(bucket_name, prefix)
        csv_string = s3_object["Body"].read().decode("utf-8")
        reader = csv.reader(csv_string.splitlines())
        next(reader)  # skip header row
        for row in reader:
            # Keep only complete rows that actually specify an IAM role.
            if len(row) == 3 and row[2]:
                copy_replacements[row[0]] = [row[1], row[2]]
    else:
        with open(replacements_path, "r") as csvfile:
            reader = csv.reader(csvfile)
            next(reader)  # skip header row
            for idx, row in enumerate(reader):
                if len(row) != 3:
                    logger.error(
                        f"Replacements file {replacements_path} is malformed (row {idx}, line:\n{row}"
                    )
                    sys.exit()
                copy_replacements[row[0]] = [row[1], row[2]]

    logger.info(f"Loaded {len(copy_replacements)} COPY replacements from {replacements_path}")
    return copy_replacements
39 |
--------------------------------------------------------------------------------
/core/replay/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/core/replay/logo.png
--------------------------------------------------------------------------------
/core/replay/report_content.yaml:
--------------------------------------------------------------------------------
1 | # Replay Information and Report Details
2 | title: "Simple Replay Workload Analysis"
3 | subtitle: "Replay ID: {REPLAY_ID} "
4 |
5 | report_paragraph: "This report summarizes the performance of the replayed workload shown above."
6 |
7 | glossary_header: "Glossary"
8 | glossary_paragraph: "The following terms are used in this report:"
9 | glossary:
10 | - "Compile Time is the total amount of time spent compiling a query."
11 | - "Queue Time is the amount of time a query spends waiting before executing in a workload management (WLM) queue."
12 | - "Execution Time is how long a query spends in the execution phase."
13 | - "Query Latency is the total runtime of a query in Redshift."
14 | - "Commit Queue Time is the time a transaction spent waiting before entering the commit phase."
- "Commit Time is the time a transaction spent being committed."
16 |
17 | data_header: "Accessing the data"
18 | data_paragraph: >
19 | All of the performance data collected for this report is available in S3 at the following location:
20 | s3://{S3_BUCKET}/replays/{REPLAY_ID}/
21 | The raw_data directory contains the following raw CSV files unloaded from the Redshift cluster:
22 | raw_data:
23 | - "statement_types000 Statement counts by type (e.g. SELECT, COPY, etc.)"
24 | - "query_metrics000 Query-level performance data."
25 | - "cluster_level_metrics000 Cluster-level summary of performance data. This is used to generate the Cluster Metrics table on page 2."
26 | - "query_distribution000 User-level summary of performance data and broken down by query execution phase. This is used to generate the latency, commit, queue, compile, and execution time tables that begin on page 3."
27 |
28 | agg_data_paragraph: "The aggregated_data directory in S3 contains CSV files of the aggregated table data used to generate this report."
29 |
30 | notes_header: "Workload Notes"
31 | notes_paragraph: >
32 | Redshift Test Drive attempts to replay the source cluster workload as faithfully as possible on the target cluster.
33 | However, the replayed workload may differ from the original workload in the following ways:
34 | notes:
35 | - "The percentiles in this report exclude DDLs, Utility statements, and any leader node-only catalog queries."
36 | - "The reports grouped by user show the top 100 users based on the count of queries executed per user during the replay. All additional users above the top 100 are rolled up as “Others.” Data for all users is available in S3."
37 | - "Query compilation time is distributed evenly between queries that hop between service classes and can therefore occasionally result in execution or elapsed times that are less than zero for very short queries."
38 |
39 |
40 | # Query Breakdown and Cluster Level Performance
41 | query_breakdown:
42 | table1:
43 | title: "Query Breakdown"
44 | paragraph: "The table below shows the total number of queries, number of aborted queries, and number of queries executed on concurrency scaling clusters broken down by statement type."
45 | note: "* note that query counts are approximate and based on statement text"
46 |
47 | graph:
48 | title: "Query Latency"
paragraph: "The histogram shows a breakdown of query latency on a log scale. The distribution shows shorter running queries on the left and longer running queries on the right."
50 |
51 | cluster_metrics:
52 | table2:
53 | title: "Cluster Metrics"
54 | paragraph: "The table below shows performance statistics broken down by cluster-level workload metric."
55 | note: "* note that query latency excludes compile time"
56 |
57 | # Performance Breakdown
58 | measure_tables:
59 | table3:
60 | title: "Query Latency"
61 | paragraph: "Query latency is the combined amount of time a query spends queued in WLM and executing. Note that this does not include query compilation time."
62 |
63 | table4:
64 | title: "Compile Time"
65 | paragraph: >
66 | Redshift compiles queries before executing them, and then caches the compiled result. This table shows how much
67 | time is spent compiling queries, broken down by user. Note that a workload run on a new cluster may have higher
68 | compile time than the original source cluster workload since it may not benefit from prior caching.
69 |
70 | table5:
71 | title: "Queue Time"
72 | paragraph: >
73 | Queue time shows how much time a query is spent waiting to start executing. It should usually be
74 | considered together with Execution Time (since high queue time + low execution time is the same to the user as low queue time + high execution time).
75 |
76 |
77 | table6:
78 | title: "Execution Time"
79 | paragraph: >
80 | Execution time shows how much time a query is spent executing. It should usually be considered together with
81 | Queue Time (since high queue time + low execution time is the same to the user as low queue time + high execution time).
82 |
83 | table7:
84 | title: "Commit Queue Time"
paragraph: "Transactions may be queued before the commit phase starts. This table summarizes how much time each user’s transactions spend waiting to start the commit."
86 |
87 | table8:
88 | title: "Commit Time"
89 | paragraph: "This table summarizes how much time is spent committing transactions, broken down by user."
--------------------------------------------------------------------------------
/core/replay/stats.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | logger = logging.getLogger("WorkloadReplicatorLogger")
4 |
5 |
def percent(num, den):
    """Return num as a percentage of den; 0 when den is zero (avoids ZeroDivisionError)."""
    return 0 if den == 0 else float(num) / den * 100.0
10 |
11 |
def print_stats(stats):
    """Log each process's connection offset and the overall worst (signed) offset."""
    if 0 not in stats:
        logger.warning("No stats gathered.")
        return

    worst_offset = 0
    for idx in stats:
        offset = stats[idx].get("connection_diff_sec", 0)
        # Track the largest offset by magnitude, but report its signed value.
        if abs(offset) > abs(worst_offset):
            worst_offset = stats[idx]["connection_diff_sec"]
        logger.debug(f"[{idx}] Max connection offset: {offset:+.3f} sec")
    logger.debug(f"Max connection offset: {worst_offset:+.3f} sec")
25 |
26 |
def display_stats(stats, total_queries, peak_connections):
    """Log a one-line summary: queries executed, success/failure split, peak connections."""
    executed = stats["query_success"] + stats["query_error"]
    summary = (
        f"Queries executed: {executed} of {total_queries} "
        f"({percent(executed, total_queries):.1f}%)"
        " ["
        f"Success: {stats['query_success']} ({percent(stats['query_success'], executed):.1f}%), "
        f"Failed: {stats['query_error']} ({percent(stats['query_error'], executed):.1f}%), "
        f"Peak connections: {peak_connections.value}"
        "]"
    )
    logger.info(summary)
40 |
41 |
def init_stats(stats_dict):
    """Zero out all replay counters and error logs on stats_dict, returning it.

    Assignment is done key-by-key so that a multiprocessing.Manager dict proxy
    (if that is what stats_dict is) registers each update.
    """
    for counter in (
        "connection_diff_sec",
        "transaction_success",
        "transaction_error",
        "query_success",
        "query_error",
    ):
        stats_dict[counter] = 0
    stats_dict["connection_error_log"] = {}  # map filename to array of connection errors
    stats_dict["transaction_error_log"] = {}  # map filename to array of transaction errors
    stats_dict["multi_statements"] = 0
    stats_dict["executed_queries"] = 0  # includes multi-statement queries
    return stats_dict
54 |
55 |
def collect_stats(aggregated_stats, stats):
    """Fold one thread's stats dict into the aggregated process-level stats.

    Keeps the largest-magnitude connection offset, sums the scalar counters,
    and merges the per-file error maps. A falsy stats argument is a no-op.
    """
    if not stats:
        return

    # Keep whichever connection offset is farther from the expected time.
    thread_diff = stats["connection_diff_sec"]
    if abs(thread_diff) >= abs(aggregated_stats.get("connection_diff_sec", 0)):
        aggregated_stats["connection_diff_sec"] = thread_diff

    # Scalar counters simply add up across threads.
    for counter in ("transaction_success", "transaction_error", "query_success", "query_error"):
        aggregated_stats[counter] = aggregated_stats[counter] + stats[counter]

    for error_map in ("transaction_error_log", "connection_error_log"):
        # Per the multiprocessing.Manager docs, a proxy dict only notices a
        # reassignment, so merge into a local reference and write it back.
        merged = aggregated_stats[error_map]
        merged.update(stats[error_map])
        aggregated_stats[error_map] = merged
82 |
--------------------------------------------------------------------------------
/core/replay/summarizer.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | import os
4 | from tqdm import tqdm
5 |
6 | from boto3 import client
7 |
8 | import common.aws_service as aws_service_helper
9 |
10 | logger = logging.getLogger("WorkloadReplicatorLogger")
11 |
12 |
def summarize(
    connection_logs,
    config,
    replay_start_timestamp,
    aggregated_stats,
    query_count,
    replay_id,
    transaction_count,
    replay_end_time,
):
    """Build, log, and return the human-readable replay summary lines.

    Also exports any connection/transaction errors to the configured error
    location (falling back to the workload location).
    """
    summary_lines = []
    logger.info("Replay summary:")
    summary_lines.append(
        f"Attempted to replay {query_count} queries, {transaction_count} transactions, "
        f"{len(connection_logs)} connections."
    )
    transaction_success = aggregated_stats.get("transaction_success", 0)
    query_success = aggregated_stats.get("query_success", 0)
    try:
        summary_lines.append(
            f"Successfully replayed {transaction_success} out of {transaction_count} "
            f"({round((transaction_success / transaction_count) * 100)}%) transactions."
        )
        summary_lines.append(
            f"Successfully replayed {query_success} out of {query_count} "
            f"({round((query_success / query_count) * 100)}%) queries."
        )
    except ZeroDivisionError:
        # Nothing was attempted; skip the percentage lines entirely.
        pass
    error_location = config.get("error_location", config["workload_location"])
    summary_lines.append(
        f"Encountered {len(aggregated_stats['connection_error_log'])} "
        f"connection errors and {len(aggregated_stats['transaction_error_log'])} transaction errors"
    )
    # Persist the error details next to the summary.
    export_errors(
        aggregated_stats["connection_error_log"],
        aggregated_stats["transaction_error_log"],
        error_location,
        replay_id,
    )
    summary_lines.append(f"Replay finished in {replay_end_time - replay_start_timestamp}.")
    for line in summary_lines:
        logger.info(line)
    logger.info(
        f"Replay finished in {datetime.datetime.now(tz=datetime.timezone.utc) - replay_start_timestamp}."
    )
    return summary_lines
59 |
60 |
def export_errors(connection_errors, transaction_errors, workload_location, replay_name):
    """Save any errors that occurred during replay to a local directory or s3.

    :param connection_errors: map of filename -> connection error text
    :param transaction_errors: map of filename -> list of (header, detail) pairs
    :param workload_location: "s3://bucket/prefix" or a local directory
    :param replay_name: replay id used to namespace the error files
    """
    if len(connection_errors) == len(transaction_errors) == 0:
        logger.info("No errors, nothing to save")
        return

    logger.info(
        f"Saving {len(connection_errors)} connection errors, {len(transaction_errors)} transaction_errors"
    )

    connection_error_location = workload_location + "/" + replay_name + "/connection_errors"
    transaction_error_location = workload_location + "/" + replay_name + "/transaction_errors"

    is_s3 = workload_location.startswith("s3://")
    if is_s3:
        bucket_name, _, prefix = workload_location[5:].partition("/")
    else:
        # exist_ok: a re-run (or pre-created directory) must not crash here.
        os.makedirs(connection_error_location, exist_ok=True)
        os.makedirs(transaction_error_location, exist_ok=True)

    bar_format = "{desc}: {percentage:3.0f}%|{bar}| {n_fmt}/{total_fmt} [{elapsed}{postfix}]"

    logger.info(f"Exporting connection errors to {connection_error_location}/")
    for filename, connection_error_text in tqdm(
        connection_errors.items(),
        disable=False,
        unit="files",
        desc="Files processed",
        bar_format=bar_format,
    ):
        if is_s3:
            if prefix:
                key_loc = "%s/%s/connection_errors/%s.txt" % (prefix, replay_name, filename)
            else:
                key_loc = "%s/connection_errors/%s.txt" % (replay_name, filename)
            aws_service_helper.s3_put_object(connection_error_text, bucket_name, key_loc)
        else:
            # Context manager guarantees the handle is closed even on error.
            with open(connection_error_location + "/" + filename + ".txt", "w") as error_file:
                error_file.write(connection_error_text)

    logger.info(f"Exporting transaction errors to {transaction_error_location}/")
    # NOTE: the loop variable must not be named "transaction_errors" — the
    # original shadowed the parameter while iterating its own .items().
    for filename, error_entries in tqdm(
        transaction_errors.items(),
        disable=False,
        unit="files",
        desc="Files processed",
        bar_format=bar_format,
    ):
        error_file_text = ""
        for transaction_error in error_entries:
            error_file_text += f"{transaction_error[0]}\n{transaction_error[1]}\n\n"

        if is_s3:
            if prefix:
                key_loc = "%s/%s/transaction_errors/%s.txt" % (prefix, replay_name, filename)
            else:
                key_loc = "%s/transaction_errors/%s.txt" % (replay_name, filename)
            # Use the shared aws_service helper, consistent with the
            # connection-error upload above, instead of a raw boto3 client.
            aws_service_helper.s3_put_object(error_file_text, bucket_name, key_loc)
        else:
            with open(transaction_error_location + "/" + filename + ".txt", "w") as error_file:
                error_file.write(error_file_text)
141 |
--------------------------------------------------------------------------------
/core/replay/unload_sys_table.py:
--------------------------------------------------------------------------------
1 | from core.replay.prep import ReplayPrep
2 | from common.util import db_connect
3 | import re
4 | import logging
5 |
6 | logger = logging.getLogger("WorkloadReplicatorLogger")
7 |
8 |
class UnloadSysTable:
    """Unloads Redshift system tables after a replay, using the templated
    UNLOAD queries in the configured SQL file."""

    def __init__(self, config, replay_id):
        self.config = config
        self.default_interface = config["default_interface"]
        self.unload_system_table_queries_file = config["unload_system_table_queries"]
        # All unloads land under <replay_output>/<replay_id>/system_tables/<table>/
        self.unload_location = config["replay_output"] + "/" + replay_id
        self.unload_iam_role = config["target_cluster_system_table_unload_iam_role"]

    def unload_system_table(self):
        """Connect to the target cluster and run each templated UNLOAD query.

        Raises whatever db_connect raised if the connection cannot be made.
        """
        # TODO: wrap this in retries and proper logging
        prep = ReplayPrep(self.config)
        credentials = prep.get_connection_credentials(self.config["master_username"])
        try:
            conn = db_connect(
                self.default_interface,
                host=credentials["host"],
                port=int(credentials["port"]),
                username=credentials["username"],
                password=credentials["password"],
                database=credentials["database"],
                odbc_driver=credentials["odbc_driver"],
            )
        except Exception as e:
            logger.debug(f"Unable to connect: {e}", exc_info=True)
            # Without a connection nothing below can run; previously execution
            # fell through and crashed with a NameError on `conn`, hiding the
            # real connection error.
            raise

        # Parse the SQL file: lines starting with "--" name the destination
        # table for the UNLOAD statement(s) that follow.
        unload_queries = {}
        table_name = ""
        query_text = ""
        with open(self.unload_system_table_queries_file, "r") as queries_file:
            for line in queries_file:
                if line.startswith("--"):
                    unload_queries[table_name] = query_text.strip("\n")
                    table_name = line[2:].strip("\n")
                    query_text = ""
                else:
                    query_text += line

        unload_queries[table_name] = query_text.strip("\n")
        del unload_queries[""]

        cursor = conn.cursor()
        for table_name, unload_query in unload_queries.items():
            if table_name and unload_query:
                # Fill the empty TO '' / CREDENTIALS '' slots in the template.
                unload_query = re.sub(
                    r"to ''",
                    f"TO '{self.unload_location}/system_tables/{table_name}/'",
                    unload_query,
                    flags=re.IGNORECASE,
                )
                unload_query = re.sub(
                    r"credentials ''",
                    f"CREDENTIALS 'aws_iam_role={self.unload_iam_role}'",
                    unload_query,
                    flags=re.IGNORECASE,
                )
                try:
                    cursor.execute(unload_query)
                except Exception as e:
                    logger.error(f"Failed to unload query. {e}")
                else:
                    # Only claim success when execute() did not raise.
                    logger.debug(f"Executed unload query: {table_name}")
67 |
--------------------------------------------------------------------------------
/core/replay/unload_system_tables.sql:
--------------------------------------------------------------------------------
/*
 UNLOAD query templates executed by core/replay/unload_sys_table.py after a
 replay. NOTE: lines beginning with "--" are parsed as the destination table
 name for the statement that follows them (they are not plain comments), and
 the empty TO '' / CREDENTIALS '' slots are filled in at runtime.
*/
--SVL_STATEMENTTEXT
UNLOAD ('SELECT * FROM SVL_STATEMENTTEXT WHERE userid>1') TO '' CREDENTIALS '';
--STL_Query
UNLOAD ('SELECT * FROM STL_QUERY WHERE userid>1') TO '' CREDENTIALS '';
--STL_WLM_QUERY
UNLOAD ('SELECT * FROM STL_WLM_QUERY WHERE userid>1') TO '' CREDENTIALS '';

--stl_wlm_service_class_config
--UNLOAD ('SELECT * FROM stl_wlm_service_class_config') TO '' CREDENTIALS '';

--stv_wlm_qmr_config
UNLOAD ('SELECT * FROM stv_wlm_qmr_config') TO '' CREDENTIALS '';

--stv_wlm_query_queue_state
UNLOAD ('SELECT * FROM stv_wlm_query_queue_state') TO '' CREDENTIALS '';
--stv_wlm_query_state
UNLOAD ('SELECT * FROM stv_wlm_query_state') TO '' CREDENTIALS '';
--stl_connection_log
UNLOAD ('SELECT * FROM stl_connection_log') TO '' CREDENTIALS '';
--stl_compile_info
--UNLOAD ('SELECT * FROM stl_compile_info WHERE userid>1') TO '' CREDENTIALS '';
--stl_catalog_bloat
--UNLOAD ('SELECT * FROM stl_catalog_bloat WHERE userid>1') TO '' CREDENTIALS '';
--stl_catalog_rebuild_info
--UNLOAD ('SELECT * FROM stl_catalog_rebuild_info') TO '' CREDENTIALS '';
--stl_query_metrics
UNLOAD ('SELECT * FROM stl_query_metrics WHERE userid>1') TO '' CREDENTIALS '';
--svl_query_summary
UNLOAD ('SELECT * FROM svl_query_summary WHERE userid>1') TO '' CREDENTIALS '';
--svl_query_report
UNLOAD ('SELECT * FROM svl_query_report WHERE userid>1') TO '' CREDENTIALS '';
--stl_vacuum
UNLOAD ('SELECT * FROM stl_vacuum WHERE userid>1') TO '' CREDENTIALS '';

--stl_s3client

UNLOAD ('SELECT * FROM stl_s3client WHERE userid>1') TO '' CREDENTIALS '';

--stl_tiered_storage_s3_blocks
--UNLOAD ('SELECT * FROM stl_tiered_storage_s3_blocks') TO '' CREDENTIALS '';


--stl_commit_stats
UNLOAD ('SELECT * FROM stl_commit_stats') TO '' CREDENTIALS '';

--svl_query_metrics_summary
UNLOAD ('SELECT * FROM svl_query_metrics_summary') TO '' CREDENTIALS '';
48 |
--------------------------------------------------------------------------------
/core/sql/aborted_queries.sql:
--------------------------------------------------------------------------------
/*AbortedQueries*/
-- Queries that failed during the replay window, with queue/execution timings.
-- {{START_TIME}} and {{END_TIME}} are substituted before execution.
CREATE TEMP TABLE aborted_queries AS (
SELECT q.user_id as "userid"
, case when q.result_cache_hit = 't' then 'Result Cache' else 'Default queue' end as "queue"
, date_trunc('hour', q.start_time) as "period"
, q.transaction_id as "xid"
, q.query_id as "query"
, q.query_text::char(50) as "querytxt"
, q.queue_time / 1000000.00 as "queue_s"
, q.execution_time / 1000000.00 as "exec_time_s" -- This includes compile time. Differs in behavior from provisioned metric
, case when q.status = 'failed' then 1 else 0 end "aborted"
, q.elapsed_time / 1000000.00 as "total_elapsed_s" -- This includes compile time. Differs in behavior from provisioned metric
FROM sys_query_history q
WHERE q.user_id > 1
AND q.start_time >={{START_TIME}}
AND q.start_time <={{END_TIME}}
AND q.query_text LIKE '%replay_start%'
AND q.status = 'failed'
);

-- Attach user names and return the aborted queries, slowest first.
SELECT a.userid,
b.usename,
a.queue,
a.period,
a.xid,
a.query,
a.querytxt,
a.queue_s,
a.exec_time_s,
a.aborted,
a.total_elapsed_s
FROM aborted_queries a
LEFT JOIN pg_user b ON a.userid = b.usesysid
ORDER BY a.total_elapsed_s DESC;
35 |
--------------------------------------------------------------------------------
/core/sql/cluster_level_metrics.sql:
--------------------------------------------------------------------------------
/*ClusterLevelMetrics*/
-- Percentile summaries (p25..p99, max, avg, stddev) of latency, execution
-- time, and queue time for successfully replayed queries in the window.
-- {{START_TIME}} and {{END_TIME}} are substituted before execution.
WITH queries AS
(
select q.user_id as "userid"
, date_trunc('hour', q.start_time) as "period"
, q.transaction_id as "xid"
, q.query_id as "query"
, q.query_text::char(50) as "querytxt"
, q.queue_time / 1000000.00 as "queue_s"
, q.execution_time / 1000000.00 as "exec_time_s" -- This includes compile time. Differs in behavior from provisioned metric
, case when q.status = 'failed' then 1 else 0 end "aborted"
, q.elapsed_time / 1000000.00 as "total_elapsed_s" -- This includes compile time. Differs in behavior from provisioned metric
FROM sys_query_history q
WHERE q.user_id > 1
AND q.start_time >= {{START_TIME}}
AND q.start_time <= {{END_TIME}}
AND q.query_text LIKE '%replay_start%'
AND q.status != 'failed'
),
elapsed_time AS
(
SELECT 'Query Latency' AS measure_type,
COUNT(*) AS query_count,
ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p25_s,
ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p50_s,
ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p75_s,
ROUND(PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p90_s,
ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p95_s,
ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY total_elapsed_s), 2) AS p99_s,
MAX(total_elapsed_s) AS max_s,
AVG(total_elapsed_s) AS avg_s,
stddev(total_elapsed_s) AS std_s
FROM queries
GROUP BY 1
),
exec_time AS
(
SELECT 'Execution Time' AS measure_type,
COUNT(*) AS query_count,
ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p25_s,
ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p50_s,
ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p75_s,
ROUND(PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p90_s,
ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p95_s,
ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY exec_time_s), 2) AS p99_s,
MAX(exec_time_s) AS max_s,
AVG(exec_time_s) AS avg_s,
stddev(exec_time_s) AS std_s
FROM queries
GROUP BY 1
),
queue_time AS
(
SELECT 'Queue Time' AS measure_type,
COUNT(*) AS query_count,
ROUND(PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY queue_s), 2) AS p25_s,
ROUND(PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY queue_s), 2) AS p50_s,
ROUND(PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY queue_s), 2) AS p75_s,
ROUND(PERCENTILE_CONT(0.90) WITHIN GROUP (ORDER BY queue_s), 2) AS p90_s,
ROUND(PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY queue_s), 2) AS p95_s,
ROUND(PERCENTILE_CONT(0.99) WITHIN GROUP (ORDER BY queue_s), 2) AS p99_s,
MAX(queue_s) AS max_s,
AVG(queue_s) AS avg_s,
stddev(queue_s) AS std_s
FROM queries
GROUP BY 1
)
-- Stack the three measure rows into a single result set.
SELECT measure_type,
query_count,
p25_s,
p50_s,
p75_s,
p90_s,
p95_s,
p99_s,
max_s,
avg_s,
std_s
FROM exec_time
UNION ALL
SELECT measure_type,
query_count,
p25_s,
p50_s,
p75_s,
p90_s,
p95_s,
p99_s,
max_s,
avg_s,
std_s
FROM queue_time
UNION ALL
SELECT measure_type,
query_count,
p25_s,
p50_s,
p75_s,
p90_s,
p95_s,
p99_s,
max_s,
avg_s,
std_s
FROM elapsed_time
ORDER BY 1;
--------------------------------------------------------------------------------
/core/sql/latency_distribution.sql:
--------------------------------------------------------------------------------
/*LatencyDistribution*/
-- Latency histogram for successfully replayed queries: buckets the latency
-- range up to the 98th percentile into 40 intervals (5 when <= 100 queries),
-- plus a tail bucket from p98 to the maximum.
WITH queries AS
(
SELECT q.query_id
, q.elapsed_time / 1000000.00 as total_elapsed_s
FROM sys_query_history q
WHERE q.user_id > 1
AND q.start_time >= {{START_TIME}}
AND q.start_time <= {{END_TIME}}
AND q.query_text LIKE '%replay_start%'
AND status != 'failed'
)
,
pct AS
(
SELECT ROUND(PERCENTILE_CONT(0.98) WITHIN GROUP (ORDER BY q1.total_elapsed_s), 2) AS p98_s,
COUNT(*) AS query_count,
MAX(q1.total_elapsed_s) max_s,
MIN(q1.total_elapsed_s) min_s,
MIN(CASE WHEN q1.total_elapsed_s = 0.00 THEN NULL ELSE q1.total_elapsed_s END) min_2s
FROM queries q1
),
bucket_count AS
(
SELECT CASE
WHEN query_count > 100 THEN 40
ELSE 5
END AS b_count
FROM pct
),
buckets AS
(
-- Evenly spaced bucket boundaries, generated via a ROW_NUMBER over pg_class.
SELECT (min_2s + ((n) * (p98_s / b_count))) AS sec_end,
n,
(min_2s + ((n - 1) * (p98_s / b_count))) AS sec_start
FROM (SELECT ROW_NUMBER() OVER () n FROM pg_class LIMIT 39),
bucket_count,
pct
WHERE sec_end <= p98_s
UNION ALL
SELECT min_2s AS sec_end,
0 AS n,
0.00 AS sec_start
FROM pct
UNION ALL
SELECT (max_s + 0.01) AS sec_end,
b_count AS n,
p98_s AS sec_start
FROM pct,
bucket_count
)
SELECT sec_end,
n,
sec_start,
COUNT(query_id)
FROM buckets
LEFT JOIN queries
ON total_elapsed_s >= sec_start
AND total_elapsed_s < sec_end
GROUP BY 1,
2,
3
ORDER BY 2;
--------------------------------------------------------------------------------
/core/sql/query_metrics.sql:
--------------------------------------------------------------------------------
/*QueryMetrics*/
-- Per-query timings for successfully replayed queries in the window, joined
-- to user names. {{START_TIME}} and {{END_TIME}} are substituted at runtime.
CREATE TEMP TABLE query_metrics AS (
select q.user_id as "userid"
, case when q.result_cache_hit = 't' then 'Result Cache' else 'Default queue' end as "queue"
, date_trunc('hour', q.start_time) as "period"
, q.transaction_id as "xid"
, q.query_id as "query"
, q.query_text::char(50) as "querytxt"
, q.queue_time / 1000000.00 as "queue_s"
, q.execution_time / 1000000.00 as "exec_time_s" -- This includes compile time. Differs in behavior from provisioned metric
, case when q.status = 'failed' then 1 else 0 end "aborted"
, q.elapsed_time / 1000000.00 as "total_elapsed_s" -- This includes compile time. Differs in behavior from provisioned metric
FROM sys_query_history q
WHERE q.user_id > 1
AND q.start_time >={{START_TIME}}
AND q.start_time <={{END_TIME}}
AND q.query_text LIKE '%replay_start%'
AND q.status != 'failed'
);

SELECT a.userid,
u.usename,
a.queue,
a.period,
a.xid,
a.query,
a.querytxt,
a.queue_s,
a.exec_time_s,
a.aborted,
a.total_elapsed_s
FROM query_metrics a
LEFT JOIN pg_user u on a.userid = u.usesysid;
34 |
--------------------------------------------------------------------------------
/core/sql/statement_types.sql:
--------------------------------------------------------------------------------
/*StatementTypes*/
-- Counts of replayed statements by statement type, with how many failed.
-- UTILITY statements are sub-classified by regex on the query text.
SELECT CASE
WHEN REGEXP_INSTR("query_text", '(padb_|pg_internal)')
THEN 'SYSTEM'
WHEN query_type = 'DELETE'
THEN 'DELETE'
WHEN query_type = 'COPY'
THEN 'COPY'
WHEN query_type = 'UPDATE'
THEN 'UPDATE'
WHEN query_type = 'INSERT'
THEN 'INSERT'
WHEN query_type = 'SELECT'
THEN 'SELECT'
WHEN query_type = 'UNLOAD'
THEN 'UNLOAD'
WHEN query_type = 'DDL'
THEN 'DDL'
WHEN query_type = 'UTILITY'
THEN CASE
WHEN REGEXP_INSTR("query_text", '[vV][aA][cC][uU][uU][mM][ :]')
THEN 'VACUUM'
WHEN REGEXP_INSTR("query_text", '[rR][oO][lL][lL][bB][aA][cC][kK] ')
THEN 'ROLLBACK'
WHEN REGEXP_INSTR("query_text", '[fF][eE][tT][cC][hH] ')
THEN 'FETCH'
WHEN REGEXP_INSTR("query_text", '[cC][uU][rR][sS][oO][rR] ')
THEN 'CURSOR'
ELSE 'UTILITY'
END
ELSE 'OTHER'
END statement_type
, COUNT(CASE
WHEN status = 'failed'
THEN 1
END) AS aborted
, COUNT(*) AS total_count
FROM sys_query_history
WHERE user_id > 1
AND query_text LIKE '%replay_start%'
AND start_time >= {{START_TIME}}
AND start_time <= {{END_TIME}}
GROUP BY
1
ORDER BY
2 DESC;
--------------------------------------------------------------------------------
/core/sql/sys_external_query_data.sql:
--------------------------------------------------------------------------------
/*SysExternalQueryData*/
-- SYS_EXTERNAL_QUERY_DETAIL rows for user_id > 1 within the replay window.
SELECT user_id,
query_id,
child_query_sequence,
transaction_id,
segment_id,
source_type,
start_time,
end_time,
duration,
total_partitions,
qualified_partitions,
scanned_files,
returned_rows,
returned_bytes,
file_format,
file_location,
external_query_text
from SYS_EXTERNAL_QUERY_DETAIL
WHERE user_id > 1
AND start_time >= {{START_TIME}}
AND start_time <= {{END_TIME}};
--------------------------------------------------------------------------------
/core/sql/sys_load_history.sql:
--------------------------------------------------------------------------------
/*SysLoadHistory*/
-- SYS_LOAD_HISTORY rows for user_id > 1 within the replay window.
SELECT user_id,
query_id,
status,
session_id,
transaction_id,
database_name,
table_name,
start_time,
end_time,
duration,
data_source,
loaded_rows,
loaded_bytes,
source_file_count,
source_file_bytes,
error_count
from SYS_LOAD_HISTORY
WHERE user_id > 1
AND start_time >= {{START_TIME}}
AND start_time <= {{END_TIME}};
22 |
--------------------------------------------------------------------------------
/core/sql/sys_query_history.sql:
--------------------------------------------------------------------------------
/*SysQueryHistory*/
-- sys_query_history joined to pg_user (for user names), for user_id > 1
-- within the replay window.
SELECT h.user_id,
u.usename as user_name,
query_id,
transaction_id,
session_id,
database_name,
start_time,
end_time,
elapsed_time,
status,
result_cache_hit,
queue_time,
execution_time,
query_text,
query_label,
query_type,
error_message,
returned_rows,
returned_bytes,
redshift_version
from sys_query_history h
LEFT JOIN pg_user u on h.user_id = u.usesysid
WHERE user_id > 1
AND start_time >= {{START_TIME}}
AND start_time <= {{END_TIME}};
--------------------------------------------------------------------------------
/core/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/core/tests/__init__.py
--------------------------------------------------------------------------------
/core/tests/test_cloudwatch_extractor.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest.mock import patch, mock_open
3 |
4 | from core.extract.cloudwatch_extractor import CloudwatchExtractor
5 |
6 |
def mock_cw_describe_log_groups(log_group_name=None, region=""):
    """Stub for aws_service.cw_describe_log_groups: one user-activity log group."""
    group = {"logGroupName": "useractivitylog"}
    return {"logGroups": [group]}
9 |
10 |
def mock_cw_describe_log_streams(log_group_name, region):
    """Stub for aws_service.cw_describe_log_streams: a single serverless stream."""
    stream = {"logStreamName": "redshift-serverless.test.us-east-1"}
    return {"logStreams": [stream]}
17 |
18 |
def mock_cw_get_paginated_logs(log_group_name, stream_name, start_time, end_time, region):
    """Stub for aws_service.cw_get_paginated_logs: yields no log events."""
    return list()
21 |
22 |
def mock_s3_upload():
    """Stub for the S3 upload helper: pretend success, return an empty key."""
    return str()
25 |
26 |
def mock_parse_log():
    """Stub for parse_log: does nothing and returns None."""
    return None
29 |
30 |
class CloudwatchExtractorTestCases(unittest.TestCase):
    """Unit tests for CloudwatchExtractor's CloudWatch log discovery and retrieval."""

    @patch("common.aws_service.cw_describe_log_groups", mock_cw_describe_log_groups)
    @patch("common.aws_service.cw_describe_log_streams", mock_cw_describe_log_streams)
    @patch.object(CloudwatchExtractor, "_read_cloudwatch_logs")
    def test_get_extract_from_cw_source_cluster_endpoint_specified(
        self, mock_read_cloudwatch_logs
    ):
        """With a source_cluster_endpoint configured, extraction should delegate
        to _read_cloudwatch_logs."""
        cw_extractor = CloudwatchExtractor(
            {
                "source_cluster_endpoint": "redshift-serverless.test.us-east-1",
                "workload_location": "s3://test/t",
            }
        )
        cw_extractor.get_extract_from_cloudwatch("2021-08-15T15:50", "2021-08-15T18:55")
        mock_read_cloudwatch_logs.assert_called()

    @patch("common.aws_service.cw_describe_log_groups", mock_cw_describe_log_groups)
    @patch("common.aws_service.cw_describe_log_streams", mock_cw_describe_log_streams)
    @patch.object(CloudwatchExtractor, "_read_cloudwatch_logs")
    def test_get_extract_from_cw_source_cluster_endpoint_not_specified(
        self, mock_read_cloudwatch_logs
    ):
        """With only log_location configured, extraction should still reach
        _read_cloudwatch_logs."""
        cw_extractor = CloudwatchExtractor({"log_location": "/aws/logs/"})
        cw_extractor.get_extract_from_cloudwatch("2021-08-15T15:50", "2021-08-15T18:55")
        mock_read_cloudwatch_logs.assert_called()

    def test_get_extract_from_cw_error(self):
        """With neither endpoint nor log location configured, extraction exits."""
        cw_extractor = CloudwatchExtractor({})
        with self.assertRaises(SystemExit):
            cw_extractor.get_extract_from_cloudwatch("2021-08-15T15:50", "2021-08-15T18:55")

    @patch("core.extract.cloudwatch_extractor.parse_log")
    @patch("gzip.open", mock_open())
    @patch("tempfile.TemporaryDirectory")
    @patch("common.aws_service.cw_get_paginated_logs", mock_cw_get_paginated_logs)
    @patch("common.aws_service.cw_describe_log_streams", mock_cw_describe_log_streams)
    def test_read_cloudwatch_logs_success(self, mock_tmp_dir, mock_parse_log):
        """_read_cloudwatch_logs should call parse_log once per log group."""
        cw_extractor = CloudwatchExtractor({})
        response = {
            "logGroups": [{"logGroupName": "useractivitylog"}, {"logGroupName": "connectionlog"}]
        }
        cw_extractor._read_cloudwatch_logs(
            response,
            "redshift-serverless.test.us-east-1",
            "2021-08-15T15:50",
            "2021-08-15T18:55",
            "us-east-1",
        )
        self.assertEqual(mock_parse_log.call_count, 2)
80 |
81 |
# Allow running this test module directly, outside a test runner.
if __name__ == "__main__":
    unittest.main()
84 |
--------------------------------------------------------------------------------
/core/tests/test_connections_parser.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import patch, mock_open
2 | import unittest
3 | from core.replay.connections_parser import parse_connections
4 |
# Shared parse_connections() arguments used by every test below.
time_interval_between_transactions = "all on"
time_interval_between_queries = "all on"
# Wildcard include / empty exclude: keep every connection.
filters = {
    "include": {"database_name": ["*"], "username": ["*"], "pid": ["*"]},
    "exclude": {"database_name": [], "username": [], "pid": []},
}
11 |
# Happy path: a complete connection-log entry with both session initiation
# and disconnection timestamps.
open_mock_1 = mock_open(
    read_data=(
        """[{
"session_initiation_time": "2023-01-09 15:48:15.313000+00:00",
"disconnection_time": "2023-01-09 15:48:15.872000+00:00",
"database_name": "dev",
"username": "awsuser",
"pid": "1073815778",
"application_name": "",
"time_interval_between_transactions": "True",
"time_interval_between_queries": "transaction"
}]"""
    )
)

# Empty session_initiation_time: the parser is expected to yield None for it.
open_mock_2 = mock_open(
    read_data=(
        """[{
"session_initiation_time": "",
"disconnection_time": "2023-01-09 15:48:15.872000+00:00",
"database_name": "dev",
"username": "awsuser",
"pid": "1073815778",
"application_name": "",
"time_interval_between_transactions": "True",
"time_interval_between_queries": "transaction"
}]"""
    )
)

# Empty disconnection_time: the parser is expected to yield None for it.
open_mock_3 = mock_open(
    read_data=(
        """
[
{
"session_initiation_time": "2023-01-09 15:48:15.313000+00:00",
"disconnection_time": "",
"database_name": "dev",
"username": "awsuser",
"pid": "1073815778",
"application_name": "",
"time_interval_between_transactions": "True",
"time_interval_between_queries": "transaction"
}
]
"""
    )
)

# Malformed entry (timestamp keys renamed): the parser is expected to drop it.
open_mock_4 = mock_open(
    read_data=(
        """
[
{
"session_initiation_time_error": "2023-01-09 15:48:15.313000+00:00",
"disconnection_time_error": "2023-01-09 15:48:15.872000+00:00",
"database_name": "dev",
"username": "awsuser",
"pid": "1073815778",
"application_name": "",
"time_interval_between_transactions": true,
"time_interval_between_queries": "transaction"
}
]
"""
    )
)
79 |
80 |
class TestConnectionsParser(unittest.TestCase):
    """Unit tests for core.replay.connections_parser.parse_connections."""

    @patch("core.replay.connections_parser.client")
    @patch("core.replay.connections_parser.json")
    def test_parse_connections(self, mock_json, mock_client):
        """S3 workload directory: connection JSON is fetched via the S3 client."""
        workload_directory = (
            "s3://test/extracts/Edited_Extraction_2023-01-23T09:46:24.784062+00:00"
        )
        mock_json.loads.return_value = [
            {
                "session_initiation_time": "2023-01-09 15:48:15.313000+00:00",
                "disconnection_time": "2023-01-09 15:48:15.872000+00:00",
                "database_name": "dev",
                "username": "awsuser",
                "pid": "1073815778",
                "application_name": "",
                "time_interval_between_transactions": True,
                "time_interval_between_queries": "transaction",
            }
        ]
        mock_client.get_object.return_value = mock_json

        connections, total_connections = parse_connections(
            workload_directory,
            time_interval_between_transactions,
            time_interval_between_queries,
            filters,
        )
        self.assertEqual(connections[0].pid, "1073815778")
        self.assertEqual(total_connections, 1)

    @patch("core.replay.connections_parser.open", open_mock_1)
    def test_parse_connections_s3_location(self):
        """Local workload directory read via the patched open() (despite the name)."""
        workload_directory = "testdata/testlocation"

        connections, total_connections = parse_connections(
            workload_directory,
            time_interval_between_transactions,
            time_interval_between_queries,
            filters,
        )
        self.assertEqual(connections[0].pid, "1073815778")
        self.assertEqual(total_connections, 1)

    @patch("core.replay.connections_parser.open", open_mock_2)
    def test_parse_connections_initiation_time(self):
        """Empty session_initiation_time is parsed as None."""
        workload_directory = "testdata/testlocation"

        connections, total_connections = parse_connections(
            workload_directory,
            time_interval_between_transactions,
            time_interval_between_queries,
            filters,
        )
        self.assertEqual(connections[0].session_initiation_time, None)
        self.assertEqual(total_connections, 1)

    @patch("core.replay.connections_parser.open", open_mock_3)
    def test_parse_connections_disconnection_time(self):
        """Empty disconnection_time is parsed as None."""
        workload_directory = "testdata/testlocation"

        connections, total_connections = parse_connections(
            workload_directory,
            time_interval_between_transactions,
            time_interval_between_queries,
            filters,
        )
        self.assertEqual(connections[0].disconnection_time, None)
        self.assertEqual(total_connections, 1)

    @patch("core.replay.connections_parser.open", open_mock_4)
    def test_parse_connections_except_case(self):
        """Entries missing the expected timestamp keys are skipped entirely."""
        workload_directory = "testdata/testlocation"

        connections, total_connections = parse_connections(
            workload_directory,
            time_interval_between_transactions,
            time_interval_between_queries,
            filters,
        )
        self.assertEqual(total_connections, 0)
161 |
--------------------------------------------------------------------------------
/core/tests/test_copy_replacements_parser.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest.mock import patch, Mock, mock_open
3 |
4 | from core.replay.copy_replacements_parser import parse_copy_replacements
5 |
6 |
class CopyReplacementsParserTests(unittest.TestCase):
    """Tests for parse_copy_replacements against S3-backed and local CSVs."""

    # Shared fixture: one header row plus one replacement row.
    CSV_DATA = "HeaderA,HeaderB,HeaderC\nA,B,C"

    @patch("common.aws_service.s3_client_get_object")
    def test_parse_copy_replacements_s3(self, patched_get_object):
        """An s3:// workload directory is fetched via the AWS service helper."""
        body = Mock()
        body.read.return_value = self.CSV_DATA.encode("utf-8")
        patched_get_object.return_value = {"Body": body}

        parsed = parse_copy_replacements("s3://test-bucket/test-folder/prefix")

        self.assertEqual(parsed, {"A": ["B", "C"]})

    def test_parse_copy_replacements_local(self):
        """A local workload directory is read via builtins.open."""
        with patch("builtins.open", mock_open(read_data=self.CSV_DATA)):
            parsed = parse_copy_replacements("/tmp")

        self.assertEqual(parsed, {"A": ["B", "C"]})


if __name__ == "__main__":
    unittest.main()
29 |
--------------------------------------------------------------------------------
/core/tests/test_extract_parser.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timezone
2 | from unittest import TestCase
3 | from unittest.mock import Mock
4 |
5 | from core.extract import extract_parser
6 |
7 | from core.replay.connections_parser import ConnectionLog
8 |
# Shared fixture values for the extract-parser tests below.
pid = "12324"
xid = "123142412"
# UTC-aware bounds of the extraction window under test.
start_time = datetime(2023, 1, 1, tzinfo=timezone.utc)
end_time = datetime(2023, 2, 1, tzinfo=timezone.utc)
13 |
14 |
class ExtractParserTestCases(TestCase):
    """Tests for core.extract.extract_parser.parse_log.

    Fix: the log-line fixtures were f-strings followed by a trailing
    .format(pid, xid) call. The f-string already interpolates those values,
    so the .format() was a no-op and has been removed.
    """

    def test_parse_log_useractivitylog(self):
        """Only the parseable SELECT line is kept; the stored-proc call is skipped."""
        mock_file = Mock()
        mock_file.readlines.return_value = [
            # valid log line
            f"'2023-01-01T00:00:00Z UTC [ db=testdb user=testuser pid={pid} userid=4 xid={xid} ]' LOG: SELECT * FROM TEST_TABLE LIMIT 10;".encode(),
            # invalid log line
            f"'2023-01-01T01:00:00Z UTC [ db=testdb user=testuser pid={pid} userid=4 xid={xid} ]' LOG: call test.set($1, $2);".encode(),
        ]
        logs = {}
        extract_parser.parse_log(
            mock_file, "useractivitylog", {}, {}, logs, set(), start_time, end_time
        )
        self.assertEqual(len(logs), 1)
        for key, value in logs.items():
            self.assertEqual(len(value), 1)
            log = value[0]
            self.assertEqual(log.xid, xid)
            self.assertEqual(log.pid, pid)
            self.assertEqual(log.text, "SELECT * FROM TEST_TABLE LIMIT 10;")

    def test_parse_log_connectionlog(self):
        """Initiation/disconnect/set-application-name lines populate connections."""
        mock_file = Mock()
        set_application_name_line = f"set application_name |Sun, 01 Jan 2023 01:05:07:124|[local] |{xid} |{pid}|testdb |testuser |test |12312|TLSv1.2 |TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384 |0| | | |JDBC-1.2.54.1082 |Linux 5.4.0-1086-aws amd64 |Amazon Redshift JDBC Driver 1.2.54.1082 |none |0|02d54c77-8302-4ae6-8e83".encode()
        mock_file.readlines.return_value = [
            f"initiating session |Sun, 01 Jan 2023 00:00:12:212|[local] | |{pid}|testdb |testuser |Ident |0| | |0| | | | | | | |0|03e74c8e-c3cb-4a98-a3d9".encode(),
            f"disconnecting session |Sun, 01 Jan 2023 00:02:21:471|[local] | |{pid}|testdb |testuser |Ident |7460885| | |0| | | | | | | |0|03e74c8e-c3cb-4a98-a3d9".encode(),
            set_application_name_line,
        ]
        connections = {}
        event_time = datetime.strptime(
            "Sun, 01 Jan 2023 01:05:07:124", "%a, %d %b %Y %H:%M:%S:%f"
        ).replace(tzinfo=timezone.utc)
        last_connection = ConnectionLog(event_time, end_time, "testdb", "testuser", pid)
        last_connections = {hash(set_application_name_line): last_connection.get_pk()}
        extract_parser.parse_log(
            mock_file,
            "connectionlog",
            connections,
            last_connections,
            {},
            set(),
            start_time,
            end_time,
        )
        # The original test only print()-ed connections.values()[0]; that
        # indexing implicitly required a non-empty dict, so assert it here
        # explicitly instead of printing.
        self.assertTrue(connections)
        self.assertTrue(last_connections)
72 |
--------------------------------------------------------------------------------
/core/tests/test_local_extractor.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest.mock import patch, Mock
3 |
4 | from core.extract.local_extractor import LocalExtractor
5 |
6 |
class LocalExtractorTestCases(unittest.TestCase):
    """Tests for LocalExtractor.get_extract_locally with the filesystem stubbed."""

    def test_extract_locally(self):
        """Listing, gunzipping and parsing are all invoked for a local extract."""
        with patch("os.listdir") as listdir_mock, patch(
            "gzip.open"
        ) as gzip_open_mock, patch(
            "core.extract.extract_parser.parse_log"
        ) as parse_log_mock:
            listdir_mock.return_value = [
                "start_node.log.gz",
                "useractivity.log.gz",
                "connections.log.gz",
            ]
            gzip_open_mock.return_value = Mock()
            parse_log_mock.return_value = None
            extractor = LocalExtractor({})
            extractor.get_extract_locally(
                "test", "2022-11-16T00:00:00", "2022-11-18T00:00:00"
            )
            self.assertTrue(listdir_mock.called)
            self.assertTrue(gzip_open_mock.called)
            self.assertTrue(parse_log_mock.called)


if __name__ == "__main__":
    unittest.main()
28 |
--------------------------------------------------------------------------------
/core/tests/test_s3_extractor.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import unittest
3 | from unittest.mock import patch, Mock
4 |
5 | from core.extract.s3_extractor import S3Extractor
6 |
7 |
def mock_sync_s3_get_bucket_contents(bucket, prefix):
    """Stub the bucket listing: two fixed audit-log object keys."""
    keys = [
        "s3://bucket/cluster_connectionlog_2021-08-15T15:00.gz",
        "s3://bucket/cluster_useractivitylog_2021-08-15T19:00.gz",
    ]
    return [{"Key": key} for key in keys]


def mock_get_logs_in_range(audit_objects, start_time, end_time):
    """Stub the time-window filter: pretend two log files match."""
    return ["A", "B"]


def mock_s3_get_object(bucket, filename):
    """Stub S3 object retrieval: get() yields an empty body."""
    fake_object = Mock()
    fake_object.get = Mock(return_value={"Body": ""})
    return fake_object


def mock_parse_log(
    log_file,
    filename,
    connections,
    last_connections,
    logs,
    databases,
    start_time,
    end_time,
):
    """Stub the parser: do nothing."""
    return None
36 |
37 |
class S3ExtractorTestCases(unittest.TestCase):
    """Tests for S3Extractor.get_extract_from_s3 with all AWS access stubbed."""

    @patch("core.extract.s3_extractor.get_logs_in_range", mock_get_logs_in_range)
    @patch("common.aws_service.sync_s3_get_bucket_contents", mock_sync_s3_get_bucket_contents)
    @patch("common.aws_service.s3_get_object", mock_s3_get_object)
    # Fix: patch the parse_log FUNCTION. The previous target
    # ("core.extract.extract_parser") replaced the entire module object
    # with a bare function, so any attribute access on it would fail.
    @patch("core.extract.extract_parser.parse_log", mock_parse_log)
    def test_get_extract_from_s3(self):
        s3_extractor = S3Extractor({})
        # Fix: pass the datetimes themselves. The previous .utcoffset()
        # calls returned None because fromisoformat() on these strings
        # yields naive datetimes, so start/end were silently None.
        s3_extractor.get_extract_from_s3(
            "test_bucket",
            "test",
            datetime.datetime.fromisoformat("2021-08-15T15:50"),
            datetime.datetime.fromisoformat("2021-08-15T18:55"),
        )


if __name__ == "__main__":
    unittest.main()
55 |
--------------------------------------------------------------------------------
/core/tests/test_stats.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from unittest.mock import patch, call
3 | from core.replay.stats import percent, print_stats, display_stats, init_stats, collect_stats
4 | from multiprocessing.managers import SyncManager
5 |
6 |
# Per-process stats sample with both successes and failures recorded.
stats = dict(
    connection_diff_sec=1.734,
    query_success=10,
    query_error=2,
    transaction_success=10,
    transaction_error=2,
    transaction_error_log={"test": 3},
    connection_error_log={"conn_test": 4},
)

# Aggregate accumulator with smaller offsets and empty error logs, so
# collect_stats() has something to merge into.
aggregated_stats = dict(
    connection_diff_sec=1,
    query_success=10,
    query_error=2,
    transaction_success=0,
    transaction_error=0,
    transaction_error_log={},
    connection_error_log={},
)
26 |
27 |
class TestStats(unittest.TestCase):
    """Tests for percent/print_stats/display_stats/init_stats/collect_stats.

    Fix: patch targets now use the full "core.replay.stats" module path,
    matching the import at the top of this file and the patch style used by
    every sibling test module (the old "replay.stats" target only resolved
    when core/ happened to be on sys.path).
    """

    def test_percentage(self):
        # Zero denominator must return 0, not raise ZeroDivisionError.
        self.assertEqual(percent(1, 0), 0)

    def test_percentage_with_non_zero_den(self):
        self.assertEqual(percent(10, 100), 10)

    def test_print_stats(self):
        # A list without the key 0 takes the early-exit path and returns None.
        self.assertIsNone(print_stats([1]))

    @patch("core.replay.stats.logger.debug")
    def test_print_stats_zero_in_stats(self, mock_logger):
        """Per-process and aggregate connection offsets are logged via debug."""
        stats = {
            0: {"connection_diff_sec": 1.734, "query_success": 10, "query_error": 2},
            1: {"connection_diff_sec": 1},
        }

        print_stats(stats)

        mock_logger.assert_has_calls(
            [
                call("[0] Max connection offset: +1.734 sec"),
                call("[1] Max connection offset: +1.000 sec"),
                call("Max connection offset: +1.734 sec"),
            ]
        )

    @patch("core.replay.stats.logger.info")
    def test_display_stats(self, mock_logger):
        manager = SyncManager()
        manager.start()
        # Fix: stop the manager's server process even if the test fails;
        # previously it was never shut down.
        self.addCleanup(manager.shutdown)
        stats = {
            "connection_diff_sec": 1.734,
            "query_success": 10,
            "query_error": 2,
        }
        peak_conn = manager.Value(int, 3)

        display_stats(stats, 100, peak_conn)

        mock_logger.assert_called_once_with(
            "Queries executed: 12 of 100 (12.0%) [Success: 10 (83.3%), Failed: 2 (16.7%), Peak connections: 3]"
        )

    def test_init_stats(self):
        initialized = init_stats({})
        self.assertEqual(initialized["connection_diff_sec"], 0)
        self.assertEqual(initialized["connection_error_log"], {})

    def test_collect_stats_not_stats(self):
        # Merging an empty per-process dict is a no-op returning None.
        self.assertIsNone(collect_stats({}, {}))

    def test_collect_stats(self):
        collect_stats(aggregated_stats, stats)
        self.assertEqual(aggregated_stats["connection_diff_sec"], stats["connection_diff_sec"])
        self.assertEqual(aggregated_stats["transaction_success"], stats["transaction_success"])
        self.assertEqual(aggregated_stats["transaction_error_log"], stats["transaction_error_log"])
102 |
--------------------------------------------------------------------------------
/core/tests/test_unload_sys_table.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import patch, mock_open, MagicMock
2 | from core.replay.unload_sys_table import UnloadSysTable
3 | from core.replay.prep import ReplayPrep
4 | import unittest
5 |
6 |
# Replay configuration fixture for UnloadSysTable.
config = {
    # workload identity and location
    "tag": "",
    "workload_location": "test-location/extract",
    # target cluster
    "target_cluster_endpoint": "test-redshift-test-testing.us-east-1.redshift.amazonaws.com:5439/dev",
    "target_cluster_region": "us-east-1",
    "master_username": "awsuser",
    "nlb_nat_dns": None,
    "odbc_driver": None,
    "default_interface": "psql",
    # replay behaviour toggles
    "time_interval_between_transactions": "all off",
    "time_interval_between_queries": "all off",
    "execute_copy_statements": "false",
    "execute_unload_statements": "false",
    # output locations
    "replay_output": "s3://location/replay",
    "analysis_output": "s3://location/analysis",
    # system-table unload settings
    "unload_system_table_queries": "unload_system_tables.sql",
    "target_cluster_system_table_unload_iam_role": "arn:iam:role/test",
}

replay_id = "2023-02-13T04:59:40.864968+00:00_test-redshift-test-testing_76f32"

# Mocked SQL files: a plain select, an explicit UNLOAD, and a second plain
# select (reused by the error-path test so read state is not shared).
_PLAIN_SELECT_SQL = "--stl_test\nselect * from stl_test"
file = mock_open(read_data=_PLAIN_SELECT_SQL)
file_2 = mock_open(
    read_data="--stl_unload\nunload (select * from stl_unload) to '' credentials ''"
)
file_3 = mock_open(read_data=_PLAIN_SELECT_SQL)
33 |
# Shared happy-path connection: cursor.execute() succeeds and returns True.
conn = MagicMock()
cursor = MagicMock()

cursor.execute.return_value = True
conn.cursor.return_value = cursor


def mock_get_connection_cred(self, val):
    """Stub ReplayPrep.get_connection_credentials with static credentials."""
    return {
        "host": "somehost",
        "port": 5437,
        "username": "myname",
        "password": "cantshare",
        "database": "idk",
        "odbc_driver": None,
    }


def mock_db_connect(interface, host, port, username, password, database, odbc_driver):
    """Stub db_connect returning the shared happy-path connection."""
    return conn


def mock_db_connect_error(interface, host, port, username, password, database, odbc_driver):
    """Stub db_connect whose cursor raises KeyError on execute().

    Fix: build a dedicated connection/cursor pair instead of setting
    side_effect on the SHARED `cursor` above — the old in-place mutation
    leaked the KeyError into every later test that reused `conn`.
    """
    err_conn = MagicMock()
    err_cursor = MagicMock()
    err_cursor.execute.side_effect = KeyError
    err_conn.cursor.return_value = err_cursor
    return err_conn
61 |
62 |
class TestReplay(unittest.TestCase):
    """Tests for UnloadSysTable.unload_system_table with all DB access stubbed."""

    @patch.object(ReplayPrep, "get_connection_credentials", mock_get_connection_cred)
    @patch("core.replay.unload_sys_table.db_connect", mock_db_connect)
    @patch("core.replay.unload_sys_table.logger.debug")
    @patch("builtins.open", file)
    @patch("core.replay.prep.boto3")
    def test_unload_system_table(self, boto_mock, debug_mock):
        """A plain select file is executed and logged as an unload query."""
        UnloadSysTable(config, replay_id).unload_system_table()

        debug_mock.assert_called_once_with("Executed unload query: stl_test")

    @patch.object(ReplayPrep, "get_connection_credentials", mock_get_connection_cred)
    @patch("core.replay.unload_sys_table.db_connect", mock_db_connect)
    @patch("core.replay.unload_sys_table.logger.debug")
    @patch("builtins.open", file_2)
    @patch("core.replay.prep.boto3")
    def test_unload_system_table_with_unload_query(self, boto_mock, debug_mock):
        """An explicit UNLOAD statement is executed and logged."""
        UnloadSysTable(config, replay_id).unload_system_table()

        debug_mock.assert_called_once_with("Executed unload query: stl_unload")

    @patch.object(ReplayPrep, "get_connection_credentials", mock_get_connection_cred)
    @patch("core.replay.unload_sys_table.db_connect", mock_db_connect_error)
    @patch("core.replay.unload_sys_table.logger.error")
    @patch("builtins.open", file_3)
    @patch("core.replay.prep.boto3")
    def test_unload_system_table_with_error(self, boto_mock, error_mock):
        """An execute() failure is caught and reported via logger.error."""
        UnloadSysTable(config, replay_id).unload_system_table()

        error_mock.assert_called_once_with("Failed to unload query. ")
98 |
--------------------------------------------------------------------------------
/core/util/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/core/util/__init__.py
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | boto3==1.24.24
2 | botocore==1.27.24
3 | Flask==2.2.5
4 | matplotlib==3.5.2
5 | moto==3.1.16
6 | numpy==1.26.4
7 | pandas
8 | python-dateutil==2.8.1
9 | PyYAML==6.0
10 | redshift-connector
11 | reportlab==3.6.13
12 | sqlparse==0.4.2
13 | tabulate==0.8.10
14 | tqdm==4.59.0
15 | coverage
16 | pre-commit
17 | black
18 | flake8
19 | pytest
20 |
--------------------------------------------------------------------------------
/tools/ExternalObjectReplicator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/ExternalObjectReplicator/__init__.py
--------------------------------------------------------------------------------
/tools/ExternalObjectReplicator/sql/external_table_query.sql:
--------------------------------------------------------------------------------
-- External (Spectrum) tables referenced by queries in the extraction window.
-- Fix: use '||' for string concatenation in the LIKE patterns; '+' is not a
-- string concatenation operator in Amazon Redshift.
SELECT DISTINCT
    cast(q.database as varchar(100)) as rs_db
    ,cast(v.table_schema as varchar(100)) as rs_external_schema
    ,cast(v.table_name as varchar(100)) as external_table
    ,cast(es.databasename as varchar(100)) as glue_databasename
FROM stl_query q
JOIN svl_s3query_summary s
    ON q.query = s.query
JOIN svv_tables v
    on s.external_table_name like '%' || v.table_schema || '%'
    and s.external_table_name like '%' || v.table_catalog || '%'
    and s.external_table_name like '%' || v.table_name || '%'
join svv_external_schemas es
    on es.schemaname like v.table_schema
WHERE q.userid > 1 and v.table_type = 'EXTERNAL TABLE'
    and q.starttime >= cast('{start}' as datetime)
    and q.starttime <= cast('{end}' as datetime)
    AND q.DATABASE = '{db}';
--------------------------------------------------------------------------------
/tools/ExternalObjectReplicator/sql/stl_load_query.sql:
--------------------------------------------------------------------------------
-- Distinct S3 file names committed by COPY loads whose queries ran in the
-- window for the given database, including manifest-related queries.
SELECT DISTINCT trim(filename) AS filename
FROM stl_load_commits
WHERE query IN
        (SELECT DISTINCT query
         FROM stl_query
         WHERE starttime >= cast('{start}' AS datetime)
           AND starttime <= cast('{end}' AS datetime)
           AND database = '{db}')
   OR query IN
        (SELECT DISTINCT query
         FROM stl_query
         WHERE starttime >= cast('{start}' AS datetime)
           AND starttime <= cast('{end}' AS datetime)
           AND (querytxt LIKE '%manifest%' OR querytxt LIKE '%Manifest%' OR querytxt LIKE '%MANIFEST%')
           AND database = '{db}')
ORDER BY 1;
15 |
--------------------------------------------------------------------------------
/tools/ExternalObjectReplicator/sql/svl_s3_list.sql:
--------------------------------------------------------------------------------
-- Distinct S3 bucket/prefix pairs scanned by Spectrum queries in the window.
SELECT DISTINCT bucket, prefix
FROM svl_s3list
WHERE query IN
      (SELECT DISTINCT query
       FROM stl_query
       WHERE userid > 1
         AND starttime >= cast('{start}' AS datetime)
         AND starttime <= cast('{end}' AS datetime)
         AND database = '{db}')
ORDER BY 1;
9 |
--------------------------------------------------------------------------------
/tools/ExternalObjectReplicator/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/ExternalObjectReplicator/tests/__init__.py
--------------------------------------------------------------------------------
/tools/ExternalObjectReplicator/util/glue_util.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import uuid
3 |
4 | from moto.glue.exceptions import DatabaseNotFoundException, TableNotFoundException
5 |
6 | import common.aws_service as aws_helper
7 |
8 | from tqdm import tqdm
9 | from common.util import bucket_dict
10 |
11 | logger = logging.getLogger("ExternalObjectReplicatorLogger")
12 |
13 |
def clone_glue_catalog(records, dest_location, region):
    """Clone the Glue databases and tables referenced by system-table records.

    record[3]['stringValue'] holds the Glue database name and
    record[2]['stringValue'] the external table name.
    @param records:
    @param region
    @param dest_location
    @return: list of newly created clone database names
    """
    # A uuid prefix keeps each run's clone databases distinct.
    run_id = uuid.uuid1()
    cloned_dbs = []
    seen_dbs = []
    progress = tqdm(range(len(records)))
    for idx in progress:
        db_name = records[idx][3]["stringValue"]
        table_name = records[idx][2]["stringValue"]
        cloned_db = f"{run_id}-{db_name}"
        progress.set_description_str(
            f"Cloning {table_name} in {db_name} - {idx + 1} out of {len(records)} glue objects"
        )
        # Create the clone database only the first time we see this source db.
        if db_name not in seen_dbs:
            database_copy(cloned_db, db_name, table_name, region)
            seen_dbs.append(db_name)
            cloned_dbs.append(cloned_db)
        glue_table_copy(db_name, cloned_db, table_name, dest_location, region)
    logger.debug(f"New Glue database created: {cloned_dbs}.")
    logger.info("== Finished cloning Glue databases and tables ==")
    return cloned_dbs
45 |
46 |
def database_copy(new_glue_db, original_glue_db, original_glue_table, region):
    """Ensure the clone database exists in Glue, creating it when missing.

    Any error other than "database not found" is fatal and exits.

    Parameters
    ----------
    region
    original_glue_table
    original_glue_db
    new_glue_db
    """
    try:
        aws_helper.glue_get_database(name=new_glue_db, region=region)
    except DatabaseNotFoundException:
        # Clone does not exist yet; create it.
        aws_helper.glue_create_database(
            new_glue_db, "Database clone created by External Object Replicator", region
        )
    except Exception as e:
        logger.error(f"Error doing database copy in Glue: {e}")
        exit(-1)

    return original_glue_db, new_glue_db, original_glue_table
70 |
71 |
def glue_table_copy(original_glue_db, new_glue_db, original_glue_table, dest_location, region):
    """Check if the glue table exists in the new glue database; if not, create
    the table structure (with partition indexes when present).

    @param original_glue_db:
    @param new_glue_db:
    @param original_glue_table:
    @param dest_location
    @param region
    @return: the table's S3 location (existing, or derived under dest_location)
    """
    dest_bucket = bucket_dict(dest_location)["bucket_name"]
    try:
        table_get_response = aws_helper.glue_get_table(
            database=new_glue_db, table=original_glue_table, region=region
        )
        new_s3_loc = table_get_response["Table"]["StorageDescriptor"]["Location"]
    except TableNotFoundException:
        table_get_response = aws_helper.glue_get_table(
            database=original_glue_db,
            table=original_glue_table,
            region=region,
        )
        index_response = aws_helper.glue_get_partition_indexes(
            database=original_glue_db, table=original_glue_table, region=region
        )
        # Re-root the source table's data path under the destination bucket.
        orig_s3_loc = table_get_response["Table"]["StorageDescriptor"]["Location"].split("/")
        new_s3_loc = f"{dest_bucket}/spectrumfiles/{'/'.join(orig_s3_loc[2:])}"
        table_input = {
            "Name": table_get_response["Table"]["Name"],
            "Description": "For use with Redshfit candidate release testing",  # (sic)
            "StorageDescriptor": {
                "Columns": table_get_response["Table"]["StorageDescriptor"]["Columns"],
                "Location": new_s3_loc,
            },
            "PartitionKeys": table_get_response["Table"]["PartitionKeys"],
        }
        # BUG FIX: dict.update() returns None, so the previous code passed
        # table_input=None to glue_create_table when partition indexes were
        # present. Mutate the dict first, then pass it.
        if index_response["PartitionIndexDescriptorList"]:
            table_input["PartitionIndexes"] = index_response["PartitionIndexDescriptorList"]
        aws_helper.glue_create_table(
            new_database=new_glue_db, table_input=table_input, region=region
        )
    except Exception as e:
        logger.error(f"Failed to copy table in Glue: {e}")
        exit(-1)
    # Fix: return the location on the table-already-exists path too
    # (previously only the not-found branch returned it).
    return new_s3_loc
125 |
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/IAM_Permissions.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/IAM_Permissions.pdf
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/__init__.py
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/bootstrap_scripts/extract_bootstrap.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Bootstrap an EC2/Batch host, clone redshift-test-drive, configure
# config/extract.yaml from environment variables, and run the extract.
set -e
echo "bucket_name: $BUCKET_NAME"
echo "simple_replay_extract_overwrite_s3_path: $SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH"
echo "simple_replay_log_location: $SIMPLE_REPLAY_LOG_LOCATION"
echo "redshift_user_name: $REDSHIFT_USER_NAME"
echo "what_if_timestamp: $WHAT_IF_TIMESTAMP"
echo "simple_replay_extract_start_time: $SIMPLE_REPLAY_EXTRACT_START_TIME"
echo "simple_replay_extract_end_time: $SIMPLE_REPLAY_EXTRACT_END_TIME"
echo "extract_prefix: $EXTRACT_PREFIX"
echo "script_prefix: $SCRIPT_PREFIX"

yum update -y
yum -y install git
yum -y install python3
yum -y install python3-pip
yum -y install aws-cfn-bootstrap
yum -y install gcc gcc-c++ python3 python3-devel unixODBC unixODBC-devel
mkdir amazonutils
cd amazonutils
git clone https://github.com/aws/redshift-test-drive.git
cd redshift-test-drive
make setup
if [[ "$SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH" != "N/A" ]]; then
  # Fix: quote the S3 path so whitespace/glob characters in the env var
  # cannot word-split or expand the argument.
  aws s3 cp "$SIMPLE_REPLAY_EXTRACT_OVERWRITE_S3_PATH" config/extract.yaml
fi
WORKLOAD_LOCATION="s3://${BUCKET_NAME}/${EXTRACT_PREFIX}/${WHAT_IF_TIMESTAMP}"
sed -i "s#master_username: \".*\"#master_username: \"$REDSHIFT_USER_NAME\"#g" config/extract.yaml
sed -i "s#log_location: \".*\"#log_location: \"$SIMPLE_REPLAY_LOG_LOCATION\"#g" config/extract.yaml
sed -i "s#workload_location: \".*\"#workload_location: \"$WORKLOAD_LOCATION\"#g" config/extract.yaml
sed -i "s#start_time: \".*\"#start_time: \"$SIMPLE_REPLAY_EXTRACT_START_TIME\"#g" config/extract.yaml
sed -i "s#end_time: \".*\"#end_time: \"$SIMPLE_REPLAY_EXTRACT_END_TIME\"#g" config/extract.yaml
# Fix: quote the destination as well for the same reason.
aws s3 cp config/extract.yaml "s3://$BUCKET_NAME/$SCRIPT_PREFIX/"
make extract
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/bootstrap_scripts/performance_test_bootstrap.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Bootstrap a base amazonlinux image for the Redshift Node Config
# concurrency test:
#   1. install the AWS CLI, Python 3 and the required Python libraries,
#   2. download the concurrency-test program from S3,
#   3. run it.
# All configuration is supplied as environment variables by the Batch job.

set -eu

yum install -y awscli python3
pip3 install boto3 psycopg2-binary pandas sqlalchemy

aws s3 cp "$PYTHON_SCRIPT" ./script.py

# script.py reads these environment variables:
#   $SQL_SCRIPT_S3_PATH, $REDSHIFT_CLUSTER_ENDPOINT,
#   $REDSHIFT_IAM_ROLE, $BUCKET_NAME, $REDSHIFT_USER_NAME
python3 ./script.py
21 |
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/bootstrap_scripts/replay_bootstrap.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Bootstrap a host, clone redshift-test-drive, configure config/replay.yaml
# from environment variables, and run the replay against the target cluster.
set -e
echo "bucket_name: $BUCKET_NAME"
echo "simple_replay_overwrite_s3_path: $SIMPLE_REPLAY_OVERWRITE_S3_PATH"
echo "redshift_user_name: $REDSHIFT_USER_NAME"
echo "what_if_timestamp: $WHAT_IF_TIMESTAMP"
echo "extract_prefix: $EXTRACT_PREFIX"
echo "replay_prefix: $REPLAY_PREFIX"
echo "script_prefix: $SCRIPT_PREFIX"
echo "redshift_iam_role: $REDSHIFT_IAM_ROLE"
echo "workload_location: $WORKLOAD_LOCATION"
echo "cluster_endpoint: $CLUSTER_ENDPOINT"
echo "cluster_identifier: $CLUSTER_IDENTIFIER"
echo "execute_unload_statements: $SIMPLE_REPLAY_UNLOAD_STATEMENTS"
echo "snapshot_account_id: $SNAPSHOT_ACCOUNT_ID"
# Fix: modern $(...) command substitution instead of backticks.
account_id=$(aws sts get-caller-identity --query Account --output text)
echo "account_id: $account_id"
echo "endpoint_type: $ENDPOINT_TYPE"
# The region is the third dot-separated field of the cluster endpoint.
TARGET_CLUSTER_REGION=$(echo "$CLUSTER_ENDPOINT" | cut -f3 -d'.')
yum update -y
yum -y install git
yum -y install python3
yum -y install python3-pip
yum -y install aws-cfn-bootstrap
yum -y install gcc gcc-c++ python3 python3-devel unixODBC unixODBC-devel
mkdir amazonutils
cd amazonutils
git clone https://github.com/aws/redshift-test-drive.git
cd redshift-test-drive
make setup
if [[ "$SIMPLE_REPLAY_OVERWRITE_S3_PATH" != "N/A" ]]; then
  # Fix: quote S3 paths so whitespace/glob characters cannot split arguments.
  aws s3 cp "$SIMPLE_REPLAY_OVERWRITE_S3_PATH" config/replay.yaml
fi

sed -i "s#master_username: \".*\"#master_username: \"$REDSHIFT_USER_NAME\"#g" config/replay.yaml
sed -i "s#unload_iam_role: \".*\"#unload_iam_role: \"$REDSHIFT_IAM_ROLE\"#g" config/replay.yaml
sed -i "s#workload_location: \".*\"#workload_location: \"$WORKLOAD_LOCATION\"#g" config/replay.yaml
sed -i "s#target_cluster_endpoint: \".*\"#target_cluster_endpoint: \"$CLUSTER_ENDPOINT\"#g" config/replay.yaml
sed -i "s#target_cluster_region: \".*\"#target_cluster_region: \"$TARGET_CLUSTER_REGION\"#g" config/replay.yaml
sed -i "s#analysis_iam_role: \".*\"#analysis_iam_role: \"$REDSHIFT_IAM_ROLE\"#g" config/replay.yaml
sed -i "s#analysis_output: \".*\"#analysis_output: \"$WORKLOAD_LOCATION\"#g" config/replay.yaml

# Fix: use [[ ]] consistently; '==' inside single brackets is a bashism.
if [[ "$SIMPLE_REPLAY_UNLOAD_STATEMENTS" == "true" ]]; then
  sed -i "s#unload_iam_role: \".*\"#unload_iam_role: \"$REDSHIFT_IAM_ROLE\"#g" config/replay.yaml
  sed -i "s#replay_output: \".*\"#replay_output: \"s3://$BUCKET_NAME/$REPLAY_PREFIX/$WHAT_IF_TIMESTAMP/$CLUSTER_IDENTIFIER\"#g" config/replay.yaml
fi


if [[ "$account_id" == "$SNAPSHOT_ACCOUNT_ID" ]]; then
  sed -i "s#execute_copy_statements: \"false\"#execute_copy_statements: \"true\"#g" config/replay.yaml
  aws s3 cp "$WORKLOAD_LOCATION/copy_replacements.csv" . || true
  sed -z -i "s#,,\n#,,$REDSHIFT_IAM_ROLE\n#g" copy_replacements.csv || true
  aws s3 cp copy_replacements.csv "$WORKLOAD_LOCATION/copy_replacements.csv" || true
fi
aws s3 cp config/replay.yaml "s3://$BUCKET_NAME/$SCRIPT_PREFIX/replay_$CLUSTER_IDENTIFIER.yaml"
make replay
if [[ "$ENDPOINT_TYPE" == "SERVERLESS" ]]; then
  aws s3 cp "s3://$BUCKET_NAME/$SCRIPT_PREFIX/system_config.json" .
  aws s3 cp "s3://$BUCKET_NAME/$SCRIPT_PREFIX/create_external_schema.py" .
  python3 tools/NodeConfigCompare/python_scripts/create_external_schema.py
fi
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/configuration/parameter_group_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "Parameters": [
3 | {
4 | "ParameterName": "auto_analyze",
5 | "ParameterValue": "true",
6 | "Description": "Use auto analyze",
7 | "Source": "engine-default",
8 | "DataType": "boolean",
9 | "AllowedValues": "true,false",
10 | "ApplyType": "static",
11 | "IsModifiable": true
12 | },
13 | {
14 | "ParameterName": "datestyle",
15 | "ParameterValue": "ISO, MDY",
16 | "Description": "Sets the display format for date and time values.",
17 | "Source": "engine-default",
18 | "DataType": "string",
19 | "ApplyType": "static",
20 | "IsModifiable": true
21 | },
22 | {
23 | "ParameterName": "enable_case_sensitive_identifier",
24 | "ParameterValue": "false",
25 | "Description": "Preserve case sensitivity for database identifiers such as table or column names in parser",
26 | "Source": "engine-default",
27 | "DataType": "boolean",
28 | "AllowedValues": "true,false",
29 | "ApplyType": "static",
30 | "IsModifiable": true
31 | },
32 | {
33 | "ParameterName": "enable_user_activity_logging",
34 | "ParameterValue": "false",
35 | "Description": "parameter for audit logging purpose",
36 | "Source": "user",
37 | "DataType": "boolean",
38 | "AllowedValues": "true,false",
39 | "ApplyType": "static",
40 | "IsModifiable": true
41 | },
42 | {
43 | "ParameterName": "extra_float_digits",
44 | "ParameterValue": "0",
45 | "Description": "Sets the number of digits displayed for floating-point values",
46 | "Source": "engine-default",
47 | "DataType": "integer",
48 | "AllowedValues": "-15-2",
49 | "ApplyType": "static",
50 | "IsModifiable": true
51 | },
52 | {
53 | "ParameterName": "max_concurrency_scaling_clusters",
54 | "ParameterValue": "2",
55 |       "Description": "The maximum number of concurrency scaling clusters that can be used.",
56 | "Source": "user",
57 | "DataType": "integer",
58 | "AllowedValues": "0-10",
59 | "ApplyType": "static",
60 | "IsModifiable": true
61 | },
62 | {
63 | "ParameterName": "max_cursor_result_set_size",
64 | "ParameterValue": "default",
65 | "Description": "Sets the max cursor result set size",
66 | "Source": "engine-default",
67 | "DataType": "integer",
68 | "AllowedValues": "0-14400000",
69 | "ApplyType": "static",
70 | "IsModifiable": true
71 | },
72 | {
73 | "ParameterName": "query_group",
74 | "ParameterValue": "default",
75 |       "Description": "This parameter applies a user-defined label to a group of queries that are run during the same session.",
76 | "Source": "engine-default",
77 | "DataType": "string",
78 | "ApplyType": "static",
79 | "IsModifiable": true
80 | },
81 | {
82 | "ParameterName": "require_ssl",
83 | "ParameterValue": "true",
84 |       "Description": "require ssl for all database connections",
85 | "Source": "user",
86 | "DataType": "boolean",
87 | "AllowedValues": "true,false",
88 | "ApplyType": "static",
89 | "IsModifiable": true
90 | },
91 | {
92 | "ParameterName": "search_path",
93 | "ParameterValue": "$user, public",
94 | "Description": "Sets the schema search order for names that are not schema-qualified.",
95 | "Source": "engine-default",
96 | "DataType": "string",
97 | "ApplyType": "static",
98 | "IsModifiable": true
99 | },
100 | {
101 | "ParameterName": "statement_timeout",
102 | "ParameterValue": "0",
103 | "Description": "Aborts any statement that takes over the specified number of milliseconds.",
104 | "Source": "engine-default",
105 | "DataType": "integer",
106 | "AllowedValues": "0,100-2147483647",
107 | "ApplyType": "static",
108 | "IsModifiable": true
109 | },
110 | {
111 | "ParameterName": "use_fips_ssl",
112 | "ParameterValue": "false",
113 | "Description": "Use fips ssl library",
114 | "Source": "engine-default",
115 | "DataType": "boolean",
116 | "AllowedValues": "true,false",
117 | "ApplyType": "static",
118 | "IsModifiable": true
119 | },
120 | {
121 | "ParameterName": "wlm_json_configuration",
122 | "ParameterValue": "[{\"auto_wlm\":true}]",
123 | "Description": "wlm json configuration",
124 | "Source": "engine-default",
125 | "DataType": "string",
126 | "ApplyType": "static",
127 | "IsModifiable": true
128 | }
129 | ]
130 | }
131 |
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/configuration/source-wlm.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "query_group": [],
4 | "query_group_wild_card": 0,
5 | "user_group": [],
6 | "user_group_wild_card": 0,
7 | "concurrency_scaling": "off",
8 | "rules": [
9 | {
10 | "rule_name": "DiskSpilling",
11 | "predicate": [
12 | {
13 | "metric_name": "query_temp_blocks_to_disk",
14 | "operator": ">",
15 | "value": 100000
16 | }
17 | ],
18 | "action": "log"
19 | },
20 | {
21 | "rule_name": "QueryRunningMoreThan30min",
22 | "predicate": [
23 | {
24 | "metric_name": "query_execution_time",
25 | "operator": ">",
26 | "value": 1800
27 | }
28 | ],
29 | "action": "log"
30 | }
31 | ],
32 | "priority": "normal",
33 | "queue_type": "auto",
34 | "auto_wlm": true
35 | },
36 | {
37 | "short_query_queue": true
38 | }
39 | ]
40 |
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/configuration/wlm-concurrency-scaling.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "query_group": [],
4 | "query_group_wild_card": 0,
5 | "user_group": [],
6 | "user_group_wild_card": 0,
7 | "concurrency_scaling": "auto",
8 | "rules": [
9 | {
10 | "rule_name": "DiskSpilling",
11 | "predicate": [
12 | {
13 | "metric_name": "query_temp_blocks_to_disk",
14 | "operator": ">",
15 | "value": 100000
16 | }
17 | ],
18 | "action": "log"
19 | },
20 | {
21 | "rule_name": "QueryRunningMoreThan30min",
22 | "predicate": [
23 | {
24 | "metric_name": "query_execution_time",
25 | "operator": ">",
26 | "value": 1800
27 | }
28 | ],
29 | "action": "log"
30 | }
31 | ],
32 | "priority": "normal",
33 | "queue_type": "auto",
34 | "auto_wlm": true
35 | },
36 | {
37 | "short_query_queue": true
38 | }
39 | ]
40 |
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/images/architecure-serverless.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/images/architecure-serverless.png
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/images/batch-cw-log-group.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/images/batch-cw-log-group.png
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/images/redshift-clusters-provisioned.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/images/redshift-clusters-provisioned.png
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/images/redshift-clusters-serverless.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/images/redshift-clusters-serverless.png
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/images/redshift-clusters.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/images/redshift-clusters.png
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/images/statemachine-log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/images/statemachine-log.png
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/images/statemachine.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/images/statemachine.png
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/python_scripts/RedshiftConfigTestingLambda.py.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/python_scripts/RedshiftConfigTestingLambda.py.zip
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/python_scripts/StartUpLambda.py.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/python_scripts/StartUpLambda.py.zip
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/python_scripts/boto3-redshift-serverless.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/python_scripts/boto3-redshift-serverless.zip
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/python_scripts/create_external_schema.py:
--------------------------------------------------------------------------------
"""Create the external schema on a Redshift Serverless target after a replay.

Reads the replay config and system_config.json, fetches temporary database
credentials for the serverless workgroup, and executes the external-schema
DDL script. Runs as a flat script (invoked once from the replay shell step).
"""
import redshift_connector
import boto3
import yaml
import json

rs_client = boto3.client("redshift")

# Target endpoint comes from the replay configuration written by the caller.
with open("config/replay.yaml", "r") as fr:
    config_read = yaml.safe_load(fr)

# Endpoint is assumed to look like:
#   <workgroup>.<account>.<region>.redshift-serverless.amazonaws.com:<port>/<dbname>
# so splitting on "." puts "com:<port>/<dbname>" at index 5 — TODO confirm
# against the replay.yaml producer.
target_cluster_endpoint = config_read["target_cluster_endpoint"]
cluster_endpoint_split = target_cluster_endpoint.split(".")
workgroup_id = cluster_endpoint_split[0]
db_host = target_cluster_endpoint.split(":")[0]
db_port = cluster_endpoint_split[5].split("/")[0][4:]  # strip leading "com:"
db_name = cluster_endpoint_split[5].split("/")[1]
db_username = config_read["master_username"]
serverless_cluster_id = f"redshift-serverless-{workgroup_id}"

# The DDL to run is staged in system_config.json by the state machine.
with open("system_config.json", "r") as jr:
    json_data = json.load(jr)
script = json_data["EXTERNAL_SCHEMA_SCRIPT"]

try:
    response = rs_client.get_cluster_credentials(
        DbUser=db_username,
        ClusterIdentifier=serverless_cluster_id,
        AutoCreate=False,
        DurationSeconds=3600,
    )
except rs_client.exceptions.ClientError as e:
    if e.response["Error"]["Code"] == "ExpiredToken":
        print(
            f"Error retrieving credentials for {serverless_cluster_id}: IAM credentials have expired."
        )
        exit(-1)
    elif e.response["Error"]["Code"] == "ResourceNotFoundException":
        print(
            f"Serverless endpoint could not be found "
            f"RedshiftServerless:GetCredentials. {e}"
        )
        exit(-1)
    else:
        print(f"Got exception retrieving credentials ({e.response['Error']['Code']})")
        # BUG FIX: bare `raise` preserves the original traceback; `raise e`
        # re-raised from this frame and lost the chain.
        raise

db_user = response["DbUser"]
db_password = response["DbPassword"]
try:
    # BUG FIX: pass the port parsed from the endpoint; it was previously
    # computed but never used, so non-default ports were silently ignored.
    conn = redshift_connector.connect(
        host=db_host,
        database=db_name,
        port=int(db_port),
        user=db_user,
        password=db_password,
    )
    # Enable autocommit before issuing DDL so the script is not left in an
    # open transaction.
    conn.autocommit = True
    cursor = conn.cursor()
    cursor.execute(script)
    print(f"Executed script.{script}")
except Exception as err:
    # "already exists" is an acceptable outcome on re-runs (best-effort DDL).
    if "already exists" not in str(err):
        print(f"Got exception while executing script {err}")
        raise
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/python_scripts/python.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/NodeConfigCompare/python_scripts/python.zip
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/python_scripts/redshift-performance-test.py:
--------------------------------------------------------------------------------
1 | import json
2 | import boto3
3 | import psycopg2
4 | import time
5 | import pandas
6 | from sqlalchemy import create_engine
7 | from sqlalchemy import text
8 | from concurrent.futures import ThreadPoolExecutor
9 | from concurrent.futures import as_completed
10 | from urllib.parse import quote_plus as urlquote
11 | import urllib
12 | import re
13 | import os
14 |
# Runtime configuration injected via environment variables by the Batch job
# definition; a missing variable fails fast with KeyError at import time.
SQL_SCRIPT_S3_PATH = os.environ["SQL_SCRIPT_S3_PATH"]
REDSHIFT_CLUSTER_ENDPOINT = os.environ["REDSHIFT_CLUSTER_ENDPOINT"]
REDSHIFT_IAM_ROLE = os.environ["REDSHIFT_IAM_ROLE"]
# BUG FIX: this previously read SQL_SCRIPT_S3_PATH, so the {bucket_name}
# substitution in the SQL script received the script path instead of the
# bucket name. NOTE(review): assumes the job definition sets BUCKET_NAME.
BUCKET_NAME = os.environ["BUCKET_NAME"]
REDSHIFT_USER_NAME = os.environ["REDSHIFT_USER_NAME"]
# Comma-separated list of concurrency levels to test, e.g. "1,5,10".
NUMBER_OF_PARALLEL_SESSIONS_LIST = os.environ["NUMBER_OF_PARALLEL_SESSIONS_LIST"]
DISABLE_RESULT_CACHE = os.environ["DISABLE_RESULT_CACHE"]  # "true"/"false"
DEFAULT_OUTPUT_LIMIT = os.environ["DEFAULT_OUTPUT_LIMIT"]
MAX_NUMBER_OF_QUERIES = os.environ["MAX_NUMBER_OF_QUERIES"]
MAX_PARALLEL_SESSIONS = os.environ["MAX_PARALLEL_SESSIONS"]
QUERY_LABEL_PREFIX = os.environ["QUERY_LABEL_PREFIX"]
26 |
27 |
def connect_to_redshift(host, username):
    """Return a SQLAlchemy engine for *host* using temporary cluster credentials.

    host: endpoint of the form "<cluster-id>.<...>:<port>/<dbname>".
    username: database user passed to get_cluster_credentials.
    """
    client = boto3.client("redshift")
    # BUG FIX: derive the database name and cluster id from the `host`
    # argument; the original ignored the parameter and read the module-level
    # REDSHIFT_CLUSTER_ENDPOINT instead.
    cluster_creds = client.get_cluster_credentials(
        DbUser=username,
        DbName=host.split("/")[1],
        ClusterIdentifier=host.split(".")[0],
    )

    # Temporary credentials may contain URL-reserved characters; quote them.
    connection_string = (
        "postgresql://"
        + urlquote(cluster_creds["DbUser"])
        + ":"
        + urlquote(cluster_creds["DbPassword"])
        + "@"
        + host
    )
    # pool_size=0 / max_overflow=-1: effectively unbounded connections, which
    # the concurrency test needs.
    return create_engine(connection_string, pool_size=0, max_overflow=-1)
45 |
46 |
def get_json_config_from_s3(script_s3_path):
    """Download the object at *script_s3_path* ("s3://bucket/key") and parse it as JSON."""
    bucket_name, object_key = script_s3_path.replace("s3://", "").split("/", 1)
    s3_object = boto3.client("s3").get_object(Bucket=bucket_name, Key=object_key)
    return json.loads(s3_object["Body"].read().decode("utf-8"))
51 |
52 |
def get_sql_scripts_from_s3():
    """Fetch the SQL script from S3, substitute placeholders, and split it into statements.

    Statements are split on ';' (the trailing fragment after the last ';' is
    dropped) and the list is capped at MAX_NUMBER_OF_QUERIES.
    """
    bucket_name, object_key = SQL_SCRIPT_S3_PATH.replace("s3://", "").split("/", 1)
    s3_object = boto3.client("s3").get_object(Bucket=bucket_name, Key=object_key)
    raw_script = s3_object["Body"].read().decode("utf-8")
    raw_script = raw_script.format(
        redshift_iam_role=REDSHIFT_IAM_ROLE, bucket_name=BUCKET_NAME
    )
    statements = raw_script.split(";")[:-1]
    cap = int(MAX_NUMBER_OF_QUERIES)
    return statements[:cap] if len(statements) > cap else statements
63 |
64 |
def get_sql(engine, number_of_parallel_sessions):
    """Build one labelled SQL batch from the S3 script and run it on *engine*.

    Appends a default LIMIT to queries without one, optionally disables the
    result cache, labels the batch with QUERY_LABEL_PREFIX + session count,
    and returns the final statement's result as a DataFrame.
    """
    sql_script = ""

    # Detect an existing trailing LIMIT clause. BUG FIX: the original class
    # [\s|\t|\n] also matched a literal '|' character; \s already covers
    # spaces, tabs and newlines.
    pattern = re.compile(r"limit\s+\d+\s*$", re.IGNORECASE)
    for query in get_sql_scripts_from_s3():
        if not pattern.search(query):
            query += " limit " + DEFAULT_OUTPUT_LIMIT
        sql_script += query + ";\n"

    if DISABLE_RESULT_CACHE == "true":
        sql_script = "set enable_result_cache_for_session to false;\n" + sql_script

    # Label the batch so system tables can group queries by concurrency level.
    sql_script = (
        "set query_group to '"
        + QUERY_LABEL_PREFIX
        + str(number_of_parallel_sessions)
        + "';\n"
        + sql_script
    )

    df = pandas.read_sql(text(sql_script), engine)
    return df
87 |
88 |
def run_concurrency_test(number_of_parallel_sessions):
    """Run the SQL batch on N parallel sessions; return elapsed wall-clock seconds."""
    engine = connect_to_redshift(REDSHIFT_CLUSTER_ENDPOINT, REDSHIFT_USER_NAME)
    start_time = time.time()
    # BUG FIX: removed the `except Exception as e: raise e` wrapper (it added
    # nothing and rewrote the traceback) and the unused `rs` binding.
    with ThreadPoolExecutor(max_workers=number_of_parallel_sessions) as executor:
        futures = [
            executor.submit(get_sql, engine, number_of_parallel_sessions)
            for _ in range(number_of_parallel_sessions)
        ]
        # Propagate the first worker failure; results themselves are discarded.
        for future in as_completed(futures):
            future.result()
    elapsed_time_in_secs = time.time() - start_time
    print("--- %s seconds ---" % elapsed_time_in_secs)
    return elapsed_time_in_secs
105 |
106 |
# Script entry point: announce the configuration, then run one concurrency
# test per requested session count, skipping counts above the allowed maximum.
print(
    f"script:{SQL_SCRIPT_S3_PATH}, cluster:{REDSHIFT_CLUSTER_ENDPOINT},role:{REDSHIFT_IAM_ROLE},bucket:{BUCKET_NAME},user:{REDSHIFT_USER_NAME},sessions:{NUMBER_OF_PARALLEL_SESSIONS_LIST}"
)
for sessions in NUMBER_OF_PARALLEL_SESSIONS_LIST.split(","):
    number_of_parallel_sessions = int(sessions)
    # Guard clause: refuse session counts above the configured ceiling.
    if number_of_parallel_sessions > int(MAX_PARALLEL_SESSIONS):
        print(
            f"parallel sessions {number_of_parallel_sessions} exceeds maximum allowed {MAX_PARALLEL_SESSIONS} .."
        )
        continue
    print(f"running {number_of_parallel_sessions} parallel threads ..")
    run_concurrency_test(number_of_parallel_sessions)
119 |
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/sql/ddl.sql:
--------------------------------------------------------------------------------
-- Seed object used to smoke-test DDL and DML against the target configuration.
create table if not exists example_table
(id INTEGER IDENTITY(1, 1) NOT NULL, column_value varchar(10), insert_timestamp timestamp default sysdate);

-- Insert one row; id and insert_timestamp are populated automatically.
insert into example_table (column_value) values('data');
5 |
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/sql/populate_comparison_results.sql:
--------------------------------------------------------------------------------
-- Export the raw per-query comparison data to S3 as CSV (default parallel
-- unload: one file per slice). {braced} placeholders are substituted by the
-- caller before execution.
unload ($$
select * from public.redshift_config_comparison_raw
$$) to '{raw_comparison_results_s3_path}/{what_if_timestamp}/'
FORMAT AS CSV HEADER ALLOWOVERWRITE iam_role '{redshift_iam_role}';


-- Export the aggregated comparison results as a single CSV file
-- (parallel off) for easier downstream consumption.
unload ($$
select * from public.redshift_config_comparison_results
$$) to '{comparison_results_s3_path}/{what_if_timestamp}/'
parallel off FORMAT AS CSV HEADER ALLOWOVERWRITE iam_role '{redshift_iam_role}';
11 |
--------------------------------------------------------------------------------
/tools/NodeConfigCompare/sql/test_queries.sql:
--------------------------------------------------------------------------------
--first_query: minimum-cost supplier — for each COPPER part of size 34, find the
--Middle East supplier whose supply cost equals the regional minimum.

SELECT
s_acctbal
, s_name
, n_name
, p_partkey
, p_mfgr
, s_address
, s_phone
, s_comment
FROM
part,
supplier,
partsupp,
nation,
REGION
WHERE p_partkey = ps_partkey
AND s_suppkey = ps_suppkey
AND p_size = 34
AND p_type LIKE '%COPPER'
AND s_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'MIDDLE EAST'
AND ps_supplycost = (SELECT
MIN(ps_supplycost)
FROM
partsupp,
supplier,
nation,
REGION
WHERE p_partkey = ps_partkey
AND s_suppkey = ps_suppkey
AND s_nationkey = n_nationkey
AND n_regionkey = r_regionkey
AND r_name = 'MIDDLE EAST')
ORDER BY
s_acctbal DESC
, n_name
, s_name
, p_partkey ;

--second_query: important stock — parts in Saudi Arabia whose total supply value
--exceeds a small fraction of the national total.

SELECT
ps_partkey
, SUM(ps_supplycost * ps_availqty) AS value
FROM
partsupp,
supplier,
nation
WHERE ps_suppkey = s_suppkey
AND s_nationkey = n_nationkey
AND n_name = 'SAUDI ARABIA'
GROUP BY
ps_partkey
HAVING
SUM(ps_supplycost * ps_availqty) > (SELECT
SUM(ps_supplycost * ps_availqty) * 0.0000000333
FROM
partsupp,
supplier,
nation
WHERE ps_suppkey = s_suppkey
AND s_nationkey = n_nationkey
AND n_name = 'SAUDI ARABIA')
ORDER BY
value DESC ;

--third_query: supplier count per brand/type/size, excluding suppliers with
--customer complaints and the filtered brand/type/sizes.

SELECT
p_brand
, p_type
, p_size
, COUNT(DISTINCT ps_suppkey) AS supplier_cnt
FROM
partsupp,
part
WHERE p_partkey = ps_partkey
AND p_brand <> 'Brand#23'
AND p_type NOT LIKE 'MEDIUM ANODIZED%'
AND p_size IN (1, 32, 33, 46, 7, 42, 21, 40)
AND ps_suppkey NOT IN (SELECT
s_suppkey
FROM
supplier
WHERE s_comment LIKE '%Customer%Complaints%')
GROUP BY
p_brand
, p_type
, p_size
ORDER BY
supplier_cnt DESC
, p_brand
, p_type
, p_size ;


--fourth_query: number of part supplies per region (five-way join rollup).

SELECT r_name,count(1) number_of_supplies
FROM
part,
partsupp,
supplier,
nation,
REGION
WHERE p_partkey = ps_partkey
AND s_suppkey = ps_suppkey
AND s_nationkey = n_nationkey
AND n_regionkey = r_regionkey
group by 1
order by 1;


--fifth_query: per-nation count of suppliers that stock available 'olive%' parts
--(nested IN subqueries against partsupp and part).

SELECT
n_name
, COUNT(1) total_count
FROM
supplier,
nation
WHERE s_suppkey IN (SELECT
ps_suppkey
FROM
partsupp
WHERE ps_partkey IN (SELECT
p_partkey
FROM
part
WHERE p_name LIKE 'olive%')
AND ps_availqty > 1)
AND s_nationkey = n_nationkey
GROUP BY
1
ORDER BY
1;
140 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/api/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/ReplayAnalysis/api/__init__.py
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/package.json:
--------------------------------------------------------------------------------
1 | {
2 | "name": "gui",
3 | "version": "0.1.0",
4 | "private": true,
5 | "proxy": "http://127.0.0.1:5000",
6 | "dependencies": {
7 | "@awsui/collection-hooks": "^1.0.49",
8 | "@awsui/components-react": "^3.0.724",
9 | "@awsui/design-tokens": "^3.0.34",
10 | "@awsui/global-styles": "^1.0.19",
11 | "@awsui/test-utils-core": "^1.0.33",
12 | "@emotion/react": "^11.10.5",
13 | "@emotion/styled": "^11.10.5",
14 | "@mui/material": "^5.11.6",
15 | "@testing-library/jest-dom": "^5.16.5",
16 | "@testing-library/react": "^13.4.0",
17 | "@testing-library/user-event": "^14.4.3",
18 | "react": "^18.2.0",
19 | "react-dom": "^18.2.0",
20 | "react-router-dom": "^6.8.0",
21 | "web-vitals": "^3.1.1"
22 | },
23 | "devDependencies": {
24 | "react-scripts": "^5.0.1",
25 | "@svgr/webpack": "^6.5.1"
26 | },
27 | "overrides": {
28 | "@svgr/webpack": "$@svgr/webpack"
29 | },
30 | "scripts": {
31 | "start": "react-scripts start",
32 | "start-backend": "cd ../api && flask run",
33 | "build": "react-scripts build",
34 | "test": "react-scripts test",
35 | "eject": "react-scripts eject"
36 | },
37 | "eslintConfig": {
38 | "extends": [
39 | "react-app",
40 | "react-app/jest"
41 | ]
42 | },
43 | "browserslist": {
44 | "production": [
45 | ">0.2%",
46 | "not dead",
47 | "not op_mini all"
48 | ],
49 | "development": [
50 | "last 1 chrome version",
51 | "last 1 firefox version",
52 | "last 1 safari version"
53 | ]
54 | }
55 | }
56 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/public/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
11 |
15 |
16 |
25 | Simple Replay Analysis
26 |
27 |
28 | You need to enable JavaScript to run this app.
29 |
30 |
40 |
41 |
42 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/public/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "short_name": "SRA",
3 | "name": "Simple Replay Analysis",
4 | "start_url": ".",
5 | "display": "standalone",
6 | "theme_color": "#000000",
7 | "background_color": "#ffffff"
8 | }
9 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/public/robots.txt:
--------------------------------------------------------------------------------
1 | # https://www.robotstxt.org/robotstxt.html
2 | User-agent: *
3 | Disallow:
4 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/App.js:
--------------------------------------------------------------------------------
1 | import "@awsui/global-styles/index.css"
2 | import {HomePage} from "./pages/home";
3 | import {AnalysisPage} from "./pages/analysis";
4 | import {BrowserRouter, Routes, Route} from "react-router-dom";
5 |
6 | function App() {
7 | return (
8 |
9 |
10 |
11 | } />
12 | }/>
13 |
14 |
15 |
16 | );
17 | }
18 |
19 | export default App;
20 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/App.test.js:
--------------------------------------------------------------------------------
1 | import { render, screen } from '@testing-library/react';
2 | import App from './App';
3 |
4 | test('renders learn react link', () => {
5 | render( );
6 | const linkElement = screen.getByText(/learn react/i);
7 | expect(linkElement).toBeInTheDocument();
8 | });
9 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/AccessControl.js:
--------------------------------------------------------------------------------
1 | import React, {useEffect, useState} from 'react';
2 | import {Box, FormField, RadioGroup, Select, SpaceBetween, StatusIndicator} from "@awsui/components-react";
3 | import Input from "@awsui/components-react/input";
4 | import Button from "@awsui/components-react/button";
5 |
6 | export default function AccessControl({profiles}) {
7 | const [type, setType] = useState("profile");
8 | const [placeholder, setPlaceholder] = useState("");
9 | const [credentials, setCredentials] = useState("");
10 | const [disabled, setDisabled] = useState(true);
11 | const [saved, setSaved] = useState(false);
12 | const [selectedOption, setSelectedOption] = useState({label: 'default', value: 'default'});
13 | const options = profiles.map(item => ({label: item, value: item}))
14 | const [valid, setValid] = useState(true)
15 |
16 |
17 | useEffect(() => {
18 | function toggle() {
19 | if (type === "profile") {
20 | if (selectedOption.label !== "default") {
21 | setDisabled(false);
22 | }
23 | } else if (type === "role") {
24 | setPlaceholder("arn:aws:iam::123456789012:role/customrole");
25 | setDisabled(false);
26 | }
27 | }
28 |
29 | toggle()
30 | }, [type, selectedOption]);
31 |
32 | function save() {
33 | if (type === "profile") {
34 | fetch(`/profile?name=${selectedOption.label}`).then(response => response.json())
35 | .then(response => {
36 | if (response.success === false) {
37 | setValid(false)
38 | } else {
39 | setSaved(true)
40 | }
41 | })
42 |
43 | .catch((error) => {
44 | console.error('Error:', error);
45 | setValid(false)
46 |
47 | });
48 |
49 | } else if (type === "role") {
50 | fetch(`/role?arn=${credentials}`).then(response => response.json())
51 | .then(response => {
52 | if (response.success === false) {
53 | // TODO: Assume role Access denied
54 | setValid(false)
55 |
56 | } else {
57 | setSaved(true)
58 |
59 | }
60 | })
61 | .catch((error) => {
62 | console.error('Error:', error);
63 | });
64 | }
65 | }
66 |
67 |
68 | return (
69 |
70 |
71 |
73 | {
75 | setSaved(false);
76 | setType(detail.value);
77 | }}
78 | value={type}
79 | items={[
80 | {value: "profile", label: "Use a Profile"},
81 | {value: "role", label: "Use an IAM Role"}
82 | ]}/>
83 |
84 |
85 |
86 | save()}>
94 | Save
95 | }>
96 |
97 |
98 | {type === "profile" &&
99 |
100 | {
103 | setSaved(false);
104 | setValid(true)
105 | setSelectedOption(detail.selectedOption);
106 | }
107 | }
108 | options={options}
109 | selectedAriaLabel="Selected"
110 | empty="No options"
111 | />
112 |
113 | }
114 |
115 | {type === "role" &&
116 |
117 | {
123 | setSaved(false);
124 | setValid(true)
125 | setCredentials(event.detail.value)
126 | }}>
127 | }
128 |
129 | {saved &&
130 |
131 | Success
132 |
133 | }
134 |
135 |
136 |
137 |
138 | )
139 | }
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/ReplayAnalysis/AggregateMetrics.js:
--------------------------------------------------------------------------------
1 | import React, {useEffect, useState} from 'react';
2 | import {Header, Table} from "@awsui/components-react";
3 | import millisToMinutesAndSeconds from "../../helpers/msFormatter";
4 |
5 |
6 | /**
7 | * Aggregate Metrics Table
8 | * Displays p-values, averages, standard deviation values for each replay
9 | */
10 | const AggregateMetrics = ({selectedQueryTypes, selectedUser, selectedDuration}) => {
11 | /** @prop selectedQueryTypes, array of selected "query type" options */
12 | /** @prop selectedUser, array of selected "user" options */
13 | /** @prop selectedDuration, array of selected "duration" range */
14 |
15 | /** Table data */
16 | const [data, setData] = useState([]);
17 |
18 | /** Loading validator to render component given successful response */
19 | const [loading, setLoading] = useState(true);
20 |
21 |
22 | useEffect(() => {
23 | const fetchData = async () => {
24 | fetch(`/agg_metrics?qtype=${JSON.stringify(selectedQueryTypes)}&user=${JSON.stringify(selectedUser)}&start=${(selectedDuration[0])}&end=${(selectedDuration[1])}`).then(response => response.json())
25 | .then(response => {
26 | if (response.success === false) {
27 |
28 | } else {
29 | setData(response.data);
30 | setLoading(false);
31 | }
32 | })
33 | .catch((error) => {
34 | console.error('Error:', error);
35 | });
36 | };
37 | fetchData();
38 | }, [selectedQueryTypes, selectedUser, selectedDuration]);
39 |
40 | /** Render components */
41 | return !loading && (
42 |
49 | )
50 | };
51 |
52 | /** Array of column definitions for Aggregate Metrics table */
53 | const COL_DEF = [
54 | {
55 | id: 'sid',
56 | header: 'Replay',
57 | cell: item => item.sid,
58 | width: 50
59 | },
60 | {
61 | id: 'p25',
62 | header: 'P25 (s)',
63 | cell: item => millisToMinutesAndSeconds(item.p25, 3),
64 | width: 50
65 | },
66 | {
67 | id: 'p50',
68 | header: 'P50 (s)',
69 | cell: item => millisToMinutesAndSeconds(item.p50, 3),
70 | width: 50
71 | },
72 | {
73 | id: 'p75',
74 | header: 'P75 (s)',
75 | cell: item => millisToMinutesAndSeconds(item.p75, 3),
76 | width: 50
77 | },
78 | {
79 | id: 'p99',
80 | header: 'P99 (s)',
81 | cell: item => millisToMinutesAndSeconds(item.p99, 3),
82 | width: 50
83 | },
84 | {
85 | id: 'avg',
86 | header: 'Average (s)',
87 | cell: item => millisToMinutesAndSeconds(item.avg, 3),
88 | width: 50
89 | },
90 | {
91 | id: 'std',
92 | header: 'Standard Deviation (s)',
93 | cell: item => millisToMinutesAndSeconds(item.std, 3),
94 | width: 50
95 | }
96 | ]
97 |
98 | export default AggregateMetrics;
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/ReplayAnalysis/CompareThroughput.js:
--------------------------------------------------------------------------------
1 | import React, {useEffect, useState} from 'react';
2 | import {Box, Button, Header, LineChart} from "@awsui/components-react";
3 | import millisToMinutesAndSeconds from "../../helpers/msFormatter";
4 |
5 | /**
6 | * Compare Throughput Chart
7 | * Displays p-values, averages, standard deviation values for each replay
8 | */
const CompareThroughput = ({selectedQueryTypes, selectedDuration, selectedUser}) => {
    /** @prop selectedQueryTypes, array of selected "query type" options */
    /** @prop selectedUser, array of selected "user" options */
    /** @prop selectedDuration, array of selected "duration" range */

    /** Series data */
    const [data, setData] = useState([]);

    /** Loading validator to render component given successful response */
    const [loading, setLoading] = useState(true);

    useEffect(() => {
        const fetchData = async () => {
            // Query types and users are sent as JSON-encoded arrays in the query string.
            fetch(`/compare_throughput?qtype=${JSON.stringify(selectedQueryTypes)}&user=${JSON.stringify(selectedUser)}`).then(response => response.json())
                .then(response => {
                    if (response.success === false) {
                        console.log(response.message);
                    } else {
                        /** Maps response data to LineChart formatting: one line series per replay */
                        setData(response.data.map((entry) =>
                            ({
                                title: entry.replay,
                                type: "line",
                                data: entry.values.map((val) =>
                                    ({x: (val.rel_time), y: val.freq}))
                            })));

                        setLoading(false);
                    }

                })
                .catch((error) => {
                    console.error('Error:', error);

                });
        };
        fetchData();
        // NOTE(review): selectedDuration is intentionally absent from the deps —
        // duration filtering happens client-side in filterRange(). Confirm.
    }, [selectedQueryTypes, selectedUser]);

    /**
     * Filters a series by given duration range
     * @param {Object} series Total data set of query frequency values.
     * @return {Object} filtered data set on duration
     */
    function filterRange(series) {
        return series.map(singleSerie => ({
            ...singleSerie,
            data: singleSerie.data.filter(value => value.x >= selectedDuration[0] && value.x <= selectedDuration[1])
        }));
    }

    // NOTE(review): the JSX element tags below (Box/Header/LineChart) appear
    // to have been stripped by extraction — only attribute/text lines remain.
    // Confirm against the repository source before editing this return block.
    return !loading && (




            millisToMinutesAndSeconds(e, 0)
        }}
        xScaleType={'linear'}
        xTitle={'Timestamp (relative to start time)'}
        yTitle={'Queries Executed'}
        empty={

            No data available

            There is no data available


        }
        noMatch={

            No matching data

            There is no matching data to display

            Clear filter

        }
        loadingText={"Loading"}
        >


    );
};
106 |
107 | export default CompareThroughput
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/ReplayAnalysis/QueryLatency.js:
--------------------------------------------------------------------------------
1 | import React, {useEffect, useState} from 'react';
2 | import {BarChart, Box, Button, Header} from "@awsui/components-react";
3 | import millisToMinutesAndSeconds from "../../helpers/msFormatter";
4 |
5 |
6 | /**
7 | * Compare Throughput Chart
8 | * Displays p-values, averages, standard deviation values for each replay
9 | */
10 |
const QueryLatency = ({selectedQueryTypes, selectedUser, selectedDuration}) => {
    /** @prop selectedQueryTypes, array of selected "query type" options */
    /** @prop selectedUser, array of selected "user" options */
    /** @prop selectedDuration, array of selected "duration" range */

    /** Series data */
    const [data, setData] = useState([]);

    /** Loading validator to render component given successful response */
    const [loading, setLoading] = useState(true);

    useEffect(() => {
        const fetchData = async () => {
            // Duration bounds are sent to the server here (unlike CompareThroughput,
            // which filters client-side), so this chart re-fetches on slider change.
            fetch(`/query_latency?qtype=${JSON.stringify(selectedQueryTypes)}&user=${JSON.stringify(selectedUser)}&start=${(selectedDuration[0])}&end=${(selectedDuration[1])}`).then(response => response.json())
                .then(response => {
                    // NOTE(review): a failed response is silently ignored here; the
                    // sibling components log response.message — consider matching that.
                    if (response.success === false) {
                    } else {
                        // One bar series per replay; x is the latency bin, y its count.
                        setData(response.data.map((entry) =>
                            ({
                                title: entry.replay,
                                type: "bar",
                                data: entry.values.map((val) => ({x: (val.bin), y: (val.count)}))
                            })
                        ))
                        setLoading(false)
                    }

                })

                .catch((error) => {
                    console.error('Error:', error);

                })
        };

        fetchData();
    }, [selectedQueryTypes, selectedUser, selectedDuration]);


    // NOTE(review): JSX element tags below appear stripped by extraction;
    // only attribute lines remain. Confirm against the repository source.
    return !loading && (



            millisToMinutesAndSeconds(e, 1)
        }}
        errorText="Error loading data."
        height={300}
        loadingText="Loading chart"
        recoveryText="Retry"
        xScaleType="categorical"
        xTitle="Elapsed Time"
        yTitle="# of Queries"
        empty={

            No data available

            There is no data available


        }
        noMatch={

            No matching data

            There is no matching data to display

            Clear filter

        }
        />

    );
};
94 |
95 | export default QueryLatency;
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/ReplayAnalysis/ThroughputBreakdown.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import {AreaChart, Box} from "@awsui/components-react";
3 | import Button from "@awsui/components-react/button";
4 |
5 | /** COMPONENT NOT IN USE */
6 | /** TODO: Format request for breakdown chart */
7 |
const ThroughputBreakdown = () => {
    // Renders a static placeholder stacked AreaChart; no data is fetched yet
    // (see the COMPONENT NOT IN USE / TODO notes above).
    // NOTE(review): the JSX element tags appear stripped by extraction — only
    // attribute/formatter lines remain inside the return. Confirm against repo.
    return (

            e
                .toLocaleDateString("en-US", {
                    month: "short",
                    day: "numeric",
                    hour: "numeric",
                    minute: "numeric",
                    hour12: !1
                })
                .split(",")
                .join("\n"),
            yTickFormatter: function o(e) {
                return (100 * e).toFixed(0) + "%";
            }
        }}
        ariaLabel="Stacked area chart, multiple metrics"
        errorText="Error loading data."
        height={200}
        loadingText="Loading chart"
        recoveryText="Retry"
        xScaleType="time"
        xTitle="Time (UTC)"
        yTitle="Total CPU load"
        empty={

            No data available

            There is no data available


        }
        noMatch={

            No matching data

            There is no matching data to display

            Clear filter

        }
        />
    );
};
67 |
68 | export default ThroughputBreakdown;
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/ReplayOverview.js:
--------------------------------------------------------------------------------
1 | import React, {useEffect} from 'react';
2 | import {Table} from "@awsui/components-react";
3 |
4 |
/**
 * ReplayOverview — table summarizing each selected replay (cluster, status,
 * timings, query/connection counts). Fetches the replay list once on mount.
 */
export default function ReplayOverview({replays, setReplays}) {

    useEffect(() => {
        const fetchData = async () => {
            const response = await fetch(`/submit_replays`);
            const newData = await response.json();
            // Lift the fetched replay list up to the parent via the setter prop.
            setReplays(newData.replays);
        };
        fetchData();
    }, [setReplays]);

    // NOTE(review): the Table JSX (presumably <Table columnDefinitions={COL_DEF}
    // items={replays} .../>) appears stripped by extraction — confirm against repo.
    return (

    )


};
23 |
/** Builds one fixed-width column definition for the replay overview table. */
const makeCol = (id, header, cell) => ({id, header, cell, width: 50});

/** Strips the trailing UTC offset (e.g. "+00:00") from an ISO timestamp. */
const trimOffset = (timestamp) => timestamp.slice(0, -6);

/**
 * Column definitions for the replay overview table; each `cell` accessor
 * reads the matching field from a replay item.
 */
const COL_DEF = [
    makeCol('sid', 'Replay', item => item.sid),
    makeCol('id', 'Cluster', item => item.id),
    makeCol('status', 'Status', item => item.status),
    makeCol('instance', 'Instance', item => item.instance),
    makeCol('num_nodes', 'Nodes', item => item.num_nodes),
    makeCol('database', 'Database', item => item.database),
    makeCol('start_time', 'Start Time (UTC)', item => trimOffset(item.start_time)),
    makeCol('end_time', 'End Time (UTC)', item => trimOffset(item.end_time)),
    makeCol('duration', 'Duration', item => item.duration),
    makeCol('query_success', 'Queries', item => item.query_success),
    makeCol('connection_success', 'Connections', item => item.connection_success),
]
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/ReplayValidation/CopyAgg.js:
--------------------------------------------------------------------------------
1 | import React, {useEffect, useState} from 'react';
2 | import {Header, Table} from "@awsui/components-react";
3 | import CopyDiff from "./CopyDiff";
4 |
5 |
/**
 * CopyAgg — aggregated COPY ingestion metrics (rows/bytes/files loaded)
 * per replay, filtered by user and the selected duration range.
 */
const CopyAgg = ({selectedUser, selectedDuration, replays}) => {

    /** Table rows from the /copy_agg endpoint */
    const [data, setData] = useState([]);
    /** Hides the section until a non-empty result arrives */
    const [loading, setLoading] = useState(true);

    useEffect(() => {
        const fetchData = async () => {

            fetch(`/copy_agg?user=${JSON.stringify(selectedUser)}&start=${(selectedDuration[0])}&end=${(selectedDuration[1])}`).then(response => response.json())
                .then(response => {
                    if (response.success === false) {
                        console.log(response.message);
                    } else {
                        setData(response.data);
                        // Keep the section hidden when the result set is empty.
                        setLoading(response.data.length === 0)
                    }
                })
                .catch((error) => {
                    console.error('Error:', error);
                });
        };
        fetchData();
    }, [selectedDuration, selectedUser]);


    // NOTE(review): the Header/Table/CopyDiff JSX appears stripped by
    // extraction; only text content remains. Confirm against repo source.
    return !loading && (


        COPY Ingestion Metrics




    )
};
41 |
/** Builds one column definition for the COPY ingestion metrics table. */
const column = (id, header, cell, extra = {}) => ({id, header, cell, width: 50, ...extra});

/** Column layout for the COPY ingestion metrics table. */
const COL_DEF = [
    column('replay', 'Replay', item => item.sid),
    column('loadedRows', 'Loaded Rows', item => item.loaded_rows, {maxWidth: 300}),
    column('loadedBytes', 'Loaded Bytes', item => item.loaded_bytes),
    column('sourceFileCount', 'Source File Count', item => item.source_file_count),
]
70 |
71 | export default CopyAgg;
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/ReplayValidation/ErrorDistribution.js:
--------------------------------------------------------------------------------
1 | import React, {useEffect, useState} from 'react';
2 | import {BarChart, Box, Header} from "@awsui/components-react";
3 | import Button from "@awsui/components-react/button";
4 |
/**
 * ErrorDistribution — bar chart of error-category frequencies per replay.
 * Fetched once on mount; not affected by the global filters.
 */
const ErrorDistribution = () => {

    /** Chart series, one bar series per replay */
    const [data, setData] = useState([]);
    /** Render gate until a successful response arrives */
    const [loading, setLoading] = useState(true);

    useEffect(() => {
        const fetchData = async () => {
            fetch(`/err_distribution`).then(response => response.json())
                .then(response => {
                    // NOTE(review): failure responses are silently dropped here;
                    // sibling components log response.message — consider matching.
                    if (response.success === false) {
                    } else {
                        // x is the error category, y its frequency.
                        setData(response.data.map((entry) =>
                            ({
                                title: entry.replay,
                                type: "bar",
                                data: entry.values.map((val) =>
                                    ({x: (val.category), y: val.freq}))
                            })))
                        setLoading(false)
                    }

                })

                .catch((error) => {
                    console.error('Error:', error);

                });
        };
        fetchData();
    }, []);


    // NOTE(review): BarChart JSX appears stripped by extraction below;
    // only fragments remain. Confirm against the repository source.
    return !loading && (


        Error Category Distribution

        No data available

        There is no data available


        }
        noMatch={

            No matching data

            There is no matching data to display

            Clear filter

        }
        />

    );
};
80 |
81 | export default ErrorDistribution;
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/ReplayValidation/SpectrumDiff.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import {Table} from "@awsui/components-react";
3 |
4 |
/**
 * SpectrumDiff — placeholder comparison table for replay metadata.
 * Defines the column layout only; no items are fetched or passed in yet.
 */
export default function SpectrumDiff() {
    // Fixed-width columns; each `cell` reads the same-named field of an item.
    const COL_DEF = [
        {
            id: 'id',
            header: 'Replay',
            cell: item => item.id,
            width: 50
        },
        {
            id: 'cluster',
            header: 'Cluster',
            cell: item => item.cluster,
            width: 50
        },
        {
            id: 'instance',
            header: 'Instance',
            cell: item => item.instance,
            width: 50
        },
        {
            id: 'nodes',
            header: 'Nodes',
            cell: item => item.nodes,
            width: 50
        },
        {
            id: 'db',
            header: 'Database',
            cell: item => item.db,
            width: 50
        },
        {
            id: 'start',
            header: 'Start Time',
            cell: item => item.start,
            width: 50
        },
        {
            id: 'end',
            header: 'End Time',
            cell: item => item.end,
            width: 50
        },
        {
            id: 'duration',
            header: 'Duration',
            cell: item => item.duration,
            width: 50
        },
        {
            id: 'executed',
            header: 'Queries Executed',
            cell: item => item.executed,
            width: 50
        },
        {
            id: 'aborted',
            header: 'Queries Aborted',
            cell: item => item.aborted,
            width: 50
        },
        {
            id: 'connections',
            header: 'Connections',
            cell: item => item.connections,
            width: 50
        },


    ]

    // NOTE(review): the Table JSX appears stripped by extraction — confirm
    // against the repository source.
    return (

    )
};
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/navigation/GlobalFilters.js:
--------------------------------------------------------------------------------
1 | import React, {useState, useEffect} from 'react';
2 | import {Box, Button, Header, Multiselect, SpaceBetween,} from "@awsui/components-react";
3 | import Slider from '@mui/material/Slider';
4 | import * as awsui from '@awsui/design-tokens';
5 | import millisToMinutesAndSeconds from "../../helpers/msFormatter";
6 | import prepareSelectOptions from "../../helpers/PrepareOptions";
7 |
8 | /**
9 | * Global filters
10 | * Manipulates query type, user, and duration selection values
11 | * Updates global selectedQueryTypes, selectedUser, selectedDuration variables
12 | */
const GlobalFilters = ({selectedQueryTypes, setSelectedQueryTypes,
                           selectedUser, setSelectedUser,
                           selectedDuration, setSelectedDuration}) => {
    /** @prop selectedQueryTypes, array of selected "query type" options */
    /** @prop setSelectedQueryTypes, useState setter for selectedQueryTypes */
    /** @prop selectedUser, array of selected "user" options */
    /** @prop setSelectedUser, useState setter for selectedUser */
    /** @prop selectedDuration, array of selected "duration" range in milliseconds. ex: [0,190290] */
    /** @prop setSelectedDuration, useState setter for selectedDuration */

    /** Longest relative duration in milliseconds */
    const [maxDuration, setMaxDuration ] = useState(0);

    /** Array of user options from response data */
    const [selectUserOptions, setSelectUserOptions] = useState();

    useEffect(() => {
        const fetchData = async () => {
            const response = await fetch(`/time_range`);
            const newData = await response.json();

            setMaxDuration(newData.time);
            // NOTE(review): this uses the maxDuration captured at render time
            // (still the previous value on the first pass) and relies on the
            // effect re-running when maxDuration changes — so /time_range is
            // fetched twice. setSelectedDuration([0, newData.time]) would
            // avoid that; confirm intent before changing.
            setSelectedDuration([0, maxDuration])
            setSelectUserOptions(prepareSelectOptions(newData.users))
        };
        fetchData();
    },
    [maxDuration, setSelectedDuration]);


    /** Resets all three filters: all query types, no users, full duration range. */
    function clearFilter() {
        setSelectedQueryTypes(queryTypes)
        setSelectedUser([])
        setSelectedDuration([0, maxDuration])
    }

    // NOTE(review): the Multiselect/Slider/Button JSX below appears stripped
    // by extraction; only fragments remain. Confirm against repo source.
    return (




        setSelectedQueryTypes(detail.selectedOptions)}
        />
        setSelectedUser(detail.selectedOptions)
        }/>




        Filter by time frame
        'Range'}
        valueLabelFormat={(value)=> `${millisToMinutesAndSeconds(value)}`}
        value={selectedDuration}
        min={0}
        max={maxDuration}
        onChange={ (event, newValue) => setSelectedDuration(newValue)}
        size={'large'}
        valueLabelDisplay="auto"
        disableSwap
        marks={[{ value: 0, label: millisToMinutesAndSeconds(0,0)},
            {value: maxDuration, label: millisToMinutesAndSeconds(maxDuration,0)}]}

        />


        clearFilter()}>Clear filters



    )
};
90 |
91 | /**
92 | * Custom styling for filters box, uses AWS-UI design tokens to mimic default styles
93 | * @const {object}
94 | */
/**
 * Sticky container styling for the global filters bar. Colors come from
 * AWS-UI design tokens so the box matches the default component theme.
 * @const {object}
 */
const boxStyle = {
    // keep the filter bar pinned while the page scrolls
    position: 'sticky',
    top: 0,
    display: 'block',
    // spacing and elevation
    padding: 20,
    boxShadow: 20,
    // theme-aware colors via design tokens
    backgroundColor: awsui.colorBackgroundControlDefault,
    borderColor: awsui.colorBorderControlDefault,
    borderWidth: 2,
    boxShadowColor: awsui.colorBorderControlDefault
};
106 |
107 | /**
108 | * Array of query type options
109 | * @const {object}
110 | */
/**
 * Query-type filter options: labels shown in the multiselect, with values
 * equal to the 1-based query-type codes ("1".."11") used by the backend.
 * @const {object}
 */
const queryTypes = [
    "SELECT", "INSERT", "UPDATE", "DELETE", "COPY", "UNLOAD",
    "DDL", "COMMAND", "CTAS", "UTILITY", "OTHER",
].map((label, index) => ({label, value: String(index + 1)}));
156 |
157 | export default GlobalFilters;
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/navigation/NavDrawer.js:
--------------------------------------------------------------------------------
1 | import {SideNavigation} from "@awsui/components-react";
2 | import * as React from 'react';
3 |
4 | /**
5 | * Navigation Sidebar
6 | * List of anchor tags
7 | */
const Nav = () => {
    // NOTE(review): the SideNavigation JSX appears entirely stripped by
    // extraction — only the wrapper remains. Confirm against repo source.
    return (

    );
}
export default Nav;
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/components/navigation/ToolBar.js:
--------------------------------------------------------------------------------
1 | import {SideNavigation} from "@awsui/components-react";
2 | import * as React from 'react';
3 |
4 | /**
5 | * Help Sidebar
6 | * List of anchor tags
7 | */
const ToolBar = () => {
    // NOTE(review): the SideNavigation JSX appears entirely stripped by
    // extraction — only the wrapper remains. Confirm against repo source.
    return (

    );
}
export default ToolBar;
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/helpers/PrepareOptions.js:
--------------------------------------------------------------------------------
1 | /**
2 | * Iterates through provided
3 | * @param {Object} field name of field
4 | * @param {Object} data Total data set of items
5 | * @return {Object} list of unique values formatted as options for selection component
6 | */
/**
 * Collects the unique values for a selection control and formats them as
 * {label, value} options with 1-based string values, sorted by label.
 *
 * @param {Object} field  Field name to read from each item of `data`; when
 *                        `data` is omitted, `field` itself is treated as the
 *                        array of raw values (e.g. a list of user names).
 * @param {Object} [data] Optional data set of items to pull `field` from.
 * @return {Object} list of unique values formatted as options for selection component
 */
export default function prepareSelectOptions(field, data) {
    // A Set de-duplicates in O(1) per element (the previous indexOf scan was O(n^2)).
    const uniqueValues = new Set(data ? data.map(item => item[field]) : field);

    // Sort lexicographically, then format as option objects.
    return [...uniqueValues]
        .sort()
        .map((item, index) => ({label: item, value: (index + 1).toString()}));
}
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/helpers/msFormatter.js:
--------------------------------------------------------------------------------
/**
 * Converts a millisecond value to a "m:ss[.ss]" string for visual formatting.
 * The seconds are rounded to `digits` decimals first and any resulting 60s
 * rollover is carried into the minutes (previously e.g. 59999ms with
 * digits=0 produced "0:60" instead of "1:00").
 * @param {*} milliseconds, value in milliseconds
 * @param {number} digits, number of digits to round to (default 2)
 * @return {string} Formatted string
 */

export default function millisToMinutesAndSeconds(milliseconds, digits = 2) {
    let minutes = Math.floor(milliseconds / 60000);
    // Round the fractional seconds at the requested precision before
    // formatting, so a carry into the next minute can be detected.
    const scale = 10 ** digits;
    let seconds = Math.round(((milliseconds % 60000) / 1000) * scale) / scale;
    if (seconds >= 60) {
        seconds -= 60;
        minutes += 1;
    }
    return minutes + ":" + (seconds < 10 ? '0' : '') + seconds.toFixed(digits);
}
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/index.js:
--------------------------------------------------------------------------------
1 | import React from 'react';
2 | import ReactDOM from 'react-dom/client';
3 | import App from './App';
4 | import reportWebVitals from './reportWebVitals';
5 |
// Mount the React application at the #root DOM node.
const root = ReactDOM.createRoot(document.getElementById('root'));
// NOTE(review): the JSX children (presumably <React.StrictMode><App/></React.StrictMode>)
// appear to have been stripped by extraction — confirm against the repository source.
root.render(


);

// If you want to start measuring performance in your app, pass a function
// to log results (for example: reportWebVitals(console.log))
// or send to an analytics endpoint. Learn more: https://bit.ly/CRA-vitals
reportWebVitals();
17 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/pages/home.js:
--------------------------------------------------------------------------------
1 | import Input from "@awsui/components-react/input";
2 | import Button from "@awsui/components-react/button";
3 | import AppLayout from "@awsui/components-react/app-layout";
4 | import React, {useEffect, useState} from "react";
5 | import {Container, FormField, Header, SpaceBetween, TokenGroup} from "@awsui/components-react";
6 | import ReplayList from "../components/ReplayList";
7 | import AccessControl from "../components/AccessControl";
8 |
/**
 * HomePage — entry page: lets the user add S3 bucket URIs, lists discovered
 * replays, and manages the working set of buckets/replays.
 */
export const HomePage = () => {

    /** Current value of the S3 URI input box */
    const [resource, setResource] = useState('');
    /** Replays discovered across all added buckets */
    const [replays, setReplays] = useState([])
    /** Bucket names already added (used to skip duplicates) */
    const [buckets, setBuckets] = useState([])
    /** TokenGroup items ({label}) mirroring `buckets` */
    const [bucketLabels, setBucketLabels] = useState([])
    /** True while a /search request is in flight */
    const [searching, setSearching] = useState(false)
    /** Profiles returned by the backend */
    const [profiles, setProfiles] = useState([])
    /** False when the last input failed validation or search */
    const [valid, setValid] = useState(true)

    useEffect(() => {
        const fetchData = async () => {
            const response = await fetch(`/getprofile`);
            const newData = await response.json();
            setProfiles(newData.profiles);
        };
        fetchData();
    }, []);

    /** Searches the given S3 URI for replays and merges results into state. */
    function search(uri) {
        // TODO: explicit s3 uri validation

        if (uri !== '' && uri.startsWith('s3://')) {
            setSearching(true);

            fetch(`/search?uri=${encodeURIComponent(uri)}`).then(response => response.json())
                .then(response => {
                    if (!response.success) {
                        setValid(false)
                    } else {
                        // Skip buckets that were already added.
                        if (!buckets.includes(response.bucket)) {
                            setReplays(replays => [...replays, ...response.replays]);
                            setBuckets(buckets => [...buckets, response.bucket]);
                            setBucketLabels(buckets => [...buckets, {label: response.bucket}]);
                        }
                    }

                    setSearching(false);


                }).catch((error) => {
                console.error('Error:', error);
                setSearching(false);

            });
            setResource("");
        } else {
            setValid(false)

        }
    }

    /**
     * Removes entries from list of replays when bucket is removed
     * @param {number} itemIndex Index of the removed bucket token.
     */
    function removeBucket(itemIndex) {
        let bucket = bucketLabels[itemIndex].label
        setBucketLabels([...bucketLabels.slice(0, itemIndex),
            ...bucketLabels.slice(itemIndex + 1)]);
        setBuckets([...buckets.slice(0, itemIndex),
            ...buckets.slice(itemIndex + 1)]);
        // Drop every replay whose bucket matches the removed one.
        let result = replays.filter((data) => {
            return data.bucket.search(bucket) === -1;
        });
        setReplays(result);
    }

    // NOTE(review): the AppLayout/Container/Input/TokenGroup JSX below appears
    // stripped by extraction; only fragments remain. Confirm against repo.
    return (

        Test Drive Replay Analysis

        }>



        search(resource)}>
        Search
        }>

        {
            setResource(event.detail.value);
            setValid(true)
        }}/>



        {
            removeBucket(itemIndex)
        }}
        items={bucketLabels}>








        }

        />
    );

}
134 |
135 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/reportWebVitals.js:
--------------------------------------------------------------------------------
/**
 * Lazily loads the web-vitals package and reports each core metric
 * (CLS, FID, FCP, LCP, TTFB) to the supplied callback. Does nothing
 * when the argument is not a function.
 */
const reportWebVitals = (onPerfEntry) => {
    if (onPerfEntry instanceof Function) {
        import('web-vitals').then((vitals) => {
            const {getCLS, getFID, getFCP, getLCP, getTTFB} = vitals;
            for (const report of [getCLS, getFID, getFCP, getLCP, getTTFB]) {
                report(onPerfEntry);
            }
        });
    }
};

export default reportWebVitals;
14 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/gui/src/setupTests.js:
--------------------------------------------------------------------------------
1 | // jest-dom adds custom jest matchers for asserting on DOM nodes.
2 | // allows you to do things like:
3 | // expect(element).toHaveTextContent(/react/i)
4 | // learn more: https://github.com/testing-library/jest-dom
5 | import '@testing-library/jest-dom';
6 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/replay_analysis.py:
--------------------------------------------------------------------------------
import os
import sys

import common.log as log_helper
4 |
5 |
def launch_analysis_v2():
    """Package install and server init.

    Installs the Python requirements, verifies node is available, installs
    npm packages, then starts the backend (`npm run start-backend`, in the
    background) and the frontend dev server (`npm start`).

    Exits the process with status -1 when node is not installed.
    """

    # add explicit instructions for user

    os.system("pip install -r requirements.txt")
    os.chdir(f"{os.getcwd()}/tools/ReplayAnalysis/gui")

    # explicit version checking
    if os.system("node -v") != 0:
        print("Please install node before proceeding.")
        # sys.exit instead of the site-injected exit() builtin, which is not
        # guaranteed to exist in all environments; still raises SystemExit.
        sys.exit(-1)

    # npm install failure is reported but treated as best-effort: the servers
    # are still started (existing tests pin this behavior).
    if os.system("npm install") != 0:
        print("Could not install npm packages. ")

    os.system("npm run start-backend &")
    os.system("npm start")
24 |
25 |
def main():
    """Entry point: configure file logging, then launch the analysis GUI."""
    log_helper.init_logging(
        "replay_analysis.log",
        dir="tools/ReplayAnalysis/logs",
        logger_name="ReplayAnalysisLogger",
    )
    log_helper.log_version()
    launch_analysis_v2()
30 |
31 |
32 | if __name__ == "__main__":
33 | main()
34 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/ReplayAnalysis/tests/__init__.py
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/tests/test_replay_analysis.py:
--------------------------------------------------------------------------------
1 | from unittest.mock import patch, MagicMock, mock_open, Mock
2 | import unittest
3 | import botocore.session
4 | import tools.ReplayAnalysis.replay_analysis as replay_analysis
5 |
6 |
class TestReplayAnalysis(unittest.TestCase):
    """Tests for launch_analysis_v2, driven entirely by mocked os.system results."""

    def setUp(self):
        # Common fixtures. Several are not used by the tests below and are
        # presumably shared boilerplate with sibling test modules — confirm.
        self.severless_cluster = {
            "is_serverless": True,
            "secret_name": None,
            "host": "host",
            "region": "someregion",
            "port": 5439,
            "database": "somedb",
            "id": "someid",
        }
        self.bucket = {"url": "someurl", "bucket_name": "somebucket", "prefix": "someprefix"}
        self.provisioned_cluster = {
            "is_serverless": False,
            "secret_name": None,
            "host": "host",
            "region": "someregion",
            "port": 5439,
            "database": "somedb",
            "id": "someid",
        }
        self.report = MagicMock()
        self.replay = "someid"
        self.cluster_endpoint = "someid"
        self.start_time = "sometime"
        self.end_time = "sometime"
        self.bucket_url = "url"
        self.iam_role = "somerole"
        self.user = "someuser"
        self.rs_client_response = {"DbUser": self.user, "DbPassword": "password123"}
        # Build real redshift client exception classes from the botocore model
        # so tests can raise/catch them like the live client would.
        model = botocore.session.get_session().get_service_model("redshift")
        factory = botocore.errorfactory.ClientExceptionsFactory()
        self.exceptions = factory.create_client_exceptions(model)

    @patch("os.system")
    @patch("os.chdir")
    @patch("builtins.print")
    def test_launch_analysis_v2_exit(self, mock_print, mock_chdir, mock_os):
        # os.system call order in launch_analysis_v2:
        # 1) pip install  2) node -v  3) npm install  4) start-backend  5) npm start.
        # pip returns 5, `node -v` returns 10 (non-zero) -> the function must
        # print the install hint and raise SystemExit before any npm call.
        mock_os.side_effect = [5, 10]
        with self.assertRaises(SystemExit):
            replay_analysis.launch_analysis_v2()
        mock_print.assert_called_once_with("Please install node before proceeding.")

    @patch("os.system")
    @patch("os.chdir")
    @patch("builtins.print")
    def test_launch_analysis_v2_cannot_install(self, mock_print, mock_chdir, mock_os):
        # node -v succeeds but `npm install` returns 1 -> the warning is printed
        # and launch continues (start-backend / npm start still invoked).
        mock_os.side_effect = [0, 0, 1, 1, 1]
        replay_analysis.launch_analysis_v2()
        mock_print.assert_called_once_with("Could not install npm packages. ")

    @patch("os.system", return_value=0)
    @patch("os.chdir")
    @patch("builtins.print")
    def test_launch_analysis_v2_success(self, mock_print, mock_chdir, mock_os):
        # Every shell command succeeds -> nothing is printed.
        replay_analysis.launch_analysis_v2()
        mock_print.assert_not_called()
64 |
--------------------------------------------------------------------------------
/tools/ReplayAnalysis/util/report_gen.py:
--------------------------------------------------------------------------------
1 | import os
2 | import pandas as pd
3 | import yaml
4 |
5 | from functools import partial
6 | from reportlab.lib.pagesizes import letter
7 | from reportlab.lib.units import inch
8 | from reportlab.platypus import (
9 | PageBreak,
10 | TableStyle,
11 | Table,
12 | Spacer,
13 | Image,
14 | SimpleDocTemplate,
15 | Paragraph,
16 | ListFlowable,
17 | ListItem,
18 | )
19 | from report_util import (
20 | styles,
21 | build_pdf_tables,
22 | df_to_np,
23 | first_page,
24 | later_pages,
25 | hist_gen,
26 | sub_yaml_vars,
27 | )
28 |
29 | g_stylesheet = styles()
30 |
31 |
def pdf_gen(report, summary=None):
    """This function formats the summary report using the content from report_content.yaml to populate the paragraphs,
    titles, and headers. The tables are populated via the Report param which has all the dataframes.

    @param report: Report object
    @param summary: list, replay summary
    @return: str, filename of the generated PDF
    """
    with open("report_content.yaml", "r") as stream:
        docs = yaml.safe_load(stream)

    style = g_stylesheet.get("styles")
    elems = []  # elements array used to build pdf structure
    pdf = SimpleDocTemplate(
        f"{report.replay_id}_report.pdf",
        pagesize=letter,
        leftMargin=0.75 * inch,
        rightMargin=0.75 * inch,
        topMargin=0.75 * inch,
        bottomMargin=0.75 * inch,
    )

    # title and subtitle and cluster info table
    elems.append(Paragraph(docs["title"], style["Title"]))
    elems.append(Paragraph(sub_yaml_vars(report, docs["subtitle"]), style["Heading4"]))
    cluster_info = pd.DataFrame.from_dict(report.cluster_details, orient="index")
    elems.append(
        Table(
            df_to_np(report.cluster_details.keys(), cluster_info.transpose()),
            hAlign="LEFT",
            style=g_stylesheet.get("table_style"),
        )
    )
    # replay summary
    if summary is not None:
        # plain literal (was an f-string with no placeholders)
        elems.append(Paragraph("Replay Summary", style["Heading4"]))
        elems.append(
            ListFlowable(
                [ListItem(Paragraph(x, style["Normal"])) for x in summary],
                bulletType="bullet",
            )
        )
        elems.append(Spacer(0, 5))

    elems.append(Paragraph(docs["report_paragraph"], style["Normal"]))

    # glossary section
    elems.append(Paragraph(docs["glossary_header"], style["Heading4"]))
    elems.append(Paragraph(docs["glossary_paragraph"], style["Normal"]))
    elems.append(
        ListFlowable(
            [ListItem(Paragraph(x, style["Normal"])) for x in docs["glossary"]],
            bulletType="bullet",
        )
    )
    elems.append(Spacer(0, 5))

    # access data section
    elems.append(Paragraph(docs["data_header"], style["Heading4"]))
    elems.append(Paragraph(sub_yaml_vars(report, docs["data_paragraph"]), style["Normal"]))
    elems.append(
        ListFlowable(
            [ListItem(Paragraph(x, style["Normal"])) for x in docs["raw_data"]],
            bulletType="bullet",
        )
    )
    elems.append(Spacer(0, 5))
    elems.append(Paragraph(sub_yaml_vars(report, docs["agg_data_paragraph"]), style["Normal"]))

    # notes section
    elems.append(Paragraph(docs["notes_header"], style["Heading4"]))
    elems.append(Paragraph(docs["notes_paragraph"], style["Normal"]))
    elems.append(
        ListFlowable(
            [ListItem(Paragraph(x, style["Normal"])) for x in docs["notes"]],
            bulletType="bullet",
        )
    )

    elems.append(PageBreak())  # page 2: cluster details

    # query breakdown
    build_pdf_tables(elems, docs["query_breakdown"], report)
    elems.append(Spacer(0, 5))

    # histogram and description
    image_path = hist_gen(
        x_data=report.feature_graph["sec_start"],
        y_data=report.feature_graph["count"],
        title=docs["graph"].get("title"),
        x_label="Average Elapsed Time (s)",
    )

    try:
        desc = Paragraph(docs["graph"].get("paragraph"), style["Normal"])
        data = [[Image(image_path, width=300, height=200, hAlign="LEFT"), desc]]
        elems.append(Table(data, style=TableStyle([("VALIGN", (0, 0), (-1, -1), "MIDDLE")])))
        elems.append(Spacer(0, 5))

        # cluster metrics table
        build_pdf_tables(elems, docs["cluster_metrics"], report)

        elems.append(PageBreak())  # page 3+ measure tables

        build_pdf_tables(
            elems, docs["measure_tables"], report
        )  # build 5 measure tables all at once

        # build pdf
        pdf.build(
            elems,
            onFirstPage=partial(first_page, report=report),
            onLaterPages=partial(later_pages, report=report),
        )
    finally:
        # always remove the temp histogram image, even if PDF generation fails
        # (previously the file leaked when pdf.build raised)
        os.remove(image_path)

    return pdf.filename
148 |
--------------------------------------------------------------------------------
/tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/aws/redshift-test-drive/354b7ed75180a6b915d856175cffd6414cae998e/tools/__init__.py
--------------------------------------------------------------------------------