├── .gitignore
├── README.md
├── cluster_dir
│   ├── config.py
│   └── start_monitoring.py
└── redshift_monitoring
    ├── __init__.py
    ├── base_config.py
    ├── cloudwatch_metrics.py
    ├── diskspace_metrics.py
    ├── post_db.py
    ├── query_level_metrics.py
    ├── redshift_monitoring.py
    ├── running_queries.py
    ├── table_level_metrics.py
    ├── thread_utils.py
    └── wlm_metrics.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
*.pyc

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Motivation
Redshift is AWS's distributed data warehousing solution. The Redshift monitoring console displays:

1. Hardware metrics such as CPU, disk space, and read/write IOPS for the cluster.
2. Query-level information such as:
   a. Expected versus actual execution plan
   b. Username-to-query mapping
   c. Time taken per query

## RedEye Overview
The tool gathers the following metrics on Redshift performance:

1. Hardware metrics:
   a. CPU utilization
   b. Disk space utilization
   c. Read/write IOPS
   d. Read latency/throughput
   e. Write latency/throughput
   f. Network receive/transmit throughput

2. Software metrics:
   a. Aggregate metrics:
      - Queries fired by each user
      - Queries in the Queued/Running/Returning state in each queue
      - Average time spent in the Queued/Running/Returning state in each queue
   b. Query-level metrics: number of disk hits, number of rows broadcast across nodes, queue used and user, at a query_id granularity
   c. Table-level metrics: least and most used tables in the warehouse

There is a short [post](http://saharshoza.github.io/RedEye/) highlighting some of the insights that can be gained from this utility.

# Requirements
For this tool to work, you will need:

1. A StatsD endpoint
2. An OpenTSDB endpoint
3. A MySQL endpoint
4. Redshift credentials

# Quick Setup

`vim $REDEYE/redshift_monitoring/base_config.py`

This file contains configuration properties common to all clusters. Edit the StatsD, OpenTSDB and MySQL endpoints here.

`mkdir $REDEYE/cluster_new`
`cp $REDEYE/cluster_dir/config.py $REDEYE/cluster_new/`
`vim $REDEYE/cluster_new/config.py`

This file contains configuration specific to your Redshift cluster. Change the credentials to point to your cluster; the username and password you provide must be able to read the Redshift system tables.

`cp $REDEYE/cluster_dir/start_monitoring.py $REDEYE/cluster_new/`

`python $REDEYE/cluster_new/start_monitoring.py`

The tool is designed to run for an hour at a time. It can be scheduled through a workflow scheduler such as Azkaban to keep the monitoring persistent.
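
For illustration, a filled-in `cluster_new/config.py` could look like the sketch below. Every value shown (endpoint, credentials, node count, disk size, log file name) is an invented placeholder rather than a shipped default; the keys simply mirror `cluster_dir/config.py`:

```python
config_dict = {
    'redshift_connection': {'redshift_endpoint': 'my-cluster.abc123.us-east-1.redshift.amazonaws.com',
                            'redshift_user': 'monitor_user',
                            'redshift_pass': 'monitor_pass',
                            'port': 5439,
                            'dbname': 'analytics',
                            'cluster_name': 'my-cluster',
                            'aws_access_key_id': 'AKIA................',
                            'aws_secret_access_key': '........................................',
                            'region_name': 'us-east-1',
                            'num_nodes_cluster': 4},
    'database': {'mysql_table': {'query_tab': 'rs_query_monitor_',
                                 'table_tab': 'rs_table_monitor_',
                                 'query_scan_tab': 'rs_scan_monitor_'}},
    'metrics': {'general': {'cluster_name': 'my-cluster'},
                'diskspace_metrics': {'total_disk_space_mb': 2048000}},
    'utility': {'LOG_FILE': 'my_cluster_monitoring.log'}
}
```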
--------------------------------------------------------------------------------
/cluster_dir/config.py:
--------------------------------------------------------------------------------
# Per-cluster configuration template. Fill in every None value for your cluster before use.
config_dict = {
    'redshift_connection' : {'redshift_endpoint': None,
                             'redshift_user': None,
                             'redshift_pass': None,
                             'port': None,
                             'dbname': None,
                             'cluster_name': None,
                             'aws_access_key_id': None,
                             'aws_secret_access_key': None,
                             'region_name': None,
                             'num_nodes_cluster': None},
    'database' : {'mysql_table' : {'query_tab': 'rs_query_monitor_',
                                   'table_tab': 'rs_table_monitor_',
                                   'query_scan_tab': 'rs_scan_monitor_'}},
    'metrics' : {'general':
                     {'cluster_name': None},
                 'diskspace_metrics':
                     {'total_disk_space_mb': None}
                },
    'utility' : {'LOG_FILE': 'commerce_monitoring.log'}
}

--------------------------------------------------------------------------------
/cluster_dir/start_monitoring.py:
--------------------------------------------------------------------------------
from config import *
import os, sys, inspect

if __name__ == "__main__":

    cmd_folder = os.path.realpath('../')

    ## Azkaban operates from a different directory, but adds this path as the first element of its system path
    if sys.path[0] == '':
        sys.path.insert(1, cmd_folder)
    else:
        sys.path.insert(1, sys.path[0][:sys.path[0].rfind("/")])

    redshift_monitoring = map(__import__, ['redshift_monitoring'])

    cmd_folder = os.path.realpath('../redshift_monitoring')
    if sys.path[0] == '':
        sys.path.insert(1, cmd_folder)
    else:
        sys.path.insert(1, sys.path[0][:sys.path[0].rfind("/")] + '/redshift_monitoring')

    redshift_monitoring[0].initiate_monitoring(config_dict)

--------------------------------------------------------------------------------
/redshift_monitoring/__init__.py:
--------------------------------------------------------------------------------
__all__ = ['redshift_monitoring','cloudwatch_metrics','diskspace_metrics','query_level_metrics','running_queries','thread_utils','wlm_metrics','post_db']

from redshift_monitoring import *
from cloudwatch_metrics import *
from diskspace_metrics import *
from query_level_metrics import *
from running_queries import *
from thread_utils import *
from wlm_metrics import *
from post_db import *

--------------------------------------------------------------------------------
/redshift_monitoring/base_config.py:
--------------------------------------------------------------------------------
base_config = {
    'redshift_connection' : {},
    'database' : {'opentsdb_url': 'http://:4242/api/put/',
                  'statsd_ip': ,
'statsd_port': , 6 | 'mysql_endpoint': 'mysql+mysqldb://', 7 | 'mysql_user': , 8 | 'mysql_port': , 9 | 'mysql_dbname': , 10 | 'mysql_pass' : }, 11 | 'query' : {'query_dictionary': { 12 | 'query1' : """SELECT * FROM stv_recents WHERE status = 'Running';""", 13 | 'query2' : """SELECT * FROM stv_wlm_query_state;""", 14 | 'query3' : """SELECT wlm.query AS query_id, 15 | wlm.state, 16 | wlm.service_class AS queue, 17 | CONVERT_TIMEZONE('Asia/Calcutta',wlm.wlm_start_time) AS starttime, 18 | wlm.slot_count, 19 | pg_user.usename AS username, 20 | ex.inner_bcast_count, 21 | bcast.bcast_rows, 22 | CAST((wlm.exec_time) AS float) / 1000000 AS exec_time, 23 | CAST((wlm.queue_time) AS float) / 1000000 AS queue_time, 24 | CAST(SUM(qs.workmem) AS float) / 1000000000 AS workmem, 25 | SUM(CASE WHEN qs.is_diskbased = 't' THEN 1 ELSE 0 END) AS num_diskhits 26 | FROM stv_wlm_query_state wlm 27 | LEFT JOIN svv_query_state qs ON qs.query = wlm.query 28 | LEFT JOIN pg_user ON qs.userid = pg_user.usesysid 29 | LEFT JOIN (SELECT DISTINCT query, 30 | SUM(ROWS) AS bcast_rows 31 | FROM stl_bcast 32 | GROUP BY 1) bcast ON bcast.query = wlm.query 33 | LEFT JOIN (SELECT DISTINCT ex.query, 34 | COUNT(*) inner_bcast_count 35 | FROM stl_explain ex, 36 | stv_wlm_query_state wlm 37 | WHERE wlm.query = ex.query 38 | AND wlm.state = 'Running' 39 | AND ex.plannode LIKE ('%%DS_BCAST_INNER%%') 40 | GROUP BY 1) ex ON ex.query = wlm.query 41 | GROUP BY 1, 42 | 2, 43 | 3, 44 | 4, 45 | 5, 46 | 6, 47 | 7, 48 | 8, 49 | 9, 50 | 10 51 | ORDER BY 1, 52 | 2, 53 | 3;""", 54 | 'query4' : """SELECT TRIM(pgn.nspname) AS SCHEMA, 55 | SUM(b.mbytes) 56 | FROM (SELECT db_id, 57 | id, 58 | name, 59 | SUM(ROWS) AS ROWS 60 | FROM stv_tbl_perm a 61 | GROUP BY db_id, 62 | id, 63 | name) AS a 64 | JOIN pg_class AS pgc ON pgc.oid = a.id 65 | JOIN pg_namespace AS pgn ON pgn.oid = pgc.relnamespace 66 | JOIN pg_database AS pgdb ON pgdb.oid = a.db_id 67 | JOIN (SELECT tbl, COUNT(*) AS mbytes FROM stv_blocklist GROUP BY tbl) b ON a.id = b.tbl 68 | GROUP BY 1 69 | ORDER BY 2;""", 70 | 'query5':"""SELECT tab_all.*, 71 | tab_m1.m1_num_scan, 72 | tab_m1.m1_row_scan, 73 | tab_m1.m1_avg_time, 74 | tab_w1.w1_num_scan, 75 | tab_w1.w1_row_scan, 76 | tab_w1.w1_avg_time, 77 | tab_d1.d1_num_scan, 78 | tab_d1.d1_row_scan, 79 | tab_d1.d1_avg_time, 80 | tab_h6.h6_num_scan, 81 | tab_h6.h6_row_scan, 82 | tab_h6.h6_avg_time, 83 | tab_h3.h3_num_scan, 84 | tab_h3.h3_row_scan, 85 | tab_h3.h3_avg_time 86 | FROM (SELECT "schema" AS schemaname, 87 | "table" AS tablename, 88 | pct_used AS pct_mem_used, 89 | unsorted AS unsorted_rows, 90 | stats_off AS statistics, 91 | encoded AS is_encoded, 92 | diststyle, 93 | (CASE WHEN sortkey1 LIKE ('%%INTERLEAVED%%') THEN 'INTERLEAVED' ELSE sortkey1 END) sortkey1, 94 | skew_sortkey1, 95 | skew_rows 96 | FROM svv_table_info 97 | ORDER BY 1, 98 | 2) tab_all 99 | LEFT JOIN (SELECT DISTINCT scan.perm_table_name AS m1_table_name, 100 | COUNT(*) AS m1_num_scan, 101 | AVG("rows") AS m1_row_scan, 102 | AVG(DATEDIFF (seconds,scan.starttime,scan.endtime))::FLOAT AS m1_avg_time 103 | FROM stl_scan scan 104 | WHERE starttime > GETDATE () -INTERVAL '1 month' 105 | AND starttime < GETDATE () 106 | AND endtime > '2001-01-01 00:00:00' 107 | GROUP BY 1) tab_m1 ON tab_all.tablename = tab_m1.m1_table_name 108 | LEFT JOIN (SELECT DISTINCT scan.perm_table_name AS w1_table_name, 109 | COUNT(*) AS w1_num_scan, 110 | AVG("rows") AS w1_row_scan, 111 | AVG(DATEDIFF (seconds,scan.starttime,scan.endtime))::FLOAT AS w1_avg_time 112 | FROM stl_scan scan 113 | WHERE starttime 
> GETDATE () -INTERVAL '1 week' 114 | AND starttime < GETDATE () 115 | AND endtime > '2001-01-01 00:00:00' 116 | GROUP BY 1) tab_w1 ON tab_all.tablename = tab_w1.w1_table_name 117 | LEFT JOIN (SELECT DISTINCT scan.perm_table_name AS d1_table_name, 118 | COUNT(*) AS d1_num_scan, 119 | AVG("rows") AS d1_row_scan, 120 | AVG(DATEDIFF (seconds,scan.starttime,scan.endtime))::FLOAT AS d1_avg_time 121 | FROM stl_scan scan 122 | WHERE starttime > GETDATE () -INTERVAL '1 day' 123 | AND starttime < GETDATE () 124 | AND endtime > '2001-01-01 00:00:00' 125 | GROUP BY 1) tab_d1 ON tab_all.tablename = tab_d1.d1_table_name 126 | LEFT JOIN (SELECT DISTINCT scan.perm_table_name AS h6_table_name, 127 | COUNT(*) AS h6_num_scan, 128 | AVG("rows") AS h6_row_scan, 129 | AVG(DATEDIFF (seconds,scan.starttime,scan.endtime))::FLOAT AS h6_avg_time 130 | FROM stl_scan scan 131 | WHERE starttime > GETDATE () -INTERVAL '6 hours' 132 | AND starttime < GETDATE () 133 | AND endtime > '2001-01-01 00:00:00' 134 | GROUP BY 1) tab_h6 ON tab_all.tablename = tab_h6.h6_table_name 135 | LEFT JOIN (SELECT DISTINCT scan.perm_table_name AS h3_table_name, 136 | COUNT(*) AS h3_num_scan, 137 | AVG("rows") AS h3_row_scan, 138 | AVG(DATEDIFF (seconds,scan.starttime,scan.endtime))::FLOAT AS h3_avg_time 139 | FROM stl_scan scan 140 | WHERE starttime > GETDATE () -INTERVAL '3 hours' 141 | AND starttime < GETDATE () 142 | AND endtime > '2001-01-01 00:00:00' 143 | GROUP BY 1) tab_h3 ON tab_all.tablename = tab_h3.h3_table_name;""", 144 | 'query6':"""SELECT DISTINCT scan.query AS query_id, 145 | wlm.service_class AS queue, 146 | scan.perm_table_name AS tablename, 147 | CONVERT_TIMEZONE('Asia/Calcutta',wlm.wlm_start_time) AS query_start_time, 148 | (COUNT(CASE WHEN scan.is_rrscan = 't' THEN scan.is_rrscan ELSE NULL END)::float/ COUNT(*)::float)*100 AS range_scan_pct, 149 | AVG("rows") AS rows_scan, 150 | AVG(DATEDIFF (seconds,scan.starttime,scan.endtime)) AS avg_time 151 | FROM stl_scan scan, 152 | stv_wlm_query_state wlm 153 | WHERE wlm.state = 'Running' 154 | AND wlm.query = scan.query 155 | GROUP BY 1, 156 | 2, 157 | 3, 158 | 4 159 | ORDER BY 1;"""}, 160 | 'metric_query_dictionary': { 161 | 'running_queries' : ['query1'], 162 | 'wlm_metrics' : ['query2'], 163 | 'query_level_metrics' : ['query3','query6'], 164 | 'diskspace_metrics' : ['query4'], 165 | 'table_level_metrics' : ['query5'] 166 | }, 167 | 'query_frequency_dictionary' : { 168 | 'query1' : 1, 169 | 'query2' : 1, 170 | 'query3' : 1, 171 | 'query4' : 100, 172 | 'query6' : 1, 173 | 'query5' : 1000 } 174 | }, 175 | 'metrics' : {}, 176 | 'utility' : {'sleep_config': 60, 177 | 'job_reset_time': 3600} 178 | } 179 | 180 | -------------------------------------------------------------------------------- /redshift_monitoring/cloudwatch_metrics.py: -------------------------------------------------------------------------------- 1 | # Libraries to get cloudwatch metrics 2 | from datetime import datetime 3 | from datetime import timedelta 4 | import boto3 5 | from boto3.session import Session 6 | 7 | import logging 8 | #from statsd import StatsClient 9 | #import json 10 | import time 11 | #import requests 12 | 13 | # Post to database 14 | from post_db import * 15 | 16 | 17 | class CloudWatchMetrics(): 18 | 19 | def __init__(self,config_dict): 20 | # Create boto3 session for getting cloudwatch metrics 21 | session = Session(aws_access_key_id=config_dict['redshift_connection']['aws_access_key_id'] 22 | ,aws_secret_access_key=config_dict['redshift_connection']['aws_secret_access_key'] 23 | 
                          ,region_name=config_dict['redshift_connection']['region_name'])
        self.cw = session.client('cloudwatch')
        self.name_space = 'AWS/Redshift'
        self.metric_name = ['CPUUtilization',
                            'NetworkReceiveThroughput',
                            'NetworkTransmitThroughput',
                            'PercentageDiskSpaceUsed',
                            'ReadIOPS',
                            'ReadLatency',
                            'ReadThroughput',
                            'WriteIOPS',
                            'WriteLatency',
                            'WriteThroughput']
        self.period = 60
        self.statistics = ['Average']
        self.unit = ['Percent',
                     'Bytes/Second',
                     'Bytes/Second',
                     'Percent',
                     'Count/Second',
                     'Seconds',
                     'Bytes/Second',
                     'Count/Second',
                     'Seconds',
                     'Bytes/Second']
        self.log_identifier = 'cw_metrics'
        self.cluster_name = config_dict['redshift_connection']['cluster_name']
        self.num_nodes = config_dict['redshift_connection']['num_nodes_cluster']
        self.post_db = PostDB(db_queue=None,database_config=config_dict['database'])

    def get_metrics(self):
        response = [0] * len(self.metric_name)
        json_response_node = {}
        json_opentsdb = {}
        tags = {}
        tags['ClusterIdentifier'] = self.cluster_name

        for node_iter in range(0,self.num_nodes):

            # Query a one-minute window a few minutes in the past so CloudWatch has a complete datapoint.
            # This module imports the datetime class directly, so the call is datetime.utcnow().
            start_time = datetime.utcnow() - timedelta(minutes=4)
            end_time = datetime.utcnow() - timedelta(minutes=3)

            if node_iter == self.num_nodes-1:
                node = 'Leader'
                dimensions = [{'Name': 'ClusterIdentifier', 'Value': self.cluster_name},
                              {'Name': 'NodeID', 'Value': 'Leader'}]
            else:
                node = 'Compute-'+str(node_iter)
                dimensions = [{'Name': 'ClusterIdentifier', 'Value': self.cluster_name},
                              {'Name': 'NodeID', 'Value': 'Compute-'+str(node_iter)}]

            for i in range(0,len(self.metric_name)):
                response[i] = self.cw.get_metric_statistics(Namespace=self.name_space,
                                                            MetricName=self.metric_name[i],
                                                            Dimensions=dimensions,
                                                            StartTime=start_time,
                                                            EndTime=end_time,
                                                            Period=self.period,
                                                            Statistics=self.statistics,
                                                            Unit=self.unit[i])

                if response[i]['Datapoints'] == []:
                    logging.info('%s : %s metric has no Datapoints on node %s', self.log_identifier, self.metric_name[i], node)
                    print '%s : %s metric has no Datapoints on node %s' %(self.log_identifier, self.metric_name[i], node)
                else:
                    self.post_db.post_statsd([{'redshift.'+response[i]['Label']+str('.')+str(node)+'.'+self.cluster_name : response[i]['Datapoints'][0][self.statistics[0]]}])
                    #statsd.gauge('redshift.'+response[i]['Label']+str('.')+str(node), response[i]['Datapoints'][0][self.statistics[0]])
                    #json_opentsdb['metric'] = str('redshift.')+response[i]['Label']
                    #json_opentsdb['timestamp'] = int(time.time())
                    #json_opentsdb['value'] = response[i]['Datapoints'][0][self.statistics[0]]
                    #tags['Node'] = node
                    #json_opentsdb['tags'] = tags

                    #logging.info('%s : JSON posted is %s', self.log_identifier, json_opentsdb)
                    #print '%s : JSON posted is %s' %(self.log_identifier, json_opentsdb)
                    #self.post_db.post_opentsdb(json_opentsdb)
                    #r = requests.post(url,data=json.dumps(json_opentsdb))
                    #logging.info('%s : HTTP Response is %s', self.log_identifier, r)
                    #print '%s : HTTP Response is %s' %(self.log_identifier, r)

--------------------------------------------------------------------------------
/redshift_monitoring/diskspace_metrics.py:
--------------------------------------------------------------------------------
import pandas as pd
import time
import copy
import
numpy as np 5 | import logging 6 | from thread_utils import * 7 | 8 | def compute_metric(thread_num,df_response,total_disk_space_mb): 9 | 10 | logging.info('%s : %d thread : computing json response for diskspace_metrics', 'system_tables',thread_num) 11 | print '%s : %d thread : computing json response for diskspace_metrics' %('system_tables',thread_num) 12 | 13 | return {schema:(float(diskspace[0]*100)/float(total_disk_space_mb)) for (schema,diskspace) in df_response.set_index('schema').T.to_dict('list').items()} 14 | 15 | ## Thread starts here 16 | def run(thread_num,query_result_queue,db_queue,stop_thread,config_dict): 17 | 18 | metric_name = 'redshift.DiskSchemaUsage' 19 | db = 'opentsdb' 20 | thread_read = ThreadRead() 21 | thread_write = ThreadWrite(config_dict['general']['cluster_name']) 22 | 23 | while (not stop_thread.is_set()): 24 | payload = {} 25 | df_response, error = thread_read.read_query_result(thread_num=thread_num,query_result_queue=query_result_queue,metric_name=metric_name,query_name='query4') 26 | if error == -1: 27 | print 'Something broke. Skip this run of %s' %(metric_name) 28 | else: 29 | metric_dictionary = compute_metric(thread_num=thread_num,df_response=df_response,total_disk_space_mb=config_dict['diskspace_metrics']['total_disk_space_mb']) 30 | payload['opentsdb'] = thread_write.get_payload(db='opentsdb',metric_name=metric_name,metric_dictionary=metric_dictionary,tag_name='schema') 31 | payload['statsd'] = thread_write.get_payload(db='statsd',metric_name=metric_name,metric_dictionary=metric_dictionary,tag_name='schema') 32 | thread_write.write_payload(payload=payload,db_queue=db_queue) 33 | -------------------------------------------------------------------------------- /redshift_monitoring/post_db.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | import requests 4 | import sqlalchemy.orm 5 | from sqlalchemy import func 6 | from sqlalchemy import create_engine 7 | from sqlalchemy.orm import sessionmaker, mapper, clear_mappers 8 | from sqlalchemy.ext.declarative import declarative_base 9 | from sqlalchemy import Column, Integer, String, Float, DateTime, MetaData, Table, and_ 10 | import logging 11 | from statsd import StatsClient 12 | import os 13 | import datetime 14 | from datetime import timedelta 15 | 16 | Base = declarative_base() 17 | 18 | ist_delta = timedelta(hours=0,minutes=0) 19 | 20 | log_identifier = 'system_tables' 21 | 22 | class PostDB(): 23 | 24 | def __init__(self,db_queue,database_config): 25 | mysql_endpoint = database_config['mysql_endpoint'] 26 | mysql_user = database_config['mysql_user'] 27 | mysql_port = database_config['mysql_port'] 28 | mysql_dbname = database_config['mysql_dbname'] 29 | mysql_pass = database_config['mysql_pass'] 30 | statsd_ip = database_config['statsd_ip'] 31 | statsd_port = database_config['statsd_port'] 32 | self.opentsdb_url = database_config['opentsdb_url'] 33 | query_tabname = database_config['mysql_table']['query_tab'] 34 | table_tabname = database_config['mysql_table']['table_tab'] 35 | query_scan_tab = database_config['mysql_table']['query_scan_tab'] 36 | 37 | 38 | self.mysql_engine_string = mysql_endpoint+mysql_user+':'+mysql_pass+'@'+mysql_port+'/'+mysql_dbname 39 | self.db_queue = db_queue 40 | self.engine = create_engine(self.mysql_engine_string) 41 | self.Session = sessionmaker(bind=self.engine) 42 | self.session = self.Session() 43 | self.statsd = StatsClient(statsd_ip,statsd_port) 44 | self.RSQueryMonitor = 
self.get_query_object(query_tabname,self.engine) 45 | self.RSTableMonitor = self.get_table_object(table_tabname,self.engine) 46 | self.RSQueryScan = self.get_scan_object(query_scan_tab,self.engine) 47 | 48 | def post_db(self,metric_list): 49 | logging.info('%s : Waiting for json response on %d', log_identifier, os.getpid()) 50 | #print '%s : Waiting for json response on %d' %(log_identifier, os.getpid()) 51 | for i in range(0,len(metric_list)): 52 | payload = self.db_queue.get() 53 | logging.info('%s : JSON response received is %s', log_identifier, payload) 54 | #print '%s : JSON response received is %s' %(log_identifier, payload) 55 | for key in payload: 56 | if key == 'opentsdb': 57 | r = self.post_opentsdb(payload[key]) 58 | elif key == 'mysql': 59 | self.post_mysql(payload[key]) 60 | elif key == 'statsd': 61 | self.post_statsd(payload[key]) 62 | 63 | def post_statsd(self,payload): 64 | for metric in range(0,len(payload)): 65 | for key in payload[metric]: 66 | self.statsd.gauge(key, payload[metric][key]) 67 | 68 | def post_opentsdb(self,payload): 69 | r = requests.post(self.opentsdb_url,data=json.dumps(payload)) 70 | logging.info('%s : HTTP response received is %s', log_identifier, r) 71 | print '%s : HTTP response received is %s' %(log_identifier, r) 72 | return r 73 | 74 | def post_mysql(self,payload): 75 | for key in payload: 76 | if key == 'rs_query_monitor': 77 | table_rows_list = payload[key] 78 | for row in table_rows_list: 79 | table_rows_dict = table_rows_list[row] 80 | if self.session.query(self.RSQueryMonitor).filter(self.RSQueryMonitor.query_id == table_rows_dict['query_id']).count() == 0: 81 | self.session.add(self.RSQueryMonitor(query_id=table_rows_dict['query_id'],username=table_rows_dict['username'].strip(), 82 | workmem=table_rows_dict['workmem'], num_diskhits=table_rows_dict['num_diskhits'], exec_time=table_rows_dict['exec_time'], 83 | queue_time=table_rows_dict['queue_time'], slot_count=table_rows_dict['slot_count'], 84 | starttime=table_rows_dict['starttime'],state=table_rows_dict['state'].strip(),queue=table_rows_dict['queue'], 85 | inner_bcast_count=table_rows_dict['inner_bcast_count'],bcast_rows=table_rows_dict['bcast_rows'], 86 | last_modified_on=datetime.datetime.utcnow()+ist_delta)) 87 | else: 88 | row = self.session.query(self.RSQueryMonitor).filter(self.RSQueryMonitor.query_id == table_rows_dict['query_id']).first() 89 | row.queue = table_rows_dict['username'].strip() 90 | row.workmem = table_rows_dict['workmem'] 91 | row.num_diskhits = table_rows_dict['num_diskhits'] + row.num_diskhits 92 | row.exec_time = table_rows_dict['exec_time'] 93 | row.queue_time = table_rows_dict['queue_time'] 94 | row.slot_count = table_rows_dict['slot_count'] 95 | row.starttime = table_rows_dict['starttime'] 96 | row.state = table_rows_dict['state'].strip() 97 | row.queue = table_rows_dict['queue'] 98 | row.inner_bcast_count = table_rows_dict['inner_bcast_count'] 99 | row.bcast_rows = table_rows_dict['bcast_rows'] 100 | row.last_modified_on = datetime.datetime.utcnow()+ist_delta 101 | self.session.commit() 102 | max_lmd = self.session.query(func.max(self.RSQueryMonitor.last_modified_on)).all()[0][0] 103 | done_rows = self.session.query(self.RSQueryMonitor).filter(~self.RSQueryMonitor.state.in_('Done ')).filter(self.RSQueryMonitor.last_modified_on < max_lmd).all() 104 | for row in range(0,len(done_rows)): 105 | done_rows[row].state = 'Done' 106 | self.session.commit() 107 | if key == 'rs_table_monitor': 108 | table_rows_list = payload[key] 109 | for row in table_rows_list: 110 | 
table_rows_dict = table_rows_list[row] 111 | if self.session.query(self.RSTableMonitor).filter(and_(self.RSTableMonitor.schemaname == table_rows_dict['schemaname'].strip(), self.RSTableMonitor.tablename == table_rows_dict['tablename'].strip())).count() == 0: 112 | self.session.add(self.RSTableMonitor(schemaname=table_rows_dict['schemaname'].strip(),tablename=table_rows_dict['tablename'].strip(),pct_mem_used=table_rows_dict['pct_mem_used'], 113 | unsorted_rows=table_rows_dict['unsorted_rows'], statistics=table_rows_dict['statistics'], is_encoded=table_rows_dict['is_encoded'],diststyle=table_rows_dict['diststyle'], 114 | sortkey1=table_rows_dict['sortkey1'],skew_sortkey1=table_rows_dict['skew_sortkey1'],skew_rows=table_rows_dict['skew_rows'],m1_num_scan=table_rows_dict['m1_num_scan'], 115 | m1_row_scan=table_rows_dict['m1_row_scan'],m1_avg_time=table_rows_dict['m1_avg_time'],w1_num_scan=table_rows_dict['w1_num_scan'],w1_row_scan=table_rows_dict['w1_row_scan'], 116 | w1_avg_time=table_rows_dict['w1_avg_time'],d1_num_scan=table_rows_dict['d1_num_scan'],d1_row_scan=table_rows_dict['d1_row_scan'],d1_avg_time=table_rows_dict['d1_avg_time'], 117 | h6_num_scan=table_rows_dict['h6_num_scan'],h6_row_scan=table_rows_dict['h6_row_scan'], h6_avg_time=table_rows_dict['h6_avg_time'],h3_num_scan=table_rows_dict['h3_num_scan'], 118 | h3_row_scan=table_rows_dict['h3_row_scan'],h3_avg_time=table_rows_dict['h3_avg_time'],last_modified_on=datetime.datetime.utcnow()+ist_delta)) 119 | else: 120 | row = self.session.query(self.RSTableMonitor).filter(and_(self.RSTableMonitor.schemaname == table_rows_dict['schemaname'].strip(), self.RSTableMonitor.tablename == table_rows_dict['tablename'].strip())) 121 | row.pct_mem_used = table_rows_dict['pct_mem_used'] 122 | row.unsorted_rows = table_rows_dict['unsorted_rows'] 123 | row.statistics = table_rows_dict['statistics'] 124 | row.is_encoded = table_rows_dict['is_encoded'] 125 | row.diststyle = table_rows_dict['diststyle'] 126 | row.sortkey1 = table_rows_dict['sortkey1'] 127 | row.skew_sortkey1 = table_rows_dict['skew_sortkey1'] 128 | row.skew_rows = table_rows_dict['skew_rows'] 129 | row.m1_num_scan = table_rows_dict['m1_num_scan'] 130 | row.m1_avg_time = table_rows_dict['m1_avg_time'] 131 | row.m1_row_scan = table_rows_dict['m1_row_scan'] 132 | row.w1_num_scan = table_rows_dict['w1_num_scan'] 133 | row.w1_row_scan = table_rows_dict['w1_row_scan'] 134 | row.w1_avg_time = table_rows_dict['w1_avg_time'] 135 | row.d1_num_scan = table_rows_dict['d1_num_scan'] 136 | row.d1_row_scan = table_rows_dict['d1_row_scan'] 137 | row.d1_avg_time = table_rows_dict['d1_avg_time'] 138 | row.h6_num_scan = table_rows_dict['h6_num_scan'] 139 | row.h6_row_scan = table_rows_dict['h6_row_scan'] 140 | row.h6_avg_time = table_rows_dict['h6_avg_time'] 141 | row.h3_num_scan = table_rows_dict['h3_num_scan'] 142 | row.h3_row_scan = table_rows_dict['h3_row_scan'] 143 | row.h3_avg_time = table_rows_dict['h3_avg_time'] 144 | row.last_modified_on = datetime.datetime.utcnow()+ist_delta 145 | self.session.commit() 146 | if key == 'rs_scan_monitor': 147 | table_rows_list = payload[key] 148 | for row in table_rows_list: 149 | table_rows_dict = table_rows_list[row] 150 | if self.session.query(self.RSQueryScan).filter(and_(RSQueryScan.query_id == table_rows_dict['query_id'],self.RSQueryScan.tablename == table_rows_dict['tablename'].strip(), self.RSQueryScan.queue == table_rows_dict['queue'])): 151 | 
self.session.add(self.RSQueryScan(query_id=table_rows_dict['query_id'],queue=table_rows_dict['queue'],tablename=table_rows_dict['tablename'].strip(), 152 | query_start_time=table_rows_dict['query_start_time'], range_scan_pct=table_rows_dict['range_scan_pct'], rows_scan=table_rows_dict['rows_scan'], 153 | avg_time=table_rows_dict['avg_time'],last_modified_on=datetime.datetime.utcnow()+ist_delta)) 154 | else: 155 | row = self.session.query(self.RSQueryScan).filter(and_(RSQueryScan.query_id == table_rows_dict['query_id'],self.RSQueryScan.tablename == table_rows_dict['tablename'].strip(), self.RSQueryScan.queue == table_rows_dict['queue'])) 156 | row.range_scan_pct = table_rows_dict['range_scan_pct'] 157 | row.rows_scan = table_rows_dict['rows_scan'] 158 | row.avg_time = table_rows_dict['avg_time'] 159 | row.last_modified_on = datetime.datetime.utcnow()+ist_delta 160 | self.session.commit() 161 | 162 | def get_query_object(self,tablename,engine): 163 | metadata = MetaData(bind=engine) 164 | rs_query_monitor = Table(tablename, metadata, 165 | Column('query_id', Integer(), primary_key=True), 166 | Column('username', String(255)), 167 | Column('workmem', String(255)), 168 | Column('num_diskhits', Integer()), 169 | Column('inner_bcast_count', Integer()), 170 | Column('bcast_rows', Integer()), 171 | Column('exec_time', Integer()), 172 | Column('slot_count', Integer()), 173 | Column('queue_time', Integer()), 174 | Column('starttime', DateTime(), default=datetime.datetime.utcnow()+ist_delta), 175 | Column('state', String(255)), 176 | Column('queue', Integer()), 177 | Column('last_modified_on', DateTime(), default=datetime.datetime.utcnow()+ist_delta)) 178 | rs_query_monitor.create(checkfirst=True) 179 | clear_mappers() 180 | mapper(RSQueryMonitor, rs_query_monitor) 181 | return RSQueryMonitor 182 | 183 | def get_table_object(self,tablename,engine): 184 | metadata = MetaData(bind=engine) 185 | rs_tab_monitor = Table(tablename, metadata, 186 | Column('id', Integer(), primary_key=True), 187 | Column('schemaname', String(255), nullable=False), 188 | Column('tablename', String(255), nullable=False), 189 | Column('pct_mem_used', Float()), 190 | Column('unsorted_rows', Float()), 191 | Column('statistics', Float()), 192 | Column('is_encoded', String(255)), 193 | Column('diststyle', String(255)), 194 | Column('sortkey1', String(255)), 195 | Column('skew_sortkey1', String(255)), 196 | Column('skew_rows', Float()), 197 | Column('m1_num_scan', Float()), 198 | Column('m1_row_scan', Float()), 199 | Column('m1_avg_time', Float()), 200 | Column('w1_num_scan', Float()), 201 | Column('w1_row_scan', Float()), 202 | Column('w1_avg_time', Float()), 203 | Column('d1_num_scan', Float()), 204 | Column('d1_row_scan', Float()), 205 | Column('d1_avg_time', Float()), 206 | Column('h6_num_scan', Float()), 207 | Column('h6_row_scan', Float()), 208 | Column('h6_avg_time', Float()), 209 | Column('h3_num_scan', Float()), 210 | Column('h3_row_scan', Float()), 211 | Column('h3_avg_time', Float()), 212 | Column('last_modified_on', DateTime(), default=datetime.datetime.utcnow()+ist_delta)) 213 | rs_tab_monitor.create(checkfirst=True) 214 | #clear_mappers() 215 | mapper(RSTableMonitor, rs_tab_monitor) 216 | return RSTableMonitor 217 | 218 | def get_scan_object(self,tablename,engine): 219 | metadata = MetaData(bind=engine) 220 | rs_scan_monitor = Table(tablename,metadata, 221 | Column('id', Integer(), primary_key=True), 222 | Column('query_id', Integer(),nullable=False), 223 | Column('queue', Integer(),nullable=False), 224 | 
Column('tablename', String(255),nullable=False), 225 | Column('query_start_time',DateTime(),default=datetime.datetime.utcnow()+ist_delta), 226 | Column('range_scan_pct', Integer()), 227 | Column('rows_scan', Integer()), 228 | Column('avg_time', Integer()), 229 | Column('last_modified_on',DateTime(),default=datetime.datetime.utcnow()+ist_delta)) 230 | rs_scan_monitor.create(checkfirst=True) 231 | mapper(RSQueryScan,rs_scan_monitor) 232 | return RSQueryScan 233 | 234 | 235 | class RSQueryMonitor(object): 236 | def __init__(self,query_id,username,workmem,num_diskhits,exec_time,slot_count, 237 | inner_bcast_count,bcast_rows, 238 | queue_time,starttime,state,queue,last_modified_on): 239 | self.query_id = query_id 240 | self.username = username 241 | self.inner_bcast_count = inner_bcast_count 242 | self.bcast_rows = bcast_rows 243 | self.workmem = workmem 244 | self.num_diskhits = num_diskhits 245 | self.exec_time = exec_time 246 | self.slot_count = slot_count 247 | self.queue_time = queue_time 248 | self.starttime = starttime 249 | self.state = state 250 | self.queue = queue 251 | self.last_modified_on = last_modified_on 252 | def __repr__(self): 253 | return "" % (self.query_id, self.username, self.workmem, self.num_diskhits, self.exec_time, self.slot_count, self.queue_time, self.starttime, self.state, self.queue, self.last_modified_on) 254 | 255 | class RSTableMonitor(object): 256 | def __init__(self,schemaname,tablename,pct_mem_used,unsorted_rows,statistics,is_encoded,diststyle,sortkey1,skew_sortkey1,skew_rows, 257 | m1_num_scan,m1_row_scan,m1_avg_time,w1_num_scan,w1_row_scan,w1_avg_time,d1_num_scan,d1_row_scan,d1_avg_time,h6_num_scan, 258 | h6_row_scan,h6_avg_time,h3_num_scan,h3_row_scan,h3_avg_time,last_modified_on): 259 | self.schemaname = schemaname 260 | self.tablename = tablename 261 | self.pct_mem_used = pct_mem_used 262 | self.unsorted_rows = unsorted_rows 263 | self.statistics = statistics 264 | self.is_encoded = is_encoded 265 | self.diststyle = diststyle 266 | self.sortkey1 = sortkey1 267 | self.skew_sortkey1 = skew_sortkey1 268 | self.skew_rows = skew_rows 269 | self.m1_num_scan = m1_num_scan 270 | self.m1_row_scan = m1_row_scan 271 | self.m1_avg_time = m1_avg_time 272 | self.w1_num_scan = w1_num_scan 273 | self.w1_row_scan = w1_row_scan 274 | self.w1_avg_time = w1_avg_time 275 | self.d1_num_scan = d1_num_scan 276 | self.d1_row_scan = d1_row_scan 277 | self.d1_avg_time = d1_avg_time 278 | self.h6_num_scan = h6_num_scan 279 | self.h6_row_scan = h6_row_scan 280 | self.h6_avg_time = h6_avg_time 281 | self.h3_num_scan = h3_num_scan 282 | self.h3_row_scan = h3_row_scan 283 | self.h3_avg_time = h3_avg_time 284 | self.last_modified_on = last_modified_on 285 | def __repr__(self): 286 | return "" % (self.schemaname, self.tablename, self.pct_mem_used, self.unsorted_rows, self.statistics, self.is_encoded, self.diststyle, self.sortkey1, self.skew_sortkey1, self.skew_rows, self.last_modified_on) 287 | 288 | class RSQueryScan(object): 289 | def __init__(self,query_id,queue,tablename,query_start_time,range_scan_pct,rows_scan,avg_time,last_modified_on): 290 | self.query_id = query_id 291 | self.queue = queue 292 | self.tablename = tablename 293 | self.query_start_time = query_start_time 294 | self.range_scan_pct = range_scan_pct 295 | self.rows_scan = rows_scan 296 | self.avg_time = avg_time 297 | self.last_modified_on = last_modified_on 298 | def __repr__(self): 299 | return "" -------------------------------------------------------------------------------- 
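
post_mysql above follows a query-then-insert-or-update pattern keyed on each monitor table's identifying columns. Below is a minimal, self-contained sketch of that pattern, assuming an invented demo table and an in-memory SQLite engine purely for illustration (the real code maps rs_query_monitor_, rs_table_monitor_ and rs_scan_monitor_ over the configured MySQL endpoint):

from sqlalchemy import create_engine, Column, Integer, String
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()

class QueryRow(Base):
    __tablename__ = 'query_rows_demo'        # illustrative table, not one of the monitor tables above
    query_id = Column(Integer, primary_key=True)
    state = Column(String(255))

engine = create_engine('sqlite:///:memory:') # stand-in for the MySQL endpoint
Base.metadata.create_all(engine)
session = sessionmaker(bind=engine)()

def upsert(query_id, state):
    # Same shape as post_mysql: insert when the key is absent, update the row otherwise.
    row = session.query(QueryRow).filter(QueryRow.query_id == query_id).first()
    if row is None:
        session.add(QueryRow(query_id=query_id, state=state))
    else:
        row.state = state
    session.commit()

upsert(42, 'Running')
upsert(42, 'Done')
print(session.query(QueryRow).count())       # 1 row, whose state is now 'Done'

This is the same idea behind the .count() == 0 checks in post_mysql: a row is inserted only when no row with that key exists yet; otherwise the existing row is updated in place and committed.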
/redshift_monitoring/query_level_metrics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import time 3 | import copy 4 | import numpy as np 5 | import logging 6 | from thread_utils import * 7 | 8 | def compute_metric(thread_num,df_response): 9 | logging.info('%s : %d thread : computing json response for query_level_metrics', 'system_tables',thread_num) 10 | print '%s : %d thread : computing json response for query_level_metrics' %('system_tables',thread_num) 11 | 12 | return df_response.to_dict('index') 13 | 14 | class ComputeMetric(): 15 | 16 | def __init__(self,thread_num,df_response_query,df_response_scan): 17 | logging.info('%s : %d thread : computing json response for query_level_metrics', 'system_tables',thread_num) 18 | print '%s : %d thread : computing json response for query_level_metrics' %('system_tables',thread_num) 19 | self.df_response_query = df_response_query 20 | self.df_response_scan = df_response_scan 21 | self.thread_num = thread_num 22 | 23 | def aggregate_data_query(self,df_response_query): 24 | df_response_query = df_response_query.replace(np.nan,0,regex=True) 25 | return df_response_query.replace({'username':{0:'NULL'}},regex=True).to_dict('index') 26 | 27 | def aggregate_data_scan(self,df_response_scan): 28 | return df_response_scan.to_dict('index') 29 | 30 | def query_time_series(self): 31 | df_dict = self.df_response_query[['query_id','workmem','num_diskhits']].set_index('query_id').to_dict('index') 32 | metric_name_list = ['redshift.QueryLevelMetrics'+'.'+str(key) for (key,val) in df_dict.items()] 33 | metric_val_list = [df_val for (query_id,df_val) in df_dict.items()] 34 | return metric_name_list, metric_val_list 35 | 36 | ## Thread starts here 37 | def run(thread_num,query_result_queue,db_queue,stop_thread,config_dict): 38 | 39 | metric_name = 'redshift.QueryLevelMetrics' 40 | query_table_name = 'rs_query_monitor' 41 | scan_table_name = 'rs_scan_monitor' 42 | thread_read = ThreadRead() 43 | thread_write = ThreadWrite(config_dict['general']['cluster_name']) 44 | 45 | while (not stop_thread.is_set()): 46 | payload = {} 47 | df_response_query, error_query = thread_read.read_query_result(thread_num=thread_num,query_result_queue=query_result_queue,metric_name=metric_name,query_name='query3') 48 | df_response_scan, error_scan = thread_read.read_query_result(thread_num=thread_num,query_result_queue=query_result_queue,metric_name=metric_name,query_name='query6') 49 | 50 | if error_scan == -1 or error_query == -1: 51 | print 'Something broke. 
Skip this run of %s' %(metric_name) 52 | else: 53 | compute_metric = ComputeMetric(thread_num,df_response_query,df_response_scan) 54 | metric_dictionary = compute_metric.aggregate_data_query(df_response_query) 55 | payload['mysql'] = thread_write.get_payload(db='mysql',metric_name=query_table_name,metric_dictionary=metric_dictionary,tag_name='user_name') 56 | metric_dictionary = compute_metric.aggregate_data_scan(df_response_scan) 57 | payload['mysql'][scan_table_name] = thread_write.get_payload(db='mysql',metric_name=scan_table_name,metric_dictionary=metric_dictionary,tag_name='user_name')[scan_table_name] 58 | metric_name_list, metric_dictionary = compute_metric.query_time_series() 59 | payload['statsd'] = [val[0] for val in map(thread_write.get_payload_statsd,metric_name_list,metric_dictionary)] 60 | thread_write.write_payload(payload=payload,db_queue=db_queue) 61 | -------------------------------------------------------------------------------- /redshift_monitoring/redshift_monitoring.py: -------------------------------------------------------------------------------- 1 | # Query and storage modules 2 | import pandas as pd 3 | import psycopg2 4 | from sqlalchemy import create_engine 5 | 6 | # Utility modules 7 | import json 8 | import time 9 | import requests 10 | import os 11 | 12 | # Multithreading 13 | from Queue import Queue 14 | import threading 15 | from threading import Thread 16 | 17 | # Exception cathcing modules 18 | import traceback, os.path 19 | import logging 20 | 21 | # Post to database 22 | from post_db import * 23 | 24 | # Library get cloudwatch metrics 25 | from cloudwatch_metrics import * 26 | 27 | # Inherit base configuration 28 | from base_config import * 29 | 30 | db_queue = Queue() 31 | query_result_queue = Queue() 32 | 33 | query_result_df = {} 34 | 35 | log_identifier = 'system_tables' 36 | 37 | def create_rs_engine(log_identifier,redshift_connection): 38 | redshift_endpoint = redshift_connection['redshift_endpoint'] 39 | redshift_user = redshift_connection['redshift_user'] 40 | redshift_pass = redshift_connection['redshift_pass'] 41 | port = redshift_connection['port'] 42 | dbname = redshift_connection['dbname'] 43 | engine_string = 'postgresql+psycopg2://'+redshift_user+':'+redshift_pass+'@'+redshift_endpoint+':'+str(port)+'/'+dbname 44 | print engine_string 45 | engine = create_engine(engine_string) 46 | logging.info('%s : Created redshift engine', log_identifier) 47 | print '%s : Created redshift engine' %(log_identifier) 48 | return engine 49 | 50 | def sql_query(query_counter,engine,metric_list,query_frequency_dictionary,query_dictionary,sleep_config,log_identifier,redshift_connection,queue_push): 51 | # Get a list of only those queries that are divisible by the time period set by user 52 | query_list = [query for (query, period) in query_frequency_dictionary.items() if query_counter%query_frequency_dictionary[query] == 0] 53 | print query_list 54 | # Query redshift for each of the chosen queries 55 | for i in range(0,len(query_list)): 56 | try: 57 | print query_list[i] 58 | query_result_df[query_list[i]] = pd.read_sql_query(query_dictionary[query_list[i]],engine) 59 | except: 60 | print 'Something broke. 
connection failure' 61 | logging.exception('%s : Redshift connection failure', log_identifier) 62 | traceback.extract_stack() 63 | #print type(exception).__name__ 64 | print query_counter 65 | time.sleep(sleep_config) 66 | engine = create_rs_engine(log_identifier=log_identifier,redshift_connection=redshift_connection) 67 | continue 68 | # Increment the count by 1 69 | query_counter += 1 70 | # Put the dataframes on a queue consumed by all threads. 71 | for i in range(0,queue_push): 72 | query_result_queue.put(query_result_df) 73 | 74 | return query_counter 75 | 76 | def initiate_monitoring(config_dict): 77 | 78 | {base_config[key].update(val) for (key,val) in config_dict.items()} 79 | config_dict = base_config 80 | 81 | log_file = config_dict['utility']['LOG_FILE'] 82 | metric_query_dictionary = config_dict['query']['metric_query_dictionary'] 83 | redshift_connection = config_dict['redshift_connection'] 84 | job_reset_time = config_dict['utility']['job_reset_time'] 85 | sleep_config = config_dict['utility']['sleep_config'] 86 | query_dictionary = config_dict['query']['query_dictionary'] 87 | query_frequency_dictionary = config_dict['query']['query_frequency_dictionary'] 88 | database_config = config_dict['database'] 89 | 90 | # The metric collector would sleep every seconds. This variable will store the start time for it 91 | sys_rs_job_start_time = time.time() 92 | 93 | os.system('rm '+log_file) 94 | 95 | logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s %(message)s') 96 | 97 | log_identifier = 'system_tables' 98 | 99 | logging.info('%s : Start logging system_tables_rs metrics', log_identifier) 100 | print '%s : Start logging system_tables_rs metrics' %(log_identifier) 101 | 102 | # Create an engine for connecting and querying to redshift 103 | engine = create_rs_engine(log_identifier,redshift_connection) 104 | 105 | # Get the name of all the metrics and import them dynamically 106 | metric_list = [metric for (metric,query) in metric_query_dictionary.items()] 107 | metrics = map(__import__,metric_list) 108 | 109 | # Get total number of dataframes to push to queue 110 | queue_push = 0 111 | for key in metric_query_dictionary: 112 | queue_push += len(metric_query_dictionary[key]) 113 | 114 | # Define an event to signal all threads to terminate 115 | stop_thread = threading.Event() 116 | 117 | # Spawn threads for every metric 118 | for i in range(0,len(metric_list)): 119 | worker = Thread(target=metrics[i].run,args=(i,query_result_queue,db_queue,stop_thread,config_dict['metrics'],)) 120 | worker.setDaemon(True) 121 | worker.start() 122 | 123 | # A counter of how many times the redshift query API aws called 124 | query_counter = 0 125 | 126 | # Create an instance of PostDB() 127 | post_db = PostDB(db_queue,database_config) 128 | 129 | # Create Cloudwatch metrics instance 130 | cloudwatch_metrics = CloudWatchMetrics(config_dict) 131 | 132 | while 1: 133 | 134 | # Query the redshift cluster 135 | query_counter = sql_query(query_counter=query_counter,engine=engine,metric_list=metric_list, 136 | query_frequency_dictionary=query_frequency_dictionary,query_dictionary=query_dictionary, 137 | sleep_config=sleep_config,log_identifier=log_identifier,redshift_connection=redshift_connection,queue_push=queue_push) 138 | query_result_queue.join() 139 | 140 | # Wait for each of the database queues to join 141 | post_db.post_db(metric_list=metric_list) 142 | db_queue.join() 143 | 144 | # Get hardware metrics from cloudwatch 145 | cloudwatch_metrics.get_metrics() 146 | 147 | print 
'Sleep for a while' 148 | # Sleep for a configured time 149 | time.sleep(sleep_config) 150 | 151 | sys_rs_job_end_time = time.time() 152 | 153 | if (sys_rs_job_end_time - sys_rs_job_start_time) > job_reset_time: 154 | stop_thread.set() 155 | break; 156 | 157 | -------------------------------------------------------------------------------- /redshift_monitoring/running_queries.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import time 3 | import copy 4 | import numpy as np 5 | import logging 6 | from thread_utils import * 7 | 8 | def compute_metric(thread_num,df_response): 9 | 10 | logging.info('%s : %d thread : computing json response for running_queries', 'system_tables',thread_num) 11 | print '%s : %d thread : computing json response for running_queries' %('system_tables',thread_num) 12 | 13 | return {user_name.rstrip():count for (user_name,count) in dict(pd.Series(df_response['user_name']).value_counts()).items()} 14 | 15 | ## Thread starts here 16 | def run(thread_num,query_result_queue,db_queue,stop_thread,config_dict): 17 | 18 | metric_name = 'redshift.RunningQueries' 19 | #db = 'opentsdb' 20 | thread_read = ThreadRead() 21 | thread_write = ThreadWrite(config_dict['general']['cluster_name']) 22 | 23 | while (not stop_thread.is_set()): 24 | payload = {} 25 | df_response, error = thread_read.read_query_result(thread_num=thread_num,query_result_queue=query_result_queue,metric_name=metric_name,query_name='query1') 26 | if error == -1: 27 | print 'Something broke. Skip this run of %s' %(metric_name) 28 | else: 29 | metric_dictionary = compute_metric(thread_num=thread_num,df_response=df_response) 30 | payload['statsd'] = thread_write.get_payload(db='statsd',metric_name=metric_name,metric_dictionary=metric_dictionary,tag_name='null') 31 | thread_write.write_payload(payload=payload,db_queue=db_queue) -------------------------------------------------------------------------------- /redshift_monitoring/table_level_metrics.py: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import time 3 | import copy 4 | import numpy as np 5 | import logging 6 | from thread_utils import * 7 | 8 | class ComputeMetric(): 9 | 10 | def __init__(self,thread_num,df_response): 11 | logging.info('%s : %d thread : computing json response for table_metrics', 'system_tables',thread_num) 12 | print '%s : %d thread : computing json response for table_metrics' %('system_tables',thread_num) 13 | self.df_response = df_response 14 | self.thread_num = thread_num 15 | 16 | def dist_sort_list(self): 17 | return self.df_response.replace(np.nan,19700101,regex=True).to_dict('index') 18 | 19 | ## Thread starts here 20 | def run(thread_num,query_result_queue,db_queue,stop_thread,config_dict): 21 | 22 | metric_name = 'redshift.TableLevelMetrics' 23 | table_name = 'rs_table_monitor' 24 | thread_read = ThreadRead() 25 | thread_write = ThreadWrite(config_dict['general']['cluster_name']) 26 | 27 | while (not stop_thread.is_set()): 28 | payload = {} 29 | df_response, error = thread_read.read_query_result(thread_num=thread_num,query_result_queue=query_result_queue,metric_name=metric_name,query_name='query5') 30 | if error == -1: 31 | print 'Something broke. 
Skip this run of %s' %(metric_name) 32 | else: 33 | compute_metric = ComputeMetric(thread_num,df_response) 34 | metric_dictionary = compute_metric.dist_sort_list() 35 | payload['mysql'] = thread_write.get_payload(db='mysql',metric_name=table_name,metric_dictionary=metric_dictionary,tag_name='user_name') 36 | thread_write.write_payload(payload=payload,db_queue=db_queue) 37 | -------------------------------------------------------------------------------- /redshift_monitoring/thread_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import time 3 | import copy 4 | 5 | class ThreadRead(): 6 | 7 | ## Read redshift query in query queue 8 | def read_query_result(self,thread_num,query_result_queue,metric_name,query_name): 9 | logging.info('%s : %d thread : waiting for result from redshift in %s' , 'system_tables', thread_num, metric_name) 10 | 11 | try: 12 | df_packet = query_result_queue.get() 13 | #print '%d thread: Inside thread read' %(thread_num) 14 | key_list = [] 15 | for key in df_packet: 16 | key_list.append(key) 17 | print '%d thread: %s' %(thread_num,key_list) 18 | df_response = df_packet[query_name] 19 | error = 0 20 | logging.info('%s : %d thread : obtained df_response for %s and %s' , 'system_tables', thread_num, metric_name, query_name) 21 | print '%s : %d thread : obtained df_response for %s' %('system_tables',thread_num, metric_name) 22 | except: 23 | df_response = None 24 | error = -1 25 | print '%s : %d thread : The thread read failed. Most likely cause is %s timed out' %('system_tables',thread_num, query_name) 26 | pass 27 | query_result_queue.task_done() 28 | return df_response, error 29 | 30 | class ThreadWrite(): 31 | 32 | def __init__(self,cluster_name): 33 | self.cluster_name = cluster_name 34 | ## Create a payload for each of the databases 35 | def get_payload(self,db,metric_name, metric_dictionary,tag_name): 36 | if db == 'opentsdb': 37 | payload = self.get_payload_opentsdb(metric_name,metric_dictionary,tag_name) 38 | elif db == 'statsd': 39 | payload = self.get_payload_statsd(metric_name,metric_dictionary) 40 | elif db == 'mysql': 41 | payload = {metric_name : metric_dictionary} 42 | return payload 43 | 44 | def get_payload_statsd(self,metric_name,input_dict): 45 | if input_dict == {}: 46 | return [{metric_name+'.default'+'.'+self.cluster_name :0}] 47 | else: 48 | return [{metric_name+'.'+str(key)+'.'+self.cluster_name : value for (key,value) in input_dict.items()}] 49 | 50 | ## Create a payload for opentsdb database 51 | def get_payload_opentsdb(self,metric_name,input_dict,tag_name): 52 | 53 | json_opentsdb = {} 54 | tags = {} 55 | tags['ClusterIdentifier'] = self.cluster_name 56 | json_queue = [] 57 | 58 | if input_dict == {}: 59 | json_opentsdb['metric'] = str(metric_name) 60 | json_opentsdb['timestamp'] = int(time.time()) 61 | json_opentsdb['value'] = 0 62 | json_opentsdb['tags'] = tags 63 | json_queue.append(json_opentsdb) 64 | 65 | else: 66 | for key in input_dict: 67 | json_opentsdb['metric'] = str(metric_name) 68 | json_opentsdb['timestamp'] = int(time.time()) 69 | json_opentsdb['value'] = float(input_dict[key]) 70 | tags[tag_name] = key 71 | json_opentsdb['tags'] = tags 72 | json_queue.append(copy.deepcopy(json_opentsdb)) 73 | 74 | return json_queue 75 | 76 | def write_payload(self,payload,db_queue): 77 | db_queue.put(payload) 78 | db_queue.task_done() 79 | -------------------------------------------------------------------------------- /redshift_monitoring/wlm_metrics.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | import time 3 | import copy 4 | import numpy as np 5 | import logging 6 | from thread_utils import * 7 | 8 | class ComputeMetric(): 9 | 10 | def __init__(self,thread_num,df_response): 11 | logging.info('%s : %d thread : computing json response for wlm_metrics', 'system_tables',thread_num) 12 | print '%s : %d thread : computing json response for wlm_metrics' %('system_tables',thread_num) 13 | self.df_response = df_response 14 | self.thread_num = thread_num 15 | 16 | def running_query_count(self): 17 | return dict(self.df_response[self.df_response['state'] == 'Running ']['service_class'].value_counts()), 'redshift.WlmRunningQueries' 18 | 19 | def queued_query_count(self): 20 | return dict(self.df_response[self.df_response['state'] == 'QueuedWaiting ']['service_class'].value_counts()), 'redshift.WlmQueuedQueries' 21 | 22 | def returning_query_count(self): 23 | return dict(self.df_response[self.df_response['state'] == 'Returning ']['service_class'].value_counts()), 'redshift.WlmReturningQueries' 24 | 25 | def running_query_avg_time(self): 26 | return dict(self.df_response[self.df_response['state'] == 'Running '].groupby('service_class').agg({'exec_time':np.mean})['exec_time']), 'redshift.WlmRunningQueriesAvgTime' 27 | 28 | def queued_query_avg_time(self): 29 | return dict(self.df_response[self.df_response['state'] == 'QueuedWaiting '].groupby('service_class').agg({'queue_time':np.mean})['queue_time']), 'redshift.WlmQueuedQueriesAvgTime' 30 | 31 | def returning_query_avg_time(self): 32 | return dict(self.df_response[self.df_response['state'] == 'Returning '].groupby('service_class').agg({'exec_time':np.mean})['exec_time']), 'redshift.WlmReturningQueriesAvgTime' 33 | 34 | ## Thread starts here 35 | def run(thread_num,query_result_queue,db_queue,stop_thread,config_dict): 36 | 37 | metric_name = 'redshift.WlmMetrics' 38 | db = 'opentsdb' 39 | thread_read = ThreadRead() 40 | thread_write = ThreadWrite(config_dict['general']['cluster_name']) 41 | payload = {} 42 | 43 | while (not stop_thread.is_set()): 44 | payload = {} 45 | payload_opentsdb = [] 46 | payload_statsd = [] 47 | 48 | df_response, error = thread_read.read_query_result(thread_num=thread_num,query_result_queue=query_result_queue,metric_name=metric_name,query_name='query2') 49 | if error == -1: 50 | print 'Something broke. 
Skip this run of %s' %(metric_name) 51 | else: 52 | compute_metric = ComputeMetric(thread_num=thread_num,df_response=df_response) 53 | running_query_count, metric_name = compute_metric.running_query_count() 54 | payload_statsd.extend(thread_write.get_payload(db='statsd',metric_name=metric_name,metric_dictionary=running_query_count,tag_name='service_class')) 55 | queued_query_count, metric_name = compute_metric.queued_query_count() 56 | payload_statsd.extend(thread_write.get_payload(db='statsd',metric_name=metric_name,metric_dictionary=queued_query_count,tag_name='service_class')) 57 | returning_query_count, metric_name = compute_metric.returning_query_count() 58 | payload_statsd.extend(thread_write.get_payload(db='statsd',metric_name=metric_name,metric_dictionary=returning_query_count,tag_name='service_class')) 59 | running_query_avg_time, metric_name = compute_metric.running_query_avg_time() 60 | payload_statsd.extend(thread_write.get_payload(db='statsd',metric_name=metric_name,metric_dictionary=running_query_avg_time,tag_name='service_class')) 61 | queued_query_avg_time, metric_name = compute_metric.queued_query_avg_time() 62 | payload_statsd.extend(thread_write.get_payload(db='statsd',metric_name=metric_name,metric_dictionary=queued_query_avg_time,tag_name='service_class')) 63 | returning_query_avg_time, metric_name = compute_metric.returning_query_avg_time() 64 | payload_opentsdb.extend(thread_write.get_payload(db='opentsdb',metric_name=metric_name,metric_dictionary=returning_query_avg_time,tag_name='service_class')) 65 | payload_statsd.extend(thread_write.get_payload(db='statsd',metric_name=metric_name,metric_dictionary=returning_query_avg_time,tag_name='service_class')) 66 | payload['statsd'] = payload_statsd 67 | thread_write.write_payload(payload=payload,db_queue=db_queue) --------------------------------------------------------------------------------
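
For reference, here is a sketch of the payload contract between the metric threads and PostDB, using an invented cluster name and metric values; the key and JSON shapes follow get_payload_statsd and get_payload_opentsdb in thread_utils.py:

import time

cluster = 'demo-cluster'   # illustrative cluster name, not read from any config

# StatsD payload: a list of {metric.key.cluster: value} gauge dicts (see get_payload_statsd).
statsd_payload = [{'redshift.WlmRunningQueries.6.' + cluster: 3,
                   'redshift.WlmRunningQueries.7.' + cluster: 1}]

# OpenTSDB payload: a list of JSON-ready dicts (see get_payload_opentsdb).
opentsdb_payload = [{'metric': 'redshift.DiskSchemaUsage',
                     'timestamp': int(time.time()),
                     'value': 12.5,
                     'tags': {'ClusterIdentifier': cluster, 'schema': 'public'}}]

# A worker thread wraps its payloads under their database keys and enqueues the dict:
payload = {'statsd': statsd_payload, 'opentsdb': opentsdb_payload}
# db_queue.put(payload)    # consumed by PostDB.post_db in the main loop

Each worker builds such a dict once per cycle and puts it on db_queue via ThreadWrite.write_payload; PostDB.post_db then dispatches on the 'statsd', 'opentsdb' and 'mysql' keys, one queue item per metric module.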