├── .flake8 ├── HDFS-home-1.png ├── HDFS-datanode-1.png ├── IsilonDataInsightsClusterDetail.JPG ├── IsilonDataInsightsMultiClusterSummary.JPG ├── IsilonDataInsightsClusterProtocolDetail.JPG ├── .whitesource ├── IsilonDataInsightsClusterCapacityUtilizationTable.JPG ├── requirements.txt ├── setup_venv3.sh ├── setup_venv.sh ├── isi_api_client.py ├── .gitignore ├── LICENSE ├── isi_data_insights_d.py ├── prometheus_plugin.py ├── isi_sdk_utils.py ├── grafana_cluster_capacity_utilization_dashboard.json ├── README.md ├── isi_stats_client.py ├── influxdb_plugin.py ├── example_isi_data_insights_d.cfg ├── README_KAPACITOR_INTEGRATION.md ├── isi_data_insights_config.py ├── isi_data_insights_daemon.py └── dashboards └── prometheus └── grafana_cluster_list_dashboard.json /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | -------------------------------------------------------------------------------- /HDFS-home-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/HDFS-home-1.png -------------------------------------------------------------------------------- /HDFS-datanode-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/HDFS-datanode-1.png -------------------------------------------------------------------------------- /IsilonDataInsightsClusterDetail.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/IsilonDataInsightsClusterDetail.JPG -------------------------------------------------------------------------------- /IsilonDataInsightsMultiClusterSummary.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/IsilonDataInsightsMultiClusterSummary.JPG -------------------------------------------------------------------------------- /IsilonDataInsightsClusterProtocolDetail.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/IsilonDataInsightsClusterProtocolDetail.JPG -------------------------------------------------------------------------------- /.whitesource: -------------------------------------------------------------------------------- 1 | { 2 | "generalSettings": { 3 | "shouldScanRepo": true 4 | }, 5 | "checkRunSettings": { 6 | "vulnerableCheckRunConclusionLevel": "failure" 7 | } 8 | } -------------------------------------------------------------------------------- /IsilonDataInsightsClusterCapacityUtilizationTable.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/IsilonDataInsightsClusterCapacityUtilizationTable.JPG -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | daemons >= 1.3.0 2 | influxdb >= 2.12.0 3 | pip >= 8.0.2 4 | urllib3 >= 1.13.1 5 | requests >= 2.22.0 6 | isi_sdk_8_0 >= 0.2.0, < 0.3.0 7 | isi_sdk_7_2 >= 0.2.0, < 0.3.0 8 | Equation >= 1.2.01 9 | gevent >= 1.2.1 10 | future >= 0.18.0 11 | configparser >= 0.4.0 12 | 
prometheus_client == 0.12.0 13 | -------------------------------------------------------------------------------- /setup_venv3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | venv_path=".venv3" 6 | 7 | python3 -m venv $venv_path 8 | 9 | . $venv_path/bin/activate 10 | pip install -r requirements.txt 11 | 12 | echo 13 | echo "Isilon Data Insights Connector virtual environment setup at $venv_path." 14 | echo "To activate the virtual environment run: . $venv_path/bin/activate" 15 | -------------------------------------------------------------------------------- /setup_venv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | venv_path=".venv" 6 | 7 | pip install --user virtualenv 8 | 9 | virtualenv $venv_path 10 | 11 | . $venv_path/bin/activate 12 | pip install -U pip setuptools 13 | pip install -r requirements.txt 14 | 15 | echo 16 | echo "Isilon Data Insights Connector virtual environment setup at $venv_path." 17 | echo "To activate the virtual environment run: . $venv_path/bin/activate" 18 | -------------------------------------------------------------------------------- /isi_api_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Currently Swagger Codegen uses a singleton to specify basic auth 3 | credentials, which doesn't work for multi-thread or multi-client 4 | scenarios where each thread or client needs to connect to a unique 5 | cluster. So this class is a custom implementation of the 6 | isi_sdk.ApiClient that is multi-thread/client safe. 7 | """ 8 | from builtins import object 9 | 10 | 11 | class IsiApiClient(object): 12 | _username = None 13 | _password = None 14 | 15 | def configure_basic_auth(self, username, password): 16 | self._username = username 17 | self._password = password 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Our virtual environments 60 | .venv 61 | .venv3 62 | 63 | # Development environment stuff 64 | .vscode 65 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 EMC Corporation 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /isi_data_insights_d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # From __future_ imports have to be before everything. 4 | from __future__ import print_function 5 | from builtins import str 6 | 7 | # Have to do this before importing the other libs 8 | # The noqa comment prevents spurious E402 flake8 errors 9 | # The documentation for monkey explicitly requires patching to be 10 | # performed as early as possible BEFORE other imports but after the 11 | # from __future__ imports. 12 | from gevent import monkey 13 | 14 | monkey.patch_all() # noqa 15 | 16 | import sys 17 | 18 | from isi_data_insights_config import ( 19 | parse_cli, 20 | configure_args_via_file, 21 | process_pid_file_arg, 22 | configure_logging_via_cli, 23 | configure_via_cli, 24 | configure_via_file, 25 | ) 26 | from isi_data_insights_daemon import IsiDataInsightsDaemon 27 | 28 | 29 | def main(): 30 | args = parse_cli() 31 | 32 | # load the config file if one is provided, then set the "required" 33 | # parameters of the CLI args with config file parameters (if possible) 34 | config_file = configure_args_via_file(args) 35 | 36 | # validate the pid_file arg and get the full path to it. 
37 | pid_file_path = process_pid_file_arg(args.pid_file, args.action) 38 | 39 | daemon = IsiDataInsightsDaemon(pidfile=pid_file_path) 40 | 41 | # before we do the long process of configuring, lets make sure we have 42 | # a valid pid to do a stop or restart with 43 | if (args.action == "restart" or args.action == "stop") and daemon.pid is None: 44 | print( 45 | "Cannot " + args.action + " daemon, " 46 | "invalid pid in file: " + str(pid_file_path), 47 | file=sys.stderr, 48 | ) 49 | sys.exit(1) 50 | 51 | if args.action == "start" or args.action == "debug" or args.action == "restart": 52 | configure_logging_via_cli(args) 53 | 54 | if config_file is not None: 55 | configure_via_file(daemon, args, config_file) 56 | else: 57 | configure_via_cli(daemon, args) 58 | 59 | if args.action == "start": 60 | daemon.start() 61 | elif args.action == "restart": 62 | print("Restarting daemon with pid " + str(daemon.pid)) 63 | daemon.restart() 64 | else: 65 | daemon.run(debug=True) 66 | elif args.action == "stop": 67 | print("Stopping daemon with pid " + str(daemon.pid)) 68 | daemon.stop() 69 | else: 70 | print( 71 | "Invalid action arg: '%s', must be one of " 72 | "'start', 'stop', or 'restart'." % args.action, 73 | file=sys.stderr, 74 | ) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /prometheus_plugin.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from future.utils import string_types 3 | import logging 4 | import time 5 | import sys 6 | import prometheus_client as prom 7 | LOG = logging.getLogger(__name__) 8 | 9 | # module variables 10 | this = sys.modules[__name__] 11 | collection_duration = None 12 | gobaltags = {} 13 | tagnames = [] 14 | intervalstart = 0 15 | metriclist = {} 16 | 17 | def start(argv): 18 | ''' 19 | Setup Prometheus client interface. 20 | For prometheus all metrics are exposed via HTTP and the server will 21 | scrape (=collect) data from there 22 | 23 | Arguments: 24 | argv[0] = (String) 25 | Default is 8080. If running inside containers do not change this port 26 | but instead change the exposed port via the docker run command 27 | argv[1] = (String) 28 | Custom tags that are used to decorate metrics. The plugin needs to 29 | know them at startup time. 30 | Comma separated pairs like, group=Lab,datacenter=Berlin,.... 
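        For example, a hypothetical configuration line such as
            stats_processor_args: 8080 group=Lab,datacenter=Berlin
        would arrive here as argv = ['8080', 'group=Lab,datacenter=Berlin'].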
31 | ''' 32 | port = 8080 33 | this.globaltags = {} 34 | this.tagnames = [] 35 | if isinstance(argv, list) and len(argv) > 0: 36 | port = int(argv[0]) 37 | if len(argv) > 1: 38 | for item in argv[1].split(','): 39 | (key, val) = item.split('=') 40 | this.globaltags[key] = val 41 | 42 | this.tagnames = ['hostname', 'node'] + list(this.globaltags.keys()) 43 | this.collection_duration = prom.Gauge('isi_collector_duration_seconds', '', this.tagnames) 44 | prom.start_http_server(port) 45 | LOG.info('Exposing data for prometheus at port {}'.format(port)) 46 | 47 | def start_process(cluster): 48 | ''' 49 | Start of a new collection interval 50 | ''' 51 | LOG.info('Start processing prometheus metrics for {}'.format(cluster)) 52 | this.intervalstart = time.time() 53 | 54 | def end_process(cluster): 55 | ''' 56 | End of a collection interval 57 | ''' 58 | tags = this.globaltags.copy() 59 | tags['hostname'] = cluster 60 | tags['node'] = '' 61 | this.collection_duration.labels(**tags).set(time.time() - this.intervalstart) 62 | LOG.info('Done processing {} metrics for prometheus for {}'.format(len(this.metriclist), cluster)) 63 | 64 | def process_stat(cluster, stat): 65 | ''' Arguments: 66 | cluster(String) = isilon cluster hostname/ip 67 | stat(Object) 68 | ''' 69 | if stat.error != None: 70 | return 71 | tags = this.globaltags.copy() 72 | tags['hostname'] = cluster 73 | tags['node'] = str(stat.devid) 74 | 75 | if isinstance(stat.value, list): 76 | _process_list(tags, stat.key, stat.value) 77 | 78 | elif isinstance(stat.value, dict): 79 | _process_dict(tags, stat.key, stat.value) 80 | 81 | else: 82 | _process_one_stat(tags, stat.key, stat.value) 83 | 84 | def _process_list(tags, basekey, statlist): 85 | ''' list of stats (expected as list of dict) ''' 86 | for elem in statlist: 87 | if isinstance(elem, dict): 88 | _process_dict(tags, basekey, elem) 89 | else: 90 | LOG.error('Unexpected list of non-dict element: {}={}'.format(basekey, elem)) 91 | 92 | def _process_dict(tags, basekey, statdict): 93 | ''' dictionary stats 94 | all number values in the dict are metrics. But it contains text members 95 | and fields named with 'id': Those are filtered out as tags 96 | ''' 97 | for k in list(statdict.keys()): 98 | if isinstance(statdict[k], string_types) or (k[-2:] == 'id' and isinstance(statdict[k], int)): 99 | tags[k] = statdict[k] 100 | del statdict[k] 101 | 102 | for k in statdict.keys(): 103 | mname = basekey + '_' + k 104 | _process_one_stat(tags, mname, statdict[k]) 105 | 106 | def _process_one_stat(tags, metricname, value): 107 | ''' process one stat for prometheus. 108 | metrics are kept inside the process as list of gauges for prometheus to scrape 109 | ''' 110 | m = metricname.replace('.', '_') 111 | if m in this.metriclist: 112 | metric = this.metriclist[m] 113 | else: 114 | metric = prom.Gauge('isilon_' + m, '', tags.keys()) 115 | this.metriclist[m] = metric 116 | metric.labels(**tags).set(value) 117 | 118 | -------------------------------------------------------------------------------- /isi_sdk_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handle the details of building a Swagger client with the correct version of the 3 | SDK to talk to a specific Isilon host. 
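Example usage (hypothetical host and credentials):

    isi_sdk, api_client, version = configure("10.1.2.3", "root", "secret")
    stats_api = isi_sdk.StatisticsApi(api_client)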
4 | """ 5 | from __future__ import print_function 6 | from builtins import str 7 | 8 | try: 9 | import isi_sdk_8_0 10 | except ImportError: 11 | isi_sdk_8_0 = None 12 | 13 | try: 14 | import isi_sdk_7_2 15 | except ImportError: 16 | isi_sdk_7_2 = None 17 | 18 | import sys 19 | 20 | 21 | def configure(host, username, password, verify_ssl=False, use_version="detect"): 22 | """ 23 | Get a version specific instance of the isi_sdk and a multi-thread/client 24 | safe instance of IsiApiClient that can be used to interface with the 25 | specified host by possibly detecting the best version of the sdk to use. 26 | Returns a tuple consisting of the isi_sdk interface, an instance of 27 | IsiApiClient, and a float value set to either 8.0 or 7.2 depending on 28 | which version of the SDK was chosen. The IsiApiClient instance can be used 29 | in conjunction with the isi_sdk to interface with the specified cluster 30 | cluster (i.e. isi_sdk.ProtocolsApi(isi_api_cli_inst).list_nfs_exports()). 31 | :param string host: The name or ip-address of the host to configure the SDK 32 | interface to work with. 33 | :param string username: The username to use for authentication with the 34 | specified host. 35 | :param string password: The password to use for authentication with the 36 | specified host. 37 | :param bool verify_ssl: Specifies whether or not the Isilon cluster's SSL 38 | certificate should be verified. 39 | :param mixed use_version: Can be either "detect" in order to detect the 40 | correct version of the SDK to use with the specified host. Or a float value 41 | of 7.2 or 8.0 can be used in order to force use of that particular version 42 | of the SDK. 43 | :returns: tuple 44 | """ 45 | if isi_sdk_7_2 is None and isi_sdk_8_0 is None: 46 | raise RuntimeError("Isilon SDK is not installed.") 47 | 48 | host_url = "https://" + host + ":8080" 49 | 50 | if use_version is None or use_version == "detect": 51 | host_version = _detect_host_version(host, username, password, verify_ssl) 52 | else: 53 | host_version = use_version 54 | 55 | isi_sdk = None 56 | if host_version < 8.0 and isi_sdk_7_2 is not None: 57 | isi_sdk = isi_sdk_7_2 58 | elif host_version >= 8.0 and isi_sdk_8_0 is None: 59 | isi_sdk = isi_sdk_7_2 60 | # we detected a version 8.0 host, but have to treat it like a 7.2 host 61 | # because the 8.0 SDK is not installed 62 | host_version = 7.2 63 | else: 64 | isi_sdk = isi_sdk_8_0 65 | 66 | configuration = isi_sdk.Configuration() 67 | configuration.username = username 68 | configuration.password = password 69 | configuration.verify_ssl = verify_ssl 70 | configuration.host = host_url 71 | api_client = isi_sdk.ApiClient(configuration) 72 | 73 | return isi_sdk, api_client, host_version 74 | 75 | 76 | def _detect_host_version(host, username, password, verify_ssl): 77 | # if 7.2 is available then use it to check the version of the cluster 78 | # because it will work for 7.2 or newer clusters. 
79 | isi_sdk = isi_sdk_7_2 if isi_sdk_7_2 else isi_sdk_8_0 80 | 81 | configuration = isi_sdk.Configuration() 82 | configuration.username = username 83 | configuration.password = password 84 | configuration.verify_ssl = verify_ssl 85 | configuration.host = "https://" + host + ":8080" 86 | api_client = isi_sdk.ApiClient(configuration) 87 | 88 | try: 89 | try: 90 | config = isi_sdk.ClusterApi(api_client).get_cluster_config() 91 | host_version = ( 92 | 7.2 if config.onefs_version.release.startswith("v7.") else 8.0 93 | ) 94 | except isi_sdk.rest.ApiException as api_exc: 95 | # if we are using isi_sdk_8_0 (because 7.2 is not installed) and the 96 | # cluster is a 7.2 cluster then it will return 404 for the 97 | # get_cluster_config call, but it should still work for stats queries, 98 | # so just set the version and continue on. 99 | if isi_sdk == isi_sdk_8_0 and api_exc.status == 404: 100 | host_version = 7.2 101 | else: 102 | raise api_exc 103 | except Exception as exc: 104 | raise RuntimeError( 105 | "Failed to get cluster config for cluster %s " 106 | "using SDK %s. Error: %s" % (host, isi_sdk.__name__, str(exc)) 107 | ) 108 | 109 | if host_version == 7.2 and isi_sdk_7_2 is None: 110 | print( 111 | "Detected version 7 host, but version 7.2 SDK " 112 | "is not installed, will use 8.0 SDK instead.", 113 | file=sys.stderr, 114 | ) 115 | 116 | if host_version == 8.0 and isi_sdk_8_0 is None: 117 | print( 118 | "Detected version 8 host, but version 8.0 SDK " 119 | "is not installed, will use 7.2 SDK instead.", 120 | file=sys.stderr, 121 | ) 122 | 123 | return host_version 124 | -------------------------------------------------------------------------------- /grafana_cluster_capacity_utilization_dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "DS_LOCAL_INFLUXDB", 5 | "label": "Local influxdb", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "influxdb", 9 | "pluginName": "InfluxDB" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "panel", 15 | "id": "table", 16 | "name": "Table", 17 | "version": "" 18 | }, 19 | { 20 | "type": "grafana", 21 | "id": "grafana", 22 | "name": "Grafana", 23 | "version": "3.1.1" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "influxdb", 28 | "name": "InfluxDB", 29 | "version": "1.0.0" 30 | } 31 | ], 32 | "id": null, 33 | "title": "Isilon Data Insights Cluster Capacity Utilization Table", 34 | "description": "Color coded table showing cluster capacity utilization. 
Good to see the clusters with the highest capacity utilization.", 35 | "tags": [], 36 | "style": "dark", 37 | "timezone": "browser", 38 | "editable": true, 39 | "hideControls": false, 40 | "sharedCrosshair": false, 41 | "rows": [ 42 | { 43 | "collapse": false, 44 | "editable": true, 45 | "height": "250px", 46 | "panels": [ 47 | { 48 | "columns": [], 49 | "editable": true, 50 | "error": false, 51 | "fontSize": "100%", 52 | "height": "1000", 53 | "id": 1, 54 | "interval": ">200d", 55 | "isNew": true, 56 | "links": [], 57 | "pageSize": null, 58 | "scroll": true, 59 | "showHeader": true, 60 | "sort": { 61 | "col": 2, 62 | "desc": true 63 | }, 64 | "span": 12, 65 | "styles": [ 66 | { 67 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 68 | "pattern": "Time", 69 | "type": "date" 70 | }, 71 | { 72 | "colorMode": "row", 73 | "colors": [ 74 | "rgba(50, 172, 45, 0.97)", 75 | "rgba(237, 129, 40, 0.89)", 76 | "rgba(245, 54, 54, 0.9)" 77 | ], 78 | "decimals": 2, 79 | "pattern": "utilization", 80 | "thresholds": [ 81 | "85", 82 | "90" 83 | ], 84 | "type": "number", 85 | "unit": "percent" 86 | } 87 | ], 88 | "targets": [ 89 | { 90 | "alias": "", 91 | "dsType": "influxdb", 92 | "groupBy": [ 93 | { 94 | "params": [ 95 | "$interval" 96 | ], 97 | "type": "time" 98 | }, 99 | { 100 | "params": [ 101 | "cluster" 102 | ], 103 | "type": "tag" 104 | }, 105 | { 106 | "params": [ 107 | "none" 108 | ], 109 | "type": "fill" 110 | } 111 | ], 112 | "measurement": "ifs.percent.avail", 113 | "policy": "default", 114 | "query": "SELECT 100.0 - last(\"value\") as utilization FROM \"ifs.percent.avail\" WHERE \"cluster\" =~ /^$cluster$/ AND $timeFilter GROUP BY time($interval), \"cluster\" fill(none)", 115 | "rawQuery": true, 116 | "refId": "A", 117 | "resultFormat": "table", 118 | "select": [ 119 | [ 120 | { 121 | "params": [ 122 | "value" 123 | ], 124 | "type": "field" 125 | }, 126 | { 127 | "params": [], 128 | "type": "last" 129 | }, 130 | { 131 | "params": [ 132 | "100 -" 133 | ], 134 | "type": "math" 135 | } 136 | ] 137 | ], 138 | "tags": [ 139 | { 140 | "key": "cluster", 141 | "operator": "=~", 142 | "value": "/^$cluster$/" 143 | } 144 | ] 145 | } 146 | ], 147 | "timeFrom": null, 148 | "title": "Cluster Capacity Utilization", 149 | "transform": "table", 150 | "type": "table" 151 | } 152 | ], 153 | "title": "Row" 154 | } 155 | ], 156 | "time": { 157 | "from": "now-7d", 158 | "to": "now" 159 | }, 160 | "timepicker": { 161 | "refresh_intervals": [ 162 | "5s", 163 | "10s", 164 | "30s", 165 | "1m", 166 | "5m", 167 | "15m", 168 | "30m", 169 | "1h", 170 | "2h", 171 | "1d" 172 | ], 173 | "time_options": [ 174 | "5m", 175 | "15m", 176 | "1h", 177 | "6h", 178 | "12h", 179 | "24h", 180 | "2d", 181 | "7d", 182 | "30d" 183 | ] 184 | }, 185 | "templating": { 186 | "list": [ 187 | { 188 | "current": {}, 189 | "datasource": "${DS_LOCAL_INFLUXDB}", 190 | "hide": 0, 191 | "includeAll": true, 192 | "label": "Cluster", 193 | "multi": true, 194 | "name": "cluster", 195 | "options": [], 196 | "query": "show tag values with key = \"cluster\"", 197 | "refresh": 1, 198 | "type": "query" 199 | } 200 | ] 201 | }, 202 | "annotations": { 203 | "list": [] 204 | }, 205 | "schemaVersion": 12, 206 | "version": 2, 207 | "links": [], 208 | "gnetId": null 209 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Isilon Data Insights Connector 2 | 3 | The isi_data_insights_d.py script controls a daemon process that can be used to query 
multiple OneFS clusters for statistics data via the Isilon OneFS Platform API (PAPI). The collector uses a pluggable module for processing the results of those queries. The provided stats processor defined in influxdb_plugin.py sends query results to an InfluxDB backend. Additionally, several Grafana dashboards are provided to make it easy to monitor the health and status of your Isilon clusters. 4 | The Connector now supports running under either Python 2 or Python 3. 5 | 6 | ## Installation Instructions 7 | 8 | The collector was developed and tested on Linux. It is written in Python and believed to be portable, but no testing has been performed on other platforms. It is suggested that a Linux VM be provisioned to run the collector and the InfluxDB and Grafana components. 9 | 10 | Please note, it is dangerous and unnecessary to install Python packages as root (sudo pip ...). The data insights collector needs no special privileges and can be installed and run as an unprivileged user. Because of this, the recommended way to install the Connector is via a Python virtual environment. The virtual environment installation installs the required Python dependencies into a [Python Virtual Environment](http://docs.python-guide.org/en/latest/dev/virtualenvs/). The Connector is then run directly from the source directory. 11 | 12 | * To install the connector in a virtual environment using the default Python interpreter on the system, run: 13 | 14 | ```sh 15 | ./setup_venv.sh 16 | ``` 17 | 18 | * To explicitly install using "python3" as the interpreter, run 19 | 20 | ```sh 21 | ./setup_venv3.sh 22 | ``` 23 | 24 | The Grafana visualization component can be downloaded from [here](https://grafana.com/grafana/download?pg=get&plcmt=selfmanaged-box1-cta1) 25 | 26 | **Important note** InfluxDB 2.x is incompatible with version 1 and will not work. Please ensure you download and install an InfluxDB version 1.x package (the latest is currently 1.8.10), For installation instructions for the current 1.x (1.8.10) version of Influxdb, refer to [this link](https://portal.influxdata.com/downloads/), scroll down and expand the "Are you interested in InfluxDB 1.x Open Source?" section. 27 | 28 | ## Run Instructions 29 | 30 | * Rename or copy the example configuration file, example_isi_data_insights_d.cfg, to isi_data_insights_d.cfg. The path ./isi_data_insights_d.cfg is the default configuration file path for the Connector. If you use that name and run the Connector from the source directory then you don't have to use the --config parameter to specify a different configuration file. 31 | * Edit isi_data_insights_d.cfg to configure the collector to query the set of Isilon OneFS clusters that you want to monitor. Do this by modifying the config file's clusters parameter. 32 | * The example configuration file is configured to gather and send several sets of stats to InfluxDB via the influxdb_plugin.py. 33 | * If you installed InfluxDB to somewhere other than localhost and/or port 8086 then you'll also need to update the configuration file with the address and port of the InfluxDB instance. 34 | * Activate the virtualenv it before running the Connector by running: 35 | 36 | ```sh 37 | . .venv/bin/activate 38 | ``` 39 | 40 | or, if you installed the Python 3 version, by running: 41 | 42 | ```sh 43 | . 
.venv3/bin/activate 44 | ``` 45 | 46 | * To run the Connector: 47 | 48 | ```sh 49 | ./isi_data_insights_d.py start 50 | ``` 51 | 52 | ## Grafana Setup 53 | 54 | Included with the Connector source code are several Grafana dashboards that make it easy to monitor the health and status of your Isilon clusters. To view the dashboards with Grafana, follow these instructions: 55 | 56 | * [Install and configure Grafana](http://docs.grafana.org/installation/) to use the InfluxDB as a data source. Note that the provided Grafana dashboards have been tested to work with Grafana versions up to and including 8.2.2. Also, note that the influxdb_plugin.py creates and stores the statistics data in a database named isi_data_insights. You'll need that information when following the instructions for adding a data source to Grafana. Also, be sure to configure the isi_data_insights data source as the default Grafana data source using the Grafana Dashboard Admin web-interface. 57 | * Import the Grafana dashboards. 58 | * grafana_cluster_list_dashboard.json 59 | ![Multi-cluster Summary Dashboard Screen Shot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/IsilonDataInsightsMultiClusterSummary.JPG) 60 | * grafana_cluster_capacity_utilization_dashboard.json 61 | ![Cluster Capacity Utilization Dashboard Screen Shot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/IsilonDataInsightsClusterCapacityUtilizationTable.JPG) 62 | * grafana_cluster_detail_dashboard.json 63 | ![Cluster Detail Dashboard Screen Shot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/IsilonDataInsightsClusterDetail.JPG) 64 | * grafana_cluster_protocol_dashboard.json 65 | ![Cluster Protocol Detail Dashboard Screen Shot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/IsilonDataInsightsClusterProtocolDetail.JPG) 66 | 67 | Import the (optional) HDFS-specific dashboards: 68 | 69 | * grafana_hadoop_home.json 70 | ![Hadoop Home Dashboard Screenshot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/HDFS-home-1.png) 71 | * grafana_hadoop_datanodes.json 72 | ![Hadoop DataNodes Dashboard Screenshot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/HDFS-datanode-1.png) 73 | 74 | * If you had previously started the Connector, there should already be data in your database displayed in the dashboards. One common issue that might prevent your dashboards from showing up correctly is if the date/time on your Isilon clusters is not closely enough in sync with the date/time used by Grafana. Synchronizing the date/time of all the systems to within a few seconds of each other should be enough to fix the issue. 75 | 76 | ## Kapacitor Integration 77 | 78 | [Kapacitor](https://www.influxdata.com/time-series-platform/kapacitor/) is an add-on component that, when used in conjunction with the Connector, enables flexible, configurable, real-time notifications of alert conditions based on the statistics data streaming into the InfluxDB. For more information on how to integrate the Connector and InfluxDB with Kapacitor refer to: 79 | 80 | [Kapacitor Integration Instructions](https://github.com/Isilon/isilon_data_insights_connector/blob/master/README_KAPACITOR_INTEGRATION.md) 81 | 82 | ## Customizing the Connector 83 | 84 | The Connector is designed to allow for customization via a plugin architecture. The default plugin, influxdb_plugin.py, is configured via the provided example configuration file.
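In practice, a custom stats processor is just a Python module that exposes a small set of functions. The following log-only sketch is purely illustrative (the module name and behavior are examples, and the `key`, `value`, and `devid` fields are the same ones used by the bundled plugins); it follows the plugin interface described in the steps below:

```python
# my_plugin.py - minimal illustrative stats processor (log-only sketch)
import logging

LOG = logging.getLogger(__name__)


def start(argv):
    # argv comes from the optional stats_processor_args config parameter.
    LOG.info("my_plugin started with args: %s", argv)


def process(cluster, stats):
    # cluster is the name/ip-address of the queried cluster; stats is a list
    # of CurrentStatisticsStat instances with key, value, devid, and time.
    for stat in stats:
        LOG.info("%s node=%s %s=%s", cluster, stat.devid, stat.key, stat.value)


def stop():
    LOG.info("my_plugin stopped")
```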
If you would like to process the stats data differently or send them to a different backend than the influxdb_plugin.py you can implement a custom stats processor. Here are the instructions for doing so: 85 | 86 | * Create a file called my_plugin.py, or whatever you want to name it. 87 | * In the my_plugin.py file define a process(cluster, stats) function that takes as input the name/ip-address of a cluster and a list of stats. The list of stats will contain instances of the isi_sdk_8_0/models/CurrentStatisticsStat class or isi_sdk_7_2/models/CurrenStatisticsStat class, but it makes no difference because the two classes are the same regardless of the version. 88 | * Optionally define a start(argv) function that takes a list of input args as defined in the config file via the stats_processor_args parameter. 89 | * Optionally define a stop() function. 90 | * Put the my_plugin.py file somewhere in your PYTHONPATH (easiest is to put into the same directory as the other Python source code files). 91 | * Update the isi_data_insights_d.cfg file with the name of your plugin (i.e. 'my_plugin') 92 | * Restart the isi_data_insights_d.py daemon: 93 | 94 | ```sh 95 | ./isi_data_insights_d.py restart 96 | ``` 97 | 98 | ## Extending and/or Contributing to the Connector 99 | 100 | There are multiple ways for anyone using the Connector to interact with our dev team to request new features or discuss problems. 101 | 102 | * Create a new issue on the [Issues](https://github.com/Isilon/isilon_data_insights_connector/issues) tab. 103 | * Use the [discussion](https://community.emc.com/docs/DOC-48273) capability of the Isilon SDK Info Hub page. 104 | 105 | Also, just like an other project on github.com we are entirely open to external code contributions: 106 | 107 | * Fork the project, modify it, then initiate a pull request. 108 | -------------------------------------------------------------------------------- /isi_stats_client.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | from builtins import object 3 | import logging 4 | 5 | 6 | LOG = logging.getLogger(__name__) 7 | # Apache/PAPI has a request URI limit of 8096, MAX_KEYS_LEN is the max 8 | # length of a set of keys that the client will attempt to send. 9 | MAX_KEYS_LEN = 7000 10 | # When getting metadata for multiple stats, if there are less than 11 | # MAX_DIRECT_METADATA_STATS then do the query as multiple direct key queries, 12 | # otherwise do it as a single batch query and filter the results on the client 13 | # side. Testing revealed that 200 is the optimal cutoff point for a virtual 14 | # cluster. 15 | MAX_DIRECT_METADATA_STATS = 200 16 | 17 | 18 | class IsiStatsClient(object): 19 | """ 20 | Handles the details of querying for Isilon cluster statistics values and 21 | metadata using the Isilon SDK. 22 | """ 23 | 24 | def __init__(self, stats_api): 25 | """ 26 | Setup the Isilon SDK to query the specified cluster's statistics. 27 | :param StatisticsApi stats_api: instance of StatisticsApi from the 28 | isi_sdk_8_0 or isi_sdk_7_2 package. 29 | """ 30 | # get the Statistics API 31 | self._stats_api = stats_api 32 | 33 | def query_stats( 34 | self, 35 | stats, 36 | devid="all", 37 | substr=False, 38 | timeout=60, 39 | degraded=True, 40 | expand_clientid=False, 41 | ): 42 | """ 43 | Queries the cluster for a list of stat values. Note: this function only 44 | works on OneFS 8.0 or newer. 
45 | :param list stats: a list of stat names to query 46 | :param string devid: The node number or "all" to query all nodes. 47 | :param bool substr: If True, makes the 'keys' arg perform a partial 48 | match. 49 | :param int timeout: Time in seconds to wait for results from remote 50 | nodes. 51 | :param bool degraded: If true, try to continue even if some stats are 52 | unavailable. 53 | :param bool expand_clientid: If true, use name resolution to expand 54 | client addresses and other IDs. 55 | :returns: a list of isi_sdk.models.StatisticsCurrentStat 56 | instances corresponding to the list of stat names provided in the stats 57 | input list. 58 | """ 59 | # setup the stat keys for querying as set of comma delimitted values 60 | combined_query_results = None 61 | stat_keys = ",".join(stats) 62 | stat_index = 0 63 | stat_keys_len = len(stat_keys) 64 | while stat_index < stat_keys_len: 65 | if stat_keys_len - stat_index > MAX_KEYS_LEN: 66 | # find the last comma between stat_index and 67 | # stat_index + MAX_KEYS_LEN 68 | next_stat_index = stat_keys.rfind( 69 | ",", stat_index, stat_index + MAX_KEYS_LEN 70 | ) 71 | # unless there's a key that is longer than MAX_KEYS_LEN 72 | # then the rfind should never return -1 because there should 73 | # definitely be at least one comma. 74 | query_keys = stat_keys[stat_index:next_stat_index] 75 | stat_index = next_stat_index + 1 76 | else: 77 | query_keys = stat_keys[stat_index:] 78 | stat_index = stat_keys_len 79 | 80 | query_result = self._stats_api.get_statistics_current( 81 | keys=query_keys, 82 | devid=devid, 83 | substr=substr, 84 | degraded=degraded, 85 | expand_clientid=expand_clientid, 86 | timeout=timeout, 87 | ) 88 | 89 | if combined_query_results is None: 90 | combined_query_results = query_result 91 | else: 92 | combined_query_results.stats.extend(query_result.stats) 93 | 94 | # return the list of stats only (at this point there are no other 95 | # fields on the query_results data model). 96 | return combined_query_results.stats 97 | 98 | def query_stat( 99 | self, stat, devid="all", timeout=60, degraded=True, expand_clientid=False 100 | ): 101 | """ 102 | Queries the cluster for a single stat's value. Note: this function 103 | works on OneFS 7.2 or newer clusters. 104 | :param string stats: the name of the stat to query 105 | :param string devid: The node number or "all" to query all nodes. 106 | :param int timeout: Time in seconds to wait for results from remote 107 | nodes. 108 | :param bool degraded: If true, try to continue even if some stats are 109 | unavailable. 110 | :param bool expand_clientid: If true, use name resolution to expand 111 | client addresses and other IDs. 112 | :returns: an instance of isi_sdk.models.StatisticsCurrentStat 113 | """ 114 | query_result = self._stats_api.get_statistics_current( 115 | key=stat, 116 | devid=devid, 117 | degraded=degraded, 118 | expand_clientid=expand_clientid, 119 | timeout=timeout, 120 | ) 121 | 122 | return query_result.stats 123 | 124 | def get_stats_metadata(self, stats=None): 125 | """ 126 | Query the cluster for the metadata associated with each key specified 127 | in the stats list or all stats if stats is None. 128 | :param list stats: list of statistic keys to query. 129 | :returns: a list of isi_sdk.models.StatisticsKey instances (in 130 | the same order as the stats input param list). 
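        For example, get_stats_metadata(["ifs.bytes.used", "cluster.health"])
        returns the StatisticsKey metadata for those two stats, in that order.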
131 | """ 132 | if stats is not None and len(stats) < MAX_DIRECT_METADATA_STATS: 133 | return self._get_metadata_direct(stats) 134 | return self._get_metadata_indirect(stats) 135 | 136 | def get_stat_metadata(self, stat): 137 | """ 138 | Query the cluster for the metadata of a specific stat. 139 | :param string stat: the name of the stat to query 140 | :returns: a single isi_sdk.models.StatisticsKey. 141 | """ 142 | result = self._stats_api.get_statistics_key(statistics_key_id=stat) 143 | return result.keys[0] 144 | 145 | def _get_metadata_indirect(self, stats): 146 | """ 147 | Get the metadata for every single stat and then filter it down to the 148 | list of stats specified in the stats param. 149 | :param list stats: the list of stats to return metadata for, or if it 150 | is None then return all metadata. 151 | :returns: a list of isi_sdk.models.StatisticsKey instances. 152 | """ 153 | stat_map = {} 154 | if stats is not None: 155 | num_stats = len(stats) 156 | for stat_index in range(0, num_stats): 157 | stat_map[stats[stat_index]] = stat_index 158 | result_list = [None] * num_stats 159 | else: 160 | result_list = [] 161 | query_args = dict() 162 | while True: 163 | results = self._stats_api.get_statistics_keys(**query_args) 164 | if stats is None: 165 | if result_list is None: 166 | result_list = results.keys 167 | else: 168 | result_list.extend(results.keys) 169 | else: 170 | for key in results.keys: 171 | try: 172 | stat_index = stat_map[key.key] 173 | result_list[stat_index] = key 174 | num_stats -= 1 175 | if num_stats == 0: 176 | break 177 | except KeyError: 178 | pass 179 | 180 | resume = results.resume 181 | if resume is None: 182 | break 183 | query_args["resume"] = resume 184 | 185 | return result_list 186 | 187 | def _get_metadata_direct(self, stats): 188 | """ 189 | Get the metadata for the list of stats provided in the stats list input 190 | parameter by sending an individual request for each stat. When the list 191 | of stats is small(er) then this method is faster than querying for all 192 | the stats metadata and filtering it (see _get_metadata_indirect). 193 | :param list stats: the list of stat names to query for metadata. 194 | :returns: a list of isi_sdk.models.StatisticsKey instances. 195 | """ 196 | metadata_list = [] 197 | for stat in stats: 198 | metadata = self.get_stat_metadata(stat) 199 | metadata_list.append(metadata) 200 | return metadata_list 201 | -------------------------------------------------------------------------------- /influxdb_plugin.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from builtins import input 3 | from builtins import str 4 | from builtins import range 5 | from builtins import object 6 | from future.utils import string_types 7 | from influxdb import InfluxDBClient 8 | from influxdb.exceptions import InfluxDBServerError, InfluxDBClientError 9 | from ast import literal_eval 10 | 11 | import getpass 12 | import logging 13 | import requests.exceptions 14 | import sys 15 | 16 | 17 | class StatsProcessorState(object): 18 | def __init__(self): 19 | self.influxdb_points = None 20 | self.points_written = None 21 | self.reset() 22 | 23 | def reset(self): 24 | self.influxdb_points = [] 25 | self.points_written = 0 26 | 27 | 28 | # influxdb_plugin state 29 | g_state = StatsProcessorState() 30 | 31 | # InfluxDBClient interface 32 | g_client = None 33 | LOG = logging.getLogger(__name__) 34 | 35 | # Number of points to queue up before writing it to the database. 
36 | MAX_POINTS_PER_WRITE = 100 37 | # separator used to concatenate stat keys with sub-keys derived from stats 38 | # whose value is a dict or list. 39 | SUB_KEY_SEPARATOR = "." 40 | 41 | 42 | def start(argv): 43 | """ 44 | Instantiate an InfluxDBClient. The expected inputs are the host/address and 45 | port of the InfluxDB and the name of the database to use. If the database 46 | does not exist then it will be created. If the fourth arg is "auth" then it 47 | will prompt the user for the InfluxDB's username and password. 48 | """ 49 | influxdb_host = argv[0] 50 | influxdb_port = int(argv[1]) 51 | influxdb_name = argv[2] 52 | influxdb_ssl = False 53 | influxdb_verifyssl = False 54 | influxdb_username = "root" 55 | influxdb_password = "root" 56 | 57 | if len(argv) > 3: 58 | if argv[3] == "auth": 59 | influxdb_username = input("InfluxDB username: ") 60 | influxdb_password = getpass.getpass("Password: ") 61 | else: 62 | influxdb_username = argv[3] 63 | influxdb_password = argv[4] 64 | influxdb_ssl = literal_eval(argv[5]) 65 | influxdb_verifyssl = literal_eval(argv[6]) 66 | 67 | LOG.info( 68 | "Connecting to: %s@%s:%d database:%s ssl=%s verify_ssl=%s.", 69 | influxdb_username, 70 | influxdb_host, 71 | influxdb_port, 72 | influxdb_name, 73 | influxdb_ssl, 74 | influxdb_verifyssl, 75 | ) 76 | 77 | global g_client 78 | g_client = InfluxDBClient( 79 | host=influxdb_host, 80 | port=influxdb_port, 81 | database=influxdb_name, 82 | username=influxdb_username, 83 | password=influxdb_password, 84 | ssl=influxdb_ssl, 85 | verify_ssl=influxdb_verifyssl 86 | ) 87 | 88 | create_database = True 89 | try: 90 | databases = g_client.get_list_database() 91 | except (requests.exceptions.ConnectionError, InfluxDBClientError) as exc: 92 | print( 93 | "Failed to connect to InfluxDB server at %s:%s " 94 | "database: %s.\nERROR: %s" 95 | % (influxdb_host, str(influxdb_port), influxdb_name, str(exc)), 96 | file=sys.stderr, 97 | ) 98 | sys.exit(1) 99 | 100 | for database in databases: 101 | if database["name"] == influxdb_name: 102 | create_database = False 103 | break 104 | 105 | if create_database is True: 106 | LOG.info("Creating database: %s.", influxdb_name) 107 | g_client.create_database(influxdb_name) 108 | 109 | 110 | def begin_process(cluster): 111 | LOG.debug("Begin processing %s stats.", cluster) 112 | 113 | 114 | def process_stat(cluster, stat): 115 | """ 116 | Convert Isilon stat query result to InfluxDB point and send to the 117 | InfluxDB service. Organize the measurements by cluster and node via tags. 118 | """ 119 | # Process stat(s) and then write points if list is large enough. 
120 | tags = {"cluster": cluster} 121 | if stat.devid != 0: 122 | tags["node"] = stat.devid 123 | 124 | influxdb_points = _influxdb_points_from_stat(stat.time, tags, stat.key, stat.value) 125 | if influxdb_points == []: 126 | return 127 | for influxdb_point in influxdb_points: 128 | if len(influxdb_point["fields"]) > 0: 129 | g_state.influxdb_points.append(influxdb_point) 130 | num_points = len(g_state.influxdb_points) 131 | if num_points > MAX_POINTS_PER_WRITE: 132 | g_state.points_written += _write_points( 133 | g_state.influxdb_points, num_points 134 | ) 135 | g_state.influxdb_points = [] 136 | 137 | 138 | def end_process(cluster): 139 | # send left over points to influxdb 140 | num_points = len(g_state.influxdb_points) 141 | if num_points > 0: 142 | g_state.points_written += _write_points(g_state.influxdb_points, num_points) 143 | LOG.debug( 144 | "Done processing %s stats, wrote %d points.", cluster, g_state.points_written 145 | ) 146 | g_state.reset() 147 | 148 | 149 | def _add_field(fields, field_name, field_value, field_value_type): 150 | if field_value_type == int: 151 | # convert integers to float because InfluxDB only supports 64 bit 152 | # signed integers, so doing this prevents an "out of range" error when 153 | # inserting values that are unsigned 64 bit integers. 154 | # Note that it is not clear if the PAPI is smart enough to always 155 | # encode 64 bit unsigned integers as type 'long' even when the actual 156 | # value is fits into a 64 bit signed integer and because InfluxDB 157 | # wants a measurement to always be of the same type, the safest thing 158 | # to do is convert integers to float. 159 | field_value = float(field_value) 160 | fields.append((field_name, field_value)) 161 | 162 | 163 | def _process_stat_dict(stat_value, fields, tags, prefix=""): 164 | """ 165 | Add (field_name, field_value) tuples to the fields list for any 166 | non-string or non-"id" items in the stat_value dict so that they can be 167 | used for the "fields" parameter of the InfluxDB point. 168 | Any string or keys with "id" on the end of their name get turned into tags. 169 | """ 170 | for key, value in stat_value.items(): 171 | value_type = type(value) 172 | field_name = prefix + key 173 | if isinstance(value, string_types) or (key[-2:] == "id" and value_type == int): 174 | tags[field_name] = value 175 | elif value_type == list: 176 | list_prefix = field_name + SUB_KEY_SEPARATOR 177 | _process_stat_list(value, fields, tags, list_prefix) 178 | elif value_type == dict: 179 | dict_prefix = field_name + SUB_KEY_SEPARATOR 180 | _process_stat_dict(value, fields, tags, dict_prefix) 181 | else: 182 | _add_field(fields, field_name, value, value_type) 183 | 184 | 185 | def _process_stat_list(stat_value, fields, tags, prefix=""): 186 | """ 187 | Add (field_name, field_value) tuples to the fields list for any 188 | non-string or non-"id" items in the stat_value dict so that they can be 189 | used for the "fields" parameter of the InfluxDB point. 190 | """ 191 | field_name = prefix + "value" 192 | for index in range(0, len(stat_value)): 193 | list_value = stat_value[index] 194 | value_type = type(list_value) 195 | if value_type == dict: 196 | _process_stat_dict(list_value, fields, tags, prefix) 197 | else: 198 | item_name = field_name + SUB_KEY_SEPARATOR + str(index) 199 | if value_type == list: 200 | # AFAIK there are no instances of a list that contains a list 201 | # but just in case one is added in the future, deal with it. 
202 | item_name += SUB_KEY_SEPARATOR 203 | _process_stat_list(list_value, fields, tags, item_name) 204 | else: 205 | _add_field(fields, item_name, list_value, value_type) 206 | 207 | 208 | def _influxdb_points_from_stat(stat_time, tags, stat_key, stat_value): 209 | """ 210 | Create InfluxDB points/measurements from the stat query result. 211 | """ 212 | points = [] 213 | fields = [] 214 | stat_value_type = type(stat_value) 215 | if stat_value_type == list: 216 | for stat in stat_value: 217 | (fields, point_tags) = _influxdb_point_from_stat( 218 | stat_time, tags, stat_key, stat 219 | ) 220 | points.append( 221 | _build_influxdb_point(stat_time, point_tags, stat_key, fields) 222 | ) 223 | elif stat_value_type == dict: 224 | point_tags = tags.copy() 225 | _process_stat_dict(stat_value, fields, point_tags) 226 | points.append(_build_influxdb_point(stat_time, point_tags, stat_key, fields)) 227 | else: 228 | if stat_value == "": 229 | return None # InfluxDB does not like empty string stats 230 | _add_field(fields, "value", stat_value, stat_value_type) 231 | points.append(_build_influxdb_point(stat_time, tags.copy(), stat_key, fields)) 232 | return points 233 | 234 | 235 | def _influxdb_point_from_stat(stat_time, tags, stat_key, stat_value): 236 | """ 237 | Create InfluxDB points/measurements from the stat query result. 238 | """ 239 | point_tags = tags.copy() 240 | fields = [] 241 | stat_value_type = type(stat_value) 242 | if stat_value_type == dict: 243 | _process_stat_dict(stat_value, fields, point_tags) 244 | elif stat_value_type == list: 245 | _process_stat_list(stat_value, fields, point_tags) 246 | else: 247 | if stat_value == "": 248 | return None # InfluxDB does not like empty string stats 249 | _add_field(fields, "value", stat_value, stat_value_type) 250 | return (fields, point_tags) 251 | 252 | 253 | def _build_influxdb_point(unix_ts_secs, tags, measurement, fields): 254 | """ 255 | Build the json for an InfluxDB data point. 256 | """ 257 | timestamp_ns = unix_ts_secs * 1000000000 # convert to nanoseconds 258 | point_json = { 259 | "measurement": measurement, 260 | "tags": tags, 261 | "time": timestamp_ns, 262 | "fields": {}, 263 | } 264 | 265 | for field_name, field_value in fields: 266 | point_json["fields"][field_name] = field_value 267 | 268 | return point_json 269 | 270 | 271 | def _get_point_names(points): 272 | names = "" 273 | for point in points: 274 | names += point["measurement"] 275 | names += " " 276 | return names 277 | 278 | 279 | def _write_points(points, num_points): 280 | """ 281 | Write the points to the InfluxDB in groups that are MAX_POINTS_PER_WRITE in 282 | size. 
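    Returns the number of points that were successfully written.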
283 | """ 284 | LOG.debug("Writing points %d", num_points) 285 | write_index = 0 286 | points_written = 0 287 | while write_index < num_points: 288 | max_write_index = write_index + MAX_POINTS_PER_WRITE 289 | write_points = points[write_index:max_write_index] 290 | try: 291 | g_client.write_points(write_points) 292 | points_written += len(write_points) 293 | except InfluxDBServerError as svr_exc: 294 | LOG.error( 295 | "InfluxDBServerError: %s\nFailed to write points: %s", 296 | str(svr_exc), 297 | _get_point_names(write_points), 298 | ) 299 | except InfluxDBClientError as client_exc: 300 | LOG.error( 301 | "InfluxDBClientError writing points: %s\n" "Error: %s", 302 | _get_point_names(write_points), 303 | str(client_exc), 304 | ) 305 | except requests.exceptions.ConnectionError as req_exc: 306 | LOG.error( 307 | "ConnectionError exception caught writing points: %s\n" "Error: %s", 308 | _get_point_names(write_points), 309 | str(req_exc), 310 | ) 311 | write_index += MAX_POINTS_PER_WRITE 312 | 313 | return points_written 314 | -------------------------------------------------------------------------------- /example_isi_data_insights_d.cfg: -------------------------------------------------------------------------------- 1 | [isi_data_insights_d] 2 | # Parameters specified on the command line will supersede the parameters 3 | # in this section. 4 | # pid_file: /var/run/isi_data_insights_d.pid 5 | # log_file: /var/run/isi_data_insights_d.log 6 | # default log_level is INFO 7 | # log_level: DEBUG 8 | stats_processor: influxdb_plugin 9 | # Use "auth" as the 4th arg in order to be prompted for the 10 | # InfluxDB username and password stats_processor_args 11 | # or 12 | # define the credentials in this config 13 | # examples: 14 | # localhost 8086 isi_data_insights auth 15 | # or without prompting 16 | # localhost 8086 isi_data_insights username password ssl=True/False verify_ssl=True/False 17 | stats_processor_args: localhost 8086 isi_data_insights 18 | 19 | # clusters in this section are queried for all stat groups 20 | # clusters: [username1:password1@][:True|False] 21 | # [[username2:password2]@[:True|False]] 22 | # [[username3:password3]@[:True|False]] 23 | # ... 24 | # If you don't specify the username and password then you will be prompted 25 | # for them when the daemon starts up. 26 | # Use the optional True or False on the end to specify whether the cluster's 27 | # SSL certificate should be verified. If it is omitted then the default is 28 | # False (i.e. don't verify SSL cert). 29 | clusters: 30 | 31 | # Specifies the active list of stat groups to query, each stat group name 32 | # specified here should have a corresponding section in the config file. 33 | active_stat_groups: cluster_cpu_stats 34 | cluster_network_traffic_stats 35 | cluster_client_activity_stats 36 | cluster_health_stats 37 | ifs_space_stats 38 | ifs_rate_stats 39 | node_load_stats 40 | node_disk_stats 41 | node_net_stats 42 | cluster_disk_rate_stats 43 | cluster_proto_stats 44 | cache_stats 45 | heat_total_stats 46 | 47 | # The min_update_interval_override param provides ability to override the 48 | # minimum interval that the daemon will query for a set of stats. The purpose 49 | # of the minimum interval, which defaults to 30 seconds, is to prevent 50 | # the daemon's queries from putting too much stress on the cluster. 51 | # The default value is 30 seconds. 
52 | # min_update_interval_override: 15 53 | 54 | [cluster_cpu_stats] 55 | # The clusters (optional) param defines a list of clusters specific to this 56 | # group. 57 | # clusters: 10.25.69.74 10.25.69.75 58 | # update interval is in seconds or use * to base the update interval 59 | # off each stat's collection interval (i.e. *2 == 2 times the collection 60 | # interval, *1 == * == 1 times the collection invterval of each stat) 61 | update_interval: * 62 | stats: cluster.cpu.sys.avg 63 | cluster.cpu.user.avg 64 | cluster.cpu.idle.avg 65 | cluster.cpu.intr.avg 66 | 67 | [cluster_network_traffic_stats] 68 | update_interval: * 69 | stats: cluster.net.ext.bytes.in.rate 70 | cluster.net.ext.bytes.out.rate 71 | cluster.net.ext.packets.in.rate 72 | cluster.net.ext.packets.out.rate 73 | cluster.net.ext.errors.in.rate 74 | cluster.net.ext.errors.out.rate 75 | 76 | [cluster_client_activity_stats] 77 | update_interval: * 78 | stats: node.clientstats.active.ftp 79 | node.clientstats.active.hdfs 80 | node.clientstats.active.http 81 | node.clientstats.active.lsass_out 82 | node.clientstats.active.jobd 83 | node.clientstats.active.nfs 84 | node.clientstats.active.nfs4 85 | node.clientstats.active.nlm 86 | node.clientstats.active.papi 87 | node.clientstats.active.siq 88 | node.clientstats.active.cifs 89 | node.clientstats.active.smb2 90 | node.clientstats.connected.ftp 91 | node.clientstats.connected.hdfs 92 | node.clientstats.connected.http 93 | node.clientstats.connected.nfs 94 | node.clientstats.connected.nlm 95 | node.clientstats.connected.papi 96 | node.clientstats.connected.siq 97 | node.clientstats.connected.cifs 98 | 99 | [cluster_health_stats] 100 | update_interval: * 101 | stats: cluster.health 102 | cluster.node.count.all 103 | cluster.node.count.down 104 | 105 | [ifs_space_stats] 106 | update_interval: * 107 | stats: ifs.bytes.avail 108 | ifs.bytes.free 109 | ifs.bytes.used 110 | ifs.bytes.total 111 | ifs.percent.free 112 | ifs.percent.avail 113 | ifs.percent.used 114 | 115 | [ifs_rate_stats] 116 | update_interval: * 117 | stats: ifs.bytes.in.rate 118 | ifs.bytes.out.rate 119 | ifs.ops.in.rate 120 | ifs.ops.out.rate 121 | 122 | [node_load_stats] 123 | update_interval: * 124 | stats: node.load.1min 125 | node.load.5min 126 | node.load.15min 127 | node.memory.used 128 | node.memory.free 129 | node.open.files 130 | 131 | [node_disk_stats] 132 | update_interval: * 133 | stats: node.disk.bytes.out.rate.avg 134 | node.disk.bytes.in.rate.avg 135 | node.disk.busy.avg 136 | node.disk.xfers.out.rate.avg 137 | node.disk.xfers.in.rate.avg 138 | node.disk.xfer.size.out.avg 139 | node.disk.xfer.size.in.avg 140 | node.disk.access.latency.avg 141 | node.disk.access.slow.avg 142 | node.disk.iosched.queue.avg 143 | node.disk.iosched.latency.avg 144 | 145 | [node_net_stats] 146 | update_interval: * 147 | stats: node.net.int.bytes.in.rate 148 | node.net.int.bytes.out.rate 149 | node.net.ext.bytes.in.rate 150 | node.net.ext.bytes.out.rate 151 | node.net.int.errors.in.rate 152 | node.net.int.errors.out.rate 153 | node.net.ext.errors.in.rate 154 | node.net.ext.errors.out.rate 155 | 156 | [cluster_disk_rate_stats] 157 | update_interval: * 158 | stats: cluster.disk.xfers.rate 159 | cluster.disk.xfers.in.rate 160 | cluster.disk.xfers.out.rate 161 | cluster.disk.bytes.in.rate 162 | cluster.disk.bytes.out.rate 163 | 164 | [cluster_proto_stats] 165 | update_interval: * 166 | stats: cluster.protostats.nfs 167 | cluster.protostats.nlm 168 | cluster.protostats.cifs 169 | cluster.protostats.ftp 170 | cluster.protostats.http 
171 | cluster.protostats.siq 172 | cluster.protostats.jobd 173 | cluster.protostats.smb2 174 | cluster.protostats.nfs4 175 | cluster.protostats.irp 176 | cluster.protostats.lsass_in 177 | cluster.protostats.lsass_out 178 | cluster.protostats.papi 179 | cluster.protostats.hdfs 180 | cluster.protostats.nfs.total 181 | cluster.protostats.nlm.total 182 | cluster.protostats.cifs.total 183 | cluster.protostats.ftp.total 184 | cluster.protostats.http.total 185 | cluster.protostats.siq.total 186 | cluster.protostats.jobd.total 187 | cluster.protostats.smb2.total 188 | cluster.protostats.nfs4.total 189 | cluster.protostats.irp.total 190 | cluster.protostats.lsass_in.total 191 | cluster.protostats.lsass_out.total 192 | cluster.protostats.papi.total 193 | cluster.protostats.hdfs.total 194 | 195 | [cache_stats] 196 | update_interval: * 197 | stats: node.ifs.cache 198 | 199 | [heat_total_stats] 200 | update_interval: * 201 | stats: node.ifs.heat.lock.total 202 | node.ifs.heat.blocked.total 203 | node.ifs.heat.contended.total 204 | node.ifs.heat.deadlocked.total 205 | node.ifs.heat.write.total 206 | node.ifs.heat.read.total 207 | node.ifs.heat.lookup.total 208 | node.ifs.heat.rename.total 209 | node.ifs.heat.link.total 210 | node.ifs.heat.unlink.total 211 | node.ifs.heat.getattr.total 212 | node.ifs.heat.setattr.total 213 | 214 | # These stats are not currently active by default. They are here to serve as an example of how to use the 215 | # derived stats functionality. See the comments below for more details. 216 | [concurrency_stats] 217 | update_interval: * 218 | stats: node.ifs.ops.in node.ifs.ops.out node.disk.iosched.latency.avg 219 | cluster.protostats.nfs.total 220 | cluster.protostats.nfs.total 221 | cluster.protostats.smb2.total 222 | cluster.protostats.nlm.total 223 | cluster.protostats.cifs.total 224 | cluster.protostats.http.total 225 | cluster.protostats.siq.total 226 | cluster.protostats.nfs4.total 227 | cluster.protostats.hdfs.total 228 | cluster.protostats.ftp.total 229 | # The composite_stats, equation_stats, percent_change_stats, and final_equation_stats sections allow you to 230 | # specify new stats that are derived from the values of other stats. You can derive stats from base stats 231 | # or even specific fields or indices within a base stat's value, which is actually required if the 232 | # base stat's value is not a float or integer (i.e. it is a dict or list). See below for more 233 | # info on each type of derived stat. 234 | 235 | #### Composite Stats Description ##### 236 | # The composite_stats parameter specifies a list of node-specific stats (i.e. stats whose names 237 | # start with "node.") where each stat is composited across the entire cluster using the specified 238 | # operation. Supported operations at this time are avg, max, min, and sum. 239 | # The output name of a composite_stat is: cluster.<base stat name>[.<field1>[.<field2>...]].<operation>, 240 | # so for the three stats above it would be cluster.node.ifs.ops.in.sum, 241 | # cluster.node.ifs.ops.out.sum, and cluster.node.disk.iosched.latency.avg.avg. If the base stat 242 | # contains one or more fields then those are appended to the name with '.'
as delimiter, e.g.: 243 | # sum(node.protostats.nfs.total:op_count) -> cluster.node.protostats.nfs.total.op_count.sum 244 | composite_stats: sum(node.ifs.ops.in) sum(node.ifs.ops.out) avg(node.disk.iosched.latency.avg) 245 | 246 | 247 | #### Equation Stats Description ##### 248 | # The equation_stats parameter specifies a list of output stat names for stats that will be 249 | # derived from an equation that takes as input either base stat values or composite_stats values. 250 | # The equation for each equation stat is specified in a parameter named the same as the equation 251 | # stat. 252 | equation_stats: cluster.ifs.concurrency cluster.protostats.all.total.op_count cluster.protostats.all.total.time_avg 253 | # This is the definition of the equation used to compute the cluster.ifs.concurrency stat. 254 | # Any of the base stats or any composite stat can be used in the equation expression. Any 255 | # expression supported by the Equation package of Python can be used: 256 | # https://pypi.python.org/pypi/Equation 257 | cluster.ifs.concurrency: (cluster.node.ifs.ops.in.sum + cluster.node.ifs.ops.out.sum) * cluster.node.disk.iosched.latency.avg.avg 258 | # The cluster.protostats.all.total.op_count stat is a sum of the different protocols' op_count values. 259 | # This equation shows an example of how to select a specific field within a stat that returns a dict, in this case the op_count 260 | # field. Note that some stats are returned as a list that always contains only a single dict item - in those cases the value is treated 261 | # as if it were just a dict. Otherwise, to index into a list you would use numeric field names after the colon. Multiple field 262 | # names or list indices are allowed (i.e. node.example.stat:field1:field2:field3...). 263 | cluster.protostats.all.total.op_count: cluster.protostats.nfs.total:op_count + cluster.protostats.nfs.total:op_count + cluster.protostats.smb2.total:op_count + cluster.protostats.nlm.total:op_count + cluster.protostats.cifs.total:op_count + cluster.protostats.http.total:op_count + cluster.protostats.siq.total:op_count + cluster.protostats.nfs4.total:op_count + cluster.protostats.hdfs.total:op_count + cluster.protostats.ftp.total:op_count 264 | # This stat computes the sum of the time_avg field and then takes an average. 265 | cluster.protostats.all.total.time_avg: (cluster.protostats.nfs.total:time_avg + cluster.protostats.nfs.total:time_avg + cluster.protostats.smb2.total:time_avg + cluster.protostats.nlm.total:time_avg + cluster.protostats.cifs.total:time_avg + cluster.protostats.http.total:time_avg + cluster.protostats.siq.total:time_avg + cluster.protostats.nfs4.total:time_avg + cluster.protostats.hdfs.total:time_avg + cluster.protostats.ftp.total:time_avg) / 10.0 266 | 267 | #### Percent Change Stats Description ##### 268 | # The percent_change_stats section specifies a list of base stats, composite stats, and/or equation 269 | # stats whose percent change from one measurement to the next will be stored in a new stat whose 270 | # name will be <input stat name>.percentchange 271 | percent_change_stats: cluster.node.disk.iosched.latency.avg.avg cluster.protostats.all.total.time_avg 272 | 273 | #### Final Equation Stats Description ##### 274 | # The final_equation_stats section is the same as the equation_stats section except these equations have access to base stats and all of the previously 275 | # defined derived stats as input. Again, list the names of the output stats and then specify the equation for each output stat in a parameter of that same 276 | # name.
277 | final_equation_stats: cluster.ifs.concurrency.importance 278 | # Definition of the cluster.ifs.concurrency.importance final equation stat 279 | cluster.ifs.concurrency.importance: (cluster.protostats.all.total.op_count * cluster.protostats.all.total.time_avg) * cluster.node.disk.iosched.latency.avg.avg.percentchange 280 | -------------------------------------------------------------------------------- /README_KAPACITOR_INTEGRATION.md: -------------------------------------------------------------------------------- 1 | # Kapacitor Integration 2 | Kapacitor (https://www.influxdata.com/time-series-platform/kapacitor/) is an add-on component that, when used in conjunction with the Connector, enables flexible, configurable, real-time notifications of alert conditions based off the statistics data streaming into InfluxDB. Kapacitor leverages the ability to subscribe to updates to the InfluxDB database to provide this capability. 3 | 4 | # Initial setup 5 | First, set up InfluxDB and the Data Insights Connector following the instructions outlined in the README.md file. Then follow these instructions to install and set up Kapacitor: 6 | 7 | Install Kapacitor from https://www.influxdata.com/downloads/#kapacitor 8 | 9 | The getting started page (https://docs.influxdata.com/kapacitor/v1.0/introduction/getting_started/) contains useful examples, but is not entirely pertinent to this use case since it leverages Telegraf to generate statistics. In this case, you already have sets of statistics (measurements) in InfluxDB being fed by the Connector. After you have installed Kapacitor, you will need to configure it. 10 | 11 | The Kapacitor installation package already includes the configuration file (/etc/kapacitor/kapacitor.conf) so there is no need to generate one. Edit /etc/kapacitor/kapacitor.conf to change the alert provider configurations as necessary. For instance, to enable email alerts, find the section beginning “[smtp]” and modify the configuration to utilize an available SMTP provider. 12 | 13 | # Kapacitor Scripting 14 | 15 | ## Introduction 16 | Kapacitor uses one or more tasks that are defined using “TICK” scripts to control what data should be filtered, how it should be filtered, and what criteria to use to alert based off the data. The TICK scripts are a domain-specific language (DSL) and are somewhat tersely documented on the Kapacitor documentation site (https://docs.influxdata.com/kapacitor/v1.0/). This document presents some example scripts and some patterns that enable more sophisticated alerting criteria (e.g. a moving average). 17 | 18 | ## How to create and enable a TICK task 19 | Edit the script using your favorite text editor. It is suggested that the names of these scripts use the “.tick” extension, e.g. “nfs_avg_lat_alert.tick”. 20 | Next, install the script into Kapacitor using the CLI. The generic form of the command is: 21 | 22 | ```sh 23 | kapacitor define <task_name> -type stream -tick <path_to_script> -dbrp isi_data_insights.autogen 24 | ``` 25 | 26 | The internal name (<task_name>) should be something descriptive. These examples only show the use of stream scripts, but note that Kapacitor can also perform batch processing. The path (<path_to_script>) is simply the location of the TICK script you just edited. The “-dbrp” argument specifies the InfluxDB “database retention policy”. Since we are using the Isilon Data Insights Connector database, the correct value for our examples is “isi_data_insights.autogen”; this value would differ if a different source database were in use.
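If you are unsure which database and retention policy names exist on your InfluxDB host, you can list them with the InfluxDB 1.x “influx” CLI before defining the task. This is only a quick sanity check; the database name shown below is the Connector’s default:

```sh
# List the databases on the InfluxDB host.
influx -execute 'SHOW DATABASES'

# List the retention policies for the Connector's default database.
influx -execute 'SHOW RETENTION POLICIES ON isi_data_insights'
```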
If we are using “nfs_avg_lat_alert.tick” as our example script, then the command to define the task would be: 27 | ```sh 28 | kapacitor define nfs_lat_alert -type stream -tick /root/nfs_avg_lat_alert.tick -dbrp isi_data_insights.autogen 29 | ``` 30 | 31 | Here is the “nfs_avg_lat_alert.tick” script: 32 | ``` 33 | stream 34 | // Select avg NFS3 proto response time 35 | |from() 36 | .database('isi_data_insights') 37 | .measurement('cluster.protostats.nfs.total') 38 | |eval(lambda: float("time_avg") / 1000.0) 39 | .as('time_ms') 40 | |groupBy('cluster') 41 | |alert() 42 | .id('{{ index .Tags "cluster" }}/{{ .Name }}') 43 | .message('Average value of {{ .ID }} is {{ .Level}} value: {{ index .Fields "time_ms" }}ms') 44 | .crit(lambda: "time_ms" > 50.0) 45 | .warn(lambda: "time_ms" > 20.0) 46 | // Only warn every 15 mins if we haven't changed state 47 | .stateChangesOnly(15m) 48 | // Whenever we get an alert write it to a file. 49 | .log('/tmp/alerts.log') 50 | .slack() 51 | ``` 52 | Breaking it down: 53 | * This is a stream filter so it starts with “stream”. 54 | * Next, the script specifies where to pull its data from. In this case, the “isi_data_insights” database, which is the default database created and populated by the Connector. This script selects a single measurement: “cluster.protostats.nfs.total”, which contains the totaled (clusterwide as opposed to node-specific) NFS3 protocol statistics. 55 | * Next, the script specifies an “eval” node which takes the “time_avg” field of the measurement and divides it by 1000. Note that the statistics values are in microseconds. Hence, this node is converting the values to milliseconds. 56 | * Next, the script uses a “groupBy” node that groups by the measurement tag “cluster” because the statistics for each cluster are distinct (e.g. we don’t want a low value from one cluster resetting the alert threshold of another cluster). 57 | * Finally, the “alert” node. This is quite detailed (see next section for details). 58 | 59 | Alert node details: 60 | * First it defines the alert id that appears in the messages. In this case it will be <cluster name>/nfs_lat_alert 61 | * Next it defines the format of the message that appears in the alert. “.Level” is the alert level (crit, warn, info, ok). We index into the fields of the measurement to extract the “time_ms” field we generated to show the actual time value. 62 | * The “.crit” and “.warn” nodes define a Boolean lambda function that determines whether that alert level has been reached. In this case, we’re defining the critical level to be a latency of greater than 50ms, and the warning level to be a latency of greater than 20ms. 63 | * Lastly, the “.stateChangesOnly” property acts as a squelch: if the alert level hasn’t changed, the alert is re-sent at most every 15 minutes, so we don’t get spammed with messages every 30 seconds. 64 | * The ”.log” node simply logs these alerts to a local file (useful for testing). 65 | * In this case, the alert is configured to use the Slack channel. This can be changed to use “.email” if that has been configured in the /etc/kapacitor/kapacitor.conf file, or “.post” to use the HTTP POST method on a given URL. Numerous other alert channels are available. See the Kapacitor documentation for details. 66 | 67 | Provided the syntax is correct, and the correct command is used, the task should now be defined in Kapacitor.
However, it won’t be enabled: 68 | ```sh 69 | kapacitor list tasks 70 | ID Type Status Executing Databases and Retention Policies 71 | nfs_lat_alert stream disabled false ["isi_data_insights"."autogen"] 72 | ``` 73 | To enable the task, simply type: 74 | ```sh 75 | kapacitor enable nfs_lat_alert 76 | ``` 77 | The task should now be enabled: 78 | ```sh 79 | kapacitor list tasks 80 | ID Type Status Executing Databases and Retention Policies 81 | nfs_lat_alert stream enabled true ["isi_data_insights"."autogen"] 82 | ``` 83 | It’s possible to check the status of the task and see the results at each node in the script: 84 | ```sh 85 | kapacitor show nfs_lat_alert 86 | ID: nfs_lat_alert 87 | Error: 88 | Template: 89 | Type: stream 90 | Status: enabled 91 | Executing: true 92 | Created: 10 Aug 16 12:10 PDT 93 | Modified: 16 Aug 16 06:40 PDT 94 | LastEnabled: 16 Aug 16 06:40 PDT 95 | Databases Retention Policies: ["isi_data_insights"."autogen"] 96 | TICKscript: 97 | stream 98 | // Select avg NFS3 proto response time 99 | |from() 100 | .database('isi_data_insights') 101 | .measurement('cluster.protostats.nfs.total') 102 | |eval(lambda: float("time_avg") / 1000.0) 103 | .as('time_ms') 104 | |groupBy('cluster') 105 | |alert() 106 | .id('{{ index .Tags "cluster" }}/{{ .Name }}') 107 | .message('Average value of {{ .ID }} is {{ .Level}} value: {{ index .Fields "time_ms" }}ms') 108 | .crit(lambda: "time_ms" > 50.0) 109 | .warn(lambda: "time_ms" > 20.0) 110 | // Only warn every 15 mins if we haven't changed state 111 | .stateChangesOnly(15m) 112 | // Whenever we get an alert write it to a file. 113 | .log('/tmp/alerts.log') 114 | .slack() 115 | 116 | DOT: 117 | digraph nfs_lat_alert { 118 | graph [throughput="0.00 points/s"]; 119 | 120 | stream0 [avg_exec_time_ns="0" ]; 121 | stream0 -> from1 [processed="58279"]; 122 | 123 | from1 [avg_exec_time_ns="1.215s" ]; 124 | from1 -> eval2 [processed="58279"]; 125 | 126 | eval2 [avg_exec_time_ns="208.86s" eval_errors="0" ]; 127 | eval2 -> groupby3 [processed="58279"]; 128 | 129 | groupby3 [avg_exec_time_ns="28.392s" ]; 130 | groupby3 -> alert4 [processed="58279"]; 131 | 132 | alert4 [alerts_triggered="2457" avg_exec_time_ns="87.22134ms" crits_triggered="836" infos_triggered="0" oks_triggered="1008" warns_triggered="613" ]; 133 | } 134 | ``` 135 | 136 | This output shows that the script is working and triggering on events. The “DOT:” section can be rendered as a graph using the “GraphViz” package. 137 | 138 | This initial script works well, but is rather simplistic and, in particular, will alert on momentary spikes in load which may not be desirable. 139 | 140 | # Example TICK script patterns 141 | This section describes some examples for different types of alerting scripts. 
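When iterating on scripts like the ones below, it can help to capture a short window of live data once and replay it against a task while tuning thresholds, rather than waiting for new alerts to fire. The following is a rough sketch using the Kapacitor 1.x CLI; the task name is the example task defined above, <recording-id> is whatever ID the record command prints, and the exact flags may vary between Kapacitor versions:

```sh
# Record roughly 10 minutes of the live stream data that the task subscribes to.
# The command prints a recording ID when it finishes.
kapacitor record stream -task nfs_lat_alert -duration 10m

# Replay the recorded data against the task to exercise its alert logic.
kapacitor replay -recording <recording-id> -task nfs_lat_alert
```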
142 | 143 | # Moving average of measurement 144 | This is an example of a script that uses a moving window to average the statistic value over a recent window: 145 | ``` 146 | stream 147 | // Select avg NFS3 proto response time 148 | |from() 149 | .database('isi_data_insights') 150 | .measurement('cluster.protostats.nfs.total') 151 | |groupBy('cluster') 152 | |window() 153 | .period(10m) 154 | .every(1m) 155 | |mean('time_avg') 156 | .as('time_avg') 157 | |eval(lambda: float("time_avg") / 1000.0) 158 | .as('mean_ms') 159 | .keep('mean_ms', 'time_avg') 160 | |alert() 161 | .id('{{ index .Tags "cluster" }}/{{ .Name }}') 162 | .message('Windowed average of avg value of {{ .ID }} is {{ .Level}} value: {{ index .Fields "mean_ms" }}ms') 163 | .crit(lambda: "mean_ms" > 50.0) 164 | .warn(lambda: "mean_ms" > 25.0) 165 | // Only warn every 15 mins if we haven't changed state 166 | .stateChangesOnly(15m) 167 | // Whenever we get an alert write it to a file. 168 | .log('/tmp/alerts.log') 169 | .slack() 170 | ``` 171 | 172 | This script is similar to the previous script, but there are a few important differences: 173 | * The “window” node generates a window of data. With the values specified, we will keep and output the last 10 minutes of data every minute. 174 | * The window output is fed into a “mean” node that calculates the mean of the data fed (the last 10 minutes of data, in this case the “time_avg” field), and stores the result back as the “time_avg” field to be fed further down the pipeline. 175 | * The “eval” node converts the microsecond average field to a new “mean_ms” field. 176 | * The rest of the alert is similar to the previous example. 177 | 178 | # Joining/alerting based off two different measurements 179 | This script is an example. It alerts based off moving average, but only if the operation count is above a given threshold. It’s probably not safe to use this as the sole alerting mechanism because a deadlock (which will reduce the operation count to zero) won’t generate an alert. Additional scripts are provided below to look for deadlock events (“node.ifs.heat.deadlocked.total” measurement) and to alert if no data points have been collected in a configurable period. 
180 | 181 | ``` 182 | // Alert based off mean NFS3 proto response time if work is actually happening 183 | 184 | var timestream = stream 185 | |from() 186 | .database('isi_data_insights') 187 | .measurement('cluster.protostats.nfs.total') 188 | |groupBy('cluster') 189 | |window() 190 | .period(10m) 191 | .every(1m) 192 | |mean('time_avg') 193 | .as('time_avg') 194 | |eval(lambda: float("time_avg") / 1000.0) 195 | .as('mean_ms') 196 | 197 | var opstream = stream 198 | |from() 199 | .database('isi_data_insights') 200 | .measurement('cluster.protostats.nfs.total') 201 | |groupBy('cluster') 202 | |window() 203 | .period(10m) 204 | .every(1m) 205 | |mean('op_rate') 206 | .as('op_rate') 207 | 208 | timestream 209 | |join(opstream) 210 | .as('times', 'ops') 211 | |alert() 212 | .id('{{ index .Tags "cluster" }}/{{ .Name }}') 213 | .message('Cluster {{ index .Tags "cluster" }} is executing {{ index .Fields "ops.op_rate" }} NFSv3 operations per second and windowed average of avg value of {{ .Name }} is {{ .Level }} value: {{ index .Fields "times.mean_ms" }}ms') 214 | .crit(lambda: "ops.op_rate" > 1000 AND "times.mean_ms" > 25.0) 215 | .warn(lambda: "ops.op_rate" > 1000 AND "times.mean_ms" > 10.0) 216 | // .info(lambda: TRUE) 217 | // Only warn every 15 mins if we haven't changed state 218 | .stateChangesOnly(15m) 219 | // Whenever we get an alert write it to a file. 220 | .log('/tmp/alerts.log') 221 | .slack() 222 | ``` 223 | 224 | This script is significantly different to the previous examples. It uses variables to store the results of the two different streams that we sample, and then uses a “join” operation to create a stream with both sets of data for us to alert from. 225 | 226 | # Deadman alert to warn if data collection fails 227 | This script uses the Kapacitor “Deadman” node to warn when the collected/emitted point count falls below a defined threshold in a given period. Many of the statistics collected by the Connector are updated as frequently as every 30 seconds, but the overall collection period can be longer if many clusters are being monitored, if they are large, and/or if they are under heavy load. The script arbitrarily uses 5 minutes as the interval for this example. 228 | ``` 229 | // Deadman alert for cluster data collection 230 | var data = stream 231 | |from() 232 | .database('isi_data_insights') 233 | .measurement('cluster.health') 234 | .groupBy('cluster') 235 | 236 | data 237 | |deadman(1.0, 5m) 238 | .id ('Statistics data collection for cluster {{ index .Tags "cluster" }}') 239 | .slack() 240 | ``` 241 | 242 | This script will output alerts of the form: 243 | Statistics collection for cluster logserver is dead: 0.0 244 | or 245 | Statistics collection for cluster logserver is alive: 1.0 246 | 247 | # Deadlock event count alert 248 | This script uses one of the OneFS filesystem “heat” statistics to look for high rates of deadlocks within the filesystem. 
249 | ``` 250 | stream 251 | // Alert based off node heat stats 252 | |from() 253 | .database('isi_data_insights') 254 | .measurement('node.ifs.heat.deadlocked.total') 255 | |groupBy('cluster') 256 | |alert() 257 | .id('Deadlock event count') 258 | .message('Value of {{ .ID }} on cluster {{ index .Tags "cluster" }}, node {{ index .Tags "node" }} is {{ .Level }} value: {{ index .Fields "value" }}') 259 | .crit(lambda: "value" > 50.0) 260 | .warn(lambda: "value" > 10.0) 261 | // .info(lambda: TRUE) 262 | // Only warn every 15 mins if we haven't changed state 263 | .stateChangesOnly(15m) 264 | // Whenever we get an alert write it to a file. 265 | .log('/tmp/alerts.log') 266 | .slack() 267 | ``` 268 | 269 | # Other useful node types 270 | Kapacitor offers a number of useful processing nodes to filter the data. Examples that are of particular interest are: 271 | * Mean/median/mode – computes the various average types. 272 | * Max/min – selects the largest/smallest point. 273 | * MovingAverage – a relatively new function that would simplify our earlier example. 274 | * Stddev – computes the standard deviation of points. Useful to detect anomalies. 275 | * Sum – sums the points. 276 | * Deadman - useful to alert if the collector fails for some reason. It alerts if the points per interval drops below a given threshold. 277 | -------------------------------------------------------------------------------- /isi_data_insights_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains utility functions for configuring the IsiDataInsightsDaemon 3 | via command line args and config file. 4 | """ 5 | from __future__ import print_function 6 | from __future__ import division 7 | from future import standard_library 8 | 9 | standard_library.install_aliases() # noqa: E402 10 | from builtins import input 11 | from builtins import str 12 | from builtins import range 13 | from past.utils import old_div 14 | import argparse 15 | import configparser 16 | import getpass 17 | import logging 18 | import os 19 | import re 20 | import sys 21 | import urllib3 22 | 23 | from ast import literal_eval 24 | from Equation import Expression 25 | 26 | from isi_data_insights_daemon import ( 27 | StatsConfig, 28 | ClusterConfig, 29 | ClusterCompositeStatComputer, 30 | EquationStatComputer, 31 | PercentChangeStatComputer, 32 | DerivedStatInput, 33 | ) 34 | from isi_stats_client import IsiStatsClient 35 | import isi_sdk_utils 36 | 37 | 38 | LOG = logging.getLogger(__name__) 39 | 40 | DEFAULT_PID_FILE = "./isi_data_insights_d.pid" 41 | DEFAULT_LOG_FILE = "./isi_data_insights_d.log" 42 | DEFAULT_LOG_LEVEL = "INFO" 43 | # name of the section in the config file where the main/global settings for the 44 | # daemon are stored. 45 | MAIN_CFG_SEC = "isi_data_insights_d" 46 | # the number of seconds to wait between updates for stats that are 47 | # continually kept up-to-date. 48 | ONE_SEC = 1 # seconds 49 | # the default minimum update interval (even if a particular stat key is updated 50 | # at a higher rate than this we will still only query at this rate in order to 51 | # prevent the cluster from being overloaded with stat queries). 52 | MIN_UPDATE_INTERVAL = 30 # seconds 53 | # name of the config file param that can be used to specify a lower 54 | # MIN_UPDATE_INTERVAL. 
55 | MIN_UPDATE_INTERVAL_OVERRIDE_PARAM = "min_update_interval_override" 56 | 57 | 58 | def avg(stat_values): 59 | # XXX investigate if plain '/' is OK here 60 | return old_div(sum(stat_values), len(stat_values)) 61 | 62 | 63 | # operations use by ClusterCompositeStatComputer 64 | COMPOSITE_OPERATIONS = {"avg": avg, "max": max, "min": min, "sum": sum} 65 | 66 | # keep track of auth data that we have username and passwords for so that we 67 | # don't prompt more than once. 68 | g_cluster_auth_data = {} 69 | # keep track of the name and version of each cluster 70 | g_cluster_configs = {} 71 | 72 | 73 | def _add_cluster_auth_data(cluster_address, username, password, verify_ssl): 74 | # update cluster auth data 75 | g_cluster_auth_data[cluster_address] = (username, password, verify_ssl) 76 | 77 | 78 | def _process_config_file_clusters(clusters): 79 | cluster_list = [] 80 | cluster_configs = clusters.split() 81 | for cluster_config in cluster_configs: 82 | # default to insecure https 83 | verify_ssl = False 84 | 85 | # expected [username:password@]address[:bool] 86 | # the password can potentially contain ":" and "@" characters, split is done 87 | # from right side first and then left side to isolate out the password. 88 | at_split = cluster_config.rsplit("@", 1) 89 | if len(at_split) == 2: 90 | user_pass_split = at_split[0].split(":", 1) 91 | if len(user_pass_split) != 2: 92 | print( 93 | "Config file contains invalid cluster " 94 | "config: %s in %s (expected : " 95 | "prefix)." % (cluster_config, clusters), 96 | file=sys.stderr, 97 | ) 98 | sys.exit(1) 99 | username = user_pass_split[0] 100 | password = user_pass_split[1] 101 | else: 102 | username = None 103 | password = None 104 | verify_ssl_split = at_split[-1].split(":", 1) 105 | cluster_address = verify_ssl_split[0] 106 | if len(verify_ssl_split) > 1: 107 | try: 108 | # try to convert to a bool 109 | verify_ssl = literal_eval(verify_ssl_split[-1]) 110 | if type(verify_ssl) != bool: 111 | raise Exception 112 | except Exception: 113 | print( 114 | "Config file contains invalid cluster " 115 | "config: %s (expected True or False on end)" % cluster_config, 116 | file=sys.stderr, 117 | ) 118 | sys.exit(1) 119 | # add to cache of known cluster auth usernames and passwords 120 | _add_cluster_auth_data(cluster_address, username, password, verify_ssl) 121 | cluster_list.append(cluster_address) 122 | 123 | return cluster_list 124 | 125 | 126 | def _get_cluster_auth_data(cluster): 127 | try: 128 | username = password = verify_ssl = None 129 | # check if we already know the username and password 130 | username, password, verify_ssl = g_cluster_auth_data[cluster] 131 | if username is None or password is None or verify_ssl is None: 132 | # this happens when some of the auth params were provided in the 133 | # config file or cli, but not all. 
134 | raise KeyError 135 | except KeyError: 136 | # get username and password for input clusters 137 | if username is None: 138 | username = input( 139 | "Please provide the username used to access " + cluster + " via PAPI: " 140 | ) 141 | if password is None: 142 | password = getpass.getpass("Password: ") 143 | while verify_ssl is None: 144 | verify_ssl_resp = input("Verify SSL cert [y/n]: ") 145 | if verify_ssl_resp == "yes" or verify_ssl_resp == "y": 146 | verify_ssl = True 147 | elif verify_ssl_resp == "no" or verify_ssl_resp == "n": 148 | verify_ssl = False 149 | # add to cache of known cluster auth usernames and passwords 150 | _add_cluster_auth_data(cluster, username, password, verify_ssl) 151 | 152 | return username, password, verify_ssl 153 | 154 | 155 | def _query_cluster_name(cluster_address, isi_sdk, api_client): 156 | # get the Cluster API 157 | cluster_api = isi_sdk.ClusterApi(api_client) 158 | try: 159 | resp = cluster_api.get_cluster_identity() 160 | return resp.name 161 | except isi_sdk.rest.ApiException: 162 | # if get_cluster_identity() doesn't work just use the address 163 | return cluster_address 164 | 165 | 166 | def _build_cluster_configs(cluster_list): 167 | cluster_configs = [] 168 | for cluster in cluster_list: 169 | username, password, verify_ssl = _get_cluster_auth_data(cluster) 170 | 171 | if cluster in g_cluster_configs: 172 | cluster_name, isi_sdk, api_client, version = g_cluster_configs[cluster] 173 | else: 174 | if verify_ssl is False: 175 | urllib3.disable_warnings() 176 | try: 177 | isi_sdk, api_client, version = isi_sdk_utils.configure( 178 | cluster, username, password, verify_ssl 179 | ) 180 | except RuntimeError as exc: 181 | print( 182 | "Failed to configure SDK for " 183 | "cluster %s. Exception raised: %s" % (cluster, str(exc)), 184 | file=sys.stderr, 185 | ) 186 | sys.exit(1) 187 | print( 188 | "Configured %s as version %d cluster, using SDK %s." 189 | % (cluster, int(version), isi_sdk.__name__) 190 | ) 191 | cluster_name = _query_cluster_name(cluster, isi_sdk, api_client) 192 | g_cluster_configs[cluster] = cluster_name, isi_sdk, api_client, version 193 | 194 | cluster_config = ClusterConfig( 195 | cluster, cluster_name, version, isi_sdk, api_client 196 | ) 197 | cluster_configs.append(cluster_config) 198 | 199 | return cluster_configs 200 | 201 | 202 | def _configure_stat_group( 203 | daemon, 204 | update_interval, 205 | cluster_configs, 206 | stats_list, 207 | cluster_composite_stats=None, 208 | equation_stats=None, 209 | pct_change_stats=None, 210 | final_equation_stats=None, 211 | ): 212 | """ 213 | Configure the daemon with some StatsConfigs. 214 | """ 215 | # configure daemon with stats 216 | if update_interval < MIN_UPDATE_INTERVAL: 217 | LOG.warning( 218 | "The following stats are set to be queried at a faster " 219 | "rate, %d seconds, than the MIN_UPDATE_INTERVAL of %d " 220 | "seconds. To configure a shorter MIN_UPDATE_INTERVAL specify " 221 | "it with the %s param in the %s section of the config file. 
" 222 | "Stats:\n\t%s", 223 | update_interval, 224 | MIN_UPDATE_INTERVAL, 225 | MIN_UPDATE_INTERVAL_OVERRIDE_PARAM, 226 | MAIN_CFG_SEC, 227 | str(stats_list), 228 | ) 229 | update_interval = MIN_UPDATE_INTERVAL 230 | stats_config = StatsConfig(cluster_configs, stats_list, update_interval) 231 | if cluster_composite_stats is not None: 232 | stats_config.cluster_composite_stats.extend(cluster_composite_stats) 233 | if equation_stats is not None: 234 | stats_config.equation_stats.extend(equation_stats) 235 | if pct_change_stats is not None: 236 | stats_config.pct_change_stats.extend(pct_change_stats) 237 | if final_equation_stats is not None: 238 | stats_config.final_equation_stats.extend(final_equation_stats) 239 | daemon.add_stats(stats_config) 240 | 241 | 242 | def _query_stats_metadata(cluster, stat_names): 243 | """ 244 | Query the specified cluster for the metadata of the stats specified in 245 | stat_names list. 246 | """ 247 | stats_api = cluster.isi_sdk.StatisticsApi(cluster.api_client) 248 | isi_stats_client = IsiStatsClient(stats_api) 249 | return isi_stats_client.get_stats_metadata(stat_names) 250 | 251 | 252 | def _compute_stat_group_update_intervals( 253 | update_interval_multiplier, cluster_configs, stat_names, update_intervals 254 | ): 255 | # update interval is supposed to be set relative to the collection 256 | # interval, which might be different for each stat and each cluster. 257 | for cluster in cluster_configs: 258 | stats_metadata = _query_stats_metadata(cluster, stat_names) 259 | for stat_index in range(0, len(stats_metadata)): 260 | stat_metadata = stats_metadata[stat_index] 261 | stat_name = stat_names[stat_index] 262 | # cache time is the length of time the system will store the 263 | # value before it updates. 264 | cache_time = -1 265 | if stat_metadata.default_cache_time: 266 | cache_time = ( 267 | (stat_metadata.default_cache_time + 1) 268 | # add one to the default_cache_time because the new 269 | # value is not set until 1 second after the cache time. 270 | * update_interval_multiplier 271 | ) 272 | # the policy intervals seem to override the default cache time 273 | if stat_metadata.policies: 274 | smallest_interval = cache_time 275 | for policy in stat_metadata.policies: 276 | if smallest_interval == -1: 277 | smallest_interval = policy.interval 278 | else: 279 | smallest_interval = min(policy.interval, smallest_interval) 280 | cache_time = smallest_interval * update_interval_multiplier 281 | # if the cache_time is still -1 then it means that the statistic is 282 | # continually updated, so the fastest it can be queried is 283 | # once every second. 
284 | if cache_time == -1: 285 | cache_time = ONE_SEC * update_interval_multiplier 286 | try: 287 | update_interval = update_intervals[cache_time] 288 | update_interval[0].add(cluster) 289 | update_interval[1].add(stat_name) 290 | except KeyError: 291 | # insert a new interval time 292 | update_intervals[cache_time] = (set([cluster]), set([stat_name])) 293 | 294 | 295 | def _configure_stat_groups_via_file( 296 | daemon, config_file, stat_group, global_cluster_list 297 | ): 298 | cluster_list = [] 299 | cluster_list.extend(global_cluster_list) 300 | try: 301 | # process clusters specific to this stat group (if any) 302 | clusters_param = config_file.get(stat_group, "clusters") 303 | stat_group_clusters = _process_config_file_clusters(clusters_param) 304 | cluster_list.extend(stat_group_clusters) 305 | # remove duplicates 306 | cluster_list = list(set(cluster_list)) 307 | except configparser.NoOptionError: 308 | pass 309 | 310 | if len(cluster_list) == 0: 311 | print( 312 | "The %s stat group has no clusters to query." % stat_group, file=sys.stderr 313 | ) 314 | print( 315 | "You must provide either a global list of " 316 | "clusters to query for all stat groups, or a per-stat-" 317 | "group list of clusters, or both.", 318 | file=sys.stderr, 319 | ) 320 | sys.exit(1) 321 | 322 | cluster_configs = _build_cluster_configs(cluster_list) 323 | 324 | update_interval_param = config_file.get(stat_group, "update_interval") 325 | stat_names = config_file.get(stat_group, "stats").split() 326 | # remove duplicates 327 | stat_names = list(set(stat_names)) 328 | # deal with derived stats (if any) 329 | composite_stats = [] 330 | if config_file.has_option(stat_group, "composite_stats") is True: 331 | composite_stats = _parse_derived_stats( 332 | config_file, stat_group, "composite_stats", _parse_composite_stats 333 | ) 334 | 335 | eq_stats = [] 336 | if config_file.has_option(stat_group, "equation_stats") is True: 337 | eq_stats = _build_equation_stats_list(config_file, stat_group, "equation_stats") 338 | 339 | pct_change_stats = [] 340 | if config_file.has_option(stat_group, "percent_change_stats") is True: 341 | pct_change_stats = _parse_derived_stats( 342 | config_file, stat_group, "percent_change_stats", _parse_pct_change_stats 343 | ) 344 | 345 | final_eq_stats = [] 346 | if config_file.has_option(stat_group, "final_equation_stats") is True: 347 | final_eq_stats = _build_equation_stats_list( 348 | config_file, stat_group, "final_equation_stats" 349 | ) 350 | 351 | update_intervals = {} 352 | if update_interval_param.startswith("*"): 353 | try: 354 | update_interval_multiplier = ( 355 | 1 if update_interval_param == "*" else int(update_interval_param[1:]) 356 | ) 357 | except ValueError as exc: 358 | print( 359 | "Failed to parse update interval multiplier " 360 | "from %s stat group.\nERROR: %s" % (stat_group, str(exc)), 361 | file=sys.stderr, 362 | ) 363 | sys.exit(1) 364 | print("Computing update intervals for stat group: %s." 
% stat_group) 365 | _compute_stat_group_update_intervals( 366 | update_interval_multiplier, cluster_configs, stat_names, update_intervals 367 | ) 368 | else: 369 | try: 370 | update_interval = int(update_interval_param) 371 | except ValueError as exc: 372 | print( 373 | "Failed to parse update interval from %s " 374 | "stat group.\nERROR: %s" % (stat_group, str(exc)), 375 | file=sys.stderr, 376 | ) 377 | sys.exit(1) 378 | update_intervals[update_interval] = (cluster_configs, stat_names) 379 | 380 | # TODO - fix this - for now if there are derived stats then we are going to 381 | # query all the stats in this section at once (i.e. using the the smallest 382 | # of the configured update intervals) in order to make sure that all of the 383 | # input parameters of the derived stats are available at once. 384 | if ( 385 | len(composite_stats) > 0 386 | or len(eq_stats) > 0 387 | or len(pct_change_stats) > 0 388 | or len(final_eq_stats) > 0 389 | ): 390 | update_interval_keys = list(update_intervals.keys()) 391 | update_interval_keys.sort() 392 | update_interval = update_interval_keys[0] 393 | _configure_stat_group( 394 | daemon, 395 | update_interval, 396 | cluster_configs, 397 | stat_names, 398 | composite_stats, 399 | eq_stats, 400 | pct_change_stats, 401 | final_eq_stats, 402 | ) 403 | else: 404 | for update_interval, clusters_stats_tuple in update_intervals.items(): 405 | # first item in clusters_stats_tuple is the unique list of clusters 406 | # associated with the current update_interval, the second item is the 407 | # unique list of stats to query on the set of clusters at the current 408 | # update_interval. 409 | _configure_stat_group( 410 | daemon, 411 | update_interval, 412 | clusters_stats_tuple[0], 413 | clusters_stats_tuple[1], 414 | ) 415 | 416 | 417 | def _parse_derived_stats(config_file, stat_group, derived_stats_name, parse_func): 418 | derived_stats_cfg = config_file.get(stat_group, derived_stats_name) 419 | try: 420 | derived_stats = parse_func(derived_stats_cfg) 421 | except RuntimeError as rterr: 422 | print( 423 | "Failed to parse %s from %s " 424 | "section. %s" % (derived_stats_name, stat_group, str(rterr)), 425 | file=sys.stderr, 426 | ) 427 | sys.exit(1) 428 | 429 | return derived_stats 430 | 431 | 432 | def _parse_fields(in_stat_name): 433 | split_name = in_stat_name.split(":") 434 | if len(split_name) == 1: 435 | return in_stat_name, None 436 | 437 | return split_name[0], tuple(split_name[1:]) 438 | 439 | 440 | def _parse_composite_stats(composite_stats_cfg): 441 | # Example of what is expected for each stat_cfg: 442 | # sum(node.ifs.ops.in[:field1:field2]) 443 | composite_stats = [] 444 | for stat_cfg in composite_stats_cfg.split(): 445 | bracket1 = stat_cfg.find("(") 446 | bracket2 = stat_cfg.find(")") 447 | if bracket1 <= 0 or bracket2 == -1 or bracket1 > bracket2: 448 | raise RuntimeError( 449 | "Failed to parse operation from %s." 450 | "Expected: op(stat) where op is avg, min, max, " 451 | " or sum and stat is the name of a base OneFS " 452 | ' statistic name that starts with "node.".' % stat_cfg 453 | ) 454 | op_name = stat_cfg[0:bracket1] 455 | if op_name not in COMPOSITE_OPERATIONS: 456 | raise RuntimeError( 457 | "Invalid operation %s specified for %s." % (op_name, stat_cfg) 458 | ) 459 | 460 | in_stat_name = stat_cfg[bracket1 + 1:bracket2] 461 | if in_stat_name.startswith("node.") is False: 462 | raise RuntimeError( 463 | "Invalid stat name %s specified for %s." 464 | ' Composite stats must start with "node.".' 
% (op_name, stat_cfg) 465 | ) 466 | out_stat_name = "cluster.%s.%s" % (in_stat_name.replace(":", "."), op_name) 467 | in_stat_name, fields = _parse_fields(in_stat_name) 468 | # TODO should validate that this is a valid stat name 469 | composite_stat = ClusterCompositeStatComputer( 470 | DerivedStatInput(in_stat_name, fields), 471 | out_stat_name, 472 | COMPOSITE_OPERATIONS[op_name], 473 | ) 474 | composite_stats.append(composite_stat) 475 | 476 | return composite_stats 477 | 478 | 479 | def _build_equation_stats_list(config_file, stat_group, equation_stats): 480 | eq_stats = [] 481 | eq_stats_list = config_file.get(stat_group, equation_stats).split() 482 | for eq_stat in eq_stats_list: 483 | eq_stat_names = _parse_derived_stats( 484 | config_file, stat_group, eq_stat, _parse_equation_stats 485 | ) 486 | cfg_expression = config_file.get(stat_group, eq_stat) 487 | # the Equation package doesn't like having '.' characters in the 488 | # input param names, so we have to replace them with placeholder 489 | # names. 490 | eq_func = _build_equation_expression(cfg_expression, eq_stat_names) 491 | eq_stat_inputs = _build_equation_stat_inputs(eq_stat_names) 492 | eq_stats.append(EquationStatComputer(eq_func, eq_stat_inputs, eq_stat)) 493 | 494 | return eq_stats 495 | 496 | 497 | def _build_equation_stat_inputs(eq_stat_names): 498 | input_stats = [] 499 | for stat_name in eq_stat_names: 500 | stat_name, fields = _parse_fields(stat_name) 501 | input_stats.append(DerivedStatInput(stat_name, fields)) 502 | 503 | return input_stats 504 | 505 | 506 | def _parse_equation_stats(equation_stat_expression): 507 | # Example of what is expected: 508 | # (cluster.node.ifs.ops.in.sum + cluster.node.ifs.ops.out.sum) 509 | # * cluster.node.disk.iosched.latency.avg.avg 510 | # Example of what is expected from stat with specific fields: 511 | # (cluster.protostats.nfs.total:op_count 512 | # + cluster.protostats.smb2.total:op_count) 513 | equation_stats = re.findall("[a-zA-Z.:_0-9]+", equation_stat_expression) 514 | 515 | # remove items that don't start with an alphabet character 516 | equation_stats = [eq_stat for eq_stat in equation_stats if eq_stat[0].isalpha()] 517 | return equation_stats 518 | 519 | 520 | def _build_equation_expression(cfg_expression, eq_stat_names): 521 | params_list = [] 522 | for eindex in range(0, len(eq_stat_names)): 523 | eq_stat_name = eq_stat_names[eindex] 524 | param_name = "param" + str(eindex) 525 | cfg_expression = cfg_expression.replace(eq_stat_name, param_name, 1) 526 | params_list.append(param_name) 527 | 528 | return Expression(cfg_expression, params_list) 529 | 530 | 531 | def _parse_pct_change_stats(pct_change_stats_cfg): 532 | # Expected is just a white-space delimitted list of stat names 533 | pct_change_stats = [] 534 | for stat_name in pct_change_stats_cfg.split(): 535 | out_stat_name = stat_name.replace(":", ".") + ".percentchange" 536 | stat_name, fields = _parse_fields(stat_name) 537 | pct_change_stats.append( 538 | PercentChangeStatComputer( 539 | DerivedStatInput(stat_name, fields), out_stat_name 540 | ) 541 | ) 542 | return pct_change_stats 543 | 544 | 545 | def _configure_stat_groups_via_cli(daemon, args): 546 | if len(args.stat_groups) == 0: 547 | print( 548 | "You must provide a set of stats to query via " 549 | "the --stats command line argument or a configuration file.", 550 | file=sys.stderr, 551 | ) 552 | sys.exit(1) 553 | 554 | if not args.update_intervals: 555 | # for some reason if i try to use default=[MIN_UPDATE_INTERVAL] in the 556 | # argparser for the 
update_intervals arg then my list always has a 557 | # MIN_UPDATE_INTERVAL in addition to any intervals actually provided by 558 | # the user on the command line, so i need to setup the default here 559 | args.update_intervals.append(MIN_UPDATE_INTERVAL) 560 | 561 | if len(args.stat_groups) != len(args.update_intervals): 562 | print( 563 | "The number of update intervals must be the " 564 | + "same as the number of stat groups.", 565 | file=sys.stderr, 566 | ) 567 | sys.exit(1) 568 | 569 | cluster_list = args.clusters.split(",") 570 | # if args.clusters is the empty string then 1st element will be empty 571 | if cluster_list[0] == "": 572 | print("Please provide at least one input cluster.", file=sys.stderr) 573 | sys.exit(1) 574 | 575 | # remove duplicates 576 | cluster_list = list(set(cluster_list)) 577 | cluster_configs = _build_cluster_configs(cluster_list) 578 | 579 | for index in range(0, len(args.stat_groups)): 580 | stats_list = args.stat_groups[index].split(",") 581 | # split always results in at least one item, so check if the first 582 | # item is empty to validate the stats input arg 583 | if stats_list[0] == "": 584 | print("Please provide at least one stat name.", file=sys.stderr) 585 | sys.exit(1) 586 | update_interval = args.update_intervals[index] 587 | _configure_stat_group(daemon, update_interval, cluster_configs, stats_list) 588 | 589 | 590 | def _configure_stats_processor(daemon, stats_processor, processor_args): 591 | try: 592 | processor = __import__(stats_processor, fromlist=[""]) 593 | except ImportError: 594 | print("Unable to load stats processor: %s." % stats_processor, file=sys.stderr) 595 | sys.exit(1) 596 | 597 | try: 598 | arg_list = processor_args.split(" ") if processor_args != "" else [] 599 | daemon.set_stats_processor(processor, arg_list) 600 | except AttributeError as exception: 601 | print( 602 | "Failed to configure %s as stats processor. %s" 603 | % (stats_processor, str(exception)), 604 | file=sys.stderr, 605 | ) 606 | sys.exit(1) 607 | 608 | 609 | def _log_level_str_to_enum(log_level): 610 | if log_level.upper() == "DEBUG": 611 | return logging.DEBUG 612 | elif log_level.upper() == "INFO": 613 | return logging.INFO 614 | elif log_level.upper() == "WARNING": 615 | return logging.WARNING 616 | elif log_level.upper() == "ERROR": 617 | return logging.ERROR 618 | elif log_level.upper() == "CRITICAL": 619 | return logging.CRITICAL 620 | else: 621 | print("Invalid logging level: " + log_level + ", setting to INFO.") 622 | return logging.INFO 623 | 624 | 625 | def _update_args_with_config_file(config_file, args): 626 | # command line args override config file params 627 | if args.pid_file is None and config_file.has_option(MAIN_CFG_SEC, "pid_file"): 628 | args.pid_file = config_file.get(MAIN_CFG_SEC, "pid_file") 629 | if args.log_file is None and config_file.has_option(MAIN_CFG_SEC, "log_file"): 630 | args.log_file = config_file.get(MAIN_CFG_SEC, "log_file") 631 | if args.log_level is None and config_file.has_option(MAIN_CFG_SEC, "log_level"): 632 | args.log_level = config_file.get(MAIN_CFG_SEC, "log_level") 633 | 634 | 635 | def _print_stat_groups(daemon): 636 | """ 637 | Print out the list of stat sets that were configured for the daemon prior 638 | to starting it so that user can verify that it was configured as expected. 
639 | """ 640 | for update_interval, stat_set in daemon.get_next_stat_set(): 641 | msg = ( 642 | "Configured stat set:\n\tClusters: %s\n\t" 643 | "Update Interval: %d\n\tStat Keys: %s" 644 | % (str(stat_set.cluster_configs), update_interval, str(stat_set.stats)) 645 | ) 646 | # print it to stdout and the log file. 647 | print(msg) 648 | LOG.debug(msg) 649 | 650 | 651 | def configure_via_file(daemon, args, config_file): 652 | """ 653 | Configure the daemon's stat groups and the stats processor via command line 654 | arguments and configuration file. The command line args override settings 655 | provided in the config file. 656 | """ 657 | # Command line args override config file params 658 | if ( 659 | not args.stats_processor 660 | and config_file.has_option(MAIN_CFG_SEC, "stats_processor") is True 661 | ): 662 | args.stats_processor = config_file.get(MAIN_CFG_SEC, "stats_processor") 663 | if ( 664 | not args.processor_args 665 | and config_file.has_option(MAIN_CFG_SEC, "stats_processor_args") is True 666 | ): 667 | args.processor_args = config_file.get(MAIN_CFG_SEC, "stats_processor_args") 668 | _configure_stats_processor(daemon, args.stats_processor, args.processor_args) 669 | 670 | # check if the MAIN_CFG_SEC has the MIN_UPDATE_INTERVAL_OVERRIDE_PARAM 671 | if config_file.has_option(MAIN_CFG_SEC, MIN_UPDATE_INTERVAL_OVERRIDE_PARAM): 672 | global MIN_UPDATE_INTERVAL 673 | try: 674 | override_update_interval = int( 675 | config_file.get(MAIN_CFG_SEC, MIN_UPDATE_INTERVAL_OVERRIDE_PARAM) 676 | ) 677 | except ValueError as exc: 678 | print( 679 | "Failed to parse %s from %s " 680 | "section.\nERROR: %s" 681 | % (MIN_UPDATE_INTERVAL_OVERRIDE_PARAM, MAIN_CFG_SEC, str(exc)), 682 | file=sys.stderr, 683 | ) 684 | sys.exit(1) 685 | 686 | LOG.warning( 687 | "Overriding MIN_UPDATE_INTERVAL of %d seconds with " "%d seconds.", 688 | MIN_UPDATE_INTERVAL, 689 | override_update_interval, 690 | ) 691 | MIN_UPDATE_INTERVAL = override_update_interval 692 | 693 | # if there are any clusters, stats, or update_intervals specified via CLI 694 | # then try to configure the daemon using them first. 695 | if args.update_intervals or args.stat_groups or args.clusters: 696 | _configure_stat_groups_via_cli(daemon, args) 697 | global_cluster_list = [] 698 | if args.clusters: 699 | global_cluster_list = args.clusters.split(",") 700 | elif config_file.has_option(MAIN_CFG_SEC, "clusters"): 701 | global_cluster_list = _process_config_file_clusters( 702 | config_file.get(MAIN_CFG_SEC, "clusters") 703 | ) 704 | # remove duplicates 705 | global_cluster_list = list(set(global_cluster_list)) 706 | 707 | # now configure with config file params too 708 | if config_file.has_option(MAIN_CFG_SEC, "active_stat_groups"): 709 | active_stat_groups = config_file.get(MAIN_CFG_SEC, "active_stat_groups").split() 710 | for stat_group in active_stat_groups: 711 | _configure_stat_groups_via_file( 712 | daemon, config_file, stat_group, global_cluster_list 713 | ) 714 | 715 | # check that at least one stat group was added to the daemon. 716 | if daemon.get_stat_set_count() == 0: 717 | print( 718 | "Please provide stat groups to query via " 719 | "command line args or via config file parameters.", 720 | file=sys.stderr, 721 | ) 722 | sys.exit(1) 723 | 724 | _print_stat_groups(daemon) 725 | 726 | 727 | def configure_via_cli(daemon, args): 728 | """ 729 | Configure the daemon's stat groups and the stats processor via command line 730 | arguments. 
731 | """ 732 | _configure_stat_groups_via_cli(daemon, args) 733 | _configure_stats_processor(daemon, args.stats_processor, args.processor_args) 734 | 735 | _print_stat_groups(daemon) 736 | 737 | 738 | def configure_logging_via_cli(args): 739 | """ 740 | Setup the logging from command line args. 741 | """ 742 | if args.action != "debug": 743 | if args.log_file is None: 744 | args.log_file = DEFAULT_LOG_FILE 745 | 746 | parent_dir = os.path.dirname(args.log_file) 747 | if parent_dir and os.path.exists(parent_dir) is False: 748 | print("Invalid log file path: %s." % (args.log_file), file=sys.stderr) 749 | sys.exit(1) 750 | 751 | if args.log_level is None: 752 | args.log_level = DEFAULT_LOG_LEVEL 753 | 754 | log_level = _log_level_str_to_enum(args.log_level) 755 | logging.basicConfig( 756 | filename=args.log_file, 757 | level=log_level, 758 | format="%(asctime)s:%(name)s:%(levelname)s: %(message)s", 759 | ) 760 | else: # configure logging to stdout for 'debug' action 761 | logging.basicConfig( 762 | stream=sys.stdout, 763 | level=logging.DEBUG, 764 | format="%(asctime)s:%(name)s:%(levelname)s: %(message)s", 765 | ) 766 | 767 | 768 | def configure_args_via_file(args): 769 | """ 770 | Load the config_file, if there is one, then check if the pid_file, 771 | log_file, and log_level parameters are provided in the config file. If they 772 | are and they are not set via CLI args then use the config file to set them. 773 | """ 774 | config_file = None 775 | if args.config_file is not None: 776 | try: 777 | config_file = configparser.RawConfigParser() 778 | with open(args.config_file, "r") as cfg_fp: 779 | config_file.readfp(cfg_fp) 780 | except Exception as exc: 781 | print( 782 | "Failed to parse config file: %s.\n" 783 | "ERROR:\n%s." % (args.config_file, str(exc)), 784 | file=sys.stderr, 785 | ) 786 | sys.exit(1) 787 | _update_args_with_config_file(config_file, args) 788 | return config_file 789 | 790 | 791 | def process_pid_file_arg(pid_file, action): 792 | """ 793 | Make sure the pid_file argument is a valid path. Set it to the default if 794 | it was not specified. 795 | """ 796 | if pid_file is None: 797 | pid_file = DEFAULT_PID_FILE 798 | 799 | parent_dir = os.path.dirname(pid_file) 800 | if parent_dir and os.path.exists(parent_dir) is False: 801 | print("Invalid pid file path: %s." % pid_file, file=sys.stderr) 802 | sys.exit(1) 803 | 804 | pid_file_path = os.path.abspath(pid_file) 805 | if (action == "stop" or action == "restart") and os.path.exists( 806 | pid_file_path 807 | ) is False: 808 | print("Invalid pid file path: %s." % pid_file, file=sys.stderr) 809 | sys.exit(1) 810 | 811 | return pid_file_path 812 | 813 | 814 | def parse_cli(): 815 | """ 816 | Setup the command line args and parse them. 817 | """ 818 | argparser = argparse.ArgumentParser( 819 | description="Starts, stops, or restarts the " "isi_data_insights_daemon." 820 | ) 821 | argparser.add_argument( 822 | "action", 823 | help="Specifies to 'start', 'stop', " "'restart', or 'debug' the daemon.", 824 | ) 825 | argparser.add_argument( 826 | "-c", 827 | "--config-file", 828 | dest="config_file", 829 | help="Set the path to the config file. 
The default value is " 830 | "'./isi_data_insights_d.cfg'.", 831 | action="store", 832 | default="./isi_data_insights_d.cfg", 833 | ) 834 | argparser.add_argument( 835 | "-a", 836 | "--processor-args", 837 | dest="processor_args", 838 | help="Specifies the args to pass to the start function of the " 839 | "results processor's start function.", 840 | action="store", 841 | default="", 842 | ) 843 | argparser.add_argument( 844 | "-l", 845 | "--log-file", 846 | dest="log_file", 847 | help="Set the path to the log file. The default value is " 848 | "'./isi_data_insights_d.log'.", 849 | action="store", 850 | default=None, 851 | ) 852 | argparser.add_argument( 853 | "-e", 854 | "--log-level", 855 | dest="log_level", 856 | help="Set the logging level (debug, info, warning, error, or " "critical).", 857 | action="store", 858 | default=None, 859 | ) 860 | argparser.add_argument( 861 | "-p", 862 | "--pid-file", 863 | dest="pid_file", 864 | help="Set the path to the daemon pid file. The default value is " 865 | "'./isi_data_insights_d.pid'.", 866 | action="store", 867 | default=None, 868 | ) 869 | argparser.add_argument( 870 | "-x", 871 | "--stats-processor", 872 | dest="stats_processor", 873 | help="Name of the Python module used to process stats query " 874 | "results. The specified Python module must define " 875 | "a function named process(results_list) where results_list is a" 876 | "list of isi_sdk.models.statistics_current_stat objects." 877 | "StatisticsCurrentStat objects. The module may also optionally " 878 | "define start(args) and stop() functions. Use the " 879 | "--processor-args to specify args to pass to the results " 880 | "processor's start function.", 881 | action="store", 882 | default=None, 883 | ) 884 | argparser.add_argument( 885 | "-i", 886 | "--input-clusters", 887 | dest="clusters", 888 | help="Comma delimitted list of clusters to monitor (either " 889 | "hostnames or ip-addresses)", 890 | action="store", 891 | default="", 892 | ) 893 | argparser.add_argument( 894 | "-s", 895 | "--stats", 896 | dest="stat_groups", 897 | help="Comma delimitted list of stat names to monitor. Accepts" "multiple.", 898 | default=[], 899 | action="append", 900 | ) 901 | argparser.add_argument( 902 | "-u", 903 | "--update-interval", 904 | dest="update_intervals", 905 | help="Specifies how often, in seconds, the input clusters should " 906 | "be polled for each stat group. 
Accepts multiple.", 907 | action="append", 908 | default=[], 909 | type=int, 910 | ) 911 | 912 | return argparser.parse_args() 913 | -------------------------------------------------------------------------------- /isi_data_insights_daemon.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from builtins import str 3 | from builtins import range 4 | from past.utils import old_div 5 | from builtins import object 6 | import gevent 7 | import gevent.pool 8 | 9 | from daemons.prefab import run 10 | from ast import literal_eval 11 | import logging 12 | import sys 13 | import time 14 | import urllib3.exceptions 15 | 16 | from isi_stats_client import IsiStatsClient 17 | 18 | MAX_ASYNC_QUERIES = 20 19 | 20 | LOG = logging.getLogger(__name__) 21 | 22 | 23 | class ClusterConfig(object): 24 | def __init__(self, address, name, version, isi_sdk, api_client): 25 | self.address = address 26 | self.name = name 27 | self.version = version 28 | self.isi_sdk = isi_sdk 29 | self.api_client = api_client 30 | 31 | def __eq__(self, other): 32 | """ 33 | Override __eq__ so that we can store this in a list and check for its 34 | existence. 35 | """ 36 | return self.address == other.address 37 | 38 | def __hash__(self): 39 | """ 40 | Override __hash__ so that we can store this in a dict. 41 | """ 42 | return hash(str(self)) 43 | 44 | def __repr__(self): 45 | return self.name 46 | 47 | 48 | class DerivedStatsProcessor(object): 49 | def __init__(self, derived_stat_computers): 50 | self._derived_stat_computers = derived_stat_computers 51 | 52 | def begin_process(self, cluster_name): 53 | for derived_stat_computer in self._derived_stat_computers: 54 | derived_stat_computer.begin_process(cluster_name) 55 | 56 | def select_stat(self, stat): 57 | for derived_stat_computer in self._derived_stat_computers: 58 | derived_stat_computer.select_stat(stat) 59 | 60 | def end_process(self, cluster_name): 61 | for derived_stat_computer in self._derived_stat_computers: 62 | derived_stat_computer.end_process(cluster_name) 63 | 64 | def stats(self): 65 | for derived_stat_computer in self._derived_stat_computers: 66 | yield derived_stat_computer 67 | 68 | 69 | class DerivedStatComputer(object): 70 | def __init__(self, out_stat_name): 71 | self._initialize() 72 | self.out_stat_name = out_stat_name 73 | 74 | def _initialize(self): 75 | self._selected_stat_timestamps = {} 76 | self._selected_stat_errors = {} 77 | 78 | def begin_process(self, cluster_name): 79 | self._initialize() 80 | 81 | def end_process(self, cluster_name): 82 | pass 83 | 84 | def process(self, stat): 85 | pass 86 | 87 | def _choose_stat(self, stat): 88 | LOG.debug("Choose stat: %s", stat.key) 89 | try: 90 | self._selected_stat_timestamps[stat.devid].append(int(stat.time)) 91 | except KeyError: 92 | self._selected_stat_timestamps[stat.devid] = [int(stat.time)] 93 | 94 | def _create_derived_stat(self, value, devid=0, error=None): 95 | class DerivedStat(object): 96 | """ Pretend to be a Stat returned by PAPI """ 97 | 98 | def __init__(self, key, val, node, timestamp, err): 99 | self.key = key 100 | self.value = val 101 | self.devid = node 102 | self.time = timestamp 103 | self.error = err 104 | self.error_code = None if error is None else 1 105 | 106 | avg_timestamp = 0 107 | if error is not None: 108 | try: 109 | avg_timestamp = self._get_timestamp_avg(devid) 110 | except ZeroDivisionError: 111 | error = ( 112 | "Caught ZeroDivisionError from _get_timestamp_avg " 113 | "for stat %s on node %s." 
% (self.out_stat_name, str(devid)) 114 | ) 115 | 116 | return DerivedStat(self.out_stat_name, value, devid, avg_timestamp, error) 117 | 118 | def _get_timestamp_avg(self, devid): 119 | if devid not in self._selected_stat_timestamps and devid == 0: 120 | tot = 0 121 | tot_count = 0 122 | for node in self._selected_stat_timestamps: 123 | tot += sum(self._selected_stat_timestamps[node]) 124 | tot_count += len(self._selected_stat_timestamps[node]) 125 | return int(old_div(tot, tot_count)) 126 | return int( 127 | old_div( 128 | sum(self._selected_stat_timestamps[devid]), 129 | len(self._selected_stat_timestamps[devid]), 130 | ) 131 | ) 132 | 133 | 134 | class DerivedStatInput(object): 135 | def __init__(self, stat_name, stat_fields=()): 136 | self.name = stat_name 137 | if stat_fields and len(stat_fields) > 0: 138 | self._stat_fields = stat_fields 139 | else: 140 | self._stat_fields = None 141 | 142 | def _lookup(self, stat_value, field=None, *fields): 143 | if fields: 144 | # if stat_value is not a dict or list then this will raise 145 | # exception, which is what we want it to do. 146 | if type(stat_value) == dict: 147 | return self._lookup(stat_value.get(field, {}), *fields) 148 | else: 149 | return self._lookup(stat_value[field], *fields) 150 | return stat_value.get(field) 151 | 152 | def get_value(self, stat_value): 153 | if self._stat_fields is not None: 154 | # PAPI has a weird habit of putting stats that have only 1 value 155 | # into a list. When that happens we just ignore the list 156 | if type(stat_value) == list: 157 | num_items = len(stat_value) 158 | if num_items == 1: 159 | stat_value = stat_value[0] 160 | elif num_items == 0: 161 | return None 162 | return self._lookup(stat_value, *self._stat_fields) 163 | return stat_value 164 | 165 | @property 166 | def full_name(self): 167 | return self._get_full_name(self.name) 168 | 169 | def _get_full_name(self, stat_name): 170 | if self._stat_fields is not None: 171 | full_name = stat_name 172 | full_name += ":" 173 | full_name += ":".join(self._stat_fields) 174 | else: 175 | full_name = stat_name 176 | return full_name 177 | 178 | 179 | class ClusterCompositeStatComputer(DerivedStatComputer): 180 | def __init__(self, input_stat, out_stat_name, operation): 181 | super(ClusterCompositeStatComputer, self).__init__(out_stat_name) 182 | self._input_stat = input_stat 183 | self._operation = operation 184 | 185 | def _initialize(self): 186 | super(ClusterCompositeStatComputer, self)._initialize() 187 | self._selected_stat_values = [] 188 | 189 | def select_stat(self, stat): 190 | if stat.key == self._input_stat.name: 191 | self._selected_stat_values.append(self._input_stat.get_value(stat.value)) 192 | self._choose_stat(stat) 193 | 194 | def compute_derived_stat(self): 195 | LOG.debug( 196 | "CCSC %s(%s)", 197 | str(self._operation.__name__), 198 | str(self._selected_stat_values), 199 | ) 200 | return self._create_derived_stat(self._operation(self._selected_stat_values)) 201 | 202 | 203 | class EquationStatComputer(DerivedStatComputer): 204 | def __init__(self, eq_func, input_stats, out_stat_name): 205 | super(EquationStatComputer, self).__init__(out_stat_name) 206 | self._eq_func = eq_func 207 | self._num_func_args = len(input_stats) 208 | self._input_stats = input_stats 209 | self._input_stats_names = {} 210 | self._input_stat_locations = {} 211 | for index in range(0, self._num_func_args): 212 | input_stat = self._input_stats[index] 213 | # setup mapping from base stat name to input_stat 214 | try: 215 | # there might be multiple fields from 
a single stat with this 216 | # name so we need to keep a list of input_stats 217 | self._input_stats_names[input_stat.name].append(input_stat) 218 | except KeyError: 219 | self._input_stats_names[input_stat.name] = [input_stat] 220 | # setup mapping from name to location(s) in the equation 221 | try: 222 | self._input_stat_locations[input_stat.full_name].append(index) 223 | except KeyError: 224 | self._input_stat_locations[input_stat.full_name] = [index] 225 | 226 | def _initialize(self): 227 | super(EquationStatComputer, self)._initialize() 228 | self._selected_stat_values = {} 229 | self._nodes = set() 230 | 231 | def select_stat(self, stat): 232 | # check if this stat is included in this equation 233 | try: 234 | input_stats = self._input_stats_names[stat.key] 235 | # if there is an entry for this stat then it is part of my equation 236 | self._choose_stat(stat) 237 | self._nodes.add(stat.devid) 238 | except KeyError: 239 | return 240 | for input_stat in input_stats: 241 | try: 242 | selected_stats_by_node = self._selected_stat_values[ 243 | input_stat.full_name 244 | ] 245 | except KeyError: 246 | self._selected_stat_values[input_stat.full_name] = {} 247 | selected_stats_by_node = self._selected_stat_values[ 248 | input_stat.full_name 249 | ] 250 | 251 | try: 252 | selected_stats_by_node[stat.devid] = input_stat.get_value(stat.value) 253 | except KeyError: 254 | selected_stats_by_node = {} 255 | selected_stats_by_node[stat.devid] = input_stat.get_value(stat.value) 256 | 257 | def compute_derived_stats(self): 258 | # return one derived stat per node that the selected stats were 259 | # collected for. 260 | derived_stats = [] 261 | for node in self._nodes: 262 | # for each node build a tuple of the args to the equation 263 | # by iterating through the intput stat names 264 | func_args = [None] * self._num_func_args 265 | for in_stat_name in self._input_stat_locations.keys(): 266 | stat_node = node 267 | if in_stat_name.startswith("cluster.") is True: 268 | stat_node = 0 # this is a cluster stat 269 | stat_value = self._get_stat_value(in_stat_name, stat_node) 270 | in_arg_locations = self._input_stat_locations[in_stat_name] 271 | for in_arg_loc in in_arg_locations: 272 | func_args[in_arg_loc] = stat_value 273 | # if there is at least one non-None arg then convert the Nones to 274 | # zero and try to do the computation. If all are None then skip it. 275 | if self._null_to_zero(func_args) is False: 276 | # failed to get this stat, so return error for it 277 | derived_stat = self._create_derived_stat( 278 | None, 279 | node, 280 | "Failed to get equation input for %s, " 281 | "input params: %s." 
% (self.out_stat_name, tuple(func_args)), 282 | ) 283 | else: 284 | try: 285 | func_args_tuple = tuple(func_args) 286 | LOG.debug( 287 | "EQS [%s]=%s(%s)", 288 | str(node), 289 | str(self._eq_func), 290 | str(func_args_tuple), 291 | ) 292 | derived_stat_value = self._eq_func(*func_args_tuple) 293 | derived_stat = self._create_derived_stat(derived_stat_value, node) 294 | except Exception as exception: 295 | derived_stat = self._create_derived_stat( 296 | None, 297 | node, 298 | error="Exception caught evaluating " 299 | "expression for %s, input " 300 | "params: %s, exception: %s" 301 | % (self.out_stat_name, str(func_args_tuple), str(exception)), 302 | ) 303 | derived_stats.append(derived_stat) 304 | 305 | return derived_stats 306 | 307 | def _null_to_zero(self, func_args): 308 | null_args = [] 309 | # since we don't know the type do some math to get zero in the correct 310 | # data type from one of the non-zero values 311 | zero = None 312 | for aindex in range(0, self._num_func_args): 313 | farg = func_args[aindex] 314 | if farg is None: 315 | null_args.append(aindex) 316 | else: 317 | zero = farg - farg 318 | 319 | if len(null_args) == self._num_func_args: 320 | # all the args are null so return False - we can't compute this 321 | # equation 322 | return False 323 | # go back through and set null args to zero 324 | for aindex in null_args: 325 | func_args[aindex] = zero 326 | 327 | return True 328 | 329 | def _get_stat_value(self, stat_name, node): 330 | try: 331 | return self._selected_stat_values[stat_name][node] 332 | except KeyError: 333 | return None 334 | 335 | 336 | class PercentChangeStatComputer(DerivedStatComputer): 337 | def __init__(self, input_stat, out_stat_name): 338 | super(PercentChangeStatComputer, self).__init__(out_stat_name) 339 | self._input_stat = input_stat 340 | # per node/cluster value 341 | self._cur_values = {} 342 | self._prev_values = {} 343 | 344 | def begin_process(self, cluster_name): 345 | super(PercentChangeStatComputer, self).begin_process(cluster_name) 346 | self._cur_cluster_name = cluster_name 347 | self._cur_values = {} 348 | 349 | def end_process(self, cluster_name): 350 | super(PercentChangeStatComputer, self).end_process(cluster_name) 351 | self._prev_values[cluster_name] = self._cur_values 352 | 353 | def select_stat(self, stat): 354 | if stat.key == self._input_stat.name: 355 | self._cur_values[stat.devid] = self._input_stat.get_value(stat.value) 356 | self._choose_stat(stat) 357 | 358 | def compute_derived_stats(self): 359 | derived_stats = [] 360 | for node in self._cur_values: 361 | try: 362 | cur_value = self._cur_values[node] 363 | except KeyError: 364 | cur_value = None 365 | if cur_value is None: 366 | derived_stat = self._create_derived_stat( 367 | None, 368 | node, 369 | error="Unable to determine current value " 370 | "of input stat: %s" % self._input_stat.full_name, 371 | ) 372 | else: 373 | try: 374 | prev_values = self._prev_values[self._cur_cluster_name] 375 | # TREAT no previous value as zero? 
376 | prev_value = prev_values[node] 377 | LOG.debug( 378 | "PCS [%s]=(%s / %s) - 1", 379 | str(node), 380 | str(cur_value), 381 | str(prev_value), 382 | ) 383 | try: 384 | derived_stat_value = ( 385 | old_div(float(cur_value), float(prev_value)) 386 | ) - 1 387 | except ZeroDivisionError: 388 | if cur_value == 0 or cur_value == 0.0: 389 | # prev_value and cur_value == 0 390 | derived_stat_value = 0.0 391 | else: 392 | derived_stat_value = ( 393 | old_div(float(prev_value), float(cur_value)) 394 | ) - 1 395 | derived_stat_value *= -1.0 396 | derived_stat_value *= 100.0 397 | except KeyError: 398 | # no previous value will cause a KeyError 399 | # so return 0% change 400 | derived_stat_value = 0.0 401 | derived_stat = self._create_derived_stat(derived_stat_value, node) 402 | derived_stats.append(derived_stat) 403 | 404 | return derived_stats 405 | 406 | 407 | class StatsConfig(object): 408 | def __init__(self, cluster_configs, stats, update_interval): 409 | self.cluster_configs = cluster_configs 410 | self.stats = stats 411 | self.update_interval = update_interval 412 | self.cluster_composite_stats = [] 413 | self.equation_stats = [] 414 | self.pct_change_stats = [] 415 | self.final_equation_stats = [] 416 | 417 | 418 | class StatSet(object): 419 | def __init__(self): 420 | self.cluster_configs = [] 421 | self.stats = set() 422 | self.cluster_composite_stats = [] 423 | self.equation_stats = [] 424 | self.pct_change_stats = [] 425 | self.final_equation_stats = [] 426 | 427 | 428 | class UpdateInterval(object): 429 | def __init__(self, interval): 430 | self.interval = interval 431 | self.last_update = 0.0 432 | 433 | 434 | class IsiDataInsightsDaemon(run.RunDaemon): 435 | """ 436 | Periodically query a list of OneFS clusters for statistics and 437 | process them via a configurable stats processor module. 438 | """ 439 | 440 | def __init__(self, pidfile): 441 | """ 442 | Initialize. 443 | :param: pidfile is the path to the daemon's pidfile (required). 444 | """ 445 | super(IsiDataInsightsDaemon, self).__init__(pidfile=pidfile) 446 | self._stat_sets = {} 447 | self._update_intervals = [] 448 | self._stats_processor = None 449 | self._stats_processor_args = None 450 | self._process_stats_func = None 451 | self.async_worker_pool = gevent.pool.Pool(MAX_ASYNC_QUERIES) 452 | 453 | def set_stats_processor(self, stats_processor, processor_args): 454 | self._stats_processor = stats_processor 455 | self._stats_processor_args = processor_args 456 | if hasattr(stats_processor, "process_stat") is True: 457 | self._process_stats_func = self._process_stats_with_derived_stats 458 | self._init_derived_stats_processor() 459 | elif hasattr(stats_processor, "process") is True: 460 | self._process_stats_func = self._process_all_stats 461 | else: 462 | raise AttributeError( 463 | "Results processor module has no process() or " 464 | "process_stat() function." 465 | ) 466 | # start the stats processor module 467 | if hasattr(self._stats_processor, "start") is True: 468 | # need to start the processor now before the process is daemonized 469 | # in case the plugin needs to prompt the user for input prior to 470 | # starting. 
471 | LOG.info("Starting stats processor.") 472 | self._stats_processor.start(self._stats_processor_args) 473 | 474 | def _init_derived_stats_processor(self): 475 | # if the stats processor doesn't define begin_process or end_process, 476 | # then add a noop version so we don't have to check each time we 477 | # process stats 478 | def noop(cluster_name): 479 | pass 480 | 481 | if hasattr(self._stats_processor, "begin_process") is False: 482 | self._stats_processor.begin_process = noop 483 | if hasattr(self._stats_processor, "end_process") is False: 484 | self._stats_processor.end_process = noop 485 | 486 | def add_stats(self, stats_config): 487 | """ 488 | Add set of stats to be queried. 489 | :param: stats_config is an instance of StatsConfig, which defines the 490 | list of stats, an update interval, and the list of clusters to query. 491 | """ 492 | try: 493 | # organize the stat sets by update interval 494 | stat_set = self._stat_sets[stats_config.update_interval] 495 | except KeyError: 496 | self._stat_sets[stats_config.update_interval] = stat_set = StatSet() 497 | self._update_intervals.append(UpdateInterval(stats_config.update_interval)) 498 | 499 | # add the new clusters to the list of clusters associated with this 500 | # update interval's stat set. 501 | for cluster in stats_config.cluster_configs: 502 | if cluster not in stat_set.cluster_configs: 503 | # TODO this is a bug - this causes these stats to be queried on 504 | # all clusters in this update interval, not just the clusters 505 | # defined in this stats_config 506 | stat_set.cluster_configs.append(cluster) 507 | 508 | # add the new stats to the stat set 509 | for stat_name in stats_config.stats: 510 | stat_set.stats.add(stat_name) 511 | 512 | stat_set.cluster_composite_stats.extend(stats_config.cluster_composite_stats) 513 | 514 | stat_set.equation_stats.extend(stats_config.equation_stats) 515 | 516 | stat_set.pct_change_stats.extend(stats_config.pct_change_stats) 517 | 518 | stat_set.final_equation_stats.extend(stats_config.final_equation_stats) 519 | 520 | def get_stat_set_count(self): 521 | return len(self._stat_sets) 522 | 523 | def get_next_stat_set(self): 524 | for update_interval, stat_set in self._stat_sets.items(): 525 | yield update_interval, stat_set 526 | 527 | def run(self, debug=False): 528 | """ 529 | Loop through stat sets, query for their values, and process them with 530 | the stats processor. 531 | """ 532 | LOG.info("Starting.") 533 | 534 | sleep_secs = 0 535 | start_time = time.time() 536 | # setup the last update time of each update interval so that they all 537 | # get updated on the first pass. 538 | for update_interval in self._update_intervals: 539 | update_interval.last_update = start_time - update_interval.interval 540 | 541 | while True: 542 | LOG.debug("Sleeping for %f seconds.", sleep_secs) 543 | time.sleep(sleep_secs) 544 | 545 | # query and process the stat sets whose update interval has been 546 | # hit or surpassed. 547 | self._query_and_process_stats(time.time(), debug) 548 | 549 | cur_time = time.time() 550 | # figure out the shortest amount of time until the next update is 551 | # needed and sleep for that amount of time. 
552 | min_next_update = sys.float_info.max 553 | for update_interval in self._update_intervals: 554 | next_update_time = ( 555 | update_interval.last_update + update_interval.interval 556 | ) 557 | 558 | time_to_next_update = next_update_time - cur_time 559 | min_next_update = min(time_to_next_update, min_next_update) 560 | sleep_secs = max(0.0, min_next_update) 561 | 562 | def shutdown(self, signum): 563 | """ 564 | Stops the stats processor prior to stopping the daemon. 565 | """ 566 | LOG.info("Stopping.") 567 | if ( 568 | self._stats_processor is not None 569 | and hasattr(self._stats_processor, "stop") is True 570 | ): 571 | LOG.info("Stopping stats processor.") 572 | self._stats_processor.stop() 573 | super(IsiDataInsightsDaemon, self).shutdown(signum) 574 | 575 | def _query_and_process_stats(self, cur_time, debug): 576 | """ 577 | Build a unique set of stats to update per cluster from each set of 578 | stats that are in need of updating based on the amount of time elapsed 579 | since their last update. 580 | """ 581 | # there might be more than one stat set that needs updating and thus 582 | # there might be common clusters between those stat sets, so this loop 583 | # makes sure that we only send one query to each unique cluster. 584 | cluster_stats = {} 585 | for update_interval in self._update_intervals: 586 | # if the update_interval is less than or equal to the elapsed_time 587 | # then we need to query the stats associated with this update 588 | # interval. 589 | time_since_last_update = cur_time - update_interval.last_update 590 | if time_since_last_update >= update_interval.interval: 591 | LOG.debug( 592 | "updating interval:%d time_since_last_update: %f", 593 | update_interval.interval, 594 | time_since_last_update, 595 | ) 596 | # update the last_update time 597 | update_interval.last_update = cur_time 598 | # add the stats from stat set to their respective cluster_stats 599 | cur_stat_set = self._stat_sets[update_interval.interval] 600 | for cluster in cur_stat_set.cluster_configs: 601 | try: 602 | ( 603 | cluster_stat_set, 604 | cluster_composite_stats, 605 | equation_stats, 606 | pct_change_stats, 607 | final_equation_stats, 608 | ) = cluster_stats[cluster] 609 | cluster_composite_stats.extend( 610 | cur_stat_set.cluster_composite_stats 611 | ) 612 | equation_stats.extend(cur_stat_set.equation_stats) 613 | pct_change_stats.extend(cur_stat_set.pct_change_stats) 614 | final_equation_stats.extend(cur_stat_set.final_equation_stats) 615 | except KeyError: 616 | cluster_stat_set = set() 617 | cluster_stats[cluster] = ( 618 | cluster_stat_set, 619 | cur_stat_set.cluster_composite_stats, 620 | cur_stat_set.equation_stats, 621 | cur_stat_set.pct_change_stats, 622 | cur_stat_set.final_equation_stats, 623 | ) 624 | 625 | for stat_name in cur_stat_set.stats: 626 | cluster_stat_set.add(stat_name) 627 | 628 | # now we have a unique list of clusters to query, so query them 629 | for ( 630 | cluster, 631 | (stats, composite_stats, eq_stats, pct_change_stats, final_eq_stats), 632 | ) in cluster_stats.items(): 633 | self.async_worker_pool.spawn( 634 | self._query_and_process_stats1, 635 | cluster, 636 | stats, 637 | composite_stats, 638 | eq_stats, 639 | pct_change_stats, 640 | final_eq_stats, 641 | debug, 642 | ) 643 | self.async_worker_pool.join() 644 | 645 | def _query_and_process_stats1( 646 | self, 647 | cluster, 648 | stats, 649 | composite_stats, 650 | eq_stats, 651 | pct_change_stats, 652 | final_eq_stats, 653 | debug, 654 | ): 655 | LOG.debug("Querying cluster %s %f", 
cluster.name, cluster.version) 656 | LOG.debug("Querying stats %d.", len(stats)) 657 | stats_client = IsiStatsClient(cluster.isi_sdk.StatisticsApi(cluster.api_client)) 658 | # query the current cluster with the current set of stats 659 | try: 660 | if cluster.version >= 8.0: 661 | results = stats_client.query_stats(stats) 662 | else: 663 | results = self._v7_2_multistat_query(stats, stats_client) 664 | except ( 665 | urllib3.exceptions.HTTPError, 666 | cluster.isi_sdk.rest.ApiException, 667 | ) as http_exc: 668 | LOG.error( 669 | "Failed to query stats from cluster %s, exception " "raised: %s", 670 | cluster.name, 671 | str(http_exc), 672 | ) 673 | return 674 | except Exception as gen_exc: 675 | # if in debug mode then re-raise general Exceptions because 676 | # they are most likely bugs in the code, but in non-debug mode 677 | # just continue 678 | if debug is False: 679 | LOG.error( 680 | "Failed to query stats from cluster %s, exception " "raised: %s", 681 | cluster.name, 682 | str(gen_exc), 683 | ) 684 | return 685 | else: 686 | raise gen_exc 687 | 688 | composite_stats_processor = DerivedStatsProcessor(composite_stats) 689 | equation_stats_processor = DerivedStatsProcessor(eq_stats) 690 | pct_change_stats_processor = DerivedStatsProcessor(pct_change_stats) 691 | final_equation_stats_processor = DerivedStatsProcessor(final_eq_stats) 692 | derived_stats_processors = ( 693 | composite_stats_processor, 694 | equation_stats_processor, 695 | pct_change_stats_processor, 696 | final_equation_stats_processor, 697 | ) 698 | # calls either _process_all_stats or 699 | # _process_stats_with_derived_stats depending on whether or not the 700 | # _stats_processor has a process_stat function or just a process 701 | # function. The latter requires the process_stat function. 
702 | self._process_stats_func(cluster.name, results, derived_stats_processors) 703 | 704 | def _v7_2_multistat_query(self, stats, stats_client): 705 | result = [] 706 | for stat in stats: 707 | result.extend(stats_client.query_stat(stat)) 708 | return result 709 | 710 | def _process_all_stats(self, *args): 711 | cluster_name = args[0] 712 | results = args[1] 713 | # the initial version of the stats processor plugin processed all stats 714 | # at once, this function allows backwards compatibility, but derived 715 | # stats are not supported 716 | self._stats_processor.process(cluster_name, results) 717 | 718 | def _process_stats_with_derived_stats( 719 | self, cluster_name, stats_query_results, derived_stats 720 | ): 721 | LOG.debug("Processing stat results on %s", cluster_name) 722 | self._stats_processor.begin_process(cluster_name) 723 | ( 724 | cluster_composite_stats, 725 | equation_stats, 726 | pct_change_stats, 727 | final_equation_stats, 728 | ) = derived_stats 729 | cluster_composite_stats.begin_process(cluster_name) 730 | equation_stats.begin_process(cluster_name) 731 | pct_change_stats.begin_process(cluster_name) 732 | final_equation_stats.begin_process(cluster_name) 733 | # process the results 734 | for stat in stats_query_results: 735 | # check if the stat query returned an error 736 | if stat.error is not None: 737 | LOG.warning( 738 | "Query for stat: '%s' on '%s', returned error: '%s'.", 739 | str(stat.key), 740 | cluster_name, 741 | str(stat.error), 742 | ) 743 | continue 744 | self._prep_stat(stat) 745 | # let stats processor process it 746 | self._stats_processor.process_stat(cluster_name, stat) 747 | # allow derived stats to select/use this stat 748 | cluster_composite_stats.select_stat(stat) 749 | equation_stats.select_stat(stat) 750 | pct_change_stats.select_stat(stat) 751 | final_equation_stats.select_stat(stat) 752 | 753 | LOG.debug("Processing composite stats on %s", cluster_name) 754 | for composite_stat in cluster_composite_stats.stats(): 755 | # composite stats always return only one derived stat 756 | derived_stat = composite_stat.compute_derived_stat() 757 | if derived_stat.error is not None: 758 | LOG.warning( 759 | "Cluster node composite stat: " 760 | "'%s' on '%s', returned error: '%s'.", 761 | str(derived_stat.key), 762 | cluster_name, 763 | str(derived_stat.error), 764 | ) 765 | continue 766 | LOG.debug( 767 | "ClusterCompositeStat[%s]=%s", derived_stat.key, str(derived_stat.value) 768 | ) 769 | # let stats processor process it 770 | self._stats_processor.process_stat(cluster_name, derived_stat) 771 | # allow derived stats to select/use this stat 772 | equation_stats.select_stat(derived_stat) 773 | pct_change_stats.select_stat(derived_stat) 774 | final_equation_stats.select_stat(derived_stat) 775 | 776 | LOG.debug("Processing equation stats on %s", cluster_name) 777 | for eq_stat in equation_stats.stats(): 778 | # equation stats might produce more than one derived stat, 779 | # potentially one stat per node 780 | derived_stats = eq_stat.compute_derived_stats() 781 | for derived_stat in derived_stats: 782 | if derived_stat.error is not None: 783 | LOG.warning( 784 | "Equation computed stat: " 785 | "'%s' on '%s', returned error: '%s'.", 786 | str(derived_stat.key), 787 | cluster_name, 788 | str(derived_stat.error), 789 | ) 790 | continue 791 | LOG.debug( 792 | "EquationStat[%s]=%s", derived_stat.key, str(derived_stat.value) 793 | ) 794 | # let stats processor process them 795 | self._stats_processor.process_stat(cluster_name, derived_stat) 796 | # allow 
derived stats to select/use this stat 797 | pct_change_stats.select_stat(derived_stat) 798 | final_equation_stats.select_stat(derived_stat) 799 | 800 | LOG.debug("Processing percent change stats on %s", cluster_name) 801 | for pct_change_stat in pct_change_stats.stats(): 802 | # percent change stats might produce more than one derived stat, 803 | # potentially one stat per node 804 | derived_stats = pct_change_stat.compute_derived_stats() 805 | for derived_stat in derived_stats: 806 | if derived_stat.error is not None: 807 | LOG.warning( 808 | "Percent change stat: " "'%s' on '%s', returned error: '%s'.", 809 | str(derived_stat.key), 810 | cluster_name, 811 | str(derived_stat.error), 812 | ) 813 | continue 814 | LOG.debug( 815 | "PercentChangeStat[%s]=%s", 816 | derived_stat.key, 817 | str(derived_stat.value), 818 | ) 819 | # let stats processor process it 820 | self._stats_processor.process_stat(cluster_name, derived_stat) 821 | # allow derived stats to select/use this stat 822 | final_equation_stats.select_stat(derived_stat) 823 | 824 | LOG.debug("Processing final equation stats on %s", cluster_name) 825 | for eq_stat in final_equation_stats.stats(): 826 | # equation stats might produce more than one derived stat, 827 | # potentially one stat per node 828 | derived_stats = eq_stat.compute_derived_stats() 829 | for derived_stat in derived_stats: 830 | if derived_stat.error is not None: 831 | LOG.warning( 832 | "Final equation computed stat: " 833 | "'%s' on '%s', returned error: '%s'.", 834 | str(derived_stat.key), 835 | cluster_name, 836 | str(derived_stat.error), 837 | ) 838 | continue 839 | LOG.debug( 840 | "FinalEquationStat[%s]=%s", 841 | derived_stat.key, 842 | str(derived_stat.value), 843 | ) 844 | # let stats processor process them 845 | self._stats_processor.process_stat(cluster_name, derived_stat) 846 | 847 | self._stats_processor.end_process(cluster_name) 848 | cluster_composite_stats.end_process(cluster_name) 849 | equation_stats.end_process(cluster_name) 850 | pct_change_stats.end_process(cluster_name) 851 | final_equation_stats.end_process(cluster_name) 852 | 853 | def _prep_stat(self, stat): 854 | try: 855 | # the stat value's data type is variable depending on the key so 856 | # use literal_eval() to convert it to the correct type 857 | eval_value = literal_eval(stat.value) 858 | # convert tuples to a list for simplicity 859 | if type(eval_value) == tuple: 860 | stat.value = list(eval_value) 861 | else: 862 | stat.value = eval_value 863 | except Exception: # if literal_eval throws an exception 864 | # then just leave it as string value 865 | pass 866 | -------------------------------------------------------------------------------- /dashboards/prometheus/grafana_cluster_list_dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 11, 19 | "iteration": 1600859254949, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "collapsed": true, 24 | "datasource": null, 25 | "gridPos": { 26 | "h": 1, 27 | "w": 24, 28 | "x": 0, 29 | "y": 0 30 | }, 31 | "id": 214, 32 | "panels": [ 33 | { 34 | "content": "* Use the pull down at the very top left of the page (next to the spiral icon) to select which dashboard you 
want to look at.\n* Use the cluster and other pull downs to select the cluster and protocol of interest.\n* Use the pull downs at the top right to select a specific time period of interest.\n* Note that by default the dates and time displayed are in your browser’s time zone, not the source cluster. You can get it to display in UTC via the settings under the little gear symbol at the top of the page.\n* You can hide rows using the green slide-out tab to the left of each chart.\n* If there is a legend displayed you can click on elements within it to hide or display items, etc.\n* Click on the title of the chart and then the horizontal bars icon at the left to show/hide the legend and get a CSV export of the data.\n* There is no significance in whether things are displayed as lines, bars or points - we have used whatever seems to be clearest for the data.\n", 35 | "datasource": null, 36 | "editable": true, 37 | "error": false, 38 | "fieldConfig": { 39 | "defaults": { 40 | "custom": {} 41 | }, 42 | "overrides": [] 43 | }, 44 | "gridPos": { 45 | "h": 7, 46 | "w": 24, 47 | "x": 0, 48 | "y": 1 49 | }, 50 | "id": 18, 51 | "isNew": true, 52 | "links": [], 53 | "mode": "markdown", 54 | "options": { 55 | "content": "* Use the pull down at the very top left of the page (next to the spiral icon) to select which dashboard you want to look at.\n* Use the cluster and other pull downs to select the cluster and protocol of interest.\n* Use the pull downs at the top right to select a specific time period of interest.\n* Note that by default the dates and time displayed are in your browser’s time zone, not the source cluster. You can get it to display in UTC via the settings under the little gear symbol at the top of the page.\n* You can hide rows using the green slide-out tab to the left of each chart.\n* If there is a legend displayed you can click on elements within it to hide or display items, etc.\n* Click on the title of the chart and then the horizontal bars icon at the left to show/hide the legend and get a CSV export of the data.\n* There is no significance in whether things are displayed as lines, bars or points - we have used whatever seems to be clearest for the data.\n", 56 | "mode": "markdown" 57 | }, 58 | "pluginVersion": "7.1.0", 59 | "title": "Welcome to the Isilon Cluster Summary Dashboard", 60 | "type": "text" 61 | } 62 | ], 63 | "title": "Welcome to the Isilon Cluster Summary Dashboard", 64 | "type": "row" 65 | }, 66 | { 67 | "collapsed": false, 68 | "datasource": null, 69 | "gridPos": { 70 | "h": 1, 71 | "w": 24, 72 | "x": 0, 73 | "y": 1 74 | }, 75 | "id": 215, 76 | "panels": [], 77 | "repeat": "cluster", 78 | "scopedVars": { 79 | "cluster": { 80 | "selected": true, 81 | "text": "All", 82 | "value": "$__all" 83 | } 84 | }, 85 | "title": "$cluster", 86 | "type": "row" 87 | }, 88 | { 89 | "content": "Detail dashboard
\nWebUI for $cluster", 90 | "datasource": null, 91 | "editable": true, 92 | "error": false, 93 | "fieldConfig": { 94 | "defaults": { 95 | "custom": {} 96 | }, 97 | "overrides": [] 98 | }, 99 | "gridPos": { 100 | "h": 4, 101 | "w": 2, 102 | "x": 0, 103 | "y": 2 104 | }, 105 | "id": 35, 106 | "isNew": true, 107 | "links": [ 108 | { 109 | "targetBlank": false, 110 | "title": "Detail dashboard for $cluster", 111 | "url": "dashboard/db/isilon-data-insights?$__url_time_range&$__all_variables" 112 | }, 113 | { 114 | "targetBlank": true, 115 | "title": "WebUI", 116 | "url": "https://$cluster:8080/" 117 | } 118 | ], 119 | "mode": "html", 120 | "options": { 121 | "content": "Detail dashboard
\nWebUI for $cluster", 122 | "mode": "html" 123 | }, 124 | "pluginVersion": "7.1.0", 125 | "repeatIteration": 1476718550844, 126 | "scopedVars": { 127 | "cluster": { 128 | "selected": true, 129 | "text": "All", 130 | "value": "$__all" 131 | } 132 | }, 133 | "title": "$cluster", 134 | "transparent": true, 135 | "type": "text" 136 | }, 137 | { 138 | "cacheTimeout": null, 139 | "colorBackground": false, 140 | "colorValue": false, 141 | "colors": [ 142 | "rgba(50, 172, 45, 0.97)", 143 | "rgba(237, 129, 40, 0.89)", 144 | "rgba(245, 54, 54, 0.9)" 145 | ], 146 | "datasource": "Prometheus", 147 | "editable": true, 148 | "error": false, 149 | "fieldConfig": { 150 | "defaults": { 151 | "custom": {} 152 | }, 153 | "overrides": [] 154 | }, 155 | "format": "none", 156 | "gauge": { 157 | "maxValue": 100, 158 | "minValue": 0, 159 | "show": false, 160 | "thresholdLabels": false, 161 | "thresholdMarkers": false 162 | }, 163 | "gridPos": { 164 | "h": 4, 165 | "w": 2, 166 | "x": 2, 167 | "y": 2 168 | }, 169 | "height": "", 170 | "id": 207, 171 | "interval": null, 172 | "isNew": true, 173 | "links": [ 174 | { 175 | "targetBlank": true, 176 | "title": "WebUI for $cluster", 177 | "url": "https://$cluster:8080/" 178 | } 179 | ], 180 | "mappingType": 1, 181 | "mappingTypes": [ 182 | { 183 | "name": "value to text", 184 | "value": 1 185 | }, 186 | { 187 | "name": "range to text", 188 | "value": 2 189 | } 190 | ], 191 | "maxDataPoints": 100, 192 | "nullPointMode": "connected", 193 | "nullText": null, 194 | "postfix": "", 195 | "postfixFontSize": "50%", 196 | "prefix": "", 197 | "prefixFontSize": "50%", 198 | "rangeMaps": [ 199 | { 200 | "from": "null", 201 | "text": "N/A", 202 | "to": "null" 203 | } 204 | ], 205 | "repeatIteration": 1476718550844, 206 | "scopedVars": { 207 | "cluster": { 208 | "selected": true, 209 | "text": "All", 210 | "value": "$__all" 211 | } 212 | }, 213 | "sparkline": { 214 | "fillColor": "rgba(31, 118, 189, 0.18)", 215 | "full": false, 216 | "lineColor": "rgb(31, 120, 193)", 217 | "show": false 218 | }, 219 | "tableColumn": "", 220 | "targets": [ 221 | { 222 | "expr": "max(isilon_cluster_node_count_all{hostname=~\"$cluster\"})", 223 | "interval": "", 224 | "legendFormat": "", 225 | "refId": "A" 226 | } 227 | ], 228 | "thresholds": "1,2", 229 | "title": "Total Nodes", 230 | "type": "singlestat", 231 | "valueFontSize": "80%", 232 | "valueMaps": [ 233 | { 234 | "op": "=", 235 | "text": "", 236 | "value": "" 237 | } 238 | ], 239 | "valueName": "current" 240 | }, 241 | { 242 | "cacheTimeout": null, 243 | "colorBackground": true, 244 | "colorValue": false, 245 | "colors": [ 246 | "rgba(50, 172, 45, 0.97)", 247 | "rgba(237, 129, 40, 0.89)", 248 | "rgba(245, 54, 54, 0.9)" 249 | ], 250 | "datasource": "Prometheus", 251 | "editable": true, 252 | "error": false, 253 | "fieldConfig": { 254 | "defaults": { 255 | "custom": {} 256 | }, 257 | "overrides": [] 258 | }, 259 | "format": "none", 260 | "gauge": { 261 | "maxValue": 100, 262 | "minValue": 0, 263 | "show": false, 264 | "thresholdLabels": false, 265 | "thresholdMarkers": true 266 | }, 267 | "gridPos": { 268 | "h": 4, 269 | "w": 2, 270 | "x": 4, 271 | "y": 2 272 | }, 273 | "id": 13, 274 | "interval": null, 275 | "isNew": true, 276 | "links": [ 277 | { 278 | "targetBlank": true, 279 | "title": "WebUI for $cluster", 280 | "url": "https://$cluster:8080/" 281 | } 282 | ], 283 | "mappingType": 1, 284 | "mappingTypes": [ 285 | { 286 | "name": "value to text", 287 | "value": 1 288 | }, 289 | { 290 | "name": "range to text", 291 | "value": 2 292 | } 293 | ], 
294 | "maxDataPoints": 100, 295 | "nullPointMode": "connected", 296 | "nullText": null, 297 | "postfix": "", 298 | "postfixFontSize": "50%", 299 | "prefix": "", 300 | "prefixFontSize": "50%", 301 | "rangeMaps": [ 302 | { 303 | "from": "null", 304 | "text": "N/A", 305 | "to": "null" 306 | } 307 | ], 308 | "repeatIteration": 1476718550844, 309 | "scopedVars": { 310 | "cluster": { 311 | "selected": true, 312 | "text": "All", 313 | "value": "$__all" 314 | } 315 | }, 316 | "sparkline": { 317 | "fillColor": "rgba(31, 118, 189, 0.18)", 318 | "full": false, 319 | "lineColor": "rgb(31, 120, 193)", 320 | "show": false 321 | }, 322 | "tableColumn": "", 323 | "targets": [ 324 | { 325 | "expr": "max(isilon_cluster_node_count_down{hostname=~\"$cluster\"})", 326 | "interval": "", 327 | "legendFormat": "", 328 | "refId": "A" 329 | } 330 | ], 331 | "thresholds": "1,2", 332 | "title": "Nodes Down", 333 | "type": "singlestat", 334 | "valueFontSize": "80%", 335 | "valueMaps": [ 336 | { 337 | "op": "=", 338 | "text": "", 339 | "value": "" 340 | } 341 | ], 342 | "valueName": "current" 343 | }, 344 | { 345 | "cacheTimeout": null, 346 | "colorBackground": true, 347 | "colorValue": false, 348 | "colors": [ 349 | "rgba(50, 172, 45, 0.97)", 350 | "rgba(237, 129, 40, 0.89)", 351 | "rgba(245, 54, 54, 0.9)" 352 | ], 353 | "datasource": "Prometheus", 354 | "editable": true, 355 | "error": false, 356 | "fieldConfig": { 357 | "defaults": { 358 | "custom": {} 359 | }, 360 | "overrides": [] 361 | }, 362 | "format": "none", 363 | "gauge": { 364 | "maxValue": 100, 365 | "minValue": 0, 366 | "show": false, 367 | "thresholdLabels": false, 368 | "thresholdMarkers": true 369 | }, 370 | "gridPos": { 371 | "h": 4, 372 | "w": 2, 373 | "x": 6, 374 | "y": 2 375 | }, 376 | "id": 14, 377 | "interval": null, 378 | "isNew": true, 379 | "links": [ 380 | { 381 | "targetBlank": true, 382 | "title": "WebUI for $cluster", 383 | "url": "https://$cluster:8080/" 384 | } 385 | ], 386 | "mappingType": 2, 387 | "mappingTypes": [ 388 | { 389 | "name": "value to text", 390 | "value": 1 391 | }, 392 | { 393 | "name": "range to text", 394 | "value": 2 395 | } 396 | ], 397 | "maxDataPoints": 100, 398 | "nullPointMode": "connected", 399 | "nullText": null, 400 | "postfix": "", 401 | "postfixFontSize": "50%", 402 | "prefix": "", 403 | "prefixFontSize": "50%", 404 | "rangeMaps": [ 405 | { 406 | "from": "0", 407 | "text": "Healthy", 408 | "to": "0" 409 | }, 410 | { 411 | "from": ".0001", 412 | "text": "Attention", 413 | "to": "1.999" 414 | }, 415 | { 416 | "from": "2", 417 | "text": "Down", 418 | "to": "5" 419 | } 420 | ], 421 | "repeatIteration": 1476718550844, 422 | "scopedVars": { 423 | "cluster": { 424 | "selected": true, 425 | "text": "All", 426 | "value": "$__all" 427 | } 428 | }, 429 | "sparkline": { 430 | "fillColor": "rgba(31, 118, 189, 0.18)", 431 | "full": false, 432 | "lineColor": "rgb(31, 120, 193)", 433 | "show": false 434 | }, 435 | "tableColumn": "", 436 | "targets": [ 437 | { 438 | "expr": "max(isilon_cluster_health{hostname=~\"$cluster\"})", 439 | "interval": "", 440 | "legendFormat": "", 441 | "refId": "A" 442 | } 443 | ], 444 | "thresholds": "0.0001,2", 445 | "title": "Alert Status", 446 | "type": "singlestat", 447 | "valueFontSize": "50%", 448 | "valueMaps": [ 449 | { 450 | "op": "=", 451 | "text": "Healthy", 452 | "value": "0" 453 | }, 454 | { 455 | "op": "=", 456 | "text": "Attention", 457 | "value": "1" 458 | }, 459 | { 460 | "op": "=", 461 | "text": "Down", 462 | "value": "2" 463 | } 464 | ], 465 | "valueName": "avg" 466 | }, 467 | { 
468 | "cacheTimeout": null, 469 | "colorBackground": false, 470 | "colorValue": false, 471 | "colors": [ 472 | "rgba(50, 172, 45, 0.97)", 473 | "rgba(237, 129, 40, 0.89)", 474 | "rgba(245, 54, 54, 0.9)" 475 | ], 476 | "datasource": "Prometheus", 477 | "editable": true, 478 | "error": false, 479 | "fieldConfig": { 480 | "defaults": { 481 | "custom": {} 482 | }, 483 | "overrides": [] 484 | }, 485 | "format": "percentunit", 486 | "gauge": { 487 | "maxValue": 1, 488 | "minValue": 0, 489 | "show": true, 490 | "thresholdLabels": false, 491 | "thresholdMarkers": true 492 | }, 493 | "gridPos": { 494 | "h": 4, 495 | "w": 2, 496 | "x": 8, 497 | "y": 2 498 | }, 499 | "id": 8, 500 | "interval": null, 501 | "isNew": true, 502 | "links": [ 503 | { 504 | "targetBlank": false, 505 | "title": "Detail dashboard for $cluster", 506 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 507 | } 508 | ], 509 | "mappingType": 2, 510 | "mappingTypes": [ 511 | { 512 | "name": "value to text", 513 | "value": 1 514 | }, 515 | { 516 | "name": "range to text", 517 | "value": 2 518 | } 519 | ], 520 | "maxDataPoints": 100, 521 | "nullPointMode": "connected", 522 | "nullText": null, 523 | "postfix": "", 524 | "postfixFontSize": "50%", 525 | "prefix": "", 526 | "prefixFontSize": "50%", 527 | "rangeMaps": [], 528 | "repeatIteration": 1476718550844, 529 | "scopedVars": { 530 | "cluster": { 531 | "selected": true, 532 | "text": "All", 533 | "value": "$__all" 534 | } 535 | }, 536 | "sparkline": { 537 | "fillColor": "rgba(31, 118, 189, 0.18)", 538 | "full": true, 539 | "lineColor": "rgb(31, 120, 193)", 540 | "show": true 541 | }, 542 | "tableColumn": "", 543 | "targets": [ 544 | { 545 | "expr": "1.0 - avg(isilon_cluster_cpu_idle_avg{hostname=~\"$cluster\"})/1000", 546 | "interval": "", 547 | "legendFormat": "", 548 | "refId": "A" 549 | } 550 | ], 551 | "thresholds": "0.80,0.95", 552 | "title": "Cluster CPU", 553 | "type": "singlestat", 554 | "valueFontSize": "80%", 555 | "valueMaps": [ 556 | { 557 | "op": "=", 558 | "text": "N/A", 559 | "value": "null" 560 | } 561 | ], 562 | "valueName": "current" 563 | }, 564 | { 565 | "cacheTimeout": null, 566 | "colorBackground": false, 567 | "colorValue": false, 568 | "colors": [ 569 | "rgba(50, 172, 45, 0.97)", 570 | "rgba(237, 129, 40, 0.89)", 571 | "rgba(245, 54, 54, 0.9)" 572 | ], 573 | "datasource": "Prometheus", 574 | "editable": true, 575 | "error": false, 576 | "fieldConfig": { 577 | "defaults": { 578 | "custom": {} 579 | }, 580 | "overrides": [] 581 | }, 582 | "format": "percent", 583 | "gauge": { 584 | "maxValue": 100, 585 | "minValue": 0, 586 | "show": true, 587 | "thresholdLabels": false, 588 | "thresholdMarkers": true 589 | }, 590 | "gridPos": { 591 | "h": 4, 592 | "w": 2, 593 | "x": 10, 594 | "y": 2 595 | }, 596 | "id": 9, 597 | "interval": null, 598 | "isNew": true, 599 | "links": [ 600 | { 601 | "targetBlank": false, 602 | "title": "Detail dashboard for $cluster", 603 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 604 | } 605 | ], 606 | "mappingType": 2, 607 | "mappingTypes": [ 608 | { 609 | "name": "value to text", 610 | "value": 1 611 | }, 612 | { 613 | "name": "range to text", 614 | "value": 2 615 | } 616 | ], 617 | "maxDataPoints": 100, 618 | "nullPointMode": "connected", 619 | "nullText": null, 620 | "postfix": "", 621 | "postfixFontSize": "50%", 622 | "prefix": "", 623 | "prefixFontSize": "50%", 624 | "rangeMaps": [], 625 | "repeatIteration": 1476718550844, 626 | "scopedVars": { 
627 | "cluster": { 628 | "selected": true, 629 | "text": "All", 630 | "value": "$__all" 631 | } 632 | }, 633 | "sparkline": { 634 | "fillColor": "rgba(31, 118, 189, 0.18)", 635 | "full": true, 636 | "lineColor": "rgb(31, 120, 193)", 637 | "show": true 638 | }, 639 | "tableColumn": "", 640 | "targets": [ 641 | { 642 | "expr": "100 - avg(isilon_ifs_percent_avail{hostname=~\"$cluster\"})", 643 | "interval": "", 644 | "legendFormat": "", 645 | "refId": "A" 646 | } 647 | ], 648 | "thresholds": "80,90", 649 | "title": "Cluster Capacity", 650 | "type": "singlestat", 651 | "valueFontSize": "80%", 652 | "valueMaps": [ 653 | { 654 | "op": "=", 655 | "text": "N/A", 656 | "value": "null" 657 | } 658 | ], 659 | "valueName": "current" 660 | }, 661 | { 662 | "cacheTimeout": null, 663 | "colorBackground": false, 664 | "colorValue": false, 665 | "colors": [ 666 | "rgba(245, 54, 54, 0.9)", 667 | "rgba(237, 129, 40, 0.89)", 668 | "rgba(50, 172, 45, 0.97)" 669 | ], 670 | "datasource": "Prometheus", 671 | "editable": true, 672 | "error": false, 673 | "fieldConfig": { 674 | "defaults": { 675 | "custom": {} 676 | }, 677 | "overrides": [] 678 | }, 679 | "format": "Bps", 680 | "gauge": { 681 | "maxValue": 100, 682 | "minValue": 0, 683 | "show": false, 684 | "thresholdLabels": false, 685 | "thresholdMarkers": true 686 | }, 687 | "gridPos": { 688 | "h": 4, 689 | "w": 2, 690 | "x": 12, 691 | "y": 2 692 | }, 693 | "id": 208, 694 | "interval": null, 695 | "isNew": true, 696 | "links": [ 697 | { 698 | "title": "Isilon Data Insights Cluster Detail", 699 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 700 | } 701 | ], 702 | "mappingType": 1, 703 | "mappingTypes": [ 704 | { 705 | "name": "value to text", 706 | "value": 1 707 | }, 708 | { 709 | "name": "range to text", 710 | "value": 2 711 | } 712 | ], 713 | "maxDataPoints": 100, 714 | "nullPointMode": "connected", 715 | "nullText": null, 716 | "postfix": "", 717 | "postfixFontSize": "50%", 718 | "prefix": "", 719 | "prefixFontSize": "50%", 720 | "rangeMaps": [ 721 | { 722 | "from": "null", 723 | "text": "N/A", 724 | "to": "null" 725 | } 726 | ], 727 | "repeatIteration": 1476718550844, 728 | "scopedVars": { 729 | "cluster": { 730 | "selected": true, 731 | "text": "All", 732 | "value": "$__all" 733 | } 734 | }, 735 | "sparkline": { 736 | "fillColor": "rgba(31, 118, 189, 0.18)", 737 | "full": true, 738 | "lineColor": "rgb(31, 120, 193)", 739 | "show": true 740 | }, 741 | "tableColumn": "", 742 | "targets": [ 743 | { 744 | "expr": "isilon_cluster_protostats_nfs_in_rate{hostname=~\"$cluster\"}+isilon_cluster_protostats_nfs_out_rate{hostname=~\"$cluster\"}", 745 | "interval": "", 746 | "legendFormat": "", 747 | "refId": "A" 748 | } 749 | ], 750 | "thresholds": "", 751 | "title": "NFSv3 Throughput", 752 | "type": "singlestat", 753 | "valueFontSize": "80%", 754 | "valueMaps": [ 755 | { 756 | "op": "=", 757 | "text": "N/A", 758 | "value": "null" 759 | } 760 | ], 761 | "valueName": "current" 762 | }, 763 | { 764 | "cacheTimeout": null, 765 | "colorBackground": false, 766 | "colorValue": false, 767 | "colors": [ 768 | "rgba(245, 54, 54, 0.9)", 769 | "rgba(237, 129, 40, 0.89)", 770 | "rgba(50, 172, 45, 0.97)" 771 | ], 772 | "datasource": "Prometheus", 773 | "editable": true, 774 | "error": false, 775 | "fieldConfig": { 776 | "defaults": { 777 | "custom": {} 778 | }, 779 | "overrides": [] 780 | }, 781 | "format": "ops", 782 | "gauge": { 783 | "maxValue": 100, 784 | "minValue": 0, 785 | "show": false, 786 | "thresholdLabels": false, 787 | 
"thresholdMarkers": true 788 | }, 789 | "gridPos": { 790 | "h": 4, 791 | "w": 2, 792 | "x": 14, 793 | "y": 2 794 | }, 795 | "id": 209, 796 | "interval": null, 797 | "isNew": true, 798 | "links": [ 799 | { 800 | "title": "Isilon Data Insights Cluster Detail", 801 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 802 | } 803 | ], 804 | "mappingType": 1, 805 | "mappingTypes": [ 806 | { 807 | "name": "value to text", 808 | "value": 1 809 | }, 810 | { 811 | "name": "range to text", 812 | "value": 2 813 | } 814 | ], 815 | "maxDataPoints": 100, 816 | "nullPointMode": "connected", 817 | "nullText": null, 818 | "postfix": "", 819 | "postfixFontSize": "50%", 820 | "prefix": "", 821 | "prefixFontSize": "50%", 822 | "rangeMaps": [ 823 | { 824 | "from": "null", 825 | "text": "N/A", 826 | "to": "null" 827 | } 828 | ], 829 | "repeatIteration": 1476718550844, 830 | "scopedVars": { 831 | "cluster": { 832 | "selected": true, 833 | "text": "All", 834 | "value": "$__all" 835 | } 836 | }, 837 | "sparkline": { 838 | "fillColor": "rgba(31, 118, 189, 0.18)", 839 | "full": true, 840 | "lineColor": "rgb(31, 120, 193)", 841 | "show": true 842 | }, 843 | "tableColumn": "", 844 | "targets": [ 845 | { 846 | "expr": "isilon_cluster_protostats_nfs_total_op_rate{hostname=~\"$cluster\"}", 847 | "interval": "", 848 | "legendFormat": "", 849 | "refId": "A" 850 | } 851 | ], 852 | "thresholds": "", 853 | "title": "NFSv3 Op/s", 854 | "type": "singlestat", 855 | "valueFontSize": "80%", 856 | "valueMaps": [ 857 | { 858 | "op": "=", 859 | "text": "N/A", 860 | "value": "null" 861 | } 862 | ], 863 | "valueName": "current" 864 | }, 865 | { 866 | "cacheTimeout": null, 867 | "colorBackground": true, 868 | "colorValue": false, 869 | "colors": [ 870 | "rgba(50, 172, 45, 0.97)", 871 | "rgba(237, 129, 40, 0.89)", 872 | "rgba(245, 54, 54, 0.9)" 873 | ], 874 | "datasource": "Prometheus", 875 | "editable": true, 876 | "error": false, 877 | "fieldConfig": { 878 | "defaults": { 879 | "custom": {} 880 | }, 881 | "overrides": [] 882 | }, 883 | "format": "ms", 884 | "gauge": { 885 | "maxValue": 100, 886 | "minValue": 0, 887 | "show": false, 888 | "thresholdLabels": false, 889 | "thresholdMarkers": false 890 | }, 891 | "gridPos": { 892 | "h": 4, 893 | "w": 2, 894 | "x": 16, 895 | "y": 2 896 | }, 897 | "id": 210, 898 | "interval": null, 899 | "isNew": true, 900 | "links": [ 901 | { 902 | "title": "Isilon Data Insights Cluster Detail", 903 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 904 | } 905 | ], 906 | "mappingType": 1, 907 | "mappingTypes": [ 908 | { 909 | "name": "value to text", 910 | "value": 1 911 | }, 912 | { 913 | "name": "range to text", 914 | "value": 2 915 | } 916 | ], 917 | "maxDataPoints": 100, 918 | "nullPointMode": "connected", 919 | "nullText": null, 920 | "postfix": "", 921 | "postfixFontSize": "50%", 922 | "prefix": "", 923 | "prefixFontSize": "50%", 924 | "rangeMaps": [ 925 | { 926 | "from": "null", 927 | "text": "N/A", 928 | "to": "null" 929 | } 930 | ], 931 | "repeatIteration": 1476718550844, 932 | "scopedVars": { 933 | "cluster": { 934 | "selected": true, 935 | "text": "All", 936 | "value": "$__all" 937 | } 938 | }, 939 | "sparkline": { 940 | "fillColor": "rgba(31, 118, 189, 0.18)", 941 | "full": true, 942 | "lineColor": "rgb(31, 120, 193)", 943 | "show": true 944 | }, 945 | "tableColumn": "", 946 | "targets": [ 947 | { 948 | "expr": "isilon_cluster_protostats_nfs_total_time_avg{hostname=~\"$cluster\"}/1000", 949 | "interval": "", 
950 | "legendFormat": "", 951 | "refId": "A" 952 | } 953 | ], 954 | "thresholds": "10,25", 955 | "title": "NFSv3 Latency", 956 | "type": "singlestat", 957 | "valueFontSize": "80%", 958 | "valueMaps": [ 959 | { 960 | "op": "=", 961 | "text": "N/A", 962 | "value": "null" 963 | } 964 | ], 965 | "valueName": "current" 966 | }, 967 | { 968 | "cacheTimeout": null, 969 | "colorBackground": false, 970 | "colorValue": false, 971 | "colors": [ 972 | "rgba(245, 54, 54, 0.9)", 973 | "rgba(237, 129, 40, 0.89)", 974 | "rgba(50, 172, 45, 0.97)" 975 | ], 976 | "datasource": "Prometheus", 977 | "editable": true, 978 | "error": false, 979 | "fieldConfig": { 980 | "defaults": { 981 | "custom": {} 982 | }, 983 | "overrides": [] 984 | }, 985 | "format": "Bps", 986 | "gauge": { 987 | "maxValue": 100, 988 | "minValue": 0, 989 | "show": false, 990 | "thresholdLabels": false, 991 | "thresholdMarkers": true 992 | }, 993 | "gridPos": { 994 | "h": 4, 995 | "w": 2, 996 | "x": 18, 997 | "y": 2 998 | }, 999 | "id": 211, 1000 | "interval": null, 1001 | "isNew": true, 1002 | "links": [ 1003 | { 1004 | "title": "Isilon Data Insights Cluster Detail", 1005 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 1006 | } 1007 | ], 1008 | "mappingType": 1, 1009 | "mappingTypes": [ 1010 | { 1011 | "name": "value to text", 1012 | "value": 1 1013 | }, 1014 | { 1015 | "name": "range to text", 1016 | "value": 2 1017 | } 1018 | ], 1019 | "maxDataPoints": 100, 1020 | "nullPointMode": "connected", 1021 | "nullText": null, 1022 | "postfix": "", 1023 | "postfixFontSize": "50%", 1024 | "prefix": "", 1025 | "prefixFontSize": "50%", 1026 | "rangeMaps": [ 1027 | { 1028 | "from": "null", 1029 | "text": "N/A", 1030 | "to": "null" 1031 | } 1032 | ], 1033 | "repeatIteration": 1476718550844, 1034 | "scopedVars": { 1035 | "cluster": { 1036 | "selected": true, 1037 | "text": "All", 1038 | "value": "$__all" 1039 | } 1040 | }, 1041 | "sparkline": { 1042 | "fillColor": "rgba(31, 118, 189, 0.18)", 1043 | "full": true, 1044 | "lineColor": "rgb(31, 120, 193)", 1045 | "show": true 1046 | }, 1047 | "tableColumn": "", 1048 | "targets": [ 1049 | { 1050 | "expr": "isilon_cluster_protostats_smb2_total_in_rate{hostname=~\"$cluster\"}+isilon_cluster_protostats_smb2_total_out_rate{hostname=~\"$cluster\"}", 1051 | "interval": "", 1052 | "legendFormat": "", 1053 | "refId": "A" 1054 | } 1055 | ], 1056 | "thresholds": "", 1057 | "title": "SMB2 Throughput", 1058 | "type": "singlestat", 1059 | "valueFontSize": "80%", 1060 | "valueMaps": [ 1061 | { 1062 | "op": "=", 1063 | "text": "N/A", 1064 | "value": "null" 1065 | } 1066 | ], 1067 | "valueName": "current" 1068 | }, 1069 | { 1070 | "cacheTimeout": null, 1071 | "colorBackground": false, 1072 | "colorValue": false, 1073 | "colors": [ 1074 | "rgba(245, 54, 54, 0.9)", 1075 | "rgba(237, 129, 40, 0.89)", 1076 | "rgba(50, 172, 45, 0.97)" 1077 | ], 1078 | "datasource": "Prometheus", 1079 | "editable": true, 1080 | "error": false, 1081 | "fieldConfig": { 1082 | "defaults": { 1083 | "custom": {} 1084 | }, 1085 | "overrides": [] 1086 | }, 1087 | "format": "ops", 1088 | "gauge": { 1089 | "maxValue": 100, 1090 | "minValue": 0, 1091 | "show": false, 1092 | "thresholdLabels": false, 1093 | "thresholdMarkers": true 1094 | }, 1095 | "gridPos": { 1096 | "h": 4, 1097 | "w": 2, 1098 | "x": 20, 1099 | "y": 2 1100 | }, 1101 | "id": 212, 1102 | "interval": null, 1103 | "isNew": true, 1104 | "links": [ 1105 | { 1106 | "title": "Isilon Data Insights Cluster Detail", 1107 | "url": 
"dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 1108 | } 1109 | ], 1110 | "mappingType": 1, 1111 | "mappingTypes": [ 1112 | { 1113 | "name": "value to text", 1114 | "value": 1 1115 | }, 1116 | { 1117 | "name": "range to text", 1118 | "value": 2 1119 | } 1120 | ], 1121 | "maxDataPoints": 100, 1122 | "nullPointMode": "connected", 1123 | "nullText": null, 1124 | "postfix": "", 1125 | "postfixFontSize": "50%", 1126 | "prefix": "", 1127 | "prefixFontSize": "50%", 1128 | "rangeMaps": [ 1129 | { 1130 | "from": "null", 1131 | "text": "N/A", 1132 | "to": "null" 1133 | } 1134 | ], 1135 | "repeatIteration": 1476718550844, 1136 | "scopedVars": { 1137 | "cluster": { 1138 | "selected": true, 1139 | "text": "All", 1140 | "value": "$__all" 1141 | } 1142 | }, 1143 | "sparkline": { 1144 | "fillColor": "rgba(31, 118, 189, 0.18)", 1145 | "full": true, 1146 | "lineColor": "rgb(31, 120, 193)", 1147 | "show": true 1148 | }, 1149 | "tableColumn": "", 1150 | "targets": [ 1151 | { 1152 | "expr": "isilon_cluster_protostats_smb2_total_op_rate{hostname=~\"$cluster\"}", 1153 | "interval": "", 1154 | "legendFormat": "", 1155 | "refId": "A" 1156 | } 1157 | ], 1158 | "thresholds": "", 1159 | "title": "SMB2 Op/s", 1160 | "type": "singlestat", 1161 | "valueFontSize": "80%", 1162 | "valueMaps": [ 1163 | { 1164 | "op": "=", 1165 | "text": "N/A", 1166 | "value": "null" 1167 | } 1168 | ], 1169 | "valueName": "current" 1170 | }, 1171 | { 1172 | "cacheTimeout": null, 1173 | "colorBackground": true, 1174 | "colorValue": false, 1175 | "colors": [ 1176 | "rgba(50, 172, 45, 0.97)", 1177 | "rgba(237, 129, 40, 0.89)", 1178 | "rgba(245, 54, 54, 0.9)" 1179 | ], 1180 | "datasource": "Prometheus", 1181 | "editable": true, 1182 | "error": false, 1183 | "fieldConfig": { 1184 | "defaults": { 1185 | "custom": {} 1186 | }, 1187 | "overrides": [] 1188 | }, 1189 | "format": "ms", 1190 | "gauge": { 1191 | "maxValue": 100, 1192 | "minValue": 0, 1193 | "show": false, 1194 | "thresholdLabels": false, 1195 | "thresholdMarkers": false 1196 | }, 1197 | "gridPos": { 1198 | "h": 4, 1199 | "w": 2, 1200 | "x": 22, 1201 | "y": 2 1202 | }, 1203 | "id": 213, 1204 | "interval": null, 1205 | "isNew": true, 1206 | "links": [ 1207 | { 1208 | "title": "Isilon Data Insights Cluster Detail", 1209 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 1210 | } 1211 | ], 1212 | "mappingType": 1, 1213 | "mappingTypes": [ 1214 | { 1215 | "name": "value to text", 1216 | "value": 1 1217 | }, 1218 | { 1219 | "name": "range to text", 1220 | "value": 2 1221 | } 1222 | ], 1223 | "maxDataPoints": 100, 1224 | "nullPointMode": "connected", 1225 | "nullText": null, 1226 | "postfix": "", 1227 | "postfixFontSize": "50%", 1228 | "prefix": "", 1229 | "prefixFontSize": "50%", 1230 | "rangeMaps": [ 1231 | { 1232 | "from": "null", 1233 | "text": "N/A", 1234 | "to": "null" 1235 | } 1236 | ], 1237 | "repeatIteration": 1476718550844, 1238 | "scopedVars": { 1239 | "cluster": { 1240 | "selected": true, 1241 | "text": "All", 1242 | "value": "$__all" 1243 | } 1244 | }, 1245 | "sparkline": { 1246 | "fillColor": "rgba(31, 118, 189, 0.18)", 1247 | "full": true, 1248 | "lineColor": "rgb(31, 120, 193)", 1249 | "show": true 1250 | }, 1251 | "tableColumn": "", 1252 | "targets": [ 1253 | { 1254 | "expr": "isilon_cluster_protostats_smb2_total_time_avg{hostname=~\"$cluster\"}/1000", 1255 | "interval": "", 1256 | "legendFormat": "", 1257 | "refId": "A" 1258 | } 1259 | ], 1260 | "thresholds": "10,25", 1261 | "title": "SMB2 
Latency", 1262 | "type": "singlestat", 1263 | "valueFontSize": "80%", 1264 | "valueMaps": [ 1265 | { 1266 | "op": "=", 1267 | "text": "N/A", 1268 | "value": "null" 1269 | } 1270 | ], 1271 | "valueName": "current" 1272 | } 1273 | ], 1274 | "refresh": "5m", 1275 | "schemaVersion": 26, 1276 | "style": "dark", 1277 | "tags": [], 1278 | "templating": { 1279 | "list": [ 1280 | { 1281 | "allFormat": "regex values", 1282 | "allValue": null, 1283 | "current": { 1284 | "selected": false, 1285 | "text": [ 1286 | "All" 1287 | ], 1288 | "value": [ 1289 | "$__all" 1290 | ] 1291 | }, 1292 | "datasource": "Prometheus", 1293 | "definition": "label_values({job=\"isilon\"}, hostname)", 1294 | "hide": 0, 1295 | "includeAll": true, 1296 | "label": "Cluster", 1297 | "multi": true, 1298 | "multiFormat": "glob", 1299 | "name": "cluster", 1300 | "options": [], 1301 | "query": "label_values({job=\"isilon\"}, hostname)", 1302 | "refresh": 1, 1303 | "regex": "", 1304 | "skipUrlSync": false, 1305 | "sort": 0, 1306 | "tagValuesQuery": "", 1307 | "tags": [], 1308 | "tagsQuery": "", 1309 | "type": "query", 1310 | "useTags": false 1311 | } 1312 | ] 1313 | }, 1314 | "time": { 1315 | "from": "now-15m", 1316 | "to": "now" 1317 | }, 1318 | "timepicker": { 1319 | "now": true, 1320 | "refresh_intervals": [ 1321 | "5s", 1322 | "10s", 1323 | "30s", 1324 | "1m", 1325 | "5m", 1326 | "15m", 1327 | "30m", 1328 | "1h", 1329 | "2h", 1330 | "1d" 1331 | ], 1332 | "time_options": [ 1333 | "5m", 1334 | "15m", 1335 | "1h", 1336 | "6h", 1337 | "12h", 1338 | "24h", 1339 | "2d", 1340 | "7d", 1341 | "30d" 1342 | ] 1343 | }, 1344 | "timezone": "browser", 1345 | "title": "Isilon Data Insights Cluster Summary", 1346 | "uid": "Xef-mgFMk", 1347 | "version": 2 1348 | } --------------------------------------------------------------------------------