├── .flake8 ├── HDFS-home-1.png ├── HDFS-datanode-1.png ├── IsilonDataInsightsClusterDetail.JPG ├── IsilonDataInsightsMultiClusterSummary.JPG ├── IsilonDataInsightsClusterProtocolDetail.JPG ├── .whitesource ├── IsilonDataInsightsClusterCapacityUtilizationTable.JPG ├── requirements.txt ├── setup_venv3.sh ├── setup_venv.sh ├── isi_api_client.py ├── .gitignore ├── LICENSE ├── isi_data_insights_d.py ├── prometheus_plugin.py ├── isi_sdk_utils.py ├── grafana_cluster_capacity_utilization_dashboard.json ├── README.md ├── isi_stats_client.py ├── influxdb_plugin.py ├── example_isi_data_insights_d.cfg ├── README_KAPACITOR_INTEGRATION.md ├── isi_data_insights_config.py ├── isi_data_insights_daemon.py └── dashboards └── prometheus └── grafana_cluster_list_dashboard.json /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 100 3 | -------------------------------------------------------------------------------- /HDFS-home-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/HDFS-home-1.png -------------------------------------------------------------------------------- /HDFS-datanode-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/HDFS-datanode-1.png -------------------------------------------------------------------------------- /IsilonDataInsightsClusterDetail.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/IsilonDataInsightsClusterDetail.JPG -------------------------------------------------------------------------------- /IsilonDataInsightsMultiClusterSummary.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/IsilonDataInsightsMultiClusterSummary.JPG -------------------------------------------------------------------------------- /IsilonDataInsightsClusterProtocolDetail.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/IsilonDataInsightsClusterProtocolDetail.JPG -------------------------------------------------------------------------------- /.whitesource: -------------------------------------------------------------------------------- 1 | { 2 | "generalSettings": { 3 | "shouldScanRepo": true 4 | }, 5 | "checkRunSettings": { 6 | "vulnerableCheckRunConclusionLevel": "failure" 7 | } 8 | } -------------------------------------------------------------------------------- /IsilonDataInsightsClusterCapacityUtilizationTable.JPG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/HEAD/IsilonDataInsightsClusterCapacityUtilizationTable.JPG -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | daemons >= 1.3.0 2 | influxdb >= 2.12.0 3 | pip >= 8.0.2 4 | urllib3 >= 1.13.1 5 | requests >= 2.22.0 6 | isi_sdk_8_0 >= 0.2.0, < 0.3.0 7 | isi_sdk_7_2 >= 0.2.0, < 0.3.0 8 | Equation >= 1.2.01 9 | gevent >= 1.2.1 10 | future >= 0.18.0 11 | configparser >= 0.4.0 12 | 
prometheus_client == 0.12.0 13 | -------------------------------------------------------------------------------- /setup_venv3.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | venv_path=".venv3" 6 | 7 | python3 -m venv $venv_path 8 | 9 | . $venv_path/bin/activate 10 | pip install -r requirements.txt 11 | 12 | echo 13 | echo "Isilon Data Insights Connector virtual environment setup at $venv_path." 14 | echo "To activate the virtual environment run: . $venv_path/bin/activate" 15 | -------------------------------------------------------------------------------- /setup_venv.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | 5 | venv_path=".venv" 6 | 7 | pip install --user virtualenv 8 | 9 | virtualenv $venv_path 10 | 11 | . $venv_path/bin/activate 12 | pip install -U pip setuptools 13 | pip install -r requirements.txt 14 | 15 | echo 16 | echo "Isilon Data Insights Connector virtual environment setup at $venv_path." 17 | echo "To activate the virtual environment run: . $venv_path/bin/activate" 18 | -------------------------------------------------------------------------------- /isi_api_client.py: -------------------------------------------------------------------------------- 1 | """ 2 | Currently Swagger Codegen uses a singleton to specify basic auth 3 | credentials, which doesn't work for multi-thread or multi-client 4 | scenarios where each thread or client needs to connect to a unique 5 | cluster. So this class is a custom implementation of the 6 | isi_sdk.ApiClient that is multi-thread/client safe. 7 | """ 8 | from builtins import object 9 | 10 | 11 | class IsiApiClient(object): 12 | _username = None 13 | _password = None 14 | 15 | def configure_basic_auth(self, username, password): 16 | self._username = username 17 | self._password = password 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | 5 | # C extensions 6 | *.so 7 | 8 | # Distribution / packaging 9 | .Python 10 | env/ 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # PyInstaller 27 | # Usually these files are written by a python script from a template 28 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
29 | *.manifest 30 | *.spec 31 | 32 | # Installer logs 33 | pip-log.txt 34 | pip-delete-this-directory.txt 35 | 36 | # Unit test / coverage reports 37 | htmlcov/ 38 | .tox/ 39 | .coverage 40 | .coverage.* 41 | .cache 42 | nosetests.xml 43 | coverage.xml 44 | *,cover 45 | 46 | # Translations 47 | *.mo 48 | *.pot 49 | 50 | # Django stuff: 51 | *.log 52 | 53 | # Sphinx documentation 54 | docs/_build/ 55 | 56 | # PyBuilder 57 | target/ 58 | 59 | # Our virtual environments 60 | .venv 61 | .venv3 62 | 63 | # Development environment stuff 64 | .vscode 65 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright (c) 2016 EMC Corporation 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy 4 | of this software and associated documentation files (the "Software"), to deal 5 | in the Software without restriction, including without limitation the rights 6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 7 | copies of the Software, and to permit persons to whom the Software is 8 | furnished to do so, subject to the following conditions: 9 | 10 | The above copyright notice and this permission notice shall be included in all 11 | copies or substantial portions of the Software. 12 | 13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 19 | SOFTWARE. 20 | -------------------------------------------------------------------------------- /isi_data_insights_d.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | # From __future_ imports have to be before everything. 4 | from __future__ import print_function 5 | from builtins import str 6 | 7 | # Have to do this before importing the other libs 8 | # The noqa comment prevents spurious E402 flake8 errors 9 | # The documentation for monkey explicitly requires patching to be 10 | # performed as early as possible BEFORE other imports but after the 11 | # from __future__ imports. 12 | from gevent import monkey 13 | 14 | monkey.patch_all() # noqa 15 | 16 | import sys 17 | 18 | from isi_data_insights_config import ( 19 | parse_cli, 20 | configure_args_via_file, 21 | process_pid_file_arg, 22 | configure_logging_via_cli, 23 | configure_via_cli, 24 | configure_via_file, 25 | ) 26 | from isi_data_insights_daemon import IsiDataInsightsDaemon 27 | 28 | 29 | def main(): 30 | args = parse_cli() 31 | 32 | # load the config file if one is provided, then set the "required" 33 | # parameters of the CLI args with config file parameters (if possible) 34 | config_file = configure_args_via_file(args) 35 | 36 | # validate the pid_file arg and get the full path to it. 
37 | pid_file_path = process_pid_file_arg(args.pid_file, args.action) 38 | 39 | daemon = IsiDataInsightsDaemon(pidfile=pid_file_path) 40 | 41 | # before we do the long process of configuring, lets make sure we have 42 | # a valid pid to do a stop or restart with 43 | if (args.action == "restart" or args.action == "stop") and daemon.pid is None: 44 | print( 45 | "Cannot " + args.action + " daemon, " 46 | "invalid pid in file: " + str(pid_file_path), 47 | file=sys.stderr, 48 | ) 49 | sys.exit(1) 50 | 51 | if args.action == "start" or args.action == "debug" or args.action == "restart": 52 | configure_logging_via_cli(args) 53 | 54 | if config_file is not None: 55 | configure_via_file(daemon, args, config_file) 56 | else: 57 | configure_via_cli(daemon, args) 58 | 59 | if args.action == "start": 60 | daemon.start() 61 | elif args.action == "restart": 62 | print("Restarting daemon with pid " + str(daemon.pid)) 63 | daemon.restart() 64 | else: 65 | daemon.run(debug=True) 66 | elif args.action == "stop": 67 | print("Stopping daemon with pid " + str(daemon.pid)) 68 | daemon.stop() 69 | else: 70 | print( 71 | "Invalid action arg: '%s', must be one of " 72 | "'start', 'stop', or 'restart'." % args.action, 73 | file=sys.stderr, 74 | ) 75 | 76 | 77 | if __name__ == "__main__": 78 | main() 79 | -------------------------------------------------------------------------------- /prometheus_plugin.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from future.utils import string_types 3 | import logging 4 | import time 5 | import sys 6 | import prometheus_client as prom 7 | LOG = logging.getLogger(__name__) 8 | 9 | # module variables 10 | this = sys.modules[__name__] 11 | collection_duration = None 12 | gobaltags = {} 13 | tagnames = [] 14 | intervalstart = 0 15 | metriclist = {} 16 | 17 | def start(argv): 18 | ''' 19 | Setup Prometheus client interface. 20 | For prometheus all metrics are exposed via HTTP and the server will 21 | scrape (=collect) data from there 22 | 23 | Arguments: 24 | argv[0] = (String) 25 | Default is 8080. If running inside containers do not change this port 26 | but instead change the exposed port via the docker run command 27 | argv[1] = (String) 28 | Custom tags that are used to decorate metrics. The plugin needs to 29 | know them at startup time. 30 | Comma separated pairs like, group=Lab,datacenter=Berlin,.... 
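        For example, a hypothetical configuration line such as
            stats_processor_args: 8080 group=Lab,datacenter=Berlin
        would arrive here as argv = ['8080', 'group=Lab,datacenter=Berlin'].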
31 | ''' 32 | port = 8080 33 | this.globaltags = {} 34 | this.tagnames = [] 35 | if isinstance(argv, list) and len(argv) > 0: 36 | port = int(argv[0]) 37 | if len(argv) > 1: 38 | for item in argv[1].split(','): 39 | (key, val) = item.split('=') 40 | this.globaltags[key] = val 41 | 42 | this.tagnames = ['hostname', 'node'] + list(this.globaltags.keys()) 43 | this.collection_duration = prom.Gauge('isi_collector_duration_seconds', '', this.tagnames) 44 | prom.start_http_server(port) 45 | LOG.info('Exposing data for prometheus at port {}'.format(port)) 46 | 47 | def start_process(cluster): 48 | ''' 49 | Start of a new collection interval 50 | ''' 51 | LOG.info('Start processing prometheus metrics for {}'.format(cluster)) 52 | this.intervalstart = time.time() 53 | 54 | def end_process(cluster): 55 | ''' 56 | End of a collection interval 57 | ''' 58 | tags = this.globaltags.copy() 59 | tags['hostname'] = cluster 60 | tags['node'] = '' 61 | this.collection_duration.labels(**tags).set(time.time() - this.intervalstart) 62 | LOG.info('Done processing {} metrics for prometheus for {}'.format(len(this.metriclist), cluster)) 63 | 64 | def process_stat(cluster, stat): 65 | ''' Arguments: 66 | cluster(String) = isilon cluster hostname/ip 67 | stat(Object) 68 | ''' 69 | if stat.error != None: 70 | return 71 | tags = this.globaltags.copy() 72 | tags['hostname'] = cluster 73 | tags['node'] = str(stat.devid) 74 | 75 | if isinstance(stat.value, list): 76 | _process_list(tags, stat.key, stat.value) 77 | 78 | elif isinstance(stat.value, dict): 79 | _process_dict(tags, stat.key, stat.value) 80 | 81 | else: 82 | _process_one_stat(tags, stat.key, stat.value) 83 | 84 | def _process_list(tags, basekey, statlist): 85 | ''' list of stats (expected as list of dict) ''' 86 | for elem in statlist: 87 | if isinstance(elem, dict): 88 | _process_dict(tags, basekey, elem) 89 | else: 90 | LOG.error('Unexpected list of non-dict element: {}={}'.format(basekey, elem)) 91 | 92 | def _process_dict(tags, basekey, statdict): 93 | ''' dictionary stats 94 | all number values in the dict are metrics. But it contains text members 95 | and fields named with 'id': Those are filtered out as tags 96 | ''' 97 | for k in list(statdict.keys()): 98 | if isinstance(statdict[k], string_types) or (k[-2:] == 'id' and isinstance(statdict[k], int)): 99 | tags[k] = statdict[k] 100 | del statdict[k] 101 | 102 | for k in statdict.keys(): 103 | mname = basekey + '_' + k 104 | _process_one_stat(tags, mname, statdict[k]) 105 | 106 | def _process_one_stat(tags, metricname, value): 107 | ''' process one stat for prometheus. 108 | metrics are kept inside the process as list of gauges for prometheus to scrape 109 | ''' 110 | m = metricname.replace('.', '_') 111 | if m in this.metriclist: 112 | metric = this.metriclist[m] 113 | else: 114 | metric = prom.Gauge('isilon_' + m, '', tags.keys()) 115 | this.metriclist[m] = metric 116 | metric.labels(**tags).set(value) 117 | 118 | -------------------------------------------------------------------------------- /isi_sdk_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Handle the details of building a Swagger client with the correct version of the 3 | SDK to talk to a specific Isilon host. 
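Example usage (hypothetical host and credentials):

    isi_sdk, api_client, version = configure("10.1.2.3", "root", "secret")
    stats_api = isi_sdk.StatisticsApi(api_client)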
4 | """ 5 | from __future__ import print_function 6 | from builtins import str 7 | 8 | try: 9 | import isi_sdk_8_0 10 | except ImportError: 11 | isi_sdk_8_0 = None 12 | 13 | try: 14 | import isi_sdk_7_2 15 | except ImportError: 16 | isi_sdk_7_2 = None 17 | 18 | import sys 19 | 20 | 21 | def configure(host, username, password, verify_ssl=False, use_version="detect"): 22 | """ 23 | Get a version specific instance of the isi_sdk and a multi-thread/client 24 | safe instance of IsiApiClient that can be used to interface with the 25 | specified host by possibly detecting the best version of the sdk to use. 26 | Returns a tuple consisting of the isi_sdk interface, an instance of 27 | IsiApiClient, and a float value set to either 8.0 or 7.2 depending on 28 | which version of the SDK was chosen. The IsiApiClient instance can be used 29 | in conjunction with the isi_sdk to interface with the specified cluster 30 | cluster (i.e. isi_sdk.ProtocolsApi(isi_api_cli_inst).list_nfs_exports()). 31 | :param string host: The name or ip-address of the host to configure the SDK 32 | interface to work with. 33 | :param string username: The username to use for authentication with the 34 | specified host. 35 | :param string password: The password to use for authentication with the 36 | specified host. 37 | :param bool verify_ssl: Specifies whether or not the Isilon cluster's SSL 38 | certificate should be verified. 39 | :param mixed use_version: Can be either "detect" in order to detect the 40 | correct version of the SDK to use with the specified host. Or a float value 41 | of 7.2 or 8.0 can be used in order to force use of that particular version 42 | of the SDK. 43 | :returns: tuple 44 | """ 45 | if isi_sdk_7_2 is None and isi_sdk_8_0 is None: 46 | raise RuntimeError("Isilon SDK is not installed.") 47 | 48 | host_url = "https://" + host + ":8080" 49 | 50 | if use_version is None or use_version == "detect": 51 | host_version = _detect_host_version(host, username, password, verify_ssl) 52 | else: 53 | host_version = use_version 54 | 55 | isi_sdk = None 56 | if host_version < 8.0 and isi_sdk_7_2 is not None: 57 | isi_sdk = isi_sdk_7_2 58 | elif host_version >= 8.0 and isi_sdk_8_0 is None: 59 | isi_sdk = isi_sdk_7_2 60 | # we detected a version 8.0 host, but have to treat it like a 7.2 host 61 | # because the 8.0 SDK is not installed 62 | host_version = 7.2 63 | else: 64 | isi_sdk = isi_sdk_8_0 65 | 66 | configuration = isi_sdk.Configuration() 67 | configuration.username = username 68 | configuration.password = password 69 | configuration.verify_ssl = verify_ssl 70 | configuration.host = host_url 71 | api_client = isi_sdk.ApiClient(configuration) 72 | 73 | return isi_sdk, api_client, host_version 74 | 75 | 76 | def _detect_host_version(host, username, password, verify_ssl): 77 | # if 7.2 is available then use it to check the version of the cluster 78 | # because it will work for 7.2 or newer clusters. 
79 | isi_sdk = isi_sdk_7_2 if isi_sdk_7_2 else isi_sdk_8_0 80 | 81 | configuration = isi_sdk.Configuration() 82 | configuration.username = username 83 | configuration.password = password 84 | configuration.verify_ssl = verify_ssl 85 | configuration.host = "https://" + host + ":8080" 86 | api_client = isi_sdk.ApiClient(configuration) 87 | 88 | try: 89 | try: 90 | config = isi_sdk.ClusterApi(api_client).get_cluster_config() 91 | host_version = ( 92 | 7.2 if config.onefs_version.release.startswith("v7.") else 8.0 93 | ) 94 | except isi_sdk.rest.ApiException as api_exc: 95 | # if we are using isi_sdk_8_0 (because 7.2 is not installed) and the 96 | # cluster is a 7.2 cluster then it will return 404 for the 97 | # get_cluster_config call, but it should still work for stats queries, 98 | # so just set the version and continue on. 99 | if isi_sdk == isi_sdk_8_0 and api_exc.status == 404: 100 | host_version = 7.2 101 | else: 102 | raise api_exc 103 | except Exception as exc: 104 | raise RuntimeError( 105 | "Failed to get cluster config for cluster %s " 106 | "using SDK %s. Error: %s" % (host, isi_sdk.__name__, str(exc)) 107 | ) 108 | 109 | if host_version == 7.2 and isi_sdk_7_2 is None: 110 | print( 111 | "Detected version 7 host, but version 7.2 SDK " 112 | "is not installed, will use 8.0 SDK instead.", 113 | file=sys.stderr, 114 | ) 115 | 116 | if host_version == 8.0 and isi_sdk_8_0 is None: 117 | print( 118 | "Detected version 8 host, but version 8.0 SDK " 119 | "is not installed, will use 7.2 SDK instead.", 120 | file=sys.stderr, 121 | ) 122 | 123 | return host_version 124 | -------------------------------------------------------------------------------- /grafana_cluster_capacity_utilization_dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "DS_LOCAL_INFLUXDB", 5 | "label": "Local influxdb", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "influxdb", 9 | "pluginName": "InfluxDB" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "panel", 15 | "id": "table", 16 | "name": "Table", 17 | "version": "" 18 | }, 19 | { 20 | "type": "grafana", 21 | "id": "grafana", 22 | "name": "Grafana", 23 | "version": "3.1.1" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "influxdb", 28 | "name": "InfluxDB", 29 | "version": "1.0.0" 30 | } 31 | ], 32 | "id": null, 33 | "title": "Isilon Data Insights Cluster Capacity Utilization Table", 34 | "description": "Color coded table showing cluster capacity utilization. 
Good to see the clusters with the highest capacity utilization.", 35 | "tags": [], 36 | "style": "dark", 37 | "timezone": "browser", 38 | "editable": true, 39 | "hideControls": false, 40 | "sharedCrosshair": false, 41 | "rows": [ 42 | { 43 | "collapse": false, 44 | "editable": true, 45 | "height": "250px", 46 | "panels": [ 47 | { 48 | "columns": [], 49 | "editable": true, 50 | "error": false, 51 | "fontSize": "100%", 52 | "height": "1000", 53 | "id": 1, 54 | "interval": ">200d", 55 | "isNew": true, 56 | "links": [], 57 | "pageSize": null, 58 | "scroll": true, 59 | "showHeader": true, 60 | "sort": { 61 | "col": 2, 62 | "desc": true 63 | }, 64 | "span": 12, 65 | "styles": [ 66 | { 67 | "dateFormat": "YYYY-MM-DD HH:mm:ss", 68 | "pattern": "Time", 69 | "type": "date" 70 | }, 71 | { 72 | "colorMode": "row", 73 | "colors": [ 74 | "rgba(50, 172, 45, 0.97)", 75 | "rgba(237, 129, 40, 0.89)", 76 | "rgba(245, 54, 54, 0.9)" 77 | ], 78 | "decimals": 2, 79 | "pattern": "utilization", 80 | "thresholds": [ 81 | "85", 82 | "90" 83 | ], 84 | "type": "number", 85 | "unit": "percent" 86 | } 87 | ], 88 | "targets": [ 89 | { 90 | "alias": "", 91 | "dsType": "influxdb", 92 | "groupBy": [ 93 | { 94 | "params": [ 95 | "$interval" 96 | ], 97 | "type": "time" 98 | }, 99 | { 100 | "params": [ 101 | "cluster" 102 | ], 103 | "type": "tag" 104 | }, 105 | { 106 | "params": [ 107 | "none" 108 | ], 109 | "type": "fill" 110 | } 111 | ], 112 | "measurement": "ifs.percent.avail", 113 | "policy": "default", 114 | "query": "SELECT 100.0 - last(\"value\") as utilization FROM \"ifs.percent.avail\" WHERE \"cluster\" =~ /^$cluster$/ AND $timeFilter GROUP BY time($interval), \"cluster\" fill(none)", 115 | "rawQuery": true, 116 | "refId": "A", 117 | "resultFormat": "table", 118 | "select": [ 119 | [ 120 | { 121 | "params": [ 122 | "value" 123 | ], 124 | "type": "field" 125 | }, 126 | { 127 | "params": [], 128 | "type": "last" 129 | }, 130 | { 131 | "params": [ 132 | "100 -" 133 | ], 134 | "type": "math" 135 | } 136 | ] 137 | ], 138 | "tags": [ 139 | { 140 | "key": "cluster", 141 | "operator": "=~", 142 | "value": "/^$cluster$/" 143 | } 144 | ] 145 | } 146 | ], 147 | "timeFrom": null, 148 | "title": "Cluster Capacity Utilization", 149 | "transform": "table", 150 | "type": "table" 151 | } 152 | ], 153 | "title": "Row" 154 | } 155 | ], 156 | "time": { 157 | "from": "now-7d", 158 | "to": "now" 159 | }, 160 | "timepicker": { 161 | "refresh_intervals": [ 162 | "5s", 163 | "10s", 164 | "30s", 165 | "1m", 166 | "5m", 167 | "15m", 168 | "30m", 169 | "1h", 170 | "2h", 171 | "1d" 172 | ], 173 | "time_options": [ 174 | "5m", 175 | "15m", 176 | "1h", 177 | "6h", 178 | "12h", 179 | "24h", 180 | "2d", 181 | "7d", 182 | "30d" 183 | ] 184 | }, 185 | "templating": { 186 | "list": [ 187 | { 188 | "current": {}, 189 | "datasource": "${DS_LOCAL_INFLUXDB}", 190 | "hide": 0, 191 | "includeAll": true, 192 | "label": "Cluster", 193 | "multi": true, 194 | "name": "cluster", 195 | "options": [], 196 | "query": "show tag values with key = \"cluster\"", 197 | "refresh": 1, 198 | "type": "query" 199 | } 200 | ] 201 | }, 202 | "annotations": { 203 | "list": [] 204 | }, 205 | "schemaVersion": 12, 206 | "version": 2, 207 | "links": [], 208 | "gnetId": null 209 | } -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Isilon Data Insights Connector 2 | 3 | The isi_data_insights_d.py script controls a daemon process that can be used to query 
multiple OneFS clusters for statistics data via the Isilon OneFS Platform API (PAPI). The collector uses a pluggable module for processing the results of those queries. The provided stats processor defined in influxdb_plugin.py sends query results to an InfluxDB backend. Additionally, several Grafana dashboards are provided to make it easy to monitor the health and status of your Isilon clusters. 4 | The Connector now supports running under either Python 2 or Python 3. 5 | 6 | ## Installation Instructions 7 | 8 | The collector was developed and tested on Linux. It is written in Python and believed to be portable, but no testing has been performed on other platforms. It is suggested that a Linux VM be provisioned to run the collector and the InfluxDB and Grafana components. 9 | 10 | Please note, it is dangerous and unnecessary to install Python packages as root (sudo pip ...). The data insights collector needs no special privileges and can be installed and run as an unprivileged user. Because of this, the recommended way to install the Connector is via a Python virtual environment. The virtual environment installation installs the required Python dependencies into a [Python Virtual Environment](http://docs.python-guide.org/en/latest/dev/virtualenvs/). The Connector is then run directly from the source directory. 11 | 12 | * To install the connector in a virtual environment using the default Python interpreter on the system, run: 13 | 14 | ```sh 15 | ./setup_venv.sh 16 | ``` 17 | 18 | * To explicitly install using "python3" as the interpreter, run 19 | 20 | ```sh 21 | ./setup_venv3.sh 22 | ``` 23 | 24 | The Grafana visualization component can be downloaded from [here](https://grafana.com/grafana/download?pg=get&plcmt=selfmanaged-box1-cta1) 25 | 26 | **Important note** InfluxDB 2.x is incompatible with version 1 and will not work. Please ensure you download and install an InfluxDB version 1.x package (the latest is currently 1.8.10), For installation instructions for the current 1.x (1.8.10) version of Influxdb, refer to [this link](https://portal.influxdata.com/downloads/), scroll down and expand the "Are you interested in InfluxDB 1.x Open Source?" section. 27 | 28 | ## Run Instructions 29 | 30 | * Rename or copy the example configuration file, example_isi_data_insights_d.cfg, to isi_data_insights_d.cfg. The path ./isi_data_insights_d.cfg is the default configuration file path for the Connector. If you use that name and run the Connector from the source directory then you don't have to use the --config parameter to specify a different configuration file. 31 | * Edit isi_data_insights_d.cfg to configure the collector to query the set of Isilon OneFS clusters that you want to monitor. Do this by modifying the config file's clusters parameter. 32 | * The example configuration file is configured to gather and send several sets of stats to InfluxDB via the influxdb_plugin.py. 33 | * If you installed InfluxDB to somewhere other than localhost and/or port 8086 then you'll also need to update the configuration file with the address and port of the InfluxDB instance. 34 | * Activate the virtualenv it before running the Connector by running: 35 | 36 | ```sh 37 | . .venv/bin/activate 38 | ``` 39 | 40 | or, if you installed the Python 3 version, by running: 41 | 42 | ```sh 43 | . 
.venv3/bin/activate 44 | ``` 45 | 46 | * To run the Connector: 47 | 48 | ```sh 49 | ./isi_data_insights_d.py start 50 | ``` 51 | 52 | ## Grafana Setup 53 | 54 | Included with the Connector source code are several Grafana dashboards that make it easy to monitor the health and status of your Isilon clusters. To view the dashboards with Grafana, follow these instructions: 55 | 56 | * [Install and configure Grafana](http://docs.grafana.org/installation/) to use the InfluxDB as a data source. Note that the provided Grafana dashboards have been tested to work with Grafana versions up to and including 8.2.2. Also, note that the influxdb_plugin.py creates and stores the statistics data in a database named isi_data_insights. You'll need that information when following the instructions for adding a data source to Grafana. Also, be sure to configure the isi_data_insights data source as the default Grafana data source using the Grafana Dashboard Admin web-interface. 57 | * Import the Grafana dashboards. 58 | * grafana_cluster_list_dashboard.json 59 | ![Multi-cluster Summary Dashboard Screen Shot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/IsilonDataInsightsMultiClusterSummary.JPG) 60 | * grafana_cluster_capacity_utilization_dashboard.json 61 | ![Cluster Capacity Utilization Dashboard Screen Shot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/IsilonDataInsightsClusterCapacityUtilizationTable.JPG) 62 | * grafana_cluster_detail_dashboard.json 63 | ![Cluster Detail Dashboard Screen Shot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/IsilonDataInsightsClusterDetail.JPG) 64 | * grafana_cluster_protocol_dashboard.json 65 | ![Cluster Protocol Detail Dashboard Screen Shot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/IsilonDataInsightsClusterProtocolDetail.JPG) 66 | 67 | Import the (optional) HDFS-specific dashboards: 68 | 69 | * grafana_hadoop_home.json 70 | ![Hadoop Home Dashboard Screenshot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/HDFS-home-1.png) 71 | * grafana_hadoop_datanodes.json 72 | ![Hadoop DataNodes Dashboard Screenshot](https://raw.githubusercontent.com/Isilon/isilon_data_insights_connector/master/HDFS-datanode-1.png) 73 | 74 | * If you had previously started the Connector, there should already be data in your database displayed in the dashboards. One common issue that might prevent your dashboards from showing up correctly is if the date/time on your Isilon clusters is not closely enough in sync with the date/time used by Grafana. Synchronizing the date/time of all the systems to within a few seconds of each other should be enough to fix the issue. 75 | 76 | ## Kapacitor Integration 77 | 78 | [Kapacitor](https://www.influxdata.com/time-series-platform/kapacitor/) is an add-on component that, when used in conjunction with the Connector, enables flexible, configurable, real-time notifications of alert conditions based on the statistics data streaming into the InfluxDB. For more information on how to integrate the Connector and InfluxDB with Kapacitor refer to: 79 | 80 | [Kapacitor Integration Instructions](https://github.com/Isilon/isilon_data_insights_connector/blob/master/README_KAPACITOR_INTEGRATION.md) 81 | 82 | ## Customizing the Connector 83 | 84 | The Connector is designed to allow for customization via a plugin architecture. The default plugin, influxdb_plugin.py, is configured via the provided example configuration file.
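In practice, a custom stats processor is just a Python module that exposes a small set of functions. The following log-only sketch is purely illustrative (the module name and behavior are examples, and the `key`, `value`, and `devid` fields are the same ones used by the bundled plugins); it follows the plugin interface described in the steps below:

```python
# my_plugin.py - minimal illustrative stats processor (log-only sketch)
import logging

LOG = logging.getLogger(__name__)


def start(argv):
    # argv comes from the optional stats_processor_args config parameter.
    LOG.info("my_plugin started with args: %s", argv)


def process(cluster, stats):
    # cluster is the name/ip-address of the queried cluster; stats is a list
    # of CurrentStatisticsStat instances with key, value, devid, and time.
    for stat in stats:
        LOG.info("%s node=%s %s=%s", cluster, stat.devid, stat.key, stat.value)


def stop():
    LOG.info("my_plugin stopped")
```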
If you would like to process the stats data differently or send them to a different backend than the influxdb_plugin.py you can implement a custom stats processor. Here are the instructions for doing so: 85 | 86 | * Create a file called my_plugin.py, or whatever you want to name it. 87 | * In the my_plugin.py file define a process(cluster, stats) function that takes as input the name/ip-address of a cluster and a list of stats. The list of stats will contain instances of the isi_sdk_8_0/models/CurrentStatisticsStat class or isi_sdk_7_2/models/CurrenStatisticsStat class, but it makes no difference because the two classes are the same regardless of the version. 88 | * Optionally define a start(argv) function that takes a list of input args as defined in the config file via the stats_processor_args parameter. 89 | * Optionally define a stop() function. 90 | * Put the my_plugin.py file somewhere in your PYTHONPATH (easiest is to put into the same directory as the other Python source code files). 91 | * Update the isi_data_insights_d.cfg file with the name of your plugin (i.e. 'my_plugin') 92 | * Restart the isi_data_insights_d.py daemon: 93 | 94 | ```sh 95 | ./isi_data_insights_d.py restart 96 | ``` 97 | 98 | ## Extending and/or Contributing to the Connector 99 | 100 | There are multiple ways for anyone using the Connector to interact with our dev team to request new features or discuss problems. 101 | 102 | * Create a new issue on the [Issues](https://github.com/Isilon/isilon_data_insights_connector/issues) tab. 103 | * Use the [discussion](https://community.emc.com/docs/DOC-48273) capability of the Isilon SDK Info Hub page. 104 | 105 | Also, just like an other project on github.com we are entirely open to external code contributions: 106 | 107 | * Fork the project, modify it, then initiate a pull request. 108 | -------------------------------------------------------------------------------- /isi_stats_client.py: -------------------------------------------------------------------------------- 1 | from builtins import range 2 | from builtins import object 3 | import logging 4 | 5 | 6 | LOG = logging.getLogger(__name__) 7 | # Apache/PAPI has a request URI limit of 8096, MAX_KEYS_LEN is the max 8 | # length of a set of keys that the client will attempt to send. 9 | MAX_KEYS_LEN = 7000 10 | # When getting metadata for multiple stats, if there are less than 11 | # MAX_DIRECT_METADATA_STATS then do the query as multiple direct key queries, 12 | # otherwise do it as a single batch query and filter the results on the client 13 | # side. Testing revealed that 200 is the optimal cutoff point for a virtual 14 | # cluster. 15 | MAX_DIRECT_METADATA_STATS = 200 16 | 17 | 18 | class IsiStatsClient(object): 19 | """ 20 | Handles the details of querying for Isilon cluster statistics values and 21 | metadata using the Isilon SDK. 22 | """ 23 | 24 | def __init__(self, stats_api): 25 | """ 26 | Setup the Isilon SDK to query the specified cluster's statistics. 27 | :param StatisticsApi stats_api: instance of StatisticsApi from the 28 | isi_sdk_8_0 or isi_sdk_7_2 package. 29 | """ 30 | # get the Statistics API 31 | self._stats_api = stats_api 32 | 33 | def query_stats( 34 | self, 35 | stats, 36 | devid="all", 37 | substr=False, 38 | timeout=60, 39 | degraded=True, 40 | expand_clientid=False, 41 | ): 42 | """ 43 | Queries the cluster for a list of stat values. Note: this function only 44 | works on OneFS 8.0 or newer. 
45 | :param list stats: a list of stat names to query 46 | :param string devid: The node number or "all" to query all nodes. 47 | :param bool substr: If True, makes the 'keys' arg perform a partial 48 | match. 49 | :param int timeout: Time in seconds to wait for results from remote 50 | nodes. 51 | :param bool degraded: If true, try to continue even if some stats are 52 | unavailable. 53 | :param bool expand_clientid: If true, use name resolution to expand 54 | client addresses and other IDs. 55 | :returns: a list of isi_sdk.models.StatisticsCurrentStat 56 | instances corresponding to the list of stat names provided in the stats 57 | input list. 58 | """ 59 | # setup the stat keys for querying as set of comma delimitted values 60 | combined_query_results = None 61 | stat_keys = ",".join(stats) 62 | stat_index = 0 63 | stat_keys_len = len(stat_keys) 64 | while stat_index < stat_keys_len: 65 | if stat_keys_len - stat_index > MAX_KEYS_LEN: 66 | # find the last comma between stat_index and 67 | # stat_index + MAX_KEYS_LEN 68 | next_stat_index = stat_keys.rfind( 69 | ",", stat_index, stat_index + MAX_KEYS_LEN 70 | ) 71 | # unless there's a key that is longer than MAX_KEYS_LEN 72 | # then the rfind should never return -1 because there should 73 | # definitely be at least one comma. 74 | query_keys = stat_keys[stat_index:next_stat_index] 75 | stat_index = next_stat_index + 1 76 | else: 77 | query_keys = stat_keys[stat_index:] 78 | stat_index = stat_keys_len 79 | 80 | query_result = self._stats_api.get_statistics_current( 81 | keys=query_keys, 82 | devid=devid, 83 | substr=substr, 84 | degraded=degraded, 85 | expand_clientid=expand_clientid, 86 | timeout=timeout, 87 | ) 88 | 89 | if combined_query_results is None: 90 | combined_query_results = query_result 91 | else: 92 | combined_query_results.stats.extend(query_result.stats) 93 | 94 | # return the list of stats only (at this point there are no other 95 | # fields on the query_results data model). 96 | return combined_query_results.stats 97 | 98 | def query_stat( 99 | self, stat, devid="all", timeout=60, degraded=True, expand_clientid=False 100 | ): 101 | """ 102 | Queries the cluster for a single stat's value. Note: this function 103 | works on OneFS 7.2 or newer clusters. 104 | :param string stats: the name of the stat to query 105 | :param string devid: The node number or "all" to query all nodes. 106 | :param int timeout: Time in seconds to wait for results from remote 107 | nodes. 108 | :param bool degraded: If true, try to continue even if some stats are 109 | unavailable. 110 | :param bool expand_clientid: If true, use name resolution to expand 111 | client addresses and other IDs. 112 | :returns: an instance of isi_sdk.models.StatisticsCurrentStat 113 | """ 114 | query_result = self._stats_api.get_statistics_current( 115 | key=stat, 116 | devid=devid, 117 | degraded=degraded, 118 | expand_clientid=expand_clientid, 119 | timeout=timeout, 120 | ) 121 | 122 | return query_result.stats 123 | 124 | def get_stats_metadata(self, stats=None): 125 | """ 126 | Query the cluster for the metadata associated with each key specified 127 | in the stats list or all stats if stats is None. 128 | :param list stats: list of statistic keys to query. 129 | :returns: a list of isi_sdk.models.StatisticsKey instances (in 130 | the same order as the stats input param list). 
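        For example, get_stats_metadata(["ifs.bytes.used", "cluster.health"])
        returns the StatisticsKey metadata for those two stats, in that order.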
131 | """ 132 | if stats is not None and len(stats) < MAX_DIRECT_METADATA_STATS: 133 | return self._get_metadata_direct(stats) 134 | return self._get_metadata_indirect(stats) 135 | 136 | def get_stat_metadata(self, stat): 137 | """ 138 | Query the cluster for the metadata of a specific stat. 139 | :param string stat: the name of the stat to query 140 | :returns: a single isi_sdk.models.StatisticsKey. 141 | """ 142 | result = self._stats_api.get_statistics_key(statistics_key_id=stat) 143 | return result.keys[0] 144 | 145 | def _get_metadata_indirect(self, stats): 146 | """ 147 | Get the metadata for every single stat and then filter it down to the 148 | list of stats specified in the stats param. 149 | :param list stats: the list of stats to return metadata for, or if it 150 | is None then return all metadata. 151 | :returns: a list of isi_sdk.models.StatisticsKey instances. 152 | """ 153 | stat_map = {} 154 | if stats is not None: 155 | num_stats = len(stats) 156 | for stat_index in range(0, num_stats): 157 | stat_map[stats[stat_index]] = stat_index 158 | result_list = [None] * num_stats 159 | else: 160 | result_list = [] 161 | query_args = dict() 162 | while True: 163 | results = self._stats_api.get_statistics_keys(**query_args) 164 | if stats is None: 165 | if result_list is None: 166 | result_list = results.keys 167 | else: 168 | result_list.extend(results.keys) 169 | else: 170 | for key in results.keys: 171 | try: 172 | stat_index = stat_map[key.key] 173 | result_list[stat_index] = key 174 | num_stats -= 1 175 | if num_stats == 0: 176 | break 177 | except KeyError: 178 | pass 179 | 180 | resume = results.resume 181 | if resume is None: 182 | break 183 | query_args["resume"] = resume 184 | 185 | return result_list 186 | 187 | def _get_metadata_direct(self, stats): 188 | """ 189 | Get the metadata for the list of stats provided in the stats list input 190 | parameter by sending an individual request for each stat. When the list 191 | of stats is small(er) then this method is faster than querying for all 192 | the stats metadata and filtering it (see _get_metadata_indirect). 193 | :param list stats: the list of stat names to query for metadata. 194 | :returns: a list of isi_sdk.models.StatisticsKey instances. 195 | """ 196 | metadata_list = [] 197 | for stat in stats: 198 | metadata = self.get_stat_metadata(stat) 199 | metadata_list.append(metadata) 200 | return metadata_list 201 | -------------------------------------------------------------------------------- /influxdb_plugin.py: -------------------------------------------------------------------------------- 1 | from __future__ import print_function 2 | from builtins import input 3 | from builtins import str 4 | from builtins import range 5 | from builtins import object 6 | from future.utils import string_types 7 | from influxdb import InfluxDBClient 8 | from influxdb.exceptions import InfluxDBServerError, InfluxDBClientError 9 | from ast import literal_eval 10 | 11 | import getpass 12 | import logging 13 | import requests.exceptions 14 | import sys 15 | 16 | 17 | class StatsProcessorState(object): 18 | def __init__(self): 19 | self.influxdb_points = None 20 | self.points_written = None 21 | self.reset() 22 | 23 | def reset(self): 24 | self.influxdb_points = [] 25 | self.points_written = 0 26 | 27 | 28 | # influxdb_plugin state 29 | g_state = StatsProcessorState() 30 | 31 | # InfluxDBClient interface 32 | g_client = None 33 | LOG = logging.getLogger(__name__) 34 | 35 | # Number of points to queue up before writing it to the database. 
36 | MAX_POINTS_PER_WRITE = 100 37 | # separator used to concatenate stat keys with sub-keys derived from stats 38 | # whose value is a dict or list. 39 | SUB_KEY_SEPARATOR = "." 40 | 41 | 42 | def start(argv): 43 | """ 44 | Instantiate an InfluxDBClient. The expected inputs are the host/address and 45 | port of the InfluxDB and the name of the database to use. If the database 46 | does not exist then it will be created. If the fourth arg is "auth" then it 47 | will prompt the user for the InfluxDB's username and password. 48 | """ 49 | influxdb_host = argv[0] 50 | influxdb_port = int(argv[1]) 51 | influxdb_name = argv[2] 52 | influxdb_ssl = False 53 | influxdb_verifyssl = False 54 | influxdb_username = "root" 55 | influxdb_password = "root" 56 | 57 | if len(argv) > 3: 58 | if argv[3] == "auth": 59 | influxdb_username = input("InfluxDB username: ") 60 | influxdb_password = getpass.getpass("Password: ") 61 | else: 62 | influxdb_username = argv[3] 63 | influxdb_password = argv[4] 64 | influxdb_ssl = literal_eval(argv[5]) 65 | influxdb_verifyssl = literal_eval(argv[6]) 66 | 67 | LOG.info( 68 | "Connecting to: %s@%s:%d database:%s ssl=%s verify_ssl=%s.", 69 | influxdb_username, 70 | influxdb_host, 71 | influxdb_port, 72 | influxdb_name, 73 | influxdb_ssl, 74 | influxdb_verifyssl, 75 | ) 76 | 77 | global g_client 78 | g_client = InfluxDBClient( 79 | host=influxdb_host, 80 | port=influxdb_port, 81 | database=influxdb_name, 82 | username=influxdb_username, 83 | password=influxdb_password, 84 | ssl=influxdb_ssl, 85 | verify_ssl=influxdb_verifyssl 86 | ) 87 | 88 | create_database = True 89 | try: 90 | databases = g_client.get_list_database() 91 | except (requests.exceptions.ConnectionError, InfluxDBClientError) as exc: 92 | print( 93 | "Failed to connect to InfluxDB server at %s:%s " 94 | "database: %s.\nERROR: %s" 95 | % (influxdb_host, str(influxdb_port), influxdb_name, str(exc)), 96 | file=sys.stderr, 97 | ) 98 | sys.exit(1) 99 | 100 | for database in databases: 101 | if database["name"] == influxdb_name: 102 | create_database = False 103 | break 104 | 105 | if create_database is True: 106 | LOG.info("Creating database: %s.", influxdb_name) 107 | g_client.create_database(influxdb_name) 108 | 109 | 110 | def begin_process(cluster): 111 | LOG.debug("Begin processing %s stats.", cluster) 112 | 113 | 114 | def process_stat(cluster, stat): 115 | """ 116 | Convert Isilon stat query result to InfluxDB point and send to the 117 | InfluxDB service. Organize the measurements by cluster and node via tags. 118 | """ 119 | # Process stat(s) and then write points if list is large enough. 
120 | tags = {"cluster": cluster} 121 | if stat.devid != 0: 122 | tags["node"] = stat.devid 123 | 124 | influxdb_points = _influxdb_points_from_stat(stat.time, tags, stat.key, stat.value) 125 | if influxdb_points == []: 126 | return 127 | for influxdb_point in influxdb_points: 128 | if len(influxdb_point["fields"]) > 0: 129 | g_state.influxdb_points.append(influxdb_point) 130 | num_points = len(g_state.influxdb_points) 131 | if num_points > MAX_POINTS_PER_WRITE: 132 | g_state.points_written += _write_points( 133 | g_state.influxdb_points, num_points 134 | ) 135 | g_state.influxdb_points = [] 136 | 137 | 138 | def end_process(cluster): 139 | # send left over points to influxdb 140 | num_points = len(g_state.influxdb_points) 141 | if num_points > 0: 142 | g_state.points_written += _write_points(g_state.influxdb_points, num_points) 143 | LOG.debug( 144 | "Done processing %s stats, wrote %d points.", cluster, g_state.points_written 145 | ) 146 | g_state.reset() 147 | 148 | 149 | def _add_field(fields, field_name, field_value, field_value_type): 150 | if field_value_type == int: 151 | # convert integers to float because InfluxDB only supports 64 bit 152 | # signed integers, so doing this prevents an "out of range" error when 153 | # inserting values that are unsigned 64 bit integers. 154 | # Note that it is not clear if the PAPI is smart enough to always 155 | # encode 64 bit unsigned integers as type 'long' even when the actual 156 | # value is fits into a 64 bit signed integer and because InfluxDB 157 | # wants a measurement to always be of the same type, the safest thing 158 | # to do is convert integers to float. 159 | field_value = float(field_value) 160 | fields.append((field_name, field_value)) 161 | 162 | 163 | def _process_stat_dict(stat_value, fields, tags, prefix=""): 164 | """ 165 | Add (field_name, field_value) tuples to the fields list for any 166 | non-string or non-"id" items in the stat_value dict so that they can be 167 | used for the "fields" parameter of the InfluxDB point. 168 | Any string or keys with "id" on the end of their name get turned into tags. 169 | """ 170 | for key, value in stat_value.items(): 171 | value_type = type(value) 172 | field_name = prefix + key 173 | if isinstance(value, string_types) or (key[-2:] == "id" and value_type == int): 174 | tags[field_name] = value 175 | elif value_type == list: 176 | list_prefix = field_name + SUB_KEY_SEPARATOR 177 | _process_stat_list(value, fields, tags, list_prefix) 178 | elif value_type == dict: 179 | dict_prefix = field_name + SUB_KEY_SEPARATOR 180 | _process_stat_dict(value, fields, tags, dict_prefix) 181 | else: 182 | _add_field(fields, field_name, value, value_type) 183 | 184 | 185 | def _process_stat_list(stat_value, fields, tags, prefix=""): 186 | """ 187 | Add (field_name, field_value) tuples to the fields list for any 188 | non-string or non-"id" items in the stat_value dict so that they can be 189 | used for the "fields" parameter of the InfluxDB point. 190 | """ 191 | field_name = prefix + "value" 192 | for index in range(0, len(stat_value)): 193 | list_value = stat_value[index] 194 | value_type = type(list_value) 195 | if value_type == dict: 196 | _process_stat_dict(list_value, fields, tags, prefix) 197 | else: 198 | item_name = field_name + SUB_KEY_SEPARATOR + str(index) 199 | if value_type == list: 200 | # AFAIK there are no instances of a list that contains a list 201 | # but just in case one is added in the future, deal with it. 
202 | item_name += SUB_KEY_SEPARATOR 203 | _process_stat_list(list_value, fields, tags, item_name) 204 | else: 205 | _add_field(fields, item_name, list_value, value_type) 206 | 207 | 208 | def _influxdb_points_from_stat(stat_time, tags, stat_key, stat_value): 209 | """ 210 | Create InfluxDB points/measurements from the stat query result. 211 | """ 212 | points = [] 213 | fields = [] 214 | stat_value_type = type(stat_value) 215 | if stat_value_type == list: 216 | for stat in stat_value: 217 | (fields, point_tags) = _influxdb_point_from_stat( 218 | stat_time, tags, stat_key, stat 219 | ) 220 | points.append( 221 | _build_influxdb_point(stat_time, point_tags, stat_key, fields) 222 | ) 223 | elif stat_value_type == dict: 224 | point_tags = tags.copy() 225 | _process_stat_dict(stat_value, fields, point_tags) 226 | points.append(_build_influxdb_point(stat_time, point_tags, stat_key, fields)) 227 | else: 228 | if stat_value == "": 229 | return None # InfluxDB does not like empty string stats 230 | _add_field(fields, "value", stat_value, stat_value_type) 231 | points.append(_build_influxdb_point(stat_time, tags.copy(), stat_key, fields)) 232 | return points 233 | 234 | 235 | def _influxdb_point_from_stat(stat_time, tags, stat_key, stat_value): 236 | """ 237 | Create InfluxDB points/measurements from the stat query result. 238 | """ 239 | point_tags = tags.copy() 240 | fields = [] 241 | stat_value_type = type(stat_value) 242 | if stat_value_type == dict: 243 | _process_stat_dict(stat_value, fields, point_tags) 244 | elif stat_value_type == list: 245 | _process_stat_list(stat_value, fields, point_tags) 246 | else: 247 | if stat_value == "": 248 | return None # InfluxDB does not like empty string stats 249 | _add_field(fields, "value", stat_value, stat_value_type) 250 | return (fields, point_tags) 251 | 252 | 253 | def _build_influxdb_point(unix_ts_secs, tags, measurement, fields): 254 | """ 255 | Build the json for an InfluxDB data point. 256 | """ 257 | timestamp_ns = unix_ts_secs * 1000000000 # convert to nanoseconds 258 | point_json = { 259 | "measurement": measurement, 260 | "tags": tags, 261 | "time": timestamp_ns, 262 | "fields": {}, 263 | } 264 | 265 | for field_name, field_value in fields: 266 | point_json["fields"][field_name] = field_value 267 | 268 | return point_json 269 | 270 | 271 | def _get_point_names(points): 272 | names = "" 273 | for point in points: 274 | names += point["measurement"] 275 | names += " " 276 | return names 277 | 278 | 279 | def _write_points(points, num_points): 280 | """ 281 | Write the points to the InfluxDB in groups that are MAX_POINTS_PER_WRITE in 282 | size. 
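    Returns the number of points that were successfully written.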
283 | """ 284 | LOG.debug("Writing points %d", num_points) 285 | write_index = 0 286 | points_written = 0 287 | while write_index < num_points: 288 | max_write_index = write_index + MAX_POINTS_PER_WRITE 289 | write_points = points[write_index:max_write_index] 290 | try: 291 | g_client.write_points(write_points) 292 | points_written += len(write_points) 293 | except InfluxDBServerError as svr_exc: 294 | LOG.error( 295 | "InfluxDBServerError: %s\nFailed to write points: %s", 296 | str(svr_exc), 297 | _get_point_names(write_points), 298 | ) 299 | except InfluxDBClientError as client_exc: 300 | LOG.error( 301 | "InfluxDBClientError writing points: %s\n" "Error: %s", 302 | _get_point_names(write_points), 303 | str(client_exc), 304 | ) 305 | except requests.exceptions.ConnectionError as req_exc: 306 | LOG.error( 307 | "ConnectionError exception caught writing points: %s\n" "Error: %s", 308 | _get_point_names(write_points), 309 | str(req_exc), 310 | ) 311 | write_index += MAX_POINTS_PER_WRITE 312 | 313 | return points_written 314 | -------------------------------------------------------------------------------- /example_isi_data_insights_d.cfg: -------------------------------------------------------------------------------- 1 | [isi_data_insights_d] 2 | # Parameters specified on the command line will supersede the parameters 3 | # in this section. 4 | # pid_file: /var/run/isi_data_insights_d.pid 5 | # log_file: /var/run/isi_data_insights_d.log 6 | # default log_level is INFO 7 | # log_level: DEBUG 8 | stats_processor: influxdb_plugin 9 | # Use "auth" as the 4th arg in order to be prompted for the 10 | # InfluxDB username and password stats_processor_args 11 | # or 12 | # define the credentials in this config 13 | # examples: 14 | # localhost 8086 isi_data_insights auth 15 | # or without prompting 16 | # localhost 8086 isi_data_insights username password ssl=True/False verify_ssl=True/False 17 | stats_processor_args: localhost 8086 isi_data_insights 18 | 19 | # clusters in this section are queried for all stat groups 20 | # clusters: [username1:password1@][:True|False] 21 | # [[username2:password2]@[:True|False]] 22 | # [[username3:password3]@[:True|False]] 23 | # ... 24 | # If you don't specify the username and password then you will be prompted 25 | # for them when the daemon starts up. 26 | # Use the optional True or False on the end to specify whether the cluster's 27 | # SSL certificate should be verified. If it is omitted then the default is 28 | # False (i.e. don't verify SSL cert). 29 | clusters: 30 | 31 | # Specifies the active list of stat groups to query, each stat group name 32 | # specified here should have a corresponding section in the config file. 33 | active_stat_groups: cluster_cpu_stats 34 | cluster_network_traffic_stats 35 | cluster_client_activity_stats 36 | cluster_health_stats 37 | ifs_space_stats 38 | ifs_rate_stats 39 | node_load_stats 40 | node_disk_stats 41 | node_net_stats 42 | cluster_disk_rate_stats 43 | cluster_proto_stats 44 | cache_stats 45 | heat_total_stats 46 | 47 | # The min_update_interval_override param provides ability to override the 48 | # minimum interval that the daemon will query for a set of stats. The purpose 49 | # of the minimum interval, which defaults to 30 seconds, is to prevent 50 | # the daemon's queries from putting too much stress on the cluster. 51 | # The default value is 30 seconds. 
52 | # min_update_interval_override: 15 53 | 54 | [cluster_cpu_stats] 55 | # The clusters (optional) param defines a list of clusters specific to this 56 | # group. 57 | # clusters: 10.25.69.74 10.25.69.75 58 | # update interval is in seconds or use * to base the update interval 59 | # off each stat's collection interval (i.e. *2 == 2 times the collection 60 | # interval, *1 == * == 1 times the collection invterval of each stat) 61 | update_interval: * 62 | stats: cluster.cpu.sys.avg 63 | cluster.cpu.user.avg 64 | cluster.cpu.idle.avg 65 | cluster.cpu.intr.avg 66 | 67 | [cluster_network_traffic_stats] 68 | update_interval: * 69 | stats: cluster.net.ext.bytes.in.rate 70 | cluster.net.ext.bytes.out.rate 71 | cluster.net.ext.packets.in.rate 72 | cluster.net.ext.packets.out.rate 73 | cluster.net.ext.errors.in.rate 74 | cluster.net.ext.errors.out.rate 75 | 76 | [cluster_client_activity_stats] 77 | update_interval: * 78 | stats: node.clientstats.active.ftp 79 | node.clientstats.active.hdfs 80 | node.clientstats.active.http 81 | node.clientstats.active.lsass_out 82 | node.clientstats.active.jobd 83 | node.clientstats.active.nfs 84 | node.clientstats.active.nfs4 85 | node.clientstats.active.nlm 86 | node.clientstats.active.papi 87 | node.clientstats.active.siq 88 | node.clientstats.active.cifs 89 | node.clientstats.active.smb2 90 | node.clientstats.connected.ftp 91 | node.clientstats.connected.hdfs 92 | node.clientstats.connected.http 93 | node.clientstats.connected.nfs 94 | node.clientstats.connected.nlm 95 | node.clientstats.connected.papi 96 | node.clientstats.connected.siq 97 | node.clientstats.connected.cifs 98 | 99 | [cluster_health_stats] 100 | update_interval: * 101 | stats: cluster.health 102 | cluster.node.count.all 103 | cluster.node.count.down 104 | 105 | [ifs_space_stats] 106 | update_interval: * 107 | stats: ifs.bytes.avail 108 | ifs.bytes.free 109 | ifs.bytes.used 110 | ifs.bytes.total 111 | ifs.percent.free 112 | ifs.percent.avail 113 | ifs.percent.used 114 | 115 | [ifs_rate_stats] 116 | update_interval: * 117 | stats: ifs.bytes.in.rate 118 | ifs.bytes.out.rate 119 | ifs.ops.in.rate 120 | ifs.ops.out.rate 121 | 122 | [node_load_stats] 123 | update_interval: * 124 | stats: node.load.1min 125 | node.load.5min 126 | node.load.15min 127 | node.memory.used 128 | node.memory.free 129 | node.open.files 130 | 131 | [node_disk_stats] 132 | update_interval: * 133 | stats: node.disk.bytes.out.rate.avg 134 | node.disk.bytes.in.rate.avg 135 | node.disk.busy.avg 136 | node.disk.xfers.out.rate.avg 137 | node.disk.xfers.in.rate.avg 138 | node.disk.xfer.size.out.avg 139 | node.disk.xfer.size.in.avg 140 | node.disk.access.latency.avg 141 | node.disk.access.slow.avg 142 | node.disk.iosched.queue.avg 143 | node.disk.iosched.latency.avg 144 | 145 | [node_net_stats] 146 | update_interval: * 147 | stats: node.net.int.bytes.in.rate 148 | node.net.int.bytes.out.rate 149 | node.net.ext.bytes.in.rate 150 | node.net.ext.bytes.out.rate 151 | node.net.int.errors.in.rate 152 | node.net.int.errors.out.rate 153 | node.net.ext.errors.in.rate 154 | node.net.ext.errors.out.rate 155 | 156 | [cluster_disk_rate_stats] 157 | update_interval: * 158 | stats: cluster.disk.xfers.rate 159 | cluster.disk.xfers.in.rate 160 | cluster.disk.xfers.out.rate 161 | cluster.disk.bytes.in.rate 162 | cluster.disk.bytes.out.rate 163 | 164 | [cluster_proto_stats] 165 | update_interval: * 166 | stats: cluster.protostats.nfs 167 | cluster.protostats.nlm 168 | cluster.protostats.cifs 169 | cluster.protostats.ftp 170 | cluster.protostats.http 
171 | cluster.protostats.siq 172 | cluster.protostats.jobd 173 | cluster.protostats.smb2 174 | cluster.protostats.nfs4 175 | cluster.protostats.irp 176 | cluster.protostats.lsass_in 177 | cluster.protostats.lsass_out 178 | cluster.protostats.papi 179 | cluster.protostats.hdfs 180 | cluster.protostats.nfs.total 181 | cluster.protostats.nlm.total 182 | cluster.protostats.cifs.total 183 | cluster.protostats.ftp.total 184 | cluster.protostats.http.total 185 | cluster.protostats.siq.total 186 | cluster.protostats.jobd.total 187 | cluster.protostats.smb2.total 188 | cluster.protostats.nfs4.total 189 | cluster.protostats.irp.total 190 | cluster.protostats.lsass_in.total 191 | cluster.protostats.lsass_out.total 192 | cluster.protostats.papi.total 193 | cluster.protostats.hdfs.total 194 | 195 | [cache_stats] 196 | update_interval: * 197 | stats: node.ifs.cache 198 | 199 | [heat_total_stats] 200 | update_interval: * 201 | stats: node.ifs.heat.lock.total 202 | node.ifs.heat.blocked.total 203 | node.ifs.heat.contended.total 204 | node.ifs.heat.deadlocked.total 205 | node.ifs.heat.write.total 206 | node.ifs.heat.read.total 207 | node.ifs.heat.lookup.total 208 | node.ifs.heat.rename.total 209 | node.ifs.heat.link.total 210 | node.ifs.heat.unlink.total 211 | node.ifs.heat.getattr.total 212 | node.ifs.heat.setattr.total 213 | 214 | # These stats are not currently active by default. They are here to serve as an example of how to use the 215 | # derived stats functionality. See the comments below for more details. 216 | [concurrency_stats] 217 | update_interval: * 218 | stats: node.ifs.ops.in node.ifs.ops.out node.disk.iosched.latency.avg 219 | cluster.protostats.nfs.total 220 | cluster.protostats.nfs.total 221 | cluster.protostats.smb2.total 222 | cluster.protostats.nlm.total 223 | cluster.protostats.cifs.total 224 | cluster.protostats.http.total 225 | cluster.protostats.siq.total 226 | cluster.protostats.nfs4.total 227 | cluster.protostats.hdfs.total 228 | cluster.protostats.ftp.total 229 | # The composite_stats, equation_stats, percent_change_stats, and final_equation_stats sections allow you to 230 | # specify new stats that are derived from the values of other stats. You can derive stats from base stats 231 | # or even specific fields or indices within a base stat's value, which is actually required if the 232 | # base stat's value is not a float or integer (i.e. it is a dict or list). See below for more 233 | # info on each type of derived stat. 234 | 235 | #### Composite Stats Description ##### 236 | # The composite_stats parameter specifies a list of node-specific stats (i.e. stats whose names 237 | # start with "node.") where each stat is composited across the entire cluster using the specified 238 | # operation. Supported operations at this time are avg, max, min, and sum. 239 | # The output name of a composite_stat is: cluster.<base stat name>[.<field1>[.<field2>...]].<operation>, 240 | # so for the three stats above it would be cluster.node.ifs.ops.in.sum, 241 | # cluster.node.ifs.ops.out.sum, and cluster.node.disk.iosched.latency.avg.avg. If the base stat 242 | # contains one or more fields then those are appended to the name with '.'
as delimiter, e.g.: 243 | # sum(node.protostats.nfs.total:op_count) -> cluster.node.protostats.nfs.total.op_count.sum 244 | composite_stats: sum(node.ifs.ops.in) sum(node.ifs.ops.out) avg(node.disk.iosched.latency.avg) 245 | 246 | 247 | #### Equation Stats Description ##### 248 | # The equation_stats parameter specifies a list of output stat names for stats that will be 249 | # derived from an equation that takes as input either base stat values or composite_stats values. 250 | # The equation for each equation stat is specified in a parameter named the same as the equation 251 | # stat. 252 | equation_stats: cluster.ifs.concurrency cluster.protostats.all.total.op_count cluster.protostats.all.total.time_avg 253 | # This is the definition of the equation used to compute the cluster.ifs.concurrency stat. 254 | # Any of the base stats or any composite stat can be used in the equation expression. Any 255 | # expression supported by the Equation package of Python can be used: 256 | # https://pypi.python.org/pypi/Equation 257 | cluster.ifs.concurrency: (cluster.node.ifs.ops.in.sum + cluster.node.ifs.ops.out.sum) * cluster.node.disk.iosched.latency.avg.avg 258 | # The cluster.protostats.all.total.op_count stat is a sum of the different protocols' op_count values. 259 | # This equation shows an example of how to select a specific field within a stat that returns a dict, in this case the op_count 260 | # field. Note that some stats are returned as a list that always contains only a single dict item - in those cases the value is treated 261 | # as if it were just a dict. Otherwise, to index into a list you would use numeric field names after the colon. Multiple field 262 | # names or list indices are allowed (i.e. node.example.stat:field1:field2:field3...). 263 | cluster.protostats.all.total.op_count: cluster.protostats.nfs.total:op_count + cluster.protostats.nfs.total:op_count + cluster.protostats.smb2.total:op_count + cluster.protostats.nlm.total:op_count + cluster.protostats.cifs.total:op_count + cluster.protostats.http.total:op_count + cluster.protostats.siq.total:op_count + cluster.protostats.nfs4.total:op_count + cluster.protostats.hdfs.total:op_count + cluster.protostats.ftp.total:op_count 264 | # This stat computes the sum of the time_avg field and then takes an average. 265 | cluster.protostats.all.total.time_avg: (cluster.protostats.nfs.total:time_avg + cluster.protostats.nfs.total:time_avg + cluster.protostats.smb2.total:time_avg + cluster.protostats.nlm.total:time_avg + cluster.protostats.cifs.total:time_avg + cluster.protostats.http.total:time_avg + cluster.protostats.siq.total:time_avg + cluster.protostats.nfs4.total:time_avg + cluster.protostats.hdfs.total:time_avg + cluster.protostats.ftp.total:time_avg) / 10.0 266 | 267 | #### Percent Change Stats Description ##### 268 | # The percent_change_stats section specifies a list of base stats, composite stats, and/or equation 269 | # stats whose percent change from one measurement to the next will be stored in a new stat whose 270 | # name will be <input stat name>.percentchange 271 | percent_change_stats: cluster.node.disk.iosched.latency.avg.avg cluster.protostats.all.total.time_avg 272 | 273 | #### Final Equation Stats Description ##### 274 | # The final_equation_stats section is the same as the equation_stats section except these equations have access to base stats and all of the previously 275 | # defined derived stats as input. Again, list the names of the output stats and then specify the equation for each output stat in a parameter of that same 276 | # name.
277 | final_equation_stats: cluster.ifs.concurrency.importance 278 | # Definition of the cluster.ifs.concurrency.importance final equation stat 279 | cluster.ifs.concurrency.importance: (cluster.protostats.all.total.op_count * cluster.protostats.all.total.time_avg) * cluster.node.disk.iosched.latency.avg.avg.percentchange 280 | -------------------------------------------------------------------------------- /README_KAPACITOR_INTEGRATION.md: -------------------------------------------------------------------------------- 1 | # Kapacitor Integration 2 | Kapacitor (https://www.influxdata.com/time-series-platform/kapacitor/) is an add-on component that, when used in conjunction with the Connector, enables flexible, configurable, real-time notifications of alert conditions based off the statistics data streaming into InfluxDB. Kapacitor leverages the ability to subscribe to updates to the InfluxDB database to provide this capability. 3 | 4 | # Initial setup 5 | First, set up InfluxDB and the Data Insights Connector following the instructions outlined in the README.md file. Then follow these instructions to install and set up Kapacitor: 6 | 7 | Install Kapacitor from https://www.influxdata.com/downloads/#kapacitor 8 | 9 | The getting started page (https://docs.influxdata.com/kapacitor/v1.0/introduction/getting_started/) contains useful examples, but is not entirely pertinent to this use case since it leverages Telegraf to generate statistics. In this case, you already have sets of statistics (measurements) in InfluxDB being fed by the Connector. After you have installed Kapacitor, you will need to configure it. 10 | 11 | The Kapacitor installation package already includes the configuration file (/etc/kapacitor/kapacitor.conf) so there is no need to generate one. Edit /etc/kapacitor/kapacitor.conf to change the alert provider configurations as necessary. For instance, to enable email alerts, find the section beginning “[smtp]” and modify the configuration to utilize an available SMTP provider. 12 | 13 | # Kapacitor Scripting 14 | 15 | ## Introduction 16 | Kapacitor uses one or more tasks that are defined using “TICK” scripts to control what data should be filtered, how it should be filtered, and what criteria to use to alert based off the data. The TICK scripts are a domain-specific language (DSL) and are somewhat tersely documented on the Kapacitor documentation site (https://docs.influxdata.com/kapacitor/v1.0/). This document presents some example scripts and some patterns that enable more sophisticated alerting criteria (e.g. a moving average). 17 | 18 | ## How to create and enable a TICK task 19 | Edit the script using your favorite text editor. It is suggested that the names of these scripts use the “.tick” extension, e.g. “nfs_avg_lat_alert.tick”. 20 | Next, install the script into Kapacitor using the CLI. The generic form of the command is: 21 | 22 | ```sh 23 | kapacitor define <task_name> -type stream -tick <path_to_script> -dbrp isi_data_insights.autogen 24 | ``` 25 | 26 | The internal name (<task_name>) should be something descriptive. These examples only show the use of stream scripts, but note that Kapacitor can also perform batch processing. The path (<path_to_script>) is simply the location of the TICK script you just edited. The “-dbrp” argument specifies the InfluxDB “database retention policy”. Since we are using the Isilon Data Insights Connector database, the correct value for our examples is “isi_data_insights.autogen”; this value would differ if a different source database were in use.
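If you are unsure which database and retention policy names exist on your InfluxDB host, you can list them with the InfluxDB 1.x “influx” CLI before defining the task. This is only a quick sanity check; the database name shown below is the Connector’s default:

```sh
# List the databases on the InfluxDB host.
influx -execute 'SHOW DATABASES'

# List the retention policies for the Connector's default database.
influx -execute 'SHOW RETENTION POLICIES ON isi_data_insights'
```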
If we are using “nfs_avg_lat_alert.tick” as our example script, then the command to define the task would be: 27 | ```sh 28 | kapacitor define nfs_lat_alert -type stream -tick /root/nfs_avg_lat_alert.tick -dbrp isi_data_insights.autogen 29 | ``` 30 | 31 | Here is the “nfs_avg_lat_alert.tick” script: 32 | ``` 33 | stream 34 | // Select avg NFS3 proto response time 35 | |from() 36 | .database('isi_data_insights') 37 | .measurement('cluster.protostats.nfs.total') 38 | |eval(lambda: float("time_avg") / 1000.0) 39 | .as('time_ms') 40 | |groupBy('cluster') 41 | |alert() 42 | .id('{{ index .Tags "cluster" }}/{{ .Name }}') 43 | .message('Average value of {{ .ID }} is {{ .Level}} value: {{ index .Fields "time_ms" }}ms') 44 | .crit(lambda: "time_ms" > 50.0) 45 | .warn(lambda: "time_ms" > 20.0) 46 | // Only warn every 15 mins if we haven't changed state 47 | .stateChangesOnly(15m) 48 | // Whenever we get an alert write it to a file. 49 | .log('/tmp/alerts.log') 50 | .slack() 51 | ``` 52 | Breaking it down: 53 | * This is a stream filter so it starts with “stream”. 54 | * Next, the script specifies where to pull its data from. In this case, the “isi_data_insights” database, which is the default database created and populated by the Connector. This script selects a single measurement: “cluster.protostats.nfs.total”, which contains the totaled (clusterwide as opposed to node-specific) NFS3 protocol statistics. 55 | * Next, the script specifies an “eval” node which takes the “time_avg” field of the measurement and divides it by 1000. Note that the statistics values are in microseconds. Hence, this node is converting the values to milliseconds. 56 | * Next, the script uses a “groupBy” node that groups by the measurement tag “cluster” because the statistics for each cluster are distinct (e.g. we don’t want a low value from one cluster resetting the alert threshold of another cluster). 57 | * Finally, the “alert” node. This is quite detailed (see next section for details). 58 | 59 | Alert node details: 60 | * First it defines the alert id that appears in the messages. In this case it will be <cluster name>/nfs_lat_alert 61 | * Next it defines the format of the message that appears in the alert. “.Level” is the alert level (crit, warn, info, ok). We index into the fields of the measurement to extract the “time_ms” field we generated to show the actual time value. 62 | * The “.crit” and “.warn” nodes define a Boolean lambda function that determines whether that alert level has been reached. In this case, we’re defining the critical level to be a latency of greater than 50ms, and the warning level to be a latency of greater than 20ms. 63 | * Lastly, the “.stateChangesOnly” property acts as a squelch: if the alert level hasn’t changed, the alert is re-sent at most every 15 minutes, so we don’t get spammed with messages every 30 seconds. 64 | * The ”.log” node simply logs these alerts to a local file (useful for testing). 65 | * In this case, the alert is configured to use the Slack channel. This can be changed to use “.email” if that has been configured in the /etc/kapacitor/kapacitor.conf file, or “.post” to use the HTTP POST method on a given URL. Numerous other alert channels are available. See the Kapacitor documentation for details. 66 | 67 | Provided the syntax is correct, and the correct command is used, the task should now be defined in Kapacitor.
However, it won’t be enabled: 68 | ```sh 69 | kapacitor list tasks 70 | ID Type Status Executing Databases and Retention Policies 71 | nfs_lat_alert stream disabled false ["isi_data_insights"."autogen"] 72 | ``` 73 | To enable the task, simply type: 74 | ```sh 75 | kapacitor enable nfs_lat_alert 76 | ``` 77 | The task should now be enabled: 78 | ```sh 79 | kapacitor list tasks 80 | ID Type Status Executing Databases and Retention Policies 81 | nfs_lat_alert stream enabled true ["isi_data_insights"."autogen"] 82 | ``` 83 | It’s possible to check the status of the task and see the results at each node in the script: 84 | ```sh 85 | kapacitor show nfs_lat_alert 86 | ID: nfs_lat_alert 87 | Error: 88 | Template: 89 | Type: stream 90 | Status: enabled 91 | Executing: true 92 | Created: 10 Aug 16 12:10 PDT 93 | Modified: 16 Aug 16 06:40 PDT 94 | LastEnabled: 16 Aug 16 06:40 PDT 95 | Databases Retention Policies: ["isi_data_insights"."autogen"] 96 | TICKscript: 97 | stream 98 | // Select avg NFS3 proto response time 99 | |from() 100 | .database('isi_data_insights') 101 | .measurement('cluster.protostats.nfs.total') 102 | |eval(lambda: float("time_avg") / 1000.0) 103 | .as('time_ms') 104 | |groupBy('cluster') 105 | |alert() 106 | .id('{{ index .Tags "cluster" }}/{{ .Name }}') 107 | .message('Average value of {{ .ID }} is {{ .Level}} value: {{ index .Fields "time_ms" }}ms') 108 | .crit(lambda: "time_ms" > 50.0) 109 | .warn(lambda: "time_ms" > 20.0) 110 | // Only warn every 15 mins if we haven't changed state 111 | .stateChangesOnly(15m) 112 | // Whenever we get an alert write it to a file. 113 | .log('/tmp/alerts.log') 114 | .slack() 115 | 116 | DOT: 117 | digraph nfs_lat_alert { 118 | graph [throughput="0.00 points/s"]; 119 | 120 | stream0 [avg_exec_time_ns="0" ]; 121 | stream0 -> from1 [processed="58279"]; 122 | 123 | from1 [avg_exec_time_ns="1.215s" ]; 124 | from1 -> eval2 [processed="58279"]; 125 | 126 | eval2 [avg_exec_time_ns="208.86s" eval_errors="0" ]; 127 | eval2 -> groupby3 [processed="58279"]; 128 | 129 | groupby3 [avg_exec_time_ns="28.392s" ]; 130 | groupby3 -> alert4 [processed="58279"]; 131 | 132 | alert4 [alerts_triggered="2457" avg_exec_time_ns="87.22134ms" crits_triggered="836" infos_triggered="0" oks_triggered="1008" warns_triggered="613" ]; 133 | } 134 | ``` 135 | 136 | This output shows that the script is working and triggering on events. The “DOT:” section can be rendered as a graph using the “GraphViz” package. 137 | 138 | This initial script works well, but is rather simplistic and, in particular, will alert on momentary spikes in load which may not be desirable. 139 | 140 | # Example TICK script patterns 141 | This section describes some examples for different types of alerting scripts. 
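When iterating on scripts like the ones below, it can help to capture a short window of live data once and replay it against a task while tuning thresholds, rather than waiting for new alerts to fire. The following is a rough sketch using the Kapacitor 1.x CLI; the task name is the example task defined above, <recording-id> is whatever ID the record command prints, and the exact flags may vary between Kapacitor versions:

```sh
# Record roughly 10 minutes of the live stream data that the task subscribes to.
# The command prints a recording ID when it finishes.
kapacitor record stream -task nfs_lat_alert -duration 10m

# Replay the recorded data against the task to exercise its alert logic.
kapacitor replay -recording <recording-id> -task nfs_lat_alert
```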
142 | 143 | # Moving average of measurement 144 | This is an example of a script that uses a moving window to average the statistic value over a recent window: 145 | ``` 146 | stream 147 | // Select avg NFS3 proto response time 148 | |from() 149 | .database('isi_data_insights') 150 | .measurement('cluster.protostats.nfs.total') 151 | |groupBy('cluster') 152 | |window() 153 | .period(10m) 154 | .every(1m) 155 | |mean('time_avg') 156 | .as('time_avg') 157 | |eval(lambda: float("time_avg") / 1000.0) 158 | .as('mean_ms') 159 | .keep('mean_ms', 'time_avg') 160 | |alert() 161 | .id('{{ index .Tags "cluster" }}/{{ .Name }}') 162 | .message('Windowed average of avg value of {{ .ID }} is {{ .Level}} value: {{ index .Fields "mean_ms" }}ms') 163 | .crit(lambda: "mean_ms" > 50.0) 164 | .warn(lambda: "mean_ms" > 25.0) 165 | // Only warn every 15 mins if we haven't changed state 166 | .stateChangesOnly(15m) 167 | // Whenever we get an alert write it to a file. 168 | .log('/tmp/alerts.log') 169 | .slack() 170 | ``` 171 | 172 | This script is similar to the previous script, but there are a few important differences: 173 | * The “window” node generates a window of data. With the values specified, we will keep and output the last 10 minutes of data every minute. 174 | * The window output is fed into a “mean” node that calculates the mean of the data fed (the last 10 minutes of data, in this case the “time_avg” field), and stores the result back as the “time_avg” field to be fed further down the pipeline. 175 | * The “eval” node converts the microsecond average field to a new “mean_ms” field. 176 | * The rest of the alert is similar to the previous example. 177 | 178 | # Joining/alerting based off two different measurements 179 | This script is an example. It alerts based off moving average, but only if the operation count is above a given threshold. It’s probably not safe to use this as the sole alerting mechanism because a deadlock (which will reduce the operation count to zero) won’t generate an alert. Additional scripts are provided below to look for deadlock events (“node.ifs.heat.deadlocked.total” measurement) and to alert if no data points have been collected in a configurable period. 
180 | 181 | ``` 182 | // Alert based off mean NFS3 proto response time if work is actually happening 183 | 184 | var timestream = stream 185 | |from() 186 | .database('isi_data_insights') 187 | .measurement('cluster.protostats.nfs.total') 188 | |groupBy('cluster') 189 | |window() 190 | .period(10m) 191 | .every(1m) 192 | |mean('time_avg') 193 | .as('time_avg') 194 | |eval(lambda: float("time_avg") / 1000.0) 195 | .as('mean_ms') 196 | 197 | var opstream = stream 198 | |from() 199 | .database('isi_data_insights') 200 | .measurement('cluster.protostats.nfs.total') 201 | |groupBy('cluster') 202 | |window() 203 | .period(10m) 204 | .every(1m) 205 | |mean('op_rate') 206 | .as('op_rate') 207 | 208 | timestream 209 | |join(opstream) 210 | .as('times', 'ops') 211 | |alert() 212 | .id('{{ index .Tags "cluster" }}/{{ .Name }}') 213 | .message('Cluster {{ index .Tags "cluster" }} is executing {{ index .Fields "ops.op_rate" }} NFSv3 operations per second and windowed average of avg value of {{ .Name }} is {{ .Level }} value: {{ index .Fields "times.mean_ms" }}ms') 214 | .crit(lambda: "ops.op_rate" > 1000 AND "times.mean_ms" > 25.0) 215 | .warn(lambda: "ops.op_rate" > 1000 AND "times.mean_ms" > 10.0) 216 | // .info(lambda: TRUE) 217 | // Only warn every 15 mins if we haven't changed state 218 | .stateChangesOnly(15m) 219 | // Whenever we get an alert write it to a file. 220 | .log('/tmp/alerts.log') 221 | .slack() 222 | ``` 223 | 224 | This script is significantly different to the previous examples. It uses variables to store the results of the two different streams that we sample, and then uses a “join” operation to create a stream with both sets of data for us to alert from. 225 | 226 | # Deadman alert to warn if data collection fails 227 | This script uses the Kapacitor “Deadman” node to warn when the collected/emitted point count falls below a defined threshold in a given period. Many of the statistics collected by the Connector are updated as frequently as every 30 seconds, but the overall collection period can be longer if many clusters are being monitored, if they are large, and/or if they are under heavy load. The script arbitrarily uses 5 minutes as the interval for this example. 228 | ``` 229 | // Deadman alert for cluster data collection 230 | var data = stream 231 | |from() 232 | .database('isi_data_insights') 233 | .measurement('cluster.health') 234 | .groupBy('cluster') 235 | 236 | data 237 | |deadman(1.0, 5m) 238 | .id ('Statistics data collection for cluster {{ index .Tags "cluster" }}') 239 | .slack() 240 | ``` 241 | 242 | This script will output alerts of the form: 243 | Statistics collection for cluster logserver is dead: 0.0 244 | or 245 | Statistics collection for cluster logserver is alive: 1.0 246 | 247 | # Deadlock event count alert 248 | This script uses one of the OneFS filesystem “heat” statistics to look for high rates of deadlocks within the filesystem. 
249 | ``` 250 | stream 251 | // Alert based off node heat stats 252 | |from() 253 | .database('isi_data_insights') 254 | .measurement('node.ifs.heat.deadlocked.total') 255 | |groupBy('cluster') 256 | |alert() 257 | .id('Deadlock event count') 258 | .message('Value of {{ .ID }} on cluster {{ index .Tags "cluster" }}, node {{ index .Tags "node" }} is {{ .Level }} value: {{ index .Fields "value" }}') 259 | .crit(lambda: "value" > 50.0) 260 | .warn(lambda: "value" > 10.0) 261 | // .info(lambda: TRUE) 262 | // Only warn every 15 mins if we haven't changed state 263 | .stateChangesOnly(15m) 264 | // Whenever we get an alert write it to a file. 265 | .log('/tmp/alerts.log') 266 | .slack() 267 | ``` 268 | 269 | # Other useful node types 270 | Kapacitor offers a number of useful processing nodes to filter the data. Examples that are of particular interest are: 271 | * Mean/median/mode – computes the various average types. 272 | * Max/min – selects the largest/smallest point. 273 | * MovingAverage – a relatively new function that would simplify our earlier example. 274 | * Stddev – computes the standard deviation of points. Useful to detect anomalies. 275 | * Sum – sums the points. 276 | * Deadman - useful to alert if the collector fails for some reason. It alerts if the points per interval drops below a given threshold. 277 | -------------------------------------------------------------------------------- /isi_data_insights_config.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains utility functions for configuring the IsiDataInsightsDaemon 3 | via command line args and config file. 4 | """ 5 | from __future__ import print_function 6 | from __future__ import division 7 | from future import standard_library 8 | 9 | standard_library.install_aliases() # noqa: E402 10 | from builtins import input 11 | from builtins import str 12 | from builtins import range 13 | from past.utils import old_div 14 | import argparse 15 | import configparser 16 | import getpass 17 | import logging 18 | import os 19 | import re 20 | import sys 21 | import urllib3 22 | 23 | from ast import literal_eval 24 | from Equation import Expression 25 | 26 | from isi_data_insights_daemon import ( 27 | StatsConfig, 28 | ClusterConfig, 29 | ClusterCompositeStatComputer, 30 | EquationStatComputer, 31 | PercentChangeStatComputer, 32 | DerivedStatInput, 33 | ) 34 | from isi_stats_client import IsiStatsClient 35 | import isi_sdk_utils 36 | 37 | 38 | LOG = logging.getLogger(__name__) 39 | 40 | DEFAULT_PID_FILE = "./isi_data_insights_d.pid" 41 | DEFAULT_LOG_FILE = "./isi_data_insights_d.log" 42 | DEFAULT_LOG_LEVEL = "INFO" 43 | # name of the section in the config file where the main/global settings for the 44 | # daemon are stored. 45 | MAIN_CFG_SEC = "isi_data_insights_d" 46 | # the number of seconds to wait between updates for stats that are 47 | # continually kept up-to-date. 48 | ONE_SEC = 1 # seconds 49 | # the default minimum update interval (even if a particular stat key is updated 50 | # at a higher rate than this we will still only query at this rate in order to 51 | # prevent the cluster from being overloaded with stat queries). 52 | MIN_UPDATE_INTERVAL = 30 # seconds 53 | # name of the config file param that can be used to specify a lower 54 | # MIN_UPDATE_INTERVAL. 
55 | MIN_UPDATE_INTERVAL_OVERRIDE_PARAM = "min_update_interval_override" 56 | 57 | 58 | def avg(stat_values): 59 | # XXX investigate if plain '/' is OK here 60 | return old_div(sum(stat_values), len(stat_values)) 61 | 62 | 63 | # operations use by ClusterCompositeStatComputer 64 | COMPOSITE_OPERATIONS = {"avg": avg, "max": max, "min": min, "sum": sum} 65 | 66 | # keep track of auth data that we have username and passwords for so that we 67 | # don't prompt more than once. 68 | g_cluster_auth_data = {} 69 | # keep track of the name and version of each cluster 70 | g_cluster_configs = {} 71 | 72 | 73 | def _add_cluster_auth_data(cluster_address, username, password, verify_ssl): 74 | # update cluster auth data 75 | g_cluster_auth_data[cluster_address] = (username, password, verify_ssl) 76 | 77 | 78 | def _process_config_file_clusters(clusters): 79 | cluster_list = [] 80 | cluster_configs = clusters.split() 81 | for cluster_config in cluster_configs: 82 | # default to insecure https 83 | verify_ssl = False 84 | 85 | # expected [username:password@]address[:bool] 86 | # the password can potentially contain ":" and "@" characters, split is done 87 | # from right side first and then left side to isolate out the password. 88 | at_split = cluster_config.rsplit("@", 1) 89 | if len(at_split) == 2: 90 | user_pass_split = at_split[0].split(":", 1) 91 | if len(user_pass_split) != 2: 92 | print( 93 | "Config file contains invalid cluster " 94 | "config: %s in %s (expected : " 95 | "prefix)." % (cluster_config, clusters), 96 | file=sys.stderr, 97 | ) 98 | sys.exit(1) 99 | username = user_pass_split[0] 100 | password = user_pass_split[1] 101 | else: 102 | username = None 103 | password = None 104 | verify_ssl_split = at_split[-1].split(":", 1) 105 | cluster_address = verify_ssl_split[0] 106 | if len(verify_ssl_split) > 1: 107 | try: 108 | # try to convert to a bool 109 | verify_ssl = literal_eval(verify_ssl_split[-1]) 110 | if type(verify_ssl) != bool: 111 | raise Exception 112 | except Exception: 113 | print( 114 | "Config file contains invalid cluster " 115 | "config: %s (expected True or False on end)" % cluster_config, 116 | file=sys.stderr, 117 | ) 118 | sys.exit(1) 119 | # add to cache of known cluster auth usernames and passwords 120 | _add_cluster_auth_data(cluster_address, username, password, verify_ssl) 121 | cluster_list.append(cluster_address) 122 | 123 | return cluster_list 124 | 125 | 126 | def _get_cluster_auth_data(cluster): 127 | try: 128 | username = password = verify_ssl = None 129 | # check if we already know the username and password 130 | username, password, verify_ssl = g_cluster_auth_data[cluster] 131 | if username is None or password is None or verify_ssl is None: 132 | # this happens when some of the auth params were provided in the 133 | # config file or cli, but not all. 
134 | raise KeyError 135 | except KeyError: 136 | # get username and password for input clusters 137 | if username is None: 138 | username = input( 139 | "Please provide the username used to access " + cluster + " via PAPI: " 140 | ) 141 | if password is None: 142 | password = getpass.getpass("Password: ") 143 | while verify_ssl is None: 144 | verify_ssl_resp = input("Verify SSL cert [y/n]: ") 145 | if verify_ssl_resp == "yes" or verify_ssl_resp == "y": 146 | verify_ssl = True 147 | elif verify_ssl_resp == "no" or verify_ssl_resp == "n": 148 | verify_ssl = False 149 | # add to cache of known cluster auth usernames and passwords 150 | _add_cluster_auth_data(cluster, username, password, verify_ssl) 151 | 152 | return username, password, verify_ssl 153 | 154 | 155 | def _query_cluster_name(cluster_address, isi_sdk, api_client): 156 | # get the Cluster API 157 | cluster_api = isi_sdk.ClusterApi(api_client) 158 | try: 159 | resp = cluster_api.get_cluster_identity() 160 | return resp.name 161 | except isi_sdk.rest.ApiException: 162 | # if get_cluster_identity() doesn't work just use the address 163 | return cluster_address 164 | 165 | 166 | def _build_cluster_configs(cluster_list): 167 | cluster_configs = [] 168 | for cluster in cluster_list: 169 | username, password, verify_ssl = _get_cluster_auth_data(cluster) 170 | 171 | if cluster in g_cluster_configs: 172 | cluster_name, isi_sdk, api_client, version = g_cluster_configs[cluster] 173 | else: 174 | if verify_ssl is False: 175 | urllib3.disable_warnings() 176 | try: 177 | isi_sdk, api_client, version = isi_sdk_utils.configure( 178 | cluster, username, password, verify_ssl 179 | ) 180 | except RuntimeError as exc: 181 | print( 182 | "Failed to configure SDK for " 183 | "cluster %s. Exception raised: %s" % (cluster, str(exc)), 184 | file=sys.stderr, 185 | ) 186 | sys.exit(1) 187 | print( 188 | "Configured %s as version %d cluster, using SDK %s." 189 | % (cluster, int(version), isi_sdk.__name__) 190 | ) 191 | cluster_name = _query_cluster_name(cluster, isi_sdk, api_client) 192 | g_cluster_configs[cluster] = cluster_name, isi_sdk, api_client, version 193 | 194 | cluster_config = ClusterConfig( 195 | cluster, cluster_name, version, isi_sdk, api_client 196 | ) 197 | cluster_configs.append(cluster_config) 198 | 199 | return cluster_configs 200 | 201 | 202 | def _configure_stat_group( 203 | daemon, 204 | update_interval, 205 | cluster_configs, 206 | stats_list, 207 | cluster_composite_stats=None, 208 | equation_stats=None, 209 | pct_change_stats=None, 210 | final_equation_stats=None, 211 | ): 212 | """ 213 | Configure the daemon with some StatsConfigs. 214 | """ 215 | # configure daemon with stats 216 | if update_interval < MIN_UPDATE_INTERVAL: 217 | LOG.warning( 218 | "The following stats are set to be queried at a faster " 219 | "rate, %d seconds, than the MIN_UPDATE_INTERVAL of %d " 220 | "seconds. To configure a shorter MIN_UPDATE_INTERVAL specify " 221 | "it with the %s param in the %s section of the config file. 
" 222 | "Stats:\n\t%s", 223 | update_interval, 224 | MIN_UPDATE_INTERVAL, 225 | MIN_UPDATE_INTERVAL_OVERRIDE_PARAM, 226 | MAIN_CFG_SEC, 227 | str(stats_list), 228 | ) 229 | update_interval = MIN_UPDATE_INTERVAL 230 | stats_config = StatsConfig(cluster_configs, stats_list, update_interval) 231 | if cluster_composite_stats is not None: 232 | stats_config.cluster_composite_stats.extend(cluster_composite_stats) 233 | if equation_stats is not None: 234 | stats_config.equation_stats.extend(equation_stats) 235 | if pct_change_stats is not None: 236 | stats_config.pct_change_stats.extend(pct_change_stats) 237 | if final_equation_stats is not None: 238 | stats_config.final_equation_stats.extend(final_equation_stats) 239 | daemon.add_stats(stats_config) 240 | 241 | 242 | def _query_stats_metadata(cluster, stat_names): 243 | """ 244 | Query the specified cluster for the metadata of the stats specified in 245 | stat_names list. 246 | """ 247 | stats_api = cluster.isi_sdk.StatisticsApi(cluster.api_client) 248 | isi_stats_client = IsiStatsClient(stats_api) 249 | return isi_stats_client.get_stats_metadata(stat_names) 250 | 251 | 252 | def _compute_stat_group_update_intervals( 253 | update_interval_multiplier, cluster_configs, stat_names, update_intervals 254 | ): 255 | # update interval is supposed to be set relative to the collection 256 | # interval, which might be different for each stat and each cluster. 257 | for cluster in cluster_configs: 258 | stats_metadata = _query_stats_metadata(cluster, stat_names) 259 | for stat_index in range(0, len(stats_metadata)): 260 | stat_metadata = stats_metadata[stat_index] 261 | stat_name = stat_names[stat_index] 262 | # cache time is the length of time the system will store the 263 | # value before it updates. 264 | cache_time = -1 265 | if stat_metadata.default_cache_time: 266 | cache_time = ( 267 | (stat_metadata.default_cache_time + 1) 268 | # add one to the default_cache_time because the new 269 | # value is not set until 1 second after the cache time. 270 | * update_interval_multiplier 271 | ) 272 | # the policy intervals seem to override the default cache time 273 | if stat_metadata.policies: 274 | smallest_interval = cache_time 275 | for policy in stat_metadata.policies: 276 | if smallest_interval == -1: 277 | smallest_interval = policy.interval 278 | else: 279 | smallest_interval = min(policy.interval, smallest_interval) 280 | cache_time = smallest_interval * update_interval_multiplier 281 | # if the cache_time is still -1 then it means that the statistic is 282 | # continually updated, so the fastest it can be queried is 283 | # once every second. 
284 | if cache_time == -1: 285 | cache_time = ONE_SEC * update_interval_multiplier 286 | try: 287 | update_interval = update_intervals[cache_time] 288 | update_interval[0].add(cluster) 289 | update_interval[1].add(stat_name) 290 | except KeyError: 291 | # insert a new interval time 292 | update_intervals[cache_time] = (set([cluster]), set([stat_name])) 293 | 294 | 295 | def _configure_stat_groups_via_file( 296 | daemon, config_file, stat_group, global_cluster_list 297 | ): 298 | cluster_list = [] 299 | cluster_list.extend(global_cluster_list) 300 | try: 301 | # process clusters specific to this stat group (if any) 302 | clusters_param = config_file.get(stat_group, "clusters") 303 | stat_group_clusters = _process_config_file_clusters(clusters_param) 304 | cluster_list.extend(stat_group_clusters) 305 | # remove duplicates 306 | cluster_list = list(set(cluster_list)) 307 | except configparser.NoOptionError: 308 | pass 309 | 310 | if len(cluster_list) == 0: 311 | print( 312 | "The %s stat group has no clusters to query." % stat_group, file=sys.stderr 313 | ) 314 | print( 315 | "You must provide either a global list of " 316 | "clusters to query for all stat groups, or a per-stat-" 317 | "group list of clusters, or both.", 318 | file=sys.stderr, 319 | ) 320 | sys.exit(1) 321 | 322 | cluster_configs = _build_cluster_configs(cluster_list) 323 | 324 | update_interval_param = config_file.get(stat_group, "update_interval") 325 | stat_names = config_file.get(stat_group, "stats").split() 326 | # remove duplicates 327 | stat_names = list(set(stat_names)) 328 | # deal with derived stats (if any) 329 | composite_stats = [] 330 | if config_file.has_option(stat_group, "composite_stats") is True: 331 | composite_stats = _parse_derived_stats( 332 | config_file, stat_group, "composite_stats", _parse_composite_stats 333 | ) 334 | 335 | eq_stats = [] 336 | if config_file.has_option(stat_group, "equation_stats") is True: 337 | eq_stats = _build_equation_stats_list(config_file, stat_group, "equation_stats") 338 | 339 | pct_change_stats = [] 340 | if config_file.has_option(stat_group, "percent_change_stats") is True: 341 | pct_change_stats = _parse_derived_stats( 342 | config_file, stat_group, "percent_change_stats", _parse_pct_change_stats 343 | ) 344 | 345 | final_eq_stats = [] 346 | if config_file.has_option(stat_group, "final_equation_stats") is True: 347 | final_eq_stats = _build_equation_stats_list( 348 | config_file, stat_group, "final_equation_stats" 349 | ) 350 | 351 | update_intervals = {} 352 | if update_interval_param.startswith("*"): 353 | try: 354 | update_interval_multiplier = ( 355 | 1 if update_interval_param == "*" else int(update_interval_param[1:]) 356 | ) 357 | except ValueError as exc: 358 | print( 359 | "Failed to parse update interval multiplier " 360 | "from %s stat group.\nERROR: %s" % (stat_group, str(exc)), 361 | file=sys.stderr, 362 | ) 363 | sys.exit(1) 364 | print("Computing update intervals for stat group: %s." 
% stat_group) 365 | _compute_stat_group_update_intervals( 366 | update_interval_multiplier, cluster_configs, stat_names, update_intervals 367 | ) 368 | else: 369 | try: 370 | update_interval = int(update_interval_param) 371 | except ValueError as exc: 372 | print( 373 | "Failed to parse update interval from %s " 374 | "stat group.\nERROR: %s" % (stat_group, str(exc)), 375 | file=sys.stderr, 376 | ) 377 | sys.exit(1) 378 | update_intervals[update_interval] = (cluster_configs, stat_names) 379 | 380 | # TODO - fix this - for now if there are derived stats then we are going to 381 | # query all the stats in this section at once (i.e. using the the smallest 382 | # of the configured update intervals) in order to make sure that all of the 383 | # input parameters of the derived stats are available at once. 384 | if ( 385 | len(composite_stats) > 0 386 | or len(eq_stats) > 0 387 | or len(pct_change_stats) > 0 388 | or len(final_eq_stats) > 0 389 | ): 390 | update_interval_keys = list(update_intervals.keys()) 391 | update_interval_keys.sort() 392 | update_interval = update_interval_keys[0] 393 | _configure_stat_group( 394 | daemon, 395 | update_interval, 396 | cluster_configs, 397 | stat_names, 398 | composite_stats, 399 | eq_stats, 400 | pct_change_stats, 401 | final_eq_stats, 402 | ) 403 | else: 404 | for update_interval, clusters_stats_tuple in update_intervals.items(): 405 | # first item in clusters_stats_tuple is the unique list of clusters 406 | # associated with the current update_interval, the second item is the 407 | # unique list of stats to query on the set of clusters at the current 408 | # update_interval. 409 | _configure_stat_group( 410 | daemon, 411 | update_interval, 412 | clusters_stats_tuple[0], 413 | clusters_stats_tuple[1], 414 | ) 415 | 416 | 417 | def _parse_derived_stats(config_file, stat_group, derived_stats_name, parse_func): 418 | derived_stats_cfg = config_file.get(stat_group, derived_stats_name) 419 | try: 420 | derived_stats = parse_func(derived_stats_cfg) 421 | except RuntimeError as rterr: 422 | print( 423 | "Failed to parse %s from %s " 424 | "section. %s" % (derived_stats_name, stat_group, str(rterr)), 425 | file=sys.stderr, 426 | ) 427 | sys.exit(1) 428 | 429 | return derived_stats 430 | 431 | 432 | def _parse_fields(in_stat_name): 433 | split_name = in_stat_name.split(":") 434 | if len(split_name) == 1: 435 | return in_stat_name, None 436 | 437 | return split_name[0], tuple(split_name[1:]) 438 | 439 | 440 | def _parse_composite_stats(composite_stats_cfg): 441 | # Example of what is expected for each stat_cfg: 442 | # sum(node.ifs.ops.in[:field1:field2]) 443 | composite_stats = [] 444 | for stat_cfg in composite_stats_cfg.split(): 445 | bracket1 = stat_cfg.find("(") 446 | bracket2 = stat_cfg.find(")") 447 | if bracket1 <= 0 or bracket2 == -1 or bracket1 > bracket2: 448 | raise RuntimeError( 449 | "Failed to parse operation from %s." 450 | "Expected: op(stat) where op is avg, min, max, " 451 | " or sum and stat is the name of a base OneFS " 452 | ' statistic name that starts with "node.".' % stat_cfg 453 | ) 454 | op_name = stat_cfg[0:bracket1] 455 | if op_name not in COMPOSITE_OPERATIONS: 456 | raise RuntimeError( 457 | "Invalid operation %s specified for %s." % (op_name, stat_cfg) 458 | ) 459 | 460 | in_stat_name = stat_cfg[bracket1 + 1:bracket2] 461 | if in_stat_name.startswith("node.") is False: 462 | raise RuntimeError( 463 | "Invalid stat name %s specified for %s." 464 | ' Composite stats must start with "node.".' 
% (op_name, stat_cfg) 465 | ) 466 | out_stat_name = "cluster.%s.%s" % (in_stat_name.replace(":", "."), op_name) 467 | in_stat_name, fields = _parse_fields(in_stat_name) 468 | # TODO should validate that this is a valid stat name 469 | composite_stat = ClusterCompositeStatComputer( 470 | DerivedStatInput(in_stat_name, fields), 471 | out_stat_name, 472 | COMPOSITE_OPERATIONS[op_name], 473 | ) 474 | composite_stats.append(composite_stat) 475 | 476 | return composite_stats 477 | 478 | 479 | def _build_equation_stats_list(config_file, stat_group, equation_stats): 480 | eq_stats = [] 481 | eq_stats_list = config_file.get(stat_group, equation_stats).split() 482 | for eq_stat in eq_stats_list: 483 | eq_stat_names = _parse_derived_stats( 484 | config_file, stat_group, eq_stat, _parse_equation_stats 485 | ) 486 | cfg_expression = config_file.get(stat_group, eq_stat) 487 | # the Equation package doesn't like having '.' characters in the 488 | # input param names, so we have to replace them with placeholder 489 | # names. 490 | eq_func = _build_equation_expression(cfg_expression, eq_stat_names) 491 | eq_stat_inputs = _build_equation_stat_inputs(eq_stat_names) 492 | eq_stats.append(EquationStatComputer(eq_func, eq_stat_inputs, eq_stat)) 493 | 494 | return eq_stats 495 | 496 | 497 | def _build_equation_stat_inputs(eq_stat_names): 498 | input_stats = [] 499 | for stat_name in eq_stat_names: 500 | stat_name, fields = _parse_fields(stat_name) 501 | input_stats.append(DerivedStatInput(stat_name, fields)) 502 | 503 | return input_stats 504 | 505 | 506 | def _parse_equation_stats(equation_stat_expression): 507 | # Example of what is expected: 508 | # (cluster.node.ifs.ops.in.sum + cluster.node.ifs.ops.out.sum) 509 | # * cluster.node.disk.iosched.latency.avg.avg 510 | # Example of what is expected from stat with specific fields: 511 | # (cluster.protostats.nfs.total:op_count 512 | # + cluster.protostats.smb2.total:op_count) 513 | equation_stats = re.findall("[a-zA-Z.:_0-9]+", equation_stat_expression) 514 | 515 | # remove items that don't start with an alphabet character 516 | equation_stats = [eq_stat for eq_stat in equation_stats if eq_stat[0].isalpha()] 517 | return equation_stats 518 | 519 | 520 | def _build_equation_expression(cfg_expression, eq_stat_names): 521 | params_list = [] 522 | for eindex in range(0, len(eq_stat_names)): 523 | eq_stat_name = eq_stat_names[eindex] 524 | param_name = "param" + str(eindex) 525 | cfg_expression = cfg_expression.replace(eq_stat_name, param_name, 1) 526 | params_list.append(param_name) 527 | 528 | return Expression(cfg_expression, params_list) 529 | 530 | 531 | def _parse_pct_change_stats(pct_change_stats_cfg): 532 | # Expected is just a white-space delimitted list of stat names 533 | pct_change_stats = [] 534 | for stat_name in pct_change_stats_cfg.split(): 535 | out_stat_name = stat_name.replace(":", ".") + ".percentchange" 536 | stat_name, fields = _parse_fields(stat_name) 537 | pct_change_stats.append( 538 | PercentChangeStatComputer( 539 | DerivedStatInput(stat_name, fields), out_stat_name 540 | ) 541 | ) 542 | return pct_change_stats 543 | 544 | 545 | def _configure_stat_groups_via_cli(daemon, args): 546 | if len(args.stat_groups) == 0: 547 | print( 548 | "You must provide a set of stats to query via " 549 | "the --stats command line argument or a configuration file.", 550 | file=sys.stderr, 551 | ) 552 | sys.exit(1) 553 | 554 | if not args.update_intervals: 555 | # for some reason if i try to use default=[MIN_UPDATE_INTERVAL] in the 556 | # argparser for the 
update_intervals arg then my list always has a 557 | # MIN_UPDATE_INTERVAL in addition to any intervals actually provided by 558 | # the user on the command line, so i need to setup the default here 559 | args.update_intervals.append(MIN_UPDATE_INTERVAL) 560 | 561 | if len(args.stat_groups) != len(args.update_intervals): 562 | print( 563 | "The number of update intervals must be the " 564 | + "same as the number of stat groups.", 565 | file=sys.stderr, 566 | ) 567 | sys.exit(1) 568 | 569 | cluster_list = args.clusters.split(",") 570 | # if args.clusters is the empty string then 1st element will be empty 571 | if cluster_list[0] == "": 572 | print("Please provide at least one input cluster.", file=sys.stderr) 573 | sys.exit(1) 574 | 575 | # remove duplicates 576 | cluster_list = list(set(cluster_list)) 577 | cluster_configs = _build_cluster_configs(cluster_list) 578 | 579 | for index in range(0, len(args.stat_groups)): 580 | stats_list = args.stat_groups[index].split(",") 581 | # split always results in at least one item, so check if the first 582 | # item is empty to validate the stats input arg 583 | if stats_list[0] == "": 584 | print("Please provide at least one stat name.", file=sys.stderr) 585 | sys.exit(1) 586 | update_interval = args.update_intervals[index] 587 | _configure_stat_group(daemon, update_interval, cluster_configs, stats_list) 588 | 589 | 590 | def _configure_stats_processor(daemon, stats_processor, processor_args): 591 | try: 592 | processor = __import__(stats_processor, fromlist=[""]) 593 | except ImportError: 594 | print("Unable to load stats processor: %s." % stats_processor, file=sys.stderr) 595 | sys.exit(1) 596 | 597 | try: 598 | arg_list = processor_args.split(" ") if processor_args != "" else [] 599 | daemon.set_stats_processor(processor, arg_list) 600 | except AttributeError as exception: 601 | print( 602 | "Failed to configure %s as stats processor. %s" 603 | % (stats_processor, str(exception)), 604 | file=sys.stderr, 605 | ) 606 | sys.exit(1) 607 | 608 | 609 | def _log_level_str_to_enum(log_level): 610 | if log_level.upper() == "DEBUG": 611 | return logging.DEBUG 612 | elif log_level.upper() == "INFO": 613 | return logging.INFO 614 | elif log_level.upper() == "WARNING": 615 | return logging.WARNING 616 | elif log_level.upper() == "ERROR": 617 | return logging.ERROR 618 | elif log_level.upper() == "CRITICAL": 619 | return logging.CRITICAL 620 | else: 621 | print("Invalid logging level: " + log_level + ", setting to INFO.") 622 | return logging.INFO 623 | 624 | 625 | def _update_args_with_config_file(config_file, args): 626 | # command line args override config file params 627 | if args.pid_file is None and config_file.has_option(MAIN_CFG_SEC, "pid_file"): 628 | args.pid_file = config_file.get(MAIN_CFG_SEC, "pid_file") 629 | if args.log_file is None and config_file.has_option(MAIN_CFG_SEC, "log_file"): 630 | args.log_file = config_file.get(MAIN_CFG_SEC, "log_file") 631 | if args.log_level is None and config_file.has_option(MAIN_CFG_SEC, "log_level"): 632 | args.log_level = config_file.get(MAIN_CFG_SEC, "log_level") 633 | 634 | 635 | def _print_stat_groups(daemon): 636 | """ 637 | Print out the list of stat sets that were configured for the daemon prior 638 | to starting it so that user can verify that it was configured as expected. 
639 | """ 640 | for update_interval, stat_set in daemon.get_next_stat_set(): 641 | msg = ( 642 | "Configured stat set:\n\tClusters: %s\n\t" 643 | "Update Interval: %d\n\tStat Keys: %s" 644 | % (str(stat_set.cluster_configs), update_interval, str(stat_set.stats)) 645 | ) 646 | # print it to stdout and the log file. 647 | print(msg) 648 | LOG.debug(msg) 649 | 650 | 651 | def configure_via_file(daemon, args, config_file): 652 | """ 653 | Configure the daemon's stat groups and the stats processor via command line 654 | arguments and configuration file. The command line args override settings 655 | provided in the config file. 656 | """ 657 | # Command line args override config file params 658 | if ( 659 | not args.stats_processor 660 | and config_file.has_option(MAIN_CFG_SEC, "stats_processor") is True 661 | ): 662 | args.stats_processor = config_file.get(MAIN_CFG_SEC, "stats_processor") 663 | if ( 664 | not args.processor_args 665 | and config_file.has_option(MAIN_CFG_SEC, "stats_processor_args") is True 666 | ): 667 | args.processor_args = config_file.get(MAIN_CFG_SEC, "stats_processor_args") 668 | _configure_stats_processor(daemon, args.stats_processor, args.processor_args) 669 | 670 | # check if the MAIN_CFG_SEC has the MIN_UPDATE_INTERVAL_OVERRIDE_PARAM 671 | if config_file.has_option(MAIN_CFG_SEC, MIN_UPDATE_INTERVAL_OVERRIDE_PARAM): 672 | global MIN_UPDATE_INTERVAL 673 | try: 674 | override_update_interval = int( 675 | config_file.get(MAIN_CFG_SEC, MIN_UPDATE_INTERVAL_OVERRIDE_PARAM) 676 | ) 677 | except ValueError as exc: 678 | print( 679 | "Failed to parse %s from %s " 680 | "section.\nERROR: %s" 681 | % (MIN_UPDATE_INTERVAL_OVERRIDE_PARAM, MAIN_CFG_SEC, str(exc)), 682 | file=sys.stderr, 683 | ) 684 | sys.exit(1) 685 | 686 | LOG.warning( 687 | "Overriding MIN_UPDATE_INTERVAL of %d seconds with " "%d seconds.", 688 | MIN_UPDATE_INTERVAL, 689 | override_update_interval, 690 | ) 691 | MIN_UPDATE_INTERVAL = override_update_interval 692 | 693 | # if there are any clusters, stats, or update_intervals specified via CLI 694 | # then try to configure the daemon using them first. 695 | if args.update_intervals or args.stat_groups or args.clusters: 696 | _configure_stat_groups_via_cli(daemon, args) 697 | global_cluster_list = [] 698 | if args.clusters: 699 | global_cluster_list = args.clusters.split(",") 700 | elif config_file.has_option(MAIN_CFG_SEC, "clusters"): 701 | global_cluster_list = _process_config_file_clusters( 702 | config_file.get(MAIN_CFG_SEC, "clusters") 703 | ) 704 | # remove duplicates 705 | global_cluster_list = list(set(global_cluster_list)) 706 | 707 | # now configure with config file params too 708 | if config_file.has_option(MAIN_CFG_SEC, "active_stat_groups"): 709 | active_stat_groups = config_file.get(MAIN_CFG_SEC, "active_stat_groups").split() 710 | for stat_group in active_stat_groups: 711 | _configure_stat_groups_via_file( 712 | daemon, config_file, stat_group, global_cluster_list 713 | ) 714 | 715 | # check that at least one stat group was added to the daemon. 716 | if daemon.get_stat_set_count() == 0: 717 | print( 718 | "Please provide stat groups to query via " 719 | "command line args or via config file parameters.", 720 | file=sys.stderr, 721 | ) 722 | sys.exit(1) 723 | 724 | _print_stat_groups(daemon) 725 | 726 | 727 | def configure_via_cli(daemon, args): 728 | """ 729 | Configure the daemon's stat groups and the stats processor via command line 730 | arguments. 
731 | """ 732 | _configure_stat_groups_via_cli(daemon, args) 733 | _configure_stats_processor(daemon, args.stats_processor, args.processor_args) 734 | 735 | _print_stat_groups(daemon) 736 | 737 | 738 | def configure_logging_via_cli(args): 739 | """ 740 | Setup the logging from command line args. 741 | """ 742 | if args.action != "debug": 743 | if args.log_file is None: 744 | args.log_file = DEFAULT_LOG_FILE 745 | 746 | parent_dir = os.path.dirname(args.log_file) 747 | if parent_dir and os.path.exists(parent_dir) is False: 748 | print("Invalid log file path: %s." % (args.log_file), file=sys.stderr) 749 | sys.exit(1) 750 | 751 | if args.log_level is None: 752 | args.log_level = DEFAULT_LOG_LEVEL 753 | 754 | log_level = _log_level_str_to_enum(args.log_level) 755 | logging.basicConfig( 756 | filename=args.log_file, 757 | level=log_level, 758 | format="%(asctime)s:%(name)s:%(levelname)s: %(message)s", 759 | ) 760 | else: # configure logging to stdout for 'debug' action 761 | logging.basicConfig( 762 | stream=sys.stdout, 763 | level=logging.DEBUG, 764 | format="%(asctime)s:%(name)s:%(levelname)s: %(message)s", 765 | ) 766 | 767 | 768 | def configure_args_via_file(args): 769 | """ 770 | Load the config_file, if there is one, then check if the pid_file, 771 | log_file, and log_level parameters are provided in the config file. If they 772 | are and they are not set via CLI args then use the config file to set them. 773 | """ 774 | config_file = None 775 | if args.config_file is not None: 776 | try: 777 | config_file = configparser.RawConfigParser() 778 | with open(args.config_file, "r") as cfg_fp: 779 | config_file.readfp(cfg_fp) 780 | except Exception as exc: 781 | print( 782 | "Failed to parse config file: %s.\n" 783 | "ERROR:\n%s." % (args.config_file, str(exc)), 784 | file=sys.stderr, 785 | ) 786 | sys.exit(1) 787 | _update_args_with_config_file(config_file, args) 788 | return config_file 789 | 790 | 791 | def process_pid_file_arg(pid_file, action): 792 | """ 793 | Make sure the pid_file argument is a valid path. Set it to the default if 794 | it was not specified. 795 | """ 796 | if pid_file is None: 797 | pid_file = DEFAULT_PID_FILE 798 | 799 | parent_dir = os.path.dirname(pid_file) 800 | if parent_dir and os.path.exists(parent_dir) is False: 801 | print("Invalid pid file path: %s." % pid_file, file=sys.stderr) 802 | sys.exit(1) 803 | 804 | pid_file_path = os.path.abspath(pid_file) 805 | if (action == "stop" or action == "restart") and os.path.exists( 806 | pid_file_path 807 | ) is False: 808 | print("Invalid pid file path: %s." % pid_file, file=sys.stderr) 809 | sys.exit(1) 810 | 811 | return pid_file_path 812 | 813 | 814 | def parse_cli(): 815 | """ 816 | Setup the command line args and parse them. 817 | """ 818 | argparser = argparse.ArgumentParser( 819 | description="Starts, stops, or restarts the " "isi_data_insights_daemon." 820 | ) 821 | argparser.add_argument( 822 | "action", 823 | help="Specifies to 'start', 'stop', " "'restart', or 'debug' the daemon.", 824 | ) 825 | argparser.add_argument( 826 | "-c", 827 | "--config-file", 828 | dest="config_file", 829 | help="Set the path to the config file. 
The default value is " 830 | "'./isi_data_insights_d.cfg'.", 831 | action="store", 832 | default="./isi_data_insights_d.cfg", 833 | ) 834 | argparser.add_argument( 835 | "-a", 836 | "--processor-args", 837 | dest="processor_args", 838 | help="Specifies the args to pass to the start function of the " 839 | "results processor's start function.", 840 | action="store", 841 | default="", 842 | ) 843 | argparser.add_argument( 844 | "-l", 845 | "--log-file", 846 | dest="log_file", 847 | help="Set the path to the log file. The default value is " 848 | "'./isi_data_insights_d.log'.", 849 | action="store", 850 | default=None, 851 | ) 852 | argparser.add_argument( 853 | "-e", 854 | "--log-level", 855 | dest="log_level", 856 | help="Set the logging level (debug, info, warning, error, or " "critical).", 857 | action="store", 858 | default=None, 859 | ) 860 | argparser.add_argument( 861 | "-p", 862 | "--pid-file", 863 | dest="pid_file", 864 | help="Set the path to the daemon pid file. The default value is " 865 | "'./isi_data_insights_d.pid'.", 866 | action="store", 867 | default=None, 868 | ) 869 | argparser.add_argument( 870 | "-x", 871 | "--stats-processor", 872 | dest="stats_processor", 873 | help="Name of the Python module used to process stats query " 874 | "results. The specified Python module must define " 875 | "a function named process(results_list) where results_list is a" 876 | "list of isi_sdk.models.statistics_current_stat objects." 877 | "StatisticsCurrentStat objects. The module may also optionally " 878 | "define start(args) and stop() functions. Use the " 879 | "--processor-args to specify args to pass to the results " 880 | "processor's start function.", 881 | action="store", 882 | default=None, 883 | ) 884 | argparser.add_argument( 885 | "-i", 886 | "--input-clusters", 887 | dest="clusters", 888 | help="Comma delimitted list of clusters to monitor (either " 889 | "hostnames or ip-addresses)", 890 | action="store", 891 | default="", 892 | ) 893 | argparser.add_argument( 894 | "-s", 895 | "--stats", 896 | dest="stat_groups", 897 | help="Comma delimitted list of stat names to monitor. Accepts" "multiple.", 898 | default=[], 899 | action="append", 900 | ) 901 | argparser.add_argument( 902 | "-u", 903 | "--update-interval", 904 | dest="update_intervals", 905 | help="Specifies how often, in seconds, the input clusters should " 906 | "be polled for each stat group. 
Accepts multiple.", 907 | action="append", 908 | default=[], 909 | type=int, 910 | ) 911 | 912 | return argparser.parse_args() 913 | -------------------------------------------------------------------------------- /isi_data_insights_daemon.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | from builtins import str 3 | from builtins import range 4 | from past.utils import old_div 5 | from builtins import object 6 | import gevent 7 | import gevent.pool 8 | 9 | from daemons.prefab import run 10 | from ast import literal_eval 11 | import logging 12 | import sys 13 | import time 14 | import urllib3.exceptions 15 | 16 | from isi_stats_client import IsiStatsClient 17 | 18 | MAX_ASYNC_QUERIES = 20 19 | 20 | LOG = logging.getLogger(__name__) 21 | 22 | 23 | class ClusterConfig(object): 24 | def __init__(self, address, name, version, isi_sdk, api_client): 25 | self.address = address 26 | self.name = name 27 | self.version = version 28 | self.isi_sdk = isi_sdk 29 | self.api_client = api_client 30 | 31 | def __eq__(self, other): 32 | """ 33 | Override __eq__ so that we can store this in a list and check for its 34 | existence. 35 | """ 36 | return self.address == other.address 37 | 38 | def __hash__(self): 39 | """ 40 | Override __hash__ so that we can store this in a dict. 41 | """ 42 | return hash(str(self)) 43 | 44 | def __repr__(self): 45 | return self.name 46 | 47 | 48 | class DerivedStatsProcessor(object): 49 | def __init__(self, derived_stat_computers): 50 | self._derived_stat_computers = derived_stat_computers 51 | 52 | def begin_process(self, cluster_name): 53 | for derived_stat_computer in self._derived_stat_computers: 54 | derived_stat_computer.begin_process(cluster_name) 55 | 56 | def select_stat(self, stat): 57 | for derived_stat_computer in self._derived_stat_computers: 58 | derived_stat_computer.select_stat(stat) 59 | 60 | def end_process(self, cluster_name): 61 | for derived_stat_computer in self._derived_stat_computers: 62 | derived_stat_computer.end_process(cluster_name) 63 | 64 | def stats(self): 65 | for derived_stat_computer in self._derived_stat_computers: 66 | yield derived_stat_computer 67 | 68 | 69 | class DerivedStatComputer(object): 70 | def __init__(self, out_stat_name): 71 | self._initialize() 72 | self.out_stat_name = out_stat_name 73 | 74 | def _initialize(self): 75 | self._selected_stat_timestamps = {} 76 | self._selected_stat_errors = {} 77 | 78 | def begin_process(self, cluster_name): 79 | self._initialize() 80 | 81 | def end_process(self, cluster_name): 82 | pass 83 | 84 | def process(self, stat): 85 | pass 86 | 87 | def _choose_stat(self, stat): 88 | LOG.debug("Choose stat: %s", stat.key) 89 | try: 90 | self._selected_stat_timestamps[stat.devid].append(int(stat.time)) 91 | except KeyError: 92 | self._selected_stat_timestamps[stat.devid] = [int(stat.time)] 93 | 94 | def _create_derived_stat(self, value, devid=0, error=None): 95 | class DerivedStat(object): 96 | """ Pretend to be a Stat returned by PAPI """ 97 | 98 | def __init__(self, key, val, node, timestamp, err): 99 | self.key = key 100 | self.value = val 101 | self.devid = node 102 | self.time = timestamp 103 | self.error = err 104 | self.error_code = None if error is None else 1 105 | 106 | avg_timestamp = 0 107 | if error is not None: 108 | try: 109 | avg_timestamp = self._get_timestamp_avg(devid) 110 | except ZeroDivisionError: 111 | error = ( 112 | "Caught ZeroDivisionError from _get_timestamp_avg " 113 | "for stat %s on node %s." 
% (self.out_stat_name, str(devid)) 114 | ) 115 | 116 | return DerivedStat(self.out_stat_name, value, devid, avg_timestamp, error) 117 | 118 | def _get_timestamp_avg(self, devid): 119 | if devid not in self._selected_stat_timestamps and devid == 0: 120 | tot = 0 121 | tot_count = 0 122 | for node in self._selected_stat_timestamps: 123 | tot += sum(self._selected_stat_timestamps[node]) 124 | tot_count += len(self._selected_stat_timestamps[node]) 125 | return int(old_div(tot, tot_count)) 126 | return int( 127 | old_div( 128 | sum(self._selected_stat_timestamps[devid]), 129 | len(self._selected_stat_timestamps[devid]), 130 | ) 131 | ) 132 | 133 | 134 | class DerivedStatInput(object): 135 | def __init__(self, stat_name, stat_fields=()): 136 | self.name = stat_name 137 | if stat_fields and len(stat_fields) > 0: 138 | self._stat_fields = stat_fields 139 | else: 140 | self._stat_fields = None 141 | 142 | def _lookup(self, stat_value, field=None, *fields): 143 | if fields: 144 | # if stat_value is not a dict or list then this will raise 145 | # exception, which is what we want it to do. 146 | if type(stat_value) == dict: 147 | return self._lookup(stat_value.get(field, {}), *fields) 148 | else: 149 | return self._lookup(stat_value[field], *fields) 150 | return stat_value.get(field) 151 | 152 | def get_value(self, stat_value): 153 | if self._stat_fields is not None: 154 | # PAPI has a weird habit of putting stats that have only 1 value 155 | # into a list. When that happens we just ignore the list 156 | if type(stat_value) == list: 157 | num_items = len(stat_value) 158 | if num_items == 1: 159 | stat_value = stat_value[0] 160 | elif num_items == 0: 161 | return None 162 | return self._lookup(stat_value, *self._stat_fields) 163 | return stat_value 164 | 165 | @property 166 | def full_name(self): 167 | return self._get_full_name(self.name) 168 | 169 | def _get_full_name(self, stat_name): 170 | if self._stat_fields is not None: 171 | full_name = stat_name 172 | full_name += ":" 173 | full_name += ":".join(self._stat_fields) 174 | else: 175 | full_name = stat_name 176 | return full_name 177 | 178 | 179 | class ClusterCompositeStatComputer(DerivedStatComputer): 180 | def __init__(self, input_stat, out_stat_name, operation): 181 | super(ClusterCompositeStatComputer, self).__init__(out_stat_name) 182 | self._input_stat = input_stat 183 | self._operation = operation 184 | 185 | def _initialize(self): 186 | super(ClusterCompositeStatComputer, self)._initialize() 187 | self._selected_stat_values = [] 188 | 189 | def select_stat(self, stat): 190 | if stat.key == self._input_stat.name: 191 | self._selected_stat_values.append(self._input_stat.get_value(stat.value)) 192 | self._choose_stat(stat) 193 | 194 | def compute_derived_stat(self): 195 | LOG.debug( 196 | "CCSC %s(%s)", 197 | str(self._operation.__name__), 198 | str(self._selected_stat_values), 199 | ) 200 | return self._create_derived_stat(self._operation(self._selected_stat_values)) 201 | 202 | 203 | class EquationStatComputer(DerivedStatComputer): 204 | def __init__(self, eq_func, input_stats, out_stat_name): 205 | super(EquationStatComputer, self).__init__(out_stat_name) 206 | self._eq_func = eq_func 207 | self._num_func_args = len(input_stats) 208 | self._input_stats = input_stats 209 | self._input_stats_names = {} 210 | self._input_stat_locations = {} 211 | for index in range(0, self._num_func_args): 212 | input_stat = self._input_stats[index] 213 | # setup mapping from base stat name to input_stat 214 | try: 215 | # there might be multiple fields from 
a single stat with this 216 | # name so we need to keep a list of input_stats 217 | self._input_stats_names[input_stat.name].append(input_stat) 218 | except KeyError: 219 | self._input_stats_names[input_stat.name] = [input_stat] 220 | # setup mapping from name to location(s) in the equation 221 | try: 222 | self._input_stat_locations[input_stat.full_name].append(index) 223 | except KeyError: 224 | self._input_stat_locations[input_stat.full_name] = [index] 225 | 226 | def _initialize(self): 227 | super(EquationStatComputer, self)._initialize() 228 | self._selected_stat_values = {} 229 | self._nodes = set() 230 | 231 | def select_stat(self, stat): 232 | # check if this stat is included in this equation 233 | try: 234 | input_stats = self._input_stats_names[stat.key] 235 | # if there is an entry for this stat then it is part of my equation 236 | self._choose_stat(stat) 237 | self._nodes.add(stat.devid) 238 | except KeyError: 239 | return 240 | for input_stat in input_stats: 241 | try: 242 | selected_stats_by_node = self._selected_stat_values[ 243 | input_stat.full_name 244 | ] 245 | except KeyError: 246 | self._selected_stat_values[input_stat.full_name] = {} 247 | selected_stats_by_node = self._selected_stat_values[ 248 | input_stat.full_name 249 | ] 250 | 251 | try: 252 | selected_stats_by_node[stat.devid] = input_stat.get_value(stat.value) 253 | except KeyError: 254 | selected_stats_by_node = {} 255 | selected_stats_by_node[stat.devid] = input_stat.get_value(stat.value) 256 | 257 | def compute_derived_stats(self): 258 | # return one derived stat per node that the selected stats were 259 | # collected for. 260 | derived_stats = [] 261 | for node in self._nodes: 262 | # for each node build a tuple of the args to the equation 263 | # by iterating through the intput stat names 264 | func_args = [None] * self._num_func_args 265 | for in_stat_name in self._input_stat_locations.keys(): 266 | stat_node = node 267 | if in_stat_name.startswith("cluster.") is True: 268 | stat_node = 0 # this is a cluster stat 269 | stat_value = self._get_stat_value(in_stat_name, stat_node) 270 | in_arg_locations = self._input_stat_locations[in_stat_name] 271 | for in_arg_loc in in_arg_locations: 272 | func_args[in_arg_loc] = stat_value 273 | # if there is at least one non-None arg then convert the Nones to 274 | # zero and try to do the computation. If all are None then skip it. 275 | if self._null_to_zero(func_args) is False: 276 | # failed to get this stat, so return error for it 277 | derived_stat = self._create_derived_stat( 278 | None, 279 | node, 280 | "Failed to get equation input for %s, " 281 | "input params: %s." 
% (self.out_stat_name, tuple(func_args)), 282 | ) 283 | else: 284 | try: 285 | func_args_tuple = tuple(func_args) 286 | LOG.debug( 287 | "EQS [%s]=%s(%s)", 288 | str(node), 289 | str(self._eq_func), 290 | str(func_args_tuple), 291 | ) 292 | derived_stat_value = self._eq_func(*func_args_tuple) 293 | derived_stat = self._create_derived_stat(derived_stat_value, node) 294 | except Exception as exception: 295 | derived_stat = self._create_derived_stat( 296 | None, 297 | node, 298 | error="Exception caught evaluating " 299 | "expression for %s, input " 300 | "params: %s, exception: %s" 301 | % (self.out_stat_name, str(func_args_tuple), str(exception)), 302 | ) 303 | derived_stats.append(derived_stat) 304 | 305 | return derived_stats 306 | 307 | def _null_to_zero(self, func_args): 308 | null_args = [] 309 | # since we don't know the type do some math to get zero in the correct 310 | # data type from one of the non-zero values 311 | zero = None 312 | for aindex in range(0, self._num_func_args): 313 | farg = func_args[aindex] 314 | if farg is None: 315 | null_args.append(aindex) 316 | else: 317 | zero = farg - farg 318 | 319 | if len(null_args) == self._num_func_args: 320 | # all the args are null so return False - we can't compute this 321 | # equation 322 | return False 323 | # go back through and set null args to zero 324 | for aindex in null_args: 325 | func_args[aindex] = zero 326 | 327 | return True 328 | 329 | def _get_stat_value(self, stat_name, node): 330 | try: 331 | return self._selected_stat_values[stat_name][node] 332 | except KeyError: 333 | return None 334 | 335 | 336 | class PercentChangeStatComputer(DerivedStatComputer): 337 | def __init__(self, input_stat, out_stat_name): 338 | super(PercentChangeStatComputer, self).__init__(out_stat_name) 339 | self._input_stat = input_stat 340 | # per node/cluster value 341 | self._cur_values = {} 342 | self._prev_values = {} 343 | 344 | def begin_process(self, cluster_name): 345 | super(PercentChangeStatComputer, self).begin_process(cluster_name) 346 | self._cur_cluster_name = cluster_name 347 | self._cur_values = {} 348 | 349 | def end_process(self, cluster_name): 350 | super(PercentChangeStatComputer, self).end_process(cluster_name) 351 | self._prev_values[cluster_name] = self._cur_values 352 | 353 | def select_stat(self, stat): 354 | if stat.key == self._input_stat.name: 355 | self._cur_values[stat.devid] = self._input_stat.get_value(stat.value) 356 | self._choose_stat(stat) 357 | 358 | def compute_derived_stats(self): 359 | derived_stats = [] 360 | for node in self._cur_values: 361 | try: 362 | cur_value = self._cur_values[node] 363 | except KeyError: 364 | cur_value = None 365 | if cur_value is None: 366 | derived_stat = self._create_derived_stat( 367 | None, 368 | node, 369 | error="Unable to determine current value " 370 | "of input stat: %s" % self._input_stat.full_name, 371 | ) 372 | else: 373 | try: 374 | prev_values = self._prev_values[self._cur_cluster_name] 375 | # TREAT no previous value as zero? 
376 | prev_value = prev_values[node] 377 | LOG.debug( 378 | "PCS [%s]=(%s / %s) - 1", 379 | str(node), 380 | str(cur_value), 381 | str(prev_value), 382 | ) 383 | try: 384 | derived_stat_value = ( 385 | old_div(float(cur_value), float(prev_value)) 386 | ) - 1 387 | except ZeroDivisionError: 388 | if cur_value == 0 or cur_value == 0.0: 389 | # prev_value and cur_value == 0 390 | derived_stat_value = 0.0 391 | else: 392 | derived_stat_value = ( 393 | old_div(float(prev_value), float(cur_value)) 394 | ) - 1 395 | derived_stat_value *= -1.0 396 | derived_stat_value *= 100.0 397 | except KeyError: 398 | # no previous value will cause a KeyError 399 | # so return 0% change 400 | derived_stat_value = 0.0 401 | derived_stat = self._create_derived_stat(derived_stat_value, node) 402 | derived_stats.append(derived_stat) 403 | 404 | return derived_stats 405 | 406 | 407 | class StatsConfig(object): 408 | def __init__(self, cluster_configs, stats, update_interval): 409 | self.cluster_configs = cluster_configs 410 | self.stats = stats 411 | self.update_interval = update_interval 412 | self.cluster_composite_stats = [] 413 | self.equation_stats = [] 414 | self.pct_change_stats = [] 415 | self.final_equation_stats = [] 416 | 417 | 418 | class StatSet(object): 419 | def __init__(self): 420 | self.cluster_configs = [] 421 | self.stats = set() 422 | self.cluster_composite_stats = [] 423 | self.equation_stats = [] 424 | self.pct_change_stats = [] 425 | self.final_equation_stats = [] 426 | 427 | 428 | class UpdateInterval(object): 429 | def __init__(self, interval): 430 | self.interval = interval 431 | self.last_update = 0.0 432 | 433 | 434 | class IsiDataInsightsDaemon(run.RunDaemon): 435 | """ 436 | Periodically query a list of OneFS clusters for statistics and 437 | process them via a configurable stats processor module. 438 | """ 439 | 440 | def __init__(self, pidfile): 441 | """ 442 | Initialize. 443 | :param: pidfile is the path to the daemon's pidfile (required). 444 | """ 445 | super(IsiDataInsightsDaemon, self).__init__(pidfile=pidfile) 446 | self._stat_sets = {} 447 | self._update_intervals = [] 448 | self._stats_processor = None 449 | self._stats_processor_args = None 450 | self._process_stats_func = None 451 | self.async_worker_pool = gevent.pool.Pool(MAX_ASYNC_QUERIES) 452 | 453 | def set_stats_processor(self, stats_processor, processor_args): 454 | self._stats_processor = stats_processor 455 | self._stats_processor_args = processor_args 456 | if hasattr(stats_processor, "process_stat") is True: 457 | self._process_stats_func = self._process_stats_with_derived_stats 458 | self._init_derived_stats_processor() 459 | elif hasattr(stats_processor, "process") is True: 460 | self._process_stats_func = self._process_all_stats 461 | else: 462 | raise AttributeError( 463 | "Results processor module has no process() or " 464 | "process_stat() function." 465 | ) 466 | # start the stats processor module 467 | if hasattr(self._stats_processor, "start") is True: 468 | # need to start the processor now before the process is daemonized 469 | # in case the plugin needs to prompt the user for input prior to 470 | # starting. 
471 | LOG.info("Starting stats processor.") 472 | self._stats_processor.start(self._stats_processor_args) 473 | 474 | def _init_derived_stats_processor(self): 475 | # if the stats processor doesn't define begin_process or end_process, 476 | # then add a noop version so we don't have to check each time we 477 | # process stats 478 | def noop(cluster_name): 479 | pass 480 | 481 | if hasattr(self._stats_processor, "begin_process") is False: 482 | self._stats_processor.begin_process = noop 483 | if hasattr(self._stats_processor, "end_process") is False: 484 | self._stats_processor.end_process = noop 485 | 486 | def add_stats(self, stats_config): 487 | """ 488 | Add set of stats to be queried. 489 | :param: stats_config is an instance of StatsConfig, which defines the 490 | list of stats, an update interval, and the list of clusters to query. 491 | """ 492 | try: 493 | # organize the stat sets by update interval 494 | stat_set = self._stat_sets[stats_config.update_interval] 495 | except KeyError: 496 | self._stat_sets[stats_config.update_interval] = stat_set = StatSet() 497 | self._update_intervals.append(UpdateInterval(stats_config.update_interval)) 498 | 499 | # add the new clusters to the list of clusters associated with this 500 | # update interval's stat set. 501 | for cluster in stats_config.cluster_configs: 502 | if cluster not in stat_set.cluster_configs: 503 | # TODO this is a bug - this causes these stats to be queried on 504 | # all clusters in this update interval, not just the clusters 505 | # defined in this stats_config 506 | stat_set.cluster_configs.append(cluster) 507 | 508 | # add the new stats to the stat set 509 | for stat_name in stats_config.stats: 510 | stat_set.stats.add(stat_name) 511 | 512 | stat_set.cluster_composite_stats.extend(stats_config.cluster_composite_stats) 513 | 514 | stat_set.equation_stats.extend(stats_config.equation_stats) 515 | 516 | stat_set.pct_change_stats.extend(stats_config.pct_change_stats) 517 | 518 | stat_set.final_equation_stats.extend(stats_config.final_equation_stats) 519 | 520 | def get_stat_set_count(self): 521 | return len(self._stat_sets) 522 | 523 | def get_next_stat_set(self): 524 | for update_interval, stat_set in self._stat_sets.items(): 525 | yield update_interval, stat_set 526 | 527 | def run(self, debug=False): 528 | """ 529 | Loop through stat sets, query for their values, and process them with 530 | the stats processor. 531 | """ 532 | LOG.info("Starting.") 533 | 534 | sleep_secs = 0 535 | start_time = time.time() 536 | # setup the last update time of each update interval so that they all 537 | # get updated on the first pass. 538 | for update_interval in self._update_intervals: 539 | update_interval.last_update = start_time - update_interval.interval 540 | 541 | while True: 542 | LOG.debug("Sleeping for %f seconds.", sleep_secs) 543 | time.sleep(sleep_secs) 544 | 545 | # query and process the stat sets whose update interval has been 546 | # hit or surpassed. 547 | self._query_and_process_stats(time.time(), debug) 548 | 549 | cur_time = time.time() 550 | # figure out the shortest amount of time until the next update is 551 | # needed and sleep for that amount of time. 
552 | min_next_update = sys.float_info.max 553 | for update_interval in self._update_intervals: 554 | next_update_time = ( 555 | update_interval.last_update + update_interval.interval 556 | ) 557 | 558 | time_to_next_update = next_update_time - cur_time 559 | min_next_update = min(time_to_next_update, min_next_update) 560 | sleep_secs = max(0.0, min_next_update) 561 | 562 | def shutdown(self, signum): 563 | """ 564 | Stops the stats processor prior to stopping the daemon. 565 | """ 566 | LOG.info("Stopping.") 567 | if ( 568 | self._stats_processor is not None 569 | and hasattr(self._stats_processor, "stop") is True 570 | ): 571 | LOG.info("Stopping stats processor.") 572 | self._stats_processor.stop() 573 | super(IsiDataInsightsDaemon, self).shutdown(signum) 574 | 575 | def _query_and_process_stats(self, cur_time, debug): 576 | """ 577 | Build a unique set of stats to update per cluster from each set of 578 | stats that are in need of updating based on the amount of time elapsed 579 | since their last update. 580 | """ 581 | # there might be more than one stat set that needs updating and thus 582 | # there might be common clusters between those stat sets, so this loop 583 | # makes sure that we only send one query to each unique cluster. 584 | cluster_stats = {} 585 | for update_interval in self._update_intervals: 586 | # if the update_interval is less than or equal to the elapsed_time 587 | # then we need to query the stats associated with this update 588 | # interval. 589 | time_since_last_update = cur_time - update_interval.last_update 590 | if time_since_last_update >= update_interval.interval: 591 | LOG.debug( 592 | "updating interval:%d time_since_last_update: %f", 593 | update_interval.interval, 594 | time_since_last_update, 595 | ) 596 | # update the last_update time 597 | update_interval.last_update = cur_time 598 | # add the stats from stat set to their respective cluster_stats 599 | cur_stat_set = self._stat_sets[update_interval.interval] 600 | for cluster in cur_stat_set.cluster_configs: 601 | try: 602 | ( 603 | cluster_stat_set, 604 | cluster_composite_stats, 605 | equation_stats, 606 | pct_change_stats, 607 | final_equation_stats, 608 | ) = cluster_stats[cluster] 609 | cluster_composite_stats.extend( 610 | cur_stat_set.cluster_composite_stats 611 | ) 612 | equation_stats.extend(cur_stat_set.equation_stats) 613 | pct_change_stats.extend(cur_stat_set.pct_change_stats) 614 | final_equation_stats.extend(cur_stat_set.final_equation_stats) 615 | except KeyError: 616 | cluster_stat_set = set() 617 | cluster_stats[cluster] = ( 618 | cluster_stat_set, 619 | cur_stat_set.cluster_composite_stats, 620 | cur_stat_set.equation_stats, 621 | cur_stat_set.pct_change_stats, 622 | cur_stat_set.final_equation_stats, 623 | ) 624 | 625 | for stat_name in cur_stat_set.stats: 626 | cluster_stat_set.add(stat_name) 627 | 628 | # now we have a unique list of clusters to query, so query them 629 | for ( 630 | cluster, 631 | (stats, composite_stats, eq_stats, pct_change_stats, final_eq_stats), 632 | ) in cluster_stats.items(): 633 | self.async_worker_pool.spawn( 634 | self._query_and_process_stats1, 635 | cluster, 636 | stats, 637 | composite_stats, 638 | eq_stats, 639 | pct_change_stats, 640 | final_eq_stats, 641 | debug, 642 | ) 643 | self.async_worker_pool.join() 644 | 645 | def _query_and_process_stats1( 646 | self, 647 | cluster, 648 | stats, 649 | composite_stats, 650 | eq_stats, 651 | pct_change_stats, 652 | final_eq_stats, 653 | debug, 654 | ): 655 | LOG.debug("Querying cluster %s %f", 
cluster.name, cluster.version) 656 | LOG.debug("Querying stats %d.", len(stats)) 657 | stats_client = IsiStatsClient(cluster.isi_sdk.StatisticsApi(cluster.api_client)) 658 | # query the current cluster with the current set of stats 659 | try: 660 | if cluster.version >= 8.0: 661 | results = stats_client.query_stats(stats) 662 | else: 663 | results = self._v7_2_multistat_query(stats, stats_client) 664 | except ( 665 | urllib3.exceptions.HTTPError, 666 | cluster.isi_sdk.rest.ApiException, 667 | ) as http_exc: 668 | LOG.error( 669 | "Failed to query stats from cluster %s, exception " "raised: %s", 670 | cluster.name, 671 | str(http_exc), 672 | ) 673 | return 674 | except Exception as gen_exc: 675 | # if in debug mode then re-raise general Exceptions because 676 | # they are most likely bugs in the code, but in non-debug mode 677 | # just continue 678 | if debug is False: 679 | LOG.error( 680 | "Failed to query stats from cluster %s, exception " "raised: %s", 681 | cluster.name, 682 | str(gen_exc), 683 | ) 684 | return 685 | else: 686 | raise gen_exc 687 | 688 | composite_stats_processor = DerivedStatsProcessor(composite_stats) 689 | equation_stats_processor = DerivedStatsProcessor(eq_stats) 690 | pct_change_stats_processor = DerivedStatsProcessor(pct_change_stats) 691 | final_equation_stats_processor = DerivedStatsProcessor(final_eq_stats) 692 | derived_stats_processors = ( 693 | composite_stats_processor, 694 | equation_stats_processor, 695 | pct_change_stats_processor, 696 | final_equation_stats_processor, 697 | ) 698 | # calls either _process_all_stats or 699 | # _process_stats_with_derived_stats depending on whether or not the 700 | # _stats_processor has a process_stat function or just a process 701 | # function. The latter requires the process_stat function. 
702 | self._process_stats_func(cluster.name, results, derived_stats_processors) 703 | 704 | def _v7_2_multistat_query(self, stats, stats_client): 705 | result = [] 706 | for stat in stats: 707 | result.extend(stats_client.query_stat(stat)) 708 | return result 709 | 710 | def _process_all_stats(self, *args): 711 | cluster_name = args[0] 712 | results = args[1] 713 | # the initial version of the stats processor plugin processed all stats 714 | # at once, this function allows backwards compatibility, but derived 715 | # stats are not supported 716 | self._stats_processor.process(cluster_name, results) 717 | 718 | def _process_stats_with_derived_stats( 719 | self, cluster_name, stats_query_results, derived_stats 720 | ): 721 | LOG.debug("Processing stat results on %s", cluster_name) 722 | self._stats_processor.begin_process(cluster_name) 723 | ( 724 | cluster_composite_stats, 725 | equation_stats, 726 | pct_change_stats, 727 | final_equation_stats, 728 | ) = derived_stats 729 | cluster_composite_stats.begin_process(cluster_name) 730 | equation_stats.begin_process(cluster_name) 731 | pct_change_stats.begin_process(cluster_name) 732 | final_equation_stats.begin_process(cluster_name) 733 | # process the results 734 | for stat in stats_query_results: 735 | # check if the stat query returned an error 736 | if stat.error is not None: 737 | LOG.warning( 738 | "Query for stat: '%s' on '%s', returned error: '%s'.", 739 | str(stat.key), 740 | cluster_name, 741 | str(stat.error), 742 | ) 743 | continue 744 | self._prep_stat(stat) 745 | # let stats processor process it 746 | self._stats_processor.process_stat(cluster_name, stat) 747 | # allow derived stats to select/use this stat 748 | cluster_composite_stats.select_stat(stat) 749 | equation_stats.select_stat(stat) 750 | pct_change_stats.select_stat(stat) 751 | final_equation_stats.select_stat(stat) 752 | 753 | LOG.debug("Processing composite stats on %s", cluster_name) 754 | for composite_stat in cluster_composite_stats.stats(): 755 | # composite stats always return only one derived stat 756 | derived_stat = composite_stat.compute_derived_stat() 757 | if derived_stat.error is not None: 758 | LOG.warning( 759 | "Cluster node composite stat: " 760 | "'%s' on '%s', returned error: '%s'.", 761 | str(derived_stat.key), 762 | cluster_name, 763 | str(derived_stat.error), 764 | ) 765 | continue 766 | LOG.debug( 767 | "ClusterCompositeStat[%s]=%s", derived_stat.key, str(derived_stat.value) 768 | ) 769 | # let stats processor process it 770 | self._stats_processor.process_stat(cluster_name, derived_stat) 771 | # allow derived stats to select/use this stat 772 | equation_stats.select_stat(derived_stat) 773 | pct_change_stats.select_stat(derived_stat) 774 | final_equation_stats.select_stat(derived_stat) 775 | 776 | LOG.debug("Processing equation stats on %s", cluster_name) 777 | for eq_stat in equation_stats.stats(): 778 | # equation stats might produce more than one derived stat, 779 | # potentially one stat per node 780 | derived_stats = eq_stat.compute_derived_stats() 781 | for derived_stat in derived_stats: 782 | if derived_stat.error is not None: 783 | LOG.warning( 784 | "Equation computed stat: " 785 | "'%s' on '%s', returned error: '%s'.", 786 | str(derived_stat.key), 787 | cluster_name, 788 | str(derived_stat.error), 789 | ) 790 | continue 791 | LOG.debug( 792 | "EquationStat[%s]=%s", derived_stat.key, str(derived_stat.value) 793 | ) 794 | # let stats processor process them 795 | self._stats_processor.process_stat(cluster_name, derived_stat) 796 | # allow 
derived stats to select/use this stat 797 | pct_change_stats.select_stat(derived_stat) 798 | final_equation_stats.select_stat(derived_stat) 799 | 800 | LOG.debug("Processing percent change stats on %s", cluster_name) 801 | for pct_change_stat in pct_change_stats.stats(): 802 | # percent change stats might produce more than one derived stat, 803 | # potentially one stat per node 804 | derived_stats = pct_change_stat.compute_derived_stats() 805 | for derived_stat in derived_stats: 806 | if derived_stat.error is not None: 807 | LOG.warning( 808 | "Percent change stat: " "'%s' on '%s', returned error: '%s'.", 809 | str(derived_stat.key), 810 | cluster_name, 811 | str(derived_stat.error), 812 | ) 813 | continue 814 | LOG.debug( 815 | "PercentChangeStat[%s]=%s", 816 | derived_stat.key, 817 | str(derived_stat.value), 818 | ) 819 | # let stats processor process it 820 | self._stats_processor.process_stat(cluster_name, derived_stat) 821 | # allow derived stats to select/use this stat 822 | final_equation_stats.select_stat(derived_stat) 823 | 824 | LOG.debug("Processing final equation stats on %s", cluster_name) 825 | for eq_stat in final_equation_stats.stats(): 826 | # equation stats might produce more than one derived stat, 827 | # potentially one stat per node 828 | derived_stats = eq_stat.compute_derived_stats() 829 | for derived_stat in derived_stats: 830 | if derived_stat.error is not None: 831 | LOG.warning( 832 | "Final equation computed stat: " 833 | "'%s' on '%s', returned error: '%s'.", 834 | str(derived_stat.key), 835 | cluster_name, 836 | str(derived_stat.error), 837 | ) 838 | continue 839 | LOG.debug( 840 | "FinalEquationStat[%s]=%s", 841 | derived_stat.key, 842 | str(derived_stat.value), 843 | ) 844 | # let stats processor process them 845 | self._stats_processor.process_stat(cluster_name, derived_stat) 846 | 847 | self._stats_processor.end_process(cluster_name) 848 | cluster_composite_stats.end_process(cluster_name) 849 | equation_stats.end_process(cluster_name) 850 | pct_change_stats.end_process(cluster_name) 851 | final_equation_stats.end_process(cluster_name) 852 | 853 | def _prep_stat(self, stat): 854 | try: 855 | # the stat value's data type is variable depending on the key so 856 | # use literal_eval() to convert it to the correct type 857 | eval_value = literal_eval(stat.value) 858 | # convert tuples to a list for simplicity 859 | if type(eval_value) == tuple: 860 | stat.value = list(eval_value) 861 | else: 862 | stat.value = eval_value 863 | except Exception: # if literal_eval throws an exception 864 | # then just leave it as string value 865 | pass 866 | -------------------------------------------------------------------------------- /dashboards/prometheus/grafana_cluster_list_dashboard.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "id": 11, 19 | "iteration": 1600859254949, 20 | "links": [], 21 | "panels": [ 22 | { 23 | "collapsed": true, 24 | "datasource": null, 25 | "gridPos": { 26 | "h": 1, 27 | "w": 24, 28 | "x": 0, 29 | "y": 0 30 | }, 31 | "id": 214, 32 | "panels": [ 33 | { 34 | "content": "* Use the pull down at the very top left of the page (next to the spiral icon) to select which dashboard you 
want to look at.\n* Use the cluster and other pull downs to select the cluster and protocol of interest.\n* Use the pull downs at the top right to select a specific time period of interest.\n* Note that by default the dates and time displayed are in your browser’s time zone, not the source cluster. You can get it to display in UTC via the settings under the little gear symbol at the top of the page.\n* You can hide rows using the green slide-out tab to the left of each chart.\n* If there is a legend displayed you can click on elements within it to hide or display items, etc.\n* Click on the title of the chart and then the horizontal bars icon at the left to show/hide the legend and get a CSV export of the data.\n* There is no significance in whether things are displayed as lines, bars or points - we have used whatever seems to be clearest for the data.\n", 35 | "datasource": null, 36 | "editable": true, 37 | "error": false, 38 | "fieldConfig": { 39 | "defaults": { 40 | "custom": {} 41 | }, 42 | "overrides": [] 43 | }, 44 | "gridPos": { 45 | "h": 7, 46 | "w": 24, 47 | "x": 0, 48 | "y": 1 49 | }, 50 | "id": 18, 51 | "isNew": true, 52 | "links": [], 53 | "mode": "markdown", 54 | "options": { 55 | "content": "* Use the pull down at the very top left of the page (next to the spiral icon) to select which dashboard you want to look at.\n* Use the cluster and other pull downs to select the cluster and protocol of interest.\n* Use the pull downs at the top right to select a specific time period of interest.\n* Note that by default the dates and time displayed are in your browser’s time zone, not the source cluster. You can get it to display in UTC via the settings under the little gear symbol at the top of the page.\n* You can hide rows using the green slide-out tab to the left of each chart.\n* If there is a legend displayed you can click on elements within it to hide or display items, etc.\n* Click on the title of the chart and then the horizontal bars icon at the left to show/hide the legend and get a CSV export of the data.\n* There is no significance in whether things are displayed as lines, bars or points - we have used whatever seems to be clearest for the data.\n", 56 | "mode": "markdown" 57 | }, 58 | "pluginVersion": "7.1.0", 59 | "title": "Welcome to the Isilon Cluster Summary Dashboard", 60 | "type": "text" 61 | } 62 | ], 63 | "title": "Welcome to the Isilon Cluster Summary Dashboard", 64 | "type": "row" 65 | }, 66 | { 67 | "collapsed": false, 68 | "datasource": null, 69 | "gridPos": { 70 | "h": 1, 71 | "w": 24, 72 | "x": 0, 73 | "y": 1 74 | }, 75 | "id": 215, 76 | "panels": [], 77 | "repeat": "cluster", 78 | "scopedVars": { 79 | "cluster": { 80 | "selected": true, 81 | "text": "All", 82 | "value": "$__all" 83 | } 84 | }, 85 | "title": "$cluster", 86 | "type": "row" 87 | }, 88 | { 89 | "content": "Detail dashboard
\nWebUI for $cluster", 90 | "datasource": null, 91 | "editable": true, 92 | "error": false, 93 | "fieldConfig": { 94 | "defaults": { 95 | "custom": {} 96 | }, 97 | "overrides": [] 98 | }, 99 | "gridPos": { 100 | "h": 4, 101 | "w": 2, 102 | "x": 0, 103 | "y": 2 104 | }, 105 | "id": 35, 106 | "isNew": true, 107 | "links": [ 108 | { 109 | "targetBlank": false, 110 | "title": "Detail dashboard for $cluster", 111 | "url": "dashboard/db/isilon-data-insights?$__url_time_range&$__all_variables" 112 | }, 113 | { 114 | "targetBlank": true, 115 | "title": "WebUI", 116 | "url": "https://$cluster:8080/" 117 | } 118 | ], 119 | "mode": "html", 120 | "options": { 121 | "content": "Detail dashboard
\nWebUI for $cluster", 122 | "mode": "html" 123 | }, 124 | "pluginVersion": "7.1.0", 125 | "repeatIteration": 1476718550844, 126 | "scopedVars": { 127 | "cluster": { 128 | "selected": true, 129 | "text": "All", 130 | "value": "$__all" 131 | } 132 | }, 133 | "title": "$cluster", 134 | "transparent": true, 135 | "type": "text" 136 | }, 137 | { 138 | "cacheTimeout": null, 139 | "colorBackground": false, 140 | "colorValue": false, 141 | "colors": [ 142 | "rgba(50, 172, 45, 0.97)", 143 | "rgba(237, 129, 40, 0.89)", 144 | "rgba(245, 54, 54, 0.9)" 145 | ], 146 | "datasource": "Prometheus", 147 | "editable": true, 148 | "error": false, 149 | "fieldConfig": { 150 | "defaults": { 151 | "custom": {} 152 | }, 153 | "overrides": [] 154 | }, 155 | "format": "none", 156 | "gauge": { 157 | "maxValue": 100, 158 | "minValue": 0, 159 | "show": false, 160 | "thresholdLabels": false, 161 | "thresholdMarkers": false 162 | }, 163 | "gridPos": { 164 | "h": 4, 165 | "w": 2, 166 | "x": 2, 167 | "y": 2 168 | }, 169 | "height": "", 170 | "id": 207, 171 | "interval": null, 172 | "isNew": true, 173 | "links": [ 174 | { 175 | "targetBlank": true, 176 | "title": "WebUI for $cluster", 177 | "url": "https://$cluster:8080/" 178 | } 179 | ], 180 | "mappingType": 1, 181 | "mappingTypes": [ 182 | { 183 | "name": "value to text", 184 | "value": 1 185 | }, 186 | { 187 | "name": "range to text", 188 | "value": 2 189 | } 190 | ], 191 | "maxDataPoints": 100, 192 | "nullPointMode": "connected", 193 | "nullText": null, 194 | "postfix": "", 195 | "postfixFontSize": "50%", 196 | "prefix": "", 197 | "prefixFontSize": "50%", 198 | "rangeMaps": [ 199 | { 200 | "from": "null", 201 | "text": "N/A", 202 | "to": "null" 203 | } 204 | ], 205 | "repeatIteration": 1476718550844, 206 | "scopedVars": { 207 | "cluster": { 208 | "selected": true, 209 | "text": "All", 210 | "value": "$__all" 211 | } 212 | }, 213 | "sparkline": { 214 | "fillColor": "rgba(31, 118, 189, 0.18)", 215 | "full": false, 216 | "lineColor": "rgb(31, 120, 193)", 217 | "show": false 218 | }, 219 | "tableColumn": "", 220 | "targets": [ 221 | { 222 | "expr": "max(isilon_cluster_node_count_all{hostname=~\"$cluster\"})", 223 | "interval": "", 224 | "legendFormat": "", 225 | "refId": "A" 226 | } 227 | ], 228 | "thresholds": "1,2", 229 | "title": "Total Nodes", 230 | "type": "singlestat", 231 | "valueFontSize": "80%", 232 | "valueMaps": [ 233 | { 234 | "op": "=", 235 | "text": "", 236 | "value": "" 237 | } 238 | ], 239 | "valueName": "current" 240 | }, 241 | { 242 | "cacheTimeout": null, 243 | "colorBackground": true, 244 | "colorValue": false, 245 | "colors": [ 246 | "rgba(50, 172, 45, 0.97)", 247 | "rgba(237, 129, 40, 0.89)", 248 | "rgba(245, 54, 54, 0.9)" 249 | ], 250 | "datasource": "Prometheus", 251 | "editable": true, 252 | "error": false, 253 | "fieldConfig": { 254 | "defaults": { 255 | "custom": {} 256 | }, 257 | "overrides": [] 258 | }, 259 | "format": "none", 260 | "gauge": { 261 | "maxValue": 100, 262 | "minValue": 0, 263 | "show": false, 264 | "thresholdLabels": false, 265 | "thresholdMarkers": true 266 | }, 267 | "gridPos": { 268 | "h": 4, 269 | "w": 2, 270 | "x": 4, 271 | "y": 2 272 | }, 273 | "id": 13, 274 | "interval": null, 275 | "isNew": true, 276 | "links": [ 277 | { 278 | "targetBlank": true, 279 | "title": "WebUI for $cluster", 280 | "url": "https://$cluster:8080/" 281 | } 282 | ], 283 | "mappingType": 1, 284 | "mappingTypes": [ 285 | { 286 | "name": "value to text", 287 | "value": 1 288 | }, 289 | { 290 | "name": "range to text", 291 | "value": 2 292 | } 293 | ], 
294 | "maxDataPoints": 100, 295 | "nullPointMode": "connected", 296 | "nullText": null, 297 | "postfix": "", 298 | "postfixFontSize": "50%", 299 | "prefix": "", 300 | "prefixFontSize": "50%", 301 | "rangeMaps": [ 302 | { 303 | "from": "null", 304 | "text": "N/A", 305 | "to": "null" 306 | } 307 | ], 308 | "repeatIteration": 1476718550844, 309 | "scopedVars": { 310 | "cluster": { 311 | "selected": true, 312 | "text": "All", 313 | "value": "$__all" 314 | } 315 | }, 316 | "sparkline": { 317 | "fillColor": "rgba(31, 118, 189, 0.18)", 318 | "full": false, 319 | "lineColor": "rgb(31, 120, 193)", 320 | "show": false 321 | }, 322 | "tableColumn": "", 323 | "targets": [ 324 | { 325 | "expr": "max(isilon_cluster_node_count_down{hostname=~\"$cluster\"})", 326 | "interval": "", 327 | "legendFormat": "", 328 | "refId": "A" 329 | } 330 | ], 331 | "thresholds": "1,2", 332 | "title": "Nodes Down", 333 | "type": "singlestat", 334 | "valueFontSize": "80%", 335 | "valueMaps": [ 336 | { 337 | "op": "=", 338 | "text": "", 339 | "value": "" 340 | } 341 | ], 342 | "valueName": "current" 343 | }, 344 | { 345 | "cacheTimeout": null, 346 | "colorBackground": true, 347 | "colorValue": false, 348 | "colors": [ 349 | "rgba(50, 172, 45, 0.97)", 350 | "rgba(237, 129, 40, 0.89)", 351 | "rgba(245, 54, 54, 0.9)" 352 | ], 353 | "datasource": "Prometheus", 354 | "editable": true, 355 | "error": false, 356 | "fieldConfig": { 357 | "defaults": { 358 | "custom": {} 359 | }, 360 | "overrides": [] 361 | }, 362 | "format": "none", 363 | "gauge": { 364 | "maxValue": 100, 365 | "minValue": 0, 366 | "show": false, 367 | "thresholdLabels": false, 368 | "thresholdMarkers": true 369 | }, 370 | "gridPos": { 371 | "h": 4, 372 | "w": 2, 373 | "x": 6, 374 | "y": 2 375 | }, 376 | "id": 14, 377 | "interval": null, 378 | "isNew": true, 379 | "links": [ 380 | { 381 | "targetBlank": true, 382 | "title": "WebUI for $cluster", 383 | "url": "https://$cluster:8080/" 384 | } 385 | ], 386 | "mappingType": 2, 387 | "mappingTypes": [ 388 | { 389 | "name": "value to text", 390 | "value": 1 391 | }, 392 | { 393 | "name": "range to text", 394 | "value": 2 395 | } 396 | ], 397 | "maxDataPoints": 100, 398 | "nullPointMode": "connected", 399 | "nullText": null, 400 | "postfix": "", 401 | "postfixFontSize": "50%", 402 | "prefix": "", 403 | "prefixFontSize": "50%", 404 | "rangeMaps": [ 405 | { 406 | "from": "0", 407 | "text": "Healthy", 408 | "to": "0" 409 | }, 410 | { 411 | "from": ".0001", 412 | "text": "Attention", 413 | "to": "1.999" 414 | }, 415 | { 416 | "from": "2", 417 | "text": "Down", 418 | "to": "5" 419 | } 420 | ], 421 | "repeatIteration": 1476718550844, 422 | "scopedVars": { 423 | "cluster": { 424 | "selected": true, 425 | "text": "All", 426 | "value": "$__all" 427 | } 428 | }, 429 | "sparkline": { 430 | "fillColor": "rgba(31, 118, 189, 0.18)", 431 | "full": false, 432 | "lineColor": "rgb(31, 120, 193)", 433 | "show": false 434 | }, 435 | "tableColumn": "", 436 | "targets": [ 437 | { 438 | "expr": "max(isilon_cluster_health{hostname=~\"$cluster\"})", 439 | "interval": "", 440 | "legendFormat": "", 441 | "refId": "A" 442 | } 443 | ], 444 | "thresholds": "0.0001,2", 445 | "title": "Alert Status", 446 | "type": "singlestat", 447 | "valueFontSize": "50%", 448 | "valueMaps": [ 449 | { 450 | "op": "=", 451 | "text": "Healthy", 452 | "value": "0" 453 | }, 454 | { 455 | "op": "=", 456 | "text": "Attention", 457 | "value": "1" 458 | }, 459 | { 460 | "op": "=", 461 | "text": "Down", 462 | "value": "2" 463 | } 464 | ], 465 | "valueName": "avg" 466 | }, 467 | { 
468 | "cacheTimeout": null, 469 | "colorBackground": false, 470 | "colorValue": false, 471 | "colors": [ 472 | "rgba(50, 172, 45, 0.97)", 473 | "rgba(237, 129, 40, 0.89)", 474 | "rgba(245, 54, 54, 0.9)" 475 | ], 476 | "datasource": "Prometheus", 477 | "editable": true, 478 | "error": false, 479 | "fieldConfig": { 480 | "defaults": { 481 | "custom": {} 482 | }, 483 | "overrides": [] 484 | }, 485 | "format": "percentunit", 486 | "gauge": { 487 | "maxValue": 1, 488 | "minValue": 0, 489 | "show": true, 490 | "thresholdLabels": false, 491 | "thresholdMarkers": true 492 | }, 493 | "gridPos": { 494 | "h": 4, 495 | "w": 2, 496 | "x": 8, 497 | "y": 2 498 | }, 499 | "id": 8, 500 | "interval": null, 501 | "isNew": true, 502 | "links": [ 503 | { 504 | "targetBlank": false, 505 | "title": "Detail dashboard for $cluster", 506 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 507 | } 508 | ], 509 | "mappingType": 2, 510 | "mappingTypes": [ 511 | { 512 | "name": "value to text", 513 | "value": 1 514 | }, 515 | { 516 | "name": "range to text", 517 | "value": 2 518 | } 519 | ], 520 | "maxDataPoints": 100, 521 | "nullPointMode": "connected", 522 | "nullText": null, 523 | "postfix": "", 524 | "postfixFontSize": "50%", 525 | "prefix": "", 526 | "prefixFontSize": "50%", 527 | "rangeMaps": [], 528 | "repeatIteration": 1476718550844, 529 | "scopedVars": { 530 | "cluster": { 531 | "selected": true, 532 | "text": "All", 533 | "value": "$__all" 534 | } 535 | }, 536 | "sparkline": { 537 | "fillColor": "rgba(31, 118, 189, 0.18)", 538 | "full": true, 539 | "lineColor": "rgb(31, 120, 193)", 540 | "show": true 541 | }, 542 | "tableColumn": "", 543 | "targets": [ 544 | { 545 | "expr": "1.0 - avg(isilon_cluster_cpu_idle_avg{hostname=~\"$cluster\"})/1000", 546 | "interval": "", 547 | "legendFormat": "", 548 | "refId": "A" 549 | } 550 | ], 551 | "thresholds": "0.80,0.95", 552 | "title": "Cluster CPU", 553 | "type": "singlestat", 554 | "valueFontSize": "80%", 555 | "valueMaps": [ 556 | { 557 | "op": "=", 558 | "text": "N/A", 559 | "value": "null" 560 | } 561 | ], 562 | "valueName": "current" 563 | }, 564 | { 565 | "cacheTimeout": null, 566 | "colorBackground": false, 567 | "colorValue": false, 568 | "colors": [ 569 | "rgba(50, 172, 45, 0.97)", 570 | "rgba(237, 129, 40, 0.89)", 571 | "rgba(245, 54, 54, 0.9)" 572 | ], 573 | "datasource": "Prometheus", 574 | "editable": true, 575 | "error": false, 576 | "fieldConfig": { 577 | "defaults": { 578 | "custom": {} 579 | }, 580 | "overrides": [] 581 | }, 582 | "format": "percent", 583 | "gauge": { 584 | "maxValue": 100, 585 | "minValue": 0, 586 | "show": true, 587 | "thresholdLabels": false, 588 | "thresholdMarkers": true 589 | }, 590 | "gridPos": { 591 | "h": 4, 592 | "w": 2, 593 | "x": 10, 594 | "y": 2 595 | }, 596 | "id": 9, 597 | "interval": null, 598 | "isNew": true, 599 | "links": [ 600 | { 601 | "targetBlank": false, 602 | "title": "Detail dashboard for $cluster", 603 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 604 | } 605 | ], 606 | "mappingType": 2, 607 | "mappingTypes": [ 608 | { 609 | "name": "value to text", 610 | "value": 1 611 | }, 612 | { 613 | "name": "range to text", 614 | "value": 2 615 | } 616 | ], 617 | "maxDataPoints": 100, 618 | "nullPointMode": "connected", 619 | "nullText": null, 620 | "postfix": "", 621 | "postfixFontSize": "50%", 622 | "prefix": "", 623 | "prefixFontSize": "50%", 624 | "rangeMaps": [], 625 | "repeatIteration": 1476718550844, 626 | "scopedVars": { 
627 | "cluster": { 628 | "selected": true, 629 | "text": "All", 630 | "value": "$__all" 631 | } 632 | }, 633 | "sparkline": { 634 | "fillColor": "rgba(31, 118, 189, 0.18)", 635 | "full": true, 636 | "lineColor": "rgb(31, 120, 193)", 637 | "show": true 638 | }, 639 | "tableColumn": "", 640 | "targets": [ 641 | { 642 | "expr": "100 - avg(isilon_ifs_percent_avail{hostname=~\"$cluster\"})", 643 | "interval": "", 644 | "legendFormat": "", 645 | "refId": "A" 646 | } 647 | ], 648 | "thresholds": "80,90", 649 | "title": "Cluster Capacity", 650 | "type": "singlestat", 651 | "valueFontSize": "80%", 652 | "valueMaps": [ 653 | { 654 | "op": "=", 655 | "text": "N/A", 656 | "value": "null" 657 | } 658 | ], 659 | "valueName": "current" 660 | }, 661 | { 662 | "cacheTimeout": null, 663 | "colorBackground": false, 664 | "colorValue": false, 665 | "colors": [ 666 | "rgba(245, 54, 54, 0.9)", 667 | "rgba(237, 129, 40, 0.89)", 668 | "rgba(50, 172, 45, 0.97)" 669 | ], 670 | "datasource": "Prometheus", 671 | "editable": true, 672 | "error": false, 673 | "fieldConfig": { 674 | "defaults": { 675 | "custom": {} 676 | }, 677 | "overrides": [] 678 | }, 679 | "format": "Bps", 680 | "gauge": { 681 | "maxValue": 100, 682 | "minValue": 0, 683 | "show": false, 684 | "thresholdLabels": false, 685 | "thresholdMarkers": true 686 | }, 687 | "gridPos": { 688 | "h": 4, 689 | "w": 2, 690 | "x": 12, 691 | "y": 2 692 | }, 693 | "id": 208, 694 | "interval": null, 695 | "isNew": true, 696 | "links": [ 697 | { 698 | "title": "Isilon Data Insights Cluster Detail", 699 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 700 | } 701 | ], 702 | "mappingType": 1, 703 | "mappingTypes": [ 704 | { 705 | "name": "value to text", 706 | "value": 1 707 | }, 708 | { 709 | "name": "range to text", 710 | "value": 2 711 | } 712 | ], 713 | "maxDataPoints": 100, 714 | "nullPointMode": "connected", 715 | "nullText": null, 716 | "postfix": "", 717 | "postfixFontSize": "50%", 718 | "prefix": "", 719 | "prefixFontSize": "50%", 720 | "rangeMaps": [ 721 | { 722 | "from": "null", 723 | "text": "N/A", 724 | "to": "null" 725 | } 726 | ], 727 | "repeatIteration": 1476718550844, 728 | "scopedVars": { 729 | "cluster": { 730 | "selected": true, 731 | "text": "All", 732 | "value": "$__all" 733 | } 734 | }, 735 | "sparkline": { 736 | "fillColor": "rgba(31, 118, 189, 0.18)", 737 | "full": true, 738 | "lineColor": "rgb(31, 120, 193)", 739 | "show": true 740 | }, 741 | "tableColumn": "", 742 | "targets": [ 743 | { 744 | "expr": "isilon_cluster_protostats_nfs_in_rate{hostname=~\"$cluster\"}+isilon_cluster_protostats_nfs_out_rate{hostname=~\"$cluster\"}", 745 | "interval": "", 746 | "legendFormat": "", 747 | "refId": "A" 748 | } 749 | ], 750 | "thresholds": "", 751 | "title": "NFSv3 Throughput", 752 | "type": "singlestat", 753 | "valueFontSize": "80%", 754 | "valueMaps": [ 755 | { 756 | "op": "=", 757 | "text": "N/A", 758 | "value": "null" 759 | } 760 | ], 761 | "valueName": "current" 762 | }, 763 | { 764 | "cacheTimeout": null, 765 | "colorBackground": false, 766 | "colorValue": false, 767 | "colors": [ 768 | "rgba(245, 54, 54, 0.9)", 769 | "rgba(237, 129, 40, 0.89)", 770 | "rgba(50, 172, 45, 0.97)" 771 | ], 772 | "datasource": "Prometheus", 773 | "editable": true, 774 | "error": false, 775 | "fieldConfig": { 776 | "defaults": { 777 | "custom": {} 778 | }, 779 | "overrides": [] 780 | }, 781 | "format": "ops", 782 | "gauge": { 783 | "maxValue": 100, 784 | "minValue": 0, 785 | "show": false, 786 | "thresholdLabels": false, 787 | 
"thresholdMarkers": true 788 | }, 789 | "gridPos": { 790 | "h": 4, 791 | "w": 2, 792 | "x": 14, 793 | "y": 2 794 | }, 795 | "id": 209, 796 | "interval": null, 797 | "isNew": true, 798 | "links": [ 799 | { 800 | "title": "Isilon Data Insights Cluster Detail", 801 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 802 | } 803 | ], 804 | "mappingType": 1, 805 | "mappingTypes": [ 806 | { 807 | "name": "value to text", 808 | "value": 1 809 | }, 810 | { 811 | "name": "range to text", 812 | "value": 2 813 | } 814 | ], 815 | "maxDataPoints": 100, 816 | "nullPointMode": "connected", 817 | "nullText": null, 818 | "postfix": "", 819 | "postfixFontSize": "50%", 820 | "prefix": "", 821 | "prefixFontSize": "50%", 822 | "rangeMaps": [ 823 | { 824 | "from": "null", 825 | "text": "N/A", 826 | "to": "null" 827 | } 828 | ], 829 | "repeatIteration": 1476718550844, 830 | "scopedVars": { 831 | "cluster": { 832 | "selected": true, 833 | "text": "All", 834 | "value": "$__all" 835 | } 836 | }, 837 | "sparkline": { 838 | "fillColor": "rgba(31, 118, 189, 0.18)", 839 | "full": true, 840 | "lineColor": "rgb(31, 120, 193)", 841 | "show": true 842 | }, 843 | "tableColumn": "", 844 | "targets": [ 845 | { 846 | "expr": "isilon_cluster_protostats_nfs_total_op_rate{hostname=~\"$cluster\"}", 847 | "interval": "", 848 | "legendFormat": "", 849 | "refId": "A" 850 | } 851 | ], 852 | "thresholds": "", 853 | "title": "NFSv3 Op/s", 854 | "type": "singlestat", 855 | "valueFontSize": "80%", 856 | "valueMaps": [ 857 | { 858 | "op": "=", 859 | "text": "N/A", 860 | "value": "null" 861 | } 862 | ], 863 | "valueName": "current" 864 | }, 865 | { 866 | "cacheTimeout": null, 867 | "colorBackground": true, 868 | "colorValue": false, 869 | "colors": [ 870 | "rgba(50, 172, 45, 0.97)", 871 | "rgba(237, 129, 40, 0.89)", 872 | "rgba(245, 54, 54, 0.9)" 873 | ], 874 | "datasource": "Prometheus", 875 | "editable": true, 876 | "error": false, 877 | "fieldConfig": { 878 | "defaults": { 879 | "custom": {} 880 | }, 881 | "overrides": [] 882 | }, 883 | "format": "ms", 884 | "gauge": { 885 | "maxValue": 100, 886 | "minValue": 0, 887 | "show": false, 888 | "thresholdLabels": false, 889 | "thresholdMarkers": false 890 | }, 891 | "gridPos": { 892 | "h": 4, 893 | "w": 2, 894 | "x": 16, 895 | "y": 2 896 | }, 897 | "id": 210, 898 | "interval": null, 899 | "isNew": true, 900 | "links": [ 901 | { 902 | "title": "Isilon Data Insights Cluster Detail", 903 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 904 | } 905 | ], 906 | "mappingType": 1, 907 | "mappingTypes": [ 908 | { 909 | "name": "value to text", 910 | "value": 1 911 | }, 912 | { 913 | "name": "range to text", 914 | "value": 2 915 | } 916 | ], 917 | "maxDataPoints": 100, 918 | "nullPointMode": "connected", 919 | "nullText": null, 920 | "postfix": "", 921 | "postfixFontSize": "50%", 922 | "prefix": "", 923 | "prefixFontSize": "50%", 924 | "rangeMaps": [ 925 | { 926 | "from": "null", 927 | "text": "N/A", 928 | "to": "null" 929 | } 930 | ], 931 | "repeatIteration": 1476718550844, 932 | "scopedVars": { 933 | "cluster": { 934 | "selected": true, 935 | "text": "All", 936 | "value": "$__all" 937 | } 938 | }, 939 | "sparkline": { 940 | "fillColor": "rgba(31, 118, 189, 0.18)", 941 | "full": true, 942 | "lineColor": "rgb(31, 120, 193)", 943 | "show": true 944 | }, 945 | "tableColumn": "", 946 | "targets": [ 947 | { 948 | "expr": "isilon_cluster_protostats_nfs_total_time_avg{hostname=~\"$cluster\"}/1000", 949 | "interval": "", 
950 | "legendFormat": "", 951 | "refId": "A" 952 | } 953 | ], 954 | "thresholds": "10,25", 955 | "title": "NFSv3 Latency", 956 | "type": "singlestat", 957 | "valueFontSize": "80%", 958 | "valueMaps": [ 959 | { 960 | "op": "=", 961 | "text": "N/A", 962 | "value": "null" 963 | } 964 | ], 965 | "valueName": "current" 966 | }, 967 | { 968 | "cacheTimeout": null, 969 | "colorBackground": false, 970 | "colorValue": false, 971 | "colors": [ 972 | "rgba(245, 54, 54, 0.9)", 973 | "rgba(237, 129, 40, 0.89)", 974 | "rgba(50, 172, 45, 0.97)" 975 | ], 976 | "datasource": "Prometheus", 977 | "editable": true, 978 | "error": false, 979 | "fieldConfig": { 980 | "defaults": { 981 | "custom": {} 982 | }, 983 | "overrides": [] 984 | }, 985 | "format": "Bps", 986 | "gauge": { 987 | "maxValue": 100, 988 | "minValue": 0, 989 | "show": false, 990 | "thresholdLabels": false, 991 | "thresholdMarkers": true 992 | }, 993 | "gridPos": { 994 | "h": 4, 995 | "w": 2, 996 | "x": 18, 997 | "y": 2 998 | }, 999 | "id": 211, 1000 | "interval": null, 1001 | "isNew": true, 1002 | "links": [ 1003 | { 1004 | "title": "Isilon Data Insights Cluster Detail", 1005 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 1006 | } 1007 | ], 1008 | "mappingType": 1, 1009 | "mappingTypes": [ 1010 | { 1011 | "name": "value to text", 1012 | "value": 1 1013 | }, 1014 | { 1015 | "name": "range to text", 1016 | "value": 2 1017 | } 1018 | ], 1019 | "maxDataPoints": 100, 1020 | "nullPointMode": "connected", 1021 | "nullText": null, 1022 | "postfix": "", 1023 | "postfixFontSize": "50%", 1024 | "prefix": "", 1025 | "prefixFontSize": "50%", 1026 | "rangeMaps": [ 1027 | { 1028 | "from": "null", 1029 | "text": "N/A", 1030 | "to": "null" 1031 | } 1032 | ], 1033 | "repeatIteration": 1476718550844, 1034 | "scopedVars": { 1035 | "cluster": { 1036 | "selected": true, 1037 | "text": "All", 1038 | "value": "$__all" 1039 | } 1040 | }, 1041 | "sparkline": { 1042 | "fillColor": "rgba(31, 118, 189, 0.18)", 1043 | "full": true, 1044 | "lineColor": "rgb(31, 120, 193)", 1045 | "show": true 1046 | }, 1047 | "tableColumn": "", 1048 | "targets": [ 1049 | { 1050 | "expr": "isilon_cluster_protostats_smb2_total_in_rate{hostname=~\"$cluster\"}+isilon_cluster_protostats_smb2_total_out_rate{hostname=~\"$cluster\"}", 1051 | "interval": "", 1052 | "legendFormat": "", 1053 | "refId": "A" 1054 | } 1055 | ], 1056 | "thresholds": "", 1057 | "title": "SMB2 Throughput", 1058 | "type": "singlestat", 1059 | "valueFontSize": "80%", 1060 | "valueMaps": [ 1061 | { 1062 | "op": "=", 1063 | "text": "N/A", 1064 | "value": "null" 1065 | } 1066 | ], 1067 | "valueName": "current" 1068 | }, 1069 | { 1070 | "cacheTimeout": null, 1071 | "colorBackground": false, 1072 | "colorValue": false, 1073 | "colors": [ 1074 | "rgba(245, 54, 54, 0.9)", 1075 | "rgba(237, 129, 40, 0.89)", 1076 | "rgba(50, 172, 45, 0.97)" 1077 | ], 1078 | "datasource": "Prometheus", 1079 | "editable": true, 1080 | "error": false, 1081 | "fieldConfig": { 1082 | "defaults": { 1083 | "custom": {} 1084 | }, 1085 | "overrides": [] 1086 | }, 1087 | "format": "ops", 1088 | "gauge": { 1089 | "maxValue": 100, 1090 | "minValue": 0, 1091 | "show": false, 1092 | "thresholdLabels": false, 1093 | "thresholdMarkers": true 1094 | }, 1095 | "gridPos": { 1096 | "h": 4, 1097 | "w": 2, 1098 | "x": 20, 1099 | "y": 2 1100 | }, 1101 | "id": 212, 1102 | "interval": null, 1103 | "isNew": true, 1104 | "links": [ 1105 | { 1106 | "title": "Isilon Data Insights Cluster Detail", 1107 | "url": 
"dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 1108 | } 1109 | ], 1110 | "mappingType": 1, 1111 | "mappingTypes": [ 1112 | { 1113 | "name": "value to text", 1114 | "value": 1 1115 | }, 1116 | { 1117 | "name": "range to text", 1118 | "value": 2 1119 | } 1120 | ], 1121 | "maxDataPoints": 100, 1122 | "nullPointMode": "connected", 1123 | "nullText": null, 1124 | "postfix": "", 1125 | "postfixFontSize": "50%", 1126 | "prefix": "", 1127 | "prefixFontSize": "50%", 1128 | "rangeMaps": [ 1129 | { 1130 | "from": "null", 1131 | "text": "N/A", 1132 | "to": "null" 1133 | } 1134 | ], 1135 | "repeatIteration": 1476718550844, 1136 | "scopedVars": { 1137 | "cluster": { 1138 | "selected": true, 1139 | "text": "All", 1140 | "value": "$__all" 1141 | } 1142 | }, 1143 | "sparkline": { 1144 | "fillColor": "rgba(31, 118, 189, 0.18)", 1145 | "full": true, 1146 | "lineColor": "rgb(31, 120, 193)", 1147 | "show": true 1148 | }, 1149 | "tableColumn": "", 1150 | "targets": [ 1151 | { 1152 | "expr": "isilon_cluster_protostats_smb2_total_op_rate{hostname=~\"$cluster\"}", 1153 | "interval": "", 1154 | "legendFormat": "", 1155 | "refId": "A" 1156 | } 1157 | ], 1158 | "thresholds": "", 1159 | "title": "SMB2 Op/s", 1160 | "type": "singlestat", 1161 | "valueFontSize": "80%", 1162 | "valueMaps": [ 1163 | { 1164 | "op": "=", 1165 | "text": "N/A", 1166 | "value": "null" 1167 | } 1168 | ], 1169 | "valueName": "current" 1170 | }, 1171 | { 1172 | "cacheTimeout": null, 1173 | "colorBackground": true, 1174 | "colorValue": false, 1175 | "colors": [ 1176 | "rgba(50, 172, 45, 0.97)", 1177 | "rgba(237, 129, 40, 0.89)", 1178 | "rgba(245, 54, 54, 0.9)" 1179 | ], 1180 | "datasource": "Prometheus", 1181 | "editable": true, 1182 | "error": false, 1183 | "fieldConfig": { 1184 | "defaults": { 1185 | "custom": {} 1186 | }, 1187 | "overrides": [] 1188 | }, 1189 | "format": "ms", 1190 | "gauge": { 1191 | "maxValue": 100, 1192 | "minValue": 0, 1193 | "show": false, 1194 | "thresholdLabels": false, 1195 | "thresholdMarkers": false 1196 | }, 1197 | "gridPos": { 1198 | "h": 4, 1199 | "w": 2, 1200 | "x": 22, 1201 | "y": 2 1202 | }, 1203 | "id": 213, 1204 | "interval": null, 1205 | "isNew": true, 1206 | "links": [ 1207 | { 1208 | "title": "Isilon Data Insights Cluster Detail", 1209 | "url": "dashboard/db/isilon-data-insights-cluster-detail?$__url_time_range&$__all_variables" 1210 | } 1211 | ], 1212 | "mappingType": 1, 1213 | "mappingTypes": [ 1214 | { 1215 | "name": "value to text", 1216 | "value": 1 1217 | }, 1218 | { 1219 | "name": "range to text", 1220 | "value": 2 1221 | } 1222 | ], 1223 | "maxDataPoints": 100, 1224 | "nullPointMode": "connected", 1225 | "nullText": null, 1226 | "postfix": "", 1227 | "postfixFontSize": "50%", 1228 | "prefix": "", 1229 | "prefixFontSize": "50%", 1230 | "rangeMaps": [ 1231 | { 1232 | "from": "null", 1233 | "text": "N/A", 1234 | "to": "null" 1235 | } 1236 | ], 1237 | "repeatIteration": 1476718550844, 1238 | "scopedVars": { 1239 | "cluster": { 1240 | "selected": true, 1241 | "text": "All", 1242 | "value": "$__all" 1243 | } 1244 | }, 1245 | "sparkline": { 1246 | "fillColor": "rgba(31, 118, 189, 0.18)", 1247 | "full": true, 1248 | "lineColor": "rgb(31, 120, 193)", 1249 | "show": true 1250 | }, 1251 | "tableColumn": "", 1252 | "targets": [ 1253 | { 1254 | "expr": "isilon_cluster_protostats_smb2_total_time_avg{hostname=~\"$cluster\"}/1000", 1255 | "interval": "", 1256 | "legendFormat": "", 1257 | "refId": "A" 1258 | } 1259 | ], 1260 | "thresholds": "10,25", 1261 | "title": "SMB2 
Latency", 1262 | "type": "singlestat", 1263 | "valueFontSize": "80%", 1264 | "valueMaps": [ 1265 | { 1266 | "op": "=", 1267 | "text": "N/A", 1268 | "value": "null" 1269 | } 1270 | ], 1271 | "valueName": "current" 1272 | } 1273 | ], 1274 | "refresh": "5m", 1275 | "schemaVersion": 26, 1276 | "style": "dark", 1277 | "tags": [], 1278 | "templating": { 1279 | "list": [ 1280 | { 1281 | "allFormat": "regex values", 1282 | "allValue": null, 1283 | "current": { 1284 | "selected": false, 1285 | "text": [ 1286 | "All" 1287 | ], 1288 | "value": [ 1289 | "$__all" 1290 | ] 1291 | }, 1292 | "datasource": "Prometheus", 1293 | "definition": "label_values({job=\"isilon\"}, hostname)", 1294 | "hide": 0, 1295 | "includeAll": true, 1296 | "label": "Cluster", 1297 | "multi": true, 1298 | "multiFormat": "glob", 1299 | "name": "cluster", 1300 | "options": [], 1301 | "query": "label_values({job=\"isilon\"}, hostname)", 1302 | "refresh": 1, 1303 | "regex": "", 1304 | "skipUrlSync": false, 1305 | "sort": 0, 1306 | "tagValuesQuery": "", 1307 | "tags": [], 1308 | "tagsQuery": "", 1309 | "type": "query", 1310 | "useTags": false 1311 | } 1312 | ] 1313 | }, 1314 | "time": { 1315 | "from": "now-15m", 1316 | "to": "now" 1317 | }, 1318 | "timepicker": { 1319 | "now": true, 1320 | "refresh_intervals": [ 1321 | "5s", 1322 | "10s", 1323 | "30s", 1324 | "1m", 1325 | "5m", 1326 | "15m", 1327 | "30m", 1328 | "1h", 1329 | "2h", 1330 | "1d" 1331 | ], 1332 | "time_options": [ 1333 | "5m", 1334 | "15m", 1335 | "1h", 1336 | "6h", 1337 | "12h", 1338 | "24h", 1339 | "2d", 1340 | "7d", 1341 | "30d" 1342 | ] 1343 | }, 1344 | "timezone": "browser", 1345 | "title": "Isilon Data Insights Cluster Summary", 1346 | "uid": "Xef-mgFMk", 1347 | "version": 2 1348 | } --------------------------------------------------------------------------------