├── .gitignore ├── README.md ├── __init__.py ├── common.py ├── dashboards ├── HDFS-DataNode-1588759242237.json ├── HDFS-NameNode-1588759254552.json ├── HDFS-NameNode.png ├── YARN-NodeManager-1588759264873.json ├── YARN-ResourceManager-1588759276429.json └── hadoop_monitoring.mp4 ├── examples ├── DataNode.json ├── JobHistoryServer.json ├── NameNode.json ├── NodeManager.json ├── ResouceManager.json └── all_metrics.txt ├── hadoop_jmx_exporter.py ├── hdfs_datanode.py ├── hdfs_journalnode.py ├── hdfs_namenode.py ├── metrics ├── common │ ├── JvmMetrics.json │ ├── MetricsSystem.json │ ├── OperatingSystem.json │ ├── RpcActivity.json │ ├── RpcDetailedActivity.json │ ├── Runtime.json │ └── UgiMetrics.json ├── datanode │ ├── DataNodeActivity.json │ ├── DataNodeInfo.json │ └── FSDatasetState.json ├── journalnode │ └── JournalNode.json ├── namenode │ ├── FSNamesystem.json │ ├── FSNamesystemState.json │ ├── NameNodeActivity.json │ ├── NameNodeInfo.json │ ├── RetryCache.json │ └── StartupProgress.json ├── nodemanager │ ├── NodeManagerMetrics.json │ └── ShuffleMetrics.json └── resourcemanager │ ├── ClusterMetrics.json │ ├── QueueMetrics.json │ └── RMNMInfo.json ├── requirements.txt ├── scraper.py ├── utils.py ├── yarn_nodemanager.py └── yarn_resourcemanager.py /.gitignore: -------------------------------------------------------------------------------- 1 | ^# 2 | *.pyc 3 | *.log 4 | .vscode -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hadoop_jmx_exporter 2 | 3 | Hadoop HDFS & YARN jmx metrics prometheus exporter. 4 | 5 | All metrics can be found [here](./examples/all_metrics.txt). 6 | 7 | Grafana dashboards json file and a short video demo in [dashboards](./dashboards) directory. 8 | 9 | Tested on CDH 5.14.2. 
10 | 11 | ![HDFS-Namenode](./dashboards/HDFS-NameNode.png) 12 | 13 | # Run 14 | 15 | ``` bash 16 | ➜ hadoop_jmx_exporter git:(master) ✗ pip2 install -r requirements.txt 17 | 18 | ➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py -h 19 | usage: hadoop_jmx_exporter.py [-h] -cluster cluster_name 20 | [-queue yarn_queue_regexp] 21 | [-nns [namenode_jmx_url [namenode_jmx_url ...]]] 22 | [-rms [resourcemanager_jmx_url [resourcemanager_jmx_url ...]]] 23 | [-jns [journalnode_jmx_url [journalnode_jmx_url ...]]] 24 | [-host host] [-port port] 25 | 26 | hadoop jmx metric prometheus exporter 27 | 28 | optional arguments: 29 | -h, --help show this help message and exit 30 | -cluster cluster_name 31 | Hadoop cluster name (maybe HA name) 32 | -queue yarn_queue_regexp 33 | Regular expression of queue name. default: root.* 34 | -nns [namenode_jmx_url [namenode_jmx_url ...]] 35 | Hadoop hdfs namenode jmx metrics URL. 36 | -rms [resourcemanager_jmx_url [resourcemanager_jmx_url ...]] 37 | Hadoop resourcemanager metrics jmx URL. 38 | -jns [journalnode_jmx_url [journalnode_jmx_url ...]] 39 | Hadoop journalnode jmx metrics URL. 40 | -host host Listen on this address. default: 0.0.0.0 41 | -port port Listen to this port. default: 6688 42 | ➜ hadoop_exporter git:(master) ✗ 43 | 44 | ➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py -cluster yh-cdh -nns http://10.193.40.10:50070/jmx http://10.193.40.3:50070/jmx -rms http://yh-shhd-cdh04:8088/jmx http://yh-shhd-cdh01:8088/jmx 45 | Listen at 0.0.0.0:6688 46 | ``` 47 | 48 | Open your browser to view metrics: `http://127.0.0.1:6688/metrics`. 49 | 50 | # Reference 51 | 52 | 1. https://github.com/cauwulixuan/hadoop_exporter 53 | 2. http://hadoop.apache.org/docs/r2.7.3/hadoop-project-dist/hadoop-common/Metrics.html#namenode 54 | 3. https://docs.cloudera.com/HDPDocuments/Ambari-2.7.5.0/using-ambari-core-services/content/amb_hdfs_users.html 55 | 4. 
https://www.datadoghq.com/blog/collecting-hadoop-metrics/ 56 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opsnull/hadoop_jmx_exporter/939a27889134d4decef7cc7cb067cb1eba9e4d10/__init__.py -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from prometheus_client.core import GaugeMetricFamily 6 | 7 | import utils 8 | 9 | 10 | logger = utils.get_module_logger(__name__) 11 | 12 | 13 | class MetricCollector(object): 14 | def __init__(self, cluster, component, service): 15 | self.cluster = cluster 16 | self.component = component 17 | self.prefix = 'hadoop_{0}_{1}'.format(component, service) 18 | 19 | self.file_list = utils.get_file_list(service) 20 | self.metrics = {} 21 | for i in range(len(self.file_list)): 22 | self.metrics.setdefault(self.file_list[i], utils.read_json_file(service, self.file_list[i])) 23 | 24 | common_file = utils.get_file_list("common") 25 | self.merge_list = self.file_list + common_file 26 | 27 | def collect(self): 28 | pass 29 | 30 | def _setup_metrics_labels(self): 31 | pass 32 | 33 | def _get_metrics(self, metrics): 34 | pass 35 | 36 | 37 | class CommonMetricCollector(): 38 | def __init__(self, cluster, component, service): 39 | self.cluster = cluster 40 | self.componet = component 41 | self.service = service 42 | self.prefix = 'hadoop_{0}_{1}'.format(component, service) 43 | self.common_metrics = {} 44 | self.tmp_metrics = {} 45 | file_list = utils.get_file_list("common") 46 | for i in range(len(file_list)): 47 | self.common_metrics.setdefault(file_list[i], {}) 48 | self.tmp_metrics.setdefault(file_list[i], utils.read_json_file("common", file_list[i])) 49 | 50 | def setup_labels(self, 
beans): 51 | for i in range(len(beans)): 52 | if 'name=JvmMetrics' in beans[i]['name']: 53 | self.setup_jvm_labels() 54 | if 'OperatingSystem' in beans[i]['name']: 55 | self.setup_os_labels() 56 | if 'RpcActivity' in beans[i]['name']: 57 | self.setup_rpc_labels() 58 | if 'RpcDetailedActivity' in beans[i]['name']: 59 | self.setup_rpc_detailed_labels() 60 | if 'UgiMetrics' in beans[i]['name']: 61 | self.setup_ugi_labels() 62 | if 'MetricsSystem' in beans[i]['name'] and "sub=Stats" in beans[i]['name']: 63 | self.setup_metric_system_labels() 64 | if 'Runtime' in beans[i]['name']: 65 | self.setup_runtime_labels() 66 | 67 | def get_metrics(self, beans, target): 68 | self.target = target 69 | for i in range(len(beans)): 70 | if 'name=JvmMetrics' in beans[i]['name']: 71 | self.get_jvm_metrics(beans[i]) 72 | if 'OperatingSystem' in beans[i]['name']: 73 | self.get_os_metrics(beans[i]) 74 | if 'RpcActivity' in beans[i]['name']: 75 | self.get_rpc_metrics(beans[i]) 76 | if 'RpcDetailedActivity' in beans[i]['name']: 77 | self.get_rpc_detailed_metrics(beans[i]) 78 | if 'UgiMetrics' in beans[i]['name']: 79 | self.get_ugi_metrics(beans[i]) 80 | if 'MetricsSystem' in beans[i]['name'] and "sub=Stats" in beans[i]['name']: 81 | self.get_metric_system_metrics(beans[i]) 82 | if 'Runtime' in beans[i]['name']: 83 | self.get_runtime_metrics(beans[i]) 84 | return self.common_metrics 85 | 86 | def setup_jvm_labels(self): 87 | for metric in self.tmp_metrics["JvmMetrics"]: 88 | snake_case = "_".join(["jvm", re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()]) 89 | if 'Mem' in metric: 90 | name = "".join([snake_case, "ebibytes"]) 91 | label = ["cluster", "mode"] 92 | if "Used" in metric: 93 | key = "jvm_mem_used_mebibytes" 94 | descriptions = "Current memory used in mebibytes." 95 | elif "Committed" in metric: 96 | key = "jvm_mem_committed_mebibytes" 97 | descriptions = "Current memory committed in mebibytes." 
98 | elif "Max" in metric: 99 | key = "jvm_mem_max_mebibytes" 100 | descriptions = "Current max memory in mebibytes." 101 | else: 102 | key = name 103 | label = ["cluster"] 104 | descriptions = self.tmp_metrics['JvmMetrics'][metric] 105 | elif 'Gc' in metric: 106 | label = ["cluster", "type"] 107 | if "GcCount" in metric: 108 | key = "jvm_gc_count" 109 | descriptions = "GC count of each type GC." 110 | elif "GcTimeMillis" in metric: 111 | key = "jvm_gc_time_milliseconds" 112 | descriptions = "Each type GC time in milliseconds." 113 | elif "ThresholdExceeded" in metric: 114 | key = "jvm_gc_exceeded_threshold_total" 115 | descriptions = "Number of times that the GC threshold is exceeded." 116 | else: 117 | key = snake_case 118 | label = ["cluster"] 119 | descriptions = self.tmp_metrics['JvmMetrics'][metric] 120 | elif 'Threads' in metric: 121 | label = ["cluster", "state"] 122 | key = "jvm_threads_state_total" 123 | descriptions = "Current number of different threads." 124 | elif 'Log' in metric: 125 | label = ["cluster", "level"] 126 | key = "jvm_log_level_total" 127 | descriptions = "Total number of each level logs." 
128 | else: 129 | label = ["cluster"] 130 | key = snake_case 131 | descriptions = self.tmp_metrics['JvmMetrics'][metric] 132 | label.append("_target") 133 | self.common_metrics['JvmMetrics'][key] = GaugeMetricFamily("_".join([self.prefix, key]), descriptions, labels=label) 134 | 135 | def setup_os_labels(self): 136 | for metric in self.tmp_metrics['OperatingSystem']: 137 | label = ["cluster", "_target"] 138 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 139 | name = "_".join([self.prefix, snake_case]) 140 | self.common_metrics['OperatingSystem'][metric] = GaugeMetricFamily(name, self.tmp_metrics['OperatingSystem'][metric], labels=label) 141 | 142 | def setup_rpc_labels(self): 143 | num_rpc_flag, avg_rpc_flag = 1, 1 144 | for metric in self.tmp_metrics["RpcActivity"]: 145 | snake_case = "_".join(["rpc", re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()]) 146 | if 'Rpc' in metric: 147 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 148 | label = ["cluster", "tag"] 149 | if "NumOps" in metric: 150 | if num_rpc_flag: 151 | key = "MethodNumOps" 152 | label.extend(["method", "_target"]) 153 | name = "_".join([self.prefix, "rpc_method_called_total"]) 154 | description = "Total number of the times the method is called." 155 | self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, description, labels=label) 156 | num_rpc_flag = 0 157 | else: 158 | continue 159 | elif "AvgTime" in metric: 160 | if avg_rpc_flag: 161 | key = "MethodAvgTime" 162 | label.extend(["method", "_target"]) 163 | name = "_".join([self.prefix, "rpc_method_avg_time_milliseconds"]) 164 | descrption = "Average turn around time of the method in milliseconds." 
165 | self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, descrption, labels=label) 166 | avg_rpc_flag = 0 167 | else: 168 | continue 169 | else: 170 | key = metric 171 | label.append("_target") 172 | name = "_".join([self.prefix, snake_case]) 173 | self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, self.tmp_metrics['RpcActivity'][metric], labels=label) 174 | 175 | def setup_rpc_detailed_labels(self): 176 | for metric in self.tmp_metrics['RpcDetailedActivity']: 177 | label = ["cluster", "tag", "method", "_target"] 178 | if "NumOps" in metric: 179 | key = "NumOps" 180 | name = "_".join([self.prefix, 'rpc_detailed_method_called_total']) 181 | elif "AvgTime" in metric: 182 | key = "AvgTime" 183 | name = "_".join([self.prefix, 'rpc_detailed_method_avg_time_milliseconds']) 184 | else: 185 | continue 186 | self.common_metrics['RpcDetailedActivity'][key] = GaugeMetricFamily(name, self.tmp_metrics['RpcDetailedActivity'][metric], labels=label) 187 | return self.common_metrics 188 | 189 | def setup_ugi_labels(self): 190 | ugi_num_flag, ugi_avg_flag = 1, 1 191 | for metric in self.tmp_metrics['UgiMetrics']: 192 | label = ["cluster"] 193 | if 'NumOps' in metric: 194 | if ugi_num_flag: 195 | key = 'NumOps' 196 | label.extend(["method", "state", "_target"]) 197 | ugi_num_flag = 0 198 | name = "_".join([self.prefix, 'ugi_method_called_total']) 199 | description = "Total number of the times the method is called." 200 | self.common_metrics['UgiMetrics'][key] = GaugeMetricFamily(name, description, labels=label) 201 | else: 202 | continue 203 | elif 'AvgTime' in metric: 204 | if ugi_avg_flag: 205 | key = 'AvgTime' 206 | label.extend(["method", "state", "_target"]) 207 | ugi_avg_flag = 0 208 | name = "_".join([self.prefix, 'ugi_method_avg_time_milliseconds']) 209 | description = "Average turn around time of the method in milliseconds." 
210 | self.common_metrics['UgiMetrics'][key] = GaugeMetricFamily(name, description, labels=label) 211 | else: 212 | continue 213 | else: 214 | label.append("_target") 215 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 216 | name = "_".join([self.prefix, 'ugi', snake_case]) 217 | self.common_metrics['UgiMetrics'][metric] = GaugeMetricFamily(name, self.tmp_metrics['UgiMetrics'][metric], labels=label) 218 | 219 | def setup_metric_system_labels(self): 220 | metric_num_flag, metric_avg_flag = 1, 1 221 | for metric in self.tmp_metrics['MetricsSystem']: 222 | label = ["cluster"] 223 | if 'NumOps' in metric: 224 | if metric_num_flag: 225 | key = 'NumOps' 226 | label.extend(["oper", "_target"]) 227 | metric_num_flag = 0 228 | name = "_".join([self.prefix, 'metricssystem_operations_total']) 229 | self.common_metrics['MetricsSystem'][key] = GaugeMetricFamily(name, "Total number of operations", labels=label) 230 | else: 231 | continue 232 | elif 'AvgTime' in metric: 233 | if metric_avg_flag: 234 | key = 'AvgTime' 235 | label.extend(["oper", "_target"]) 236 | metric_avg_flag = 0 237 | name = "_".join([self.prefix, 'metricssystem_method_avg_time_milliseconds']) 238 | description = "Average turn around time of the operations in milliseconds." 
239 | self.common_metrics['MetricsSystem'][key] = GaugeMetricFamily(name, description, labels=label) 240 | else: 241 | continue 242 | else: 243 | label.append("_target") 244 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 245 | name = "_".join([self.prefix, 'metricssystem', snake_case]) 246 | self.common_metrics['MetricsSystem'][metric] = GaugeMetricFamily(name, self.tmp_metrics['MetricsSystem'][metric], labels=label) 247 | 248 | def setup_runtime_labels(self): 249 | for metric in self.tmp_metrics['Runtime']: 250 | label = ["cluster", "host", "_target"] 251 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 252 | name = "_".join([self.prefix, snake_case, "milliseconds"]) 253 | self.common_metrics['Runtime'][metric] = GaugeMetricFamily(name, self.tmp_metrics['Runtime'][metric], labels=label) 254 | 255 | def get_jvm_metrics(self, bean): 256 | for metric in self.tmp_metrics['JvmMetrics']: 257 | name = "_".join(["jvm", re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()]) 258 | if 'Mem' in metric: 259 | if "Used" in metric: 260 | key = "jvm_mem_used_mebibytes" 261 | mode = metric.split("Used")[0].split("Mem")[1] 262 | label = [self.cluster, mode] 263 | elif "Committed" in metric: 264 | key = "jvm_mem_committed_mebibytes" 265 | mode = metric.split("Committed")[0].split("Mem")[1] 266 | label = [self.cluster, mode] 267 | elif "Max" in metric: 268 | key = "jvm_mem_max_mebibytes" 269 | if "Heap" in metric: 270 | mode = metric.split("Max")[0].split("Mem")[1] 271 | else: 272 | mode = "max" 273 | label = [self.cluster, mode] 274 | else: 275 | key = "".join([name, 'ebibytes']) 276 | label = [self.cluster] 277 | elif 'Gc' in metric: 278 | if "GcCount" in metric: 279 | key = "jvm_gc_count" 280 | if "GcCount" == metric: 281 | typo = "total" 282 | else: 283 | typo = metric.split("GcCount")[1] 284 | label = [self.cluster, typo] 285 | elif "GcTimeMillis" in metric: 286 | key = "jvm_gc_time_milliseconds" 287 | if "GcTimeMillis" == metric: 288 | 
typo = "total" 289 | else: 290 | typo = metric.split("GcTimeMillis")[1] 291 | label = [self.cluster, typo] 292 | elif "ThresholdExceeded" in metric: 293 | key = "jvm_gc_exceeded_threshold_total" 294 | typo = metric.split("ThresholdExceeded")[ 295 | 0].split("GcNum")[1] 296 | label = [self.cluster, typo] 297 | else: 298 | key = name 299 | label = [self.cluster] 300 | elif 'Threads' in metric: 301 | key = "jvm_threads_state_total" 302 | state = metric.split("Threads")[1] 303 | label = [self.cluster, state] 304 | elif 'Log' in metric: 305 | key = "jvm_log_level_total" 306 | level = metric.split("Log")[1] 307 | label = [self.cluster, level] 308 | else: 309 | key = name 310 | label = [self.cluster] 311 | label.append(self.target) 312 | self.common_metrics['JvmMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) 313 | 314 | def get_os_metrics(self, bean): 315 | for metric in self.tmp_metrics['OperatingSystem']: 316 | label = [self.cluster] 317 | label.append(self.target) 318 | self.common_metrics['OperatingSystem'][metric].add_metric(label, bean[metric] if metric in bean else 0) 319 | 320 | def get_rpc_metrics(self, bean): 321 | rpc_tag = bean['tag.port'] 322 | for metric in self.tmp_metrics['RpcActivity']: 323 | if "NumOps" in metric: 324 | method = metric.split('NumOps')[0] 325 | label = [self.cluster, rpc_tag, method] 326 | key = "MethodNumOps" 327 | elif "AvgTime" in metric: 328 | method = metric.split('AvgTime')[0] 329 | label = [self.cluster, rpc_tag, method] 330 | key = "MethodAvgTime" 331 | else: 332 | label = [self.cluster, rpc_tag] 333 | key = metric 334 | label.append(self.target) 335 | self.common_metrics['RpcActivity'][key].add_metric(label, bean[metric] if metric in bean else 0) 336 | 337 | def get_rpc_detailed_metrics(self, bean): 338 | detail_tag = bean['tag.port'] 339 | for metric in bean: 340 | if metric[0].isupper(): 341 | if "NumOps" in metric: 342 | key = "NumOps" 343 | method = metric.split('NumOps')[0] 344 | elif "AvgTime" in 
metric: 345 | key = "AvgTime" 346 | method = metric.split("AvgTime")[0] 347 | else: 348 | continue 349 | label = [self.cluster, detail_tag, method, self.target] 350 | self.common_metrics['RpcDetailedActivity'][key].add_metric(label, bean[metric]) 351 | 352 | def get_ugi_metrics(self, bean): 353 | for metric in self.tmp_metrics['UgiMetrics']: 354 | if 'NumOps' in metric: 355 | key = 'NumOps' 356 | if 'Login' in metric: 357 | method = 'Login' 358 | state = metric.split('Login')[1].split('NumOps')[0] 359 | label = [self.cluster, method, state] 360 | else: 361 | method = metric.split('NumOps')[0] 362 | label = [self.cluster, method, "-"] 363 | elif 'AvgTime' in metric: 364 | key = 'AvgTime' 365 | if 'Login' in metric: 366 | method = 'Login' 367 | state = metric.split('Login')[1].split('AvgTime')[0] 368 | label = [self.cluster, method, state] 369 | else: 370 | method = metric.split('AvgTime')[0] 371 | label = [self.cluster, method, "-"] 372 | else: 373 | key = metric 374 | label = [self.cluster] 375 | label.append(self.target) 376 | self.common_metrics['UgiMetrics'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) 377 | 378 | def get_metric_system_metrics(self, bean): 379 | for metric in self.tmp_metrics['MetricsSystem']: 380 | if 'NumOps' in metric: 381 | key = 'NumOps' 382 | oper = metric.split('NumOps')[0] 383 | label = [self.cluster, oper] 384 | elif 'AvgTime' in metric: 385 | key = 'AvgTime' 386 | oper = metric.split('AvgTime')[0] 387 | label = [self.cluster, oper] 388 | else: 389 | key = metric 390 | label = [self.cluster] 391 | label.append(self.target) 392 | self.common_metrics['MetricsSystem'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) 393 | 394 | def get_runtime_metrics(self, bean): 395 | for metric in self.tmp_metrics['Runtime']: 396 | label = [self.cluster, bean['Name'].split("@")[1], self.target] 397 | self.common_metrics['Runtime'][metric].add_metric(label, bean[metric] if metric in bean and 
bean[metric] else 0) 398 | -------------------------------------------------------------------------------- /dashboards/HDFS-NameNode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opsnull/hadoop_jmx_exporter/939a27889134d4decef7cc7cb067cb1eba9e4d10/dashboards/HDFS-NameNode.png -------------------------------------------------------------------------------- /dashboards/hadoop_monitoring.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opsnull/hadoop_jmx_exporter/939a27889134d4decef7cc7cb067cb1eba9e4d10/dashboards/hadoop_monitoring.mp4 -------------------------------------------------------------------------------- /examples/DataNode.json: -------------------------------------------------------------------------------- 1 | { 2 | "beans": [ 3 | { 4 | "name": "Hadoop:service=DataNode,name=JvmMetrics", 5 | "modelerType": "JvmMetrics", 6 | "tag.Context": "jvm", 7 | "tag.ProcessName": "DataNode", 8 | "tag.SessionId": null, 9 | "tag.Hostname": "yh-shhd-cdh01", 10 | "MemNonHeapUsedM": 78.327324, 11 | "MemNonHeapCommittedM": 79.87109, 12 | "MemNonHeapMaxM": -1, 13 | "MemHeapUsedM": 1307.1587, 14 | "MemHeapCommittedM": 1979.75, 15 | "MemHeapMaxM": 1979.75, 16 | "MemMaxM": 1979.75, 17 | "GcCountParNew": 5222, 18 | "GcTimeMillisParNew": 532221, 19 | "GcCountConcurrentMarkSweep": 92, 20 | "GcTimeMillisConcurrentMarkSweep": 7191, 21 | "GcCount": 5314, 22 | "GcTimeMillis": 539412, 23 | "GcNumWarnThresholdExceeded": 0, 24 | "GcNumInfoThresholdExceeded": 0, 25 | "GcTotalExtraSleepTime": 79593, 26 | "ThreadsNew": 0, 27 | "ThreadsRunnable": 186, 28 | "ThreadsBlocked": 0, 29 | "ThreadsWaiting": 20, 30 | "ThreadsTimedWaiting": 31, 31 | "ThreadsTerminated": 0, 32 | "LogFatal": 0, 33 | "LogError": 4779, 34 | "LogWarn": 466, 35 | "LogInfo": 4585284 36 | }, 37 | { 38 | "name": 
"Hadoop:service=DataNode,name=FSDatasetState-2d7d9029-dcdc-404a-9d98-cb72ad235493", 39 | "modelerType": "org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl", 40 | "Remaining": 21524598985077, 41 | "DfsUsed": 8583846612190, 42 | "Capacity": 31710403805184, 43 | "LastVolumeFailureDate": 0, 44 | "EstimatedCapacityLostTotal": 0, 45 | "CacheUsed": 0, 46 | "CacheCapacity": 4294967296, 47 | "NumBlocksCached": 0, 48 | "NumBlocksFailedToCache": 0, 49 | "NumBlocksFailedToUncache": 509262, 50 | "NumFailedVolumes": 0, 51 | "FailedStorageLocations": [], 52 | "StorageInfo": "FSDataset{dirpath='[/mnt/disk0/dfs/dn/current, /mnt/disk1/dfs/dn/current, /mnt/disk2/dfs/dn/current, /mnt/disk3/dfs/dn/current]'}" 53 | }, 54 | { 55 | "name": "Hadoop:service=DataNode,name=UgiMetrics", 56 | "modelerType": "UgiMetrics", 57 | "tag.Context": "ugi", 58 | "tag.Hostname": "yh-shhd-cdh01", 59 | "LoginSuccessNumOps": 0, 60 | "LoginSuccessAvgTime": 0, 61 | "LoginFailureNumOps": 0, 62 | "LoginFailureAvgTime": 0, 63 | "GetGroupsNumOps": 0, 64 | "GetGroupsAvgTime": 0, 65 | "RenewalFailuresTotal": 0, 66 | "RenewalFailures": 0 67 | }, 68 | { 69 | "name": "Hadoop:service=DataNode,name=DataNodeInfo", 70 | "modelerType": "org.apache.hadoop.hdfs.server.datanode.DataNode", 71 | "Version": "2.6.0-cdh5.14.2", 72 | "XceiverCount": 26, 73 | "DatanodeNetworkCounts": [ 74 | { 75 | "key": "/10.193.40.4", 76 | "value": [ 77 | { 78 | "key": "networkErrors", 79 | "value": 27 80 | } 81 | ] 82 | }, 83 | { 84 | "key": "/10.193.40.9", 85 | "value": [ 86 | { 87 | "key": "networkErrors", 88 | "value": 35305 89 | } 90 | ] 91 | }, 92 | { 93 | "key": "/10.193.48.9", 94 | "value": [ 95 | { 96 | "key": "networkErrors", 97 | "value": 15 98 | } 99 | ] 100 | }, 101 | { 102 | "key": "/10.193.40.1", 103 | "value": [ 104 | { 105 | "key": "networkErrors", 106 | "value": 2500 107 | } 108 | ] 109 | }, 110 | { 111 | "key": "/10.193.40.3", 112 | "value": [ 113 | { 114 | "key": "networkErrors", 115 | "value": 2545 116 | } 117 
| ] 118 | }, 119 | { 120 | "key": "/10.193.33.1", 121 | "value": [ 122 | { 123 | "key": "networkErrors", 124 | "value": 4 125 | } 126 | ] 127 | }, 128 | { 129 | "key": "/10.193.40.10", 130 | "value": [ 131 | { 132 | "key": "networkErrors", 133 | "value": 2705 134 | } 135 | ] 136 | }, 137 | { 138 | "key": "/10.193.40.5", 139 | "value": [ 140 | { 141 | "key": "networkErrors", 142 | "value": 33 143 | } 144 | ] 145 | }, 146 | { 147 | "key": "/10.193.40.2", 148 | "value": [ 149 | { 150 | "key": "networkErrors", 151 | "value": 3658 152 | } 153 | ] 154 | }, 155 | { 156 | "key": "/10.193.48.7", 157 | "value": [ 158 | { 159 | "key": "networkErrors", 160 | "value": 41 161 | } 162 | ] 163 | } 164 | ], 165 | "RpcPort": "50020", 166 | "HttpPort": null, 167 | "NamenodeAddresses": "{\"yh-shhd-cdh05\":\"BP-1654582017-10.193.40.10-1585051030504\",\"yh-shhd-cdh02\":\"BP-1654582017-10.193.40.10-1585051030504\"}", 168 | "VolumeInfo": "{\"/mnt/disk3/dfs/dn/current\":{\"usedSpace\":2123832329532,\"freeSpace\":5403410074903,\"reservedSpace\":10737418240,\"reservedSpaceForRBW\":263679405},\"/mnt/disk2/dfs/dn/current\":{\"usedSpace\":2141850634176,\"freeSpace\":5385258770496,\"reservedSpace\":10737418240,\"reservedSpaceForRBW\":396679168},\"/mnt/disk0/dfs/dn/current\":{\"usedSpace\":2185856311266,\"freeSpace\":5341126906910,\"reservedSpace\":10737418240,\"reservedSpaceForRBW\":522865664},\"/mnt/disk1/dfs/dn/current\":{\"usedSpace\":2132307337216,\"freeSpace\":5394803232768,\"reservedSpace\":10737418240,\"reservedSpaceForRBW\":395513856}}", 169 | "ClusterId": "cluster7", 170 | "DiskBalancerStatus": "" 171 | }, 172 | { 173 | "name": "Hadoop:service=DataNode,name=DataNodeActivity-yh-shhd-cdh01-50010", 174 | "modelerType": "DataNodeActivity-yh-shhd-cdh01-50010", 175 | "tag.SessionId": null, 176 | "tag.Context": "dfs", 177 | "tag.Hostname": "yh-shhd-cdh01", 178 | "BytesWritten": 3930869762859, 179 | "TotalWriteTime": 5390318, 180 | "BytesRead": 2592902229782, 181 | "TotalReadTime": 12910687, 
182 | "BlocksWritten": 1339342, 183 | "BlocksRead": 14341148, 184 | "BlocksReplicated": 138907, 185 | "BlocksRemoved": 315277, 186 | "BlocksVerified": 1241033, 187 | "BlockVerificationFailures": 0, 188 | "BlocksCached": 0, 189 | "BlocksUncached": 0, 190 | "ReadsFromLocalClient": 5040940, 191 | "ReadsFromRemoteClient": 9300208, 192 | "WritesFromLocalClient": 390614, 193 | "WritesFromRemoteClient": 814364, 194 | "BlocksGetLocalPathInfo": 0, 195 | "RemoteBytesRead": 1544540994665, 196 | "RemoteBytesWritten": 2445837244412, 197 | "RamDiskBlocksWrite": 0, 198 | "RamDiskBlocksWriteFallback": 0, 199 | "RamDiskBytesWrite": 0, 200 | "RamDiskBlocksReadHits": 0, 201 | "RamDiskBlocksEvicted": 0, 202 | "RamDiskBlocksEvictedWithoutRead": 0, 203 | "RamDiskBlocksEvictionWindowMsNumOps": 0, 204 | "RamDiskBlocksEvictionWindowMsAvgTime": 0, 205 | "RamDiskBlocksLazyPersisted": 0, 206 | "RamDiskBlocksDeletedBeforeLazyPersisted": 0, 207 | "RamDiskBytesLazyPersisted": 0, 208 | "RamDiskBlocksLazyPersistWindowMsNumOps": 0, 209 | "RamDiskBlocksLazyPersistWindowMsAvgTime": 0, 210 | "FsyncCount": 0, 211 | "VolumeFailures": 0, 212 | "DatanodeNetworkErrors": 46833, 213 | "ReadBlockOpNumOps": 14341148, 214 | "ReadBlockOpAvgTime": 2.7777777777777777, 215 | "WriteBlockOpNumOps": 1204978, 216 | "WriteBlockOpAvgTime": 40067, 217 | "BlockChecksumOpNumOps": 51251, 218 | "BlockChecksumOpAvgTime": 0.3333333333333333, 219 | "CopyBlockOpNumOps": 0, 220 | "CopyBlockOpAvgTime": 0, 221 | "ReplaceBlockOpNumOps": 0, 222 | "ReplaceBlockOpAvgTime": 0, 223 | "HeartbeatsNumOps": 1252832, 224 | "HeartbeatsAvgTime": 1.2727272727272727, 225 | "BlockReportsNumOps": 176, 226 | "BlockReportsAvgTime": 985, 227 | "IncrementalBlockReportsNumOps": 2551500, 228 | "IncrementalBlockReportsAvgTime": 1, 229 | "CacheReportsNumOps": 328655, 230 | "CacheReportsAvgTime": 0.3333333333333333, 231 | "PacketAckRoundTripTimeNanosNumOps": 32064492, 232 | "PacketAckRoundTripTimeNanosAvgTime": 389616, 233 | "FlushNanosNumOps": 70305363, 234 
| "FlushNanosAvgTime": 16205.285714285714, 235 | "FsyncNanosNumOps": 0, 236 | "FsyncNanosAvgTime": 0, 237 | "SendDataPacketBlockedOnNetworkNanosNumOps": 95274803, 238 | "SendDataPacketBlockedOnNetworkNanosAvgTime": 29429.13888888889, 239 | "SendDataPacketTransferNanosNumOps": 95274803, 240 | "SendDataPacketTransferNanosAvgTime": 353804.77777777775 241 | }, 242 | { 243 | "name": "Hadoop:service=DataNode,name=RpcDetailedActivityForPort50020", 244 | "modelerType": "RpcDetailedActivityForPort50020", 245 | "tag.port": "50020", 246 | "tag.Context": "rpcdetailed", 247 | "tag.Hostname": "yh-shhd-cdh01", 248 | "InitReplicaRecoveryNumOps": 2, 249 | "InitReplicaRecoveryAvgTime": 2, 250 | "GetReplicaVisibleLengthNumOps": 62882, 251 | "GetReplicaVisibleLengthAvgTime": 0, 252 | "UpdateReplicaUnderRecoveryNumOps": 2, 253 | "UpdateReplicaUnderRecoveryAvgTime": 3, 254 | "ReplicaNotFoundExceptionNumOps": 19, 255 | "ReplicaNotFoundExceptionAvgTime": 0 256 | }, 257 | { 258 | "name": "Hadoop:service=DataNode,name=RpcActivityForPort50020", 259 | "modelerType": "RpcActivityForPort50020", 260 | "tag.port": "50020", 261 | "tag.Context": "rpc", 262 | "tag.NumOpenConnectionsPerUser": "{}", 263 | "tag.Hostname": "yh-shhd-cdh01", 264 | "ReceivedBytes": 21810967, 265 | "SentBytes": 13644420, 266 | "RpcQueueTimeNumOps": 62905, 267 | "RpcQueueTimeAvgTime": 0, 268 | "RpcProcessingTimeNumOps": 62905, 269 | "RpcProcessingTimeAvgTime": 0, 270 | "RpcAuthenticationFailures": 0, 271 | "RpcAuthenticationSuccesses": 0, 272 | "RpcAuthorizationFailures": 0, 273 | "RpcAuthorizationSuccesses": 62903, 274 | "RpcSlowCalls": 0, 275 | "RpcClientBackoff": 0, 276 | "NumOpenConnections": 0, 277 | "CallQueueLength": 0, 278 | "NumDroppedConnections": 0 279 | }, 280 | { 281 | "name": "Hadoop:service=DataNode,name=MetricsSystem,sub=Stats", 282 | "modelerType": "MetricsSystem,sub=Stats", 283 | "tag.Context": "metricssystem", 284 | "tag.Hostname": "yh-shhd-cdh01", 285 | "NumActiveSources": 5, 286 | "NumAllSources": 5, 287 
| "NumActiveSinks": 0, 288 | "NumAllSinks": 0, 289 | "SnapshotNumOps": 0, 290 | "SnapshotAvgTime": 0, 291 | "PublishNumOps": 0, 292 | "PublishAvgTime": 0, 293 | "DroppedPubAll": 0 294 | }, 295 | { 296 | "name": "Hadoop:service=DataNode,name=MetricsSystem,sub=Control", 297 | "modelerType": "org.apache.hadoop.metrics2.impl.MetricsSystemImpl" 298 | } 299 | ] 300 | } 301 | -------------------------------------------------------------------------------- /examples/JobHistoryServer.json: -------------------------------------------------------------------------------- 1 | { 2 | "beans": [ 3 | { 4 | "name": "Hadoop:service=JobHistoryServer,name=UgiMetrics", 5 | "modelerType": "UgiMetrics", 6 | "tag.Context": "ugi", 7 | "tag.Hostname": "yh-shhd-cdh01", 8 | "LoginSuccessNumOps": 0, 9 | "LoginSuccessAvgTime": 0, 10 | "LoginFailureNumOps": 0, 11 | "LoginFailureAvgTime": 0, 12 | "GetGroupsNumOps": 0, 13 | "GetGroupsAvgTime": 0, 14 | "RenewalFailuresTotal": 0, 15 | "RenewalFailures": 0 16 | }, 17 | { 18 | "name": "Hadoop:service=JobHistoryServer,name=RpcActivityForPort10033", 19 | "modelerType": "RpcActivityForPort10033", 20 | "tag.port": "10033", 21 | "tag.Context": "rpc", 22 | "tag.NumOpenConnectionsPerUser": "{}", 23 | "tag.Hostname": "yh-shhd-cdh01", 24 | "ReceivedBytes": 0, 25 | "SentBytes": 0, 26 | "RpcQueueTimeNumOps": 0, 27 | "RpcQueueTimeAvgTime": 0, 28 | "RpcProcessingTimeNumOps": 0, 29 | "RpcProcessingTimeAvgTime": 0, 30 | "RpcAuthenticationFailures": 0, 31 | "RpcAuthenticationSuccesses": 0, 32 | "RpcAuthorizationFailures": 0, 33 | "RpcAuthorizationSuccesses": 0, 34 | "RpcSlowCalls": 0, 35 | "RpcClientBackoff": 0, 36 | "NumOpenConnections": 0, 37 | "CallQueueLength": 0, 38 | "NumDroppedConnections": 0 39 | }, 40 | { 41 | "name": "Hadoop:service=JobHistoryServer,name=RpcActivityForPort10020", 42 | "modelerType": "RpcActivityForPort10020", 43 | "tag.port": "10020", 44 | "tag.Context": "rpc", 45 | "tag.NumOpenConnectionsPerUser": "{}", 46 | "tag.Hostname": 
"yh-shhd-cdh01", 47 | "ReceivedBytes": 0, 48 | "SentBytes": 0, 49 | "RpcQueueTimeNumOps": 0, 50 | "RpcQueueTimeAvgTime": 0, 51 | "RpcProcessingTimeNumOps": 0, 52 | "RpcProcessingTimeAvgTime": 0, 53 | "RpcAuthenticationFailures": 0, 54 | "RpcAuthenticationSuccesses": 0, 55 | "RpcAuthorizationFailures": 0, 56 | "RpcAuthorizationSuccesses": 0, 57 | "RpcSlowCalls": 0, 58 | "RpcClientBackoff": 0, 59 | "NumOpenConnections": 0, 60 | "CallQueueLength": 0, 61 | "NumDroppedConnections": 0 62 | }, 63 | { 64 | "name": "Hadoop:service=JobHistoryServer,name=RpcDetailedActivityForPort10033", 65 | "modelerType": "RpcDetailedActivityForPort10033", 66 | "tag.port": "10033", 67 | "tag.Context": "rpcdetailed", 68 | "tag.Hostname": "yh-shhd-cdh01" 69 | }, 70 | { 71 | "name": "Hadoop:service=JobHistoryServer,name=MetricsSystem,sub=Control", 72 | "modelerType": "org.apache.hadoop.metrics2.impl.MetricsSystemImpl" 73 | }, 74 | { 75 | "name": "Hadoop:service=JobHistoryServer,name=RpcDetailedActivityForPort10020", 76 | "modelerType": "RpcDetailedActivityForPort10020", 77 | "tag.port": "10020", 78 | "tag.Context": "rpcdetailed", 79 | "tag.Hostname": "yh-shhd-cdh01" 80 | }, 81 | { 82 | "name": "Hadoop:service=JobHistoryServer,name=JvmMetrics", 83 | "modelerType": "JvmMetrics", 84 | "tag.Context": "jvm", 85 | "tag.ProcessName": "JobHistoryServer", 86 | "tag.SessionId": null, 87 | "tag.Hostname": "yh-shhd-cdh01", 88 | "MemNonHeapUsedM": 95.115036, 89 | "MemNonHeapCommittedM": 98.24609, 90 | "MemNonHeapMaxM": -1, 91 | "MemHeapUsedM": 321.61688, 92 | "MemHeapCommittedM": 989.875, 93 | "MemHeapMaxM": 989.875, 94 | "MemMaxM": 989.875, 95 | "GcCountParNew": 1447, 96 | "GcTimeMillisParNew": 25516, 97 | "GcCountConcurrentMarkSweep": 2, 98 | "GcTimeMillisConcurrentMarkSweep": 92, 99 | "GcCount": 1449, 100 | "GcTimeMillis": 25608, 101 | "GcNumWarnThresholdExceeded": 0, 102 | "GcNumInfoThresholdExceeded": 0, 103 | "GcTotalExtraSleepTime": 1005, 104 | "ThreadsNew": 0, 105 | "ThreadsRunnable": 12, 106 | 
"ThreadsBlocked": 0, 107 | "ThreadsWaiting": 10, 108 | "ThreadsTimedWaiting": 25, 109 | "ThreadsTerminated": 0, 110 | "LogFatal": 0, 111 | "LogError": 0, 112 | "LogWarn": 0, 113 | "LogInfo": 0 114 | }, 115 | { 116 | "name": "Hadoop:service=JobHistoryServer,name=MetricsSystem,sub=Stats", 117 | "modelerType": "MetricsSystem,sub=Stats", 118 | "tag.Context": "metricssystem", 119 | "tag.Hostname": "yh-shhd-cdh01", 120 | "NumActiveSources": 6, 121 | "NumAllSources": 6, 122 | "NumActiveSinks": 0, 123 | "NumAllSinks": 0, 124 | "SnapshotNumOps": 0, 125 | "SnapshotAvgTime": 0, 126 | "PublishNumOps": 0, 127 | "PublishAvgTime": 0, 128 | "DroppedPubAll": 0 129 | } 130 | ] 131 | } 132 | -------------------------------------------------------------------------------- /examples/NameNode.json: -------------------------------------------------------------------------------- 1 | { 2 | "beans": [ 3 | { 4 | "name": "Hadoop:service=NameNode,name=JvmMetrics", 5 | "modelerType": "JvmMetrics", 6 | "tag.Context": "jvm", 7 | "tag.ProcessName": "NameNode", 8 | "tag.SessionId": null, 9 | "tag.Hostname": "yh-shhd-cdh02", 10 | "MemNonHeapUsedM": 115.754105, 11 | "MemNonHeapCommittedM": 117.58984, 12 | "MemNonHeapMaxM": -1, 13 | "MemHeapUsedM": 3334.1143, 14 | "MemHeapCommittedM": 7918.9375, 15 | "MemHeapMaxM": 7918.9375, 16 | "MemMaxM": 7918.9375, 17 | "GcCountParNew": 1550, 18 | "GcTimeMillisParNew": 38883, 19 | "GcCountConcurrentMarkSweep": 2, 20 | "GcTimeMillisConcurrentMarkSweep": 485, 21 | "GcCount": 1552, 22 | "GcTimeMillis": 39368, 23 | "GcNumWarnThresholdExceeded": 0, 24 | "GcNumInfoThresholdExceeded": 0, 25 | "GcTotalExtraSleepTime": 2443, 26 | "ThreadsNew": 0, 27 | "ThreadsRunnable": 14, 28 | "ThreadsBlocked": 0, 29 | "ThreadsWaiting": 10, 30 | "ThreadsTimedWaiting": 88, 31 | "ThreadsTerminated": 0, 32 | "LogFatal": 0, 33 | "LogError": 2, 34 | "LogWarn": 18531, 35 | "LogInfo": 40434225 36 | }, 37 | { 38 | "name": "Hadoop:service=NameNode,name=NameNodeActivity", 39 | "modelerType": 
"NameNodeActivity", 40 | "tag.ProcessName": "NameNode", 41 | "tag.SessionId": null, 42 | "tag.Context": "dfs", 43 | "tag.Hostname": "yh-shhd-cdh02", 44 | "CreateFileOps": 8918834, 45 | "FilesCreated": 12224584, 46 | "FilesAppended": 190847, 47 | "GetBlockLocations": 26548167, 48 | "FilesRenamed": 902599, 49 | "GetListingOps": 37263567, 50 | "DeleteFileOps": 11114944, 51 | "FilesDeleted": 11118384, 52 | "FileInfoOps": 16343664, 53 | "AddBlockOps": 8854681, 54 | "GetAdditionalDatanodeOps": 0, 55 | "CreateSymlinkOps": 0, 56 | "GetLinkTargetOps": 0, 57 | "FilesInGetListingOps": 50064685, 58 | "AllowSnapshotOps": 0, 59 | "DisallowSnapshotOps": 0, 60 | "CreateSnapshotOps": 0, 61 | "DeleteSnapshotOps": 0, 62 | "RenameSnapshotOps": 0, 63 | "ListSnapshottableDirOps": 0, 64 | "SnapshotDiffReportOps": 0, 65 | "BlockReceivedAndDeletedOps": 9711629, 66 | "StorageBlockReportOps": 432, 67 | "BlockOpsQueued": 1, 68 | "BlockOpsBatched": 2684190, 69 | "TransactionsNumOps": 65283526, 70 | "TransactionsAvgTime": 0.00466804979253112, 71 | "SyncsNumOps": 11319494, 72 | "SyncsAvgTime": 19.490243902439026, 73 | "TransactionsBatchedInSync": 311541393, 74 | "BlockReportNumOps": 432, 75 | "BlockReportAvgTime": 232, 76 | "CacheReportNumOps": 245481, 77 | "CacheReportAvgTime": 0, 78 | "SafeModeTime": 89778, 79 | "FsImageLoadTime": 27558, 80 | "GetEditNumOps": 0, 81 | "GetEditAvgTime": 0, 82 | "GetImageNumOps": 0, 83 | "GetImageAvgTime": 0, 84 | "PutImageNumOps": 145, 85 | "PutImageAvgTime": 10951, 86 | "TotalFileOps": 110137303 87 | }, 88 | { 89 | "name": "Hadoop:service=NameNode,name=IPCLoggerChannel-10.193.40.2-8485", 90 | "modelerType": "IPCLoggerChannel-10.193.40.2-8485", 91 | "tag.Context": "dfs", 92 | "tag.IsOutOfSync": "false", 93 | "tag.Hostname": "yh-shhd-cdh02", 94 | "QueuedEditsSize": 225, 95 | "LagTimeMillis": 5, 96 | "CurrentLagTxns": 0 97 | }, 98 | { 99 | "name": "Hadoop:service=NameNode,name=NNTopUserOpCounts", 100 | "modelerType": "NNTopUserOpCounts", 101 | "tag.Context": 
"dfs", 102 | "tag.Hostname": "yh-shhd-cdh02", 103 | "tag.Context.1": "dfs", 104 | "tag.Hostname.1": "yh-shhd-cdh02", 105 | "tag.Context.2": "dfs", 106 | "tag.Hostname.2": "yh-shhd-cdh02" 107 | }, 108 | { 109 | "name": "Hadoop:service=NameNode,name=NameNodeStatus", 110 | "modelerType": "org.apache.hadoop.hdfs.server.namenode.NameNode", 111 | "SecurityEnabled": false, 112 | "NNRole": "NameNode", 113 | "HostAndPort": "yh-shhd-cdh02:8020", 114 | "LastHATransitionTime": 1586867210547, 115 | "State": "active" 116 | }, 117 | { 118 | "name": "Hadoop:service=NameNode,name=NameNodeInfo", 119 | "modelerType": "org.apache.hadoop.hdfs.server.namenode.FSNamesystem", 120 | "Total": 133166434799616, 121 | "UpgradeFinalized": true, 122 | "ClusterId": "cluster7", 123 | "BlockPoolId": "BP-1654582017-10.193.40.10-1585051030504", 124 | "Version": "2.6.0-cdh5.14.2, r5724a4ad7a27f7af31aa725694d3df09a68bb213", 125 | "TotalBlocks": 5358625, 126 | "Used": 25018182141559, 127 | "Free": 64666311712118, 128 | "Safemode": "", 129 | "NonDfsUsedSpace": 36758750009889, 130 | "PercentUsed": 18.787153, 131 | "BlockPoolUsedSpace": 25018182141559, 132 | "PercentBlockPoolUsed": 18.787153, 133 | "PercentRemaining": 48.56052, 134 | "CacheCapacity": 21474836480, 135 | "CacheUsed": 0, 136 | "TotalFiles": 6823097, 137 | "NumberOfMissingBlocks": 0, 138 | "NumberOfMissingBlocksWithReplicationFactorOne": 0, 139 | "LiveNodes": "{\"yh-shhd-cdh04\":{\"infoAddr\":\"10.193.40.2:50075\",\"infoSecureAddr\":\"10.193.40.2:0\",\"xferaddr\":\"10.193.40.2:50010\",\"lastContact\":1,\"usedSpace\":3402098630360,\"adminState\":\"In 
Service\",\"nonDfsUsedSpace\":12260165435688,\"capacity\":23248542650368,\"numBlocks\":1542688,\"version\":\"2.6.0-cdh5.14.2\",\"used\":3402098630360,\"remaining\":6412527156224,\"blockScheduled\":16,\"blockPoolUsed\":3402098630360,\"blockPoolUsedPercent\":14.633599,\"volfails\":0},\"yh-shhd-cdh03\":{\"infoAddr\":\"10.193.40.1:50075\",\"infoSecureAddr\":\"10.193.40.1:0\",\"xferaddr\":\"10.193.40.1:50010\",\"lastContact\":2,\"usedSpace\":3360642361157,\"adminState\":\"In Service\",\"nonDfsUsedSpace\":12214790928571,\"capacity\":23248542269440,\"numBlocks\":1546037,\"version\":\"2.6.0-cdh5.14.2\",\"used\":3360642361157,\"remaining\":6499349846314,\"blockScheduled\":2,\"blockPoolUsed\":3360642361157,\"blockPoolUsedPercent\":14.455282,\"volfails\":0},\"yh-shhd-cdh05\":{\"infoAddr\":\"10.193.40.3:50075\",\"infoSecureAddr\":\"10.193.40.3:0\",\"xferaddr\":\"10.193.40.3:50010\",\"lastContact\":2,\"usedSpace\":3251740135362,\"adminState\":\"In Service\",\"nonDfsUsedSpace\":12283793645630,\"capacity\":23248542269440,\"numBlocks\":1091277,\"version\":\"2.6.0-cdh5.14.2\",\"used\":3251740135362,\"remaining\":6539139766237,\"blockScheduled\":0,\"blockPoolUsed\":3251740135362,\"blockPoolUsedPercent\":13.986856,\"volfails\":0},\"yh-shhd-cdh02\":{\"infoAddr\":\"10.193.40.10:50075\",\"infoSecureAddr\":\"10.193.40.10:0\",\"xferaddr\":\"10.193.40.10:50010\",\"lastContact\":1,\"usedSpace\":8922113438288,\"adminState\":\"In Service\",\"nonDfsUsedSpace\":0,\"capacity\":31710403805184,\"numBlocks\":575953,\"version\":\"2.6.0-cdh5.14.2\",\"used\":8922113438288,\"remaining\":21187383799102,\"blockScheduled\":1,\"blockPoolUsed\":8922113438288,\"blockPoolUsedPercent\":28.136232,\"volfails\":0},\"yh-shhd-cdh01\":{\"infoAddr\":\"10.193.40.9:50075\",\"infoSecureAddr\":\"10.193.40.9:0\",\"xferaddr\":\"10.193.40.9:50010\",\"lastContact\":0,\"usedSpace\":6081587576392,\"adminState\":\"In 
Service\",\"nonDfsUsedSpace\":0,\"capacity\":31710403805184,\"numBlocks\":1179703,\"version\":\"2.6.0-cdh5.14.2\",\"used\":6081587576392,\"remaining\":24027911144241,\"blockScheduled\":0,\"blockPoolUsed\":6081587576392,\"blockPoolUsedPercent\":19.178526,\"volfails\":0}}", 140 | "SoftwareVersion": "2.6.0-cdh5.14.2", 141 | "DeadNodes": "{}", 142 | "DecomNodes": "{}", 143 | "EnteringMaintenanceNodes": "{}", 144 | "NameDirStatuses": "{\"active\":{\"/mnt/disk1/dfs/nn\":\"IMAGE_AND_EDITS\",\"/mnt/disk0/dfs/nn\":\"IMAGE_AND_EDITS\"},\"failed\":{}}", 145 | "NodeUsage": "{\"nodeUsage\":{\"min\":\"13.99%\",\"median\":\"14.63%\",\"max\":\"28.14%\",\"stdDev\":\"5.37%\"}}", 146 | "NameJournalStatus": "[{\"manager\":\"QJM to [10.193.40.1:8485, 10.193.40.2:8485, 10.193.40.3:8485]\",\"stream\":\"Writing segment beginning at txid 322850295. \\n10.193.40.1:8485 (Written txid 322860887), 10.193.40.2:8485 (Written txid 322860887), 10.193.40.3:8485 (Written txid 322860887)\",\"disabled\":\"false\",\"required\":\"true\"},{\"manager\":\"FileJournalManager(root=/mnt/disk0/dfs/nn)\",\"stream\":\"EditLogFileOutputStream(/mnt/disk0/dfs/nn/current/edits_inprogress_0000000000322850295)\",\"disabled\":\"false\",\"required\":\"false\"},{\"manager\":\"FileJournalManager(root=/mnt/disk1/dfs/nn)\",\"stream\":\"EditLogFileOutputStream(/mnt/disk1/dfs/nn/current/edits_inprogress_0000000000322850295)\",\"disabled\":\"false\",\"required\":\"false\"}]", 147 | "JournalTransactionInfo": "{\"MostRecentCheckpointTxId\":\"322439274\",\"LastAppliedOrWrittenTxId\":\"322860888\"}", 148 | "NNStarted": "Tue Apr 14 20:26:19 CST 2020", 149 | "NNStartedTimeInMillis": 1586867179855, 150 | "CompileInfo": "2018-03-27T20:40Z by jenkins from Unknown", 151 | "CorruptFiles": "[]", 152 | "NumberOfSnapshottableDirs": 0, 153 | "DistinctVersionCount": 1, 154 | "DistinctVersions": [ 155 | { 156 | "key": "2.6.0-cdh5.14.2", 157 | "value": 5 158 | } 159 | ], 160 | "RollingUpgradeStatus": null, 161 | "Threads": 112 162 | }, 163 | { 
164 | "name": "Hadoop:service=NameNode,name=StartupProgress", 165 | "modelerType": "StartupProgress", 166 | "tag.Hostname": "yh-shhd-cdh02", 167 | "ElapsedTime": 89320, 168 | "PercentComplete": 1, 169 | "LoadingFsImageCount": 6362453, 170 | "LoadingFsImageElapsedTime": 25441, 171 | "LoadingFsImageTotal": 6362453, 172 | "LoadingFsImagePercentComplete": 1, 173 | "LoadingEditsCount": 261261, 174 | "LoadingEditsElapsedTime": 282, 175 | "LoadingEditsTotal": 261261, 176 | "LoadingEditsPercentComplete": 1, 177 | "SavingCheckpointCount": 0, 178 | "SavingCheckpointElapsedTime": 0, 179 | "SavingCheckpointTotal": 0, 180 | "SavingCheckpointPercentComplete": 1, 181 | "SafeModeCount": 4966636, 182 | "SafeModeElapsedTime": 61632, 183 | "SafeModeTotal": 4966505, 184 | "SafeModePercentComplete": 1 185 | }, 186 | { 187 | "name": "Hadoop:service=NameNode,name=FSNamesystem", 188 | "modelerType": "FSNamesystem", 189 | "tag.Context": "dfs", 190 | "tag.HAState": "active", 191 | "tag.Hostname": "yh-shhd-cdh02", 192 | "BlocksTotal": 5358625, 193 | "MissingBlocks": 0, 194 | "MissingReplOneBlocks": 0, 195 | "ExpiredHeartbeats": 0, 196 | "TransactionsSinceLastCheckpoint": 421614, 197 | "TransactionsSinceLastLogRoll": 10594, 198 | "LastWrittenTransactionId": 322860888, 199 | "LastCheckpointTime": 1587392977976, 200 | "UnderReplicatedBlocks": 0, 201 | "CorruptBlocks": 0, 202 | "CapacityTotal": 133166434799616, 203 | "CapacityTotalGB": 124021, 204 | "CapacityUsed": 25018182141559, 205 | "CapacityUsedGB": 23300, 206 | "CapacityRemaining": 64666311712118, 207 | "CapacityRemainingGB": 60225, 208 | "CapacityUsedNonDFS": 36758750009889, 209 | "TotalLoad": 50, 210 | "SnapshottableDirectories": 0, 211 | "Snapshots": 0, 212 | "NumEncryptionZones": 0, 213 | "LockQueueLength": 0, 214 | "NumFilesUnderConstruction": 82, 215 | "NumActiveClients": 52, 216 | "FilesTotal": 6823097, 217 | "PendingReplicationBlocks": 0, 218 | "ScheduledReplicationBlocks": 0, 219 | "PendingDeletionBlocks": 0, 220 | "ExcessBlocks": 
0, 221 | "PostponedMisreplicatedBlocks": 0, 222 | "PendingDataNodeMessageCount": 0, 223 | "MillisSinceLastLoadedEdits": 0, 224 | "BlockCapacity": 16777216, 225 | "StaleDataNodes": 0, 226 | "TotalFiles": 6823097 227 | }, 228 | { 229 | "name": "Hadoop:service=NameNode,name=SnapshotInfo", 230 | "modelerType": "org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager", 231 | "SnapshottableDirectories": [], 232 | "Snapshots": [] 233 | }, 234 | { 235 | "name": "Hadoop:service=NameNode,name=IPCLoggerChannel-10.193.40.1-8485", 236 | "modelerType": "IPCLoggerChannel-10.193.40.1-8485", 237 | "tag.Context": "dfs", 238 | "tag.IsOutOfSync": "false", 239 | "tag.Hostname": "yh-shhd-cdh02", 240 | "QueuedEditsSize": 225, 241 | "LagTimeMillis": 5, 242 | "CurrentLagTxns": 0 243 | }, 244 | { 245 | "name": "Hadoop:service=NameNode,name=RpcDetailedActivityForPort8022", 246 | "modelerType": "RpcDetailedActivityForPort8022", 247 | "tag.port": "8022", 248 | "tag.Context": "rpcdetailed", 249 | "tag.Hostname": "yh-shhd-cdh02", 250 | "GetServiceStatusNumOps": 528952, 251 | "GetServiceStatusAvgTime": 0, 252 | "RollEditLogNumOps": 4384, 253 | "RollEditLogAvgTime": 281, 254 | "RegisterDatanodeNumOps": 5, 255 | "RegisterDatanodeAvgTime": 43.333333333333336, 256 | "SendHeartbeatNumOps": 881809, 257 | "SendHeartbeatAvgTime": 0, 258 | "CacheReportNumOps": 245481, 259 | "CacheReportAvgTime": 0, 260 | "ReportBadBlocksNumOps": 2, 261 | "ReportBadBlocksAvgTime": 0, 262 | "VersionRequestNumOps": 5, 263 | "VersionRequestAvgTime": 119.2, 264 | "MonitorHealthNumOps": 528951, 265 | "MonitorHealthAvgTime": 0.0625, 266 | "TransitionToActiveNumOps": 1, 267 | "TransitionToActiveAvgTime": 1995, 268 | "CommitBlockSynchronizationNumOps": 18, 269 | "CommitBlockSynchronizationAvgTime": 0.5, 270 | "BlockReportNumOps": 329, 271 | "BlockReportAvgTime": 232.33333333333334, 272 | "BlockReceivedAndDeletedNumOps": 9711629, 273 | "BlockReceivedAndDeletedAvgTime": 0.007246376811594203, 274 | "ErrorReportNumOps": 7, 275 
| "ErrorReportAvgTime": 0 276 | }, 277 | { 278 | "name": "Hadoop:service=NameNode,name=BlockStats", 279 | "modelerType": "org.apache.hadoop.hdfs.server.blockmanagement.BlockManager", 280 | "StorageTypeStats": [ 281 | { 282 | "key": "DISK", 283 | "value": { 284 | "blockPoolUsed": 25018182141559, 285 | "capacityRemaining": 64666311712118, 286 | "capacityTotal": 133166434799616, 287 | "capacityUsed": 25018182141559, 288 | "nodesInService": 5 289 | } 290 | } 291 | ] 292 | }, 293 | { 294 | "name": "Hadoop:service=NameNode,name=RpcActivityForPort8022", 295 | "modelerType": "RpcActivityForPort8022", 296 | "tag.port": "8022", 297 | "tag.Context": "rpc", 298 | "tag.NumOpenConnectionsPerUser": "{\"hdfs\":6}", 299 | "tag.Hostname": "yh-shhd-cdh02", 300 | "ReceivedBytes": 6932960359, 301 | "SentBytes": 656275389, 302 | "RpcQueueTimeNumOps": 11901573, 303 | "RpcQueueTimeAvgTime": 0.029411764705882353, 304 | "RpcProcessingTimeNumOps": 11901573, 305 | "RpcProcessingTimeAvgTime": 0.008823529411764706, 306 | "RpcAuthenticationFailures": 0, 307 | "RpcAuthenticationSuccesses": 0, 308 | "RpcAuthorizationFailures": 0, 309 | "RpcAuthorizationSuccesses": 4396, 310 | "RpcSlowCalls": 0, 311 | "RpcClientBackoff": 0, 312 | "NumOpenConnections": 6, 313 | "CallQueueLength": 0, 314 | "NumDroppedConnections": 0 315 | }, 316 | { 317 | "name": "Hadoop:service=NameNode,name=FSNamesystemState", 318 | "modelerType": "org.apache.hadoop.hdfs.server.namenode.FSNamesystem", 319 | "BlocksTotal": 5358625, 320 | "UnderReplicatedBlocks": 0, 321 | "CapacityTotal": 133166434799616, 322 | "CapacityUsed": 25018182141559, 323 | "CapacityRemaining": 64666311712118, 324 | "TotalLoad": 50, 325 | "SnapshotStats": "{\"SnapshottableDirectories\":0,\"Snapshots\":0}", 326 | "NumEncryptionZones": 0, 327 | "FsLockQueueLength": 0, 328 | "MaxObjects": 0, 329 | "FilesTotal": 6823097, 330 | "PendingReplicationBlocks": 0, 331 | "ScheduledReplicationBlocks": 0, 332 | "PendingDeletionBlocks": 0, 333 | "BlockDeletionStartTime": 
1586867179855, 334 | "FSState": "Operational", 335 | "NumLiveDataNodes": 5, 336 | "NumDeadDataNodes": 0, 337 | "NumDecomLiveDataNodes": 0, 338 | "NumDecomDeadDataNodes": 0, 339 | "VolumeFailuresTotal": 0, 340 | "EstimatedCapacityLostTotal": 0, 341 | "NumDecommissioningDataNodes": 0, 342 | "NumStaleDataNodes": 0, 343 | "NumStaleStorages": 0, 344 | "TopUserOpCounts": "{\"timestamp\":\"2020-04-20T23:25:36+0800\",\"windows\":[{\"ops\":[{\"opType\":\"listCachePools\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":1}],\"totalCount\":1},{\"opType\":\"listStatus\",\"topUsers\":[{\"user\":\"work\",\"count\":1349},{\"user\":\"mapred\",\"count\":4}],\"totalCount\":1353},{\"opType\":\"*\",\"topUsers\":[{\"user\":\"work\",\"count\":3727},{\"user\":\"hdfs\",\"count\":4},{\"user\":\"mapred\",\"count\":4},{\"user\":\"yarn\",\"count\":3}],\"totalCount\":3738},{\"opType\":\"delete\",\"topUsers\":[{\"user\":\"work\",\"count\":695},{\"user\":\"hdfs\",\"count\":1}],\"totalCount\":696},{\"opType\":\"setReplication\",\"topUsers\":[{\"user\":\"work\",\"count\":7}],\"totalCount\":7},{\"opType\":\"listCacheDirectives\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":1}],\"totalCount\":1},{\"opType\":\"getfileinfo\",\"topUsers\":[{\"user\":\"work\",\"count\":1123},{\"user\":\"yarn\",\"count\":3}],\"totalCount\":1126},{\"opType\":\"rename\",\"topUsers\":[{\"user\":\"work\",\"count\":64}],\"totalCount\":64},{\"opType\":\"mkdirs\",\"topUsers\":[{\"user\":\"work\",\"count\":182}],\"totalCount\":182},{\"opType\":\"create\",\"topUsers\":[{\"user\":\"work\",\"count\":407}],\"totalCount\":407},{\"opType\":\"setPermission\",\"topUsers\":[{\"user\":\"work\",\"count\":11}],\"totalCount\":11},{\"opType\":\"filesInGetListing\",\"topUsers\":[{\"user\":\"work\",\"count\":1643},{\"user\":\"mapred\",\"count\":12}],\"totalCount\":1655},{\"opType\":\"open\",\"topUsers\":[{\"user\":\"work\",\"count\":303},{\"user\":\"hdfs\",\"count\":1}],\"totalCount\":304},{\"opType\":\"append\",\"topUsers\":[{\"user\":\"work\",\"count\
":11}],\"totalCount\":11}],\"windowLenMs\":60000},{\"ops\":[{\"opType\":\"listCachePools\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":4}],\"totalCount\":4},{\"opType\":\"listStatus\",\"topUsers\":[{\"user\":\"work\",\"count\":6659},{\"user\":\"mapred\",\"count\":12},{\"user\":\"yarn\",\"count\":1}],\"totalCount\":6672},{\"opType\":\"*\",\"topUsers\":[{\"user\":\"work\",\"count\":9789},{\"user\":\"hdfs\",\"count\":28},{\"user\":\"mapred\",\"count\":12},{\"user\":\"yarn\",\"count\":6}],\"totalCount\":9835},{\"opType\":\"delete\",\"topUsers\":[{\"user\":\"work\",\"count\":2738},{\"user\":\"hdfs\",\"count\":5}],\"totalCount\":2743},{\"opType\":\"setReplication\",\"topUsers\":[{\"user\":\"work\",\"count\":10}],\"totalCount\":10},{\"opType\":\"listCacheDirectives\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":4}],\"totalCount\":4},{\"opType\":\"getfileinfo\",\"topUsers\":[{\"user\":\"work\",\"count\":3089},{\"user\":\"hdfs\",\"count\":5},{\"user\":\"yarn\",\"count\":3}],\"totalCount\":3097},{\"opType\":\"rename\",\"topUsers\":[{\"user\":\"work\",\"count\":123}],\"totalCount\":123},{\"opType\":\"mkdirs\",\"topUsers\":[{\"user\":\"work\",\"count\":832}],\"totalCount\":832},{\"opType\":\"create\",\"topUsers\":[{\"user\":\"work\",\"count\":2775},{\"user\":\"hdfs\",\"count\":5}],\"totalCount\":2780},{\"opType\":\"setPermission\",\"topUsers\":[{\"user\":\"work\",\"count\":15}],\"totalCount\":15},{\"opType\":\"filesInGetListing\",\"topUsers\":[{\"user\":\"work\",\"count\":7671},{\"user\":\"mapred\",\"count\":36},{\"user\":\"yarn\",\"count\":1}],\"totalCount\":7708},{\"opType\":\"open\",\"topUsers\":[{\"user\":\"work\",\"count\":3804},{\"user\":\"hdfs\",\"count\":5}],\"totalCount\":3809},{\"opType\":\"append\",\"topUsers\":[{\"user\":\"work\",\"count\":65}],\"totalCount\":65}],\"windowLenMs\":300000},{\"ops\":[{\"opType\":\"listCachePools\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":16}],\"totalCount\":16},{\"opType\":\"listStatus\",\"topUsers\":[{\"user\":\"work\",\"count\":56123}
,{\"user\":\"mapred\",\"count\":63},{\"user\":\"yarn\",\"count\":18}],\"totalCount\":56204},{\"opType\":\"*\",\"topUsers\":[{\"user\":\"work\",\"count\":136057},{\"user\":\"mapred\",\"count\":101},{\"user\":\"hdfs\",\"count\":74},{\"user\":\"yarn\",\"count\":40}],\"totalCount\":136272},{\"opType\":\"delete\",\"topUsers\":[{\"user\":\"work\",\"count\":22669},{\"user\":\"hdfs\",\"count\":14},{\"user\":\"mapred\",\"count\":7}],\"totalCount\":22690},{\"opType\":\"setReplication\",\"topUsers\":[{\"user\":\"work\",\"count\":40}],\"totalCount\":40},{\"opType\":\"listCacheDirectives\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":16}],\"totalCount\":16},{\"opType\":\"getfileinfo\",\"topUsers\":[{\"user\":\"work\",\"count\":8090},{\"user\":\"yarn\",\"count\":28},{\"user\":\"mapred\",\"count\":28},{\"user\":\"hdfs\",\"count\":13}],\"totalCount\":8159},{\"opType\":\"rename options=2\",\"topUsers\":[{\"user\":\"mapred\",\"count\":14},{\"user\":\"work\",\"count\":8}],\"totalCount\":22},{\"opType\":\"rename\",\"topUsers\":[{\"user\":\"work\",\"count\":1285}],\"totalCount\":1285},{\"opType\":\"mkdirs\",\"topUsers\":[{\"user\":\"work\",\"count\":1541}],\"totalCount\":1541},{\"opType\":\"create\",\"topUsers\":[{\"user\":\"work\",\"count\":5148},{\"user\":\"hdfs\",\"count\":13}],\"totalCount\":5161},{\"opType\":\"setPermission\",\"topUsers\":[{\"user\":\"work\",\"count\":117}],\"totalCount\":117},{\"opType\":\"filesInGetListing\",\"topUsers\":[{\"user\":\"work\",\"count\":61532},{\"user\":\"mapred\",\"count\":203},{\"user\":\"yarn\",\"count\":32}],\"totalCount\":61767},{\"opType\":\"open\",\"topUsers\":[{\"user\":\"work\",\"count\":5192},{\"user\":\"mapred\",\"count\":58},{\"user\":\"hdfs\",\"count\":14}],\"totalCount\":5264},{\"opType\":\"append\",\"topUsers\":[{\"user\":\"work\",\"count\":406}],\"totalCount\":406},{\"opType\":\"contentSummary\",\"topUsers\":[{\"user\":\"work\",\"count\":4}],\"totalCount\":4}],\"windowLenMs\":1500000}]}", 345 | "NumInMaintenanceLiveDataNodes": 0, 346 
| "NumInMaintenanceDeadDataNodes": 0, 347 | "NumEnteringMaintenanceDataNodes": 0 348 | }, 349 | { 350 | "name": "Hadoop:service=NameNode,name=IPCLoggerChannel-10.193.40.3-8485", 351 | "modelerType": "IPCLoggerChannel-10.193.40.3-8485", 352 | "tag.Context": "dfs", 353 | "tag.IsOutOfSync": "false", 354 | "tag.Hostname": "yh-shhd-cdh02", 355 | "QueuedEditsSize": 0, 356 | "LagTimeMillis": 0, 357 | "CurrentLagTxns": 0 358 | }, 359 | { 360 | "name": "Hadoop:service=NameNode,name=RetryCache.NameNodeRetryCache", 361 | "modelerType": "RetryCache.NameNodeRetryCache", 362 | "tag.Context": "rpc", 363 | "tag.Hostname": "yh-shhd-cdh02", 364 | "CacheHit": 175, 365 | "CacheCleared": 0, 366 | "CacheUpdated": 21424040 367 | }, 368 | { 369 | "name": "Hadoop:service=NameNode,name=RpcActivityForPort8020", 370 | "modelerType": "RpcActivityForPort8020", 371 | "tag.port": "8020", 372 | "tag.Context": "rpc", 373 | "tag.NumOpenConnectionsPerUser": "{\"work\":56,\"mapred\":1}", 374 | "tag.Hostname": "yh-shhd-cdh02", 375 | "ReceivedBytes": 31160263601, 376 | "SentBytes": 45729185342, 377 | "RpcQueueTimeNumOps": 125221471, 378 | "RpcQueueTimeAvgTime": 0.02129011757229107, 379 | "RpcProcessingTimeNumOps": 125221471, 380 | "RpcProcessingTimeAvgTime": 0.06990784874483635, 381 | "RpcAuthenticationFailures": 0, 382 | "RpcAuthenticationSuccesses": 0, 383 | "RpcAuthorizationFailures": 0, 384 | "RpcAuthorizationSuccesses": 495234, 385 | "RpcSlowCalls": 0, 386 | "RpcClientBackoff": 0, 387 | "NumOpenConnections": 57, 388 | "CallQueueLength": 0, 389 | "NumDroppedConnections": 0 390 | }, 391 | { 392 | "name": "Hadoop:service=NameNode,name=UgiMetrics", 393 | "modelerType": "UgiMetrics", 394 | "tag.Context": "ugi", 395 | "tag.Hostname": "yh-shhd-cdh02", 396 | "LoginSuccessNumOps": 0, 397 | "LoginSuccessAvgTime": 0, 398 | "LoginFailureNumOps": 0, 399 | "LoginFailureAvgTime": 0, 400 | "GetGroupsNumOps": 6750, 401 | "GetGroupsAvgTime": 9, 402 | "RenewalFailuresTotal": 0, 403 | "RenewalFailures": 0 404 | }, 405 
| { 406 | "name": "Hadoop:service=NameNode,name=RpcDetailedActivityForPort8020", 407 | "modelerType": "RpcDetailedActivityForPort8020", 408 | "tag.port": "8020", 409 | "tag.Context": "rpcdetailed", 410 | "tag.Hostname": "yh-shhd-cdh02", 411 | "GetBlockLocationsNumOps": 26548005, 412 | "GetBlockLocationsAvgTime": 0.03896103896103896, 413 | "ListCachePoolsNumOps": 8532, 414 | "ListCachePoolsAvgTime": 0, 415 | "FileNotFoundExceptionNumOps": 54, 416 | "FileNotFoundExceptionAvgTime": 0, 417 | "GetDatanodeReportNumOps": 3, 418 | "GetDatanodeReportAvgTime": 0.6666666666666666, 419 | "DeleteNumOps": 11132547, 420 | "DeleteAvgTime": 0.10862619808306709, 421 | "GetServerDefaultsNumOps": 394929, 422 | "GetServerDefaultsAvgTime": 0, 423 | "GetFsStatsNumOps": 5, 424 | "GetFsStatsAvgTime": 0.8, 425 | "FsyncNumOps": 2762323, 426 | "FsyncAvgTime": 0.012658227848101266, 427 | "LeaseExpiredExceptionNumOps": 261, 428 | "LeaseExpiredExceptionAvgTime": 0, 429 | "AddBlockNumOps": 8854681, 430 | "AddBlockAvgTime": 0.1863799283154122, 431 | "ListEncryptionZonesNumOps": 147, 432 | "ListEncryptionZonesAvgTime": 0, 433 | "CreateNumOps": 8918834, 434 | "CreateAvgTime": 0.17204301075268819, 435 | "SetPermissionNumOps": 97884, 436 | "SetPermissionAvgTime": 0, 437 | "UpdateBlockForPipelineNumOps": 183263, 438 | "UpdateBlockForPipelineAvgTime": 0, 439 | "AlreadyBeingCreatedExceptionNumOps": 8292, 440 | "AlreadyBeingCreatedExceptionAvgTime": 0, 441 | "GetContentSummaryNumOps": 5737, 442 | "GetContentSummaryAvgTime": 0, 443 | "SetSafeModeNumOps": 2, 444 | "SetSafeModeAvgTime": 0, 445 | "GetListingNumOps": 35775306, 446 | "GetListingAvgTime": 0.028312570781426953, 447 | "SetReplicationNumOps": 105295, 448 | "SetReplicationAvgTime": 0.14285714285714285, 449 | "CheckAccessNumOps": 17761, 450 | "CheckAccessAvgTime": 0, 451 | "Rename2NumOps": 24359, 452 | "Rename2AvgTime": 0, 453 | "ListCacheDirectivesNumOps": 8532, 454 | "ListCacheDirectivesAvgTime": 1, 455 | "StandbyExceptionNumOps": 1, 456 | 
"StandbyExceptionAvgTime": 11, 457 | "RenewLeaseNumOps": 544048, 458 | "RenewLeaseAvgTime": 0, 459 | "RenameNumOps": 878240, 460 | "RenameAvgTime": 0.1875, 461 | "MkdirsNumOps": 2802980, 462 | "MkdirsAvgTime": 0.13725490196078433, 463 | "RecoverLeaseNumOps": 2, 464 | "RecoverLeaseAvgTime": 7, 465 | "UpdatePipelineNumOps": 183258, 466 | "UpdatePipelineAvgTime": 0, 467 | "SetTimesNumOps": 12, 468 | "SetTimesAvgTime": 0, 469 | "CompleteNumOps": 9430907, 470 | "CompleteAvgTime": 0.06666666666666667, 471 | "GetFileInfoNumOps": 16343651, 472 | "GetFileInfoAvgTime": 0.02544529262086514, 473 | "AppendNumOps": 190847, 474 | "AppendAvgTime": 0, 475 | "RetriableExceptionNumOps": 773, 476 | "RetriableExceptionAvgTime": 11.575704225352112 477 | }, 478 | { 479 | "name": "Hadoop:service=NameNode,name=MetricsSystem,sub=Stats", 480 | "modelerType": "MetricsSystem,sub=Stats", 481 | "tag.Context": "metricssystem", 482 | "tag.Hostname": "yh-shhd-cdh02", 483 | "NumActiveSources": 14, 484 | "NumAllSources": 14, 485 | "NumActiveSinks": 0, 486 | "NumAllSinks": 0, 487 | "SnapshotNumOps": 0, 488 | "SnapshotAvgTime": 0, 489 | "PublishNumOps": 0, 490 | "PublishAvgTime": 0, 491 | "DroppedPubAll": 0 492 | }, 493 | { 494 | "name": "Hadoop:service=NameNode,name=MetricsSystem,sub=Control", 495 | "modelerType": "org.apache.hadoop.metrics2.impl.MetricsSystemImpl" 496 | } 497 | ] 498 | } 499 | -------------------------------------------------------------------------------- /examples/NodeManager.json: -------------------------------------------------------------------------------- 1 | { 2 | "beans": [ 3 | { 4 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000029", 5 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000029" 6 | }, 7 | { 8 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000021", 9 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000021" 10 | }, 
11 | { 12 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000003", 13 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000003" 14 | }, 15 | { 16 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000050", 17 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000050" 18 | }, 19 | { 20 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e14_1586867223681_12177_01_000001", 21 | "modelerType": "ContainerResource_container_e14_1586867223681_12177_01_000001" 22 | }, 23 | { 24 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000031", 25 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000031" 26 | }, 27 | { 28 | "name": "Hadoop:service=NodeManager,name=NodeManagerMetrics", 29 | "modelerType": "NodeManagerMetrics", 30 | "tag.Context": "yarn", 31 | "tag.Hostname": "yh-shhd-cdh04", 32 | "ContainersLaunched": 19285, 33 | "ContainersCompleted": 16605, 34 | "ContainersFailed": 1032, 35 | "ContainersKilled": 1651, 36 | "ContainersIniting": 0, 37 | "ContainersRunning": 12, 38 | "AllocatedGB": -30, 39 | "AllocatedContainers": -3, 40 | "AvailableGB": 272, 41 | "AllocatedVCores": -3, 42 | "AvailableVCores": 83, 43 | "ContainerLaunchDurationNumOps": 19300, 44 | "ContainerLaunchDurationAvgTime": 11, 45 | "BadLocalDirs": 0, 46 | "BadLogDirs": 0, 47 | "GoodLocalDirsDiskUtilizationPerc": 72, 48 | "GoodLogDirsDiskUtilizationPerc": 71 49 | }, 50 | { 51 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000051", 52 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000051" 53 | }, 54 | { 55 | "name": "Hadoop:service=NodeManager,name=RpcActivityForPort8040", 56 | "modelerType": "RpcActivityForPort8040", 57 | "tag.port": "8040", 58 | "tag.Context": "rpc", 59 | "tag.NumOpenConnectionsPerUser": 
"{\"work\":1}", 60 | "tag.Hostname": "yh-shhd-cdh04", 61 | "ReceivedBytes": 15997743, 62 | "SentBytes": 5431407, 63 | "RpcQueueTimeNumOps": 65367, 64 | "RpcQueueTimeAvgTime": 0, 65 | "RpcProcessingTimeNumOps": 65367, 66 | "RpcProcessingTimeAvgTime": 0.6, 67 | "RpcAuthenticationFailures": 0, 68 | "RpcAuthenticationSuccesses": 0, 69 | "RpcAuthorizationFailures": 0, 70 | "RpcAuthorizationSuccesses": 16608, 71 | "RpcSlowCalls": 0, 72 | "RpcClientBackoff": 0, 73 | "NumOpenConnections": 1, 74 | "CallQueueLength": 0, 75 | "NumDroppedConnections": 0 76 | }, 77 | { 78 | "name": "Hadoop:service=NodeManager,name=RpcDetailedActivityForPort8040", 79 | "modelerType": "RpcDetailedActivityForPort8040", 80 | "tag.port": "8040", 81 | "tag.Context": "rpcdetailed", 82 | "tag.Hostname": "yh-shhd-cdh04", 83 | "HeartbeatNumOps": 65367, 84 | "HeartbeatAvgTime": 0.6 85 | }, 86 | { 87 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000030", 88 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000030" 89 | }, 90 | { 91 | "name": "Hadoop:service=NodeManager,name=MetricsSystem,sub=Control", 92 | "modelerType": "org.apache.hadoop.metrics2.impl.MetricsSystemImpl" 93 | }, 94 | { 95 | "name": "Hadoop:service=NodeManager,name=JvmMetrics", 96 | "modelerType": "JvmMetrics", 97 | "tag.Context": "jvm", 98 | "tag.ProcessName": "NodeManager", 99 | "tag.SessionId": null, 100 | "tag.Hostname": "yh-shhd-cdh04", 101 | "MemNonHeapUsedM": 114.80347, 102 | "MemNonHeapCommittedM": 117.72266, 103 | "MemNonHeapMaxM": -1, 104 | "MemHeapUsedM": 83.01589, 105 | "MemHeapCommittedM": 989.875, 106 | "MemHeapMaxM": 989.875, 107 | "MemMaxM": 989.875, 108 | "GcCountParNew": 131940, 109 | "GcTimeMillisParNew": 1707030, 110 | "GcCountConcurrentMarkSweep": 8, 111 | "GcTimeMillisConcurrentMarkSweep": 382, 112 | "GcCount": 131948, 113 | "GcTimeMillis": 1707412, 114 | "GcNumWarnThresholdExceeded": 0, 115 | "GcNumInfoThresholdExceeded": 0, 116 | 
"GcTotalExtraSleepTime": 22756, 117 | "ThreadsNew": 0, 118 | "ThreadsRunnable": 180, 119 | "ThreadsBlocked": 0, 120 | "ThreadsWaiting": 116, 121 | "ThreadsTimedWaiting": 62, 122 | "ThreadsTerminated": 0, 123 | "LogFatal": 0, 124 | "LogError": 0, 125 | "LogWarn": 0, 126 | "LogInfo": 0 127 | }, 128 | { 129 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000001", 130 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000001" 131 | }, 132 | { 133 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e14_1586867223681_12174_01_000012", 134 | "modelerType": "ContainerResource_container_e14_1586867223681_12174_01_000012" 135 | }, 136 | { 137 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000006", 138 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000006" 139 | }, 140 | { 141 | "name": "Hadoop:service=NodeManager,name=RpcDetailedActivityForPort8041", 142 | "modelerType": "RpcDetailedActivityForPort8041", 143 | "tag.port": "8041", 144 | "tag.Context": "rpcdetailed", 145 | "tag.Hostname": "yh-shhd-cdh04", 146 | "StartContainersNumOps": 19285, 147 | "StartContainersAvgTime": 0, 148 | "StopContainersNumOps": 6017, 149 | "StopContainersAvgTime": 0.3333333333333333 150 | }, 151 | { 152 | "name": "Hadoop:service=NodeManager,name=UgiMetrics", 153 | "modelerType": "UgiMetrics", 154 | "tag.Context": "ugi", 155 | "tag.Hostname": "yh-shhd-cdh04", 156 | "LoginSuccessNumOps": 0, 157 | "LoginSuccessAvgTime": 0, 158 | "LoginFailureNumOps": 0, 159 | "LoginFailureAvgTime": 0, 160 | "GetGroupsNumOps": 0, 161 | "GetGroupsAvgTime": 0, 162 | "RenewalFailuresTotal": 0, 163 | "RenewalFailures": 0 164 | }, 165 | { 166 | "name": "Hadoop:service=NodeManager,name=RpcActivityForPort8041", 167 | "modelerType": "RpcActivityForPort8041", 168 | "tag.port": "8041", 169 | "tag.Context": "rpc", 170 | "tag.NumOpenConnectionsPerUser": 
"{\"appattempt_1586867223681_12177_000001\":1}", 171 | "tag.Hostname": "yh-shhd-cdh04", 172 | "ReceivedBytes": 142931721, 173 | "SentBytes": 7815857, 174 | "RpcQueueTimeNumOps": 25302, 175 | "RpcQueueTimeAvgTime": 0, 176 | "RpcProcessingTimeNumOps": 25302, 177 | "RpcProcessingTimeAvgTime": 0, 178 | "RpcAuthenticationFailures": 0, 179 | "RpcAuthenticationSuccesses": 25302, 180 | "RpcAuthorizationFailures": 0, 181 | "RpcAuthorizationSuccesses": 25302, 182 | "RpcSlowCalls": 0, 183 | "RpcClientBackoff": 0, 184 | "NumOpenConnections": 1, 185 | "CallQueueLength": 0, 186 | "NumDroppedConnections": 0 187 | }, 188 | { 189 | "name": "Hadoop:service=NodeManager,name=ShuffleMetrics", 190 | "modelerType": "ShuffleMetrics", 191 | "tag.Context": "mapred", 192 | "tag.Hostname": "yh-shhd-cdh04", 193 | "ShuffleOutputBytes": 3738069132, 194 | "ShuffleOutputsFailed": 0, 195 | "ShuffleOutputsOK": 4272, 196 | "ShuffleConnections": 9183 197 | }, 198 | { 199 | "name": "Hadoop:service=NodeManager,name=MetricsSystem,sub=Stats", 200 | "modelerType": "MetricsSystem,sub=Stats", 201 | "tag.Context": "metricssystem", 202 | "tag.Hostname": "yh-shhd-cdh04", 203 | "NumActiveSources": 20, 204 | "NumAllSources": 20, 205 | "NumActiveSinks": 0, 206 | "NumAllSinks": 0, 207 | "SnapshotNumOps": 0, 208 | "SnapshotAvgTime": 0, 209 | "PublishNumOps": 0, 210 | "PublishAvgTime": 0, 211 | "DroppedPubAll": 0 212 | }, 213 | { 214 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000005", 215 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000005" 216 | } 217 | ] 218 | } 219 | -------------------------------------------------------------------------------- /examples/ResouceManager.json: -------------------------------------------------------------------------------- 1 | { 2 | "beans": [ 3 | { 4 | "name": "Hadoop:service=ResourceManager,name=RMNMInfo", 5 | "modelerType": "org.apache.hadoop.yarn.server.resourcemanager.RMNMInfo", 6 | 
"LiveNodeManagers": "[{\"HostName\":\"yh-shhd-cdh05\",\"Rack\":\"/default\",\"State\":\"RUNNING\",\"NodeId\":\"yh-shhd-cdh05:8041\",\"NodeHTTPAddress\":\"yh-shhd-cdh05:8042\",\"LastHealthUpdate\":1587396542915,\"HealthReport\":\"\",\"NodeManagerVersion\":\"2.6.0-cdh5.14.2\",\"NumContainers\":5,\"UsedMemoryMB\":10240,\"AvailableMemoryMB\":237070},{\"HostName\":\"yh-shhd-cdh01\",\"Rack\":\"/default\",\"State\":\"RUNNING\",\"NodeId\":\"yh-shhd-cdh01:8041\",\"NodeHTTPAddress\":\"yh-shhd-cdh01:8042\",\"LastHealthUpdate\":1587396543467,\"HealthReport\":\"\",\"NodeManagerVersion\":\"2.6.0-cdh5.14.2\",\"NumContainers\":8,\"UsedMemoryMB\":46080,\"AvailableMemoryMB\":201230},{\"HostName\":\"yh-shhd-cdh03\",\"Rack\":\"/default\",\"State\":\"RUNNING\",\"NodeId\":\"yh-shhd-cdh03:8041\",\"NodeHTTPAddress\":\"yh-shhd-cdh03:8042\",\"LastHealthUpdate\":1587396542872,\"HealthReport\":\"\",\"NodeManagerVersion\":\"2.6.0-cdh5.14.2\",\"NumContainers\":8,\"UsedMemoryMB\":62976,\"AvailableMemoryMB\":184334},{\"HostName\":\"yh-shhd-cdh04\",\"Rack\":\"/default\",\"State\":\"RUNNING\",\"NodeId\":\"yh-shhd-cdh04:8041\",\"NodeHTTPAddress\":\"yh-shhd-cdh04:8042\",\"LastHealthUpdate\":1587396543105,\"HealthReport\":\"\",\"NodeManagerVersion\":\"2.6.0-cdh5.14.2\",\"NumContainers\":10,\"UsedMemoryMB\":20480,\"AvailableMemoryMB\":226830},{\"HostName\":\"yh-shhd-cdh02\",\"Rack\":\"/default\",\"State\":\"RUNNING\",\"NodeId\":\"yh-shhd-cdh02:8041\",\"NodeHTTPAddress\":\"yh-shhd-cdh02:8042\",\"LastHealthUpdate\":1587396543355,\"HealthReport\":\"\",\"NodeManagerVersion\":\"2.6.0-cdh5.14.2\",\"NumContainers\":0,\"UsedMemoryMB\":0,\"AvailableMemoryMB\":247310}]" 7 | }, 8 | { 9 | "name": "Hadoop:service=ResourceManager,name=RpcActivityForPort8033", 10 | "modelerType": "RpcActivityForPort8033", 11 | "tag.port": "8033", 12 | "tag.Context": "rpc", 13 | "tag.NumOpenConnectionsPerUser": "{}", 14 | "tag.Hostname": "yh-shhd-cdh04", 15 | "ReceivedBytes": 2697, 16 | "SentBytes": 536, 17 | "RpcQueueTimeNumOps": 15, 
18 | "RpcQueueTimeAvgTime": 0, 19 | "RpcProcessingTimeNumOps": 15, 20 | "RpcProcessingTimeAvgTime": 0, 21 | "RpcAuthenticationFailures": 0, 22 | "RpcAuthenticationSuccesses": 0, 23 | "RpcAuthorizationFailures": 0, 24 | "RpcAuthorizationSuccesses": 15, 25 | "RpcSlowCalls": 0, 26 | "RpcClientBackoff": 0, 27 | "NumOpenConnections": 0, 28 | "CallQueueLength": 0, 29 | "NumDroppedConnections": 0 30 | }, 31 | { 32 | "name": "Hadoop:service=ResourceManager,name=RpcActivityForPort8031", 33 | "modelerType": "RpcActivityForPort8031", 34 | "tag.port": "8031", 35 | "tag.Context": "rpc", 36 | "tag.NumOpenConnectionsPerUser": "{\"yarn\":5}", 37 | "tag.Hostname": "yh-shhd-cdh04", 38 | "ReceivedBytes": 1198590757, 39 | "SentBytes": 118872873, 40 | "RpcQueueTimeNumOps": 2703983, 41 | "RpcQueueTimeAvgTime": 0.044444444444444446, 42 | "RpcProcessingTimeNumOps": 2703983, 43 | "RpcProcessingTimeAvgTime": 0.044444444444444446, 44 | "RpcAuthenticationFailures": 0, 45 | "RpcAuthenticationSuccesses": 0, 46 | "RpcAuthorizationFailures": 0, 47 | "RpcAuthorizationSuccesses": 5, 48 | "RpcSlowCalls": 0, 49 | "RpcClientBackoff": 0, 50 | "NumOpenConnections": 5, 51 | "CallQueueLength": 0, 52 | "NumDroppedConnections": 0 53 | }, 54 | { 55 | "name": "Hadoop:service=ResourceManager,name=RpcDetailedActivityForPort8032", 56 | "modelerType": "RpcDetailedActivityForPort8032", 57 | "tag.port": "8032", 58 | "tag.Context": "rpcdetailed", 59 | "tag.Hostname": "yh-shhd-cdh04", 60 | "GetNewApplicationNumOps": 12184, 61 | "GetNewApplicationAvgTime": 0, 62 | "GetClusterMetricsNumOps": 2958, 63 | "GetClusterMetricsAvgTime": 0, 64 | "ForceKillApplicationNumOps": 1, 65 | "ForceKillApplicationAvgTime": 2, 66 | "GetQueueInfoNumOps": 1764, 67 | "GetQueueInfoAvgTime": 0, 68 | "ApplicationNotFoundExceptionNumOps": 10218, 69 | "ApplicationNotFoundExceptionAvgTime": 0.011627906976744186, 70 | "SubmitApplicationNumOps": 12183, 71 | "SubmitApplicationAvgTime": 0, 72 | "GetApplicationReportNumOps": 1359839, 73 | 
"GetApplicationReportAvgTime": 0 74 | }, 75 | { 76 | "name": "Hadoop:service=ResourceManager,name=MetricsSystem,sub=Control", 77 | "modelerType": "org.apache.hadoop.metrics2.impl.MetricsSystemImpl" 78 | }, 79 | { 80 | "name": "Hadoop:service=ResourceManager,name=UgiMetrics", 81 | "modelerType": "UgiMetrics", 82 | "tag.Context": "ugi", 83 | "tag.Hostname": "yh-shhd-cdh04", 84 | "LoginSuccessNumOps": 0, 85 | "LoginSuccessAvgTime": 0, 86 | "LoginFailureNumOps": 0, 87 | "LoginFailureAvgTime": 0, 88 | "GetGroupsNumOps": 0, 89 | "GetGroupsAvgTime": 0, 90 | "RenewalFailuresTotal": 0, 91 | "RenewalFailures": 0 92 | }, 93 | { 94 | "name": "Hadoop:service=ResourceManager,name=MetricsSystem,sub=Stats", 95 | "modelerType": "MetricsSystem,sub=Stats", 96 | "tag.Context": "metricssystem", 97 | "tag.Hostname": "yh-shhd-cdh04", 98 | "NumActiveSources": 20, 99 | "NumAllSources": 20, 100 | "NumActiveSinks": 0, 101 | "NumAllSinks": 0, 102 | "SnapshotNumOps": 0, 103 | "SnapshotAvgTime": 0, 104 | "PublishNumOps": 0, 105 | "PublishAvgTime": 0, 106 | "DroppedPubAll": 0 107 | }, 108 | { 109 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,q1=default", 110 | "modelerType": "QueueMetrics,q0=root,q1=default", 111 | "tag.Queue": "root.default", 112 | "tag.Context": "yarn", 113 | "tag.Hostname": "yh-shhd-cdh04", 114 | "running_0": 3, 115 | "running_60": 0, 116 | "running_300": 0, 117 | "running_1440": 3, 118 | "FairShareMB": 1236550, 119 | "FairShareVCores": 400, 120 | "SteadyFairShareMB": 618275, 121 | "SteadyFairShareVCores": 200, 122 | "MinShareMB": 0, 123 | "MinShareVCores": 0, 124 | "MaxShareMB": 2147483647, 125 | "MaxShareVCores": 2147483647, 126 | "MaxApps": 2147483647, 127 | "MaxAMShareMB": 0, 128 | "MaxAMShareVCores": 0, 129 | "AmResourceUsageMB": 0, 130 | "AmResourceUsageVCores": 0, 131 | "AppsSubmitted": 12190, 132 | "AppsRunning": 6, 133 | "AppsPending": 0, 134 | "AppsCompleted": 12184, 135 | "AppsKilled": 0, 136 | "AppsFailed": 0, 137 | "AllocatedMB": 139776, 138 
| "AllocatedVCores": 68, 139 | "AllocatedContainers": 31, 140 | "AggregateContainersAllocated": 104769, 141 | "AggregateContainersReleased": 104738, 142 | "AvailableMB": 0, 143 | "AggregateContainersPreempted": 0, 144 | "AvailableVCores": 0, 145 | "PendingMB": 0, 146 | "PendingVCores": 0, 147 | "PendingContainers": 0, 148 | "ReservedMB": 0, 149 | "ReservedVCores": 0, 150 | "ReservedContainers": 0, 151 | "ActiveUsers": 0, 152 | "ActiveApplications": 0, 153 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 154 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 155 | }, 156 | { 157 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,q1=default,user=work", 158 | "modelerType": "QueueMetrics,q0=root,q1=default,user=work", 159 | "tag.Queue": "root.default", 160 | "tag.User": "work", 161 | "tag.Context": "yarn", 162 | "tag.Hostname": "yh-shhd-cdh04", 163 | "running_0": 3, 164 | "running_60": 0, 165 | "running_300": 0, 166 | "running_1440": 3, 167 | "AppsSubmitted": 12190, 168 | "AppsRunning": 6, 169 | "AppsPending": 0, 170 | "AppsCompleted": 12184, 171 | "AppsKilled": 0, 172 | "AppsFailed": 0, 173 | "AllocatedMB": 139776, 174 | "AllocatedVCores": 68, 175 | "AllocatedContainers": 31, 176 | "AggregateContainersAllocated": 104769, 177 | "AggregateContainersReleased": 104738, 178 | "AvailableMB": 0, 179 | "AggregateContainersPreempted": 0, 180 | "AvailableVCores": 0, 181 | "PendingMB": 0, 182 | "PendingVCores": 0, 183 | "PendingContainers": 0, 184 | "ReservedMB": 0, 185 | "ReservedVCores": 0, 186 | "ReservedContainers": 0, 187 | "ActiveUsers": 0, 188 | "ActiveApplications": 0, 189 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 190 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 191 | }, 192 | { 193 | "name": "Hadoop:service=ResourceManager,name=JvmMetrics", 194 | "modelerType": "JvmMetrics", 195 | "tag.Context": "jvm", 196 | "tag.ProcessName": "ResourceManager", 197 | "tag.SessionId": null, 198 | "tag.Hostname": "yh-shhd-cdh04", 199 | 
"MemNonHeapUsedM": 112.60955, 200 | "MemNonHeapCommittedM": 115.55078, 201 | "MemNonHeapMaxM": -1, 202 | "MemHeapUsedM": 556.7262, 203 | "MemHeapCommittedM": 989.875, 204 | "MemHeapMaxM": 989.875, 205 | "MemMaxM": 989.875, 206 | "GcCountParNew": 793, 207 | "GcTimeMillisParNew": 5257, 208 | "GcCountConcurrentMarkSweep": 4, 209 | "GcTimeMillisConcurrentMarkSweep": 167, 210 | "GcCount": 797, 211 | "GcTimeMillis": 5424, 212 | "GcNumWarnThresholdExceeded": 0, 213 | "GcNumInfoThresholdExceeded": 0, 214 | "GcTotalExtraSleepTime": 1606, 215 | "ThreadsNew": 0, 216 | "ThreadsRunnable": 19, 217 | "ThreadsBlocked": 0, 218 | "ThreadsWaiting": 62, 219 | "ThreadsTimedWaiting": 179, 220 | "ThreadsTerminated": 0, 221 | "LogFatal": 0, 222 | "LogError": 0, 223 | "LogWarn": 0, 224 | "LogInfo": 0 225 | }, 226 | { 227 | "name": "Hadoop:service=ResourceManager,name=RpcDetailedActivityForPort8030", 228 | "modelerType": "RpcDetailedActivityForPort8030", 229 | "tag.port": "8030", 230 | "tag.Context": "rpcdetailed", 231 | "tag.Hostname": "yh-shhd-cdh04", 232 | "ApplicationMasterNotRegisteredExceptionNumOps": 7, 233 | "ApplicationMasterNotRegisteredExceptionAvgTime": 1.8571428571428572, 234 | "RegisterApplicationMasterNumOps": 12210, 235 | "RegisterApplicationMasterAvgTime": 0, 236 | "AllocateNumOps": 850467, 237 | "AllocateAvgTime": 0.125, 238 | "FinishApplicationMasterNumOps": 24382, 239 | "FinishApplicationMasterAvgTime": 0 240 | }, 241 | { 242 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,q1=default,q2=work,user=work", 243 | "modelerType": "QueueMetrics,q0=root,q1=default,q2=work,user=work", 244 | "tag.Queue": "root.default.work", 245 | "tag.User": "work", 246 | "tag.Context": "yarn", 247 | "tag.Hostname": "yh-shhd-cdh04", 248 | "running_0": 3, 249 | "running_60": 0, 250 | "running_300": 0, 251 | "running_1440": 3, 252 | "AppsSubmitted": 12190, 253 | "AppsRunning": 6, 254 | "AppsPending": 0, 255 | "AppsCompleted": 12184, 256 | "AppsKilled": 0, 257 | "AppsFailed": 0, 
258 | "AllocatedMB": 139776, 259 | "AllocatedVCores": 68, 260 | "AllocatedContainers": 31, 261 | "AggregateContainersAllocated": 104769, 262 | "AggregateContainersReleased": 104738, 263 | "AvailableMB": 0, 264 | "AggregateContainersPreempted": 0, 265 | "AvailableVCores": 0, 266 | "PendingMB": 0, 267 | "PendingVCores": 0, 268 | "PendingContainers": 0, 269 | "ReservedMB": 0, 270 | "ReservedVCores": 0, 271 | "ReservedContainers": 0, 272 | "ActiveUsers": 0, 273 | "ActiveApplications": 0, 274 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 275 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 276 | }, 277 | { 278 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,user=dr.who", 279 | "modelerType": "QueueMetrics,q0=root,user=dr.who", 280 | "tag.Queue": "root", 281 | "tag.User": "dr.who", 282 | "tag.Context": "yarn", 283 | "tag.Hostname": "yh-shhd-cdh04", 284 | "running_0": 0, 285 | "running_60": 0, 286 | "running_300": 0, 287 | "running_1440": 0, 288 | "AppsSubmitted": 0, 289 | "AppsRunning": 0, 290 | "AppsPending": 0, 291 | "AppsCompleted": 0, 292 | "AppsKilled": 0, 293 | "AppsFailed": 0, 294 | "AllocatedMB": 0, 295 | "AllocatedVCores": 0, 296 | "AllocatedContainers": 0, 297 | "AggregateContainersAllocated": 0, 298 | "AggregateContainersReleased": 0, 299 | "AvailableMB": 0, 300 | "AggregateContainersPreempted": 0, 301 | "AvailableVCores": 0, 302 | "PendingMB": 0, 303 | "PendingVCores": 0, 304 | "PendingContainers": 0, 305 | "ReservedMB": 0, 306 | "ReservedVCores": 0, 307 | "ReservedContainers": 0, 308 | "ActiveUsers": 0, 309 | "ActiveApplications": 0, 310 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 311 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 312 | }, 313 | { 314 | "name": "Hadoop:service=ResourceManager,name=RpcActivityForPort8030", 315 | "modelerType": "RpcActivityForPort8030", 316 | "tag.port": "8030", 317 | "tag.Context": "rpc", 318 | "tag.NumOpenConnectionsPerUser": 
"{\"appattempt_1586867223681_12172_000001\":1,\"appattempt_1586867223681_12175_000001\":1,\"appattempt_1586867223681_12184_000001\":1,\"appattempt_1586867223681_0054_000001\":1,\"appattempt_1586466344459_0110_000001\":1,\"appattempt_1586867223681_0055_000001\":1}", 319 | "tag.Hostname": "yh-shhd-cdh04", 320 | "ReceivedBytes": 109416252, 321 | "SentBytes": 82031221, 322 | "RpcQueueTimeNumOps": 887066, 323 | "RpcQueueTimeAvgTime": 0, 324 | "RpcProcessingTimeNumOps": 887066, 325 | "RpcProcessingTimeAvgTime": 0.1111111111111111, 326 | "RpcAuthenticationFailures": 0, 327 | "RpcAuthenticationSuccesses": 12210, 328 | "RpcAuthorizationFailures": 0, 329 | "RpcAuthorizationSuccesses": 12210, 330 | "RpcSlowCalls": 0, 331 | "RpcClientBackoff": 0, 332 | "NumOpenConnections": 6, 333 | "CallQueueLength": 0, 334 | "NumDroppedConnections": 0 335 | }, 336 | { 337 | "name": "Hadoop:service=ResourceManager,name=FSOpDurations", 338 | "modelerType": "FSOpDurations", 339 | "tag.FSOpDurations": "FSOpDurations", 340 | "tag.Context": "fairscheduler-op-durations", 341 | "tag.Hostname": "yh-shhd-cdh04", 342 | "ContinuousSchedulingRunNumOps": 0, 343 | "ContinuousSchedulingRunAvgTime": 0, 344 | "ContinuousSchedulingRunStdevTime": 0, 345 | "ContinuousSchedulingRunIMinTime": 3.4028234663852886e38, 346 | "ContinuousSchedulingRunIMaxTime": 1.401298464324817e-45, 347 | "ContinuousSchedulingRunMinTime": 3.4028234663852886e38, 348 | "ContinuousSchedulingRunMaxTime": 1.401298464324817e-45, 349 | "ContinuousSchedulingRunINumOps": 0, 350 | "NodeUpdateCallNumOps": 2703973, 351 | "NodeUpdateCallAvgTime": 0.08888888888888889, 352 | "NodeUpdateCallStdevTime": 0.35816592283860327, 353 | "NodeUpdateCallIMinTime": 0, 354 | "NodeUpdateCallIMaxTime": 2, 355 | "NodeUpdateCallMinTime": 0, 356 | "NodeUpdateCallMaxTime": 467, 357 | "NodeUpdateCallINumOps": 45, 358 | "UpdateThreadRunNumOps": 1058477, 359 | "UpdateThreadRunAvgTime": 0, 360 | "UpdateThreadRunStdevTime": 0, 361 | "UpdateThreadRunIMinTime": 0, 362 | 
"UpdateThreadRunIMaxTime": 1.401298464324817e-45, 363 | "UpdateThreadRunMinTime": 0, 364 | "UpdateThreadRunMaxTime": 489, 365 | "UpdateThreadRunINumOps": 17, 366 | "UpdateCallNumOps": 1058477, 367 | "UpdateCallAvgTime": 0, 368 | "UpdateCallStdevTime": 0, 369 | "UpdateCallIMinTime": 0, 370 | "UpdateCallIMaxTime": 1.401298464324817e-45, 371 | "UpdateCallMinTime": 0, 372 | "UpdateCallMaxTime": 489, 373 | "UpdateCallINumOps": 17, 374 | "PreemptCallNumOps": 0, 375 | "PreemptCallAvgTime": 0, 376 | "PreemptCallStdevTime": 0, 377 | "PreemptCallIMinTime": 3.4028234663852886e38, 378 | "PreemptCallIMaxTime": 1.401298464324817e-45, 379 | "PreemptCallMinTime": 3.4028234663852886e38, 380 | "PreemptCallMaxTime": 1.401298464324817e-45, 381 | "PreemptCallINumOps": 0 382 | }, 383 | { 384 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,q1=users", 385 | "modelerType": "QueueMetrics,q0=root,q1=users", 386 | "tag.Queue": "root.users", 387 | "tag.Context": "yarn", 388 | "tag.Hostname": "yh-shhd-cdh04", 389 | "running_0": 0, 390 | "running_60": 0, 391 | "running_300": 0, 392 | "running_1440": 0, 393 | "FairShareMB": 0, 394 | "FairShareVCores": 0, 395 | "SteadyFairShareMB": 618275, 396 | "SteadyFairShareVCores": 200, 397 | "MinShareMB": 0, 398 | "MinShareVCores": 0, 399 | "MaxShareMB": 2147483647, 400 | "MaxShareVCores": 2147483647, 401 | "MaxApps": 2147483647, 402 | "MaxAMShareMB": 0, 403 | "MaxAMShareVCores": 0, 404 | "AmResourceUsageMB": 0, 405 | "AmResourceUsageVCores": 0, 406 | "AppsSubmitted": 0, 407 | "AppsRunning": 0, 408 | "AppsPending": 0, 409 | "AppsCompleted": 0, 410 | "AppsKilled": 0, 411 | "AppsFailed": 0, 412 | "AllocatedMB": 0, 413 | "AllocatedVCores": 0, 414 | "AllocatedContainers": 0, 415 | "AggregateContainersAllocated": 0, 416 | "AggregateContainersReleased": 0, 417 | "AvailableMB": 0, 418 | "AggregateContainersPreempted": 0, 419 | "AvailableVCores": 0, 420 | "PendingMB": 0, 421 | "PendingVCores": 0, 422 | "PendingContainers": 0, 423 | "ReservedMB": 
0, 424 | "ReservedVCores": 0, 425 | "ReservedContainers": 0, 426 | "ActiveUsers": 0, 427 | "ActiveApplications": 0, 428 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 429 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 430 | }, 431 | { 432 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,q1=default,q2=work", 433 | "modelerType": "QueueMetrics,q0=root,q1=default,q2=work", 434 | "tag.Queue": "root.default.work", 435 | "tag.Context": "yarn", 436 | "tag.Hostname": "yh-shhd-cdh04", 437 | "running_0": 3, 438 | "running_60": 0, 439 | "running_300": 0, 440 | "running_1440": 3, 441 | "FairShareMB": 1236550, 442 | "FairShareVCores": 400, 443 | "SteadyFairShareMB": 618275, 444 | "SteadyFairShareVCores": 200, 445 | "MinShareMB": 0, 446 | "MinShareVCores": 0, 447 | "MaxShareMB": 2147483647, 448 | "MaxShareVCores": 2147483647, 449 | "MaxApps": 2147483647, 450 | "MaxAMShareMB": 618275, 451 | "MaxAMShareVCores": 200, 452 | "AmResourceUsageMB": 20480, 453 | "AmResourceUsageVCores": 7, 454 | "AppsSubmitted": 12190, 455 | "AppsRunning": 6, 456 | "AppsPending": 0, 457 | "AppsCompleted": 12184, 458 | "AppsKilled": 0, 459 | "AppsFailed": 0, 460 | "AllocatedMB": 139776, 461 | "AllocatedVCores": 68, 462 | "AllocatedContainers": 31, 463 | "AggregateContainersAllocated": 104769, 464 | "AggregateContainersReleased": 104738, 465 | "AvailableMB": 0, 466 | "AggregateContainersPreempted": 0, 467 | "AvailableVCores": 0, 468 | "PendingMB": 0, 469 | "PendingVCores": 0, 470 | "PendingContainers": 0, 471 | "ReservedMB": 0, 472 | "ReservedVCores": 0, 473 | "ReservedContainers": 0, 474 | "ActiveUsers": 0, 475 | "ActiveApplications": 0, 476 | "AppAttemptFirstContainerAllocationDelayNumOps": 12205, 477 | "AppAttemptFirstContainerAllocationDelayAvgTime": 834 478 | }, 479 | { 480 | "name": "Hadoop:service=ResourceManager,name=RpcActivityForPort8032", 481 | "modelerType": "RpcActivityForPort8032", 482 | "tag.port": "8032", 483 | "tag.Context": "rpc", 484 | 
"tag.NumOpenConnectionsPerUser": "{\"work\":5,\"yarn\":1}", 485 | "tag.Hostname": "yh-shhd-cdh04", 486 | "ReceivedBytes": 277244591, 487 | "SentBytes": 434732996, 488 | "RpcQueueTimeNumOps": 1399147, 489 | "RpcQueueTimeAvgTime": 0.09523809523809523, 490 | "RpcProcessingTimeNumOps": 1399147, 491 | "RpcProcessingTimeAvgTime": 0, 492 | "RpcAuthenticationFailures": 0, 493 | "RpcAuthenticationSuccesses": 0, 494 | "RpcAuthorizationFailures": 0, 495 | "RpcAuthorizationSuccesses": 124523, 496 | "RpcSlowCalls": 0, 497 | "RpcClientBackoff": 0, 498 | "NumOpenConnections": 6, 499 | "CallQueueLength": 0, 500 | "NumDroppedConnections": 0 501 | }, 502 | { 503 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root", 504 | "modelerType": "QueueMetrics,q0=root", 505 | "tag.Queue": "root", 506 | "tag.Context": "yarn", 507 | "tag.Hostname": "yh-shhd-cdh04", 508 | "running_0": 3, 509 | "running_60": 0, 510 | "running_300": 0, 511 | "running_1440": 3, 512 | "FairShareMB": 1236550, 513 | "FairShareVCores": 400, 514 | "SteadyFairShareMB": 1236550, 515 | "SteadyFairShareVCores": 400, 516 | "MinShareMB": 0, 517 | "MinShareVCores": 0, 518 | "MaxShareMB": 2147483647, 519 | "MaxShareVCores": 2147483647, 520 | "MaxApps": 2147483647, 521 | "MaxAMShareMB": 0, 522 | "MaxAMShareVCores": 0, 523 | "AmResourceUsageMB": 0, 524 | "AmResourceUsageVCores": 0, 525 | "AppsSubmitted": 12190, 526 | "AppsRunning": 6, 527 | "AppsPending": 0, 528 | "AppsCompleted": 12184, 529 | "AppsKilled": 0, 530 | "AppsFailed": 0, 531 | "AllocatedMB": 139776, 532 | "AllocatedVCores": 68, 533 | "AllocatedContainers": 31, 534 | "AggregateContainersAllocated": 104769, 535 | "AggregateContainersReleased": 104738, 536 | "AvailableMB": 1096774, 537 | "AggregateContainersPreempted": 0, 538 | "AvailableVCores": 332, 539 | "PendingMB": 0, 540 | "PendingVCores": 0, 541 | "PendingContainers": 0, 542 | "ReservedMB": 0, 543 | "ReservedVCores": 0, 544 | "ReservedContainers": 0, 545 | "ActiveUsers": 0, 546 | 
"ActiveApplications": 0, 547 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 548 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 549 | }, 550 | { 551 | "name": "Hadoop:service=ResourceManager,name=RpcDetailedActivityForPort8031", 552 | "modelerType": "RpcDetailedActivityForPort8031", 553 | "tag.port": "8031", 554 | "tag.Context": "rpcdetailed", 555 | "tag.Hostname": "yh-shhd-cdh04", 556 | "NodeHeartbeatNumOps": 2703978, 557 | "NodeHeartbeatAvgTime": 0.044444444444444446, 558 | "RegisterNodeManagerNumOps": 5, 559 | "RegisterNodeManagerAvgTime": 29 560 | }, 561 | { 562 | "name": "Hadoop:service=ResourceManager,name=ClusterMetrics", 563 | "modelerType": "ClusterMetrics", 564 | "tag.ClusterMetrics": "ResourceManager", 565 | "tag.Context": "yarn", 566 | "tag.Hostname": "yh-shhd-cdh04", 567 | "NumActiveNMs": 5, 568 | "NumDecommissioningNMs": 0, 569 | "NumDecommissionedNMs": 0, 570 | "NumLostNMs": 0, 571 | "NumUnhealthyNMs": 0, 572 | "NumRebootedNMs": 0, 573 | "AMLaunchDelayNumOps": 12203, 574 | "AMLaunchDelayAvgTime": 5, 575 | "AMRegisterDelayNumOps": 12210, 576 | "AMRegisterDelayAvgTime": 3199 577 | }, 578 | { 579 | "name": "Hadoop:service=ResourceManager,name=RpcDetailedActivityForPort8033", 580 | "modelerType": "RpcDetailedActivityForPort8033", 581 | "tag.port": "8033", 582 | "tag.Context": "rpcdetailed", 583 | "tag.Hostname": "yh-shhd-cdh04", 584 | "GetServiceStatusNumOps": 14, 585 | "GetServiceStatusAvgTime": 0, 586 | "TransitionToActiveNumOps": 0, 587 | "TransitionToActiveAvgTime": 0, 588 | "TransitionToStandbyNumOps": 0, 589 | "TransitionToStandbyAvgTime": 0, 590 | "MonitorHealthNumOps": 1, 591 | "MonitorHealthAvgTime": 0 592 | }, 593 | { 594 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,user=work", 595 | "modelerType": "QueueMetrics,q0=root,user=work", 596 | "tag.Queue": "root", 597 | "tag.User": "work", 598 | "tag.Context": "yarn", 599 | "tag.Hostname": "yh-shhd-cdh04", 600 | "running_0": 3, 601 | "running_60": 0, 602 | 
"running_300": 0, 603 | "running_1440": 3, 604 | "AppsSubmitted": 12190, 605 | "AppsRunning": 6, 606 | "AppsPending": 0, 607 | "AppsCompleted": 12184, 608 | "AppsKilled": 0, 609 | "AppsFailed": 0, 610 | "AllocatedMB": 139776, 611 | "AllocatedVCores": 68, 612 | "AllocatedContainers": 31, 613 | "AggregateContainersAllocated": 104769, 614 | "AggregateContainersReleased": 104738, 615 | "AvailableMB": 0, 616 | "AggregateContainersPreempted": 0, 617 | "AvailableVCores": 0, 618 | "PendingMB": 0, 619 | "PendingVCores": 0, 620 | "PendingContainers": 0, 621 | "ReservedMB": 0, 622 | "ReservedVCores": 0, 623 | "ReservedContainers": 0, 624 | "ActiveUsers": 0, 625 | "ActiveApplications": 0, 626 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 627 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 628 | } 629 | ] 630 | } 631 | -------------------------------------------------------------------------------- /hadoop_jmx_exporter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import time 5 | from prometheus_client import start_http_server 6 | from prometheus_client.core import REGISTRY 7 | 8 | import utils 9 | from utils import get_module_logger 10 | from hdfs_namenode import NameNodeMetricCollector 11 | from hdfs_datanode import DataNodeMetricCollector 12 | from hdfs_journalnode import JournalNodeMetricCollector 13 | from yarn_resourcemanager import ResourceManagerMetricCollector 14 | from yarn_nodemanager import NodeManagerMetricCollector 15 | 16 | logger = get_module_logger(__name__) 17 | 18 | 19 | def register_prometheus(cluster, args): 20 | if args.nns is not None and len(args.nns) > 0: 21 | nnc = NameNodeMetricCollector(cluster, args.nns) 22 | nnc.collect() 23 | REGISTRY.register(nnc) 24 | REGISTRY.register(DataNodeMetricCollector(cluster, nnc)) 25 | if args.rms is not None and len(args.rms) > 0: 26 | rmc = ResourceManagerMetricCollector(cluster, args.rms, args.queue) 
27 | rmc.collect() 28 | REGISTRY.register(rmc) 29 | REGISTRY.register(NodeManagerMetricCollector(cluster, rmc)) 30 | if args.jns is not None and len(args.jns) > 0: 31 | REGISTRY.register(JournalNodeMetricCollector(cluster, args.jns)) 32 | def main(): 33 | args = utils.parse_args() 34 | host = args.host 35 | port = int(args.port) 36 | start_http_server(port, host) 37 | print "Listen at %s:%s" % (host, port) 38 | register_prometheus(args.cluster, args) 39 | while True: 40 | time.sleep(300) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /hdfs_datanode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import yaml 5 | import re 6 | from prometheus_client.core import GaugeMetricFamily 7 | 8 | from utils import get_module_logger 9 | from common import MetricCollector, CommonMetricCollector 10 | from scraper import ScrapeMetrics 11 | 12 | logger = get_module_logger(__name__) 13 | 14 | 15 | class DataNodeMetricCollector(MetricCollector): 16 | def __init__(self, cluster, nnc): 17 | MetricCollector.__init__(self, cluster, "hdfs", "datanode") 18 | self.target = "-" 19 | self.nnc = nnc 20 | 21 | self.hadoop_datanode_metrics = {} 22 | for i in range(len(self.file_list)): 23 | self.hadoop_datanode_metrics.setdefault(self.file_list[i], {}) 24 | 25 | self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "datanode") 26 | 27 | def collect(self): 28 | isSetup = False 29 | if self.nnc.dns == "": 30 | return 31 | beans_list = ScrapeMetrics(self.nnc.dns).scrape() 32 | for beans in beans_list: 33 | if not isSetup: 34 | self.common_metric_collector.setup_labels(beans) 35 | self.setup_metrics_labels(beans) 36 | isSetup = True 37 | for i in range(len(beans)): 38 | if 'tag.Hostname' in beans[i]: 39 | self.target = beans[i]["tag.Hostname"] 40 | break 41 | 
self.hadoop_datanode_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) 42 | self.get_metrics(beans) 43 | 44 | for i in range(len(self.merge_list)): 45 | service = self.merge_list[i] 46 | if service in self.hadoop_datanode_metrics: 47 | for metric in self.hadoop_datanode_metrics[service]: 48 | yield self.hadoop_datanode_metrics[service][metric] 49 | 50 | def setup_dninfo_labels(self): 51 | for metric in self.metrics['DataNodeInfo']: 52 | if 'VolumeInfo' in metric: 53 | label = ["cluster", "version", "path", "state"] 54 | name = "_".join([self.prefix, 'volume_state']) 55 | else: 56 | label = ["cluster", "version"] 57 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 58 | name = "_".join([self.prefix, snake_case]) 59 | label.append("_target") 60 | self.hadoop_datanode_metrics['DataNodeInfo'][metric] = GaugeMetricFamily(name, self.metrics['DataNodeInfo'][metric], labels=label) 61 | 62 | def setup_dnactivity_labels(self): 63 | block_flag, client_flag = 1, 1 64 | for metric in self.metrics['DataNodeActivity']: 65 | if 'Blocks' in metric: 66 | if block_flag: 67 | label = ['cluster', 'host', 'oper'] 68 | key = "Blocks" 69 | name = "block_operations_total" 70 | descriptions = "Total number of blocks in different oprations" 71 | block_flag = 0 72 | else: 73 | continue 74 | elif 'Client' in metric: 75 | if client_flag: 76 | label = ['cluster', 'host', 'oper', 'client'] 77 | key = "Client" 78 | name = "from_client_total" 79 | descriptions = "Total number of each operations from different client" 80 | client_flag = 0 81 | else: 82 | continue 83 | else: 84 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 85 | label = ['cluster', 'host'] 86 | key = metric 87 | name = snake_case 88 | descriptions = self.metrics['DataNodeActivity'][metric] 89 | label.append("_target") 90 | self.hadoop_datanode_metrics['DataNodeActivity'][key] = GaugeMetricFamily("_".join([self.prefix, name]), descriptions, labels=label) 91 | 92 | def 
setup_fsdatasetstate_labels(self): 93 | for metric in self.metrics['FSDatasetState']: 94 | label = ['cluster', 'host', "_target"] 95 | if "Num" in metric: 96 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric.split("Num")[1]).lower() 97 | else: 98 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 99 | name = "_".join([self.prefix, snake_case]) 100 | self.hadoop_datanode_metrics['FSDatasetState'][metric] = GaugeMetricFamily(name, self.metrics['FSDatasetState'][metric], labels=label) 101 | 102 | def setup_metrics_labels(self, beans): 103 | for i in range(len(beans)): 104 | if 'DataNodeInfo' in beans[i]['name']: 105 | self.setup_dninfo_labels() 106 | if 'DataNodeActivity' in beans[i]['name']: 107 | self.setup_dnactivity_labels() 108 | if 'FSDatasetState' in beans[i]['name']: 109 | self.setup_fsdatasetstate_labels() 110 | 111 | def get_dninfo_metrics(self, bean): 112 | for metric in self.metrics['DataNodeInfo']: 113 | version = bean['Version'] 114 | if 'VolumeInfo' in metric: 115 | if 'VolumeInfo' in bean: 116 | volume_info_dict = yaml.safe_load(bean['VolumeInfo']) 117 | for k, v in volume_info_dict.items(): 118 | path = k 119 | for key, val in v.items(): 120 | if key != "storageType": 121 | state = key 122 | label = [self.cluster, version, path, state, self.target] 123 | value = val 124 | self.hadoop_datanode_metrics['DataNodeInfo'][metric].add_metric(label, value) 125 | else: 126 | continue 127 | else: 128 | label = [self.cluster, version, self.target] 129 | value = bean[metric] 130 | self.hadoop_datanode_metrics['DataNodeInfo'][metric].add_metric(label, value) 131 | 132 | def get_dnactivity_metrics(self, bean): 133 | for metric in self.metrics['DataNodeActivity']: 134 | host = bean['tag.Hostname'] 135 | label = [self.cluster, host] 136 | if 'Blocks' in metric: 137 | oper = metric.split("Blocks")[1] 138 | label.append(oper) 139 | key = "Blocks" 140 | elif 'Client' in metric: 141 | oper = metric.split("Client")[0].split("From")[0] 142 | 
client = metric.split("Client")[0].split("From")[1] 143 | label.extend([oper, client]) 144 | key = "Client" 145 | else: 146 | key = metric 147 | label.append(self.target) 148 | self.hadoop_datanode_metrics['DataNodeActivity'][key].add_metric(label, bean[metric] if metric in bean else 0) 149 | 150 | def get_fsdatasetstate_metrics(self, bean): 151 | for metric in self.metrics['FSDatasetState']: 152 | label = [self.cluster, self.target, self.target] 153 | self.hadoop_datanode_metrics['FSDatasetState'][metric].add_metric( 154 | label, bean[metric] if metric in bean else 0) 155 | 156 | def get_metrics(self, beans): 157 | for i in range(len(beans)): 158 | if 'DataNodeInfo' in beans[i]['name']: 159 | self.get_dninfo_metrics(beans[i]) 160 | if 'DataNodeActivity' in beans[i]['name']: 161 | self.get_dnactivity_metrics(beans[i]) 162 | if 'FSDatasetState' in beans[i]['name']: 163 | self.get_fsdatasetstate_metrics(beans[i]) 164 | -------------------------------------------------------------------------------- /hdfs_journalnode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from prometheus_client.core import GaugeMetricFamily, HistogramMetricFamily 6 | 7 | from utils import get_module_logger 8 | from common import MetricCollector, CommonMetricCollector 9 | from scraper import ScrapeMetrics 10 | 11 | logger = get_module_logger(__name__) 12 | 13 | 14 | class JournalNodeMetricCollector(MetricCollector): 15 | def __init__(self, cluster, urls): 16 | MetricCollector.__init__(self, cluster, "hdfs", "journalnode") 17 | self.target = "-" 18 | self.urls = urls 19 | 20 | self.hadoop_journalnode_metrics = {} 21 | for i in range(len(self.file_list)): 22 | self.hadoop_journalnode_metrics.setdefault(self.file_list[i], {}) 23 | 24 | self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "journalnode") 25 | 26 | self.scrape_metrics = ScrapeMetrics(urls) 27 | 28 | def 
def collect(self):
    """Scrape every JournalNode URL and yield the collected metric families."""
    labels_ready = False
    for beans in self.scrape_metrics.scrape():
        if not labels_ready:
            # Metric families are created once, from the first scrape result.
            self.common_metric_collector.setup_labels(beans)
            self.setup_metrics_labels(beans)
            labels_ready = True
        for bean in beans:
            if 'tag.Hostname' in bean:
                self.target = bean["tag.Hostname"]
                break
        common = self.common_metric_collector.get_metrics(beans, self.target)
        self.hadoop_journalnode_metrics.update(common)
        self.get_metrics(beans)
    for service in self.merge_list:
        if service not in self.hadoop_journalnode_metrics:
            continue
        for metric in self.hadoop_journalnode_metrics[service]:
            yield self.hadoop_journalnode_metrics[service][metric]

def setup_journalnode_labels(self):
    """Create one metric family per JournalNode metric.

    The SyncsNNs percentile metrics are folded into a single histogram per
    time window (60s / 300s / 3600s); every other metric becomes a gauge
    with a snake_cased name.
    """
    # window substring -> (family key, metric name suffix, help text)
    windows = {
        'Syncs60s': ("Syncs60", 'sync60s_latency_microseconds',
                     "The percentile of sync latency in microseconds in 60s granularity"),
        'Syncs300s': ("Syncs300", 'sync300s_latency_microseconds',
                      "The percentile of sync latency in microseconds in 300s granularity"),
        'Syncs3600s': ("Syncs3600", 'sync3600s_latency_microseconds',
                       "The percentile of sync latency in microseconds in 3600s granularity"),
    }
    label = ["cluster", "host", "_target"]
    for metric in self.metrics['JournalNode']:
        matched = [w for w in windows if w in metric]
        if matched:
            key, suffix, descriptions = windows[matched[0]]
            if key in self.hadoop_journalnode_metrics['JournalNode']:
                continue  # histogram for this window already created
            name = "_".join([self.prefix, suffix])
            self.hadoop_journalnode_metrics['JournalNode'][key] = HistogramMetricFamily(name, descriptions, labels=label)
        else:
            snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
            name = "_".join([self.prefix, snake_case])
            self.hadoop_journalnode_metrics['JournalNode'][metric] = GaugeMetricFamily(name, self.metrics['JournalNode'][metric], labels=label)

def setup_metrics_labels(self, beans):
    """Create all metric families once a Journal-* bean is seen."""
    for bean in beans:
        if 'name=Journal-' in bean['name']:
            self.setup_journalnode_labels()
def get_metrics(self, beans):
    """Fill JournalNode metric families from scraped beans.

    Percentile metrics named like ``Syncs60s95thPercentileLatencyMicros``
    are folded into one histogram per time window; the window's ``NumOps``
    metric supplies the +Inf bucket count. All other (known) metrics are
    recorded as plain gauges.

    Fixes over the previous version:
    - the per-window counts are initialized to 0, so a bean without a
      ``*NumOps`` entry no longer raises NameError when the +Inf bucket
      is appended;
    - buckets are built as a plain list and sorted with ``sorted`` instead
      of calling ``.sort()/.append()`` on the result of ``zip`` (which is
      an iterator on Python 3).
    """
    for i in range(len(beans)):
        if 'name=Journal-' not in beans[i]['name'] or 'JournalNode' not in self.metrics:
            continue
        host = beans[i]['tag.Hostname']
        label = [self.cluster, host, self.target]

        # window id -> [numops count, latency sum, [(percentile, value), ...]]
        windows = {"60": [0, 0.0, []], "300": [0, 0.0, []], "3600": [0, 0.0, []]}
        for metric in beans[i]:
            if not metric[0].isupper():
                continue  # skip tag.* / name / modelerType entries
            if "Syncs60s" in metric:
                window = "60"
            elif "Syncs300" in metric:
                window = "300"
            elif "Syncs3600" in metric:
                window = "3600"
            else:
                # plain gauge; unknown metrics (no family created) are skipped
                if metric in self.hadoop_journalnode_metrics['JournalNode']:
                    self.hadoop_journalnode_metrics['JournalNode'][metric].add_metric(label, beans[i][metric])
                continue
            stats = windows[window]
            if 'NumOps' in metric:
                stats[0] = beans[i][metric]
            else:
                # e.g. Syncs60s95thPercentileLatencyMicros -> percentile "0.95"
                tmp = metric.split("thPercentileLatencyMicros")[0].split("Syncs")[1].split("s")
                stats[2].append((str(float(tmp[1]) / 100.0), beans[i][metric]))
                stats[1] += beans[i][metric]
        for window, hist_key in (("60", "Syncs60"), ("300", "Syncs300"), ("3600", "Syncs3600")):
            count, total, buckets = windows[window]
            buckets = sorted(buckets)
            buckets.append(("+Inf", count))
            self.hadoop_journalnode_metrics['JournalNode'][hist_key].add_metric(label, buckets=buckets, sum_value=total)
def collect(self):
    """Scrape all NameNode JMX endpoints and yield metric families.

    Families are created from the first successful scrape; on every scrape
    self.target is refreshed from the first bean carrying tag.Hostname.
    """
    initialized = False
    for beans in self.scrape_metrics.scrape():
        if not initialized:
            self.common_metric_collector.setup_labels(beans)
            self.setup_metrics_labels(beans)
            initialized = True
        for bean in beans:
            if 'tag.Hostname' in bean:
                self.target = bean["tag.Hostname"]
                break
        self.hadoop_namenode_metrics.update(
            self.common_metric_collector.get_metrics(beans, self.target))
        self.get_metrics(beans)
    for service in self.merge_list:
        if service in self.hadoop_namenode_metrics:
            for metric in self.hadoop_namenode_metrics[service]:
                yield self.hadoop_namenode_metrics[service][metric]
def setup_nnactivity_labels(self):
    """Create the three NameNodeActivity families (NumOps/AvgTime/Operations).

    Each family is created exactly once; later metrics of the same kind
    reuse it (they differ only in the 'method' label at sample time).
    """
    created = set()
    label = ["cluster", "method", "_target"]
    for metric in self.metrics['NameNodeActivity']:
        if "NumOps" in metric:
            key = "MethodNumOps"
            name = "_".join([self.prefix, "nnactivity_method_ops_total"])
            description = "Total number of the times the method is called."
        elif "AvgTime" in metric:
            key = "MethodAvgTime"
            name = "_".join([self.prefix, "nnactivity_method_avg_time_milliseconds"])
            description = "Average turn around time of the method in milliseconds."
        else:
            key = "Operations"
            name = "_".join([self.prefix, "nnactivity_operations_total"])
            description = "Total number of each operation."
        if key not in created:
            created.add(key)
            self.hadoop_namenode_metrics['NameNodeActivity'][key] = GaugeMetricFamily(name, description, labels=label)

def setup_startupprogress_labels(self):
    """Create StartupProgress metric families.

    'ElapsedTime' and 'PercentComplete' describe the whole startup; the
    per-phase variants (*Count, *ElapsedTime, *Total, *PercentComplete)
    are folded into one family per kind, keyed by a 'phase' label.
    """
    created = set()
    for metric in self.metrics['StartupProgress']:
        snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
        if "ElapsedTime" == metric:
            key, name = "ElapsedTime", "total_elapsed_time_milliseconds"
            descriptions = "Total elapsed time in milliseconds."
        elif "PercentComplete" == metric:
            key, name = "PercentComplete", "complete_rate"
            descriptions = "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0)."
        elif "Count" in metric:
            key, name = "PhaseCount", "phase_count"
            descriptions = "Total number of steps completed in the phase."
        elif "ElapsedTime" in metric:
            key, name = "PhaseElapsedTime", "phase_elapsed_time_milliseconds"
            descriptions = "Total elapsed time in the phase in milliseconds."
        elif "Total" in metric:
            key, name = "PhaseTotal", "phase_total"
            descriptions = "Total number of steps in the phase."
        elif "PercentComplete" in metric:
            key, name = "PhasePercentComplete", "phase_complete_rate"
            descriptions = "Current rate completed in the phase (The max value is not 100 but 1.0)."
        else:
            key, name = metric, snake_case
            descriptions = self.metrics['StartupProgress'][metric]
        if key in created:
            continue
        created.add(key)
        label = ["cluster", "phase", "_target"]
        name = "_".join([self.prefix, "startup_process", name])
        self.hadoop_namenode_metrics['StartupProgress'][key] = GaugeMetricFamily(name, descriptions, labels=label)
def setup_fsnamesystem_labels(self):
    """Create FSNamesystem families; all Capacity* metrics share one family
    distinguished by a 'mode' label."""
    capacity_done = False
    for metric in self.metrics['FSNamesystem']:
        if metric.startswith('Capacity'):
            if capacity_done:
                continue
            capacity_done = True
            key = "capacity"
            label = ["cluster", "mode"]
            name = "capacity_bytes"
            descriptions = "Current DataNodes capacity in each mode in bytes"
        else:
            key = metric
            label = ["cluster"]
            name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
            descriptions = self.metrics['FSNamesystem'][metric]
        label.append("_target")
        name = "_".join([self.prefix, "fsname_system", name])
        self.hadoop_namenode_metrics['FSNamesystem'][key] = GaugeMetricFamily(name, descriptions, labels=label)

def setup_fsnamesystem_state_labels(self):
    """Create FSNamesystemState families; the Num*DataNodes counters share
    one family distinguished by a 'state' label."""
    datanodes_done = False
    for metric in self.metrics['FSNamesystemState']:
        snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
        if 'DataNodes' in metric:
            if datanodes_done:
                continue
            datanodes_done = True
            key = "datanodes_num"
            label = ["cluster", "state"]
            descriptions = "Number of datanodes in each state"
        else:
            key = metric
            label = ["cluster"]
            descriptions = self.metrics['FSNamesystemState'][metric]
        label.append("_target")
        name = "_".join([self.prefix, "fsname_system_state", snake_case])
        self.hadoop_namenode_metrics['FSNamesystemState'][key] = GaugeMetricFamily(name, descriptions, labels=label)
def setup_retrycache_labels(self):
    """Create the single RetryCache family; individual cache counters are
    distinguished at sample time by the 'mode' label."""
    if self.metrics['RetryCache']:
        label = ["cluster", "mode", "_target"]
        name = "_".join([self.prefix, "cache_total"])
        description = "Total number of RetryCache in each mode"
        self.hadoop_namenode_metrics['RetryCache']["cache"] = GaugeMetricFamily(name, description, labels=label)

def setup_nninfo_labels(self):
    """Create NameNodeInfo metric families.

    Covers the per-node tables (LiveNodes / DeadNodes / DecomNodes /
    EnteringMaintenanceNodes), CorruptFiles, NodeUsage, SoftwareVersion,
    Safemode and any remaining plain gauges.

    Fix: the DeadNodes family previously declared 4 label names while
    get_nninfo_metrics adds samples with 5 label values (it appends
    self.target) — "_target" is now part of the label list so names and
    values line up.
    """
    for metric in self.metrics['NameNodeInfo']:
        if "LiveNodes" in metric:
            name = "_".join([self.prefix, "nninfo_live_nodes_count"])
            description = "Count of live data node"
            self.hadoop_namenode_metrics['NameNodeInfo']["LiveNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"])
            label = ["cluster", "datanode", "infoAddr", "infoSecureAddr", "xferaddr", "version", "_target"]
            for item in ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks",
                         "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"]:
                item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                name = "_".join([self.prefix, "nninfo_live_nodes", item])
                description = "Live node " + item
                if item == "admin_state":
                    description += " 0: In Service, 1: Decommission In Progress, 2: Decommissioned"
                self.hadoop_namenode_metrics['NameNodeInfo']["LiveNodes-" + item] = GaugeMetricFamily(name, description, labels=label)
        elif "DeadNodes" in metric:
            name = "_".join([self.prefix, "nninfo_dead_nodes_count"])
            description = "Count of dead data node"
            self.hadoop_namenode_metrics['NameNodeInfo']["DeadNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"])
            # "_target" added: get_nninfo_metrics passes 5 label values.
            label = ["cluster", "datanode", "decommissioned", "xferaddr", "_target"]
            name = "_".join([self.prefix, "nninfo_dead_nodes_last_contact"])
            description = "Dead node last contact in milions"
            self.hadoop_namenode_metrics['NameNodeInfo']["DeadNodes"] = GaugeMetricFamily(name, description, labels=label)
        elif "DecomNodes" in metric:
            name = "_".join([self.prefix, "nninfo_decom_nodes_count"])
            description = "Count of decommissioned data node"
            self.hadoop_namenode_metrics['NameNodeInfo']["DecomNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"])
            label = ["cluster", "datanode", "xferaddr", "_target"]
            for item in ["underReplicatedBlocks", "decommissionOnlyReplicas", "underReplicateInOpenFiles"]:
                item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                name = "_".join([self.prefix, "nninfo_decom_nodes", item])
                description = "Decom Node " + item
                self.hadoop_namenode_metrics['NameNodeInfo']["DecomNodes-" + item] = GaugeMetricFamily(name, description, labels=label)
        elif "EnteringMaintenanceNodes" in metric:
            name = "_".join([self.prefix, "nninfo_maintenance_nodes_count"])
            description = "Count of maintenance data node"
            self.hadoop_namenode_metrics['NameNodeInfo']["MaintenanceNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"])
            label = ["cluster", "datanode", "xferaddr", "_target"]
            for item in ["underReplicatedBlocks", "maintenanceOnlyReplicas", "underReplicateInOpenFiles"]:
                item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                name = "_".join([self.prefix, "nninfo_entering_maintenance_nodes", item])
                description = "Entering maintenance node " + item
                self.hadoop_namenode_metrics['NameNodeInfo']["EnteringMaintenanceNodes-" + item] = GaugeMetricFamily(name, description, labels=label)
        elif "CorruptFiles" in metric:
            name = "_".join([self.prefix, "nninfo_corrupt_file_count"])
            self.hadoop_namenode_metrics['NameNodeInfo']["CorruptFiles"] = GaugeMetricFamily(name, "Corrupt file count", labels=["cluster", "_target"])
        elif "NodeUsage" in metric:
            for item in ["min", "median", "max", "stdDev"]:
                item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                name = "_".join([self.prefix, "nninfo_node_usage", item])
                self.hadoop_namenode_metrics['NameNodeInfo']["NodeUsage-" + item] = GaugeMetricFamily(name, "Node usage " + item, labels=["cluster", "_target"])
        elif "SoftwareVersion" in metric:
            name = "_".join([self.prefix, "nninfo_software_version"])
            self.hadoop_namenode_metrics['NameNodeInfo']["SoftwareVersion"] = GaugeMetricFamily(name, self.metrics["NameNodeInfo"][metric], labels=["cluster", "software_version", "_target"])
        elif "Safemode" in metric:
            name = "_".join([self.prefix, "nninfo_safe_mode"])
            self.hadoop_namenode_metrics['NameNodeInfo']["Safemode"] = GaugeMetricFamily(name, self.metrics["NameNodeInfo"][metric], labels=["cluster", "_target"])
        else:
            snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
            name = "_".join([self.prefix, "nninfo", snake_case])
            self.hadoop_namenode_metrics['NameNodeInfo'][metric] = GaugeMetricFamily(name, self.metrics["NameNodeInfo"][metric], labels=["cluster", "_target"])
def setup_metrics_labels(self, beans):
    """Create metric families for every bean group present in the scrape."""
    for bean in beans:
        name = bean['name']
        if 'NameNodeActivity' in name:
            self.setup_nnactivity_labels()
        if 'StartupProgress' in name:
            self.setup_startupprogress_labels()
        if 'FSNamesystem' in name:
            self.setup_fsnamesystem_labels()
        if 'FSNamesystemState' in name:
            self.setup_fsnamesystem_state_labels()
        if 'RetryCache' in name:
            self.setup_retrycache_labels()
        if "NameNodeInfo" in name:
            self.setup_nninfo_labels()

def get_nnactivity_metrics(self, bean):
    """Record NameNodeActivity values, deriving the 'method' label from the
    metric name (the part before NumOps/AvgTime/Ops)."""
    for metric in self.metrics['NameNodeActivity']:
        if "NumOps" in metric:
            key, method = "MethodNumOps", metric.split('NumOps')[0]
        elif "AvgTime" in metric:
            key, method = "MethodAvgTime", metric.split('AvgTime')[0]
        else:
            key = "Operations"
            method = metric.split('Ops')[0] if "Ops" in metric else metric
        value = bean[metric] if metric in bean else 0
        self.hadoop_namenode_metrics['NameNodeActivity'][key].add_metric(
            [self.cluster, method, self.target], value)

def get_startupprogress_metrics(self, bean):
    """Record StartupProgress values, splitting the phase name out of the
    metric name; whole-startup metrics get phase '-'."""
    for metric in self.metrics['StartupProgress']:
        if "Count" in metric:
            key, phase = "PhaseCount", metric.split("Count")[0]
        elif "ElapsedTime" in metric and "ElapsedTime" != metric:
            key, phase = "PhaseElapsedTime", metric.split("ElapsedTime")[0]
        elif "Total" in metric:
            key, phase = "PhaseTotal", metric.split("Total")[0]
        elif "PercentComplete" in metric and "PercentComplete" != metric:
            key, phase = "PhasePercentComplete", metric.split("PercentComplete")[0]
        else:
            key, phase = metric, "-"
        value = bean[metric] if metric in bean else 0
        self.hadoop_namenode_metrics['StartupProgress'][key].add_metric(
            [self.cluster, phase, self.target], value)
def get_fsnamesystem_metrics(self, bean):
    """Record FSNamesystem values.

    tag.HAState is encoded numerically (initializing/active/standby/stopping
    -> 0..3, anything else -> 9999); Capacity* metrics share the 'capacity'
    family with a 'mode' label.
    """
    ha_codes = {'initializing': 0.0, 'active': 1.0, 'standby': 2.0, 'stopping': 3.0}
    for metric in self.metrics['FSNamesystem']:
        if 'HAState' in metric:
            value = ha_codes.get(bean['tag.HAState'], 9999)
            self.hadoop_namenode_metrics['FSNamesystem'][metric].add_metric([self.cluster, self.target], value)
        elif metric.startswith("Capacity"):
            mode = metric.split("Capacity")[1]
            value = bean[metric] if metric in bean else 0
            self.hadoop_namenode_metrics['FSNamesystem']['capacity'].add_metric([self.cluster, mode, self.target], value)
        else:
            value = bean[metric] if metric in bean else 0
            self.hadoop_namenode_metrics['FSNamesystem'][metric].add_metric([self.cluster, self.target], value)

def get_fsnamesystem_state_metrics(self, bean):
    """Record FSNamesystemState values.

    FSState is encoded numerically (Safemode -> 0, Operational -> 1, else
    9999); TotalSyncTimes is parsed from its whitespace-padded string form;
    the Num*DataNodes counters share the 'datanodes_num' family keyed by a
    'state' label.
    """
    for metric in self.metrics['FSNamesystemState']:
        if 'FSState' in metric:
            fs_codes = {'Safemode': 0.0, 'Operational': 1.0}
            value = fs_codes.get(bean['FSState'], 9999)
            self.hadoop_namenode_metrics['FSNamesystemState'][metric].add_metric([self.cluster, self.target], value)
        elif "TotalSyncTimes" in metric:
            value = float(re.sub(r'\s', '', bean[metric])) if metric in bean and bean[metric] else 0
            self.hadoop_namenode_metrics['FSNamesystemState'][metric].add_metric([self.cluster, self.target], value)
        elif "DataNodes" in metric:
            state = metric.split("DataNodes")[0].split("Num")[1]
            value = bean[metric] if metric in bean and bean[metric] else 0
            self.hadoop_namenode_metrics['FSNamesystemState']['datanodes_num'].add_metric([self.cluster, state, self.target], value)
        else:
            value = bean[metric] if metric in bean and bean[metric] else 0
            self.hadoop_namenode_metrics['FSNamesystemState'][metric].add_metric([self.cluster, self.target], value)
def get_retrycache_metrics(self, bean):
    """Record the RetryCache counters into the shared 'cache' family; the
    'mode' label is the part of the metric name after 'Cache'."""
    for metric in self.metrics['RetryCache']:
        mode = metric.split('Cache')[1]
        value = bean[metric] if metric in bean and bean[metric] else 0
        self.hadoop_namenode_metrics['RetryCache']["cache"].add_metric([self.cluster, mode, self.target], value)

def get_nninfo_metrics(self, bean):
    """Record NameNodeInfo values.

    The node tables (LiveNodes / DeadNodes / DecomNodes /
    EnteringMaintenanceNodes), CorruptFiles and NodeUsage arrive as JSON
    strings and are parsed with yaml.safe_load first. Also refreshes
    self.dns with the datanode JMX URLs discovered in LiveNodes.
    """
    camel = re.compile('([a-z0-9])([A-Z])')
    for metric in self.metrics["NameNodeInfo"]:
        if "LiveNodes" in metric and "LiveNodes" in bean:
            nodes = yaml.safe_load(bean["LiveNodes"])
            self.hadoop_namenode_metrics["NameNodeInfo"]["LiveNodeCount"].add_metric([self.cluster, self.target], len(nodes))
            discovered = set()
            for node, info in nodes.items():
                label = [self.cluster, node, info["infoAddr"], info["infoSecureAddr"], info["xferaddr"], info["version"], self.target]
                discovered.add("http://" + info["infoAddr"] + "/jmx")
                for item in ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks",
                             "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"]:
                    value = info[item] if item in info else 0
                    if item == "adminState":
                        # encode 0/1/2: In Service / Decommission In Progress / other (Decommissioned)
                        if value == "In Service":
                            value = 0
                        elif value == "Decommission In Progress":
                            value = 1
                        else:
                            value = 2
                    key = "LiveNodes-" + camel.sub(r'\1_\2', item).lower()
                    self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value)
            self.dns = discovered
        elif "DeadNodes" in metric and "DeadNodes" in bean:
            nodes = yaml.safe_load(bean["DeadNodes"])
            self.hadoop_namenode_metrics["NameNodeInfo"]["DeadNodeCount"].add_metric([self.cluster, self.target], len(nodes))
            for node, info in nodes.items():
                label = [self.cluster, node, str(info["decommissioned"]), info["xferaddr"], self.target]
                self.hadoop_namenode_metrics["NameNodeInfo"]["DeadNodes"].add_metric(label, info["lastContact"])
        elif "DecomNodes" in metric and "DecomNodes" in bean:
            nodes = yaml.safe_load(bean["DecomNodes"])
            self.hadoop_namenode_metrics["NameNodeInfo"]["DecomNodeCount"].add_metric([self.cluster, self.target], len(nodes))
            for node, info in nodes.items():
                label = [self.cluster, node, info["xferaddr"], self.target]
                for item in ["underReplicatedBlocks", "decommissionOnlyReplicas", "underReplicateInOpenFiles"]:
                    value = info[item] if item in info else 0
                    key = "DecomNodes-" + camel.sub(r'\1_\2', item).lower()
                    self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value)
        elif "EnteringMaintenanceNodes" in metric and "EnteringMaintenanceNodes" in bean:
            nodes = yaml.safe_load(bean["EnteringMaintenanceNodes"])
            self.hadoop_namenode_metrics["NameNodeInfo"]["MaintenanceNodeCount"].add_metric([self.cluster, self.target], len(nodes))
            for node, info in nodes.items():
                label = [self.cluster, node, info["xferaddr"], self.target]
                for item in ["underReplicatedBlocks", "maintenanceOnlyReplicas", "underReplicateInOpenFiles"]:
                    value = info[item] if item in info else 0
                    key = "EnteringMaintenanceNodes-" + camel.sub(r'\1_\2', item).lower()
                    self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value)
        elif "CorruptFiles" in metric and "CorruptFiles" in bean:
            corrupt = yaml.safe_load(bean["CorruptFiles"])
            self.hadoop_namenode_metrics["NameNodeInfo"]["CorruptFiles"].add_metric([self.cluster, self.target], len(corrupt))
        elif "NodeUsage" in metric and "NodeUsage" in bean:
            usage = yaml.safe_load(bean["NodeUsage"])["nodeUsage"]
            for item in ["min", "median", "max", "stdDev"]:
                value = usage[item] if item in usage else 0
                value = float(value.strip("%"))
                key = "NodeUsage-" + camel.sub(r'\1_\2', item).lower()
                self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric([self.cluster, self.target], value)
        elif "SoftwareVersion" in metric and "SoftwareVersion" in bean:
            self.hadoop_namenode_metrics["NameNodeInfo"]["SoftwareVersion"].add_metric([self.cluster, bean["SoftwareVersion"], self.target], 0)
        elif "Safemode" in metric and "Safemode" in bean:
            # empty Safemode string means "not in safe mode"
            value = 0 if metric in bean and bean[metric] == "" else 1
            self.hadoop_namenode_metrics["NameNodeInfo"]["Safemode"].add_metric([self.cluster, self.target], value)
        else:
            value = bean[metric] if metric in bean and bean[metric] else 0
            self.hadoop_namenode_metrics['NameNodeInfo'][metric].add_metric([self.cluster, self.target], value)
def get_metrics(self, beans):
    """Dispatch each scraped bean to the matching NameNode metric handler.

    FSNamesystemState beans also contain 'FSNamesystem' as a substring, so
    the plain FSNamesystem handler explicitly excludes them.
    """
    for bean in beans:
        name = bean['name']
        if 'NameNodeActivity' in name:
            self.get_nnactivity_metrics(bean)
        if 'StartupProgress' in name:
            self.get_startupprogress_metrics(bean)
        if 'FSNamesystem' in name and 'FSNamesystemState' not in name:
            self.get_fsnamesystem_metrics(bean)
        if 'FSNamesystemState' in name:
            self.get_fsnamesystem_state_metrics(bean)
        if 'RetryCache' in name:
            self.get_retrycache_metrics(bean)
        if 'NameNodeInfo' in name:
            self.get_nninfo_metrics(bean)
"MemHeapCommittedM": "Current heap memory committed in MB.", 7 | "MemHeapMaxM": "Max heap memory size in MB.", 8 | "MemMaxM": "Max memory size in MB.", 9 | "ThreadsNew": "Current number of NEW threads.", 10 | "ThreadsRunnable": "Current number of RUNNABLE threads.", 11 | "ThreadsBlocked": "Current number of BLOCKED threads.", 12 | "ThreadsWaiting": "Current number of WAITING threads.", 13 | "ThreadsTimedWaiting": "Current number of TIMED_WAITING threads.", 14 | "ThreadsTerminated": "Current number of TERMINATED threads.", 15 | "GcCount": "Total number of Gc count", 16 | "GcTimeMillis": "Total GC time in msec.", 17 | "GcCountParNew": "ParNew GC count.", 18 | "GcTimeMillisParNew": "ParNew GC time in msec.", 19 | "GcCountConcurrentMarkSweep": "ConcurrentMarkSweep GC count.", 20 | "GcTimeMillisConcurrentMarkSweep": "ConcurrentMarkSweep GC time in msec.", 21 | "GcNumWarnThresholdExceeded": "Number of times that the GC warn threshold is exceeded.", 22 | "GcNumInfoThresholdExceeded": "Number of times that the GC info threshold is exceeded.", 23 | "GcTotalExtraSleepTime": "Total GC extra sleep time in msec.", 24 | "LogFatal": "Total number of FATAL logs.", 25 | "LogError": "Total number of ERROR logs.", 26 | "LogWarn": "Total number of WARN logs.", 27 | "LogInfo": "Total number of INFO logs." 
28 | } 29 | -------------------------------------------------------------------------------- /metrics/common/MetricsSystem.json: -------------------------------------------------------------------------------- 1 | { 2 | "NumActiveSources": "Current number of active metrics sources.", 3 | "NumAllSources": "Total number of metrics sources.", 4 | "NumActiveSinks": "Current number of active sinks.", 5 | "NumAllSinks": "Total number of sinks (BUT usually less than NumActiveSinks, see HADOOP-9946).", 6 | "SnapshotNumOps": "Total number of operations to snapshot statistics from a metrics source.", 7 | "SnapshotAvgTime": "Average time in milliseconds to snapshot statistics from a metrics source.", 8 | "PublishNumOps": "Total number of operations to publish statistics to a sink.", 9 | "PublishAvgTime": "Average time in milliseconds to publish statistics to a sink.", 10 | "DroppedPubAll": "Total number of dropped publishes.", 11 | "Sink_instanceNumOps": "Total number of sink operations for the instance.", 12 | "Sink_instanceAvgTime": "Average time in milliseconds of sink operations for the instance.", 13 | "Sink_instanceDropped": "Total number of dropped sink operations for the instance.", 14 | "Sink_instanceQsize": "Current queue length of sink operations (BUT always set to 0 because nothing to increment this metrics, see HADOOP-9941)." 
15 | } -------------------------------------------------------------------------------- /metrics/common/OperatingSystem.json: -------------------------------------------------------------------------------- 1 | { 2 | "OpenFileDescriptorCount": "Total number of open file descriptor", 3 | "MaxFileDescriptorCount": "Total number of max file descriptor", 4 | "CommittedVirtualMemorySize": "The size of committed virtual memory in bytes", 5 | "TotalSwapSpaceSize": "The size of total swap space in bytes", 6 | "FreeSwapSpaceSize": "The size of free swap space in bytes", 7 | "ProcessCpuTime": "Total process cpu time in microseconds", 8 | "FreePhysicalMemorySize": "The size of free physical memory in bytes", 9 | "TotalPhysicalMemorySize": "The size of total physical memory in bytes", 10 | "SystemCpuLoad": "Average of system CPU load", 11 | "ProcessCpuLoad": "Average of process CPU load", 12 | "SystemLoadAverage": "Average of system load", 13 | "AvailableProcessors": "Total number of available processors" 14 | } -------------------------------------------------------------------------------- /metrics/common/RpcActivity.json: -------------------------------------------------------------------------------- 1 | { 2 | "ReceivedBytes": "Total number of received bytes", 3 | "SentBytes": "Total number of sent bytes", 4 | "RpcQueueTimeNumOps": "Total number of RPC calls", 5 | "RpcQueueTimeAvgTime": "Average queue time in milliseconds", 6 | "RpcProcessingTimeNumOps": "Total number of RPC calls (same to RpcQueueTimeNumOps)", 7 | "RpcProcessingTimeAvgTime": "Average Processing time in milliseconds", 8 | "RpcAuthenticationFailures": "Total number of authentication failures", 9 | "RpcAuthenticationSuccesses": "Total number of authentication successes", 10 | "RpcAuthorizationFailures": "Total number of authorization failures", 11 | "RpcAuthorizationSuccesses": "Total number of authorization successes", 12 | "RpcClientBackoff": "Total number of RPC client back off", 13 | "RpcSlowCalls": 
"Total number of RPC slow calls", 14 | "NumOpenConnections": "Current number of open connections", 15 | "CallQueueLength": "Current length of the call queue" 16 | } 17 | -------------------------------------------------------------------------------- /metrics/common/RpcDetailedActivity.json: -------------------------------------------------------------------------------- 1 | { 2 | "methodNumOps": "Total number of the times the method is called", 3 | "methodAvgTime": "Average turn around time of the method in milliseconds" 4 | } -------------------------------------------------------------------------------- /metrics/common/Runtime.json: -------------------------------------------------------------------------------- 1 | { 2 | "Uptime": "components uptime in milliseconds", 3 | "StartTime": "components start time in milliseconds" 4 | } -------------------------------------------------------------------------------- /metrics/common/UgiMetrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "LoginSuccessNumOps": "Total number of successful kerberos logins.", 3 | "LoginSuccessAvgTime": "Average time for successful kerberos logins in milliseconds.", 4 | "LoginFailureNumOps": "Total number of failed kerberos logins.", 5 | "LoginFailureAvgTime": "Average time for failed kerberos logins in milliseconds.", 6 | "GetGroupsNumOps": "Total number of group resolutions.", 7 | "GetGroupsAvgTime": "Average time for group resolution in milliseconds.", 8 | "RenewalFailuresTotal": "Total number of renewal failures.", 9 | "RenewalFailures": "Current number of renewal failures." 
10 | } -------------------------------------------------------------------------------- /metrics/datanode/DataNodeActivity.json: -------------------------------------------------------------------------------- 1 | { 2 | "BytesWritten": "Total number of bytes written to DataNode", 3 | "BytesRead": "Total number of bytes read from DataNode", 4 | "TotalWriteTime": "Total number of milliseconds spent on write operation", 5 | "TotalReadTime": "Total number of milliseconds spent on read operation", 6 | "BlocksWritten": "Total number of blocks written to DataNode", 7 | "BlocksRead": "Total number of blocks read from DataNode", 8 | "BlocksReplicated": "Total number of blocks replicated", 9 | "BlocksRemoved": "Total number of blocks removed", 10 | "BlocksVerified": "Total number of blocks verified", 11 | "BlockVerificationFailures": "Total number of verifications failures", 12 | "BlocksCached": "Total number of blocks cached", 13 | "BlocksUncached": "Total number of blocks uncached", 14 | "ReadsFromLocalClient": "Total number of read operations from local client", 15 | "ReadsFromRemoteClient": "Total number of read operations from remote client", 16 | "WritesFromLocalClient": "Total number of write operations from local client", 17 | "WritesFromRemoteClient": "Total number of write operations from remote client", 18 | "BlocksGetLocalPathInfo": "Total number of operations to get local path names of blocks", 19 | "RemoteBytesRead": "Number of bytes read by remote clients", 20 | "RemoteBytesWritten": "Number of bytes written by remote clients", 21 | "RamDiskBlocksWrite": "Total number of blocks written to memory", 22 | "RamDiskBlocksWriteFallback": "Total number of blocks written to memory but not satisfied (failed-over to disk)", 23 | "RamDiskBytesWrite": "Total number of bytes written to memory", 24 | "RamDiskBlocksReadHits": "Total number of times a block in memory was read", 25 | "RamDiskBlocksEvicted": "Total number of blocks evicted in memory", 26 | 
"RamDiskBlocksEvictedWithoutRead": "Total number of blocks evicted in memory without ever being read from memory", 27 | "RamDiskBlocksEvictionWindowMsNumOps": "Number of blocks evicted in memory", 28 | "RamDiskBlocksEvictionWindowMsAvgTime": "Average time of blocks in memory before being evicted in milliseconds", 29 | "RamDiskBlocksLazyPersisted": "Total number of blocks written to disk by lazy writer", 30 | "RamDiskBlocksDeletedBeforeLazyPersisted": "Total number of blocks deleted by application before being persisted to disk", 31 | "RamDiskBytesLazyPersisted": "Total number of bytes written to disk by lazy writer", 32 | "RamDiskBlocksLazyPersistWindowMsNumOps": "Number of blocks written to disk by lazy writer", 33 | "RamDiskBlocksLazyPersistWindowMsAvgTime": "Average time of blocks written to disk by lazy writer in milliseconds", 34 | "FsyncCount": "Total number of fsync", 35 | "VolumeFailures": "Total number of volume failures occurred", 36 | "DatanodeNetworkErrors" : "Total number of datanode network error", 37 | "DataNodeActiveXceiversCount" : "Total number of datanode active Xceivers", 38 | "ReadBlockOpNumOps": "Total number of read operations", 39 | "ReadBlockOpAvgTime": "Average time of read operations in milliseconds", 40 | "WriteBlockOpNumOps": "Total number of write operations", 41 | "WriteBlockOpAvgTime": "Average time of write operations in milliseconds", 42 | "BlockChecksumOpNumOps": "Total number of blockChecksum operations", 43 | "BlockChecksumOpAvgTime": "Average time of blockChecksum operations in milliseconds", 44 | "CopyBlockOpNumOps": "Total number of block copy operations", 45 | "CopyBlockOpAvgTime": "Average time of block copy operations in milliseconds", 46 | "ReplaceBlockOpNumOps": "Total number of block replace operations", 47 | "ReplaceBlockOpAvgTime": "Average time of block replace operations in milliseconds", 48 | "HeartbeatsNumOps": "Total number of heartbeats", 49 | "HeartbeatsAvgTime": "Average heartbeat time in milliseconds", 50 | 
"HeartbeatsTotalNumOps": "Total number of heartbeats which is a duplicate of HeartbeatsNumOps", 51 | "HeartbeatsTotalAvgTime": "Average total heartbeat time in milliseconds", 52 | "LifelinesNumOps": "Total number of lifeline messages", 53 | "LifelinesAvgTime": "Average lifeline message processing time in milliseconds", 54 | "BlockReportsNumOps": "Total number of block report operations", 55 | "BlockReportsAvgTime": "Average time of block report operations in milliseconds", 56 | "IncrementalBlockReportsNumOps": "Total number of incremental block report operations", 57 | "IncrementalBlockReportsAvgTime": "Average time of incremental block report operations in milliseconds", 58 | "CacheReportsNumOps": "Total number of cache report operations", 59 | "CacheReportsAvgTime": "Average time of cache report operations in milliseconds", 60 | "PacketAckRoundTripTimeNanosNumOps": "Total number of ack round trip", 61 | "PacketAckRoundTripTimeNanosAvgTime": "Average time from ack send to receive minus the downstream ack time in nanoseconds", 62 | "FlushNanosNumOps": "Total number of flushes", 63 | "FlushNanosAvgTime": "Average flush time in nanoseconds", 64 | "FsyncNanosNumOps": "Total number of fsync", 65 | "FsyncNanosAvgTime": "Average fsync time in nanoseconds", 66 | "SendDataPacketBlockedOnNetworkNanosNumOps": "Total number of sending packets", 67 | "SendDataPacketBlockedOnNetworkNanosAvgTime": "Average waiting time of sending packets in nanoseconds", 68 | "SendDataPacketTransferNanosNumOps": "Total number of sending packets", 69 | "SendDataPacketTransferNanosAvgTime": "Average transfer time of sending packets in nanoseconds" 70 | } -------------------------------------------------------------------------------- /metrics/datanode/DataNodeInfo.json: -------------------------------------------------------------------------------- 1 | { 2 | "VolumeInfo": "Volume infomation in each path and in each mode", 3 | "XceiverCount": "Total number of datanode Xceivers" 4 | } 5 | 
-------------------------------------------------------------------------------- /metrics/datanode/FSDatasetState.json: -------------------------------------------------------------------------------- 1 | { 2 | "Capacity" : "Current raw capacity of DataNode in bytes", 3 | "DfsUsed" : "Current space used by DataNodes for DFS purposes in bytes", 4 | "Remaining" : "Current remaining capacity in bytes", 5 | "NumFailedVolumes" : "Total number of failed volumes", 6 | "LastVolumeFailureDate" : "Last time of volume failures", 7 | "EstimatedCapacityLostTotal" : "An estimate of the total capacity lost due to volume failures", 8 | "CacheUsed" : "Total number of cache used", 9 | "CacheCapacity" : "Current raw capacity of cache in bytes", 10 | "NumBlocksCached" : "Total number of blocks cached", 11 | "NumBlocksFailedToCache" : "Total number of blocks failed to cache", 12 | "NumBlocksFailedToUnCache" : "Total number of blocks failed to uncached" 13 | } -------------------------------------------------------------------------------- /metrics/journalnode/JournalNode.json: -------------------------------------------------------------------------------- 1 | { 2 | "Syncs60sNumOps": "Number of sync operations (1 minute granularity)", 3 | "Syncs60s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (1 minute granularity)", 4 | "Syncs60s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (1 minute granularity)", 5 | "Syncs60s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (1 minute granularity)", 6 | "Syncs60s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (1 minute granularity)", 7 | "Syncs60s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (1 minute granularity)", 8 | "Syncs300sNumOps": "Number of sync operations (5 minutes granularity)", 9 | "Syncs300s50thPercentileLatencyMicros": "The 50th percentile of sync latency in 
microseconds (5 minutes granularity)", 10 | "Syncs300s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (5 minutes granularity)", 11 | "Syncs300s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (5 minutes granularity)", 12 | "Syncs300s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (5 minutes granularity)", 13 | "Syncs300s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (5 minutes granularity)", 14 | "Syncs3600sNumOps": "Number of sync operations (1 hour granularity)", 15 | "Syncs3600s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (1 hour granularity)", 16 | "Syncs3600s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (1 hour granularity)", 17 | "Syncs3600s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (1 hour granularity)", 18 | "Syncs3600s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (1 hour granularity)", 19 | "Syncs3600s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (1 hour granularity)", 20 | "BatchesWritten": "Total number of batches written since startup", 21 | "TxnsWritten": "Total number of transactions written since startup", 22 | "BytesWritten": "Total number of bytes written since startup", 23 | "BatchesWrittenWhileLagging": "Total number of batches written where this node was lagging", 24 | "LastWriterEpoch": "Current writer’s epoch number", 25 | "CurrentLagTxns": "The number of transactions that this JournalNode is lagging", 26 | "LastWrittenTxId": "The highest transaction id stored on this JournalNode", 27 | "LastPromisedEpoch": "The last epoch number which this node has promised not to accept any lower epoch, or 0 if no promises have been made", 28 | "LastJournalTimestamp": "The timestamp of last successfully written transaction" 29 | } 
-------------------------------------------------------------------------------- /metrics/namenode/FSNamesystem.json: -------------------------------------------------------------------------------- 1 | { 2 | "HAState": "(HA-only) Current state of the NameNode: 0.0 (for initializing) or 1.0 (for active) or 2.0 (for standby) or 3.0 (for stopping) state", 3 | "MissingBlocks": "Current number of missing blocks", 4 | "MissingReplOneBlocks": "Current number of missing blocks with replication factor 1", 5 | "ExpiredHeartbeats": "Total number of expired heartbeats", 6 | "TransactionsSinceLastCheckpoint": "Total number of transactions since last checkpoint", 7 | "TransactionsSinceLastLogRoll": "Total number of transactions since last edit log roll", 8 | "LastWrittenTransactionId": "Last transaction ID written to the edit log", 9 | "LastCheckpointTime": "Time in milliseconds since epoch of last checkpoint", 10 | "CapacityTotal": "Current raw capacity of DataNodes in bytes", 11 | "CapacityUsed": "Current used capacity across all DataNodes in bytes", 12 | "CapacityRemaining": "Current remaining capacity in bytes", 13 | "CapacityUsedNonDFS": "Current space used by DataNodes for non DFS purposes in bytes", 14 | "TotalLoad": "Current number of connections", 15 | "SnapshottableDirectories": "Current number of snapshottable directories", 16 | "Snapshots": "Current number of snapshots", 17 | "NumEncryptionZones": "Current number of encryption zones", 18 | "LockQueueLength": "Number of threads waiting to acquire FSNameSystem lock", 19 | "BlocksTotal": "Current number of allocated blocks in the system", 20 | "NumFilesUnderConstruction": "Current number of files under construction", 21 | "NumActiveClients": "Current number of active clients holding lease", 22 | "FilesTotal": "Current number of files and directories", 23 | "PendingReplicationBlocks": "Current number of blocks pending to be replicated", 24 | "UnderReplicatedBlocks": "Current number of blocks under replicated", 25 | 
"CorruptBlocks": "Current number of blocks with corrupt replicas.", 26 | "ScheduledReplicationBlocks": "Current number of blocks scheduled for replications", 27 | "PendingDeletionBlocks": "Current number of blocks pending deletion", 28 | "ExcessBlocks": "Current number of excess blocks", 29 | "NumTimedOutPendingReplications": "The number of timed out replications. Not the number of unique blocks that timed out. Note: The metric name will be changed to NumTimedOutPendingReconstructions in Hadoop 3 release.", 30 | "PostponedMisreplicatedBlocks": "(HA-only) Current number of blocks postponed to replicate", 31 | "PendingDataNodeMessageCount": "(HA-only) Current number of pending block-related messages for later processing in the standby NameNode", 32 | "MillisSinceLastLoadedEdits": "(HA-only) Time in milliseconds since the last time standby NameNode load edit log. In active NameNode, set to 0", 33 | "BlockCapacity": "Current number of block capacity", 34 | "StaleDataNodes": "Current number of DataNodes marked stale due to delayed heartbeat", 35 | "TotalSyncCount": "Total number of sync operations performed by edit log" 36 | } -------------------------------------------------------------------------------- /metrics/namenode/FSNamesystemState.json: -------------------------------------------------------------------------------- 1 | { 2 | "FsLockQueueLength": "Filesystem lock queue length", 3 | "MaxObjects": "Max objects", 4 | "BlockDeletionStartTime": "Start time of block deletion", 5 | "NumLiveDataNodes": "Number of datanodes which are currently live", 6 | "NumDeadDataNodes": "Number of datanodes which are currently dead", 7 | "NumDecomLiveDataNodes": "Number of datanodes which have been decommissioned and are now live", 8 | "NumDecomDeadDataNodes": "Number of datanodes which have been decommissioned and are now dead", 9 | "NumDecommissioningDataNodes": "Number of datanodes in decommissioning state", 10 | "NumStaleDataNodes": "Number of datanodes marked as content 
stale", 11 | "VolumeFailuresTotal": "Total number of volume failures across all Datanodes", 12 | "EstimatedCapacityLostTotal": "An estimate of the total capacity lost due to volume failures", 13 | "NumStaleStorages": "Number of storages marked as content stale (after NameNode restart/failover before first block report is received)", 14 | "FSState": "Current state of the file system: 0 (for Safemode) or 1(Operational)", 15 | "TotalSyncTimes": "Total number of milliseconds spent by various edit logs in sync operation" 16 | } -------------------------------------------------------------------------------- /metrics/namenode/NameNodeActivity.json: -------------------------------------------------------------------------------- 1 | { 2 | "CreateFileOps": "Total number of files created.", 3 | "FilesCreated": "Total number of files and directories created by create or mkdir operations.", 4 | "FilesAppended": "Total number of files appended.", 5 | "GetBlockLocations": "Total number of getBlockLocations operations.", 6 | "FilesRenamed": "Total number of rename operations (NOT number of files/dirs renamed).", 7 | "GetListingOps": "Total number of directory listing operations.", 8 | "DeleteFileOps": "Total number of delete operations.", 9 | "FilesDeleted": "Total number of files and directories deleted by delete or rename operations.", 10 | "FileInfoOps": "Total number of getFileInfo and getLinkFileInfo operations.", 11 | "AddBlockOps": "Total number of addBlock operations succeeded.", 12 | "GetAdditionalDatanodeOps": "Total number of getAdditionalDatanode operations.", 13 | "CreateSymlinkOps": "Total number of createSymlink operations.", 14 | "GetLinkTargetOps": "Total number of getLinkTarget operations.", 15 | "FilesInGetListingOps": "Total number of files and directories listed by directory listing operations.", 16 | "AllowSnapshotOps": "Total number of allowSnapshot operations.", 17 | "DisallowSnapshotOps": "Total number of disallowSnapshot operations.", 18 | 
"CreateSnapshotOps": "Total number of createSnapshot operations.", 19 | "DeleteSnapshotOps": "Total number of deleteSnapshot operations.", 20 | "RenameSnapshotOps": "Total number of renameSnapshot operations.", 21 | "ListSnapshottableDirOps": "Total number of snapshottableDirectoryStatus operations.", 22 | "SnapshotDiffReportOps": "Total number of getSnapshotDiffReport operations.", 23 | "TransactionsNumOps": "Total number of Journal transactions.", 24 | "TransactionsAvgTime": "Average time of Journal transactions in milliseconds.", 25 | "SyncsNumOps": "Total number of Journal syncs.", 26 | "SyncsAvgTime": "Average time of Journal syncs in milliseconds.", 27 | "TransactionsBatchedInSync": "Total number of Journal transactions batched in sync.", 28 | "BlockReportNumOps": "Total number of processing block reports from DataNode.", 29 | "BlockReportAvgTime": "Average time of processing block reports in milliseconds.", 30 | "CacheReportNumOps": "Total number of processing cache reports from DataNode.", 31 | "CacheReportAvgTime": "Average time of processing cache reports in milliseconds.", 32 | "SafeModeTime": "The interval between FSNameSystem starts and the last time safemode leaves in milliseconds. (sometimes not equal to the time in SafeMode, see HDFS-5156).", 33 | "FsImageLoadTime": "Time loading FS Image at startup in milliseconds.", 34 | "GetEditNumOps": "Total number of edits downloads from SecondaryNameNode.", 35 | "GetEditAvgTime": "Average edits download time in milliseconds.", 36 | "GetImageNumOps": "Total number of fsimage downloads from SecondaryNameNode.", 37 | "GetImageAvgTime": "Average fsimage download time in milliseconds.", 38 | "PutImageNumOps": "Total number of fsimage uploads to SecondaryNameNode.", 39 | "PutImageAvgTime": "Average fsimage upload time in milliseconds.", 40 | "TotalFileOps": "Total number of all file operations." 
41 | } 42 | -------------------------------------------------------------------------------- /metrics/namenode/NameNodeInfo.json: -------------------------------------------------------------------------------- 1 | { 2 | "Total": "Total", 3 | "TotalBlocks": "Total number of blocks", 4 | "Used": "Total used space by data nodes", 5 | "Free": "Total free space by data nodes", 6 | "Safemode": "Is in safe mode. 0: no, 1: yes", 7 | "NonDfsUsedSpace": "Total used space by data nodes for non DFS purposes such as storing temporary files on the local file system", 8 | "PercentUsed": "Total used space by data nodes as percentage of total capacity", 9 | "BlockPoolUsedSpace": "Block pool used space", 10 | "PercentBlockPoolUsed": "Percent of block pool used", 11 | "PercentRemaining": "Total remaining space by data nodes as percentage of total capacity", 12 | "CacheCapacity": "Cache Capacity", 13 | "CacheUsed": "Cache Used", 14 | "TotalFiles": "Total Files", 15 | "NumberOfMissingBlocks": "Number of missing blocks", 16 | "NumberOfMissingBlocksWithReplicationFactorOne": "Number of missing blocks with replication factor one", 17 | "LiveNodes": "Live nodes", 18 | "SoftwareVersion": "Software version", 19 | "DeadNodes": "Dead nodes", 20 | "DecomNodes": "Decom nodes", 21 | "EnteringMaintenanceNodes": "Entering maintenance nodes", 22 | "NodeUsage": "Node Usage", 23 | "NNStartedTimeInMillis": "NameNode started time in millis", 24 | "CorruptFiles": "Corrupt file list" 25 | } 26 | -------------------------------------------------------------------------------- /metrics/namenode/RetryCache.json: -------------------------------------------------------------------------------- 1 | { 2 | "CacheHit": "Total number of RetryCache hit.", 3 | "CacheCleared": "Total number of RetryCache cleared.", 4 | "CacheUpdated": "Total number of RetryCache updated." 
5 | } 6 | -------------------------------------------------------------------------------- /metrics/namenode/StartupProgress.json: -------------------------------------------------------------------------------- 1 | { 2 | "ElapsedTime": "Total elapsed time in milliseconds.", 3 | "PercentComplete": "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0).", 4 | "LoadingFsImageCount": "", 5 | "LoadingFsImageElapsedTime": "", 6 | "LoadingFsImageTotal": "", 7 | "LoadingFsImagePercentComplete": "", 8 | "LoadingEditsCount": "", 9 | "LoadingEditsElapsedTime": "", 10 | "LoadingEditsTotal": "", 11 | "LoadingEditsPercentComplete": "", 12 | "SavingCheckpointCount": "", 13 | "SavingCheckpointElapsedTime": "", 14 | "SavingCheckpointTotal": "", 15 | "SavingCheckpointPercentComplete": "", 16 | "SafeModeCount": "", 17 | "SafeModeElapsedTime": "", 18 | "SafeModeTotal": "", 19 | "SafeModePercentComplete": "" 20 | } -------------------------------------------------------------------------------- /metrics/nodemanager/NodeManagerMetrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "ContainersLaunched": "Count of launched container", 3 | "ContainersCompleted": "Count of completed container", 4 | "ContainersFailed": "Count of failed container", 5 | "ContainersKilled": "Count of killed container", 6 | "ContainersIniting": "Count of initing container", 7 | "ContainersRunning": "Count of running container", 8 | "AllocatedGB": "Memory size of allocated (in GB)", 9 | "AllocatedContainers": "Count of allocated container", 10 | "AvailableGB": "Memory size of available (in GB)", 11 | "AllocatedVCores": "Count of allocated VCores", 12 | "AvailableVCores": "Count of available VCores", 13 | "ContainerLaunchDurationNumOps": "Count of launched container", 14 | "ContainerLaunchDurationAvgTime": "Average time of launching container (in ms)", 15 | "BadLocalDirs": "Count of bad local directory", 16 | "BadLogDirs": "Count 
of bad log directory", 17 | "GoodLocalDirsDiskUtilizationPerc": "Percent of good local directory disk utilization", 18 | "GoodLogDirsDiskUtilizationPerc": "Percent of good local log directory disk utilization" 19 | } 20 | -------------------------------------------------------------------------------- /metrics/nodemanager/ShuffleMetrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "ShuffleOutputBytes": "Output byte of shuffle", 3 | "ShuffleOutputsFailed": "Output failed of shuffle", 4 | "ShuffleOutputsOK": "Output ok of shuffle", 5 | "ShuffleConnections": "Connection count of shuffle" 6 | } 7 | -------------------------------------------------------------------------------- /metrics/resourcemanager/ClusterMetrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "NumActiveNMs": "Current number of active NodeManagers", 3 | "NumDecommissionedNMs": "Current number of decommissioned NodeManagers", 4 | "NumLostNMs": "Current number of lost NodeManagers for not sending heartbeats", 5 | "NumUnhealthyNMs": "Current number of unhealthy NodeManagers", 6 | "NumRebootedNMs": "Current number of rebooted NodeManagers", 7 | "AMLaunchDelayNumOps": "Total number of AMs launched", 8 | "AMLaunchDelayAvgTime": "Average time in milliseconds RM spends to launch AM containers after the AM container is allocated", 9 | "AMRegisterDelayNumOps": "Total number of AMs registered", 10 | "AMRegisterDelayAvgTime": "Average time in milliseconds AM spends to register with RM after the AM container gets launched" 11 | } -------------------------------------------------------------------------------- /metrics/resourcemanager/QueueMetrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "running_0": "Current number of running applications whose elapsed time are less than 60 minutes.", 3 | "running_60": "Current number of running applications whose elapsed 
time are between 60 and 300 minutes.", 4 | "running_300": "Current number of running applications whose elapsed time are between 300 and 1440 minutes.", 5 | "running_1440": "Current number of running applications elapsed time are more than 1440 minutes.", 6 | "AppsSubmitted": "Total number of submitted applications.", 7 | "AppsRunning": "Current number of running applications.", 8 | "AppsPending": "Current number of applications that have not yet been assigned by any containers.", 9 | "AppsCompleted": "Total number of completed applications.", 10 | "AppsKilled": "Total number of killed applications.", 11 | "AppsFailed": "Total number of failed applications.", 12 | "AllocatedMB": "Current allocated memory in MB.", 13 | "AllocatedVCores": "Current allocated CPU in virtual cores.", 14 | "AllocatedContainers": "Current number of allocated containers.", 15 | "AggregateContainersAllocated": "Total number of allocated containers.", 16 | "AggregateContainersReleased": "Total number of released containers.", 17 | "AvailableMB": "Current available memory in MB.", 18 | "AvailableVCores": "Current available CPU in virtual cores.", 19 | "PendingMB": "Current pending memory resource requests in MB that are not yet fulfilled by the scheduler.", 20 | "PendingVCores": "Current pending CPU allocation requests in virtual cores that are not yet fulfilled by the scheduler.", 21 | "PendingContainers": "Current pending resource requests that are not yet fulfilled by the scheduler.", 22 | "ReservedMB": "Current reserved memory in MB.", 23 | "ReservedVCores": "Current reserved CPU in virtual cores.", 24 | "ReservedContainers": "Current number of reserved containers.", 25 | "ActiveUsers": "Current number of active users.", 26 | "ActiveApplications": "Current number of active applications.", 27 | "FairShareMB": "(FairScheduler only) Current fair share of memory in MB.", 28 | "FairShareVCores": "(FairScheduler only) Current fair share of CPU in virtual cores.", 29 | "MinShareMB": 
"(FairScheduler only) Minimum share of memory in MB.", 30 | "MinShareVCores": "(FairScheduler only) Minimum share of CPU in virtual cores.", 31 | "MaxShareMB": "(FairScheduler only) Maximum share of memory in MB.", 32 | "MaxShareVCores": "(FairScheduler only) Maximum share of CPU in virtual cores." 33 | } -------------------------------------------------------------------------------- /metrics/resourcemanager/RMNMInfo.json: -------------------------------------------------------------------------------- 1 | { 2 | "NumContainers": "Total number of containers currently running on the host", 3 | "State": "State of the host - valid values are: NEW, RUNNING, UNHEALTHY, DECOMMISSIONED, LOST, REBOOTED", 4 | "UsedMemoryMB": "The total amount of memory currently used on the host (in MB)", 5 | "AvailableMemoryMB": "The total amount of memory currently available on the host (in MB)" 6 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | prometheus_client 3 | pyyaml 4 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import threading 5 | import requests 6 | 7 | from utils import get_module_logger 8 | 9 | 10 | logger = get_module_logger(__name__) 11 | 12 | 13 | class Scraper(threading.Thread): 14 | def __init__(self, url, result): 15 | super(Scraper, self).__init__() 16 | self.name = "thread-%s" % url 17 | self.url = url 18 | self.result = result 19 | 20 | def run(self): 21 | result = [] 22 | try: 23 | s = requests.session() 24 | response = s.get(self.url, timeout=5) 25 | except Exception as e: 26 | logger.warning("Get {0} failed, error: {1}.".format(self.url, str(e))) 27 | else: 28 | if response.status_code != requests.codes.ok: 29 | 
logger.warning("Get {0} failed, response code is: {1}.".format(self.url, response.status_code)) 30 | else: 31 | rlt = response.json() 32 | if rlt and "beans" in rlt: 33 | result = rlt['beans'] 34 | else: 35 | logger.warning("No metrics get in the {0}.".format(self.url)) 36 | s.close() 37 | if len(result) > 0: 38 | self.result.append(result) 39 | 40 | 41 | class ScrapeMetrics(object): 42 | def __init__(self, urls): 43 | self.urls = urls 44 | 45 | def scrape(self): 46 | result = [] 47 | tasks = [Scraper(url, result) for url in self.urls] 48 | for task in tasks: 49 | task.start() 50 | for task in tasks: 51 | task.join() 52 | return result 53 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import argparse 6 | import logging 7 | import yaml 8 | 9 | 10 | def get_module_logger(mod_name): 11 | logger = logging.getLogger(mod_name) 12 | logger.setLevel(logging.DEBUG) 13 | 14 | path = os.path.dirname(os.path.abspath(__file__)) 15 | par_path = os.path.dirname(path) 16 | fh = logging.FileHandler(os.path.join(par_path, "hadoop_jmx_exporter.log")) 17 | fh.setLevel(logging.INFO) 18 | 19 | sh = logging.StreamHandler() 20 | sh.setLevel(logging.INFO) 21 | 22 | fmt = logging.Formatter(fmt='%(asctime)s %(filename)s[line:%(lineno)d]-[%(levelname)s]: %(message)s') 23 | fh.setFormatter(fmt) 24 | sh.setFormatter(fmt) 25 | 26 | logger.addHandler(fh) 27 | logger.addHandler(sh) 28 | return logger 29 | 30 | 31 | logger = get_module_logger(__name__) 32 | 33 | def read_json_file(path_name, file_name): 34 | path = os.path.dirname(os.path.realpath(__file__)) 35 | metric_path = os.path.join(path, "metrics", path_name) 36 | metric_name = "{0}.json".format(file_name) 37 | try: 38 | with open(os.path.join(metric_path, metric_name), 'r') as f: 39 | metrics = yaml.safe_load(f) 40 | return metrics 41 | 
except Exception as e: 42 | logger.info("read metrics json file failed, error msg is: %s" % e) 43 | return {} 44 | 45 | 46 | def get_file_list(file_path_name): 47 | path = os.path.dirname(os.path.abspath(__file__)) 48 | json_path = os.path.join(path, "metrics", file_path_name) 49 | try: 50 | files = os.listdir(json_path) 51 | except OSError: 52 | logger.info("No such file or directory: '%s'" % json_path) 53 | return [] 54 | else: 55 | rlt = [] 56 | for i in range(len(files)): 57 | rlt.append(files[i].split(".json")[0]) 58 | return rlt 59 | 60 | 61 | def parse_args(): 62 | parser = argparse.ArgumentParser(description='hadoop jmx metric prometheus exporter') 63 | parser.add_argument('-cluster', required=True, metavar='cluster_name', help='Hadoop cluster name (maybe HA name)') 64 | parser.add_argument('-queue', required=False, metavar='yarn_queue_regexp', help='Regular expression of queue name. default: root.*', default='root.*') 65 | parser.add_argument('-nns', required=False, metavar='namenode_jmx_url', help='Hadoop hdfs namenode jmx metrics URL.', nargs="*") 66 | parser.add_argument('-rms', required=False, metavar='resourcemanager_jmx_url', help='Hadoop resourcemanager metrics jmx URL.', nargs="*") 67 | parser.add_argument('-jns', required=False, metavar='journalnode_jmx_url', help='Hadoop journalnode jmx metrics URL.', nargs="*") 68 | parser.add_argument('-host', required=False, metavar='host', help='Listen on this address. default: 0.0.0.0', default='0.0.0.0') 69 | parser.add_argument('-port', required=False, metavar='port', type=int, help='Listen to this port. 
def parse_args():
    """Define and parse the exporter's command-line interface.

    Only -cluster is mandatory; every url list (-nns/-rms/-jns) is optional
    so the exporter can be pointed at any subset of hadoop daemons.
    """
    parser = argparse.ArgumentParser(description='hadoop jmx metric prometheus exporter')
    add = parser.add_argument
    add('-cluster', required=True, metavar='cluster_name',
        help='Hadoop cluster name (maybe HA name)')
    add('-queue', required=False, metavar='yarn_queue_regexp', default='root.*',
        help='Regular expression of queue name. default: root.*')
    add('-nns', required=False, metavar='namenode_jmx_url', nargs="*",
        help='Hadoop hdfs namenode jmx metrics URL.')
    add('-rms', required=False, metavar='resourcemanager_jmx_url', nargs="*",
        help='Hadoop resourcemanager metrics jmx URL.')
    add('-jns', required=False, metavar='journalnode_jmx_url', nargs="*",
        help='Hadoop journalnode jmx metrics URL.')
    add('-host', required=False, metavar='host', default='0.0.0.0',
        help='Listen on this address. default: 0.0.0.0')
    add('-port', required=False, metavar='port', type=int, default=6688,
        help='Listen to this port. default: 6688')
    return parser.parse_args()
container_flag = 1 53 | for metric in self.metrics[service]: 54 | label = ["cluster", "host"] 55 | if metric.startswith("Containers"): 56 | if container_flag: 57 | container_flag = 0 58 | label.append("status") 59 | key = "containers" 60 | name = "_".join([self.prefix, "container_count"]) 61 | description = "Count of container" 62 | else: 63 | continue 64 | else: 65 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 66 | name = "_".join([self.prefix, snake_case]) 67 | key = metric 68 | description = self.metrics[service][metric] 69 | label.append("target") 70 | self.hadoop_nodemanager_metrics[service][key] = GaugeMetricFamily(name, description, labels=label) 71 | 72 | def get_metrics(self, beans): 73 | for i in range(len(beans)): 74 | for service in self.metrics: 75 | if service not in beans[i]['name']: 76 | continue 77 | for metric in beans[i]: 78 | if metric not in self.metrics[service]: 79 | continue 80 | label = [self.cluster, self.target] 81 | if metric.startswith("Containers"): 82 | key = "containers" 83 | label.append(metric.split("Containers")[1]) 84 | else: 85 | key = metric 86 | label.append(self.target) 87 | value = beans[i][metric] if beans[i][metric] > 0 else 0 # incase vcore or memory < 0 88 | self.hadoop_nodemanager_metrics[service][key].add_metric(label, value) 89 | -------------------------------------------------------------------------------- /yarn_resourcemanager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import yaml 5 | import re 6 | from prometheus_client.core import GaugeMetricFamily 7 | 8 | from utils import get_module_logger 9 | from common import MetricCollector, CommonMetricCollector 10 | from scraper import ScrapeMetrics 11 | 12 | logger = get_module_logger(__name__) 13 | 14 | 15 | class ResourceManagerMetricCollector(MetricCollector): 16 | 17 | NODE_STATE = { 18 | 'NEW': 1, 19 | 'RUNNING': 2, 20 | 'UNHEALTHY': 3, 21 
| 'DECOMMISSIONED': 4, 22 | 'LOST': 5, 23 | 'REBOOTED': 6, 24 | } 25 | 26 | def __init__(self, cluster, urls, queue_regexp): 27 | MetricCollector.__init__(self, cluster, "yarn", "resourcemanager") 28 | self.target = "-" 29 | self.queue_regexp = queue_regexp 30 | self.nms = set() 31 | 32 | self.hadoop_resourcemanager_metrics = {} 33 | for i in range(len(self.file_list)): 34 | self.hadoop_resourcemanager_metrics.setdefault(self.file_list[i], {}) 35 | 36 | self.common_metric_collector = CommonMetricCollector(cluster, "yarn", "resourcemanager") 37 | 38 | self.scrape_metrics = ScrapeMetrics(urls) 39 | 40 | def collect(self): 41 | isSetup = False 42 | beans_list = self.scrape_metrics.scrape() 43 | for beans in beans_list: 44 | if not isSetup: 45 | self.common_metric_collector.setup_labels(beans) 46 | self.setup_metrics_labels(beans) 47 | isSetup = True 48 | for i in range(len(beans)): 49 | if 'tag.Hostname' in beans[i]: 50 | self.target = beans[i]["tag.Hostname"] 51 | break 52 | self.hadoop_resourcemanager_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) 53 | self.get_metrics(beans) 54 | 55 | for i in range(len(self.merge_list)): 56 | service = self.merge_list[i] 57 | if service in self.hadoop_resourcemanager_metrics: 58 | for metric in self.hadoop_resourcemanager_metrics[service]: 59 | yield self.hadoop_resourcemanager_metrics[service][metric] 60 | 61 | def setup_rmnminfo_labels(self): 62 | for metric in self.metrics['RMNMInfo']: 63 | label = ["cluster", "host", "version", "rack", "_target"] 64 | if 'NumContainers' in metric: 65 | name = "_".join([self.prefix, 'node_containers_total']) 66 | elif 'State' in metric: 67 | name = "_".join([self.prefix, 'node_state']) 68 | elif 'UsedMemoryMB' in metric: 69 | name = "_".join([self.prefix, 'node_memory_used_mb']) 70 | elif 'AvailableMemoryMB' in metric: 71 | name = "_".join([self.prefix, 'node_memory_available_mb']) 72 | else: 73 | continue 74 | 
self.hadoop_resourcemanager_metrics['RMNMInfo'][metric] = GaugeMetricFamily(name, self.metrics['RMNMInfo'][metric], labels=label) 75 | 76 | def setup_queue_labels(self): 77 | running_flag, mb_flag, vcore_flag, container_flag, apps_flag = 1, 1, 1, 1, 1 78 | for metric in self.metrics['QueueMetrics']: 79 | label = ["cluster", "modeler_type", "queue", "user"] 80 | if "running_" in metric: 81 | if running_flag: 82 | running_flag = 0 83 | label.append("elapsed_time") 84 | key = "running_app" 85 | name = "_".join([self.prefix, "running_app_total"]) 86 | description = "Current number of running applications in each elapsed time ( < 60min, 60min < x < 300min, 300min < x < 1440min and x > 1440min )" 87 | else: 88 | continue 89 | elif metric.endswith("VCores"): 90 | if vcore_flag: 91 | vcore_flag = 0 92 | label.append("status") 93 | key = "vcore" 94 | name = "_".join([self.prefix, "vcore_count"]) 95 | description = "Count of vcore" 96 | else: 97 | continue 98 | elif metric.endswith("Containers"): 99 | if container_flag: 100 | container_flag = 0 101 | label.append("status") 102 | key = "containers" 103 | name = "_".join([self.prefix, "container_count"]) 104 | description = "Count of container" 105 | else: 106 | continue 107 | elif metric.endswith("MB"): 108 | if mb_flag: 109 | mb_flag = 0 110 | label.append("status") 111 | key = "memory" 112 | name = "_".join([self.prefix, "memory_in_mb"]) 113 | description = "Memory in MB" 114 | else: 115 | continue 116 | elif metric.startswith("Apps"): 117 | if apps_flag: 118 | apps_flag = 0 119 | label.append("status") 120 | key = "apps" 121 | name = "_".join([self.prefix, "application_count"]) 122 | description = "Count of application" 123 | else: 124 | continue 125 | else: 126 | key = metric 127 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 128 | name = "_".join([self.prefix, snake_case]) 129 | description = self.metrics['QueueMetrics'][metric] 130 | label.append("_target") 131 | 
self.hadoop_resourcemanager_metrics['QueueMetrics'][key] = GaugeMetricFamily(name, description, labels=label) 132 | 133 | def setup_cluster_labels(self): 134 | nm_flag, cm_num_flag, cm_avg_flag = 1, 1, 1 135 | for metric in self.metrics['ClusterMetrics']: 136 | if "NMs" in metric: 137 | if nm_flag: 138 | nm_flag = 0 139 | label = ["cluster", "status"] 140 | key = "NMs" 141 | name = "nodemanager_total" 142 | description = "Current number of NodeManagers in each status" 143 | else: 144 | continue 145 | elif "NumOps" in metric: 146 | if cm_num_flag: 147 | cm_num_flag = 0 148 | label = ["cluster", "oper"] 149 | key = "NumOps" 150 | name = "ams_total" 151 | description = "Total number of Applications Masters in each operation" 152 | else: 153 | continue 154 | elif "AvgTime" in metric: 155 | if cm_avg_flag: 156 | cm_avg_flag = 0 157 | label = ["cluster", "oper"] 158 | key = "AvgTime" 159 | name = "average_time_milliseconds" 160 | description = "Average time in milliseconds AM spends in each operation" 161 | else: 162 | continue 163 | else: 164 | key = metric 165 | name = metric 166 | description = self.metrics['ClusterMetrics'][metric] 167 | label = ["cluster"] 168 | label.append("_target") 169 | self.hadoop_resourcemanager_metrics['ClusterMetrics'][key] = GaugeMetricFamily("_".join([self.prefix, name]), description, labels=label) 170 | 171 | def setup_metrics_labels(self, beans): 172 | for i in range(len(beans)): 173 | if 'RMNMInfo' in beans[i]['name']: 174 | self.setup_rmnminfo_labels() 175 | if 'QueueMetrics' in self.metrics: 176 | self.setup_queue_labels() 177 | if 'ClusterMetrics' in self.metrics: 178 | self.setup_cluster_labels() 179 | 180 | def get_rmnminfo_metrics(self, bean): 181 | for metric in self.metrics['RMNMInfo']: 182 | nms = set() 183 | live_nm_list = yaml.safe_load(bean['LiveNodeManagers']) 184 | for j in range(len(live_nm_list)): 185 | nms.add("http://"+live_nm_list[j]["NodeHTTPAddress"]+"/jmx") 186 | host = live_nm_list[j]['HostName'] 187 | version = 
live_nm_list[j]['NodeManagerVersion'] 188 | rack = live_nm_list[j]['Rack'] 189 | label = [self.cluster, host, version, rack, self.target] 190 | if 'State' == metric: 191 | value = self.NODE_STATE[live_nm_list[j]['State']] 192 | else: 193 | value = live_nm_list[j][metric] if metric in live_nm_list[j] else 0.0 194 | self.hadoop_resourcemanager_metrics['RMNMInfo'][metric].add_metric(label, value) 195 | self.nms = nms 196 | 197 | def get_queue_metrics(self, bean): 198 | for metric in self.metrics['QueueMetrics']: 199 | label = [self.cluster, bean.get("modelerType", "-"), bean.get("tag.Queue", "-"), bean.get("tag.User", "-")] 200 | if "running_0" in metric: 201 | key = "running_app" 202 | label.append("0to60") 203 | elif "running_60" in metric: 204 | key = "running_app" 205 | label.append("60to300") 206 | elif "running_300" in metric: 207 | key = "running_app" 208 | label.append("300to1440") 209 | elif "running_1440" in metric: 210 | key = "running_app" 211 | label.append("1440up") 212 | elif metric.endswith("VCores"): 213 | label.append(metric.split("VCores")[0]) 214 | key = "vcore" 215 | elif metric.endswith("Containers"): 216 | label.append(metric.split("Containers")[0]) 217 | key = "containers" 218 | elif metric.endswith("MB"): 219 | label.append(metric.split("MB")[0]) 220 | key = "memory" 221 | elif metric.startswith("Apps"): 222 | label.append(metric.split("Apps")[1]) 223 | key = "apps" 224 | else: 225 | key = metric 226 | label.append(self.target) 227 | self.hadoop_resourcemanager_metrics['QueueMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) 228 | 229 | def get_cluster_metrics(self, bean): 230 | for metric in self.metrics['ClusterMetrics']: 231 | label = [self.cluster] 232 | if "NMs" in metric: 233 | label.append(metric.split('NMs')[0].split('Num')[1]) 234 | key = "NMs" 235 | elif "NumOps" in metric: 236 | key = "NumOps" 237 | label.append(metric.split("DelayNumOps")[0].split('AM')[1]) 238 | elif "AvgTime" in metric: 239 | key = "AvgTime" 
240 | label.append(metric.split("DelayAvgTime")[0].split('AM')[1]) 241 | else: 242 | continue 243 | label.append(self.target) 244 | self.hadoop_resourcemanager_metrics['ClusterMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) 245 | 246 | def get_metrics(self, beans): 247 | for i in range(len(beans)): 248 | if 'RMNMInfo' in beans[i]['name']: 249 | self.get_rmnminfo_metrics(beans[i]) 250 | if 'name=QueueMetrics' in beans[i]['name'] and re.match(self.queue_regexp, beans[i]['tag.Queue']): 251 | self.get_queue_metrics(beans[i]) 252 | if 'ClusterMetrics' in beans[i]['name']: 253 | self.get_cluster_metrics(beans[i]) 254 | --------------------------------------------------------------------------------