├── .gitignore ├── README.md ├── __init__.py ├── common.py ├── dashboards ├── HDFS-DataNode-1588759242237.json ├── HDFS-NameNode-1588759254552.json ├── HDFS-NameNode.png ├── YARN-NodeManager-1588759264873.json ├── YARN-ResourceManager-1588759276429.json └── hadoop_monitoring.mp4 ├── examples ├── DataNode.json ├── JobHistoryServer.json ├── NameNode.json ├── NodeManager.json ├── ResouceManager.json └── all_metrics.txt ├── hadoop_jmx_exporter.py ├── hdfs_datanode.py ├── hdfs_journalnode.py ├── hdfs_namenode.py ├── metrics ├── common │ ├── JvmMetrics.json │ ├── MetricsSystem.json │ ├── OperatingSystem.json │ ├── RpcActivity.json │ ├── RpcDetailedActivity.json │ ├── Runtime.json │ └── UgiMetrics.json ├── datanode │ ├── DataNodeActivity.json │ ├── DataNodeInfo.json │ └── FSDatasetState.json ├── journalnode │ └── JournalNode.json ├── namenode │ ├── FSNamesystem.json │ ├── FSNamesystemState.json │ ├── NameNodeActivity.json │ ├── NameNodeInfo.json │ ├── RetryCache.json │ └── StartupProgress.json ├── nodemanager │ ├── NodeManagerMetrics.json │ └── ShuffleMetrics.json └── resourcemanager │ ├── ClusterMetrics.json │ ├── QueueMetrics.json │ └── RMNMInfo.json ├── requirements.txt ├── scraper.py ├── utils.py ├── yarn_nodemanager.py └── yarn_resourcemanager.py /.gitignore: -------------------------------------------------------------------------------- 1 | ^# 2 | *.pyc 3 | *.log 4 | .vscode -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # hadoop_jmx_exporter 2 | 3 | Hadoop HDFS & YARN jmx metrics prometheus exporter. 4 | 5 | All metrics can be found [here](./examples/all_metrics.txt). 6 | 7 | Grafana dashboards json file and a short video demo in [dashboards](./dashboards) directory. 8 | 9 | Tested on CDH 5.14.2. 
10 | 11 | ![HDFS-Namenode](./dashboards/HDFS-NameNode.png) 12 | 13 | # Run 14 | 15 | ``` bash 16 | ➜ hadoop_jmx_exporter git:(master) ✗ pip2 install -r requirements.txt 17 | 18 | ➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py -h 19 | usage: hadoop_jmx_exporter.py [-h] -cluster cluster_name 20 | [-queue yarn_queue_regexp] 21 | [-nns [namenode_jmx_url [namenode_jmx_url ...]]] 22 | [-rms [resourcemanager_jmx_url [resourcemanager_jmx_url ...]]] 23 | [-jns [journalnode_jmx_url [journalnode_jmx_url ...]]] 24 | [-host host] [-port port] 25 | 26 | hadoop jmx metric prometheus exporter 27 | 28 | optional arguments: 29 | -h, --help show this help message and exit 30 | -cluster cluster_name 31 | Hadoop cluster name (maybe HA name) 32 | -queue yarn_queue_regexp 33 | Regular expression of queue name. default: root.* 34 | -nns [namenode_jmx_url [namenode_jmx_url ...]] 35 | Hadoop hdfs namenode jmx metrics URL. 36 | -rms [resourcemanager_jmx_url [resourcemanager_jmx_url ...]] 37 | Hadoop resourcemanager metrics jmx URL. 38 | -jns [journalnode_jmx_url [journalnode_jmx_url ...]] 39 | Hadoop journalnode jmx metrics URL. 40 | -host host Listen on this address. default: 0.0.0.0 41 | -port port Listen to this port. default: 6688 42 | ➜ hadoop_exporter git:(master) ✗ 43 | 44 | ➜ hadoop_exporter git:(master) ✗ python2 hadoop_jmx_exporter.py -cluster yh-cdh -nns http://10.193.40.10:50070/jmx http://10.193.40.3:50070/jmx -rms http://yh-shhd-cdh04:8088/jmx http://yh-shhd-cdh01:8088/jmx 45 | Listen at 0.0.0.0:6688 46 | ``` 47 | 48 | Open your browser to view metrics: `http://127.0.0.1:6688/metrics`. 49 | 50 | # Reference 51 | 52 | 1. https://github.com/cauwulixuan/hadoop_exporter 53 | 2. http://hadoop.apache.org/docs/r2.7.3/hadoop-project-dist/hadoop-common/Metrics.html#namenode 54 | 3. https://docs.cloudera.com/HDPDocuments/Ambari-2.7.5.0/using-ambari-core-services/content/amb_hdfs_users.html 55 | 4. 
https://www.datadoghq.com/blog/collecting-hadoop-metrics/ 56 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opsnull/hadoop_jmx_exporter/939a27889134d4decef7cc7cb067cb1eba9e4d10/__init__.py -------------------------------------------------------------------------------- /common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from prometheus_client.core import GaugeMetricFamily 6 | 7 | import utils 8 | 9 | 10 | logger = utils.get_module_logger(__name__) 11 | 12 | 13 | class MetricCollector(object): 14 | def __init__(self, cluster, component, service): 15 | self.cluster = cluster 16 | self.component = component 17 | self.prefix = 'hadoop_{0}_{1}'.format(component, service) 18 | 19 | self.file_list = utils.get_file_list(service) 20 | self.metrics = {} 21 | for i in range(len(self.file_list)): 22 | self.metrics.setdefault(self.file_list[i], utils.read_json_file(service, self.file_list[i])) 23 | 24 | common_file = utils.get_file_list("common") 25 | self.merge_list = self.file_list + common_file 26 | 27 | def collect(self): 28 | pass 29 | 30 | def _setup_metrics_labels(self): 31 | pass 32 | 33 | def _get_metrics(self, metrics): 34 | pass 35 | 36 | 37 | class CommonMetricCollector(): 38 | def __init__(self, cluster, component, service): 39 | self.cluster = cluster 40 | self.componet = component 41 | self.service = service 42 | self.prefix = 'hadoop_{0}_{1}'.format(component, service) 43 | self.common_metrics = {} 44 | self.tmp_metrics = {} 45 | file_list = utils.get_file_list("common") 46 | for i in range(len(file_list)): 47 | self.common_metrics.setdefault(file_list[i], {}) 48 | self.tmp_metrics.setdefault(file_list[i], utils.read_json_file("common", file_list[i])) 49 | 50 | def setup_labels(self, 
beans): 51 | for i in range(len(beans)): 52 | if 'name=JvmMetrics' in beans[i]['name']: 53 | self.setup_jvm_labels() 54 | if 'OperatingSystem' in beans[i]['name']: 55 | self.setup_os_labels() 56 | if 'RpcActivity' in beans[i]['name']: 57 | self.setup_rpc_labels() 58 | if 'RpcDetailedActivity' in beans[i]['name']: 59 | self.setup_rpc_detailed_labels() 60 | if 'UgiMetrics' in beans[i]['name']: 61 | self.setup_ugi_labels() 62 | if 'MetricsSystem' in beans[i]['name'] and "sub=Stats" in beans[i]['name']: 63 | self.setup_metric_system_labels() 64 | if 'Runtime' in beans[i]['name']: 65 | self.setup_runtime_labels() 66 | 67 | def get_metrics(self, beans, target): 68 | self.target = target 69 | for i in range(len(beans)): 70 | if 'name=JvmMetrics' in beans[i]['name']: 71 | self.get_jvm_metrics(beans[i]) 72 | if 'OperatingSystem' in beans[i]['name']: 73 | self.get_os_metrics(beans[i]) 74 | if 'RpcActivity' in beans[i]['name']: 75 | self.get_rpc_metrics(beans[i]) 76 | if 'RpcDetailedActivity' in beans[i]['name']: 77 | self.get_rpc_detailed_metrics(beans[i]) 78 | if 'UgiMetrics' in beans[i]['name']: 79 | self.get_ugi_metrics(beans[i]) 80 | if 'MetricsSystem' in beans[i]['name'] and "sub=Stats" in beans[i]['name']: 81 | self.get_metric_system_metrics(beans[i]) 82 | if 'Runtime' in beans[i]['name']: 83 | self.get_runtime_metrics(beans[i]) 84 | return self.common_metrics 85 | 86 | def setup_jvm_labels(self): 87 | for metric in self.tmp_metrics["JvmMetrics"]: 88 | snake_case = "_".join(["jvm", re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()]) 89 | if 'Mem' in metric: 90 | name = "".join([snake_case, "ebibytes"]) 91 | label = ["cluster", "mode"] 92 | if "Used" in metric: 93 | key = "jvm_mem_used_mebibytes" 94 | descriptions = "Current memory used in mebibytes." 95 | elif "Committed" in metric: 96 | key = "jvm_mem_committed_mebibytes" 97 | descriptions = "Current memory committed in mebibytes." 
98 | elif "Max" in metric: 99 | key = "jvm_mem_max_mebibytes" 100 | descriptions = "Current max memory in mebibytes." 101 | else: 102 | key = name 103 | label = ["cluster"] 104 | descriptions = self.tmp_metrics['JvmMetrics'][metric] 105 | elif 'Gc' in metric: 106 | label = ["cluster", "type"] 107 | if "GcCount" in metric: 108 | key = "jvm_gc_count" 109 | descriptions = "GC count of each type GC." 110 | elif "GcTimeMillis" in metric: 111 | key = "jvm_gc_time_milliseconds" 112 | descriptions = "Each type GC time in milliseconds." 113 | elif "ThresholdExceeded" in metric: 114 | key = "jvm_gc_exceeded_threshold_total" 115 | descriptions = "Number of times that the GC threshold is exceeded." 116 | else: 117 | key = snake_case 118 | label = ["cluster"] 119 | descriptions = self.tmp_metrics['JvmMetrics'][metric] 120 | elif 'Threads' in metric: 121 | label = ["cluster", "state"] 122 | key = "jvm_threads_state_total" 123 | descriptions = "Current number of different threads." 124 | elif 'Log' in metric: 125 | label = ["cluster", "level"] 126 | key = "jvm_log_level_total" 127 | descriptions = "Total number of each level logs." 
128 | else: 129 | label = ["cluster"] 130 | key = snake_case 131 | descriptions = self.tmp_metrics['JvmMetrics'][metric] 132 | label.append("_target") 133 | self.common_metrics['JvmMetrics'][key] = GaugeMetricFamily("_".join([self.prefix, key]), descriptions, labels=label) 134 | 135 | def setup_os_labels(self): 136 | for metric in self.tmp_metrics['OperatingSystem']: 137 | label = ["cluster", "_target"] 138 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 139 | name = "_".join([self.prefix, snake_case]) 140 | self.common_metrics['OperatingSystem'][metric] = GaugeMetricFamily(name, self.tmp_metrics['OperatingSystem'][metric], labels=label) 141 | 142 | def setup_rpc_labels(self): 143 | num_rpc_flag, avg_rpc_flag = 1, 1 144 | for metric in self.tmp_metrics["RpcActivity"]: 145 | snake_case = "_".join(["rpc", re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()]) 146 | if 'Rpc' in metric: 147 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 148 | label = ["cluster", "tag"] 149 | if "NumOps" in metric: 150 | if num_rpc_flag: 151 | key = "MethodNumOps" 152 | label.extend(["method", "_target"]) 153 | name = "_".join([self.prefix, "rpc_method_called_total"]) 154 | description = "Total number of the times the method is called." 155 | self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, description, labels=label) 156 | num_rpc_flag = 0 157 | else: 158 | continue 159 | elif "AvgTime" in metric: 160 | if avg_rpc_flag: 161 | key = "MethodAvgTime" 162 | label.extend(["method", "_target"]) 163 | name = "_".join([self.prefix, "rpc_method_avg_time_milliseconds"]) 164 | descrption = "Average turn around time of the method in milliseconds." 
165 | self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, descrption, labels=label) 166 | avg_rpc_flag = 0 167 | else: 168 | continue 169 | else: 170 | key = metric 171 | label.append("_target") 172 | name = "_".join([self.prefix, snake_case]) 173 | self.common_metrics['RpcActivity'][key] = GaugeMetricFamily(name, self.tmp_metrics['RpcActivity'][metric], labels=label) 174 | 175 | def setup_rpc_detailed_labels(self): 176 | for metric in self.tmp_metrics['RpcDetailedActivity']: 177 | label = ["cluster", "tag", "method", "_target"] 178 | if "NumOps" in metric: 179 | key = "NumOps" 180 | name = "_".join([self.prefix, 'rpc_detailed_method_called_total']) 181 | elif "AvgTime" in metric: 182 | key = "AvgTime" 183 | name = "_".join([self.prefix, 'rpc_detailed_method_avg_time_milliseconds']) 184 | else: 185 | continue 186 | self.common_metrics['RpcDetailedActivity'][key] = GaugeMetricFamily(name, self.tmp_metrics['RpcDetailedActivity'][metric], labels=label) 187 | return self.common_metrics 188 | 189 | def setup_ugi_labels(self): 190 | ugi_num_flag, ugi_avg_flag = 1, 1 191 | for metric in self.tmp_metrics['UgiMetrics']: 192 | label = ["cluster"] 193 | if 'NumOps' in metric: 194 | if ugi_num_flag: 195 | key = 'NumOps' 196 | label.extend(["method", "state", "_target"]) 197 | ugi_num_flag = 0 198 | name = "_".join([self.prefix, 'ugi_method_called_total']) 199 | description = "Total number of the times the method is called." 200 | self.common_metrics['UgiMetrics'][key] = GaugeMetricFamily(name, description, labels=label) 201 | else: 202 | continue 203 | elif 'AvgTime' in metric: 204 | if ugi_avg_flag: 205 | key = 'AvgTime' 206 | label.extend(["method", "state", "_target"]) 207 | ugi_avg_flag = 0 208 | name = "_".join([self.prefix, 'ugi_method_avg_time_milliseconds']) 209 | description = "Average turn around time of the method in milliseconds." 
210 | self.common_metrics['UgiMetrics'][key] = GaugeMetricFamily(name, description, labels=label) 211 | else: 212 | continue 213 | else: 214 | label.append("_target") 215 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 216 | name = "_".join([self.prefix, 'ugi', snake_case]) 217 | self.common_metrics['UgiMetrics'][metric] = GaugeMetricFamily(name, self.tmp_metrics['UgiMetrics'][metric], labels=label) 218 | 219 | def setup_metric_system_labels(self): 220 | metric_num_flag, metric_avg_flag = 1, 1 221 | for metric in self.tmp_metrics['MetricsSystem']: 222 | label = ["cluster"] 223 | if 'NumOps' in metric: 224 | if metric_num_flag: 225 | key = 'NumOps' 226 | label.extend(["oper", "_target"]) 227 | metric_num_flag = 0 228 | name = "_".join([self.prefix, 'metricssystem_operations_total']) 229 | self.common_metrics['MetricsSystem'][key] = GaugeMetricFamily(name, "Total number of operations", labels=label) 230 | else: 231 | continue 232 | elif 'AvgTime' in metric: 233 | if metric_avg_flag: 234 | key = 'AvgTime' 235 | label.extend(["oper", "_target"]) 236 | metric_avg_flag = 0 237 | name = "_".join([self.prefix, 'metricssystem_method_avg_time_milliseconds']) 238 | description = "Average turn around time of the operations in milliseconds." 
239 | self.common_metrics['MetricsSystem'][key] = GaugeMetricFamily(name, description, labels=label) 240 | else: 241 | continue 242 | else: 243 | label.append("_target") 244 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 245 | name = "_".join([self.prefix, 'metricssystem', snake_case]) 246 | self.common_metrics['MetricsSystem'][metric] = GaugeMetricFamily(name, self.tmp_metrics['MetricsSystem'][metric], labels=label) 247 | 248 | def setup_runtime_labels(self): 249 | for metric in self.tmp_metrics['Runtime']: 250 | label = ["cluster", "host", "_target"] 251 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 252 | name = "_".join([self.prefix, snake_case, "milliseconds"]) 253 | self.common_metrics['Runtime'][metric] = GaugeMetricFamily(name, self.tmp_metrics['Runtime'][metric], labels=label) 254 | 255 | def get_jvm_metrics(self, bean): 256 | for metric in self.tmp_metrics['JvmMetrics']: 257 | name = "_".join(["jvm", re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()]) 258 | if 'Mem' in metric: 259 | if "Used" in metric: 260 | key = "jvm_mem_used_mebibytes" 261 | mode = metric.split("Used")[0].split("Mem")[1] 262 | label = [self.cluster, mode] 263 | elif "Committed" in metric: 264 | key = "jvm_mem_committed_mebibytes" 265 | mode = metric.split("Committed")[0].split("Mem")[1] 266 | label = [self.cluster, mode] 267 | elif "Max" in metric: 268 | key = "jvm_mem_max_mebibytes" 269 | if "Heap" in metric: 270 | mode = metric.split("Max")[0].split("Mem")[1] 271 | else: 272 | mode = "max" 273 | label = [self.cluster, mode] 274 | else: 275 | key = "".join([name, 'ebibytes']) 276 | label = [self.cluster] 277 | elif 'Gc' in metric: 278 | if "GcCount" in metric: 279 | key = "jvm_gc_count" 280 | if "GcCount" == metric: 281 | typo = "total" 282 | else: 283 | typo = metric.split("GcCount")[1] 284 | label = [self.cluster, typo] 285 | elif "GcTimeMillis" in metric: 286 | key = "jvm_gc_time_milliseconds" 287 | if "GcTimeMillis" == metric: 288 | 
typo = "total" 289 | else: 290 | typo = metric.split("GcTimeMillis")[1] 291 | label = [self.cluster, typo] 292 | elif "ThresholdExceeded" in metric: 293 | key = "jvm_gc_exceeded_threshold_total" 294 | typo = metric.split("ThresholdExceeded")[ 295 | 0].split("GcNum")[1] 296 | label = [self.cluster, typo] 297 | else: 298 | key = name 299 | label = [self.cluster] 300 | elif 'Threads' in metric: 301 | key = "jvm_threads_state_total" 302 | state = metric.split("Threads")[1] 303 | label = [self.cluster, state] 304 | elif 'Log' in metric: 305 | key = "jvm_log_level_total" 306 | level = metric.split("Log")[1] 307 | label = [self.cluster, level] 308 | else: 309 | key = name 310 | label = [self.cluster] 311 | label.append(self.target) 312 | self.common_metrics['JvmMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) 313 | 314 | def get_os_metrics(self, bean): 315 | for metric in self.tmp_metrics['OperatingSystem']: 316 | label = [self.cluster] 317 | label.append(self.target) 318 | self.common_metrics['OperatingSystem'][metric].add_metric(label, bean[metric] if metric in bean else 0) 319 | 320 | def get_rpc_metrics(self, bean): 321 | rpc_tag = bean['tag.port'] 322 | for metric in self.tmp_metrics['RpcActivity']: 323 | if "NumOps" in metric: 324 | method = metric.split('NumOps')[0] 325 | label = [self.cluster, rpc_tag, method] 326 | key = "MethodNumOps" 327 | elif "AvgTime" in metric: 328 | method = metric.split('AvgTime')[0] 329 | label = [self.cluster, rpc_tag, method] 330 | key = "MethodAvgTime" 331 | else: 332 | label = [self.cluster, rpc_tag] 333 | key = metric 334 | label.append(self.target) 335 | self.common_metrics['RpcActivity'][key].add_metric(label, bean[metric] if metric in bean else 0) 336 | 337 | def get_rpc_detailed_metrics(self, bean): 338 | detail_tag = bean['tag.port'] 339 | for metric in bean: 340 | if metric[0].isupper(): 341 | if "NumOps" in metric: 342 | key = "NumOps" 343 | method = metric.split('NumOps')[0] 344 | elif "AvgTime" in 
metric: 345 | key = "AvgTime" 346 | method = metric.split("AvgTime")[0] 347 | else: 348 | continue 349 | label = [self.cluster, detail_tag, method, self.target] 350 | self.common_metrics['RpcDetailedActivity'][key].add_metric(label, bean[metric]) 351 | 352 | def get_ugi_metrics(self, bean): 353 | for metric in self.tmp_metrics['UgiMetrics']: 354 | if 'NumOps' in metric: 355 | key = 'NumOps' 356 | if 'Login' in metric: 357 | method = 'Login' 358 | state = metric.split('Login')[1].split('NumOps')[0] 359 | label = [self.cluster, method, state] 360 | else: 361 | method = metric.split('NumOps')[0] 362 | label = [self.cluster, method, "-"] 363 | elif 'AvgTime' in metric: 364 | key = 'AvgTime' 365 | if 'Login' in metric: 366 | method = 'Login' 367 | state = metric.split('Login')[1].split('AvgTime')[0] 368 | label = [self.cluster, method, state] 369 | else: 370 | method = metric.split('AvgTime')[0] 371 | label = [self.cluster, method, "-"] 372 | else: 373 | key = metric 374 | label = [self.cluster] 375 | label.append(self.target) 376 | self.common_metrics['UgiMetrics'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) 377 | 378 | def get_metric_system_metrics(self, bean): 379 | for metric in self.tmp_metrics['MetricsSystem']: 380 | if 'NumOps' in metric: 381 | key = 'NumOps' 382 | oper = metric.split('NumOps')[0] 383 | label = [self.cluster, oper] 384 | elif 'AvgTime' in metric: 385 | key = 'AvgTime' 386 | oper = metric.split('AvgTime')[0] 387 | label = [self.cluster, oper] 388 | else: 389 | key = metric 390 | label = [self.cluster] 391 | label.append(self.target) 392 | self.common_metrics['MetricsSystem'][key].add_metric(label, bean[metric] if metric in bean and bean[metric] else 0) 393 | 394 | def get_runtime_metrics(self, bean): 395 | for metric in self.tmp_metrics['Runtime']: 396 | label = [self.cluster, bean['Name'].split("@")[1], self.target] 397 | self.common_metrics['Runtime'][metric].add_metric(label, bean[metric] if metric in bean and 
bean[metric] else 0) 398 | -------------------------------------------------------------------------------- /dashboards/HDFS-NameNode.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opsnull/hadoop_jmx_exporter/939a27889134d4decef7cc7cb067cb1eba9e4d10/dashboards/HDFS-NameNode.png -------------------------------------------------------------------------------- /dashboards/hadoop_monitoring.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/opsnull/hadoop_jmx_exporter/939a27889134d4decef7cc7cb067cb1eba9e4d10/dashboards/hadoop_monitoring.mp4 -------------------------------------------------------------------------------- /examples/DataNode.json: -------------------------------------------------------------------------------- 1 | { 2 | "beans": [ 3 | { 4 | "name": "Hadoop:service=DataNode,name=JvmMetrics", 5 | "modelerType": "JvmMetrics", 6 | "tag.Context": "jvm", 7 | "tag.ProcessName": "DataNode", 8 | "tag.SessionId": null, 9 | "tag.Hostname": "yh-shhd-cdh01", 10 | "MemNonHeapUsedM": 78.327324, 11 | "MemNonHeapCommittedM": 79.87109, 12 | "MemNonHeapMaxM": -1, 13 | "MemHeapUsedM": 1307.1587, 14 | "MemHeapCommittedM": 1979.75, 15 | "MemHeapMaxM": 1979.75, 16 | "MemMaxM": 1979.75, 17 | "GcCountParNew": 5222, 18 | "GcTimeMillisParNew": 532221, 19 | "GcCountConcurrentMarkSweep": 92, 20 | "GcTimeMillisConcurrentMarkSweep": 7191, 21 | "GcCount": 5314, 22 | "GcTimeMillis": 539412, 23 | "GcNumWarnThresholdExceeded": 0, 24 | "GcNumInfoThresholdExceeded": 0, 25 | "GcTotalExtraSleepTime": 79593, 26 | "ThreadsNew": 0, 27 | "ThreadsRunnable": 186, 28 | "ThreadsBlocked": 0, 29 | "ThreadsWaiting": 20, 30 | "ThreadsTimedWaiting": 31, 31 | "ThreadsTerminated": 0, 32 | "LogFatal": 0, 33 | "LogError": 4779, 34 | "LogWarn": 466, 35 | "LogInfo": 4585284 36 | }, 37 | { 38 | "name": 
"Hadoop:service=DataNode,name=FSDatasetState-2d7d9029-dcdc-404a-9d98-cb72ad235493", 39 | "modelerType": "org.apache.hadoop.hdfs.server.datanode.fsdataset.impl.FsDatasetImpl", 40 | "Remaining": 21524598985077, 41 | "DfsUsed": 8583846612190, 42 | "Capacity": 31710403805184, 43 | "LastVolumeFailureDate": 0, 44 | "EstimatedCapacityLostTotal": 0, 45 | "CacheUsed": 0, 46 | "CacheCapacity": 4294967296, 47 | "NumBlocksCached": 0, 48 | "NumBlocksFailedToCache": 0, 49 | "NumBlocksFailedToUncache": 509262, 50 | "NumFailedVolumes": 0, 51 | "FailedStorageLocations": [], 52 | "StorageInfo": "FSDataset{dirpath='[/mnt/disk0/dfs/dn/current, /mnt/disk1/dfs/dn/current, /mnt/disk2/dfs/dn/current, /mnt/disk3/dfs/dn/current]'}" 53 | }, 54 | { 55 | "name": "Hadoop:service=DataNode,name=UgiMetrics", 56 | "modelerType": "UgiMetrics", 57 | "tag.Context": "ugi", 58 | "tag.Hostname": "yh-shhd-cdh01", 59 | "LoginSuccessNumOps": 0, 60 | "LoginSuccessAvgTime": 0, 61 | "LoginFailureNumOps": 0, 62 | "LoginFailureAvgTime": 0, 63 | "GetGroupsNumOps": 0, 64 | "GetGroupsAvgTime": 0, 65 | "RenewalFailuresTotal": 0, 66 | "RenewalFailures": 0 67 | }, 68 | { 69 | "name": "Hadoop:service=DataNode,name=DataNodeInfo", 70 | "modelerType": "org.apache.hadoop.hdfs.server.datanode.DataNode", 71 | "Version": "2.6.0-cdh5.14.2", 72 | "XceiverCount": 26, 73 | "DatanodeNetworkCounts": [ 74 | { 75 | "key": "/10.193.40.4", 76 | "value": [ 77 | { 78 | "key": "networkErrors", 79 | "value": 27 80 | } 81 | ] 82 | }, 83 | { 84 | "key": "/10.193.40.9", 85 | "value": [ 86 | { 87 | "key": "networkErrors", 88 | "value": 35305 89 | } 90 | ] 91 | }, 92 | { 93 | "key": "/10.193.48.9", 94 | "value": [ 95 | { 96 | "key": "networkErrors", 97 | "value": 15 98 | } 99 | ] 100 | }, 101 | { 102 | "key": "/10.193.40.1", 103 | "value": [ 104 | { 105 | "key": "networkErrors", 106 | "value": 2500 107 | } 108 | ] 109 | }, 110 | { 111 | "key": "/10.193.40.3", 112 | "value": [ 113 | { 114 | "key": "networkErrors", 115 | "value": 2545 116 | } 117 
| ] 118 | }, 119 | { 120 | "key": "/10.193.33.1", 121 | "value": [ 122 | { 123 | "key": "networkErrors", 124 | "value": 4 125 | } 126 | ] 127 | }, 128 | { 129 | "key": "/10.193.40.10", 130 | "value": [ 131 | { 132 | "key": "networkErrors", 133 | "value": 2705 134 | } 135 | ] 136 | }, 137 | { 138 | "key": "/10.193.40.5", 139 | "value": [ 140 | { 141 | "key": "networkErrors", 142 | "value": 33 143 | } 144 | ] 145 | }, 146 | { 147 | "key": "/10.193.40.2", 148 | "value": [ 149 | { 150 | "key": "networkErrors", 151 | "value": 3658 152 | } 153 | ] 154 | }, 155 | { 156 | "key": "/10.193.48.7", 157 | "value": [ 158 | { 159 | "key": "networkErrors", 160 | "value": 41 161 | } 162 | ] 163 | } 164 | ], 165 | "RpcPort": "50020", 166 | "HttpPort": null, 167 | "NamenodeAddresses": "{\"yh-shhd-cdh05\":\"BP-1654582017-10.193.40.10-1585051030504\",\"yh-shhd-cdh02\":\"BP-1654582017-10.193.40.10-1585051030504\"}", 168 | "VolumeInfo": "{\"/mnt/disk3/dfs/dn/current\":{\"usedSpace\":2123832329532,\"freeSpace\":5403410074903,\"reservedSpace\":10737418240,\"reservedSpaceForRBW\":263679405},\"/mnt/disk2/dfs/dn/current\":{\"usedSpace\":2141850634176,\"freeSpace\":5385258770496,\"reservedSpace\":10737418240,\"reservedSpaceForRBW\":396679168},\"/mnt/disk0/dfs/dn/current\":{\"usedSpace\":2185856311266,\"freeSpace\":5341126906910,\"reservedSpace\":10737418240,\"reservedSpaceForRBW\":522865664},\"/mnt/disk1/dfs/dn/current\":{\"usedSpace\":2132307337216,\"freeSpace\":5394803232768,\"reservedSpace\":10737418240,\"reservedSpaceForRBW\":395513856}}", 169 | "ClusterId": "cluster7", 170 | "DiskBalancerStatus": "" 171 | }, 172 | { 173 | "name": "Hadoop:service=DataNode,name=DataNodeActivity-yh-shhd-cdh01-50010", 174 | "modelerType": "DataNodeActivity-yh-shhd-cdh01-50010", 175 | "tag.SessionId": null, 176 | "tag.Context": "dfs", 177 | "tag.Hostname": "yh-shhd-cdh01", 178 | "BytesWritten": 3930869762859, 179 | "TotalWriteTime": 5390318, 180 | "BytesRead": 2592902229782, 181 | "TotalReadTime": 12910687, 
182 | "BlocksWritten": 1339342, 183 | "BlocksRead": 14341148, 184 | "BlocksReplicated": 138907, 185 | "BlocksRemoved": 315277, 186 | "BlocksVerified": 1241033, 187 | "BlockVerificationFailures": 0, 188 | "BlocksCached": 0, 189 | "BlocksUncached": 0, 190 | "ReadsFromLocalClient": 5040940, 191 | "ReadsFromRemoteClient": 9300208, 192 | "WritesFromLocalClient": 390614, 193 | "WritesFromRemoteClient": 814364, 194 | "BlocksGetLocalPathInfo": 0, 195 | "RemoteBytesRead": 1544540994665, 196 | "RemoteBytesWritten": 2445837244412, 197 | "RamDiskBlocksWrite": 0, 198 | "RamDiskBlocksWriteFallback": 0, 199 | "RamDiskBytesWrite": 0, 200 | "RamDiskBlocksReadHits": 0, 201 | "RamDiskBlocksEvicted": 0, 202 | "RamDiskBlocksEvictedWithoutRead": 0, 203 | "RamDiskBlocksEvictionWindowMsNumOps": 0, 204 | "RamDiskBlocksEvictionWindowMsAvgTime": 0, 205 | "RamDiskBlocksLazyPersisted": 0, 206 | "RamDiskBlocksDeletedBeforeLazyPersisted": 0, 207 | "RamDiskBytesLazyPersisted": 0, 208 | "RamDiskBlocksLazyPersistWindowMsNumOps": 0, 209 | "RamDiskBlocksLazyPersistWindowMsAvgTime": 0, 210 | "FsyncCount": 0, 211 | "VolumeFailures": 0, 212 | "DatanodeNetworkErrors": 46833, 213 | "ReadBlockOpNumOps": 14341148, 214 | "ReadBlockOpAvgTime": 2.7777777777777777, 215 | "WriteBlockOpNumOps": 1204978, 216 | "WriteBlockOpAvgTime": 40067, 217 | "BlockChecksumOpNumOps": 51251, 218 | "BlockChecksumOpAvgTime": 0.3333333333333333, 219 | "CopyBlockOpNumOps": 0, 220 | "CopyBlockOpAvgTime": 0, 221 | "ReplaceBlockOpNumOps": 0, 222 | "ReplaceBlockOpAvgTime": 0, 223 | "HeartbeatsNumOps": 1252832, 224 | "HeartbeatsAvgTime": 1.2727272727272727, 225 | "BlockReportsNumOps": 176, 226 | "BlockReportsAvgTime": 985, 227 | "IncrementalBlockReportsNumOps": 2551500, 228 | "IncrementalBlockReportsAvgTime": 1, 229 | "CacheReportsNumOps": 328655, 230 | "CacheReportsAvgTime": 0.3333333333333333, 231 | "PacketAckRoundTripTimeNanosNumOps": 32064492, 232 | "PacketAckRoundTripTimeNanosAvgTime": 389616, 233 | "FlushNanosNumOps": 70305363, 234 
| "FlushNanosAvgTime": 16205.285714285714, 235 | "FsyncNanosNumOps": 0, 236 | "FsyncNanosAvgTime": 0, 237 | "SendDataPacketBlockedOnNetworkNanosNumOps": 95274803, 238 | "SendDataPacketBlockedOnNetworkNanosAvgTime": 29429.13888888889, 239 | "SendDataPacketTransferNanosNumOps": 95274803, 240 | "SendDataPacketTransferNanosAvgTime": 353804.77777777775 241 | }, 242 | { 243 | "name": "Hadoop:service=DataNode,name=RpcDetailedActivityForPort50020", 244 | "modelerType": "RpcDetailedActivityForPort50020", 245 | "tag.port": "50020", 246 | "tag.Context": "rpcdetailed", 247 | "tag.Hostname": "yh-shhd-cdh01", 248 | "InitReplicaRecoveryNumOps": 2, 249 | "InitReplicaRecoveryAvgTime": 2, 250 | "GetReplicaVisibleLengthNumOps": 62882, 251 | "GetReplicaVisibleLengthAvgTime": 0, 252 | "UpdateReplicaUnderRecoveryNumOps": 2, 253 | "UpdateReplicaUnderRecoveryAvgTime": 3, 254 | "ReplicaNotFoundExceptionNumOps": 19, 255 | "ReplicaNotFoundExceptionAvgTime": 0 256 | }, 257 | { 258 | "name": "Hadoop:service=DataNode,name=RpcActivityForPort50020", 259 | "modelerType": "RpcActivityForPort50020", 260 | "tag.port": "50020", 261 | "tag.Context": "rpc", 262 | "tag.NumOpenConnectionsPerUser": "{}", 263 | "tag.Hostname": "yh-shhd-cdh01", 264 | "ReceivedBytes": 21810967, 265 | "SentBytes": 13644420, 266 | "RpcQueueTimeNumOps": 62905, 267 | "RpcQueueTimeAvgTime": 0, 268 | "RpcProcessingTimeNumOps": 62905, 269 | "RpcProcessingTimeAvgTime": 0, 270 | "RpcAuthenticationFailures": 0, 271 | "RpcAuthenticationSuccesses": 0, 272 | "RpcAuthorizationFailures": 0, 273 | "RpcAuthorizationSuccesses": 62903, 274 | "RpcSlowCalls": 0, 275 | "RpcClientBackoff": 0, 276 | "NumOpenConnections": 0, 277 | "CallQueueLength": 0, 278 | "NumDroppedConnections": 0 279 | }, 280 | { 281 | "name": "Hadoop:service=DataNode,name=MetricsSystem,sub=Stats", 282 | "modelerType": "MetricsSystem,sub=Stats", 283 | "tag.Context": "metricssystem", 284 | "tag.Hostname": "yh-shhd-cdh01", 285 | "NumActiveSources": 5, 286 | "NumAllSources": 5, 287 
| "NumActiveSinks": 0, 288 | "NumAllSinks": 0, 289 | "SnapshotNumOps": 0, 290 | "SnapshotAvgTime": 0, 291 | "PublishNumOps": 0, 292 | "PublishAvgTime": 0, 293 | "DroppedPubAll": 0 294 | }, 295 | { 296 | "name": "Hadoop:service=DataNode,name=MetricsSystem,sub=Control", 297 | "modelerType": "org.apache.hadoop.metrics2.impl.MetricsSystemImpl" 298 | } 299 | ] 300 | } 301 | -------------------------------------------------------------------------------- /examples/JobHistoryServer.json: -------------------------------------------------------------------------------- 1 | { 2 | "beans": [ 3 | { 4 | "name": "Hadoop:service=JobHistoryServer,name=UgiMetrics", 5 | "modelerType": "UgiMetrics", 6 | "tag.Context": "ugi", 7 | "tag.Hostname": "yh-shhd-cdh01", 8 | "LoginSuccessNumOps": 0, 9 | "LoginSuccessAvgTime": 0, 10 | "LoginFailureNumOps": 0, 11 | "LoginFailureAvgTime": 0, 12 | "GetGroupsNumOps": 0, 13 | "GetGroupsAvgTime": 0, 14 | "RenewalFailuresTotal": 0, 15 | "RenewalFailures": 0 16 | }, 17 | { 18 | "name": "Hadoop:service=JobHistoryServer,name=RpcActivityForPort10033", 19 | "modelerType": "RpcActivityForPort10033", 20 | "tag.port": "10033", 21 | "tag.Context": "rpc", 22 | "tag.NumOpenConnectionsPerUser": "{}", 23 | "tag.Hostname": "yh-shhd-cdh01", 24 | "ReceivedBytes": 0, 25 | "SentBytes": 0, 26 | "RpcQueueTimeNumOps": 0, 27 | "RpcQueueTimeAvgTime": 0, 28 | "RpcProcessingTimeNumOps": 0, 29 | "RpcProcessingTimeAvgTime": 0, 30 | "RpcAuthenticationFailures": 0, 31 | "RpcAuthenticationSuccesses": 0, 32 | "RpcAuthorizationFailures": 0, 33 | "RpcAuthorizationSuccesses": 0, 34 | "RpcSlowCalls": 0, 35 | "RpcClientBackoff": 0, 36 | "NumOpenConnections": 0, 37 | "CallQueueLength": 0, 38 | "NumDroppedConnections": 0 39 | }, 40 | { 41 | "name": "Hadoop:service=JobHistoryServer,name=RpcActivityForPort10020", 42 | "modelerType": "RpcActivityForPort10020", 43 | "tag.port": "10020", 44 | "tag.Context": "rpc", 45 | "tag.NumOpenConnectionsPerUser": "{}", 46 | "tag.Hostname": 
"yh-shhd-cdh01", 47 | "ReceivedBytes": 0, 48 | "SentBytes": 0, 49 | "RpcQueueTimeNumOps": 0, 50 | "RpcQueueTimeAvgTime": 0, 51 | "RpcProcessingTimeNumOps": 0, 52 | "RpcProcessingTimeAvgTime": 0, 53 | "RpcAuthenticationFailures": 0, 54 | "RpcAuthenticationSuccesses": 0, 55 | "RpcAuthorizationFailures": 0, 56 | "RpcAuthorizationSuccesses": 0, 57 | "RpcSlowCalls": 0, 58 | "RpcClientBackoff": 0, 59 | "NumOpenConnections": 0, 60 | "CallQueueLength": 0, 61 | "NumDroppedConnections": 0 62 | }, 63 | { 64 | "name": "Hadoop:service=JobHistoryServer,name=RpcDetailedActivityForPort10033", 65 | "modelerType": "RpcDetailedActivityForPort10033", 66 | "tag.port": "10033", 67 | "tag.Context": "rpcdetailed", 68 | "tag.Hostname": "yh-shhd-cdh01" 69 | }, 70 | { 71 | "name": "Hadoop:service=JobHistoryServer,name=MetricsSystem,sub=Control", 72 | "modelerType": "org.apache.hadoop.metrics2.impl.MetricsSystemImpl" 73 | }, 74 | { 75 | "name": "Hadoop:service=JobHistoryServer,name=RpcDetailedActivityForPort10020", 76 | "modelerType": "RpcDetailedActivityForPort10020", 77 | "tag.port": "10020", 78 | "tag.Context": "rpcdetailed", 79 | "tag.Hostname": "yh-shhd-cdh01" 80 | }, 81 | { 82 | "name": "Hadoop:service=JobHistoryServer,name=JvmMetrics", 83 | "modelerType": "JvmMetrics", 84 | "tag.Context": "jvm", 85 | "tag.ProcessName": "JobHistoryServer", 86 | "tag.SessionId": null, 87 | "tag.Hostname": "yh-shhd-cdh01", 88 | "MemNonHeapUsedM": 95.115036, 89 | "MemNonHeapCommittedM": 98.24609, 90 | "MemNonHeapMaxM": -1, 91 | "MemHeapUsedM": 321.61688, 92 | "MemHeapCommittedM": 989.875, 93 | "MemHeapMaxM": 989.875, 94 | "MemMaxM": 989.875, 95 | "GcCountParNew": 1447, 96 | "GcTimeMillisParNew": 25516, 97 | "GcCountConcurrentMarkSweep": 2, 98 | "GcTimeMillisConcurrentMarkSweep": 92, 99 | "GcCount": 1449, 100 | "GcTimeMillis": 25608, 101 | "GcNumWarnThresholdExceeded": 0, 102 | "GcNumInfoThresholdExceeded": 0, 103 | "GcTotalExtraSleepTime": 1005, 104 | "ThreadsNew": 0, 105 | "ThreadsRunnable": 12, 106 | 
"ThreadsBlocked": 0, 107 | "ThreadsWaiting": 10, 108 | "ThreadsTimedWaiting": 25, 109 | "ThreadsTerminated": 0, 110 | "LogFatal": 0, 111 | "LogError": 0, 112 | "LogWarn": 0, 113 | "LogInfo": 0 114 | }, 115 | { 116 | "name": "Hadoop:service=JobHistoryServer,name=MetricsSystem,sub=Stats", 117 | "modelerType": "MetricsSystem,sub=Stats", 118 | "tag.Context": "metricssystem", 119 | "tag.Hostname": "yh-shhd-cdh01", 120 | "NumActiveSources": 6, 121 | "NumAllSources": 6, 122 | "NumActiveSinks": 0, 123 | "NumAllSinks": 0, 124 | "SnapshotNumOps": 0, 125 | "SnapshotAvgTime": 0, 126 | "PublishNumOps": 0, 127 | "PublishAvgTime": 0, 128 | "DroppedPubAll": 0 129 | } 130 | ] 131 | } 132 | -------------------------------------------------------------------------------- /examples/NameNode.json: -------------------------------------------------------------------------------- 1 | { 2 | "beans": [ 3 | { 4 | "name": "Hadoop:service=NameNode,name=JvmMetrics", 5 | "modelerType": "JvmMetrics", 6 | "tag.Context": "jvm", 7 | "tag.ProcessName": "NameNode", 8 | "tag.SessionId": null, 9 | "tag.Hostname": "yh-shhd-cdh02", 10 | "MemNonHeapUsedM": 115.754105, 11 | "MemNonHeapCommittedM": 117.58984, 12 | "MemNonHeapMaxM": -1, 13 | "MemHeapUsedM": 3334.1143, 14 | "MemHeapCommittedM": 7918.9375, 15 | "MemHeapMaxM": 7918.9375, 16 | "MemMaxM": 7918.9375, 17 | "GcCountParNew": 1550, 18 | "GcTimeMillisParNew": 38883, 19 | "GcCountConcurrentMarkSweep": 2, 20 | "GcTimeMillisConcurrentMarkSweep": 485, 21 | "GcCount": 1552, 22 | "GcTimeMillis": 39368, 23 | "GcNumWarnThresholdExceeded": 0, 24 | "GcNumInfoThresholdExceeded": 0, 25 | "GcTotalExtraSleepTime": 2443, 26 | "ThreadsNew": 0, 27 | "ThreadsRunnable": 14, 28 | "ThreadsBlocked": 0, 29 | "ThreadsWaiting": 10, 30 | "ThreadsTimedWaiting": 88, 31 | "ThreadsTerminated": 0, 32 | "LogFatal": 0, 33 | "LogError": 2, 34 | "LogWarn": 18531, 35 | "LogInfo": 40434225 36 | }, 37 | { 38 | "name": "Hadoop:service=NameNode,name=NameNodeActivity", 39 | "modelerType": 
"NameNodeActivity", 40 | "tag.ProcessName": "NameNode", 41 | "tag.SessionId": null, 42 | "tag.Context": "dfs", 43 | "tag.Hostname": "yh-shhd-cdh02", 44 | "CreateFileOps": 8918834, 45 | "FilesCreated": 12224584, 46 | "FilesAppended": 190847, 47 | "GetBlockLocations": 26548167, 48 | "FilesRenamed": 902599, 49 | "GetListingOps": 37263567, 50 | "DeleteFileOps": 11114944, 51 | "FilesDeleted": 11118384, 52 | "FileInfoOps": 16343664, 53 | "AddBlockOps": 8854681, 54 | "GetAdditionalDatanodeOps": 0, 55 | "CreateSymlinkOps": 0, 56 | "GetLinkTargetOps": 0, 57 | "FilesInGetListingOps": 50064685, 58 | "AllowSnapshotOps": 0, 59 | "DisallowSnapshotOps": 0, 60 | "CreateSnapshotOps": 0, 61 | "DeleteSnapshotOps": 0, 62 | "RenameSnapshotOps": 0, 63 | "ListSnapshottableDirOps": 0, 64 | "SnapshotDiffReportOps": 0, 65 | "BlockReceivedAndDeletedOps": 9711629, 66 | "StorageBlockReportOps": 432, 67 | "BlockOpsQueued": 1, 68 | "BlockOpsBatched": 2684190, 69 | "TransactionsNumOps": 65283526, 70 | "TransactionsAvgTime": 0.00466804979253112, 71 | "SyncsNumOps": 11319494, 72 | "SyncsAvgTime": 19.490243902439026, 73 | "TransactionsBatchedInSync": 311541393, 74 | "BlockReportNumOps": 432, 75 | "BlockReportAvgTime": 232, 76 | "CacheReportNumOps": 245481, 77 | "CacheReportAvgTime": 0, 78 | "SafeModeTime": 89778, 79 | "FsImageLoadTime": 27558, 80 | "GetEditNumOps": 0, 81 | "GetEditAvgTime": 0, 82 | "GetImageNumOps": 0, 83 | "GetImageAvgTime": 0, 84 | "PutImageNumOps": 145, 85 | "PutImageAvgTime": 10951, 86 | "TotalFileOps": 110137303 87 | }, 88 | { 89 | "name": "Hadoop:service=NameNode,name=IPCLoggerChannel-10.193.40.2-8485", 90 | "modelerType": "IPCLoggerChannel-10.193.40.2-8485", 91 | "tag.Context": "dfs", 92 | "tag.IsOutOfSync": "false", 93 | "tag.Hostname": "yh-shhd-cdh02", 94 | "QueuedEditsSize": 225, 95 | "LagTimeMillis": 5, 96 | "CurrentLagTxns": 0 97 | }, 98 | { 99 | "name": "Hadoop:service=NameNode,name=NNTopUserOpCounts", 100 | "modelerType": "NNTopUserOpCounts", 101 | "tag.Context": 
"dfs", 102 | "tag.Hostname": "yh-shhd-cdh02", 103 | "tag.Context.1": "dfs", 104 | "tag.Hostname.1": "yh-shhd-cdh02", 105 | "tag.Context.2": "dfs", 106 | "tag.Hostname.2": "yh-shhd-cdh02" 107 | }, 108 | { 109 | "name": "Hadoop:service=NameNode,name=NameNodeStatus", 110 | "modelerType": "org.apache.hadoop.hdfs.server.namenode.NameNode", 111 | "SecurityEnabled": false, 112 | "NNRole": "NameNode", 113 | "HostAndPort": "yh-shhd-cdh02:8020", 114 | "LastHATransitionTime": 1586867210547, 115 | "State": "active" 116 | }, 117 | { 118 | "name": "Hadoop:service=NameNode,name=NameNodeInfo", 119 | "modelerType": "org.apache.hadoop.hdfs.server.namenode.FSNamesystem", 120 | "Total": 133166434799616, 121 | "UpgradeFinalized": true, 122 | "ClusterId": "cluster7", 123 | "BlockPoolId": "BP-1654582017-10.193.40.10-1585051030504", 124 | "Version": "2.6.0-cdh5.14.2, r5724a4ad7a27f7af31aa725694d3df09a68bb213", 125 | "TotalBlocks": 5358625, 126 | "Used": 25018182141559, 127 | "Free": 64666311712118, 128 | "Safemode": "", 129 | "NonDfsUsedSpace": 36758750009889, 130 | "PercentUsed": 18.787153, 131 | "BlockPoolUsedSpace": 25018182141559, 132 | "PercentBlockPoolUsed": 18.787153, 133 | "PercentRemaining": 48.56052, 134 | "CacheCapacity": 21474836480, 135 | "CacheUsed": 0, 136 | "TotalFiles": 6823097, 137 | "NumberOfMissingBlocks": 0, 138 | "NumberOfMissingBlocksWithReplicationFactorOne": 0, 139 | "LiveNodes": "{\"yh-shhd-cdh04\":{\"infoAddr\":\"10.193.40.2:50075\",\"infoSecureAddr\":\"10.193.40.2:0\",\"xferaddr\":\"10.193.40.2:50010\",\"lastContact\":1,\"usedSpace\":3402098630360,\"adminState\":\"In 
Service\",\"nonDfsUsedSpace\":12260165435688,\"capacity\":23248542650368,\"numBlocks\":1542688,\"version\":\"2.6.0-cdh5.14.2\",\"used\":3402098630360,\"remaining\":6412527156224,\"blockScheduled\":16,\"blockPoolUsed\":3402098630360,\"blockPoolUsedPercent\":14.633599,\"volfails\":0},\"yh-shhd-cdh03\":{\"infoAddr\":\"10.193.40.1:50075\",\"infoSecureAddr\":\"10.193.40.1:0\",\"xferaddr\":\"10.193.40.1:50010\",\"lastContact\":2,\"usedSpace\":3360642361157,\"adminState\":\"In Service\",\"nonDfsUsedSpace\":12214790928571,\"capacity\":23248542269440,\"numBlocks\":1546037,\"version\":\"2.6.0-cdh5.14.2\",\"used\":3360642361157,\"remaining\":6499349846314,\"blockScheduled\":2,\"blockPoolUsed\":3360642361157,\"blockPoolUsedPercent\":14.455282,\"volfails\":0},\"yh-shhd-cdh05\":{\"infoAddr\":\"10.193.40.3:50075\",\"infoSecureAddr\":\"10.193.40.3:0\",\"xferaddr\":\"10.193.40.3:50010\",\"lastContact\":2,\"usedSpace\":3251740135362,\"adminState\":\"In Service\",\"nonDfsUsedSpace\":12283793645630,\"capacity\":23248542269440,\"numBlocks\":1091277,\"version\":\"2.6.0-cdh5.14.2\",\"used\":3251740135362,\"remaining\":6539139766237,\"blockScheduled\":0,\"blockPoolUsed\":3251740135362,\"blockPoolUsedPercent\":13.986856,\"volfails\":0},\"yh-shhd-cdh02\":{\"infoAddr\":\"10.193.40.10:50075\",\"infoSecureAddr\":\"10.193.40.10:0\",\"xferaddr\":\"10.193.40.10:50010\",\"lastContact\":1,\"usedSpace\":8922113438288,\"adminState\":\"In Service\",\"nonDfsUsedSpace\":0,\"capacity\":31710403805184,\"numBlocks\":575953,\"version\":\"2.6.0-cdh5.14.2\",\"used\":8922113438288,\"remaining\":21187383799102,\"blockScheduled\":1,\"blockPoolUsed\":8922113438288,\"blockPoolUsedPercent\":28.136232,\"volfails\":0},\"yh-shhd-cdh01\":{\"infoAddr\":\"10.193.40.9:50075\",\"infoSecureAddr\":\"10.193.40.9:0\",\"xferaddr\":\"10.193.40.9:50010\",\"lastContact\":0,\"usedSpace\":6081587576392,\"adminState\":\"In 
Service\",\"nonDfsUsedSpace\":0,\"capacity\":31710403805184,\"numBlocks\":1179703,\"version\":\"2.6.0-cdh5.14.2\",\"used\":6081587576392,\"remaining\":24027911144241,\"blockScheduled\":0,\"blockPoolUsed\":6081587576392,\"blockPoolUsedPercent\":19.178526,\"volfails\":0}}", 140 | "SoftwareVersion": "2.6.0-cdh5.14.2", 141 | "DeadNodes": "{}", 142 | "DecomNodes": "{}", 143 | "EnteringMaintenanceNodes": "{}", 144 | "NameDirStatuses": "{\"active\":{\"/mnt/disk1/dfs/nn\":\"IMAGE_AND_EDITS\",\"/mnt/disk0/dfs/nn\":\"IMAGE_AND_EDITS\"},\"failed\":{}}", 145 | "NodeUsage": "{\"nodeUsage\":{\"min\":\"13.99%\",\"median\":\"14.63%\",\"max\":\"28.14%\",\"stdDev\":\"5.37%\"}}", 146 | "NameJournalStatus": "[{\"manager\":\"QJM to [10.193.40.1:8485, 10.193.40.2:8485, 10.193.40.3:8485]\",\"stream\":\"Writing segment beginning at txid 322850295. \\n10.193.40.1:8485 (Written txid 322860887), 10.193.40.2:8485 (Written txid 322860887), 10.193.40.3:8485 (Written txid 322860887)\",\"disabled\":\"false\",\"required\":\"true\"},{\"manager\":\"FileJournalManager(root=/mnt/disk0/dfs/nn)\",\"stream\":\"EditLogFileOutputStream(/mnt/disk0/dfs/nn/current/edits_inprogress_0000000000322850295)\",\"disabled\":\"false\",\"required\":\"false\"},{\"manager\":\"FileJournalManager(root=/mnt/disk1/dfs/nn)\",\"stream\":\"EditLogFileOutputStream(/mnt/disk1/dfs/nn/current/edits_inprogress_0000000000322850295)\",\"disabled\":\"false\",\"required\":\"false\"}]", 147 | "JournalTransactionInfo": "{\"MostRecentCheckpointTxId\":\"322439274\",\"LastAppliedOrWrittenTxId\":\"322860888\"}", 148 | "NNStarted": "Tue Apr 14 20:26:19 CST 2020", 149 | "NNStartedTimeInMillis": 1586867179855, 150 | "CompileInfo": "2018-03-27T20:40Z by jenkins from Unknown", 151 | "CorruptFiles": "[]", 152 | "NumberOfSnapshottableDirs": 0, 153 | "DistinctVersionCount": 1, 154 | "DistinctVersions": [ 155 | { 156 | "key": "2.6.0-cdh5.14.2", 157 | "value": 5 158 | } 159 | ], 160 | "RollingUpgradeStatus": null, 161 | "Threads": 112 162 | }, 163 | { 
164 | "name": "Hadoop:service=NameNode,name=StartupProgress", 165 | "modelerType": "StartupProgress", 166 | "tag.Hostname": "yh-shhd-cdh02", 167 | "ElapsedTime": 89320, 168 | "PercentComplete": 1, 169 | "LoadingFsImageCount": 6362453, 170 | "LoadingFsImageElapsedTime": 25441, 171 | "LoadingFsImageTotal": 6362453, 172 | "LoadingFsImagePercentComplete": 1, 173 | "LoadingEditsCount": 261261, 174 | "LoadingEditsElapsedTime": 282, 175 | "LoadingEditsTotal": 261261, 176 | "LoadingEditsPercentComplete": 1, 177 | "SavingCheckpointCount": 0, 178 | "SavingCheckpointElapsedTime": 0, 179 | "SavingCheckpointTotal": 0, 180 | "SavingCheckpointPercentComplete": 1, 181 | "SafeModeCount": 4966636, 182 | "SafeModeElapsedTime": 61632, 183 | "SafeModeTotal": 4966505, 184 | "SafeModePercentComplete": 1 185 | }, 186 | { 187 | "name": "Hadoop:service=NameNode,name=FSNamesystem", 188 | "modelerType": "FSNamesystem", 189 | "tag.Context": "dfs", 190 | "tag.HAState": "active", 191 | "tag.Hostname": "yh-shhd-cdh02", 192 | "BlocksTotal": 5358625, 193 | "MissingBlocks": 0, 194 | "MissingReplOneBlocks": 0, 195 | "ExpiredHeartbeats": 0, 196 | "TransactionsSinceLastCheckpoint": 421614, 197 | "TransactionsSinceLastLogRoll": 10594, 198 | "LastWrittenTransactionId": 322860888, 199 | "LastCheckpointTime": 1587392977976, 200 | "UnderReplicatedBlocks": 0, 201 | "CorruptBlocks": 0, 202 | "CapacityTotal": 133166434799616, 203 | "CapacityTotalGB": 124021, 204 | "CapacityUsed": 25018182141559, 205 | "CapacityUsedGB": 23300, 206 | "CapacityRemaining": 64666311712118, 207 | "CapacityRemainingGB": 60225, 208 | "CapacityUsedNonDFS": 36758750009889, 209 | "TotalLoad": 50, 210 | "SnapshottableDirectories": 0, 211 | "Snapshots": 0, 212 | "NumEncryptionZones": 0, 213 | "LockQueueLength": 0, 214 | "NumFilesUnderConstruction": 82, 215 | "NumActiveClients": 52, 216 | "FilesTotal": 6823097, 217 | "PendingReplicationBlocks": 0, 218 | "ScheduledReplicationBlocks": 0, 219 | "PendingDeletionBlocks": 0, 220 | "ExcessBlocks": 
0, 221 | "PostponedMisreplicatedBlocks": 0, 222 | "PendingDataNodeMessageCount": 0, 223 | "MillisSinceLastLoadedEdits": 0, 224 | "BlockCapacity": 16777216, 225 | "StaleDataNodes": 0, 226 | "TotalFiles": 6823097 227 | }, 228 | { 229 | "name": "Hadoop:service=NameNode,name=SnapshotInfo", 230 | "modelerType": "org.apache.hadoop.hdfs.server.namenode.snapshot.SnapshotManager", 231 | "SnapshottableDirectories": [], 232 | "Snapshots": [] 233 | }, 234 | { 235 | "name": "Hadoop:service=NameNode,name=IPCLoggerChannel-10.193.40.1-8485", 236 | "modelerType": "IPCLoggerChannel-10.193.40.1-8485", 237 | "tag.Context": "dfs", 238 | "tag.IsOutOfSync": "false", 239 | "tag.Hostname": "yh-shhd-cdh02", 240 | "QueuedEditsSize": 225, 241 | "LagTimeMillis": 5, 242 | "CurrentLagTxns": 0 243 | }, 244 | { 245 | "name": "Hadoop:service=NameNode,name=RpcDetailedActivityForPort8022", 246 | "modelerType": "RpcDetailedActivityForPort8022", 247 | "tag.port": "8022", 248 | "tag.Context": "rpcdetailed", 249 | "tag.Hostname": "yh-shhd-cdh02", 250 | "GetServiceStatusNumOps": 528952, 251 | "GetServiceStatusAvgTime": 0, 252 | "RollEditLogNumOps": 4384, 253 | "RollEditLogAvgTime": 281, 254 | "RegisterDatanodeNumOps": 5, 255 | "RegisterDatanodeAvgTime": 43.333333333333336, 256 | "SendHeartbeatNumOps": 881809, 257 | "SendHeartbeatAvgTime": 0, 258 | "CacheReportNumOps": 245481, 259 | "CacheReportAvgTime": 0, 260 | "ReportBadBlocksNumOps": 2, 261 | "ReportBadBlocksAvgTime": 0, 262 | "VersionRequestNumOps": 5, 263 | "VersionRequestAvgTime": 119.2, 264 | "MonitorHealthNumOps": 528951, 265 | "MonitorHealthAvgTime": 0.0625, 266 | "TransitionToActiveNumOps": 1, 267 | "TransitionToActiveAvgTime": 1995, 268 | "CommitBlockSynchronizationNumOps": 18, 269 | "CommitBlockSynchronizationAvgTime": 0.5, 270 | "BlockReportNumOps": 329, 271 | "BlockReportAvgTime": 232.33333333333334, 272 | "BlockReceivedAndDeletedNumOps": 9711629, 273 | "BlockReceivedAndDeletedAvgTime": 0.007246376811594203, 274 | "ErrorReportNumOps": 7, 275 
| "ErrorReportAvgTime": 0 276 | }, 277 | { 278 | "name": "Hadoop:service=NameNode,name=BlockStats", 279 | "modelerType": "org.apache.hadoop.hdfs.server.blockmanagement.BlockManager", 280 | "StorageTypeStats": [ 281 | { 282 | "key": "DISK", 283 | "value": { 284 | "blockPoolUsed": 25018182141559, 285 | "capacityRemaining": 64666311712118, 286 | "capacityTotal": 133166434799616, 287 | "capacityUsed": 25018182141559, 288 | "nodesInService": 5 289 | } 290 | } 291 | ] 292 | }, 293 | { 294 | "name": "Hadoop:service=NameNode,name=RpcActivityForPort8022", 295 | "modelerType": "RpcActivityForPort8022", 296 | "tag.port": "8022", 297 | "tag.Context": "rpc", 298 | "tag.NumOpenConnectionsPerUser": "{\"hdfs\":6}", 299 | "tag.Hostname": "yh-shhd-cdh02", 300 | "ReceivedBytes": 6932960359, 301 | "SentBytes": 656275389, 302 | "RpcQueueTimeNumOps": 11901573, 303 | "RpcQueueTimeAvgTime": 0.029411764705882353, 304 | "RpcProcessingTimeNumOps": 11901573, 305 | "RpcProcessingTimeAvgTime": 0.008823529411764706, 306 | "RpcAuthenticationFailures": 0, 307 | "RpcAuthenticationSuccesses": 0, 308 | "RpcAuthorizationFailures": 0, 309 | "RpcAuthorizationSuccesses": 4396, 310 | "RpcSlowCalls": 0, 311 | "RpcClientBackoff": 0, 312 | "NumOpenConnections": 6, 313 | "CallQueueLength": 0, 314 | "NumDroppedConnections": 0 315 | }, 316 | { 317 | "name": "Hadoop:service=NameNode,name=FSNamesystemState", 318 | "modelerType": "org.apache.hadoop.hdfs.server.namenode.FSNamesystem", 319 | "BlocksTotal": 5358625, 320 | "UnderReplicatedBlocks": 0, 321 | "CapacityTotal": 133166434799616, 322 | "CapacityUsed": 25018182141559, 323 | "CapacityRemaining": 64666311712118, 324 | "TotalLoad": 50, 325 | "SnapshotStats": "{\"SnapshottableDirectories\":0,\"Snapshots\":0}", 326 | "NumEncryptionZones": 0, 327 | "FsLockQueueLength": 0, 328 | "MaxObjects": 0, 329 | "FilesTotal": 6823097, 330 | "PendingReplicationBlocks": 0, 331 | "ScheduledReplicationBlocks": 0, 332 | "PendingDeletionBlocks": 0, 333 | "BlockDeletionStartTime": 
1586867179855, 334 | "FSState": "Operational", 335 | "NumLiveDataNodes": 5, 336 | "NumDeadDataNodes": 0, 337 | "NumDecomLiveDataNodes": 0, 338 | "NumDecomDeadDataNodes": 0, 339 | "VolumeFailuresTotal": 0, 340 | "EstimatedCapacityLostTotal": 0, 341 | "NumDecommissioningDataNodes": 0, 342 | "NumStaleDataNodes": 0, 343 | "NumStaleStorages": 0, 344 | "TopUserOpCounts": "{\"timestamp\":\"2020-04-20T23:25:36+0800\",\"windows\":[{\"ops\":[{\"opType\":\"listCachePools\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":1}],\"totalCount\":1},{\"opType\":\"listStatus\",\"topUsers\":[{\"user\":\"work\",\"count\":1349},{\"user\":\"mapred\",\"count\":4}],\"totalCount\":1353},{\"opType\":\"*\",\"topUsers\":[{\"user\":\"work\",\"count\":3727},{\"user\":\"hdfs\",\"count\":4},{\"user\":\"mapred\",\"count\":4},{\"user\":\"yarn\",\"count\":3}],\"totalCount\":3738},{\"opType\":\"delete\",\"topUsers\":[{\"user\":\"work\",\"count\":695},{\"user\":\"hdfs\",\"count\":1}],\"totalCount\":696},{\"opType\":\"setReplication\",\"topUsers\":[{\"user\":\"work\",\"count\":7}],\"totalCount\":7},{\"opType\":\"listCacheDirectives\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":1}],\"totalCount\":1},{\"opType\":\"getfileinfo\",\"topUsers\":[{\"user\":\"work\",\"count\":1123},{\"user\":\"yarn\",\"count\":3}],\"totalCount\":1126},{\"opType\":\"rename\",\"topUsers\":[{\"user\":\"work\",\"count\":64}],\"totalCount\":64},{\"opType\":\"mkdirs\",\"topUsers\":[{\"user\":\"work\",\"count\":182}],\"totalCount\":182},{\"opType\":\"create\",\"topUsers\":[{\"user\":\"work\",\"count\":407}],\"totalCount\":407},{\"opType\":\"setPermission\",\"topUsers\":[{\"user\":\"work\",\"count\":11}],\"totalCount\":11},{\"opType\":\"filesInGetListing\",\"topUsers\":[{\"user\":\"work\",\"count\":1643},{\"user\":\"mapred\",\"count\":12}],\"totalCount\":1655},{\"opType\":\"open\",\"topUsers\":[{\"user\":\"work\",\"count\":303},{\"user\":\"hdfs\",\"count\":1}],\"totalCount\":304},{\"opType\":\"append\",\"topUsers\":[{\"user\":\"work\",\"count\
":11}],\"totalCount\":11}],\"windowLenMs\":60000},{\"ops\":[{\"opType\":\"listCachePools\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":4}],\"totalCount\":4},{\"opType\":\"listStatus\",\"topUsers\":[{\"user\":\"work\",\"count\":6659},{\"user\":\"mapred\",\"count\":12},{\"user\":\"yarn\",\"count\":1}],\"totalCount\":6672},{\"opType\":\"*\",\"topUsers\":[{\"user\":\"work\",\"count\":9789},{\"user\":\"hdfs\",\"count\":28},{\"user\":\"mapred\",\"count\":12},{\"user\":\"yarn\",\"count\":6}],\"totalCount\":9835},{\"opType\":\"delete\",\"topUsers\":[{\"user\":\"work\",\"count\":2738},{\"user\":\"hdfs\",\"count\":5}],\"totalCount\":2743},{\"opType\":\"setReplication\",\"topUsers\":[{\"user\":\"work\",\"count\":10}],\"totalCount\":10},{\"opType\":\"listCacheDirectives\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":4}],\"totalCount\":4},{\"opType\":\"getfileinfo\",\"topUsers\":[{\"user\":\"work\",\"count\":3089},{\"user\":\"hdfs\",\"count\":5},{\"user\":\"yarn\",\"count\":3}],\"totalCount\":3097},{\"opType\":\"rename\",\"topUsers\":[{\"user\":\"work\",\"count\":123}],\"totalCount\":123},{\"opType\":\"mkdirs\",\"topUsers\":[{\"user\":\"work\",\"count\":832}],\"totalCount\":832},{\"opType\":\"create\",\"topUsers\":[{\"user\":\"work\",\"count\":2775},{\"user\":\"hdfs\",\"count\":5}],\"totalCount\":2780},{\"opType\":\"setPermission\",\"topUsers\":[{\"user\":\"work\",\"count\":15}],\"totalCount\":15},{\"opType\":\"filesInGetListing\",\"topUsers\":[{\"user\":\"work\",\"count\":7671},{\"user\":\"mapred\",\"count\":36},{\"user\":\"yarn\",\"count\":1}],\"totalCount\":7708},{\"opType\":\"open\",\"topUsers\":[{\"user\":\"work\",\"count\":3804},{\"user\":\"hdfs\",\"count\":5}],\"totalCount\":3809},{\"opType\":\"append\",\"topUsers\":[{\"user\":\"work\",\"count\":65}],\"totalCount\":65}],\"windowLenMs\":300000},{\"ops\":[{\"opType\":\"listCachePools\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":16}],\"totalCount\":16},{\"opType\":\"listStatus\",\"topUsers\":[{\"user\":\"work\",\"count\":56123}
,{\"user\":\"mapred\",\"count\":63},{\"user\":\"yarn\",\"count\":18}],\"totalCount\":56204},{\"opType\":\"*\",\"topUsers\":[{\"user\":\"work\",\"count\":136057},{\"user\":\"mapred\",\"count\":101},{\"user\":\"hdfs\",\"count\":74},{\"user\":\"yarn\",\"count\":40}],\"totalCount\":136272},{\"opType\":\"delete\",\"topUsers\":[{\"user\":\"work\",\"count\":22669},{\"user\":\"hdfs\",\"count\":14},{\"user\":\"mapred\",\"count\":7}],\"totalCount\":22690},{\"opType\":\"setReplication\",\"topUsers\":[{\"user\":\"work\",\"count\":40}],\"totalCount\":40},{\"opType\":\"listCacheDirectives\",\"topUsers\":[{\"user\":\"hdfs\",\"count\":16}],\"totalCount\":16},{\"opType\":\"getfileinfo\",\"topUsers\":[{\"user\":\"work\",\"count\":8090},{\"user\":\"yarn\",\"count\":28},{\"user\":\"mapred\",\"count\":28},{\"user\":\"hdfs\",\"count\":13}],\"totalCount\":8159},{\"opType\":\"rename options=2\",\"topUsers\":[{\"user\":\"mapred\",\"count\":14},{\"user\":\"work\",\"count\":8}],\"totalCount\":22},{\"opType\":\"rename\",\"topUsers\":[{\"user\":\"work\",\"count\":1285}],\"totalCount\":1285},{\"opType\":\"mkdirs\",\"topUsers\":[{\"user\":\"work\",\"count\":1541}],\"totalCount\":1541},{\"opType\":\"create\",\"topUsers\":[{\"user\":\"work\",\"count\":5148},{\"user\":\"hdfs\",\"count\":13}],\"totalCount\":5161},{\"opType\":\"setPermission\",\"topUsers\":[{\"user\":\"work\",\"count\":117}],\"totalCount\":117},{\"opType\":\"filesInGetListing\",\"topUsers\":[{\"user\":\"work\",\"count\":61532},{\"user\":\"mapred\",\"count\":203},{\"user\":\"yarn\",\"count\":32}],\"totalCount\":61767},{\"opType\":\"open\",\"topUsers\":[{\"user\":\"work\",\"count\":5192},{\"user\":\"mapred\",\"count\":58},{\"user\":\"hdfs\",\"count\":14}],\"totalCount\":5264},{\"opType\":\"append\",\"topUsers\":[{\"user\":\"work\",\"count\":406}],\"totalCount\":406},{\"opType\":\"contentSummary\",\"topUsers\":[{\"user\":\"work\",\"count\":4}],\"totalCount\":4}],\"windowLenMs\":1500000}]}", 345 | "NumInMaintenanceLiveDataNodes": 0, 346 
| "NumInMaintenanceDeadDataNodes": 0, 347 | "NumEnteringMaintenanceDataNodes": 0 348 | }, 349 | { 350 | "name": "Hadoop:service=NameNode,name=IPCLoggerChannel-10.193.40.3-8485", 351 | "modelerType": "IPCLoggerChannel-10.193.40.3-8485", 352 | "tag.Context": "dfs", 353 | "tag.IsOutOfSync": "false", 354 | "tag.Hostname": "yh-shhd-cdh02", 355 | "QueuedEditsSize": 0, 356 | "LagTimeMillis": 0, 357 | "CurrentLagTxns": 0 358 | }, 359 | { 360 | "name": "Hadoop:service=NameNode,name=RetryCache.NameNodeRetryCache", 361 | "modelerType": "RetryCache.NameNodeRetryCache", 362 | "tag.Context": "rpc", 363 | "tag.Hostname": "yh-shhd-cdh02", 364 | "CacheHit": 175, 365 | "CacheCleared": 0, 366 | "CacheUpdated": 21424040 367 | }, 368 | { 369 | "name": "Hadoop:service=NameNode,name=RpcActivityForPort8020", 370 | "modelerType": "RpcActivityForPort8020", 371 | "tag.port": "8020", 372 | "tag.Context": "rpc", 373 | "tag.NumOpenConnectionsPerUser": "{\"work\":56,\"mapred\":1}", 374 | "tag.Hostname": "yh-shhd-cdh02", 375 | "ReceivedBytes": 31160263601, 376 | "SentBytes": 45729185342, 377 | "RpcQueueTimeNumOps": 125221471, 378 | "RpcQueueTimeAvgTime": 0.02129011757229107, 379 | "RpcProcessingTimeNumOps": 125221471, 380 | "RpcProcessingTimeAvgTime": 0.06990784874483635, 381 | "RpcAuthenticationFailures": 0, 382 | "RpcAuthenticationSuccesses": 0, 383 | "RpcAuthorizationFailures": 0, 384 | "RpcAuthorizationSuccesses": 495234, 385 | "RpcSlowCalls": 0, 386 | "RpcClientBackoff": 0, 387 | "NumOpenConnections": 57, 388 | "CallQueueLength": 0, 389 | "NumDroppedConnections": 0 390 | }, 391 | { 392 | "name": "Hadoop:service=NameNode,name=UgiMetrics", 393 | "modelerType": "UgiMetrics", 394 | "tag.Context": "ugi", 395 | "tag.Hostname": "yh-shhd-cdh02", 396 | "LoginSuccessNumOps": 0, 397 | "LoginSuccessAvgTime": 0, 398 | "LoginFailureNumOps": 0, 399 | "LoginFailureAvgTime": 0, 400 | "GetGroupsNumOps": 6750, 401 | "GetGroupsAvgTime": 9, 402 | "RenewalFailuresTotal": 0, 403 | "RenewalFailures": 0 404 | }, 405 
| { 406 | "name": "Hadoop:service=NameNode,name=RpcDetailedActivityForPort8020", 407 | "modelerType": "RpcDetailedActivityForPort8020", 408 | "tag.port": "8020", 409 | "tag.Context": "rpcdetailed", 410 | "tag.Hostname": "yh-shhd-cdh02", 411 | "GetBlockLocationsNumOps": 26548005, 412 | "GetBlockLocationsAvgTime": 0.03896103896103896, 413 | "ListCachePoolsNumOps": 8532, 414 | "ListCachePoolsAvgTime": 0, 415 | "FileNotFoundExceptionNumOps": 54, 416 | "FileNotFoundExceptionAvgTime": 0, 417 | "GetDatanodeReportNumOps": 3, 418 | "GetDatanodeReportAvgTime": 0.6666666666666666, 419 | "DeleteNumOps": 11132547, 420 | "DeleteAvgTime": 0.10862619808306709, 421 | "GetServerDefaultsNumOps": 394929, 422 | "GetServerDefaultsAvgTime": 0, 423 | "GetFsStatsNumOps": 5, 424 | "GetFsStatsAvgTime": 0.8, 425 | "FsyncNumOps": 2762323, 426 | "FsyncAvgTime": 0.012658227848101266, 427 | "LeaseExpiredExceptionNumOps": 261, 428 | "LeaseExpiredExceptionAvgTime": 0, 429 | "AddBlockNumOps": 8854681, 430 | "AddBlockAvgTime": 0.1863799283154122, 431 | "ListEncryptionZonesNumOps": 147, 432 | "ListEncryptionZonesAvgTime": 0, 433 | "CreateNumOps": 8918834, 434 | "CreateAvgTime": 0.17204301075268819, 435 | "SetPermissionNumOps": 97884, 436 | "SetPermissionAvgTime": 0, 437 | "UpdateBlockForPipelineNumOps": 183263, 438 | "UpdateBlockForPipelineAvgTime": 0, 439 | "AlreadyBeingCreatedExceptionNumOps": 8292, 440 | "AlreadyBeingCreatedExceptionAvgTime": 0, 441 | "GetContentSummaryNumOps": 5737, 442 | "GetContentSummaryAvgTime": 0, 443 | "SetSafeModeNumOps": 2, 444 | "SetSafeModeAvgTime": 0, 445 | "GetListingNumOps": 35775306, 446 | "GetListingAvgTime": 0.028312570781426953, 447 | "SetReplicationNumOps": 105295, 448 | "SetReplicationAvgTime": 0.14285714285714285, 449 | "CheckAccessNumOps": 17761, 450 | "CheckAccessAvgTime": 0, 451 | "Rename2NumOps": 24359, 452 | "Rename2AvgTime": 0, 453 | "ListCacheDirectivesNumOps": 8532, 454 | "ListCacheDirectivesAvgTime": 1, 455 | "StandbyExceptionNumOps": 1, 456 | 
"StandbyExceptionAvgTime": 11, 457 | "RenewLeaseNumOps": 544048, 458 | "RenewLeaseAvgTime": 0, 459 | "RenameNumOps": 878240, 460 | "RenameAvgTime": 0.1875, 461 | "MkdirsNumOps": 2802980, 462 | "MkdirsAvgTime": 0.13725490196078433, 463 | "RecoverLeaseNumOps": 2, 464 | "RecoverLeaseAvgTime": 7, 465 | "UpdatePipelineNumOps": 183258, 466 | "UpdatePipelineAvgTime": 0, 467 | "SetTimesNumOps": 12, 468 | "SetTimesAvgTime": 0, 469 | "CompleteNumOps": 9430907, 470 | "CompleteAvgTime": 0.06666666666666667, 471 | "GetFileInfoNumOps": 16343651, 472 | "GetFileInfoAvgTime": 0.02544529262086514, 473 | "AppendNumOps": 190847, 474 | "AppendAvgTime": 0, 475 | "RetriableExceptionNumOps": 773, 476 | "RetriableExceptionAvgTime": 11.575704225352112 477 | }, 478 | { 479 | "name": "Hadoop:service=NameNode,name=MetricsSystem,sub=Stats", 480 | "modelerType": "MetricsSystem,sub=Stats", 481 | "tag.Context": "metricssystem", 482 | "tag.Hostname": "yh-shhd-cdh02", 483 | "NumActiveSources": 14, 484 | "NumAllSources": 14, 485 | "NumActiveSinks": 0, 486 | "NumAllSinks": 0, 487 | "SnapshotNumOps": 0, 488 | "SnapshotAvgTime": 0, 489 | "PublishNumOps": 0, 490 | "PublishAvgTime": 0, 491 | "DroppedPubAll": 0 492 | }, 493 | { 494 | "name": "Hadoop:service=NameNode,name=MetricsSystem,sub=Control", 495 | "modelerType": "org.apache.hadoop.metrics2.impl.MetricsSystemImpl" 496 | } 497 | ] 498 | } 499 | -------------------------------------------------------------------------------- /examples/NodeManager.json: -------------------------------------------------------------------------------- 1 | { 2 | "beans": [ 3 | { 4 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000029", 5 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000029" 6 | }, 7 | { 8 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000021", 9 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000021" 10 | }, 
11 | { 12 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000003", 13 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000003" 14 | }, 15 | { 16 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000050", 17 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000050" 18 | }, 19 | { 20 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e14_1586867223681_12177_01_000001", 21 | "modelerType": "ContainerResource_container_e14_1586867223681_12177_01_000001" 22 | }, 23 | { 24 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000031", 25 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000031" 26 | }, 27 | { 28 | "name": "Hadoop:service=NodeManager,name=NodeManagerMetrics", 29 | "modelerType": "NodeManagerMetrics", 30 | "tag.Context": "yarn", 31 | "tag.Hostname": "yh-shhd-cdh04", 32 | "ContainersLaunched": 19285, 33 | "ContainersCompleted": 16605, 34 | "ContainersFailed": 1032, 35 | "ContainersKilled": 1651, 36 | "ContainersIniting": 0, 37 | "ContainersRunning": 12, 38 | "AllocatedGB": -30, 39 | "AllocatedContainers": -3, 40 | "AvailableGB": 272, 41 | "AllocatedVCores": -3, 42 | "AvailableVCores": 83, 43 | "ContainerLaunchDurationNumOps": 19300, 44 | "ContainerLaunchDurationAvgTime": 11, 45 | "BadLocalDirs": 0, 46 | "BadLogDirs": 0, 47 | "GoodLocalDirsDiskUtilizationPerc": 72, 48 | "GoodLogDirsDiskUtilizationPerc": 71 49 | }, 50 | { 51 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000051", 52 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000051" 53 | }, 54 | { 55 | "name": "Hadoop:service=NodeManager,name=RpcActivityForPort8040", 56 | "modelerType": "RpcActivityForPort8040", 57 | "tag.port": "8040", 58 | "tag.Context": "rpc", 59 | "tag.NumOpenConnectionsPerUser": 
"{\"work\":1}", 60 | "tag.Hostname": "yh-shhd-cdh04", 61 | "ReceivedBytes": 15997743, 62 | "SentBytes": 5431407, 63 | "RpcQueueTimeNumOps": 65367, 64 | "RpcQueueTimeAvgTime": 0, 65 | "RpcProcessingTimeNumOps": 65367, 66 | "RpcProcessingTimeAvgTime": 0.6, 67 | "RpcAuthenticationFailures": 0, 68 | "RpcAuthenticationSuccesses": 0, 69 | "RpcAuthorizationFailures": 0, 70 | "RpcAuthorizationSuccesses": 16608, 71 | "RpcSlowCalls": 0, 72 | "RpcClientBackoff": 0, 73 | "NumOpenConnections": 1, 74 | "CallQueueLength": 0, 75 | "NumDroppedConnections": 0 76 | }, 77 | { 78 | "name": "Hadoop:service=NodeManager,name=RpcDetailedActivityForPort8040", 79 | "modelerType": "RpcDetailedActivityForPort8040", 80 | "tag.port": "8040", 81 | "tag.Context": "rpcdetailed", 82 | "tag.Hostname": "yh-shhd-cdh04", 83 | "HeartbeatNumOps": 65367, 84 | "HeartbeatAvgTime": 0.6 85 | }, 86 | { 87 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000030", 88 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000030" 89 | }, 90 | { 91 | "name": "Hadoop:service=NodeManager,name=MetricsSystem,sub=Control", 92 | "modelerType": "org.apache.hadoop.metrics2.impl.MetricsSystemImpl" 93 | }, 94 | { 95 | "name": "Hadoop:service=NodeManager,name=JvmMetrics", 96 | "modelerType": "JvmMetrics", 97 | "tag.Context": "jvm", 98 | "tag.ProcessName": "NodeManager", 99 | "tag.SessionId": null, 100 | "tag.Hostname": "yh-shhd-cdh04", 101 | "MemNonHeapUsedM": 114.80347, 102 | "MemNonHeapCommittedM": 117.72266, 103 | "MemNonHeapMaxM": -1, 104 | "MemHeapUsedM": 83.01589, 105 | "MemHeapCommittedM": 989.875, 106 | "MemHeapMaxM": 989.875, 107 | "MemMaxM": 989.875, 108 | "GcCountParNew": 131940, 109 | "GcTimeMillisParNew": 1707030, 110 | "GcCountConcurrentMarkSweep": 8, 111 | "GcTimeMillisConcurrentMarkSweep": 382, 112 | "GcCount": 131948, 113 | "GcTimeMillis": 1707412, 114 | "GcNumWarnThresholdExceeded": 0, 115 | "GcNumInfoThresholdExceeded": 0, 116 | 
"GcTotalExtraSleepTime": 22756, 117 | "ThreadsNew": 0, 118 | "ThreadsRunnable": 180, 119 | "ThreadsBlocked": 0, 120 | "ThreadsWaiting": 116, 121 | "ThreadsTimedWaiting": 62, 122 | "ThreadsTerminated": 0, 123 | "LogFatal": 0, 124 | "LogError": 0, 125 | "LogWarn": 0, 126 | "LogInfo": 0 127 | }, 128 | { 129 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000001", 130 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000001" 131 | }, 132 | { 133 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e14_1586867223681_12174_01_000012", 134 | "modelerType": "ContainerResource_container_e14_1586867223681_12174_01_000012" 135 | }, 136 | { 137 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000006", 138 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000006" 139 | }, 140 | { 141 | "name": "Hadoop:service=NodeManager,name=RpcDetailedActivityForPort8041", 142 | "modelerType": "RpcDetailedActivityForPort8041", 143 | "tag.port": "8041", 144 | "tag.Context": "rpcdetailed", 145 | "tag.Hostname": "yh-shhd-cdh04", 146 | "StartContainersNumOps": 19285, 147 | "StartContainersAvgTime": 0, 148 | "StopContainersNumOps": 6017, 149 | "StopContainersAvgTime": 0.3333333333333333 150 | }, 151 | { 152 | "name": "Hadoop:service=NodeManager,name=UgiMetrics", 153 | "modelerType": "UgiMetrics", 154 | "tag.Context": "ugi", 155 | "tag.Hostname": "yh-shhd-cdh04", 156 | "LoginSuccessNumOps": 0, 157 | "LoginSuccessAvgTime": 0, 158 | "LoginFailureNumOps": 0, 159 | "LoginFailureAvgTime": 0, 160 | "GetGroupsNumOps": 0, 161 | "GetGroupsAvgTime": 0, 162 | "RenewalFailuresTotal": 0, 163 | "RenewalFailures": 0 164 | }, 165 | { 166 | "name": "Hadoop:service=NodeManager,name=RpcActivityForPort8041", 167 | "modelerType": "RpcActivityForPort8041", 168 | "tag.port": "8041", 169 | "tag.Context": "rpc", 170 | "tag.NumOpenConnectionsPerUser": 
"{\"appattempt_1586867223681_12177_000001\":1}", 171 | "tag.Hostname": "yh-shhd-cdh04", 172 | "ReceivedBytes": 142931721, 173 | "SentBytes": 7815857, 174 | "RpcQueueTimeNumOps": 25302, 175 | "RpcQueueTimeAvgTime": 0, 176 | "RpcProcessingTimeNumOps": 25302, 177 | "RpcProcessingTimeAvgTime": 0, 178 | "RpcAuthenticationFailures": 0, 179 | "RpcAuthenticationSuccesses": 25302, 180 | "RpcAuthorizationFailures": 0, 181 | "RpcAuthorizationSuccesses": 25302, 182 | "RpcSlowCalls": 0, 183 | "RpcClientBackoff": 0, 184 | "NumOpenConnections": 1, 185 | "CallQueueLength": 0, 186 | "NumDroppedConnections": 0 187 | }, 188 | { 189 | "name": "Hadoop:service=NodeManager,name=ShuffleMetrics", 190 | "modelerType": "ShuffleMetrics", 191 | "tag.Context": "mapred", 192 | "tag.Hostname": "yh-shhd-cdh04", 193 | "ShuffleOutputBytes": 3738069132, 194 | "ShuffleOutputsFailed": 0, 195 | "ShuffleOutputsOK": 4272, 196 | "ShuffleConnections": 9183 197 | }, 198 | { 199 | "name": "Hadoop:service=NodeManager,name=MetricsSystem,sub=Stats", 200 | "modelerType": "MetricsSystem,sub=Stats", 201 | "tag.Context": "metricssystem", 202 | "tag.Hostname": "yh-shhd-cdh04", 203 | "NumActiveSources": 20, 204 | "NumAllSources": 20, 205 | "NumActiveSinks": 0, 206 | "NumAllSinks": 0, 207 | "SnapshotNumOps": 0, 208 | "SnapshotAvgTime": 0, 209 | "PublishNumOps": 0, 210 | "PublishAvgTime": 0, 211 | "DroppedPubAll": 0 212 | }, 213 | { 214 | "name": "Hadoop:service=NodeManager,name=ContainerResource_container_e12_1586466344459_0110_01_000005", 215 | "modelerType": "ContainerResource_container_e12_1586466344459_0110_01_000005" 216 | } 217 | ] 218 | } 219 | -------------------------------------------------------------------------------- /examples/ResouceManager.json: -------------------------------------------------------------------------------- 1 | { 2 | "beans": [ 3 | { 4 | "name": "Hadoop:service=ResourceManager,name=RMNMInfo", 5 | "modelerType": "org.apache.hadoop.yarn.server.resourcemanager.RMNMInfo", 6 | 
"LiveNodeManagers": "[{\"HostName\":\"yh-shhd-cdh05\",\"Rack\":\"/default\",\"State\":\"RUNNING\",\"NodeId\":\"yh-shhd-cdh05:8041\",\"NodeHTTPAddress\":\"yh-shhd-cdh05:8042\",\"LastHealthUpdate\":1587396542915,\"HealthReport\":\"\",\"NodeManagerVersion\":\"2.6.0-cdh5.14.2\",\"NumContainers\":5,\"UsedMemoryMB\":10240,\"AvailableMemoryMB\":237070},{\"HostName\":\"yh-shhd-cdh01\",\"Rack\":\"/default\",\"State\":\"RUNNING\",\"NodeId\":\"yh-shhd-cdh01:8041\",\"NodeHTTPAddress\":\"yh-shhd-cdh01:8042\",\"LastHealthUpdate\":1587396543467,\"HealthReport\":\"\",\"NodeManagerVersion\":\"2.6.0-cdh5.14.2\",\"NumContainers\":8,\"UsedMemoryMB\":46080,\"AvailableMemoryMB\":201230},{\"HostName\":\"yh-shhd-cdh03\",\"Rack\":\"/default\",\"State\":\"RUNNING\",\"NodeId\":\"yh-shhd-cdh03:8041\",\"NodeHTTPAddress\":\"yh-shhd-cdh03:8042\",\"LastHealthUpdate\":1587396542872,\"HealthReport\":\"\",\"NodeManagerVersion\":\"2.6.0-cdh5.14.2\",\"NumContainers\":8,\"UsedMemoryMB\":62976,\"AvailableMemoryMB\":184334},{\"HostName\":\"yh-shhd-cdh04\",\"Rack\":\"/default\",\"State\":\"RUNNING\",\"NodeId\":\"yh-shhd-cdh04:8041\",\"NodeHTTPAddress\":\"yh-shhd-cdh04:8042\",\"LastHealthUpdate\":1587396543105,\"HealthReport\":\"\",\"NodeManagerVersion\":\"2.6.0-cdh5.14.2\",\"NumContainers\":10,\"UsedMemoryMB\":20480,\"AvailableMemoryMB\":226830},{\"HostName\":\"yh-shhd-cdh02\",\"Rack\":\"/default\",\"State\":\"RUNNING\",\"NodeId\":\"yh-shhd-cdh02:8041\",\"NodeHTTPAddress\":\"yh-shhd-cdh02:8042\",\"LastHealthUpdate\":1587396543355,\"HealthReport\":\"\",\"NodeManagerVersion\":\"2.6.0-cdh5.14.2\",\"NumContainers\":0,\"UsedMemoryMB\":0,\"AvailableMemoryMB\":247310}]" 7 | }, 8 | { 9 | "name": "Hadoop:service=ResourceManager,name=RpcActivityForPort8033", 10 | "modelerType": "RpcActivityForPort8033", 11 | "tag.port": "8033", 12 | "tag.Context": "rpc", 13 | "tag.NumOpenConnectionsPerUser": "{}", 14 | "tag.Hostname": "yh-shhd-cdh04", 15 | "ReceivedBytes": 2697, 16 | "SentBytes": 536, 17 | "RpcQueueTimeNumOps": 15, 
18 | "RpcQueueTimeAvgTime": 0, 19 | "RpcProcessingTimeNumOps": 15, 20 | "RpcProcessingTimeAvgTime": 0, 21 | "RpcAuthenticationFailures": 0, 22 | "RpcAuthenticationSuccesses": 0, 23 | "RpcAuthorizationFailures": 0, 24 | "RpcAuthorizationSuccesses": 15, 25 | "RpcSlowCalls": 0, 26 | "RpcClientBackoff": 0, 27 | "NumOpenConnections": 0, 28 | "CallQueueLength": 0, 29 | "NumDroppedConnections": 0 30 | }, 31 | { 32 | "name": "Hadoop:service=ResourceManager,name=RpcActivityForPort8031", 33 | "modelerType": "RpcActivityForPort8031", 34 | "tag.port": "8031", 35 | "tag.Context": "rpc", 36 | "tag.NumOpenConnectionsPerUser": "{\"yarn\":5}", 37 | "tag.Hostname": "yh-shhd-cdh04", 38 | "ReceivedBytes": 1198590757, 39 | "SentBytes": 118872873, 40 | "RpcQueueTimeNumOps": 2703983, 41 | "RpcQueueTimeAvgTime": 0.044444444444444446, 42 | "RpcProcessingTimeNumOps": 2703983, 43 | "RpcProcessingTimeAvgTime": 0.044444444444444446, 44 | "RpcAuthenticationFailures": 0, 45 | "RpcAuthenticationSuccesses": 0, 46 | "RpcAuthorizationFailures": 0, 47 | "RpcAuthorizationSuccesses": 5, 48 | "RpcSlowCalls": 0, 49 | "RpcClientBackoff": 0, 50 | "NumOpenConnections": 5, 51 | "CallQueueLength": 0, 52 | "NumDroppedConnections": 0 53 | }, 54 | { 55 | "name": "Hadoop:service=ResourceManager,name=RpcDetailedActivityForPort8032", 56 | "modelerType": "RpcDetailedActivityForPort8032", 57 | "tag.port": "8032", 58 | "tag.Context": "rpcdetailed", 59 | "tag.Hostname": "yh-shhd-cdh04", 60 | "GetNewApplicationNumOps": 12184, 61 | "GetNewApplicationAvgTime": 0, 62 | "GetClusterMetricsNumOps": 2958, 63 | "GetClusterMetricsAvgTime": 0, 64 | "ForceKillApplicationNumOps": 1, 65 | "ForceKillApplicationAvgTime": 2, 66 | "GetQueueInfoNumOps": 1764, 67 | "GetQueueInfoAvgTime": 0, 68 | "ApplicationNotFoundExceptionNumOps": 10218, 69 | "ApplicationNotFoundExceptionAvgTime": 0.011627906976744186, 70 | "SubmitApplicationNumOps": 12183, 71 | "SubmitApplicationAvgTime": 0, 72 | "GetApplicationReportNumOps": 1359839, 73 | 
"GetApplicationReportAvgTime": 0 74 | }, 75 | { 76 | "name": "Hadoop:service=ResourceManager,name=MetricsSystem,sub=Control", 77 | "modelerType": "org.apache.hadoop.metrics2.impl.MetricsSystemImpl" 78 | }, 79 | { 80 | "name": "Hadoop:service=ResourceManager,name=UgiMetrics", 81 | "modelerType": "UgiMetrics", 82 | "tag.Context": "ugi", 83 | "tag.Hostname": "yh-shhd-cdh04", 84 | "LoginSuccessNumOps": 0, 85 | "LoginSuccessAvgTime": 0, 86 | "LoginFailureNumOps": 0, 87 | "LoginFailureAvgTime": 0, 88 | "GetGroupsNumOps": 0, 89 | "GetGroupsAvgTime": 0, 90 | "RenewalFailuresTotal": 0, 91 | "RenewalFailures": 0 92 | }, 93 | { 94 | "name": "Hadoop:service=ResourceManager,name=MetricsSystem,sub=Stats", 95 | "modelerType": "MetricsSystem,sub=Stats", 96 | "tag.Context": "metricssystem", 97 | "tag.Hostname": "yh-shhd-cdh04", 98 | "NumActiveSources": 20, 99 | "NumAllSources": 20, 100 | "NumActiveSinks": 0, 101 | "NumAllSinks": 0, 102 | "SnapshotNumOps": 0, 103 | "SnapshotAvgTime": 0, 104 | "PublishNumOps": 0, 105 | "PublishAvgTime": 0, 106 | "DroppedPubAll": 0 107 | }, 108 | { 109 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,q1=default", 110 | "modelerType": "QueueMetrics,q0=root,q1=default", 111 | "tag.Queue": "root.default", 112 | "tag.Context": "yarn", 113 | "tag.Hostname": "yh-shhd-cdh04", 114 | "running_0": 3, 115 | "running_60": 0, 116 | "running_300": 0, 117 | "running_1440": 3, 118 | "FairShareMB": 1236550, 119 | "FairShareVCores": 400, 120 | "SteadyFairShareMB": 618275, 121 | "SteadyFairShareVCores": 200, 122 | "MinShareMB": 0, 123 | "MinShareVCores": 0, 124 | "MaxShareMB": 2147483647, 125 | "MaxShareVCores": 2147483647, 126 | "MaxApps": 2147483647, 127 | "MaxAMShareMB": 0, 128 | "MaxAMShareVCores": 0, 129 | "AmResourceUsageMB": 0, 130 | "AmResourceUsageVCores": 0, 131 | "AppsSubmitted": 12190, 132 | "AppsRunning": 6, 133 | "AppsPending": 0, 134 | "AppsCompleted": 12184, 135 | "AppsKilled": 0, 136 | "AppsFailed": 0, 137 | "AllocatedMB": 139776, 138 
| "AllocatedVCores": 68, 139 | "AllocatedContainers": 31, 140 | "AggregateContainersAllocated": 104769, 141 | "AggregateContainersReleased": 104738, 142 | "AvailableMB": 0, 143 | "AggregateContainersPreempted": 0, 144 | "AvailableVCores": 0, 145 | "PendingMB": 0, 146 | "PendingVCores": 0, 147 | "PendingContainers": 0, 148 | "ReservedMB": 0, 149 | "ReservedVCores": 0, 150 | "ReservedContainers": 0, 151 | "ActiveUsers": 0, 152 | "ActiveApplications": 0, 153 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 154 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 155 | }, 156 | { 157 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,q1=default,user=work", 158 | "modelerType": "QueueMetrics,q0=root,q1=default,user=work", 159 | "tag.Queue": "root.default", 160 | "tag.User": "work", 161 | "tag.Context": "yarn", 162 | "tag.Hostname": "yh-shhd-cdh04", 163 | "running_0": 3, 164 | "running_60": 0, 165 | "running_300": 0, 166 | "running_1440": 3, 167 | "AppsSubmitted": 12190, 168 | "AppsRunning": 6, 169 | "AppsPending": 0, 170 | "AppsCompleted": 12184, 171 | "AppsKilled": 0, 172 | "AppsFailed": 0, 173 | "AllocatedMB": 139776, 174 | "AllocatedVCores": 68, 175 | "AllocatedContainers": 31, 176 | "AggregateContainersAllocated": 104769, 177 | "AggregateContainersReleased": 104738, 178 | "AvailableMB": 0, 179 | "AggregateContainersPreempted": 0, 180 | "AvailableVCores": 0, 181 | "PendingMB": 0, 182 | "PendingVCores": 0, 183 | "PendingContainers": 0, 184 | "ReservedMB": 0, 185 | "ReservedVCores": 0, 186 | "ReservedContainers": 0, 187 | "ActiveUsers": 0, 188 | "ActiveApplications": 0, 189 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 190 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 191 | }, 192 | { 193 | "name": "Hadoop:service=ResourceManager,name=JvmMetrics", 194 | "modelerType": "JvmMetrics", 195 | "tag.Context": "jvm", 196 | "tag.ProcessName": "ResourceManager", 197 | "tag.SessionId": null, 198 | "tag.Hostname": "yh-shhd-cdh04", 199 | 
"MemNonHeapUsedM": 112.60955, 200 | "MemNonHeapCommittedM": 115.55078, 201 | "MemNonHeapMaxM": -1, 202 | "MemHeapUsedM": 556.7262, 203 | "MemHeapCommittedM": 989.875, 204 | "MemHeapMaxM": 989.875, 205 | "MemMaxM": 989.875, 206 | "GcCountParNew": 793, 207 | "GcTimeMillisParNew": 5257, 208 | "GcCountConcurrentMarkSweep": 4, 209 | "GcTimeMillisConcurrentMarkSweep": 167, 210 | "GcCount": 797, 211 | "GcTimeMillis": 5424, 212 | "GcNumWarnThresholdExceeded": 0, 213 | "GcNumInfoThresholdExceeded": 0, 214 | "GcTotalExtraSleepTime": 1606, 215 | "ThreadsNew": 0, 216 | "ThreadsRunnable": 19, 217 | "ThreadsBlocked": 0, 218 | "ThreadsWaiting": 62, 219 | "ThreadsTimedWaiting": 179, 220 | "ThreadsTerminated": 0, 221 | "LogFatal": 0, 222 | "LogError": 0, 223 | "LogWarn": 0, 224 | "LogInfo": 0 225 | }, 226 | { 227 | "name": "Hadoop:service=ResourceManager,name=RpcDetailedActivityForPort8030", 228 | "modelerType": "RpcDetailedActivityForPort8030", 229 | "tag.port": "8030", 230 | "tag.Context": "rpcdetailed", 231 | "tag.Hostname": "yh-shhd-cdh04", 232 | "ApplicationMasterNotRegisteredExceptionNumOps": 7, 233 | "ApplicationMasterNotRegisteredExceptionAvgTime": 1.8571428571428572, 234 | "RegisterApplicationMasterNumOps": 12210, 235 | "RegisterApplicationMasterAvgTime": 0, 236 | "AllocateNumOps": 850467, 237 | "AllocateAvgTime": 0.125, 238 | "FinishApplicationMasterNumOps": 24382, 239 | "FinishApplicationMasterAvgTime": 0 240 | }, 241 | { 242 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,q1=default,q2=work,user=work", 243 | "modelerType": "QueueMetrics,q0=root,q1=default,q2=work,user=work", 244 | "tag.Queue": "root.default.work", 245 | "tag.User": "work", 246 | "tag.Context": "yarn", 247 | "tag.Hostname": "yh-shhd-cdh04", 248 | "running_0": 3, 249 | "running_60": 0, 250 | "running_300": 0, 251 | "running_1440": 3, 252 | "AppsSubmitted": 12190, 253 | "AppsRunning": 6, 254 | "AppsPending": 0, 255 | "AppsCompleted": 12184, 256 | "AppsKilled": 0, 257 | "AppsFailed": 0, 
258 | "AllocatedMB": 139776, 259 | "AllocatedVCores": 68, 260 | "AllocatedContainers": 31, 261 | "AggregateContainersAllocated": 104769, 262 | "AggregateContainersReleased": 104738, 263 | "AvailableMB": 0, 264 | "AggregateContainersPreempted": 0, 265 | "AvailableVCores": 0, 266 | "PendingMB": 0, 267 | "PendingVCores": 0, 268 | "PendingContainers": 0, 269 | "ReservedMB": 0, 270 | "ReservedVCores": 0, 271 | "ReservedContainers": 0, 272 | "ActiveUsers": 0, 273 | "ActiveApplications": 0, 274 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 275 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 276 | }, 277 | { 278 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,user=dr.who", 279 | "modelerType": "QueueMetrics,q0=root,user=dr.who", 280 | "tag.Queue": "root", 281 | "tag.User": "dr.who", 282 | "tag.Context": "yarn", 283 | "tag.Hostname": "yh-shhd-cdh04", 284 | "running_0": 0, 285 | "running_60": 0, 286 | "running_300": 0, 287 | "running_1440": 0, 288 | "AppsSubmitted": 0, 289 | "AppsRunning": 0, 290 | "AppsPending": 0, 291 | "AppsCompleted": 0, 292 | "AppsKilled": 0, 293 | "AppsFailed": 0, 294 | "AllocatedMB": 0, 295 | "AllocatedVCores": 0, 296 | "AllocatedContainers": 0, 297 | "AggregateContainersAllocated": 0, 298 | "AggregateContainersReleased": 0, 299 | "AvailableMB": 0, 300 | "AggregateContainersPreempted": 0, 301 | "AvailableVCores": 0, 302 | "PendingMB": 0, 303 | "PendingVCores": 0, 304 | "PendingContainers": 0, 305 | "ReservedMB": 0, 306 | "ReservedVCores": 0, 307 | "ReservedContainers": 0, 308 | "ActiveUsers": 0, 309 | "ActiveApplications": 0, 310 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 311 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 312 | }, 313 | { 314 | "name": "Hadoop:service=ResourceManager,name=RpcActivityForPort8030", 315 | "modelerType": "RpcActivityForPort8030", 316 | "tag.port": "8030", 317 | "tag.Context": "rpc", 318 | "tag.NumOpenConnectionsPerUser": 
"{\"appattempt_1586867223681_12172_000001\":1,\"appattempt_1586867223681_12175_000001\":1,\"appattempt_1586867223681_12184_000001\":1,\"appattempt_1586867223681_0054_000001\":1,\"appattempt_1586466344459_0110_000001\":1,\"appattempt_1586867223681_0055_000001\":1}", 319 | "tag.Hostname": "yh-shhd-cdh04", 320 | "ReceivedBytes": 109416252, 321 | "SentBytes": 82031221, 322 | "RpcQueueTimeNumOps": 887066, 323 | "RpcQueueTimeAvgTime": 0, 324 | "RpcProcessingTimeNumOps": 887066, 325 | "RpcProcessingTimeAvgTime": 0.1111111111111111, 326 | "RpcAuthenticationFailures": 0, 327 | "RpcAuthenticationSuccesses": 12210, 328 | "RpcAuthorizationFailures": 0, 329 | "RpcAuthorizationSuccesses": 12210, 330 | "RpcSlowCalls": 0, 331 | "RpcClientBackoff": 0, 332 | "NumOpenConnections": 6, 333 | "CallQueueLength": 0, 334 | "NumDroppedConnections": 0 335 | }, 336 | { 337 | "name": "Hadoop:service=ResourceManager,name=FSOpDurations", 338 | "modelerType": "FSOpDurations", 339 | "tag.FSOpDurations": "FSOpDurations", 340 | "tag.Context": "fairscheduler-op-durations", 341 | "tag.Hostname": "yh-shhd-cdh04", 342 | "ContinuousSchedulingRunNumOps": 0, 343 | "ContinuousSchedulingRunAvgTime": 0, 344 | "ContinuousSchedulingRunStdevTime": 0, 345 | "ContinuousSchedulingRunIMinTime": 3.4028234663852886e38, 346 | "ContinuousSchedulingRunIMaxTime": 1.401298464324817e-45, 347 | "ContinuousSchedulingRunMinTime": 3.4028234663852886e38, 348 | "ContinuousSchedulingRunMaxTime": 1.401298464324817e-45, 349 | "ContinuousSchedulingRunINumOps": 0, 350 | "NodeUpdateCallNumOps": 2703973, 351 | "NodeUpdateCallAvgTime": 0.08888888888888889, 352 | "NodeUpdateCallStdevTime": 0.35816592283860327, 353 | "NodeUpdateCallIMinTime": 0, 354 | "NodeUpdateCallIMaxTime": 2, 355 | "NodeUpdateCallMinTime": 0, 356 | "NodeUpdateCallMaxTime": 467, 357 | "NodeUpdateCallINumOps": 45, 358 | "UpdateThreadRunNumOps": 1058477, 359 | "UpdateThreadRunAvgTime": 0, 360 | "UpdateThreadRunStdevTime": 0, 361 | "UpdateThreadRunIMinTime": 0, 362 | 
"UpdateThreadRunIMaxTime": 1.401298464324817e-45, 363 | "UpdateThreadRunMinTime": 0, 364 | "UpdateThreadRunMaxTime": 489, 365 | "UpdateThreadRunINumOps": 17, 366 | "UpdateCallNumOps": 1058477, 367 | "UpdateCallAvgTime": 0, 368 | "UpdateCallStdevTime": 0, 369 | "UpdateCallIMinTime": 0, 370 | "UpdateCallIMaxTime": 1.401298464324817e-45, 371 | "UpdateCallMinTime": 0, 372 | "UpdateCallMaxTime": 489, 373 | "UpdateCallINumOps": 17, 374 | "PreemptCallNumOps": 0, 375 | "PreemptCallAvgTime": 0, 376 | "PreemptCallStdevTime": 0, 377 | "PreemptCallIMinTime": 3.4028234663852886e38, 378 | "PreemptCallIMaxTime": 1.401298464324817e-45, 379 | "PreemptCallMinTime": 3.4028234663852886e38, 380 | "PreemptCallMaxTime": 1.401298464324817e-45, 381 | "PreemptCallINumOps": 0 382 | }, 383 | { 384 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,q1=users", 385 | "modelerType": "QueueMetrics,q0=root,q1=users", 386 | "tag.Queue": "root.users", 387 | "tag.Context": "yarn", 388 | "tag.Hostname": "yh-shhd-cdh04", 389 | "running_0": 0, 390 | "running_60": 0, 391 | "running_300": 0, 392 | "running_1440": 0, 393 | "FairShareMB": 0, 394 | "FairShareVCores": 0, 395 | "SteadyFairShareMB": 618275, 396 | "SteadyFairShareVCores": 200, 397 | "MinShareMB": 0, 398 | "MinShareVCores": 0, 399 | "MaxShareMB": 2147483647, 400 | "MaxShareVCores": 2147483647, 401 | "MaxApps": 2147483647, 402 | "MaxAMShareMB": 0, 403 | "MaxAMShareVCores": 0, 404 | "AmResourceUsageMB": 0, 405 | "AmResourceUsageVCores": 0, 406 | "AppsSubmitted": 0, 407 | "AppsRunning": 0, 408 | "AppsPending": 0, 409 | "AppsCompleted": 0, 410 | "AppsKilled": 0, 411 | "AppsFailed": 0, 412 | "AllocatedMB": 0, 413 | "AllocatedVCores": 0, 414 | "AllocatedContainers": 0, 415 | "AggregateContainersAllocated": 0, 416 | "AggregateContainersReleased": 0, 417 | "AvailableMB": 0, 418 | "AggregateContainersPreempted": 0, 419 | "AvailableVCores": 0, 420 | "PendingMB": 0, 421 | "PendingVCores": 0, 422 | "PendingContainers": 0, 423 | "ReservedMB": 
0, 424 | "ReservedVCores": 0, 425 | "ReservedContainers": 0, 426 | "ActiveUsers": 0, 427 | "ActiveApplications": 0, 428 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 429 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 430 | }, 431 | { 432 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,q1=default,q2=work", 433 | "modelerType": "QueueMetrics,q0=root,q1=default,q2=work", 434 | "tag.Queue": "root.default.work", 435 | "tag.Context": "yarn", 436 | "tag.Hostname": "yh-shhd-cdh04", 437 | "running_0": 3, 438 | "running_60": 0, 439 | "running_300": 0, 440 | "running_1440": 3, 441 | "FairShareMB": 1236550, 442 | "FairShareVCores": 400, 443 | "SteadyFairShareMB": 618275, 444 | "SteadyFairShareVCores": 200, 445 | "MinShareMB": 0, 446 | "MinShareVCores": 0, 447 | "MaxShareMB": 2147483647, 448 | "MaxShareVCores": 2147483647, 449 | "MaxApps": 2147483647, 450 | "MaxAMShareMB": 618275, 451 | "MaxAMShareVCores": 200, 452 | "AmResourceUsageMB": 20480, 453 | "AmResourceUsageVCores": 7, 454 | "AppsSubmitted": 12190, 455 | "AppsRunning": 6, 456 | "AppsPending": 0, 457 | "AppsCompleted": 12184, 458 | "AppsKilled": 0, 459 | "AppsFailed": 0, 460 | "AllocatedMB": 139776, 461 | "AllocatedVCores": 68, 462 | "AllocatedContainers": 31, 463 | "AggregateContainersAllocated": 104769, 464 | "AggregateContainersReleased": 104738, 465 | "AvailableMB": 0, 466 | "AggregateContainersPreempted": 0, 467 | "AvailableVCores": 0, 468 | "PendingMB": 0, 469 | "PendingVCores": 0, 470 | "PendingContainers": 0, 471 | "ReservedMB": 0, 472 | "ReservedVCores": 0, 473 | "ReservedContainers": 0, 474 | "ActiveUsers": 0, 475 | "ActiveApplications": 0, 476 | "AppAttemptFirstContainerAllocationDelayNumOps": 12205, 477 | "AppAttemptFirstContainerAllocationDelayAvgTime": 834 478 | }, 479 | { 480 | "name": "Hadoop:service=ResourceManager,name=RpcActivityForPort8032", 481 | "modelerType": "RpcActivityForPort8032", 482 | "tag.port": "8032", 483 | "tag.Context": "rpc", 484 | 
"tag.NumOpenConnectionsPerUser": "{\"work\":5,\"yarn\":1}", 485 | "tag.Hostname": "yh-shhd-cdh04", 486 | "ReceivedBytes": 277244591, 487 | "SentBytes": 434732996, 488 | "RpcQueueTimeNumOps": 1399147, 489 | "RpcQueueTimeAvgTime": 0.09523809523809523, 490 | "RpcProcessingTimeNumOps": 1399147, 491 | "RpcProcessingTimeAvgTime": 0, 492 | "RpcAuthenticationFailures": 0, 493 | "RpcAuthenticationSuccesses": 0, 494 | "RpcAuthorizationFailures": 0, 495 | "RpcAuthorizationSuccesses": 124523, 496 | "RpcSlowCalls": 0, 497 | "RpcClientBackoff": 0, 498 | "NumOpenConnections": 6, 499 | "CallQueueLength": 0, 500 | "NumDroppedConnections": 0 501 | }, 502 | { 503 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root", 504 | "modelerType": "QueueMetrics,q0=root", 505 | "tag.Queue": "root", 506 | "tag.Context": "yarn", 507 | "tag.Hostname": "yh-shhd-cdh04", 508 | "running_0": 3, 509 | "running_60": 0, 510 | "running_300": 0, 511 | "running_1440": 3, 512 | "FairShareMB": 1236550, 513 | "FairShareVCores": 400, 514 | "SteadyFairShareMB": 1236550, 515 | "SteadyFairShareVCores": 400, 516 | "MinShareMB": 0, 517 | "MinShareVCores": 0, 518 | "MaxShareMB": 2147483647, 519 | "MaxShareVCores": 2147483647, 520 | "MaxApps": 2147483647, 521 | "MaxAMShareMB": 0, 522 | "MaxAMShareVCores": 0, 523 | "AmResourceUsageMB": 0, 524 | "AmResourceUsageVCores": 0, 525 | "AppsSubmitted": 12190, 526 | "AppsRunning": 6, 527 | "AppsPending": 0, 528 | "AppsCompleted": 12184, 529 | "AppsKilled": 0, 530 | "AppsFailed": 0, 531 | "AllocatedMB": 139776, 532 | "AllocatedVCores": 68, 533 | "AllocatedContainers": 31, 534 | "AggregateContainersAllocated": 104769, 535 | "AggregateContainersReleased": 104738, 536 | "AvailableMB": 1096774, 537 | "AggregateContainersPreempted": 0, 538 | "AvailableVCores": 332, 539 | "PendingMB": 0, 540 | "PendingVCores": 0, 541 | "PendingContainers": 0, 542 | "ReservedMB": 0, 543 | "ReservedVCores": 0, 544 | "ReservedContainers": 0, 545 | "ActiveUsers": 0, 546 | 
"ActiveApplications": 0, 547 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 548 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 549 | }, 550 | { 551 | "name": "Hadoop:service=ResourceManager,name=RpcDetailedActivityForPort8031", 552 | "modelerType": "RpcDetailedActivityForPort8031", 553 | "tag.port": "8031", 554 | "tag.Context": "rpcdetailed", 555 | "tag.Hostname": "yh-shhd-cdh04", 556 | "NodeHeartbeatNumOps": 2703978, 557 | "NodeHeartbeatAvgTime": 0.044444444444444446, 558 | "RegisterNodeManagerNumOps": 5, 559 | "RegisterNodeManagerAvgTime": 29 560 | }, 561 | { 562 | "name": "Hadoop:service=ResourceManager,name=ClusterMetrics", 563 | "modelerType": "ClusterMetrics", 564 | "tag.ClusterMetrics": "ResourceManager", 565 | "tag.Context": "yarn", 566 | "tag.Hostname": "yh-shhd-cdh04", 567 | "NumActiveNMs": 5, 568 | "NumDecommissioningNMs": 0, 569 | "NumDecommissionedNMs": 0, 570 | "NumLostNMs": 0, 571 | "NumUnhealthyNMs": 0, 572 | "NumRebootedNMs": 0, 573 | "AMLaunchDelayNumOps": 12203, 574 | "AMLaunchDelayAvgTime": 5, 575 | "AMRegisterDelayNumOps": 12210, 576 | "AMRegisterDelayAvgTime": 3199 577 | }, 578 | { 579 | "name": "Hadoop:service=ResourceManager,name=RpcDetailedActivityForPort8033", 580 | "modelerType": "RpcDetailedActivityForPort8033", 581 | "tag.port": "8033", 582 | "tag.Context": "rpcdetailed", 583 | "tag.Hostname": "yh-shhd-cdh04", 584 | "GetServiceStatusNumOps": 14, 585 | "GetServiceStatusAvgTime": 0, 586 | "TransitionToActiveNumOps": 0, 587 | "TransitionToActiveAvgTime": 0, 588 | "TransitionToStandbyNumOps": 0, 589 | "TransitionToStandbyAvgTime": 0, 590 | "MonitorHealthNumOps": 1, 591 | "MonitorHealthAvgTime": 0 592 | }, 593 | { 594 | "name": "Hadoop:service=ResourceManager,name=QueueMetrics,q0=root,user=work", 595 | "modelerType": "QueueMetrics,q0=root,user=work", 596 | "tag.Queue": "root", 597 | "tag.User": "work", 598 | "tag.Context": "yarn", 599 | "tag.Hostname": "yh-shhd-cdh04", 600 | "running_0": 3, 601 | "running_60": 0, 602 | 
"running_300": 0, 603 | "running_1440": 3, 604 | "AppsSubmitted": 12190, 605 | "AppsRunning": 6, 606 | "AppsPending": 0, 607 | "AppsCompleted": 12184, 608 | "AppsKilled": 0, 609 | "AppsFailed": 0, 610 | "AllocatedMB": 139776, 611 | "AllocatedVCores": 68, 612 | "AllocatedContainers": 31, 613 | "AggregateContainersAllocated": 104769, 614 | "AggregateContainersReleased": 104738, 615 | "AvailableMB": 0, 616 | "AggregateContainersPreempted": 0, 617 | "AvailableVCores": 0, 618 | "PendingMB": 0, 619 | "PendingVCores": 0, 620 | "PendingContainers": 0, 621 | "ReservedMB": 0, 622 | "ReservedVCores": 0, 623 | "ReservedContainers": 0, 624 | "ActiveUsers": 0, 625 | "ActiveApplications": 0, 626 | "AppAttemptFirstContainerAllocationDelayNumOps": 0, 627 | "AppAttemptFirstContainerAllocationDelayAvgTime": 0 628 | } 629 | ] 630 | } 631 | -------------------------------------------------------------------------------- /hadoop_jmx_exporter.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import time 5 | from prometheus_client import start_http_server 6 | from prometheus_client.core import REGISTRY 7 | 8 | import utils 9 | from utils import get_module_logger 10 | from hdfs_namenode import NameNodeMetricCollector 11 | from hdfs_datanode import DataNodeMetricCollector 12 | from hdfs_journalnode import JournalNodeMetricCollector 13 | from yarn_resourcemanager import ResourceManagerMetricCollector 14 | from yarn_nodemanager import NodeManagerMetricCollector 15 | 16 | logger = get_module_logger(__name__) 17 | 18 | 19 | def register_prometheus(cluster, args): 20 | if args.nns is not None and len(args.nns) > 0: 21 | nnc = NameNodeMetricCollector(cluster, args.nns) 22 | nnc.collect() 23 | REGISTRY.register(nnc) 24 | REGISTRY.register(DataNodeMetricCollector(cluster, nnc)) 25 | if args.rms is not None and len(args.rms) > 0: 26 | rmc = ResourceManagerMetricCollector(cluster, args.rms, args.queue) 
27 | rmc.collect() 28 | REGISTRY.register(rmc) 29 | REGISTRY.register(NodeManagerMetricCollector(cluster, rmc)) 30 | if args.jns is not None and len(args.jns) > 0: 31 | REGISTRY.register(JournalNodeMetricCollector(cluster, args.jns)) 32 | def main(): 33 | args = utils.parse_args() 34 | host = args.host 35 | port = int(args.port) 36 | start_http_server(port, host) 37 | print "Listen at %s:%s" % (host, port) 38 | register_prometheus(args.cluster, args) 39 | while True: 40 | time.sleep(300) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /hdfs_datanode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import yaml 5 | import re 6 | from prometheus_client.core import GaugeMetricFamily 7 | 8 | from utils import get_module_logger 9 | from common import MetricCollector, CommonMetricCollector 10 | from scraper import ScrapeMetrics 11 | 12 | logger = get_module_logger(__name__) 13 | 14 | 15 | class DataNodeMetricCollector(MetricCollector): 16 | def __init__(self, cluster, nnc): 17 | MetricCollector.__init__(self, cluster, "hdfs", "datanode") 18 | self.target = "-" 19 | self.nnc = nnc 20 | 21 | self.hadoop_datanode_metrics = {} 22 | for i in range(len(self.file_list)): 23 | self.hadoop_datanode_metrics.setdefault(self.file_list[i], {}) 24 | 25 | self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "datanode") 26 | 27 | def collect(self): 28 | isSetup = False 29 | if self.nnc.dns == "": 30 | return 31 | beans_list = ScrapeMetrics(self.nnc.dns).scrape() 32 | for beans in beans_list: 33 | if not isSetup: 34 | self.common_metric_collector.setup_labels(beans) 35 | self.setup_metrics_labels(beans) 36 | isSetup = True 37 | for i in range(len(beans)): 38 | if 'tag.Hostname' in beans[i]: 39 | self.target = beans[i]["tag.Hostname"] 40 | break 41 | 
self.hadoop_datanode_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) 42 | self.get_metrics(beans) 43 | 44 | for i in range(len(self.merge_list)): 45 | service = self.merge_list[i] 46 | if service in self.hadoop_datanode_metrics: 47 | for metric in self.hadoop_datanode_metrics[service]: 48 | yield self.hadoop_datanode_metrics[service][metric] 49 | 50 | def setup_dninfo_labels(self): 51 | for metric in self.metrics['DataNodeInfo']: 52 | if 'VolumeInfo' in metric: 53 | label = ["cluster", "version", "path", "state"] 54 | name = "_".join([self.prefix, 'volume_state']) 55 | else: 56 | label = ["cluster", "version"] 57 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 58 | name = "_".join([self.prefix, snake_case]) 59 | label.append("_target") 60 | self.hadoop_datanode_metrics['DataNodeInfo'][metric] = GaugeMetricFamily(name, self.metrics['DataNodeInfo'][metric], labels=label) 61 | 62 | def setup_dnactivity_labels(self): 63 | block_flag, client_flag = 1, 1 64 | for metric in self.metrics['DataNodeActivity']: 65 | if 'Blocks' in metric: 66 | if block_flag: 67 | label = ['cluster', 'host', 'oper'] 68 | key = "Blocks" 69 | name = "block_operations_total" 70 | descriptions = "Total number of blocks in different oprations" 71 | block_flag = 0 72 | else: 73 | continue 74 | elif 'Client' in metric: 75 | if client_flag: 76 | label = ['cluster', 'host', 'oper', 'client'] 77 | key = "Client" 78 | name = "from_client_total" 79 | descriptions = "Total number of each operations from different client" 80 | client_flag = 0 81 | else: 82 | continue 83 | else: 84 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 85 | label = ['cluster', 'host'] 86 | key = metric 87 | name = snake_case 88 | descriptions = self.metrics['DataNodeActivity'][metric] 89 | label.append("_target") 90 | self.hadoop_datanode_metrics['DataNodeActivity'][key] = GaugeMetricFamily("_".join([self.prefix, name]), descriptions, labels=label) 91 | 92 | def 
setup_fsdatasetstate_labels(self): 93 | for metric in self.metrics['FSDatasetState']: 94 | label = ['cluster', 'host', "_target"] 95 | if "Num" in metric: 96 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric.split("Num")[1]).lower() 97 | else: 98 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 99 | name = "_".join([self.prefix, snake_case]) 100 | self.hadoop_datanode_metrics['FSDatasetState'][metric] = GaugeMetricFamily(name, self.metrics['FSDatasetState'][metric], labels=label) 101 | 102 | def setup_metrics_labels(self, beans): 103 | for i in range(len(beans)): 104 | if 'DataNodeInfo' in beans[i]['name']: 105 | self.setup_dninfo_labels() 106 | if 'DataNodeActivity' in beans[i]['name']: 107 | self.setup_dnactivity_labels() 108 | if 'FSDatasetState' in beans[i]['name']: 109 | self.setup_fsdatasetstate_labels() 110 | 111 | def get_dninfo_metrics(self, bean): 112 | for metric in self.metrics['DataNodeInfo']: 113 | version = bean['Version'] 114 | if 'VolumeInfo' in metric: 115 | if 'VolumeInfo' in bean: 116 | volume_info_dict = yaml.safe_load(bean['VolumeInfo']) 117 | for k, v in volume_info_dict.items(): 118 | path = k 119 | for key, val in v.items(): 120 | if key != "storageType": 121 | state = key 122 | label = [self.cluster, version, path, state, self.target] 123 | value = val 124 | self.hadoop_datanode_metrics['DataNodeInfo'][metric].add_metric(label, value) 125 | else: 126 | continue 127 | else: 128 | label = [self.cluster, version, self.target] 129 | value = bean[metric] 130 | self.hadoop_datanode_metrics['DataNodeInfo'][metric].add_metric(label, value) 131 | 132 | def get_dnactivity_metrics(self, bean): 133 | for metric in self.metrics['DataNodeActivity']: 134 | host = bean['tag.Hostname'] 135 | label = [self.cluster, host] 136 | if 'Blocks' in metric: 137 | oper = metric.split("Blocks")[1] 138 | label.append(oper) 139 | key = "Blocks" 140 | elif 'Client' in metric: 141 | oper = metric.split("Client")[0].split("From")[0] 142 | 
client = metric.split("Client")[0].split("From")[1] 143 | label.extend([oper, client]) 144 | key = "Client" 145 | else: 146 | key = metric 147 | label.append(self.target) 148 | self.hadoop_datanode_metrics['DataNodeActivity'][key].add_metric(label, bean[metric] if metric in bean else 0) 149 | 150 | def get_fsdatasetstate_metrics(self, bean): 151 | for metric in self.metrics['FSDatasetState']: 152 | label = [self.cluster, self.target, self.target] 153 | self.hadoop_datanode_metrics['FSDatasetState'][metric].add_metric( 154 | label, bean[metric] if metric in bean else 0) 155 | 156 | def get_metrics(self, beans): 157 | for i in range(len(beans)): 158 | if 'DataNodeInfo' in beans[i]['name']: 159 | self.get_dninfo_metrics(beans[i]) 160 | if 'DataNodeActivity' in beans[i]['name']: 161 | self.get_dnactivity_metrics(beans[i]) 162 | if 'FSDatasetState' in beans[i]['name']: 163 | self.get_fsdatasetstate_metrics(beans[i]) 164 | -------------------------------------------------------------------------------- /hdfs_journalnode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import re 5 | from prometheus_client.core import GaugeMetricFamily, HistogramMetricFamily 6 | 7 | from utils import get_module_logger 8 | from common import MetricCollector, CommonMetricCollector 9 | from scraper import ScrapeMetrics 10 | 11 | logger = get_module_logger(__name__) 12 | 13 | 14 | class JournalNodeMetricCollector(MetricCollector): 15 | def __init__(self, cluster, urls): 16 | MetricCollector.__init__(self, cluster, "hdfs", "journalnode") 17 | self.target = "-" 18 | self.urls = urls 19 | 20 | self.hadoop_journalnode_metrics = {} 21 | for i in range(len(self.file_list)): 22 | self.hadoop_journalnode_metrics.setdefault(self.file_list[i], {}) 23 | 24 | self.common_metric_collector = CommonMetricCollector(cluster, "hdfs", "journalnode") 25 | 26 | self.scrape_metrics = ScrapeMetrics(urls) 27 | 28 | def 
def collect(self):
    """Scrape every JournalNode URL and yield the collected metric families."""
    labels_ready = False
    for beans in self.scrape_metrics.scrape():
        if not labels_ready:
            # Metric families are created once, from the first scrape result.
            self.common_metric_collector.setup_labels(beans)
            self.setup_metrics_labels(beans)
            labels_ready = True
        for bean in beans:
            if 'tag.Hostname' in bean:
                self.target = bean["tag.Hostname"]
                break
        common = self.common_metric_collector.get_metrics(beans, self.target)
        self.hadoop_journalnode_metrics.update(common)
        self.get_metrics(beans)
    for service in self.merge_list:
        if service not in self.hadoop_journalnode_metrics:
            continue
        for metric in self.hadoop_journalnode_metrics[service]:
            yield self.hadoop_journalnode_metrics[service][metric]

def setup_journalnode_labels(self):
    """Create one metric family per JournalNode metric.

    The SyncsNNs percentile metrics are folded into a single histogram per
    time window (60s / 300s / 3600s); every other metric becomes a gauge
    with a snake_cased name.
    """
    # window substring -> (family key, metric name suffix, help text)
    windows = {
        'Syncs60s': ("Syncs60", 'sync60s_latency_microseconds',
                     "The percentile of sync latency in microseconds in 60s granularity"),
        'Syncs300s': ("Syncs300", 'sync300s_latency_microseconds',
                      "The percentile of sync latency in microseconds in 300s granularity"),
        'Syncs3600s': ("Syncs3600", 'sync3600s_latency_microseconds',
                       "The percentile of sync latency in microseconds in 3600s granularity"),
    }
    label = ["cluster", "host", "_target"]
    for metric in self.metrics['JournalNode']:
        matched = [w for w in windows if w in metric]
        if matched:
            key, suffix, descriptions = windows[matched[0]]
            if key in self.hadoop_journalnode_metrics['JournalNode']:
                continue  # histogram for this window already created
            name = "_".join([self.prefix, suffix])
            self.hadoop_journalnode_metrics['JournalNode'][key] = HistogramMetricFamily(name, descriptions, labels=label)
        else:
            snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
            name = "_".join([self.prefix, snake_case])
            self.hadoop_journalnode_metrics['JournalNode'][metric] = GaugeMetricFamily(name, self.metrics['JournalNode'][metric], labels=label)

def setup_metrics_labels(self, beans):
    """Create all metric families once a Journal-* bean is seen."""
    for bean in beans:
        if 'name=Journal-' in bean['name']:
            self.setup_journalnode_labels()
def get_metrics(self, beans):
    """Fill JournalNode metric families from scraped beans.

    Percentile metrics named like ``Syncs60s95thPercentileLatencyMicros``
    are folded into one histogram per time window; the window's ``NumOps``
    metric supplies the +Inf bucket count. All other (known) metrics are
    recorded as plain gauges.

    Fixes over the previous version:
    - the per-window counts are initialized to 0, so a bean without a
      ``*NumOps`` entry no longer raises NameError when the +Inf bucket
      is appended;
    - buckets are built as a plain list and sorted with ``sorted`` instead
      of calling ``.sort()/.append()`` on the result of ``zip`` (which is
      an iterator on Python 3).
    """
    for i in range(len(beans)):
        if 'name=Journal-' not in beans[i]['name'] or 'JournalNode' not in self.metrics:
            continue
        host = beans[i]['tag.Hostname']
        label = [self.cluster, host, self.target]

        # window id -> [numops count, latency sum, [(percentile, value), ...]]
        windows = {"60": [0, 0.0, []], "300": [0, 0.0, []], "3600": [0, 0.0, []]}
        for metric in beans[i]:
            if not metric[0].isupper():
                continue  # skip tag.* / name / modelerType entries
            if "Syncs60s" in metric:
                window = "60"
            elif "Syncs300" in metric:
                window = "300"
            elif "Syncs3600" in metric:
                window = "3600"
            else:
                # plain gauge; unknown metrics (no family created) are skipped
                if metric in self.hadoop_journalnode_metrics['JournalNode']:
                    self.hadoop_journalnode_metrics['JournalNode'][metric].add_metric(label, beans[i][metric])
                continue
            stats = windows[window]
            if 'NumOps' in metric:
                stats[0] = beans[i][metric]
            else:
                # e.g. Syncs60s95thPercentileLatencyMicros -> percentile "0.95"
                tmp = metric.split("thPercentileLatencyMicros")[0].split("Syncs")[1].split("s")
                stats[2].append((str(float(tmp[1]) / 100.0), beans[i][metric]))
                stats[1] += beans[i][metric]
        for window, hist_key in (("60", "Syncs60"), ("300", "Syncs300"), ("3600", "Syncs3600")):
            count, total, buckets = windows[window]
            buckets = sorted(buckets)
            buckets.append(("+Inf", count))
            self.hadoop_journalnode_metrics['JournalNode'][hist_key].add_metric(label, buckets=buckets, sum_value=total)
def collect(self):
    """Scrape all NameNode JMX endpoints and yield metric families.

    Families are created from the first successful scrape; on every scrape
    self.target is refreshed from the first bean carrying tag.Hostname.
    """
    initialized = False
    for beans in self.scrape_metrics.scrape():
        if not initialized:
            self.common_metric_collector.setup_labels(beans)
            self.setup_metrics_labels(beans)
            initialized = True
        for bean in beans:
            if 'tag.Hostname' in bean:
                self.target = bean["tag.Hostname"]
                break
        self.hadoop_namenode_metrics.update(
            self.common_metric_collector.get_metrics(beans, self.target))
        self.get_metrics(beans)
    for service in self.merge_list:
        if service in self.hadoop_namenode_metrics:
            for metric in self.hadoop_namenode_metrics[service]:
                yield self.hadoop_namenode_metrics[service][metric]
def setup_nnactivity_labels(self):
    """Create the three NameNodeActivity families (NumOps/AvgTime/Operations).

    Each family is created exactly once; later metrics of the same kind
    reuse it (they differ only in the 'method' label at sample time).
    """
    created = set()
    label = ["cluster", "method", "_target"]
    for metric in self.metrics['NameNodeActivity']:
        if "NumOps" in metric:
            key = "MethodNumOps"
            name = "_".join([self.prefix, "nnactivity_method_ops_total"])
            description = "Total number of the times the method is called."
        elif "AvgTime" in metric:
            key = "MethodAvgTime"
            name = "_".join([self.prefix, "nnactivity_method_avg_time_milliseconds"])
            description = "Average turn around time of the method in milliseconds."
        else:
            key = "Operations"
            name = "_".join([self.prefix, "nnactivity_operations_total"])
            description = "Total number of each operation."
        if key not in created:
            created.add(key)
            self.hadoop_namenode_metrics['NameNodeActivity'][key] = GaugeMetricFamily(name, description, labels=label)

def setup_startupprogress_labels(self):
    """Create StartupProgress metric families.

    'ElapsedTime' and 'PercentComplete' describe the whole startup; the
    per-phase variants (*Count, *ElapsedTime, *Total, *PercentComplete)
    are folded into one family per kind, keyed by a 'phase' label.
    """
    created = set()
    for metric in self.metrics['StartupProgress']:
        snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
        if "ElapsedTime" == metric:
            key, name = "ElapsedTime", "total_elapsed_time_milliseconds"
            descriptions = "Total elapsed time in milliseconds."
        elif "PercentComplete" == metric:
            key, name = "PercentComplete", "complete_rate"
            descriptions = "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0)."
        elif "Count" in metric:
            key, name = "PhaseCount", "phase_count"
            descriptions = "Total number of steps completed in the phase."
        elif "ElapsedTime" in metric:
            key, name = "PhaseElapsedTime", "phase_elapsed_time_milliseconds"
            descriptions = "Total elapsed time in the phase in milliseconds."
        elif "Total" in metric:
            key, name = "PhaseTotal", "phase_total"
            descriptions = "Total number of steps in the phase."
        elif "PercentComplete" in metric:
            key, name = "PhasePercentComplete", "phase_complete_rate"
            descriptions = "Current rate completed in the phase (The max value is not 100 but 1.0)."
        else:
            key, name = metric, snake_case
            descriptions = self.metrics['StartupProgress'][metric]
        if key in created:
            continue
        created.add(key)
        label = ["cluster", "phase", "_target"]
        name = "_".join([self.prefix, "startup_process", name])
        self.hadoop_namenode_metrics['StartupProgress'][key] = GaugeMetricFamily(name, descriptions, labels=label)
def setup_fsnamesystem_labels(self):
    """Create FSNamesystem families; all Capacity* metrics share one family
    distinguished by a 'mode' label."""
    capacity_done = False
    for metric in self.metrics['FSNamesystem']:
        if metric.startswith('Capacity'):
            if capacity_done:
                continue
            capacity_done = True
            key = "capacity"
            label = ["cluster", "mode"]
            name = "capacity_bytes"
            descriptions = "Current DataNodes capacity in each mode in bytes"
        else:
            key = metric
            label = ["cluster"]
            name = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
            descriptions = self.metrics['FSNamesystem'][metric]
        label.append("_target")
        name = "_".join([self.prefix, "fsname_system", name])
        self.hadoop_namenode_metrics['FSNamesystem'][key] = GaugeMetricFamily(name, descriptions, labels=label)

def setup_fsnamesystem_state_labels(self):
    """Create FSNamesystemState families; the Num*DataNodes counters share
    one family distinguished by a 'state' label."""
    datanodes_done = False
    for metric in self.metrics['FSNamesystemState']:
        snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
        if 'DataNodes' in metric:
            if datanodes_done:
                continue
            datanodes_done = True
            key = "datanodes_num"
            label = ["cluster", "state"]
            descriptions = "Number of datanodes in each state"
        else:
            key = metric
            label = ["cluster"]
            descriptions = self.metrics['FSNamesystemState'][metric]
        label.append("_target")
        name = "_".join([self.prefix, "fsname_system_state", snake_case])
        self.hadoop_namenode_metrics['FSNamesystemState'][key] = GaugeMetricFamily(name, descriptions, labels=label)
def setup_retrycache_labels(self):
    """Create the single RetryCache family; individual cache counters are
    distinguished at sample time by the 'mode' label."""
    if self.metrics['RetryCache']:
        label = ["cluster", "mode", "_target"]
        name = "_".join([self.prefix, "cache_total"])
        description = "Total number of RetryCache in each mode"
        self.hadoop_namenode_metrics['RetryCache']["cache"] = GaugeMetricFamily(name, description, labels=label)

def setup_nninfo_labels(self):
    """Create NameNodeInfo metric families.

    Covers the per-node tables (LiveNodes / DeadNodes / DecomNodes /
    EnteringMaintenanceNodes), CorruptFiles, NodeUsage, SoftwareVersion,
    Safemode and any remaining plain gauges.

    Fix: the DeadNodes family previously declared 4 label names while
    get_nninfo_metrics adds samples with 5 label values (it appends
    self.target) — "_target" is now part of the label list so names and
    values line up.
    """
    for metric in self.metrics['NameNodeInfo']:
        if "LiveNodes" in metric:
            name = "_".join([self.prefix, "nninfo_live_nodes_count"])
            description = "Count of live data node"
            self.hadoop_namenode_metrics['NameNodeInfo']["LiveNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"])
            label = ["cluster", "datanode", "infoAddr", "infoSecureAddr", "xferaddr", "version", "_target"]
            for item in ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks",
                         "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"]:
                item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                name = "_".join([self.prefix, "nninfo_live_nodes", item])
                description = "Live node " + item
                if item == "admin_state":
                    description += " 0: In Service, 1: Decommission In Progress, 2: Decommissioned"
                self.hadoop_namenode_metrics['NameNodeInfo']["LiveNodes-" + item] = GaugeMetricFamily(name, description, labels=label)
        elif "DeadNodes" in metric:
            name = "_".join([self.prefix, "nninfo_dead_nodes_count"])
            description = "Count of dead data node"
            self.hadoop_namenode_metrics['NameNodeInfo']["DeadNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"])
            # "_target" added: get_nninfo_metrics passes 5 label values.
            label = ["cluster", "datanode", "decommissioned", "xferaddr", "_target"]
            name = "_".join([self.prefix, "nninfo_dead_nodes_last_contact"])
            description = "Dead node last contact in milions"
            self.hadoop_namenode_metrics['NameNodeInfo']["DeadNodes"] = GaugeMetricFamily(name, description, labels=label)
        elif "DecomNodes" in metric:
            name = "_".join([self.prefix, "nninfo_decom_nodes_count"])
            description = "Count of decommissioned data node"
            self.hadoop_namenode_metrics['NameNodeInfo']["DecomNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"])
            label = ["cluster", "datanode", "xferaddr", "_target"]
            for item in ["underReplicatedBlocks", "decommissionOnlyReplicas", "underReplicateInOpenFiles"]:
                item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                name = "_".join([self.prefix, "nninfo_decom_nodes", item])
                description = "Decom Node " + item
                self.hadoop_namenode_metrics['NameNodeInfo']["DecomNodes-" + item] = GaugeMetricFamily(name, description, labels=label)
        elif "EnteringMaintenanceNodes" in metric:
            name = "_".join([self.prefix, "nninfo_maintenance_nodes_count"])
            description = "Count of maintenance data node"
            self.hadoop_namenode_metrics['NameNodeInfo']["MaintenanceNodeCount"] = GaugeMetricFamily(name, description, labels=["cluster", "_target"])
            label = ["cluster", "datanode", "xferaddr", "_target"]
            for item in ["underReplicatedBlocks", "maintenanceOnlyReplicas", "underReplicateInOpenFiles"]:
                item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                name = "_".join([self.prefix, "nninfo_entering_maintenance_nodes", item])
                description = "Entering maintenance node " + item
                self.hadoop_namenode_metrics['NameNodeInfo']["EnteringMaintenanceNodes-" + item] = GaugeMetricFamily(name, description, labels=label)
        elif "CorruptFiles" in metric:
            name = "_".join([self.prefix, "nninfo_corrupt_file_count"])
            self.hadoop_namenode_metrics['NameNodeInfo']["CorruptFiles"] = GaugeMetricFamily(name, "Corrupt file count", labels=["cluster", "_target"])
        elif "NodeUsage" in metric:
            for item in ["min", "median", "max", "stdDev"]:
                item = re.sub('([a-z0-9])([A-Z])', r'\1_\2', item).lower()
                name = "_".join([self.prefix, "nninfo_node_usage", item])
                self.hadoop_namenode_metrics['NameNodeInfo']["NodeUsage-" + item] = GaugeMetricFamily(name, "Node usage " + item, labels=["cluster", "_target"])
        elif "SoftwareVersion" in metric:
            name = "_".join([self.prefix, "nninfo_software_version"])
            self.hadoop_namenode_metrics['NameNodeInfo']["SoftwareVersion"] = GaugeMetricFamily(name, self.metrics["NameNodeInfo"][metric], labels=["cluster", "software_version", "_target"])
        elif "Safemode" in metric:
            name = "_".join([self.prefix, "nninfo_safe_mode"])
            self.hadoop_namenode_metrics['NameNodeInfo']["Safemode"] = GaugeMetricFamily(name, self.metrics["NameNodeInfo"][metric], labels=["cluster", "_target"])
        else:
            snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower()
            name = "_".join([self.prefix, "nninfo", snake_case])
            self.hadoop_namenode_metrics['NameNodeInfo'][metric] = GaugeMetricFamily(name, self.metrics["NameNodeInfo"][metric], labels=["cluster", "_target"])
def setup_metrics_labels(self, beans):
    """Create metric families for every bean group present in the scrape."""
    for bean in beans:
        name = bean['name']
        if 'NameNodeActivity' in name:
            self.setup_nnactivity_labels()
        if 'StartupProgress' in name:
            self.setup_startupprogress_labels()
        if 'FSNamesystem' in name:
            self.setup_fsnamesystem_labels()
        if 'FSNamesystemState' in name:
            self.setup_fsnamesystem_state_labels()
        if 'RetryCache' in name:
            self.setup_retrycache_labels()
        if "NameNodeInfo" in name:
            self.setup_nninfo_labels()

def get_nnactivity_metrics(self, bean):
    """Record NameNodeActivity values, deriving the 'method' label from the
    metric name (the part before NumOps/AvgTime/Ops)."""
    for metric in self.metrics['NameNodeActivity']:
        if "NumOps" in metric:
            key, method = "MethodNumOps", metric.split('NumOps')[0]
        elif "AvgTime" in metric:
            key, method = "MethodAvgTime", metric.split('AvgTime')[0]
        else:
            key = "Operations"
            method = metric.split('Ops')[0] if "Ops" in metric else metric
        value = bean[metric] if metric in bean else 0
        self.hadoop_namenode_metrics['NameNodeActivity'][key].add_metric(
            [self.cluster, method, self.target], value)

def get_startupprogress_metrics(self, bean):
    """Record StartupProgress values, splitting the phase name out of the
    metric name; whole-startup metrics get phase '-'."""
    for metric in self.metrics['StartupProgress']:
        if "Count" in metric:
            key, phase = "PhaseCount", metric.split("Count")[0]
        elif "ElapsedTime" in metric and "ElapsedTime" != metric:
            key, phase = "PhaseElapsedTime", metric.split("ElapsedTime")[0]
        elif "Total" in metric:
            key, phase = "PhaseTotal", metric.split("Total")[0]
        elif "PercentComplete" in metric and "PercentComplete" != metric:
            key, phase = "PhasePercentComplete", metric.split("PercentComplete")[0]
        else:
            key, phase = metric, "-"
        value = bean[metric] if metric in bean else 0
        self.hadoop_namenode_metrics['StartupProgress'][key].add_metric(
            [self.cluster, phase, self.target], value)
def get_fsnamesystem_metrics(self, bean):
    """Record FSNamesystem values.

    tag.HAState is encoded numerically (initializing/active/standby/stopping
    -> 0..3, anything else -> 9999); Capacity* metrics share the 'capacity'
    family with a 'mode' label.
    """
    ha_codes = {'initializing': 0.0, 'active': 1.0, 'standby': 2.0, 'stopping': 3.0}
    for metric in self.metrics['FSNamesystem']:
        if 'HAState' in metric:
            value = ha_codes.get(bean['tag.HAState'], 9999)
            self.hadoop_namenode_metrics['FSNamesystem'][metric].add_metric([self.cluster, self.target], value)
        elif metric.startswith("Capacity"):
            mode = metric.split("Capacity")[1]
            value = bean[metric] if metric in bean else 0
            self.hadoop_namenode_metrics['FSNamesystem']['capacity'].add_metric([self.cluster, mode, self.target], value)
        else:
            value = bean[metric] if metric in bean else 0
            self.hadoop_namenode_metrics['FSNamesystem'][metric].add_metric([self.cluster, self.target], value)

def get_fsnamesystem_state_metrics(self, bean):
    """Record FSNamesystemState values.

    FSState is encoded numerically (Safemode -> 0, Operational -> 1, else
    9999); TotalSyncTimes is parsed from its whitespace-padded string form;
    the Num*DataNodes counters share the 'datanodes_num' family keyed by a
    'state' label.
    """
    for metric in self.metrics['FSNamesystemState']:
        if 'FSState' in metric:
            fs_codes = {'Safemode': 0.0, 'Operational': 1.0}
            value = fs_codes.get(bean['FSState'], 9999)
            self.hadoop_namenode_metrics['FSNamesystemState'][metric].add_metric([self.cluster, self.target], value)
        elif "TotalSyncTimes" in metric:
            value = float(re.sub(r'\s', '', bean[metric])) if metric in bean and bean[metric] else 0
            self.hadoop_namenode_metrics['FSNamesystemState'][metric].add_metric([self.cluster, self.target], value)
        elif "DataNodes" in metric:
            state = metric.split("DataNodes")[0].split("Num")[1]
            value = bean[metric] if metric in bean and bean[metric] else 0
            self.hadoop_namenode_metrics['FSNamesystemState']['datanodes_num'].add_metric([self.cluster, state, self.target], value)
        else:
            value = bean[metric] if metric in bean and bean[metric] else 0
            self.hadoop_namenode_metrics['FSNamesystemState'][metric].add_metric([self.cluster, self.target], value)
def get_retrycache_metrics(self, bean):
    """Record the RetryCache counters into the shared 'cache' family; the
    'mode' label is the part of the metric name after 'Cache'."""
    for metric in self.metrics['RetryCache']:
        mode = metric.split('Cache')[1]
        value = bean[metric] if metric in bean and bean[metric] else 0
        self.hadoop_namenode_metrics['RetryCache']["cache"].add_metric([self.cluster, mode, self.target], value)

def get_nninfo_metrics(self, bean):
    """Record NameNodeInfo values.

    The node tables (LiveNodes / DeadNodes / DecomNodes /
    EnteringMaintenanceNodes), CorruptFiles and NodeUsage arrive as JSON
    strings and are parsed with yaml.safe_load first. Also refreshes
    self.dns with the datanode JMX URLs discovered in LiveNodes.
    """
    camel = re.compile('([a-z0-9])([A-Z])')
    for metric in self.metrics["NameNodeInfo"]:
        if "LiveNodes" in metric and "LiveNodes" in bean:
            nodes = yaml.safe_load(bean["LiveNodes"])
            self.hadoop_namenode_metrics["NameNodeInfo"]["LiveNodeCount"].add_metric([self.cluster, self.target], len(nodes))
            discovered = set()
            for node, info in nodes.items():
                label = [self.cluster, node, info["infoAddr"], info["infoSecureAddr"], info["xferaddr"], info["version"], self.target]
                discovered.add("http://" + info["infoAddr"] + "/jmx")
                for item in ["lastContact", "usedSpace", "adminState", "nonDfsUsedSpace", "capacity", "numBlocks",
                             "used", "remaining", "blockScheduled", "blockPoolUsed", "blockPoolUsedPercent", "volfails"]:
                    value = info[item] if item in info else 0
                    if item == "adminState":
                        # encode 0/1/2: In Service / Decommission In Progress / other (Decommissioned)
                        if value == "In Service":
                            value = 0
                        elif value == "Decommission In Progress":
                            value = 1
                        else:
                            value = 2
                    key = "LiveNodes-" + camel.sub(r'\1_\2', item).lower()
                    self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value)
            self.dns = discovered
        elif "DeadNodes" in metric and "DeadNodes" in bean:
            nodes = yaml.safe_load(bean["DeadNodes"])
            self.hadoop_namenode_metrics["NameNodeInfo"]["DeadNodeCount"].add_metric([self.cluster, self.target], len(nodes))
            for node, info in nodes.items():
                label = [self.cluster, node, str(info["decommissioned"]), info["xferaddr"], self.target]
                self.hadoop_namenode_metrics["NameNodeInfo"]["DeadNodes"].add_metric(label, info["lastContact"])
        elif "DecomNodes" in metric and "DecomNodes" in bean:
            nodes = yaml.safe_load(bean["DecomNodes"])
            self.hadoop_namenode_metrics["NameNodeInfo"]["DecomNodeCount"].add_metric([self.cluster, self.target], len(nodes))
            for node, info in nodes.items():
                label = [self.cluster, node, info["xferaddr"], self.target]
                for item in ["underReplicatedBlocks", "decommissionOnlyReplicas", "underReplicateInOpenFiles"]:
                    value = info[item] if item in info else 0
                    key = "DecomNodes-" + camel.sub(r'\1_\2', item).lower()
                    self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value)
        elif "EnteringMaintenanceNodes" in metric and "EnteringMaintenanceNodes" in bean:
            nodes = yaml.safe_load(bean["EnteringMaintenanceNodes"])
            self.hadoop_namenode_metrics["NameNodeInfo"]["MaintenanceNodeCount"].add_metric([self.cluster, self.target], len(nodes))
            for node, info in nodes.items():
                label = [self.cluster, node, info["xferaddr"], self.target]
                for item in ["underReplicatedBlocks", "maintenanceOnlyReplicas", "underReplicateInOpenFiles"]:
                    value = info[item] if item in info else 0
                    key = "EnteringMaintenanceNodes-" + camel.sub(r'\1_\2', item).lower()
                    self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric(label, value)
        elif "CorruptFiles" in metric and "CorruptFiles" in bean:
            corrupt = yaml.safe_load(bean["CorruptFiles"])
            self.hadoop_namenode_metrics["NameNodeInfo"]["CorruptFiles"].add_metric([self.cluster, self.target], len(corrupt))
        elif "NodeUsage" in metric and "NodeUsage" in bean:
            usage = yaml.safe_load(bean["NodeUsage"])["nodeUsage"]
            for item in ["min", "median", "max", "stdDev"]:
                value = usage[item] if item in usage else 0
                value = float(value.strip("%"))
                key = "NodeUsage-" + camel.sub(r'\1_\2', item).lower()
                self.hadoop_namenode_metrics["NameNodeInfo"][key].add_metric([self.cluster, self.target], value)
        elif "SoftwareVersion" in metric and "SoftwareVersion" in bean:
            self.hadoop_namenode_metrics["NameNodeInfo"]["SoftwareVersion"].add_metric([self.cluster, bean["SoftwareVersion"], self.target], 0)
        elif "Safemode" in metric and "Safemode" in bean:
            # empty Safemode string means "not in safe mode"
            value = 0 if metric in bean and bean[metric] == "" else 1
            self.hadoop_namenode_metrics["NameNodeInfo"]["Safemode"].add_metric([self.cluster, self.target], value)
        else:
            value = bean[metric] if metric in bean and bean[metric] else 0
            self.hadoop_namenode_metrics['NameNodeInfo'][metric].add_metric([self.cluster, self.target], value)
def get_metrics(self, beans):
    """Dispatch each scraped bean to the matching NameNode metric handler.

    FSNamesystemState beans also contain 'FSNamesystem' as a substring, so
    the plain FSNamesystem handler explicitly excludes them.
    """
    for bean in beans:
        name = bean['name']
        if 'NameNodeActivity' in name:
            self.get_nnactivity_metrics(bean)
        if 'StartupProgress' in name:
            self.get_startupprogress_metrics(bean)
        if 'FSNamesystem' in name and 'FSNamesystemState' not in name:
            self.get_fsnamesystem_metrics(bean)
        if 'FSNamesystemState' in name:
            self.get_fsnamesystem_state_metrics(bean)
        if 'RetryCache' in name:
            self.get_retrycache_metrics(bean)
        if 'NameNodeInfo' in name:
            self.get_nninfo_metrics(bean)
"MemHeapCommittedM": "Current heap memory committed in MB.", 7 | "MemHeapMaxM": "Max heap memory size in MB.", 8 | "MemMaxM": "Max memory size in MB.", 9 | "ThreadsNew": "Current number of NEW threads.", 10 | "ThreadsRunnable": "Current number of RUNNABLE threads.", 11 | "ThreadsBlocked": "Current number of BLOCKED threads.", 12 | "ThreadsWaiting": "Current number of WAITING threads.", 13 | "ThreadsTimedWaiting": "Current number of TIMED_WAITING threads.", 14 | "ThreadsTerminated": "Current number of TERMINATED threads.", 15 | "GcCount": "Total number of Gc count", 16 | "GcTimeMillis": "Total GC time in msec.", 17 | "GcCountParNew": "ParNew GC count.", 18 | "GcTimeMillisParNew": "ParNew GC time in msec.", 19 | "GcCountConcurrentMarkSweep": "ConcurrentMarkSweep GC count.", 20 | "GcTimeMillisConcurrentMarkSweep": "ConcurrentMarkSweep GC time in msec.", 21 | "GcNumWarnThresholdExceeded": "Number of times that the GC warn threshold is exceeded.", 22 | "GcNumInfoThresholdExceeded": "Number of times that the GC info threshold is exceeded.", 23 | "GcTotalExtraSleepTime": "Total GC extra sleep time in msec.", 24 | "LogFatal": "Total number of FATAL logs.", 25 | "LogError": "Total number of ERROR logs.", 26 | "LogWarn": "Total number of WARN logs.", 27 | "LogInfo": "Total number of INFO logs." 
28 | } 29 | -------------------------------------------------------------------------------- /metrics/common/MetricsSystem.json: -------------------------------------------------------------------------------- 1 | { 2 | "NumActiveSources": "Current number of active metrics sources.", 3 | "NumAllSources": "Total number of metrics sources.", 4 | "NumActiveSinks": "Current number of active sinks.", 5 | "NumAllSinks": "Total number of sinks (BUT usually less than NumActiveSinks, see HADOOP-9946).", 6 | "SnapshotNumOps": "Total number of operations to snapshot statistics from a metrics source.", 7 | "SnapshotAvgTime": "Average time in milliseconds to snapshot statistics from a metrics source.", 8 | "PublishNumOps": "Total number of operations to publish statistics to a sink.", 9 | "PublishAvgTime": "Average time in milliseconds to publish statistics to a sink.", 10 | "DroppedPubAll": "Total number of dropped publishes.", 11 | "Sink_instanceNumOps": "Total number of sink operations for the instance.", 12 | "Sink_instanceAvgTime": "Average time in milliseconds of sink operations for the instance.", 13 | "Sink_instanceDropped": "Total number of dropped sink operations for the instance.", 14 | "Sink_instanceQsize": "Current queue length of sink operations (BUT always set to 0 because nothing to increment this metrics, see HADOOP-9941)." 
15 | } -------------------------------------------------------------------------------- /metrics/common/OperatingSystem.json: -------------------------------------------------------------------------------- 1 | { 2 | "OpenFileDescriptorCount": "Total number of open file descriptor", 3 | "MaxFileDescriptorCount": "Total number of max file descriptor", 4 | "CommittedVirtualMemorySize": "The size of committed virtual memory in bytes", 5 | "TotalSwapSpaceSize": "The size of total swap space in bytes", 6 | "FreeSwapSpaceSize": "The size of free swap space in bytes", 7 | "ProcessCpuTime": "Total process cpu time in microseconds", 8 | "FreePhysicalMemorySize": "The size of free physical memory in bytes", 9 | "TotalPhysicalMemorySize": "The size of total physical memory in bytes", 10 | "SystemCpuLoad": "Average of system CPU load", 11 | "ProcessCpuLoad": "Average of process CPU load", 12 | "SystemLoadAverage": "Average of system load", 13 | "AvailableProcessors": "Total number of available processors" 14 | } -------------------------------------------------------------------------------- /metrics/common/RpcActivity.json: -------------------------------------------------------------------------------- 1 | { 2 | "ReceivedBytes": "Total number of received bytes", 3 | "SentBytes": "Total number of sent bytes", 4 | "RpcQueueTimeNumOps": "Total number of RPC calls", 5 | "RpcQueueTimeAvgTime": "Average queue time in milliseconds", 6 | "RpcProcessingTimeNumOps": "Total number of RPC calls (same to RpcQueueTimeNumOps)", 7 | "RpcProcessingTimeAvgTime": "Average Processing time in milliseconds", 8 | "RpcAuthenticationFailures": "Total number of authentication failures", 9 | "RpcAuthenticationSuccesses": "Total number of authentication successes", 10 | "RpcAuthorizationFailures": "Total number of authorization failures", 11 | "RpcAuthorizationSuccesses": "Total number of authorization successes", 12 | "RpcClientBackoff": "Total number of RPC client back off", 13 | "RpcSlowCalls": 
"Total number of RPC slow calls", 14 | "NumOpenConnections": "Current number of open connections", 15 | "CallQueueLength": "Current length of the call queue" 16 | } 17 | -------------------------------------------------------------------------------- /metrics/common/RpcDetailedActivity.json: -------------------------------------------------------------------------------- 1 | { 2 | "methodNumOps": "Total number of the times the method is called", 3 | "methodAvgTime": "Average turn around time of the method in milliseconds" 4 | } -------------------------------------------------------------------------------- /metrics/common/Runtime.json: -------------------------------------------------------------------------------- 1 | { 2 | "Uptime": "components uptime in milliseconds", 3 | "StartTime": "components start time in milliseconds" 4 | } -------------------------------------------------------------------------------- /metrics/common/UgiMetrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "LoginSuccessNumOps": "Total number of successful kerberos logins.", 3 | "LoginSuccessAvgTime": "Average time for successful kerberos logins in milliseconds.", 4 | "LoginFailureNumOps": "Total number of failed kerberos logins.", 5 | "LoginFailureAvgTime": "Average time for failed kerberos logins in milliseconds.", 6 | "GetGroupsNumOps": "Total number of group resolutions.", 7 | "GetGroupsAvgTime": "Average time for group resolution in milliseconds.", 8 | "RenewalFailuresTotal": "Total number of renewal failures.", 9 | "RenewalFailures": "Current number of renewal failures." 
10 | } -------------------------------------------------------------------------------- /metrics/datanode/DataNodeActivity.json: -------------------------------------------------------------------------------- 1 | { 2 | "BytesWritten": "Total number of bytes written to DataNode", 3 | "BytesRead": "Total number of bytes read from DataNode", 4 | "TotalWriteTime": "Total number of milliseconds spent on write operation", 5 | "TotalReadTime": "Total number of milliseconds spent on read operation", 6 | "BlocksWritten": "Total number of blocks written to DataNode", 7 | "BlocksRead": "Total number of blocks read from DataNode", 8 | "BlocksReplicated": "Total number of blocks replicated", 9 | "BlocksRemoved": "Total number of blocks removed", 10 | "BlocksVerified": "Total number of blocks verified", 11 | "BlockVerificationFailures": "Total number of verifications failures", 12 | "BlocksCached": "Total number of blocks cached", 13 | "BlocksUncached": "Total number of blocks uncached", 14 | "ReadsFromLocalClient": "Total number of read operations from local client", 15 | "ReadsFromRemoteClient": "Total number of read operations from remote client", 16 | "WritesFromLocalClient": "Total number of write operations from local client", 17 | "WritesFromRemoteClient": "Total number of write operations from remote client", 18 | "BlocksGetLocalPathInfo": "Total number of operations to get local path names of blocks", 19 | "RemoteBytesRead": "Number of bytes read by remote clients", 20 | "RemoteBytesWritten": "Number of bytes written by remote clients", 21 | "RamDiskBlocksWrite": "Total number of blocks written to memory", 22 | "RamDiskBlocksWriteFallback": "Total number of blocks written to memory but not satisfied (failed-over to disk)", 23 | "RamDiskBytesWrite": "Total number of bytes written to memory", 24 | "RamDiskBlocksReadHits": "Total number of times a block in memory was read", 25 | "RamDiskBlocksEvicted": "Total number of blocks evicted in memory", 26 | 
"RamDiskBlocksEvictedWithoutRead": "Total number of blocks evicted in memory without ever being read from memory", 27 | "RamDiskBlocksEvictionWindowMsNumOps": "Number of blocks evicted in memory", 28 | "RamDiskBlocksEvictionWindowMsAvgTime": "Average time of blocks in memory before being evicted in milliseconds", 29 | "RamDiskBlocksLazyPersisted": "Total number of blocks written to disk by lazy writer", 30 | "RamDiskBlocksDeletedBeforeLazyPersisted": "Total number of blocks deleted by application before being persisted to disk", 31 | "RamDiskBytesLazyPersisted": "Total number of bytes written to disk by lazy writer", 32 | "RamDiskBlocksLazyPersistWindowMsNumOps": "Number of blocks written to disk by lazy writer", 33 | "RamDiskBlocksLazyPersistWindowMsAvgTime": "Average time of blocks written to disk by lazy writer in milliseconds", 34 | "FsyncCount": "Total number of fsync", 35 | "VolumeFailures": "Total number of volume failures occurred", 36 | "DatanodeNetworkErrors" : "Total number of datanode network error", 37 | "DataNodeActiveXceiversCount" : "Total number of datanode active Xceivers", 38 | "ReadBlockOpNumOps": "Total number of read operations", 39 | "ReadBlockOpAvgTime": "Average time of read operations in milliseconds", 40 | "WriteBlockOpNumOps": "Total number of write operations", 41 | "WriteBlockOpAvgTime": "Average time of write operations in milliseconds", 42 | "BlockChecksumOpNumOps": "Total number of blockChecksum operations", 43 | "BlockChecksumOpAvgTime": "Average time of blockChecksum operations in milliseconds", 44 | "CopyBlockOpNumOps": "Total number of block copy operations", 45 | "CopyBlockOpAvgTime": "Average time of block copy operations in milliseconds", 46 | "ReplaceBlockOpNumOps": "Total number of block replace operations", 47 | "ReplaceBlockOpAvgTime": "Average time of block replace operations in milliseconds", 48 | "HeartbeatsNumOps": "Total number of heartbeats", 49 | "HeartbeatsAvgTime": "Average heartbeat time in milliseconds", 50 | 
"HeartbeatsTotalNumOps": "Total number of heartbeats which is a duplicate of HeartbeatsNumOps", 51 | "HeartbeatsTotalAvgTime": "Average total heartbeat time in milliseconds", 52 | "LifelinesNumOps": "Total number of lifeline messages", 53 | "LifelinesAvgTime": "Average lifeline message processing time in milliseconds", 54 | "BlockReportsNumOps": "Total number of block report operations", 55 | "BlockReportsAvgTime": "Average time of block report operations in milliseconds", 56 | "IncrementalBlockReportsNumOps": "Total number of incremental block report operations", 57 | "IncrementalBlockReportsAvgTime": "Average time of incremental block report operations in milliseconds", 58 | "CacheReportsNumOps": "Total number of cache report operations", 59 | "CacheReportsAvgTime": "Average time of cache report operations in milliseconds", 60 | "PacketAckRoundTripTimeNanosNumOps": "Total number of ack round trip", 61 | "PacketAckRoundTripTimeNanosAvgTime": "Average time from ack send to receive minus the downstream ack time in nanoseconds", 62 | "FlushNanosNumOps": "Total number of flushes", 63 | "FlushNanosAvgTime": "Average flush time in nanoseconds", 64 | "FsyncNanosNumOps": "Total number of fsync", 65 | "FsyncNanosAvgTime": "Average fsync time in nanoseconds", 66 | "SendDataPacketBlockedOnNetworkNanosNumOps": "Total number of sending packets", 67 | "SendDataPacketBlockedOnNetworkNanosAvgTime": "Average waiting time of sending packets in nanoseconds", 68 | "SendDataPacketTransferNanosNumOps": "Total number of sending packets", 69 | "SendDataPacketTransferNanosAvgTime": "Average transfer time of sending packets in nanoseconds" 70 | } -------------------------------------------------------------------------------- /metrics/datanode/DataNodeInfo.json: -------------------------------------------------------------------------------- 1 | { 2 | "VolumeInfo": "Volume infomation in each path and in each mode", 3 | "XceiverCount": "Total number of datanode Xceivers" 4 | } 5 | 
-------------------------------------------------------------------------------- /metrics/datanode/FSDatasetState.json: -------------------------------------------------------------------------------- 1 | { 2 | "Capacity" : "Current raw capacity of DataNode in bytes", 3 | "DfsUsed" : "Current space used by DataNodes for DFS purposes in bytes", 4 | "Remaining" : "Current remaining capacity in bytes", 5 | "NumFailedVolumes" : "Total number of failed volumes", 6 | "LastVolumeFailureDate" : "Last time of volume failures", 7 | "EstimatedCapacityLostTotal" : "An estimate of the total capacity lost due to volume failures", 8 | "CacheUsed" : "Total number of cache used", 9 | "CacheCapacity" : "Current raw capacity of cache in bytes", 10 | "NumBlocksCached" : "Total number of blocks cached", 11 | "NumBlocksFailedToCache" : "Total number of blocks failed to cache", 12 | "NumBlocksFailedToUnCache" : "Total number of blocks failed to uncached" 13 | } -------------------------------------------------------------------------------- /metrics/journalnode/JournalNode.json: -------------------------------------------------------------------------------- 1 | { 2 | "Syncs60sNumOps": "Number of sync operations (1 minute granularity)", 3 | "Syncs60s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (1 minute granularity)", 4 | "Syncs60s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (1 minute granularity)", 5 | "Syncs60s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (1 minute granularity)", 6 | "Syncs60s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (1 minute granularity)", 7 | "Syncs60s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (1 minute granularity)", 8 | "Syncs300sNumOps": "Number of sync operations (5 minutes granularity)", 9 | "Syncs300s50thPercentileLatencyMicros": "The 50th percentile of sync latency in 
microseconds (5 minutes granularity)", 10 | "Syncs300s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (5 minutes granularity)", 11 | "Syncs300s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (5 minutes granularity)", 12 | "Syncs300s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (5 minutes granularity)", 13 | "Syncs300s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (5 minutes granularity)", 14 | "Syncs3600sNumOps": "Number of sync operations (1 hour granularity)", 15 | "Syncs3600s50thPercentileLatencyMicros": "The 50th percentile of sync latency in microseconds (1 hour granularity)", 16 | "Syncs3600s75thPercentileLatencyMicros": "The 75th percentile of sync latency in microseconds (1 hour granularity)", 17 | "Syncs3600s90thPercentileLatencyMicros": "The 90th percentile of sync latency in microseconds (1 hour granularity)", 18 | "Syncs3600s95thPercentileLatencyMicros": "The 95th percentile of sync latency in microseconds (1 hour granularity)", 19 | "Syncs3600s99thPercentileLatencyMicros": "The 99th percentile of sync latency in microseconds (1 hour granularity)", 20 | "BatchesWritten": "Total number of batches written since startup", 21 | "TxnsWritten": "Total number of transactions written since startup", 22 | "BytesWritten": "Total number of bytes written since startup", 23 | "BatchesWrittenWhileLagging": "Total number of batches written where this node was lagging", 24 | "LastWriterEpoch": "Current writer’s epoch number", 25 | "CurrentLagTxns": "The number of transactions that this JournalNode is lagging", 26 | "LastWrittenTxId": "The highest transaction id stored on this JournalNode", 27 | "LastPromisedEpoch": "The last epoch number which this node has promised not to accept any lower epoch, or 0 if no promises have been made", 28 | "LastJournalTimestamp": "The timestamp of last successfully written transaction" 29 | } 
-------------------------------------------------------------------------------- /metrics/namenode/FSNamesystem.json: -------------------------------------------------------------------------------- 1 | { 2 | "HAState": "(HA-only) Current state of the NameNode: 0.0 (for initializing) or 1.0 (for active) or 2.0 (for standby) or 3.0 (for stopping) state", 3 | "MissingBlocks": "Current number of missing blocks", 4 | "MissingReplOneBlocks": "Current number of missing blocks with replication factor 1", 5 | "ExpiredHeartbeats": "Total number of expired heartbeats", 6 | "TransactionsSinceLastCheckpoint": "Total number of transactions since last checkpoint", 7 | "TransactionsSinceLastLogRoll": "Total number of transactions since last edit log roll", 8 | "LastWrittenTransactionId": "Last transaction ID written to the edit log", 9 | "LastCheckpointTime": "Time in milliseconds since epoch of last checkpoint", 10 | "CapacityTotal": "Current raw capacity of DataNodes in bytes", 11 | "CapacityUsed": "Current used capacity across all DataNodes in bytes", 12 | "CapacityRemaining": "Current remaining capacity in bytes", 13 | "CapacityUsedNonDFS": "Current space used by DataNodes for non DFS purposes in bytes", 14 | "TotalLoad": "Current number of connections", 15 | "SnapshottableDirectories": "Current number of snapshottable directories", 16 | "Snapshots": "Current number of snapshots", 17 | "NumEncryptionZones": "Current number of encryption zones", 18 | "LockQueueLength": "Number of threads waiting to acquire FSNameSystem lock", 19 | "BlocksTotal": "Current number of allocated blocks in the system", 20 | "NumFilesUnderConstruction": "Current number of files under construction", 21 | "NumActiveClients": "Current number of active clients holding lease", 22 | "FilesTotal": "Current number of files and directories", 23 | "PendingReplicationBlocks": "Current number of blocks pending to be replicated", 24 | "UnderReplicatedBlocks": "Current number of blocks under replicated", 25 | 
"CorruptBlocks": "Current number of blocks with corrupt replicas.", 26 | "ScheduledReplicationBlocks": "Current number of blocks scheduled for replications", 27 | "PendingDeletionBlocks": "Current number of blocks pending deletion", 28 | "ExcessBlocks": "Current number of excess blocks", 29 | "NumTimedOutPendingReplications": "The number of timed out replications. Not the number of unique blocks that timed out. Note: The metric name will be changed to NumTimedOutPendingReconstructions in Hadoop 3 release.", 30 | "PostponedMisreplicatedBlocks": "(HA-only) Current number of blocks postponed to replicate", 31 | "PendingDataNodeMessageCount": "(HA-only) Current number of pending block-related messages for later processing in the standby NameNode", 32 | "MillisSinceLastLoadedEdits": "(HA-only) Time in milliseconds since the last time standby NameNode load edit log. In active NameNode, set to 0", 33 | "BlockCapacity": "Current number of block capacity", 34 | "StaleDataNodes": "Current number of DataNodes marked stale due to delayed heartbeat", 35 | "TotalSyncCount": "Total number of sync operations performed by edit log" 36 | } -------------------------------------------------------------------------------- /metrics/namenode/FSNamesystemState.json: -------------------------------------------------------------------------------- 1 | { 2 | "FsLockQueueLength": "Filesystem lock queue length", 3 | "MaxObjects": "Max objects", 4 | "BlockDeletionStartTime": "Start time of block deletion", 5 | "NumLiveDataNodes": "Number of datanodes which are currently live", 6 | "NumDeadDataNodes": "Number of datanodes which are currently dead", 7 | "NumDecomLiveDataNodes": "Number of datanodes which have been decommissioned and are now live", 8 | "NumDecomDeadDataNodes": "Number of datanodes which have been decommissioned and are now dead", 9 | "NumDecommissioningDataNodes": "Number of datanodes in decommissioning state", 10 | "NumStaleDataNodes": "Number of datanodes marked as content 
stale", 11 | "VolumeFailuresTotal": "Total number of volume failures across all Datanodes", 12 | "EstimatedCapacityLostTotal": "An estimate of the total capacity lost due to volume failures", 13 | "NumStaleStorages": "Number of storages marked as content stale (after NameNode restart/failover before first block report is received)", 14 | "FSState": "Current state of the file system: 0 (for Safemode) or 1(Operational)", 15 | "TotalSyncTimes": "Total number of milliseconds spent by various edit logs in sync operation" 16 | } -------------------------------------------------------------------------------- /metrics/namenode/NameNodeActivity.json: -------------------------------------------------------------------------------- 1 | { 2 | "CreateFileOps": "Total number of files created.", 3 | "FilesCreated": "Total number of files and directories created by create or mkdir operations.", 4 | "FilesAppended": "Total number of files appended.", 5 | "GetBlockLocations": "Total number of getBlockLocations operations.", 6 | "FilesRenamed": "Total number of rename operations (NOT number of files/dirs renamed).", 7 | "GetListingOps": "Total number of directory listing operations.", 8 | "DeleteFileOps": "Total number of delete operations.", 9 | "FilesDeleted": "Total number of files and directories deleted by delete or rename operations.", 10 | "FileInfoOps": "Total number of getFileInfo and getLinkFileInfo operations.", 11 | "AddBlockOps": "Total number of addBlock operations succeeded.", 12 | "GetAdditionalDatanodeOps": "Total number of getAdditionalDatanode operations.", 13 | "CreateSymlinkOps": "Total number of createSymlink operations.", 14 | "GetLinkTargetOps": "Total number of getLinkTarget operations.", 15 | "FilesInGetListingOps": "Total number of files and directories listed by directory listing operations.", 16 | "AllowSnapshotOps": "Total number of allowSnapshot operations.", 17 | "DisallowSnapshotOps": "Total number of disallowSnapshot operations.", 18 | 
"CreateSnapshotOps": "Total number of createSnapshot operations.", 19 | "DeleteSnapshotOps": "Total number of deleteSnapshot operations.", 20 | "RenameSnapshotOps": "Total number of renameSnapshot operations.", 21 | "ListSnapshottableDirOps": "Total number of snapshottableDirectoryStatus operations.", 22 | "SnapshotDiffReportOps": "Total number of getSnapshotDiffReport operations.", 23 | "TransactionsNumOps": "Total number of Journal transactions.", 24 | "TransactionsAvgTime": "Average time of Journal transactions in milliseconds.", 25 | "SyncsNumOps": "Total number of Journal syncs.", 26 | "SyncsAvgTime": "Average time of Journal syncs in milliseconds.", 27 | "TransactionsBatchedInSync": "Total number of Journal transactions batched in sync.", 28 | "BlockReportNumOps": "Total number of processing block reports from DataNode.", 29 | "BlockReportAvgTime": "Average time of processing block reports in milliseconds.", 30 | "CacheReportNumOps": "Total number of processing cache reports from DataNode.", 31 | "CacheReportAvgTime": "Average time of processing cache reports in milliseconds.", 32 | "SafeModeTime": "The interval between FSNameSystem starts and the last time safemode leaves in milliseconds. (sometimes not equal to the time in SafeMode, see HDFS-5156).", 33 | "FsImageLoadTime": "Time loading FS Image at startup in milliseconds.", 34 | "GetEditNumOps": "Total number of edits downloads from SecondaryNameNode.", 35 | "GetEditAvgTime": "Average edits download time in milliseconds.", 36 | "GetImageNumOps": "Total number of fsimage downloads from SecondaryNameNode.", 37 | "GetImageAvgTime": "Average fsimage download time in milliseconds.", 38 | "PutImageNumOps": "Total number of fsimage uploads to SecondaryNameNode.", 39 | "PutImageAvgTime": "Average fsimage upload time in milliseconds.", 40 | "TotalFileOps": "Total number of all file operations." 
41 | } 42 | -------------------------------------------------------------------------------- /metrics/namenode/NameNodeInfo.json: -------------------------------------------------------------------------------- 1 | { 2 | "Total": "Total", 3 | "TotalBlocks": "Total number of blocks", 4 | "Used": "Total used space by data nodes", 5 | "Free": "Total free space by data nodes", 6 | "Safemode": "Is in safe mode. 0: no, 1: yes", 7 | "NonDfsUsedSpace": "Total used space by data nodes for non DFS purposes such as storing temporary files on the local file system", 8 | "PercentUsed": "Total used space by data nodes as percentage of total capacity", 9 | "BlockPoolUsedSpace": "Block pool used space", 10 | "PercentBlockPoolUsed": "Percent of block pool used", 11 | "PercentRemaining": "Total remaining space by data nodes as percentage of total capacity", 12 | "CacheCapacity": "Cache Capacity", 13 | "CacheUsed": "Cache Used", 14 | "TotalFiles": "Total Files", 15 | "NumberOfMissingBlocks": "Number of missing blocks", 16 | "NumberOfMissingBlocksWithReplicationFactorOne": "Number of missing blocks with replication factor one", 17 | "LiveNodes": "Live nodes", 18 | "SoftwareVersion": "Software version", 19 | "DeadNodes": "Dead nodes", 20 | "DecomNodes": "Decom nodes", 21 | "EnteringMaintenanceNodes": "Entering maintenance nodes", 22 | "NodeUsage": "Node Usage", 23 | "NNStartedTimeInMillis": "NameNode started time in millis", 24 | "CorruptFiles": "Corrupt file list" 25 | } 26 | -------------------------------------------------------------------------------- /metrics/namenode/RetryCache.json: -------------------------------------------------------------------------------- 1 | { 2 | "CacheHit": "Total number of RetryCache hit.", 3 | "CacheCleared": "Total number of RetryCache cleared.", 4 | "CacheUpdated": "Total number of RetryCache updated." 
5 | } 6 | -------------------------------------------------------------------------------- /metrics/namenode/StartupProgress.json: -------------------------------------------------------------------------------- 1 | { 2 | "ElapsedTime": "Total elapsed time in milliseconds.", 3 | "PercentComplete": "Current rate completed in NameNode startup progress (The max value is not 100 but 1.0).", 4 | "LoadingFsImageCount": "", 5 | "LoadingFsImageElapsedTime": "", 6 | "LoadingFsImageTotal": "", 7 | "LoadingFsImagePercentComplete": "", 8 | "LoadingEditsCount": "", 9 | "LoadingEditsElapsedTime": "", 10 | "LoadingEditsTotal": "", 11 | "LoadingEditsPercentComplete": "", 12 | "SavingCheckpointCount": "", 13 | "SavingCheckpointElapsedTime": "", 14 | "SavingCheckpointTotal": "", 15 | "SavingCheckpointPercentComplete": "", 16 | "SafeModeCount": "", 17 | "SafeModeElapsedTime": "", 18 | "SafeModeTotal": "", 19 | "SafeModePercentComplete": "" 20 | } -------------------------------------------------------------------------------- /metrics/nodemanager/NodeManagerMetrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "ContainersLaunched": "Count of launched container", 3 | "ContainersCompleted": "Count of completed container", 4 | "ContainersFailed": "Count of failed container", 5 | "ContainersKilled": "Count of killed container", 6 | "ContainersIniting": "Count of initing container", 7 | "ContainersRunning": "Count of running container", 8 | "AllocatedGB": "Memory size of allocated (in GB)", 9 | "AllocatedContainers": "Count of allocated container", 10 | "AvailableGB": "Memory size of available (in GB)", 11 | "AllocatedVCores": "Count of allocated VCores", 12 | "AvailableVCores": "Count of available VCores", 13 | "ContainerLaunchDurationNumOps": "Count of launched container", 14 | "ContainerLaunchDurationAvgTime": "Average time of launching container (in ms)", 15 | "BadLocalDirs": "Count of bad local directory", 16 | "BadLogDirs": "Count 
of bad log directory", 17 | "GoodLocalDirsDiskUtilizationPerc": "Percent of good local directory disk utilization", 18 | "GoodLogDirsDiskUtilizationPerc": "Percent of good local log directory disk utilization" 19 | } 20 | -------------------------------------------------------------------------------- /metrics/nodemanager/ShuffleMetrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "ShuffleOutputBytes": "Output byte of shuffle", 3 | "ShuffleOutputsFailed": "Output failed of shuffle", 4 | "ShuffleOutputsOK": "Output ok of shuffle", 5 | "ShuffleConnections": "Connection count of shuffle" 6 | } 7 | -------------------------------------------------------------------------------- /metrics/resourcemanager/ClusterMetrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "NumActiveNMs": "Current number of active NodeManagers", 3 | "NumDecommissionedNMs": "Current number of decommissioned NodeManagers", 4 | "NumLostNMs": "Current number of lost NodeManagers for not sending heartbeats", 5 | "NumUnhealthyNMs": "Current number of unhealthy NodeManagers", 6 | "NumRebootedNMs": "Current number of rebooted NodeManagers", 7 | "AMLaunchDelayNumOps": "Total number of AMs launched", 8 | "AMLaunchDelayAvgTime": "Average time in milliseconds RM spends to launch AM containers after the AM container is allocated", 9 | "AMRegisterDelayNumOps": "Total number of AMs registered", 10 | "AMRegisterDelayAvgTime": "Average time in milliseconds AM spends to register with RM after the AM container gets launched" 11 | } -------------------------------------------------------------------------------- /metrics/resourcemanager/QueueMetrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "running_0": "Current number of running applications whose elapsed time are less than 60 minutes.", 3 | "running_60": "Current number of running applications whose elapsed 
time are between 60 and 300 minutes.", 4 | "running_300": "Current number of running applications whose elapsed time are between 300 and 1440 minutes.", 5 | "running_1440": "Current number of running applications elapsed time are more than 1440 minutes.", 6 | "AppsSubmitted": "Total number of submitted applications.", 7 | "AppsRunning": "Current number of running applications.", 8 | "AppsPending": "Current number of applications that have not yet been assigned by any containers.", 9 | "AppsCompleted": "Total number of completed applications.", 10 | "AppsKilled": "Total number of killed applications.", 11 | "AppsFailed": "Total number of failed applications.", 12 | "AllocatedMB": "Current allocated memory in MB.", 13 | "AllocatedVCores": "Current allocated CPU in virtual cores.", 14 | "AllocatedContainers": "Current number of allocated containers.", 15 | "AggregateContainersAllocated": "Total number of allocated containers.", 16 | "AggregateContainersReleased": "Total number of released containers.", 17 | "AvailableMB": "Current available memory in MB.", 18 | "AvailableVCores": "Current available CPU in virtual cores.", 19 | "PendingMB": "Current pending memory resource requests in MB that are not yet fulfilled by the scheduler.", 20 | "PendingVCores": "Current pending CPU allocation requests in virtual cores that are not yet fulfilled by the scheduler.", 21 | "PendingContainers": "Current pending resource requests that are not yet fulfilled by the scheduler.", 22 | "ReservedMB": "Current reserved memory in MB.", 23 | "ReservedVCores": "Current reserved CPU in virtual cores.", 24 | "ReservedContainers": "Current number of reserved containers.", 25 | "ActiveUsers": "Current number of active users.", 26 | "ActiveApplications": "Current number of active applications.", 27 | "FairShareMB": "(FairScheduler only) Current fair share of memory in MB.", 28 | "FairShareVCores": "(FairScheduler only) Current fair share of CPU in virtual cores.", 29 | "MinShareMB": 
"(FairScheduler only) Minimum share of memory in MB.", 30 | "MinShareVCores": "(FairScheduler only) Minimum share of CPU in virtual cores.", 31 | "MaxShareMB": "(FairScheduler only) Maximum share of memory in MB.", 32 | "MaxShareVCores": "(FairScheduler only) Maximum share of CPU in virtual cores." 33 | } -------------------------------------------------------------------------------- /metrics/resourcemanager/RMNMInfo.json: -------------------------------------------------------------------------------- 1 | { 2 | "NumContainers": "Total number of containers currently running on the host", 3 | "State": "State of the host - valid values are: NEW, RUNNING, UNHEALTHY, DECOMMISSIONED, LOST, REBOOTED", 4 | "UsedMemoryMB": "The total amount of memory currently used on the host (in MB)", 5 | "AvailableMemoryMB": "The total amount of memory currently available on the host (in MB)" 6 | } -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests 2 | prometheus_client 3 | pyyaml 4 | -------------------------------------------------------------------------------- /scraper.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import threading 5 | import requests 6 | 7 | from utils import get_module_logger 8 | 9 | 10 | logger = get_module_logger(__name__) 11 | 12 | 13 | class Scraper(threading.Thread): 14 | def __init__(self, url, result): 15 | super(Scraper, self).__init__() 16 | self.name = "thread-%s" % url 17 | self.url = url 18 | self.result = result 19 | 20 | def run(self): 21 | result = [] 22 | try: 23 | s = requests.session() 24 | response = s.get(self.url, timeout=5) 25 | except Exception as e: 26 | logger.warning("Get {0} failed, error: {1}.".format(self.url, str(e))) 27 | else: 28 | if response.status_code != requests.codes.ok: 29 | 
logger.warning("Get {0} failed, response code is: {1}.".format(self.url, response.status_code)) 30 | else: 31 | rlt = response.json() 32 | if rlt and "beans" in rlt: 33 | result = rlt['beans'] 34 | else: 35 | logger.warning("No metrics get in the {0}.".format(self.url)) 36 | s.close() 37 | if len(result) > 0: 38 | self.result.append(result) 39 | 40 | 41 | class ScrapeMetrics(object): 42 | def __init__(self, urls): 43 | self.urls = urls 44 | 45 | def scrape(self): 46 | result = [] 47 | tasks = [Scraper(url, result) for url in self.urls] 48 | for task in tasks: 49 | task.start() 50 | for task in tasks: 51 | task.join() 52 | return result 53 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import argparse 6 | import logging 7 | import yaml 8 | 9 | 10 | def get_module_logger(mod_name): 11 | logger = logging.getLogger(mod_name) 12 | logger.setLevel(logging.DEBUG) 13 | 14 | path = os.path.dirname(os.path.abspath(__file__)) 15 | par_path = os.path.dirname(path) 16 | fh = logging.FileHandler(os.path.join(par_path, "hadoop_jmx_exporter.log")) 17 | fh.setLevel(logging.INFO) 18 | 19 | sh = logging.StreamHandler() 20 | sh.setLevel(logging.INFO) 21 | 22 | fmt = logging.Formatter(fmt='%(asctime)s %(filename)s[line:%(lineno)d]-[%(levelname)s]: %(message)s') 23 | fh.setFormatter(fmt) 24 | sh.setFormatter(fmt) 25 | 26 | logger.addHandler(fh) 27 | logger.addHandler(sh) 28 | return logger 29 | 30 | 31 | logger = get_module_logger(__name__) 32 | 33 | def read_json_file(path_name, file_name): 34 | path = os.path.dirname(os.path.realpath(__file__)) 35 | metric_path = os.path.join(path, "metrics", path_name) 36 | metric_name = "{0}.json".format(file_name) 37 | try: 38 | with open(os.path.join(metric_path, metric_name), 'r') as f: 39 | metrics = yaml.safe_load(f) 40 | return metrics 41 | 
except Exception as e: 42 | logger.info("read metrics json file failed, error msg is: %s" % e) 43 | return {} 44 | 45 | 46 | def get_file_list(file_path_name): 47 | path = os.path.dirname(os.path.abspath(__file__)) 48 | json_path = os.path.join(path, "metrics", file_path_name) 49 | try: 50 | files = os.listdir(json_path) 51 | except OSError: 52 | logger.info("No such file or directory: '%s'" % json_path) 53 | return [] 54 | else: 55 | rlt = [] 56 | for i in range(len(files)): 57 | rlt.append(files[i].split(".json")[0]) 58 | return rlt 59 | 60 | 61 | def parse_args(): 62 | parser = argparse.ArgumentParser(description='hadoop jmx metric prometheus exporter') 63 | parser.add_argument('-cluster', required=True, metavar='cluster_name', help='Hadoop cluster name (maybe HA name)') 64 | parser.add_argument('-queue', required=False, metavar='yarn_queue_regexp', help='Regular expression of queue name. default: root.*', default='root.*') 65 | parser.add_argument('-nns', required=False, metavar='namenode_jmx_url', help='Hadoop hdfs namenode jmx metrics URL.', nargs="*") 66 | parser.add_argument('-rms', required=False, metavar='resourcemanager_jmx_url', help='Hadoop resourcemanager metrics jmx URL.', nargs="*") 67 | parser.add_argument('-jns', required=False, metavar='journalnode_jmx_url', help='Hadoop journalnode jmx metrics URL.', nargs="*") 68 | parser.add_argument('-host', required=False, metavar='host', help='Listen on this address. default: 0.0.0.0', default='0.0.0.0') 69 | parser.add_argument('-port', required=False, metavar='port', type=int, help='Listen to this port. 
def parse_args():
    """Define and parse the exporter's command-line interface.

    Only -cluster is mandatory; every url list (-nns/-rms/-jns) is optional
    so the exporter can be pointed at any subset of hadoop daemons.
    """
    parser = argparse.ArgumentParser(description='hadoop jmx metric prometheus exporter')
    add = parser.add_argument
    add('-cluster', required=True, metavar='cluster_name',
        help='Hadoop cluster name (maybe HA name)')
    add('-queue', required=False, metavar='yarn_queue_regexp', default='root.*',
        help='Regular expression of queue name. default: root.*')
    add('-nns', required=False, metavar='namenode_jmx_url', nargs="*",
        help='Hadoop hdfs namenode jmx metrics URL.')
    add('-rms', required=False, metavar='resourcemanager_jmx_url', nargs="*",
        help='Hadoop resourcemanager metrics jmx URL.')
    add('-jns', required=False, metavar='journalnode_jmx_url', nargs="*",
        help='Hadoop journalnode jmx metrics URL.')
    add('-host', required=False, metavar='host', default='0.0.0.0',
        help='Listen on this address. default: 0.0.0.0')
    add('-port', required=False, metavar='port', type=int, default=6688,
        help='Listen to this port. default: 6688')
    return parser.parse_args()
container_flag = 1 53 | for metric in self.metrics[service]: 54 | label = ["cluster", "host"] 55 | if metric.startswith("Containers"): 56 | if container_flag: 57 | container_flag = 0 58 | label.append("status") 59 | key = "containers" 60 | name = "_".join([self.prefix, "container_count"]) 61 | description = "Count of container" 62 | else: 63 | continue 64 | else: 65 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 66 | name = "_".join([self.prefix, snake_case]) 67 | key = metric 68 | description = self.metrics[service][metric] 69 | label.append("target") 70 | self.hadoop_nodemanager_metrics[service][key] = GaugeMetricFamily(name, description, labels=label) 71 | 72 | def get_metrics(self, beans): 73 | for i in range(len(beans)): 74 | for service in self.metrics: 75 | if service not in beans[i]['name']: 76 | continue 77 | for metric in beans[i]: 78 | if metric not in self.metrics[service]: 79 | continue 80 | label = [self.cluster, self.target] 81 | if metric.startswith("Containers"): 82 | key = "containers" 83 | label.append(metric.split("Containers")[1]) 84 | else: 85 | key = metric 86 | label.append(self.target) 87 | value = beans[i][metric] if beans[i][metric] > 0 else 0 # incase vcore or memory < 0 88 | self.hadoop_nodemanager_metrics[service][key].add_metric(label, value) 89 | -------------------------------------------------------------------------------- /yarn_resourcemanager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python2 2 | # -*- coding: utf-8 -*- 3 | 4 | import yaml 5 | import re 6 | from prometheus_client.core import GaugeMetricFamily 7 | 8 | from utils import get_module_logger 9 | from common import MetricCollector, CommonMetricCollector 10 | from scraper import ScrapeMetrics 11 | 12 | logger = get_module_logger(__name__) 13 | 14 | 15 | class ResourceManagerMetricCollector(MetricCollector): 16 | 17 | NODE_STATE = { 18 | 'NEW': 1, 19 | 'RUNNING': 2, 20 | 'UNHEALTHY': 3, 21 
| 'DECOMMISSIONED': 4, 22 | 'LOST': 5, 23 | 'REBOOTED': 6, 24 | } 25 | 26 | def __init__(self, cluster, urls, queue_regexp): 27 | MetricCollector.__init__(self, cluster, "yarn", "resourcemanager") 28 | self.target = "-" 29 | self.queue_regexp = queue_regexp 30 | self.nms = set() 31 | 32 | self.hadoop_resourcemanager_metrics = {} 33 | for i in range(len(self.file_list)): 34 | self.hadoop_resourcemanager_metrics.setdefault(self.file_list[i], {}) 35 | 36 | self.common_metric_collector = CommonMetricCollector(cluster, "yarn", "resourcemanager") 37 | 38 | self.scrape_metrics = ScrapeMetrics(urls) 39 | 40 | def collect(self): 41 | isSetup = False 42 | beans_list = self.scrape_metrics.scrape() 43 | for beans in beans_list: 44 | if not isSetup: 45 | self.common_metric_collector.setup_labels(beans) 46 | self.setup_metrics_labels(beans) 47 | isSetup = True 48 | for i in range(len(beans)): 49 | if 'tag.Hostname' in beans[i]: 50 | self.target = beans[i]["tag.Hostname"] 51 | break 52 | self.hadoop_resourcemanager_metrics.update(self.common_metric_collector.get_metrics(beans, self.target)) 53 | self.get_metrics(beans) 54 | 55 | for i in range(len(self.merge_list)): 56 | service = self.merge_list[i] 57 | if service in self.hadoop_resourcemanager_metrics: 58 | for metric in self.hadoop_resourcemanager_metrics[service]: 59 | yield self.hadoop_resourcemanager_metrics[service][metric] 60 | 61 | def setup_rmnminfo_labels(self): 62 | for metric in self.metrics['RMNMInfo']: 63 | label = ["cluster", "host", "version", "rack", "_target"] 64 | if 'NumContainers' in metric: 65 | name = "_".join([self.prefix, 'node_containers_total']) 66 | elif 'State' in metric: 67 | name = "_".join([self.prefix, 'node_state']) 68 | elif 'UsedMemoryMB' in metric: 69 | name = "_".join([self.prefix, 'node_memory_used_mb']) 70 | elif 'AvailableMemoryMB' in metric: 71 | name = "_".join([self.prefix, 'node_memory_available_mb']) 72 | else: 73 | continue 74 | 
self.hadoop_resourcemanager_metrics['RMNMInfo'][metric] = GaugeMetricFamily(name, self.metrics['RMNMInfo'][metric], labels=label) 75 | 76 | def setup_queue_labels(self): 77 | running_flag, mb_flag, vcore_flag, container_flag, apps_flag = 1, 1, 1, 1, 1 78 | for metric in self.metrics['QueueMetrics']: 79 | label = ["cluster", "modeler_type", "queue", "user"] 80 | if "running_" in metric: 81 | if running_flag: 82 | running_flag = 0 83 | label.append("elapsed_time") 84 | key = "running_app" 85 | name = "_".join([self.prefix, "running_app_total"]) 86 | description = "Current number of running applications in each elapsed time ( < 60min, 60min < x < 300min, 300min < x < 1440min and x > 1440min )" 87 | else: 88 | continue 89 | elif metric.endswith("VCores"): 90 | if vcore_flag: 91 | vcore_flag = 0 92 | label.append("status") 93 | key = "vcore" 94 | name = "_".join([self.prefix, "vcore_count"]) 95 | description = "Count of vcore" 96 | else: 97 | continue 98 | elif metric.endswith("Containers"): 99 | if container_flag: 100 | container_flag = 0 101 | label.append("status") 102 | key = "containers" 103 | name = "_".join([self.prefix, "container_count"]) 104 | description = "Count of container" 105 | else: 106 | continue 107 | elif metric.endswith("MB"): 108 | if mb_flag: 109 | mb_flag = 0 110 | label.append("status") 111 | key = "memory" 112 | name = "_".join([self.prefix, "memory_in_mb"]) 113 | description = "Memory in MB" 114 | else: 115 | continue 116 | elif metric.startswith("Apps"): 117 | if apps_flag: 118 | apps_flag = 0 119 | label.append("status") 120 | key = "apps" 121 | name = "_".join([self.prefix, "application_count"]) 122 | description = "Count of application" 123 | else: 124 | continue 125 | else: 126 | key = metric 127 | snake_case = re.sub('([a-z0-9])([A-Z])', r'\1_\2', metric).lower() 128 | name = "_".join([self.prefix, snake_case]) 129 | description = self.metrics['QueueMetrics'][metric] 130 | label.append("_target") 131 | 
self.hadoop_resourcemanager_metrics['QueueMetrics'][key] = GaugeMetricFamily(name, description, labels=label) 132 | 133 | def setup_cluster_labels(self): 134 | nm_flag, cm_num_flag, cm_avg_flag = 1, 1, 1 135 | for metric in self.metrics['ClusterMetrics']: 136 | if "NMs" in metric: 137 | if nm_flag: 138 | nm_flag = 0 139 | label = ["cluster", "status"] 140 | key = "NMs" 141 | name = "nodemanager_total" 142 | description = "Current number of NodeManagers in each status" 143 | else: 144 | continue 145 | elif "NumOps" in metric: 146 | if cm_num_flag: 147 | cm_num_flag = 0 148 | label = ["cluster", "oper"] 149 | key = "NumOps" 150 | name = "ams_total" 151 | description = "Total number of Applications Masters in each operation" 152 | else: 153 | continue 154 | elif "AvgTime" in metric: 155 | if cm_avg_flag: 156 | cm_avg_flag = 0 157 | label = ["cluster", "oper"] 158 | key = "AvgTime" 159 | name = "average_time_milliseconds" 160 | description = "Average time in milliseconds AM spends in each operation" 161 | else: 162 | continue 163 | else: 164 | key = metric 165 | name = metric 166 | description = self.metrics['ClusterMetrics'][metric] 167 | label = ["cluster"] 168 | label.append("_target") 169 | self.hadoop_resourcemanager_metrics['ClusterMetrics'][key] = GaugeMetricFamily("_".join([self.prefix, name]), description, labels=label) 170 | 171 | def setup_metrics_labels(self, beans): 172 | for i in range(len(beans)): 173 | if 'RMNMInfo' in beans[i]['name']: 174 | self.setup_rmnminfo_labels() 175 | if 'QueueMetrics' in self.metrics: 176 | self.setup_queue_labels() 177 | if 'ClusterMetrics' in self.metrics: 178 | self.setup_cluster_labels() 179 | 180 | def get_rmnminfo_metrics(self, bean): 181 | for metric in self.metrics['RMNMInfo']: 182 | nms = set() 183 | live_nm_list = yaml.safe_load(bean['LiveNodeManagers']) 184 | for j in range(len(live_nm_list)): 185 | nms.add("http://"+live_nm_list[j]["NodeHTTPAddress"]+"/jmx") 186 | host = live_nm_list[j]['HostName'] 187 | version = 
live_nm_list[j]['NodeManagerVersion'] 188 | rack = live_nm_list[j]['Rack'] 189 | label = [self.cluster, host, version, rack, self.target] 190 | if 'State' == metric: 191 | value = self.NODE_STATE[live_nm_list[j]['State']] 192 | else: 193 | value = live_nm_list[j][metric] if metric in live_nm_list[j] else 0.0 194 | self.hadoop_resourcemanager_metrics['RMNMInfo'][metric].add_metric(label, value) 195 | self.nms = nms 196 | 197 | def get_queue_metrics(self, bean): 198 | for metric in self.metrics['QueueMetrics']: 199 | label = [self.cluster, bean.get("modelerType", "-"), bean.get("tag.Queue", "-"), bean.get("tag.User", "-")] 200 | if "running_0" in metric: 201 | key = "running_app" 202 | label.append("0to60") 203 | elif "running_60" in metric: 204 | key = "running_app" 205 | label.append("60to300") 206 | elif "running_300" in metric: 207 | key = "running_app" 208 | label.append("300to1440") 209 | elif "running_1440" in metric: 210 | key = "running_app" 211 | label.append("1440up") 212 | elif metric.endswith("VCores"): 213 | label.append(metric.split("VCores")[0]) 214 | key = "vcore" 215 | elif metric.endswith("Containers"): 216 | label.append(metric.split("Containers")[0]) 217 | key = "containers" 218 | elif metric.endswith("MB"): 219 | label.append(metric.split("MB")[0]) 220 | key = "memory" 221 | elif metric.startswith("Apps"): 222 | label.append(metric.split("Apps")[1]) 223 | key = "apps" 224 | else: 225 | key = metric 226 | label.append(self.target) 227 | self.hadoop_resourcemanager_metrics['QueueMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) 228 | 229 | def get_cluster_metrics(self, bean): 230 | for metric in self.metrics['ClusterMetrics']: 231 | label = [self.cluster] 232 | if "NMs" in metric: 233 | label.append(metric.split('NMs')[0].split('Num')[1]) 234 | key = "NMs" 235 | elif "NumOps" in metric: 236 | key = "NumOps" 237 | label.append(metric.split("DelayNumOps")[0].split('AM')[1]) 238 | elif "AvgTime" in metric: 239 | key = "AvgTime" 
240 | label.append(metric.split("DelayAvgTime")[0].split('AM')[1]) 241 | else: 242 | continue 243 | label.append(self.target) 244 | self.hadoop_resourcemanager_metrics['ClusterMetrics'][key].add_metric(label, bean[metric] if metric in bean else 0) 245 | 246 | def get_metrics(self, beans): 247 | for i in range(len(beans)): 248 | if 'RMNMInfo' in beans[i]['name']: 249 | self.get_rmnminfo_metrics(beans[i]) 250 | if 'name=QueueMetrics' in beans[i]['name'] and re.match(self.queue_regexp, beans[i]['tag.Queue']): 251 | self.get_queue_metrics(beans[i]) 252 | if 'ClusterMetrics' in beans[i]['name']: 253 | self.get_cluster_metrics(beans[i]) 254 | --------------------------------------------------------------------------------