├── .dockerignore ├── .gitignore ├── Dockerfile ├── README.md ├── build.sh ├── dashboards ├── hdfs.json └── yarn.json ├── entrypoint.sh ├── examples ├── config-example.yaml └── docker-compose.yaml ├── hadoop_exporter ├── __init__.py ├── common.py ├── datanode.py ├── exporter.py ├── hiveserver2.py ├── journalnode.py ├── mapping.py ├── namenode.py ├── nodemanager.py ├── resourcemanager.py └── utils.py ├── metrics ├── common.yaml ├── datanode.yaml ├── hiveserver2.yaml ├── journalnode.yaml ├── namenode.yaml ├── nodemanager.yaml └── resourcemanager.yaml ├── requirements.txt ├── service.py └── test ├── datanode.json ├── namenode.json ├── nodemanager.json └── resourcemanager.json /.dockerignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .idea 3 | 4 | __pycache__ 5 | *.egg-info 6 | *.egg/ 7 | *.pyc 8 | *.swp 9 | 10 | test 11 | build.sh 12 | 13 | .git 14 | .gitignore 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .vscode 2 | .idea 3 | local 4 | build 5 | 6 | __pycache__ 7 | *.pyc 8 | 9 | .mypy_cache 10 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.7-slim-buster 2 | 3 | LABEL maintainer="vqcuong96@gmail.com" 4 | 5 | ENV container=docker 6 | 7 | ADD requirements.txt /tmp 8 | 9 | ENV EXPORTER_PORT=9123 \ 10 | EXPORTER_HOME=/exporter \ 11 | EXPORTER_METRICS_DIR=/exporter/metrics \ 12 | EXPORTER_LOGS_DIR=/exporter/logs 13 | 14 | RUN set -ex \ 15 | && apt-get update \ 16 | && apt-get install --no-install-recommends net-tools dos2unix dumb-init -y \ 17 | && pip install -r /tmp/requirements.txt \ 18 | && mkdir -p ${EXPORTER_HOME} ${EXPORTER_LOGS_DIR} \ 19 | && rm -rf /tmp/* \ 20 | && rm -rf /var/lib/apt/lists/* /var/log/dpkg.log \ 21 | && apt-get autoremove -yqq --purge \ 22 | && apt-get clean 23 | 24 | ADD hadoop_exporter /exporter/hadoop_exporter 25 | ADD metrics /exporter/metrics 26 | ADD service.py /service.py 27 | ADD entrypoint.sh /entrypoint.sh 28 | 29 | RUN set -ex \ 30 | && chmod +x /entrypoint.sh /service.py 31 | 32 | ENV PYTHONPATH=${PYTHONPATH}:${EXPORTER_HOME} 33 | EXPOSE ${EXPORTER_PORT} 34 | ENTRYPOINT ["/usr/bin/dumb-init", "--", "/entrypoint.sh"] 35 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hadoop Prometheus Exporter 2 | A Hadoop metrics exporter for common Hadoop components. Currently, I've implemented collectors for HDFS NameNode, HDFS DataNode, HDFS JournalNode, YARN ResourceManager and YARN NodeManager. This is the Python version; you can find a Go version [here](https://github.com/vqcuong/hadoop_metric_exporter). 3 | 4 | ## How it works 5 | - Consumes metrics from the JMX HTTP endpoints, then converts and exports Hadoop metrics via HTTP for Prometheus consumption. 6 | - Under the hood, I use regex templates to parse and map metric names and labels before exposing them via the Prometheus HTTP server. You can see my templates in the [metrics](./metrics) folder. 7 |
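For illustration, here is a rough sketch (plain Python, using a rule from [metrics/namenode.yaml](./metrics/namenode.yaml) and a hypothetical bean attribute) of the substitution the exporter performs when it maps a JMX bean attribute to a Prometheus metric:
```
import re

# Rule taken from metrics/namenode.yaml; the bean attribute below is a made-up example.
group_pattern = r"Hadoop:service=NameNode,name=(FSNamesystem)$"
rule = {"pattern": r"^(Capacity)(.+)", "name": "$1_$2", "labels": {"type": "$3"}}

bean_name = "Hadoop:service=NameNode,name=FSNamesystem"
attribute = "CapacityRemaining"

# Same scheme as MetricCollector._convert_metrics in hadoop_exporter/common.py:
# join the bean name and the attribute, then substitute the captured groups into the templates.
pattern = re.compile("{}<>{}".format(group_pattern.rstrip("$"), rule["pattern"].lstrip("^")))
concat = "{}<>{}".format(bean_name, attribute)

metric = "hadoop_hdfs_namenode_" + pattern.sub(rule["name"].replace("$", "\\"), concat).lower()
label = pattern.sub(rule["labels"]["type"].replace("$", "\\"), concat).lower()

print(metric, {"type": label})  # hadoop_hdfs_namenode_fsnamesystem_capacity {'type': 'remaining'}
```
In the real exporter the common `cluster` label (and, for most collectors, a `host` label) is added to every sample as well.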
8 | ## How to run 9 | ``` 10 | python service.py 11 | ``` 12 | 13 | Help on flags of hadoop_exporter: 14 | ``` 15 | $ python service.py -h 16 | usage: service.py [-h] [-cfg CONFIG] [-c CLUSTER_NAME] [-nn NAMENODE_JMX] 17 | [-dn DATANODE_JMX] [-jn JOURNALNODE_JMX] 18 | [-rm RESOURCEMANAGER_JMX] [-nm NODEMANAGER_JMX] 19 | [-mrjh MAPRED_JOBHISTORY_JMX] [-hm HMASTER_JMX] 20 | [-hr HREGION_JMX] [-hs2 HIVESERVER2_JMX] 21 | [-hllap HIVELLAP_JMX] [-ad AUTO_DISCOVERY] 22 | [-adw DISCOVERY_WHITELIST] [-addr ADDRESS] [-p PORT] 23 | [--path PATH] [--period PERIOD] [--log-level LOG_LEVEL] 24 | 25 | optional arguments: 26 | -h, --help show this help message and exit 27 | -cfg CONFIG Exporter config file (default: /exporter/config.yaml) 28 | -c CLUSTER_NAME Hadoop cluster labels. (default "hadoop_cluster") 29 | -nn NAMENODE_JMX List of HDFS namenode JMX url. (example 30 | "http://localhost:9870/jmx") 31 | -dn DATANODE_JMX List of HDFS datanode JMX url. (example 32 | "http://localhost:9864/jmx") 33 | -jn JOURNALNODE_JMX List of HDFS journalnode JMX url. (example 34 | "http://localhost:8480/jmx") 35 | -rm RESOURCEMANAGER_JMX 36 | List of YARN resourcemanager JMX url. (example 37 | "http://localhost:8088/jmx") 38 | -nm NODEMANAGER_JMX List of YARN nodemanager JMX url. (example 39 | "http://localhost:8042/jmx") 40 | -mrjh MAPRED_JOBHISTORY_JMX 41 | List of Mapreduce jobhistory JMX url. (example 42 | "http://localhost:19888/jmx") 43 | -hm HMASTER_JMX List of HBase master JMX url. (example 44 | "http://localhost:16010/jmx") 45 | -hr HREGION_JMX List of HBase regionserver JMX url. (example 46 | "http://localhost:16030/jmx") 47 | -hs2 HIVESERVER2_JMX List of HiveServer2 JMX url. (example 48 | "http://localhost:10002/jmx") 49 | -hllap HIVELLAP_JMX List of Hive LLAP JMX url. (example 50 | "http://localhost:15002/jmx") 51 | -ad AUTO_DISCOVERY Enable auto discovery if set true else false. (example 52 | "-ad true") (default: false) 53 | -adw DISCOVERY_WHITELIST 54 | List of service shortnames (namenode: nn, datanode: dn, ...) 55 | that auto discovery should be enabled for 56 | -addr ADDRESS Polling server on this address. (default "0.0.0.0") 57 | -p PORT Listen to this port. (default "9123") 58 | --path PATH Path under which to expose metrics. (default 59 | "/metrics") 60 | --period PERIOD Period (seconds) to consume jmx service. (default: 30) 61 | --log-level LOG_LEVEL Log level, include: all, debug, info, warn, error (default: info) 62 | ``` 63 |
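Each `*_JMX` flag accepts a comma-separated list of URLs, so a single exporter can watch several daemons of the same kind. For example (the hostnames are placeholders):
```
python service.py -c hadoop_prod \
  -nn http://nn1:9870/jmx,http://nn2:9870/jmx \
  -rm http://rm1:8088/jmx,http://rm2:8088/jmx \
  -p 9123
```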
64 | You can use a config file (YAML format) instead of the command-line args. Example of config.yaml: 65 | ``` 66 | # exporter server config 67 | server: 68 | address: 127.0.0.1 # address to run exporter 69 | port: 9123 # port to listen 70 | 71 | # list of jmx services to scrape metrics from 72 | jmx: 73 | - cluster: hadoop_prod 74 | services: 75 | namenode: 76 | - http://nn1:9870/jmx 77 | datanode: 78 | - http://dn1:9864/jmx 79 | - http://dn2:9864/jmx 80 | - http://dn3:9864/jmx 81 | resourcemanager: 82 | - http://rm1:8088/jmx 83 | nodemanager: 84 | - http://nm1:8042/jmx 85 | - http://nm2:8042/jmx 86 | - http://nm3:8042/jmx 87 | hiveserver2: 88 | - http://hs2:10002/jmx 89 | hmaster: 90 | - http://hmaster1:16010/jmx 91 | - http://hmaster2:16010/jmx 92 | - http://hmaster3:16010/jmx 93 | hregionserver: 94 | - http://hregion1:16030/jmx 95 | - http://hregion2:16030/jmx 96 | - http://hregion3:16030/jmx 97 | - cluster: hadoop_dev 98 | services: 99 | namenode: 100 | - http://dev:9870/jmx 101 | datanode: 102 | - http://dev:9864/jmx 103 | resourcemanager: 104 | - http://dev:8088/jmx 105 | nodemanager: 106 | - http://dev:8042/jmx 107 | ``` 108 | 109 | Tested on Apache Hadoop 2.7.3, 3.3.0, 3.3.1, 3.3.2 110 | 111 | ## Grafana Monitoring 112 | I've prepared [HDFS](./dashboards/hdfs.json) and [YARN](./dashboards/yarn.json) dashboard definitions. You can import them directly into Grafana. 113 | 114 | ## Docker deployment 115 | 116 | Run container: 117 | ``` 118 | docker run -d \ 119 | --name hadoop-exporter \ 120 | vqcuong96/hadoop_exporter \ 121 | -nn http://localhost:9870/jmx \ 122 | -rm http://localhost:8088/jmx 123 | ``` 124 | 125 | You can also mount a config file into the docker container: 126 | ``` 127 | docker run -d \ 128 | --name hadoop_exporter \ 129 | --mount type=bind,source=/path/to/config.yaml,target=/tmp/config.yaml \ 130 | vqcuong96/hadoop_exporter \ 131 | -cfg /tmp/config.yaml 132 | ``` 133 | 134 | To build your own images, run: 135 | ``` 136 | ./build.sh [your_repo] [your_version_tag] 137 | ``` 138 | 139 | Example: 140 | ``` 141 | ./build.sh mydockerhub/ latest 142 | #your image will look like: mydockerhub/hadoop_exporter:latest 143 | ``` 144 |
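## Prometheus configuration
To let Prometheus collect the exported metrics, add a scrape job pointing at the exporter. A minimal example (assuming the exporter is reachable at `hadoop-exporter:9123` and serves the default `/metrics` path):
```
scrape_configs:
  - job_name: hadoop
    metrics_path: /metrics
    scrape_interval: 30s
    static_configs:
      - targets: ["hadoop-exporter:9123"]
```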
-------------------------------------------------------------------------------- /build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | base_repo=${1} 4 | version=${2:-"latest"} 5 | docker build -t ${base_repo}hadoop_exporter:${version} . 6 | -------------------------------------------------------------------------------- /entrypoint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [[ "x$1" != "x" && "$1" != -* ]]; then 4 | exec "$@" 5 | else 6 | /service.py "$@" 7 | fi 8 | -------------------------------------------------------------------------------- /examples/config-example.yaml: -------------------------------------------------------------------------------- 1 | # exporter server config 2 | server: 3 | address: 127.0.0.1 # address to run exporter 4 | port: 9123 # port to listen 5 | 6 | # list of jmx services to scrape metrics from 7 | jmx: 8 | - cluster: hadoop_prod 9 | services: 10 | namenode: 11 | - http://nn1:9870/jmx 12 | datanode: 13 | - http://dn1:9864/jmx 14 | - http://dn2:9864/jmx 15 | - http://dn3:9864/jmx 16 | resourcemanager: 17 | - http://rm1:8088/jmx 18 | nodemanager: 19 | - http://nm1:8042/jmx 20 | - http://nm2:8042/jmx 21 | - http://nm3:8042/jmx 22 | hiveserver2: 23 | - http://hs2:10002/jmx 24 | hmaster: 25 | - http://hmaster1:16010/jmx 26 | - http://hmaster2:16010/jmx 27 | - http://hmaster3:16010/jmx 28 | hregionserver: 29 | - http://hregion1:16030/jmx 30 | - http://hregion2:16030/jmx 31 | - http://hregion3:16030/jmx 32 | 33 | - cluster: hadoop_dev 34 | services: 35 | namenode: 36 | - http://dev:9870/jmx 37 | datanode: 38 | - http://dev:9864/jmx 39 | resourcemanager: 40 | - http://dev:8088/jmx 41 | nodemanager: 42 | - http://dev:8042/jmx 43 | -------------------------------------------------------------------------------- /examples/docker-compose.yaml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | 3 | services: 4 | hadoop_exporter: 5 | image: vqcuong96/hadoop_exporter 6 | container_name: hadoop_exporter 7 | ports: 8 | - 9123:9123 9 | volumes: 10 | - ./config-example.yaml:/exporter/config.yaml 11 | - exporter_logs:/exporter/logs 12 | volumes: 13 | exporter_logs: 14 | external: false 15 | -------------------------------------------------------------------------------- /hadoop_exporter/__init__.py: -------------------------------------------------------------------------------- 1 | from hadoop_exporter.namenode import HDFSNameNodeMetricCollector 2 | from hadoop_exporter.datanode import HDFSDataNodeMetricCollector 3 | from hadoop_exporter.journalnode import HDFSJournalNodeMetricCollector 4 | from hadoop_exporter.resourcemanager import YARNResourceManagerMetricCollector 5 | from hadoop_exporter.nodemanager import YARNNodeManagerMetricCollector 6 | from hadoop_exporter.hiveserver2 import HiveServer2MetricCollector 7 | -------------------------------------------------------------------------------- /hadoop_exporter/common.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import re 6 | import traceback 7 | from logging import Logger 8 | from typing import Any, List, Dict, Optional, Union 9 | from prometheus_client.core import GaugeMetricFamily 10 | from hadoop_exporter import utils 11 | 12 | EXPORTER_METRICS_DIR = os.environ.get('EXPORTER_METRICS_DIR', 'metrics') 13 | 14 | 15 | class MetricCollector(object): 16 | ''' 17 | MetricCollector is the base class of all concrete metric collector classes. It sets up common params like cluster, url, component and service.
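Metric rules are loaded from YAML files under EXPORTER_METRICS_DIR: {service}.yaml for the specific service plus the shared common.yaml. Each rule maps a bean-name regex (with capture groups) to a list of metric definitions whose keys are pattern (attribute regex), type (only GAUSE, i.e. gauge, is currently supported), name (a template filled from the capture groups), and the optional labels, help and mapping entries; see metrics/namenode.yaml for concrete examples.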
18 | ''' 19 | NON_METRIC_NAMES = ["name", "modelerType", "Name", "ObjectName"] 20 | 21 | def __init__(self, cluster: str, urls: Union[str, List[str]], component: str, service: str, logger: Logger = None): 22 | ''' 23 | @param cluster: Cluster name, registered in the config file or ran in the command-line. 24 | @param urls: List of JMX url of each unique serivce corresponding to each component 25 | e.g. hdfs namenode metrics can be scraped in list: [http://namenode1:9870/jmx. http://namenode2:9870/jmx] 26 | @param component: Component name. e.g. "hdfs", "yarn" 27 | @param service: Service name. e.g. "namenode", "datanode", "resourcemanager", "nodemanager" 28 | ''' 29 | 30 | self._logger = logger or utils.get_logger() 31 | self._cluster = cluster 32 | self._component = component 33 | self._service = service 34 | self._urls = list(map(lambda url: url.rstrip('/'), urls.split(",") if isinstance(urls, str) else urls)) 35 | self._prefix = f"hadoop_{component}_{service}" 36 | 37 | cfg = utils.read_yaml_file(os.path.join(EXPORTER_METRICS_DIR, f"{service}.yaml")) 38 | common_cfg = utils.read_yaml_file(os.path.join(EXPORTER_METRICS_DIR, 'common.yaml')) 39 | 40 | self._rules = cfg.get("rules", {}) if cfg is not None else {} 41 | if common_cfg is not None: 42 | self._rules.update(common_cfg.get("rules", {})) 43 | self._lower_name = cfg.get("lowercaseOutputName", True) 44 | self._lower_label = cfg.get("lowercaseOutputLabel", True) 45 | self._common_labels = {} 46 | self._first_get_common_labels = {} 47 | for url in self._urls: 48 | self._first_get_common_labels[url] = True 49 | self._metrics = {} 50 | 51 | 52 | def collect(self): 53 | for group_pattern in self._rules: 54 | self._metrics[group_pattern] = {} 55 | for url in self._urls: 56 | try: 57 | beans = utils.get_metrics(url) 58 | except: 59 | self._logger.info( 60 | "Can't scrape metrics from url: {0}".format(url)) 61 | pass 62 | else: 63 | if self._first_get_common_labels[url]: 64 | self._common_labels[url] = {"names": [], "values": []} 65 | self._get_common_labels(beans, url) 66 | self._convert_metrics(beans, url) 67 | 68 | for group_metrics in self._metrics.values(): 69 | for metric in group_metrics.values(): 70 | yield metric 71 | 72 | 73 | def _get_common_labels(self, beans: List[Dict], url: str): 74 | self._first_get_common_labels[url] = False 75 | self._common_labels[url]["names"].append("cluster") 76 | self._common_labels[url]["values"].append(self._cluster) 77 | 78 | 79 | def _convert_metrics(self, beans: List[Dict], url: str): 80 | # loop for each group metric 81 | for bean in beans: 82 | for group_pattern in self._rules: 83 | if not re.compile(group_pattern).match(bean["name"]): 84 | continue 85 | for metric_name, metric_value in bean.items(): 86 | if metric_name in self.NON_METRIC_NAMES: 87 | continue 88 | # loop for each metric defined in each group 89 | for metric_def in self._rules[group_pattern]: 90 | if metric_def["type"] != "GAUSE": 91 | self._logger.warning( 92 | "Metric type {} not supported currently".format(metric_def["type"])) 93 | continue 94 | if re.compile(metric_def["pattern"]).match(metric_name): 95 | pattern = re.compile("{}<>{}".format( 96 | group_pattern.rstrip("$"), metric_def["pattern"].lstrip("^"))) 97 | concat_str = "{}<>{}".format(bean["name"], metric_name) 98 | sub_name = pattern.sub(metric_def["name"].replace("$", "\\"), concat_str) 99 | sub_label_names = [label for label in metric_def["labels"].keys()] \ 100 | if "labels" in metric_def else [] 101 | metric_identifier = '_'.join([sub_name] + 
sorted(sub_label_names)).lower() 102 | if metric_identifier not in self._metrics[group_pattern]: 103 | name = "_".join([self._prefix, sub_name]) 104 | if self._lower_name: name = name.lower() 105 | label_names = self._common_labels[url]["names"] + sub_label_names 106 | if self._lower_label: label_names = [l.lower() for l in label_names] 107 | docs = name if "help" not in metric_def \ 108 | else pattern.sub(metric_def["help"].replace("$", "\\"), concat_str) 109 | try: 110 | metric = GaugeMetricFamily(name, docs, labels=label_names) 111 | except: 112 | self._logger.warning("Error while create new metric") 113 | traceback.print_exc() 114 | else: 115 | self._metrics[group_pattern][metric_identifier] = metric 116 | 117 | if metric_identifier in self._metrics[group_pattern]: 118 | sub_label_values = [pattern.sub(label.replace("$", "\\"), concat_str) 119 | for label in metric_def["labels"].values()] if "labels" in metric_def else [] 120 | label_values = self._common_labels[url]["values"] + sub_label_values 121 | if self._lower_label: label_values = [l.lower() for l in label_values] 122 | try: 123 | resolved_value = self._resolve_value(metric_value, metric_def.get("mapping", None)) 124 | except: 125 | self._logger.warn("Unparseble metric: {} - {} = {}".format(bean["name"], metric_name, metric_value)) 126 | else: 127 | self._metrics[group_pattern][metric_identifier].add_metric(label_values, resolved_value) 128 | break 129 | 130 | 131 | def _resolve_value(self, value: Any, mapping: Optional[str]) -> Any: 132 | if mapping: 133 | import importlib 134 | mod_name, func_name = mapping.rsplit('.',1) 135 | mod = importlib.import_module(mod_name) 136 | func = getattr(mod, func_name) 137 | return func(value) 138 | return value 139 | 140 | 141 | def _find_bean(self, beans: List[Dict], group_pattern: str) -> Optional[Dict]: 142 | regex = re.compile(group_pattern) 143 | for bean in beans: 144 | if regex.match(bean["name"]): 145 | return bean 146 | return None 147 | -------------------------------------------------------------------------------- /hadoop_exporter/datanode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from typing import List, Union, Dict 5 | from hadoop_exporter import utils 6 | from hadoop_exporter.common import MetricCollector 7 | 8 | 9 | class HDFSDataNodeMetricCollector(MetricCollector): 10 | COMPONENT = "hdfs" 11 | SERVICE = "datanode" 12 | 13 | def __init__(self, cluster, urls: Union[str, List[str]]): 14 | logger = utils.get_logger(__name__, log_file=f"{self.COMPONENT}_{self.SERVICE}.log") 15 | MetricCollector.__init__( 16 | self, cluster, urls, self.COMPONENT, self.SERVICE, logger) 17 | 18 | def _get_common_labels(self, beans: List[Dict], url: str): 19 | super()._get_common_labels(beans, url) 20 | 21 | bean = self._find_bean(beans, "Hadoop:service=DataNode,name=JvmMetrics") 22 | if bean: 23 | self._common_labels[url]["names"].append("host") 24 | self._common_labels[url]["values"].append(bean["tag.Hostname"]) 25 | -------------------------------------------------------------------------------- /hadoop_exporter/exporter.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import traceback 4 | from typing import Callable, Dict, List, Optional, Union 5 | from prometheus_client.core import REGISTRY 6 | from prometheus_client import start_http_server 7 | import yaml 8 | from hadoop_exporter import utils 9 | from 
hadoop_exporter.common import MetricCollector 10 | from hadoop_exporter import ( 11 | HDFSNameNodeMetricCollector, 12 | HDFSDataNodeMetricCollector, 13 | HDFSJournalNodeMetricCollector, 14 | YARNResourceManagerMetricCollector, 15 | YARNNodeManagerMetricCollector, 16 | HiveServer2MetricCollector 17 | # MapredJobHistoryMetricCollector, \ 18 | # HBaseMasterMetricCollector, \ 19 | # HBaseRegionServerMetricCollector, \ 20 | # HiveServer2MetricCollector, \ 21 | # HiveLlapDaemonMetricCollector 22 | ) 23 | 24 | logger = utils.get_logger(__name__) 25 | 26 | EXPORTER_CLUSTER_NAME_DEFAULT = 'hadoop_cluster' 27 | EXPORTER_ADDRESS_DEFAULT = '0.0.0.0' 28 | EXPORTER_PORT_DEFAULT = 9123 29 | EXPORTER_PATH_DEFAULT = '/metrics' 30 | EXPORTER_PERIOD_DEFAULT=30 31 | EXPORTER_CONFIG_DEFAULT='/exporter/config.yaml' 32 | EXPORTER_LOG_LEVEL_DEFAULT='info' 33 | 34 | 35 | class ExporterEnv: 36 | EXPORTER_CONFIG = os.environ.get('EXPORTER_CONFIG', EXPORTER_CONFIG_DEFAULT) 37 | EXPORTER_CLUSTER_NAME = os.environ.get( 38 | 'EXPORTER_CLUSTER_NAME', EXPORTER_CLUSTER_NAME_DEFAULT) 39 | EXPORTER_NAMENODE_JMX = os.environ.get('EXPORTER_NAMENODE_JMX', None) 40 | EXPORTER_DATANODE_JMX = os.environ.get('EXPORTER_DATANODE_JMX', None) 41 | EXPORTER_JOURNALNODE_JMX = os.environ.get('EXPORTER_JOURNALNODE_JMX', None) 42 | EXPORTER_RESOURCEMANAGER_JMX = os.environ.get( 43 | 'EXPORTER_RESOURCEMANAGER_JMX', None) 44 | EXPORTER_NODEMANAGER_JMX = os.environ.get('EXPORTER_NODEMANAGER_JMX', None) 45 | EXPORTER_MAPRED_JOBHISTORY_JMX = os.environ.get( 46 | 'EXPORTER_MAPRED_JOBHISTORY_JMX', None) 47 | EXPORTER_HMASTER_JMX = os.environ.get('EXPORTER_HMASTER_JMX', None) 48 | EXPORTER_HREGION_JMX = os.environ.get('EXPORTER_HREGION_JMX', None) 49 | EXPORTER_HIVESERVER2_JMX = os.environ.get('EXPORTER_HIVESERVER2_JMX', None) 50 | EXPORTER_HIVELLAP_JMX = os.environ.get('EXPORTER_HIVELLAP_JMX', None) 51 | EXPORTER_AUTO_DISCOVERY = os.environ.get( 52 | 'EXPORTER_AUTO_DISCOVERY', 'false') 53 | EXPORTER_DISCOVERY_WHITELIST = os.environ.get( 54 | 'EXPORTER_DISCOVERY_WHITELIST', None) 55 | EXPORTER_ADDRESS = os.environ.get( 56 | 'EXPORTER_ADDRESS', EXPORTER_ADDRESS_DEFAULT) 57 | EXPORTER_PORT = os.environ.get('EXPORTER_PORT', EXPORTER_PORT_DEFAULT) 58 | EXPORTER_PATH = os.environ.get('EXPORTER_PATH', EXPORTER_PATH_DEFAULT) 59 | EXPORTER_PERIOD = os.environ.get('EXPORTER_PERIOD', EXPORTER_PERIOD_DEFAULT) 60 | EXPORTER_LOG_LEVEL = os.environ.get('EXPORTER_LOG_LEVEL', EXPORTER_LOG_LEVEL_DEFAULT) 61 | 62 | 63 | class Service: 64 | def __init__(self, cluster: str, urls: List[str], collector: Callable = MetricCollector, name: Optional[str] = None) -> None: 65 | self.collector = collector 66 | self.urls = urls 67 | self.cluster = cluster 68 | self.flag = True 69 | self.name = name 70 | 71 | def register(self): 72 | if self.flag: 73 | logger.info("register new {} listen from {}".format( 74 | self.collector.__name__, self.urls)) 75 | REGISTRY.register(self.collector( 76 | cluster=self.cluster, urls=self.urls)) 77 | self.flag = not self.flag 78 | 79 | def __str__(self) -> str: 80 | return "(cluster: {}, url: {}, collector: {}{})".format( 81 | self.cluster, self.urls, self.collector.__name__, f', name: {self.name}' if self.name else '') 82 | 83 | 84 | class Exporter: 85 | COLLECTOR_MAPPING = { 86 | 'namenode': HDFSNameNodeMetricCollector, 87 | 'datanode': HDFSDataNodeMetricCollector, 88 | 'journalnode': HDFSJournalNodeMetricCollector, 89 | 'resourcemanager': YARNResourceManagerMetricCollector, 90 | 'nodemanager': YARNNodeManagerMetricCollector, 91 | 'hiveserver2': 
HiveServer2MetricCollector, 92 | # 'llapdaemon': HiveLlapDaemonMetricCollector, 93 | # 'master': HBaseMasterMetricCollector, 94 | # 'regionserver': HBaseRegionServerMetricCollector 95 | } 96 | 97 | def __init__(self) -> None: 98 | args = utils.parse_args() 99 | self.log_level = (args.log_level or ExporterEnv.EXPORTER_LOG_LEVEL).upper() 100 | self.config = args.config or ExporterEnv.EXPORTER_CONFIG 101 | self.auto_discovery = False 102 | self.discovery_whitelist = [] 103 | if os.path.exists(self.config): 104 | logger.info("Use provided config: {}".format(self.config)) 105 | try: 106 | with open(self.config, 'r') as f: 107 | cfg = yaml.safe_load(f) 108 | except: 109 | logger.error("Something wrong when load config file") 110 | traceback.print_exc() 111 | else: 112 | server = cfg.get('server', {}) 113 | self.address = server.get('address', EXPORTER_ADDRESS_DEFAULT) 114 | self.port = int(server.get('port', EXPORTER_PORT_DEFAULT)) 115 | self.path = server.get('path', ExporterEnv.EXPORTER_PATH) 116 | self.period = int(server.get('period', ExporterEnv.EXPORTER_PERIOD)) 117 | self.sevices: List[Service] = [] 118 | 119 | jmx = cfg.get('jmx', []) 120 | for js in jmx: 121 | try: 122 | services = self._build_service_from_config(js) 123 | if services: 124 | self.sevices.extend(services) 125 | except: 126 | logger.warning(f'Error when parse jmx_service: {js}') 127 | traceback.print_exc() 128 | else: 129 | logger.info("Config file: {} doesn't existed. Ignore".format(self.config)) 130 | self.address = args.address or ExporterEnv.EXPORTER_ADDRESS 131 | self.port = int(args.port or ExporterEnv.EXPORTER_PORT) 132 | self.path = args.path or ExporterEnv.EXPORTER_PATH 133 | self.period = int(args.period or ExporterEnv.EXPORTER_PERIOD) 134 | self.sevices: List[Service] = [] 135 | 136 | if (args.auto_discovery or ExporterEnv.EXPORTER_AUTO_DISCOVERY).lower() == 'true': 137 | self.auto_discovery = True 138 | 139 | self.discovery_whitelist = args.discovery_whitelist or ExporterEnv.EXPORTER_DISCOVERY_WHITELIST 140 | 141 | cluster_name = args.cluster_name or ExporterEnv.EXPORTER_CLUSTER_NAME 142 | namenode_jmx = args.namenode_jmx or ExporterEnv.EXPORTER_NAMENODE_JMX 143 | datanode_jmx = args.datanode_jmx or ExporterEnv.EXPORTER_DATANODE_JMX 144 | journalnode_jmx = args.journalnode_jmx or ExporterEnv.EXPORTER_JOURNALNODE_JMX 145 | resourcemanager_jmx = args.resourcemanager_jmx or ExporterEnv.EXPORTER_RESOURCEMANAGER_JMX 146 | nodemanager_jmx = args.nodemanager_jmx or ExporterEnv.EXPORTER_NODEMANAGER_JMX 147 | mapred_jobhistory_jmx = args.mapred_jobhistory_jmx or ExporterEnv.EXPORTER_MAPRED_JOBHISTORY_JMX 148 | hmaster_jmx = args.hmaster_jmx or ExporterEnv.EXPORTER_HMASTER_JMX 149 | hregion_jmx = args.hregion_jmx or ExporterEnv.EXPORTER_HREGION_JMX 150 | hiveserver2_jmx = args.hiveserver2_jmx or ExporterEnv.EXPORTER_HIVESERVER2_JMX 151 | hivellap_jmx = args.hivellap_jmx or ExporterEnv.EXPORTER_HIVELLAP_JMX 152 | 153 | if self.auto_discovery: 154 | namenode_jmx = namenode_jmx or 'http://localhost:9870/jmx' 155 | datanode_jmx = datanode_jmx or 'http://localhost:9864/jmx' 156 | journalnode_jmx = journalnode_jmx or 'http://localhost:8480/jmx' 157 | resourcemanager_jmx = resourcemanager_jmx or 'http://localhost:8088/jmx' 158 | nodemanager_jmx = nodemanager_jmx or 'http://localhost:8042/jmx' 159 | mapred_jobhistory_jmx = mapred_jobhistory_jmx or 'http://localhost:19888/jmx' 160 | hmaster_jmx = hmaster_jmx or 'http://localhost:16010/jmx' 161 | hregion_jmx = hregion_jmx or 'http://localhost:16030/jmx' 162 | hiveserver2_jmx = 
hiveserver2_jmx or 'http://localhost:10002/jmx' 163 | hivellap_jmx = hivellap_jmx or 'http://localhost:15002/jmx' 164 | 165 | if self.auto_discovery: 166 | logger.info("Enable service auto discovery mode") 167 | 168 | if namenode_jmx and self._check_whitelist('nn'): 169 | self.sevices.append(self._build_service( 170 | cluster_name, namenode_jmx, HDFSNameNodeMetricCollector)) 171 | if datanode_jmx and self._check_whitelist('dn'): 172 | self.sevices.append(self._build_service( 173 | cluster_name, datanode_jmx, HDFSDataNodeMetricCollector)) 174 | if journalnode_jmx and self._check_whitelist('jn'): 175 | self.sevices.append(self._build_service( 176 | cluster_name, journalnode_jmx, HDFSJournalNodeMetricCollector)) 177 | if resourcemanager_jmx and self._check_whitelist('rm'): 178 | self.sevices.append(self._build_service( 179 | cluster_name, resourcemanager_jmx, YARNResourceManagerMetricCollector)) 180 | if nodemanager_jmx and self._check_whitelist('nm'): 181 | self.sevices.append(self._build_service( 182 | cluster_name, nodemanager_jmx, YARNNodeManagerMetricCollector)) 183 | # if mapred_jobhistory_jmx and self._check_whitelist('mrjh'): 184 | # self.sevices.append(self._build_service( 185 | # cluster_name, mapred_jobhistory_jmx, MapredJobHistoryMetricCollector)) 186 | if hiveserver2_jmx and self._check_whitelist('hs2'): 187 | self.sevices.append(self._build_service( 188 | cluster_name, hiveserver2_jmx, HiveServer2MetricCollector)) 189 | # if hivellap_jmx and self._check_whitelist('hllap'): 190 | # self.sevices.append(self._build_service( 191 | # cluster_name, hivellap_jmx, HiveLlapDaemonMetricCollector)) 192 | # if hmaster_jmx and self._check_whitelist('hm'): 193 | # self.sevices.append(self._build_service( 194 | # cluster_name, hmaster_jmx, HBaseMasterMetricCollector)) 195 | # if hregion_jmx and self._check_whitelist('hr'): 196 | # self.sevices.append(self._build_service( 197 | # cluster_name, hregion_jmx, HBaseRegionServerMetricCollector)) 198 | 199 | def _build_service_from_config(self, js: Dict) -> List[Service]: 200 | if "services" not in js: 201 | logger.error("services field must provided") 202 | return None 203 | 204 | cluster=js.get("cluster", EXPORTER_CLUSTER_NAME_DEFAULT) 205 | services= [] 206 | for service_name, urls in js["services"].items(): 207 | collector = self.COLLECTOR_MAPPING.get(service_name.lower(), None) 208 | if collector: 209 | service = Service( 210 | cluster=cluster, 211 | urls=urls, 212 | collector=collector 213 | ) 214 | services.append(service) 215 | logger.info("Added service: {}".format(service)) 216 | else: 217 | logger.warning("Unknown service name: {}. 
Ignored".format(service_name)) 218 | return services 219 | 220 | def _build_service(self, cluster_name: str, urls: Union[str, List[str]], collector: Callable) -> Service: 221 | service = Service( 222 | cluster=cluster_name, 223 | urls=urls, 224 | collector=collector, 225 | ) 226 | logger.info("Added service: {}".format(service)) 227 | return service 228 | 229 | def _check_whitelist(self, service) -> bool: 230 | if self.discovery_whitelist is None: 231 | return True 232 | else: 233 | whilelist = self.discovery_whitelist.split(',') 234 | if service in whilelist: 235 | return True 236 | else: 237 | return False 238 | 239 | def register_consul(self): 240 | start_http_server(self.port, addr=self.address) 241 | logger.info( 242 | f"Exporter start listening on http://{self.address}:{self.port}") 243 | logger.info(f"Scraping metrics every {self.period}s ...") 244 | logger.info(f"Set log level = {self.log_level}") 245 | logger.setLevel(self.log_level) 246 | 247 | def register_prometheus(self): 248 | self.logging_threshold = 60 #seconds 249 | counter = self.logging_threshold 250 | try: 251 | while True: 252 | for service in self.sevices: 253 | service.register() 254 | if counter >= self.logging_threshold: 255 | logger.info(f"Continue scraping metrics every {self.period}s ...") 256 | counter = 0 257 | counter += 1 258 | time.sleep(self.period) 259 | 260 | except KeyboardInterrupt: 261 | logger.info("Interrupted") 262 | exit(0) 263 | except: 264 | traceback.print_exc() 265 | -------------------------------------------------------------------------------- /hadoop_exporter/hiveserver2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from typing import List, Union, Dict 5 | from hadoop_exporter import utils 6 | from hadoop_exporter.common import MetricCollector 7 | 8 | 9 | class HiveServer2MetricCollector(MetricCollector): 10 | COMPONENT = "hive" 11 | SERVICE = "hiveserver2" 12 | 13 | def __init__(self, cluster, urls: Union[str, List[str]]): 14 | logger = utils.get_logger(__name__, log_file=f"{self.COMPONENT}_{self.SERVICE}.log") 15 | MetricCollector.__init__( 16 | self, cluster, urls, self.COMPONENT, self.SERVICE, logger) 17 | 18 | def _get_common_labels(self, beans: List[Dict], url: str): 19 | super()._get_common_labels(beans, url) 20 | 21 | bean = self._find_bean(beans, "org.apache.logging.log4j2:type=AsyncContext@(\w{8})$") 22 | if bean: 23 | import re 24 | matched = re.compile(".*hostName=(.+),.*").match(bean["ConfigProperties"]) 25 | print(matched) 26 | self._common_labels[url]["names"].append("host") 27 | self._common_labels[url]["values"].append(matched.groups()[0]) 28 | -------------------------------------------------------------------------------- /hadoop_exporter/journalnode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from typing import List, Union, Dict 5 | from hadoop_exporter import utils 6 | from hadoop_exporter.common import MetricCollector 7 | 8 | 9 | class HDFSJournalNodeMetricCollector(MetricCollector): 10 | COMPONENT = "hdfs" 11 | SERVICE = "journalnode" 12 | 13 | def __init__(self, cluster, urls: Union[str, List[str]]): 14 | logger = utils.get_logger(__name__, log_file=f"{self.COMPONENT}_{self.SERVICE}.log") 15 | MetricCollector.__init__( 16 | self, cluster, urls, self.COMPONENT, self.SERVICE, logger) 17 | 18 | def _get_common_labels(self, beans: List[Dict], url: str): 19 | 
super()._get_common_labels(beans, url) 20 | 21 | bean = self._find_bean(beans, "Hadoop:service=JournalNode,name=JvmMetrics") 22 | if bean: 23 | self._common_labels[url]["names"].append("host") 24 | self._common_labels[url]["values"].append(bean["tag.Hostname"]) 25 | -------------------------------------------------------------------------------- /hadoop_exporter/mapping.py: -------------------------------------------------------------------------------- 1 | 2 | def fsstate(value): 3 | if value == "Operational": 4 | return 0.0 5 | elif value == "Safemode": 6 | return 1.0 7 | else: 8 | return 9999.0 9 | 10 | 11 | def hastate(value): 12 | if value == "initializing": 13 | return 0.0 14 | elif value == "active": 15 | return 1.0 16 | elif value == "standby": 17 | return 2.0 18 | elif value == "stopping": 19 | return 3.0 20 | else: 21 | return 9999 22 | 23 | def rmstate(value): 24 | return hastate(value) 25 | -------------------------------------------------------------------------------- /hadoop_exporter/namenode.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from typing import Dict, List, Union 5 | from hadoop_exporter import utils 6 | from hadoop_exporter.common import MetricCollector 7 | 8 | 9 | class HDFSNameNodeMetricCollector(MetricCollector): 10 | COMPONENT = "hdfs" 11 | SERVICE = "namenode" 12 | 13 | def __init__(self, cluster, urls: Union[str, List[str]]): 14 | logger = utils.get_logger( 15 | __name__, log_file=f"{self.COMPONENT}_{self.SERVICE}.log") 16 | MetricCollector.__init__( 17 | self, cluster, urls, self.COMPONENT, self.SERVICE, logger) 18 | 19 | def _get_common_labels(self, beans: List[Dict], url: str): 20 | super()._get_common_labels(beans, url) 21 | 22 | bean = self._find_bean(beans, "Hadoop:service=NameNode,name=JvmMetrics") 23 | if bean: 24 | self._common_labels[url]["names"].append("host") 25 | self._common_labels[url]["values"].append(bean["tag.Hostname"]) 26 | -------------------------------------------------------------------------------- /hadoop_exporter/nodemanager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from typing import Dict, List, Union 5 | from hadoop_exporter import utils 6 | from hadoop_exporter.common import MetricCollector 7 | 8 | 9 | class YARNNodeManagerMetricCollector(MetricCollector): 10 | COMPONENT = "yarn" 11 | SERVICE = "nodemanager" 12 | 13 | def __init__(self, cluster, urls: Union[str, List[str]]): 14 | logger = utils.get_logger( 15 | 16 | __name__, log_file=f"{self.COMPONENT}_{self.SERVICE}.log") 17 | MetricCollector.__init__( 18 | self, cluster, urls, self.COMPONENT, self.SERVICE, logger) 19 | 20 | def _get_common_labels(self, beans: List[Dict], url: str): 21 | super()._get_common_labels(beans, url) 22 | 23 | bean = self._find_bean(beans, "Hadoop:service=NodeManager,name=JvmMetrics") 24 | if bean: 25 | self._common_labels[url]["names"].append("host") 26 | self._common_labels[url]["values"].append(bean["tag.Hostname"]) 27 | -------------------------------------------------------------------------------- /hadoop_exporter/resourcemanager.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | from typing import Dict, List, Union 5 | from hadoop_exporter import utils 6 | from hadoop_exporter.common import MetricCollector 7 | 8 | 9 | class 
YARNResourceManagerMetricCollector(MetricCollector): 10 | COMPONENT = "yarn" 11 | SERVICE = "resourcemanager" 12 | 13 | def __init__(self, cluster, urls: Union[str, List[str]]): 14 | logger = utils.get_logger( 15 | __name__, log_file=f"{self.COMPONENT}_{self.SERVICE}.log") 16 | MetricCollector.__init__( 17 | self, cluster, urls, self.COMPONENT, self.SERVICE, logger) 18 | 19 | def _get_common_labels(self, beans: List[Dict], url: str): 20 | super()._get_common_labels(beans, url) 21 | 22 | bean = self._find_bean(beans, "Hadoop:service=ResourceManager,name=JvmMetrics") 23 | if bean: 24 | self._common_labels[url]["names"].append("host") 25 | self._common_labels[url]["values"].append(bean["tag.Hostname"]) 26 | -------------------------------------------------------------------------------- /hadoop_exporter/utils.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import os 5 | import socket 6 | import re 7 | from typing import Dict, List, Optional 8 | import requests 9 | import logging 10 | import yaml 11 | import argparse 12 | 13 | EXPORTER_LOGS_DIR = os.environ.get('EXPORTER_LOGS_DIR', '/tmp/exporter') 14 | 15 | 16 | def get_logger(name, log_file="hadoop_exporter.log", level: str = "INFO") -> logging.Logger: 17 | ''' 18 | define a common logger template to record log. 19 | @param name log module or object name. 20 | @return logger. 21 | ''' 22 | 23 | logger = logging.getLogger(name) 24 | logger.setLevel(level.upper()) 25 | 26 | if not os.path.exists(EXPORTER_LOGS_DIR): 27 | os.makedirs(EXPORTER_LOGS_DIR) 28 | 29 | fh = logging.FileHandler(os.path.join(EXPORTER_LOGS_DIR, log_file)) 30 | fh.setLevel(level.upper()) 31 | 32 | sh = logging.StreamHandler() 33 | sh.setLevel(level.upper()) 34 | 35 | fmt = logging.Formatter( 36 | fmt='%(asctime)s %(filename)s[line:%(lineno)d]-[%(levelname)s]: %(message)s') 37 | fh.setFormatter(fmt) 38 | sh.setFormatter(fmt) 39 | 40 | logger.addHandler(fh) 41 | logger.addHandler(sh) 42 | return logger 43 | 44 | 45 | logger = get_logger(__name__) 46 | 47 | 48 | def get_metrics(url) -> List[Dict]: 49 | ''' 50 | :param url: The jmx url, e.g. http://host1:9870/jmx, http://host1:8088/jmx, http://host2:19888/jmx... 51 | :return a list of all metric beans scraped from the jmx url. 52 | ''' 53 | result = [] 54 | try: 55 | s = requests.session() 56 | response = s.get(url, timeout=5) 57 | except Exception as e: 58 | logger.warning("error in func: get_metrics, error msg: %s" % e) 59 | result = [] 60 | else: 61 | if response.status_code != requests.codes.ok: 62 | logger.warning("get {0} failed, response code is: {1}.".format( 63 | url, response.status_code)) 64 | return result # skip parsing the body of a failed response 65 | rlt = response.json() 66 | logger.debug(rlt) 67 | if rlt and "beans" in rlt: 68 | result = rlt['beans'] 69 | else: 70 | logger.warning("no metrics found in {0}.".format(url)) 71 | result = [] 72 | finally: 73 | s.close() 74 | return result 75 | 76 | 77 | def get_host_ip(): 78 | try: 79 | s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) 80 | s.connect(('8.8.8.8', 80)) 81 | ip = s.getsockname()[0] 82 | finally: 83 | s.close() 84 | return ip 85 | 86 | 87 | def get_hostname(): 88 | ''' 89 | get hostname via socket.
90 | @return a string of hostname 91 | ''' 92 | try: 93 | host = socket.getfqdn() 94 | except Exception as e: 95 | logger.info("get hostname failed, error msg: {0}".format(e)) 96 | return None 97 | else: 98 | return host 99 | 100 | 101 | def read_yaml_file(file) -> Optional[Dict]: 102 | if os.path.exists(file): 103 | with open(file, 'r') as f: 104 | cfg = yaml.safe_load(f) 105 | return cfg 106 | return None 107 | 108 | def parse_args(): 109 | parser = argparse.ArgumentParser( 110 | description='hadoop node exporter args, including url, metrics_path, address, port and cluster.' 111 | ) 112 | parser.add_argument( 113 | '-cfg', 114 | required=False, 115 | dest='config', 116 | help='Exporter config file (default: /exporter/config.yaml)', 117 | default=None 118 | ) 119 | parser.add_argument( 120 | '-c', 121 | required=False, 122 | dest='cluster_name', 123 | help='Hadoop cluster labels. (default "hadoop_cluster")', 124 | default=None 125 | ) 126 | parser.add_argument( 127 | '-nn', 128 | required=False, 129 | dest='namenode_jmx', 130 | help='List of HDFS namenode JMX url. (example "http://localhost:9870/jmx")', 131 | default=None 132 | ) 133 | parser.add_argument( 134 | '-dn', 135 | required=False, 136 | dest='datanode_jmx', 137 | help='List of HDFS datanode JMX url. (example "http://localhost:9864/jmx")', 138 | default=None 139 | ) 140 | parser.add_argument( 141 | '-jn', 142 | required=False, 143 | dest='journalnode_jmx', 144 | help='List of HDFS journalnode JMX url. (example "http://localhost:8480/jmx")', 145 | default=None 146 | ) 147 | parser.add_argument( 148 | '-rm', 149 | required=False, 150 | dest='resourcemanager_jmx', 151 | help='List of YARN resourcemanager JMX url. (example "http://localhost:8088/jmx")', 152 | default=None 153 | ) 154 | parser.add_argument( 155 | '-nm', 156 | required=False, 157 | dest='nodemanager_jmx', 158 | help='List of YARN nodemanager JMX url. (example "http://localhost:8042/jmx")', 159 | default=None 160 | ) 161 | parser.add_argument( 162 | '-mrjh', 163 | required=False, 164 | dest='mapred_jobhistory_jmx', 165 | help='List of Mapreduce jobhistory JMX url. (example "http://localhost:19888/jmx")', 166 | default=None 167 | ) 168 | parser.add_argument( 169 | '-hm', 170 | required=False, 171 | dest='hmaster_jmx', 172 | help='List of HBase master JMX url. (example "http://localhost:16010/jmx")', 173 | default=None 174 | ) 175 | parser.add_argument( 176 | '-hr', 177 | required=False, 178 | dest='hregion_jmx', 179 | help='List of HBase regionserver JMX url. (example "http://localhost:16030/jmx")', 180 | default=None 181 | ) 182 | parser.add_argument( 183 | '-hs2', 184 | required=False, 185 | dest='hiveserver2_jmx', 186 | help='List of HiveServer2 JMX url. (example "http://localhost:10002/jmx")', 187 | default=None 188 | ) 189 | parser.add_argument( 190 | '-hllap', 191 | required=False, 192 | dest='hivellap_jmx', 193 | help='List of Hive LLAP JMX url. (example "http://localhost:15002/jmx")', 194 | default=None 195 | ) 196 | parser.add_argument( 197 | '-ad', 198 | required=False, 199 | dest='auto_discovery', 200 | help='Enable auto discovery if set true else false. (example "--auto_discovery true") (default: false)', 201 | default=None 202 | ) 203 | parser.add_argument( 204 | '-adw', 205 | required=False, 206 | dest='discovery_whitelist', 207 | help='List of shortnames of services (namenode: nn, datanode: dn, ...) 
that should be enable to auto discovery', 208 | default=None 209 | ) 210 | parser.add_argument( 211 | '-addr', 212 | dest='address', 213 | required=False, 214 | help='Polling server on this address (hostname or ip). (default "0.0.0.0")', 215 | default=None 216 | ) 217 | parser.add_argument( 218 | '-p', 219 | dest='port', 220 | required=False, 221 | type=int, 222 | help='Port to listen on. (default "9123")', 223 | default=None 224 | ) 225 | parser.add_argument( 226 | '--path', 227 | dest='path', 228 | required=False, 229 | help='Path under which to expose metrics. (default "/metrics")', 230 | default=None 231 | ) 232 | parser.add_argument( 233 | '--period', 234 | dest='period', 235 | required=False, 236 | type=int, 237 | help='Period (seconds) to consume jmx service. (default: 30)', 238 | default=None 239 | ) 240 | parser.add_argument( 241 | '--log-level', 242 | required=False, 243 | dest='log_level', 244 | help='Log level, include: all, debug, info, warn, error (default: info)', 245 | default=None 246 | ) 247 | return parser.parse_args() 248 | -------------------------------------------------------------------------------- /metrics/common.yaml: -------------------------------------------------------------------------------- 1 | rules: 2 | java.lang:type=(Threading): 3 | - pattern: ^(CurrentThreadAllocatedBytes|.+Count|.+Time)$ 4 | type: GAUSE 5 | name: $1 6 | labels: 7 | type: $2 8 | help: "$1 metrics" 9 | 10 | java.lang:type=(OperatingSystem): 11 | - pattern: (.+(Count|Size|Load|Time|Average|Processors)$) 12 | type: GAUSE 13 | name: $1 14 | labels: 15 | type: $2 16 | help: "$1 metrics" 17 | 18 | Hadoop:service=.+,name=(JvmMetrics): 19 | - pattern: ^(Mem|Gc|Threads)(.+) 20 | type: GAUSE 21 | name: $1_$2 22 | labels: 23 | type: $3 24 | help: "$1 metrics" 25 | 26 | Hadoop:service=.+,name=(RpcActivity)ForPort(\d+): 27 | - pattern: (.+)(NumOps|AvgTime)$ 28 | type: GAUSE 29 | name: $1_$4 30 | labels: 31 | port: $2 32 | type: $3 33 | 34 | - pattern: ^(RpcClientBackoff|RpcSlowCalls|CallQueueLength|.+Bytes|.+Connections|.+Failures|.+Successes)$ 35 | type: GAUSE 36 | name: $1 37 | labels: 38 | port: $2 39 | type: $3 40 | 41 | Hadoop:service=.+,name=(RpcDetailedActivity)ForPort(\d+): 42 | - pattern: (.+)(NumOps|AvgTime)$ 43 | type: GAUSE 44 | name: $1_$4 45 | labels: 46 | port: $2 47 | type: $3 48 | -------------------------------------------------------------------------------- /metrics/datanode.yaml: -------------------------------------------------------------------------------- 1 | # Reference: https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Metrics.html 2 | lowercaseOutputName: true 3 | lowercaseOutputLabel: false 4 | rules: 5 | Hadoop:service=DataNode,name=(DataNodeVolume)-(.+): 6 | - pattern: (.+)(NumOps|AvgTime) 7 | type: GAUSE 8 | name: $1_$4 9 | labels: 10 | datadir: $2 11 | type: $3 12 | - pattern: ^(Total)(.+) 13 | type: GAUSE 14 | name: $1_$3 15 | labels: 16 | datadir: $2 17 | type: $4 18 | 19 | Hadoop:service=DataNode,name=(FSDatasetState)$: 20 | - pattern: ^((?!tag|modelerType|name).*) 21 | type: GAUSE 22 | name: $1 23 | labels: 24 | type: $2 25 | 26 | Hadoop:service=DataNode,name=(DataNodeActivity).*: 27 | - pattern: ^((?!tag|modelerType|name).*) 28 | type: GAUSE 29 | name: $1 30 | labels: 31 | type: $2 32 | -------------------------------------------------------------------------------- /metrics/hiveserver2.yaml: -------------------------------------------------------------------------------- 1 | # Reference: 
https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Metrics.html 2 | lowercaseOutputName: true 3 | lowercaseOutputLabel: false 4 | rules: 5 | metrics:name=(active_calls.*)_hs2_(.+): 6 | - pattern: Count 7 | type: GAUSE 8 | name: $1_$2_count 9 | metrics:name=hs2_(completed_sql_operation_.+): 10 | - pattern: Count 11 | type: GAUSE 12 | name: $1_count 13 | metrics:name=hs2_(.+_sessions): 14 | - pattern: Value 15 | type: GAUSE 16 | name: $1_count 17 | metrics:name=hs2_(.+_queries): 18 | - pattern: Count 19 | type: GAUSE 20 | name: $1_count 21 | - pattern: ^(FifteenMinuteRate|OneMinuteRate|FiveMinuteRate|MeanRate|Min|Max|Mean)$ 22 | type: GAUSE 23 | name: $1 24 | labels: 25 | type: $2 26 | metrics:name=hs2_(completed_.*operation_.+): 27 | - pattern: Count 28 | type: GAUSE 29 | name: $1_count 30 | metrics:name=(api)_hs2_(.+): 31 | - pattern: Count 32 | type: GAUSE 33 | name: $1_$2_count 34 | - pattern: ^(FifteenMinuteRate|OneMinuteRate|FiveMinuteRate|MeanRate|Min|Max|Mean)$ 35 | type: GAUSE 36 | name: $1_$2 37 | labels: 38 | type: $3 39 | metrics:name=memory.heap.(.+): 40 | - pattern: Value 41 | type: GAUSE 42 | name: memory_heap_$1 43 | labels: 44 | type: $1 45 | metrics:name=memory.total.(.+): 46 | - pattern: Value 47 | type: GAUSE 48 | name: memory_total_$1 49 | metrics:name=(exec_async_.+_size): 50 | - pattern: Value 51 | type: GAUSE 52 | name: $1 53 | metrics:name=hs2_(sql_operation_active_user): 54 | - pattern: Count 55 | type: GAUSE 56 | name: $1 57 | -------------------------------------------------------------------------------- /metrics/journalnode.yaml: -------------------------------------------------------------------------------- 1 | # Reference: https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Metrics.html 2 | lowercaseOutputName: true 3 | lowercaseOutputLabel: false 4 | rules: 5 | Hadoop:service=JournalNode,name=Journal-(.+): 6 | - pattern: ^(Syncs\d+s)(NumOps)$ 7 | type: GAUSE 8 | name: $2_$3 9 | labels: 10 | nameservice: $1 11 | - pattern: ^(Syncs\d+s)(\d+th)Percentile(Latency)Micros$ 12 | type: GAUSE 13 | name: $2_$4_micros 14 | labels: 15 | nameservice: $1 16 | percentile: $3 17 | - pattern: ^((?!tag|modelerType|name|Syncs).*) 18 | type: GAUSE 19 | name: metrics 20 | labels: 21 | nameservice: $1 22 | type: $2 23 | -------------------------------------------------------------------------------- /metrics/namenode.yaml: -------------------------------------------------------------------------------- 1 | # Reference: https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Metrics.html 2 | lowercaseOutputName: true 3 | lowercaseOutputLabel: false 4 | rules: 5 | Hadoop:service=NameNode,name=(NameNodeInfo)$: 6 | - pattern: ^(Total|Used|Free|NonDfsUsedSpace|Percent.+|Cache.+|CorruptFilesCount|Threads)$ 7 | type: GAUSE 8 | name: $1 9 | labels: 10 | type: $2 11 | 12 | Hadoop:service=NameNode,name=(FSNamesystem)$: 13 | - pattern: ^(Capacity)(.+) 14 | type: GAUSE 15 | name: $1_$2 16 | labels: 17 | type: $3 18 | 19 | - pattern: ^(\w+)(Blocks)$ 20 | type: GAUSE 21 | name: $1_$3 22 | labels: 23 | type: $2 24 | 25 | - pattern: ^(Num)(.+)(DataNodes)$ 26 | type: GAUSE 27 | name: $1_$2_$4 28 | labels: 29 | type: $3 30 | 31 | - pattern: ^(TotalLoad|BlocksTotal|FilesTotal|TotalSyncCount|NumActiveClients|NumFilesUnderConstruction|NumStaleStorages)$ 32 | type: GAUSE 33 | name: $1 34 | labels: 35 | type: $2 36 | 37 | - pattern: ^(Transactions\w+|Last\w+)$ 38 | type: GAUSE 39 | name: $1 40 | labels: 41 | type: $2 42 | 43 | - pattern: 
.*(HAState)$ 44 | type: GAUSE 45 | name: $2 46 | mapping: hadoop_exporter.mapping.hastate 47 | help: 'the high-available state of namenodes: 0.0 => initializing, 1.0 => active, 2.0 => standby, 3.0 => stopping, 9999 => others' 48 | 49 | Hadoop:service=NameNode,name=(FSNamesystemState)$: 50 | - pattern: ^(Capacity)(.+) 51 | type: GAUSE 52 | name: $1_$2 53 | labels: 54 | type: $3 55 | 56 | - pattern: ^(Num)(.+)(DataNodes)$ 57 | type: GAUSE 58 | name: $1_$2_$4 59 | labels: 60 | type: $3 61 | 62 | - pattern: ^(\w+)(Blocks)$ 63 | type: GAUSE 64 | name: $1_$3 65 | labels: 66 | type: $2 67 | 68 | - pattern: ^(TotalLoad|BlocksTotal|FilesTotal|TotalSyncCount|VolumeFailuresTotal|EstimatedCapacityLostTotal)$ 69 | type: GAUSE 70 | name: $1 71 | labels: 72 | type: $2 73 | 74 | - pattern: (FSState) 75 | type: GAUSE 76 | name: $2 77 | mapping: hadoop_exporter.mapping.fsstate 78 | help: 'the fs state of namenode: 0.0 => Operational, 1.0 => Safemode, 9999 => others' 79 | 80 | Hadoop:service=NameNode,name=(NameNodeActivity)$: 81 | - pattern: ^((?!tag|modelerType|name).*) 82 | type: GAUSE 83 | name: $1 84 | labels: 85 | type: $2 86 | -------------------------------------------------------------------------------- /metrics/nodemanager.yaml: -------------------------------------------------------------------------------- 1 | lowercaseOutputName: true 2 | lowercaseOutputLabel: false 3 | rules: 4 | Hadoop:service=NodeManager,name=(NodeManagerMetrics)$: 5 | - pattern: ^(Containers)(\w+)$ 6 | type: GAUSE 7 | name: $1_$2 8 | labels: 9 | type: $3 10 | help: "Total number of containers on the type" 11 | - pattern: ^((?!tag|modelerType|name|Containers).*) 12 | type: GAUSE 13 | name: $1 14 | labels: 15 | type: $2 16 | 17 | Hadoop:service=NodeManager,name=(sparkShuffleService)$: 18 | - pattern: ^(openBlockRequestLatencyMillis|blockTransferRateBytes|registerExecutorRequestLatencyMillis)_(\w+)$ 19 | type: GAUSE 20 | name: $1_$2 21 | labels: 22 | type: $3 23 | - pattern: ^(numRegisteredConnections|numCaughtExceptions|registeredExecutorsSize|numActiveConnections)$ 24 | type: GAUSE 25 | name: $1 26 | labels: 27 | type: $2 28 | 29 | Hadoop:service=NodeManager,name=(ShuffleMetrics)$: 30 | - pattern: ^Shuffle(\w+)$ 31 | type: GAUSE 32 | name: $1 33 | labels: 34 | type: $2 35 | -------------------------------------------------------------------------------- /metrics/resourcemanager.yaml: -------------------------------------------------------------------------------- 1 | lowercaseOutputName: true 2 | lowercaseOutputLabel: false 3 | rules: 4 | Hadoop:service=ResourceManager,name=(QueueMetrics),q0=(\w+)$: 5 | - pattern: ^(running)_(\d+)$ 6 | type: GAUSE 7 | name: $1_$3 8 | labels: 9 | queue0: $2 10 | type: $4 11 | help: 'Current number of running applications based on type. 
See https://hadoop.apache.org/docs/r2.7.2/hadoop-project-dist/hadoop-common/Metrics.html#QueueMetrics to more details' 12 | 13 | - pattern: (.+)(Containers)$ 14 | type: GAUSE 15 | name: $1_$4 16 | labels: 17 | queue0: $2 18 | type: $3 19 | help: 'Current number of containers are on the type' 20 | 21 | - pattern: (.+)(VCores)$ 22 | type: GAUSE 23 | name: $1_$4 24 | labels: 25 | queue0: $2 26 | type: $3 27 | help: 'Current cpu resource in virtual cores of type' 28 | 29 | - pattern: (.+)(MB)$ 30 | type: GAUSE 31 | name: $1_$4 32 | labels: 33 | queue0: $2 34 | type: $3 35 | help: 'Current memory resource in MB of type' 36 | 37 | - pattern: ^(Apps)(.+) 38 | type: GAUSE 39 | name: $1_$3 40 | labels: 41 | queue0: $2 42 | type: $4 43 | help: 'Current number of applications are on the type' 44 | 45 | - pattern: ^((?!tag|modelerType|name).*) 46 | type: GAUSE 47 | name: $1 48 | labels: 49 | queue0: $2 50 | type: $3 51 | 52 | Hadoop:service=ResourceManager,name=(QueueMetrics),q0=(\w+),q1=(\w+)$: 53 | - pattern: ^(running)_(\d+)$ 54 | type: GAUSE 55 | name: $1_$4 56 | labels: 57 | queue0: $2 58 | queue1: $3 59 | type: $5 60 | help: 'Current number of running applications based on type. See https://hadoop.apache.org/docs/r3.3.0/hadoop-project-dist/hadoop-common/Metrics.html#QueueMetrics to more details' 61 | 62 | - pattern: (.+)(Containers)$ 63 | type: GAUSE 64 | name: $1_$5 65 | labels: 66 | queue0: $2 67 | queue1: $3 68 | type: $4 69 | help: 'Current number of containers are on the type' 70 | 71 | - pattern: (.+)(VCores)$ 72 | type: GAUSE 73 | name: $1_$5 74 | labels: 75 | queue0: $2 76 | queue1: $3 77 | type: $4 78 | help: 'Current cpu resource in virtual cores of type' 79 | 80 | - pattern: (.+)(MB)$ 81 | type: GAUSE 82 | name: $1_$5 83 | labels: 84 | queue0: $2 85 | queue1: $3 86 | type: $4 87 | help: 'Current memory resource in MB of type' 88 | 89 | - pattern: ^(Apps)(.+) 90 | type: GAUSE 91 | name: $1_$4 92 | labels: 93 | queue0: $2 94 | queue1: $3 95 | type: $5 96 | help: 'Current number of applications are on the type' 97 | 98 | - pattern: ^((?!tag|modelerType|name).*) 99 | type: GAUSE 100 | name: $1 101 | labels: 102 | queue0: $2 103 | queue1: $3 104 | type: $4 105 | 106 | Hadoop:service=ResourceManager,name=(CapacitySchedulerMetrics)$: 107 | - pattern: (.+)(NumOps|AvgTime)$ 108 | type: GAUSE 109 | name: $1_$3 110 | labels: 111 | type: $2 112 | 113 | Hadoop:service=ResourceManager,name=(ClusterMetrics)$: 114 | - pattern: ^(Num)(.+)(NMs)$ 115 | type: GAUSE 116 | name: $1_$2_$4 117 | labels: 118 | type: $3 119 | help: 'number of the node managers is in the particular states' 120 | 121 | - pattern: (.+)(NumOps|AvgTime)$ 122 | type: GAUSE 123 | name: $1_$3 124 | labels: 125 | type: $2 126 | 127 | Hadoop:service=ResourceManager,name=RMInfo: 128 | - pattern: (State) 129 | type: GAUSE 130 | name: $1 131 | mapping: hadoop_exporter.mapping.rmstate 132 | help: 'the high-available state of resourcemanager: 0.0 => initializing, 1.0 => active, 2.0 => standby, 3.0 => stopping, 9999 => others' 133 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | requests==2.23.0 2 | prometheus-client==0.9.0 3 | python-consul==1.1.0 4 | pyyaml==5.3.1 5 | -------------------------------------------------------------------------------- /service.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | 
from hadoop_exporter.exporter import Exporter 5 | 6 | def main(): 7 | exporter = Exporter() 8 | exporter.register_consul() 9 | exporter.register_prometheus() 10 | 11 | if __name__ == '__main__': 12 | main() 13 | --------------------------------------------------------------------------------