├── .gitignore ├── .travis.yml ├── Dockerfile ├── LICENSE ├── MANIFEST.in ├── README.md ├── prometheus_kafka_consumer_group_exporter ├── __init__.py ├── __main__.py ├── collectors.py ├── fetch_jobs.py ├── parsing.py └── scheduler.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit test / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | 55 | # Sphinx documentation 56 | docs/_build/ 57 | 58 | # PyBuilder 59 | target/ 60 | 61 | # Gedit 62 | *~ 63 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | sudo: required 3 | dist: xenial 4 | python: 5 | - "3.4" 6 | - "3.5" 7 | - "3.6" 8 | - "3.7" 9 | script: 10 | - python setup.py test 11 | after_success: 12 | - if [[ $TRAVIS_PYTHON_VERSION == 3.7* ]]; then export RELEASE_SENTINEL=1; fi 13 | deploy: 14 | provider: pypi 15 | user: $PYPI_USERNAME 16 | password: $PYPI_PASSWORD 17 | distributions: "sdist bdist_wheel" 18 | on: 19 | tags: true 20 | branch: master 21 | condition: 
$RELEASE_SENTINEL = 1 22 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3-slim 2 | 3 | WORKDIR /usr/src/app 4 | 5 | COPY setup.py /usr/src/app/ 6 | RUN pip install . 7 | 8 | COPY prometheus_kafka_consumer_group_exporter/*.py /usr/src/app/prometheus_kafka_consumer_group_exporter/ 9 | RUN pip install -e . 10 | 11 | COPY LICENSE /usr/src/app/ 12 | COPY README.md /usr/src/app/ 13 | 14 | EXPOSE 9208 15 | 16 | ENTRYPOINT ["python", "-u", "/usr/local/bin/prometheus-kafka-consumer-group-exporter"] 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) 2016 Braedon Vickers 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE 2 | include README.md 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Prometheus Kafka Consumer Group Exporter 2 | ==== 3 | This Prometheus exporter consumes the `__consumer_offsets` topic of a Kafka cluster and exports the results as Prometheus gauge metrics. i.e. it shows the position of Kafka consumer groups, including their lag. 4 | 5 | The high-water and low-water marks of the partitions of each topic are also exported. 6 | 7 | # Installation 8 | The exporter requires Python 3 and Pip 3 to be installed. 9 | 10 | To install the latest published version via Pip, run: 11 | ``` 12 | > pip3 install prometheus-kafka-consumer-group-exporter 13 | ``` 14 | Note that you may need to add the start script location (see pip output) to your `PATH`. 15 | 16 | # Usage 17 | Once installed, you can run the exporter with the `prometheus-kafka-consumer-group-exporter` command. 18 | 19 | By default, it will bind to port 9208 and connect to Kafka on `localhost:9092`. You can change these defaults as required by passing in arguments: 20 | ``` 21 | > prometheus-kafka-consumer-group-exporter -p -b 22 | ``` 23 | Run with the `-h` flag to see details on all the available arguments. 24 | 25 | Prometheus metrics can then be scraped from the `/metrics` path, e.g. http://localhost:9208/metrics. Metrics are currently actually exposed on all paths, but this may change in the future and `/metrics` is the standard path for Prometheus metric endpoints. 
26 | 27 | # Metrics 28 | Ten main metrics are exported: 29 | 30 | ### `kafka_consumer_group_offset{group, topic, partition}` (gauge) 31 | The latest committed offset of a consumer group in a given partition of a topic, as read from `__consumer_offsets`. Useful for calculating the consumption rate and lag of a consumer group. 32 | 33 | ### `kafka_consumer_group_lag{group, topic, partition}` (gauge) 34 | The lag of a consumer group behind the head of a given partition of a topic - the difference between `kafka_topic_highwater` and `kafka_consumer_group_offset`. Useful for checking if a consumer group is keeping up with a topic. 35 | 36 | ### `kafka_consumer_group_lead{group, topic, partition}` (gauge) 37 | The lead of a consumer group ahead of the tail of a given partition of a topic - the difference between `kafka_consumer_group_offset` and `kafka_topic_lowwater`. Useful for checking if a consumer group is at risk of missing messages due to the cleaner. 38 | 39 | ### `kafka_consumer_group_commits_total{group, topic, partition}` (counter) 40 | The number of commit messages read from `__consumer_offsets` by the exporter from a consumer group for a given partition of a topic. Useful for calculating the commit rate of a consumer group (i.e. are the consumers working). 41 | 42 | ### `kafka_consumer_group_commit_timestamp{group, topic, partition}` (gauge) 43 | The timestamp (in seconds since January 1, 1970 UTC) of the latest commit from a consumer group for a given partition of a topic. Useful to determine how long a consumer has been inactive. 44 | 45 | ### `kafka_consumer_group_exporter_offset{partition}` (gauge) 46 | The offset of the exporter's consumer in each partition of the `__consumer_offset` topic. Useful for calculating the lag of the exporter. 47 | 48 | ### `kafka_consumer_group_exporter_lag{partition}` (gauge) 49 | The lag of the exporter's consumer behind the head of each partition of the `__consumer_offset` topic. 
Useful for checking if the exporter is keeping up with `__consumer_offset`. 50 | 51 | ### `kafka_consumer_group_exporter_lead{partition}` (gauge) 52 | The lead of the exporter's consumer ahead of the tail of each partition of the `__consumer_offset` topic. Useful for checking if the exporter is at risk of missing messages due to the cleaner. 53 | 54 | ### `kafka_topic_highwater{topic, partition}` (gauge) 55 | The offset of the head of a given partition of a topic, as reported by the lead broker for the partition. Useful for calculating the production rate of the producers for a topic, and the lag of a consumer group (or the exporter itself). 56 | 57 | ### `kafka_topic_lowwater{topic, partition}` (gauge) 58 | The offset of the tail of a given partition of a topic, as reported by the lead broker for the partition. Useful for calculating the lead of a consumer group (or the exporter itself) - i.e. how far ahead of the cleaner the consumer group is. 59 | 60 | ## Lag 61 | Lag metrics are exported for convenience, but they can also be calculated using other metrics if desired: 62 | ``` 63 | # Lag for a consumer group: 64 | kafka_topic_highwater - on (topic, partition) kafka_consumer_group_offset{group="some-consumer-group"} 65 | 66 | # Lag for the exporter: 67 | kafka_topic_highwater{topic='__consumer_offsets'} - on (partition) kafka_consumer_group_exporter_offset 68 | ``` 69 | Note that as the offset and high-water metrics are updated separately the offset value can be more up-to-date than the high-water, resulting in a negative lag. This is often the case with the exporter lag, as the exporter offset is tracked internally rather than read from `__consumer_offsets`. 
70 | 71 | # Kafka Config 72 | If you need to set Kafka consumer configuration that isn't supported by command line arguments, you can provided a standard Kafka consumer properties file: 73 | ``` 74 | > prometheus-kafka-consumer-group-exporter --consumer-config consumer.properties 75 | ``` 76 | See the [Kafka docs](https://kafka.apache.org/documentation/#newconsumerconfigs) for details on consumer properties. However, as the exporter doesn't use the official consumer implementation, all properties may not be supported. Check the [kafka-python docs](https://kafka-python.readthedocs.io/en/master/apidoc/KafkaConsumer.html#kafkaconsumer) if you run into problems. 77 | 78 | You can provide multiple files if that's helpful - they will be merged together, with later files taking precedence: 79 | ``` 80 | > prometheus-kafka-consumer-group-exporter --consumer-config consumer.properties --consumer-config another-consumer.properties 81 | ``` 82 | Note that where a command line argument relates to a consumer property (e.g. `--bootstrap-brokers` sets `bootstrap.servers`) a value provided via that argument will override any value for that property in a properties file. The argument default will only be used if the property isn't provided in either a file or an argument. 83 | 84 | # Docker 85 | Docker images for released versions can be found on Docker Hub (note that no `latest` version is provided): 86 | ``` 87 | > sudo docker pull braedon/prometheus-kafka-consumer-group-exporter: 88 | ``` 89 | To run a container successfully, you will need map container port 9208 to a port on the host. Any options placed after the image name (`prometheus-kafka-consumer-group-exporter`) will be passed to the process inside the container. For example, you will need to use this to configure the kafka node(s) using `-b`. 
90 | ``` 91 | > sudo docker run --rm --name exporter \ 92 | -p :9208 \ 93 | braedon/prometheus-kafka-consumer-group-exporter: -b 94 | ``` 95 | 96 | # Development 97 | To install directly from the git repo, run the following in the root project directory: 98 | ``` 99 | > pip3 install . 100 | ``` 101 | The exporter can be installed in "editable" mode, using pip's `-e` flag. This allows you to test out changes without having to re-install. 102 | ``` 103 | > pip3 install -e . 104 | ``` 105 | 106 | To build a docker image directly from the git repo, run the following in the root project directory: 107 | ``` 108 | > sudo docker build -t . 109 | ``` 110 | Send me a PR if you have a change you want to contribute! 111 | -------------------------------------------------------------------------------- /prometheus_kafka_consumer_group_exporter/__init__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import javaproperties 3 | import logging 4 | import signal 5 | import sys 6 | 7 | from jog import JogFormatter 8 | from kafka import KafkaConsumer 9 | from prometheus_client import start_http_server 10 | from prometheus_client.core import REGISTRY 11 | 12 | from . import scheduler, collectors 13 | from .fetch_jobs import setup_fetch_jobs 14 | from .parsing import parse_key, parse_value 15 | 16 | 17 | # Check if a dict contains a key, returning 18 | # a copy with the key if not. 19 | # Effectively a way to immutably add a key 20 | # to a dictionary, allowing other threads 21 | # to safely iterate over it. 
def ensure_dict_key(curr_dict, key, new_value):
    """Return a dict guaranteed to contain `key`.

    If `key` is already present, `curr_dict` is returned unchanged.
    Otherwise a shallow copy with `key` set to `new_value` is returned.
    Effectively an immutable "add key" operation, so other threads can
    safely iterate over the previously published dict object.
    """
    if key in curr_dict:
        return curr_dict

    new_dict = curr_dict.copy()
    new_dict[key] = new_value
    return new_dict


def shutdown():
    """Log the shutdown and exit with a non-zero status."""
    logging.info('Shutting down')
    sys.exit(1)


def signal_handler(signum, frame):
    """SIGTERM handler - delegate to shutdown()."""
    shutdown()


def main():
    """Entry point: parse arguments, start the metrics HTTP server, and
    consume `__consumer_offsets`, updating the collector state for every
    commit message read."""
    signal.signal(signal.SIGTERM, signal_handler)

    parser = argparse.ArgumentParser(
        description='Export Kafka consumer offsets to Prometheus.')
    parser.add_argument(
        '-b', '--bootstrap-brokers',
        help='Addresses of brokers in a Kafka cluster to talk to.' +
        ' Brokers should be separated by commas e.g. broker1,broker2.' +
        ' Ports can be provided if non-standard (9092) e.g. brokers1:9999.' +
        ' (default: localhost)')
    parser.add_argument(
        '-p', '--port', type=int, default=9208,
        help='Port to serve the metrics endpoint on. (default: 9208)')
    parser.add_argument(
        '-s', '--from-start', action='store_true',
        help='Start from the beginning of the `__consumer_offsets` topic.')
    parser.add_argument(
        '--topic-interval', type=float, default=30.0,
        help='How often to refresh topic information, in seconds. (default: 30)')
    parser.add_argument(
        '--high-water-interval', type=float, default=10.0,
        help='How often to refresh high-water information, in seconds. (default: 10)')
    parser.add_argument(
        '--low-water-interval', type=float, default=10.0,
        help='How often to refresh low-water information, in seconds. (default: 10)')
    parser.add_argument(
        '--consumer-config', action='append', default=[],
        help='Provide additional Kafka consumer config as a consumer.properties file. Multiple files will be merged, later files having precedence.')
    parser.add_argument(
        '-j', '--json-logging', action='store_true',
        help='Turn on json logging.')
    parser.add_argument(
        '--log-level', default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
        help='Detail level to log. (default: INFO)')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='Turn on verbose (DEBUG) logging. Overrides --log-level.')
    args = parser.parse_args()

    log_handler = logging.StreamHandler()
    log_format = '[%(asctime)s] %(name)s.%(levelname)s %(threadName)s %(message)s'
    formatter = JogFormatter(log_format) \
        if args.json_logging \
        else logging.Formatter(log_format)
    log_handler.setFormatter(formatter)

    log_level = getattr(logging, args.log_level)
    logging.basicConfig(
        handlers=[log_handler],
        level=logging.DEBUG if args.verbose else log_level
    )
    logging.captureWarnings(True)

    port = args.port

    # Defaults, overridable by properties files and then CLI arguments.
    consumer_config = {
        'bootstrap_servers': 'localhost',
        'auto_offset_reset': 'latest',
        'group_id': None,
        'consumer_timeout_ms': 500
    }

    for filename in args.consumer_config:
        with open(filename) as f:
            raw_config = javaproperties.load(f)
        for k, v in raw_config.items():
            if v == '':
                # Treat empty values as if they weren't set
                continue

            if v.lower() in ['true', 'false']:
                # Convert boolean values
                v = v.lower() == 'true'

            else:
                # Try and convert numeric values
                try:
                    v = int(v)
                except ValueError:
                    try:
                        v = float(v)
                    except ValueError:
                        pass

            # kafka-python uses underscored config names rather than the
            # dotted Java property names.
            consumer_config[k.replace('.', '_')] = v

    if args.bootstrap_brokers:
        consumer_config['bootstrap_servers'] = args.bootstrap_brokers

    consumer_config['bootstrap_servers'] = \
        consumer_config['bootstrap_servers'].split(',')

    if args.from_start:
        consumer_config['auto_offset_reset'] = 'earliest'

    consumer = KafkaConsumer(
        '__consumer_offsets',
        **consumer_config
    )
    # NOTE(review): relies on kafka-python's private _client attribute for
    # the metadata/offset fetch jobs - may break across library versions.
    client = consumer._client

    topic_interval = args.topic_interval
    high_water_interval = args.high_water_interval
    low_water_interval = args.low_water_interval

    logging.info('Starting server...')
    start_http_server(port)
    logging.info('Server started on port %s', port)

    REGISTRY.register(collectors.HighwaterCollector())
    REGISTRY.register(collectors.LowwaterCollector())
    REGISTRY.register(collectors.ConsumerOffsetCollector())
    REGISTRY.register(collectors.ConsumerLagCollector())
    REGISTRY.register(collectors.ConsumerLeadCollector())
    REGISTRY.register(collectors.ConsumerCommitsCollector())
    REGISTRY.register(collectors.ConsumerCommitTimestampCollector())
    REGISTRY.register(collectors.ExporterOffsetCollector())
    REGISTRY.register(collectors.ExporterLagCollector())
    REGISTRY.register(collectors.ExporterLeadCollector())

    scheduled_jobs = setup_fetch_jobs(topic_interval, high_water_interval, low_water_interval, client)
    scheduler.run_scheduled_jobs(scheduled_jobs)

    try:
        while True:
            for message in consumer:
                offsets = collectors.get_offsets()
                commits = collectors.get_commits()
                commit_timestamps = collectors.get_commit_timestamps()
                exporter_offsets = collectors.get_exporter_offsets()

                # Commits store the offset a consumer should read from next,
                # so we need to add one to the current offset for semantic parity
                exporter_partition = message.partition
                exporter_offset = message.offset + 1
                exporter_offsets = ensure_dict_key(exporter_offsets, exporter_partition, exporter_offset)
                exporter_offsets[exporter_partition] = exporter_offset
                collectors.set_exporter_offsets(exporter_offsets)

                if message.key:
                    key_dict = parse_key(message.key)
                    # Only key versions 0 and 1 are offset commit messages.
                    # Ignore other versions.
                    if key_dict is not None and key_dict['version'] in (0, 1):

                        if message.value:
                            value_dict = parse_value(message.value)
                            if value_dict is not None:
                                group = key_dict['group']
                                topic = key_dict['topic']
                                partition = key_dict['partition']
                                offset = value_dict['offset']
                                # Broker timestamps are milliseconds; export seconds.
                                commit_timestamp = value_dict['commit_timestamp'] / 1000

                                offsets = ensure_dict_key(offsets, group, {})
                                offsets[group] = ensure_dict_key(offsets[group], topic, {})
                                offsets[group][topic] = ensure_dict_key(offsets[group][topic], partition, offset)
                                offsets[group][topic][partition] = offset
                                collectors.set_offsets(offsets)

                                commits = ensure_dict_key(commits, group, {})
                                commits[group] = ensure_dict_key(commits[group], topic, {})
                                commits[group][topic] = ensure_dict_key(commits[group][topic], partition, 0)
                                commits[group][topic][partition] += 1
                                collectors.set_commits(commits)

                                commit_timestamps = ensure_dict_key(commit_timestamps, group, {})
                                commit_timestamps[group] = ensure_dict_key(commit_timestamps[group], topic, {})
                                commit_timestamps[group][topic] = ensure_dict_key(commit_timestamps[group][topic], partition, 0)
                                commit_timestamps[group][topic][partition] = commit_timestamp
                                collectors.set_commit_timestamps(commit_timestamps)

                        else:
                            # A null value is a tombstone - the group's commit
                            # has been removed, so we should not report metrics.
                            group = key_dict['group']
                            topic = key_dict['topic']
                            partition = key_dict['partition']

                            # NOTE(review): unlike the copy-on-write updates
                            # above, these deletions mutate the shared dicts in
                            # place, so a concurrent scrape may observe a
                            # partially-deleted state - confirm acceptable.
                            if group in offsets:
                                if topic in offsets[group]:
                                    if partition in offsets[group][topic]:
                                        del offsets[group][topic][partition]

                            if group in commits:
                                if topic in commits[group]:
                                    if partition in commits[group][topic]:
                                        del commits[group][topic][partition]

                            if group in commit_timestamps:
                                if topic in commit_timestamps[group]:
                                    if partition in commit_timestamps[group][topic]:
                                        del commit_timestamps[group][topic][partition]

                # Check if we need to run any scheduled jobs
                # each message.
                scheduled_jobs = scheduler.run_scheduled_jobs(scheduled_jobs)

            # Also check if we need to run any scheduled jobs
            # each time the consumer times out, in case there
            # aren't any messages to consume.
            scheduled_jobs = scheduler.run_scheduled_jobs(scheduled_jobs)

    except KeyboardInterrupt:
        pass

    shutdown()


# ---------------------------------------------------------------------------
# prometheus_kafka_consumer_group_exporter/__main__.py
# ---------------------------------------------------------------------------

from prometheus_kafka_consumer_group_exporter import main

if __name__ == '__main__':
    main()


# ---------------------------------------------------------------------------
# prometheus_kafka_consumer_group_exporter/collectors.py
# ---------------------------------------------------------------------------

from prometheus_client.core import GaugeMetricFamily, CounterMetricFamily

from .fetch_jobs import build_highwaters, build_lowwaters

METRIC_PREFIX = 'kafka_consumer_group_'

# Globals - replaced wholesale (copy-on-write) by the consumer thread so the
# collector threads can iterate them safely.
offsets = {}  # group->topic->partition->offset
commits = {}  # group->topic->partition->commits
commit_timestamps = {}  # group->topic->partition->commit_timestamp
exporter_offsets = {}  # partition->offset


def get_offsets():
    # Current group->topic->partition->offset mapping.
    return offsets


def set_offsets(new_offsets):
    # Atomically publish a replacement offsets mapping.
    global offsets
    offsets = new_offsets


def get_commits():
    # Current group->topic->partition->commit-count mapping.
    return commits


def set_commits(new_commits):
    # Atomically publish a replacement commits mapping.
    global commits
    commits = new_commits
def get_commit_timestamps():
    # Current group->topic->partition->commit-timestamp mapping.
    return commit_timestamps


def set_commit_timestamps(new_commit_timestamps):
    # Atomically publish a replacement commit-timestamps mapping.
    global commit_timestamps
    commit_timestamps = new_commit_timestamps


def get_exporter_offsets():
    # Current partition->offset mapping for the exporter's own consumer.
    return exporter_offsets


def set_exporter_offsets(new_exporter_offsets):
    # Atomically publish a replacement exporter-offsets mapping.
    global exporter_offsets
    exporter_offsets = new_exporter_offsets


def group_metrics(metrics):
    """Group flat (name, doc, label_keys, label_values, value) tuples by
    metric name into {name: (doc, label_keys, {label_values: value})}."""
    metric_dict = {}
    for (metric_name, metric_doc, label_keys, label_values, value) in metrics:
        if metric_name not in metric_dict:
            metric_dict[metric_name] = (metric_doc, label_keys, {})

        metric_dict[metric_name][2][label_values] = value

    return metric_dict


def _family_generator(metrics, family_class):
    """Shared implementation behind gauge_generator/counter_generator:
    group the metric tuples by name and yield one metric family per name,
    built with `family_class` (GaugeMetricFamily or CounterMetricFamily)."""
    metric_dict = group_metrics(metrics)

    for metric_name, (metric_doc, label_keys, value_dict) in metric_dict.items():
        if label_keys:
            # With label keys we may have multiple different values,
            # each with their own label values.
            family = family_class(metric_name, metric_doc, labels=label_keys)

            # Sort for a deterministic exposition order.
            for label_values in sorted(value_dict.keys()):
                value = value_dict[label_values]
                family.add_metric(tuple(str(v) for v in label_values), value)

        else:
            # No label keys, so we must have only a single value.
            family = family_class(metric_name, metric_doc, value=list(value_dict.values())[0])

        yield family


def gauge_generator(metrics):
    """Yield GaugeMetricFamily objects for the given metric tuples."""
    yield from _family_generator(metrics, GaugeMetricFamily)


def counter_generator(metrics):
    """Yield CounterMetricFamily objects for the given metric tuples."""
    yield from _family_generator(metrics, CounterMetricFamily)


class HighwaterCollector(object):
    """Exports kafka_topic_highwater from the fetched high-water state."""

    def collect(self):
        highwaters = build_highwaters()
        metrics = [
            ('kafka_topic_highwater', 'The offset of the head of a partition in a topic.',
             ('topic', 'partition'), (topic, partition),
             highwater)
            for topic, partitions in highwaters.items()
            for partition, highwater in partitions.items()
        ]
        yield from gauge_generator(metrics)


class LowwaterCollector(object):
    """Exports kafka_topic_lowwater from the fetched low-water state."""

    def collect(self):
        lowwaters = build_lowwaters()
        metrics = [
            ('kafka_topic_lowwater', 'The offset of the tail of a partition in a topic.',
             ('topic', 'partition'), (topic, partition),
             lowwater)
            for topic, partitions in lowwaters.items()
            for partition, lowwater in partitions.items()
        ]
        yield from gauge_generator(metrics)


class ConsumerOffsetCollector(object):
    """Exports kafka_consumer_group_offset from the consumed commit state."""

    def collect(self):
        metrics = [
            (METRIC_PREFIX + 'offset', 'The current offset of a consumer group in a partition of a topic.',
             ('group', 'topic', 'partition'), (group, topic, partition),
             offset)
            for group, topics in offsets.items()
            for topic, partitions in topics.items()
            for partition, offset in partitions.items()
        ]
        yield from gauge_generator(metrics)


class ConsumerLagCollector(object):
    """Exports kafka_consumer_group_lag (highwater - offset, floored at 0)."""

    def collect(self):
        highwaters = build_highwaters()
        metrics = [
            (METRIC_PREFIX + 'lag', 'How far a consumer group\'s current offset is behind the head of a partition of a topic.',
             ('group', 'topic', 'partition'), (group, topic, partition),
             # Floor at 0 - the offset can be fresher than the high-water,
             # which would otherwise produce a negative lag.
             max(highwaters[topic][partition] - offset, 0))
            for group, topics in offsets.items()
            for topic, partitions in topics.items()
            for partition, offset in partitions.items()
            if topic in highwaters and partition in highwaters[topic]
        ]
        yield from gauge_generator(metrics)


class ConsumerLeadCollector(object):
    """Exports kafka_consumer_group_lead (offset - lowwater)."""

    def collect(self):
        lowwaters = build_lowwaters()
        metrics = [
            (METRIC_PREFIX + 'lead', 'How far a consumer group\'s current offset is ahead of the tail of a partition of a topic.',
             ('group', 'topic', 'partition'), (group, topic, partition),
             offset - lowwaters[topic][partition])
            for group, topics in offsets.items()
            for topic, partitions in topics.items()
            for partition, offset in partitions.items()
            if topic in lowwaters and partition in lowwaters[topic]
        ]
        yield from gauge_generator(metrics)


class ConsumerCommitsCollector(object):
    """Exports kafka_consumer_group_commits_total counters."""

    def collect(self):
        metrics = [
            (METRIC_PREFIX + 'commits', 'The number of commit messages read by the exporter consumer from a consumer group for a partition of a topic.',
             ('group', 'topic', 'partition'), (group, topic, partition),
             commit_count)
            for group, topics in commits.items()
            for topic, partitions in topics.items()
            for partition, commit_count in partitions.items()
        ]
        yield from counter_generator(metrics)


class ConsumerCommitTimestampCollector(object):
    """Exports kafka_consumer_group_commit_timestamp gauges."""

    def collect(self):
        metrics = [
            (METRIC_PREFIX + 'commit_timestamp', 'The timestamp of the latest commit from a consumer group for a partition of a topic.',
             ('group', 'topic', 'partition'), (group, topic, partition),
             commit_timestamp)
            for group, topics in commit_timestamps.items()
            for topic, partitions in topics.items()
            for partition, commit_timestamp in partitions.items()
        ]
        yield from gauge_generator(metrics)


class ExporterOffsetCollector(object):
    """Exports kafka_consumer_group_exporter_offset for the exporter's consumer."""

    def collect(self):
        metrics = [
            (METRIC_PREFIX + 'exporter_offset', 'The current offset of the exporter consumer in a partition of the __consumer_offsets topic.',
             ('partition',), (partition,),
             offset)
            for partition, offset in exporter_offsets.items()
        ]
        yield from gauge_generator(metrics)


class ExporterLagCollector(object):
    """Exports kafka_consumer_group_exporter_lag for the exporter's consumer."""

    def collect(self):
        topic = '__consumer_offsets'
        highwaters = build_highwaters()
        metrics = [
            (METRIC_PREFIX + 'exporter_lag', 'How far the exporter consumer is behind the head of a partition of the __consumer_offsets topic.',
             ('partition',), (partition,),
             # Floored at 0 for the same reason as consumer group lag.
             max(highwaters[topic][partition] - offset, 0))
            for partition, offset in exporter_offsets.items()
            if topic in highwaters and partition in highwaters[topic]
        ]
        yield from gauge_generator(metrics)


class ExporterLeadCollector(object):
    """Exports kafka_consumer_group_exporter_lead for the exporter's consumer."""

    def collect(self):
        topic = '__consumer_offsets'
        lowwaters = build_lowwaters()
        metrics = [
            (METRIC_PREFIX + 'exporter_lead', 'How far the exporter consumer is ahead of the tail of a partition of the __consumer_offsets topic.',
             ('partition',), (partition,),
             offset - lowwaters[topic][partition])
            for partition, offset in exporter_offsets.items()
            if topic in lowwaters and partition in lowwaters[topic]
        ]
        yield from gauge_generator(metrics)


# ---------------------------------------------------------------------------
# prometheus_kafka_consumer_group_exporter/fetch_jobs.py
# ---------------------------------------------------------------------------

import logging

import kafka.errors as Errors
import logging

import kafka.errors as Errors
from kafka.protocol.metadata import MetadataRequest
from kafka.protocol.offset import OffsetRequest, OffsetResetStrategy

from . import scheduler

# Module-level state, written by the scheduled fetch jobs below and read
# by the exporter's collectors via build_highwaters()/build_lowwaters().
topics = {}  # topic -> partition -> leader node id
node_highwaters = {}  # node -> topic -> partition -> high-water offset
node_lowwaters = {}  # node -> topic -> partition -> low-water offset


def _merge_node_waters(node_waters):
    """Merge node -> topic -> partition -> offset into topic -> partition -> offset.

    Copy the top level before iterating, as it may be updated by other
    threads. (Only the first level - lower levels are replaced
    wholesale, so don't worry about them.)
    """
    waters = {}
    # NOTE: the loop variable is deliberately NOT named `topics` - that
    # would shadow the module-level global of the same name.
    for node_topics in node_waters.copy().values():
        for topic, partitions in node_topics.items():
            waters[topic] = {**waters.get(topic, {}), **partitions}
    return waters


def build_highwaters():
    """Return the current topic -> partition -> high-water offsets."""
    return _merge_node_waters(node_highwaters)


def build_lowwaters():
    """Return the current topic -> partition -> low-water offsets."""
    return _merge_node_waters(node_lowwaters)


def fetch_topics(client, callback):
    """Request topic and partition-leader metadata from the cluster.

    The response is delivered asynchronously to `callback`, which is
    called with (api_version, metadata_response).
    """
    logging.info('Requesting topics and partition assignments')

    try:
        node = client.least_loaded_node()

        logging.debug('Requesting topics and partition assignments from %(node)s',
                      {'node': node})

        # MetadataRequest v1 is only supported by brokers from 0.10 on;
        # both versions accept None/empty to mean "all topics".
        api_version = 0 if client.config['api_version'] < (0, 10) else 1
        request = MetadataRequest[api_version](None)
        f = client.send(node, request)
        f.add_callback(callback, api_version)

    except Exception:
        logging.exception('Error requesting topics and partition assignments')


def _group_partitions_by_leader():
    """Invert the global `topics` map into node -> topic -> [partition, ...].

    Offset requests must go to each partition's leader, so group the
    partitions by the node they have to be fetched from.
    """
    nodes = {}
    for topic, partition_map in topics.items():
        for partition, leader in partition_map.items():
            nodes.setdefault(leader, {}).setdefault(topic, []).append(partition)
    return nodes


def _prune_node_waters(node_waters, nodes):
    """Build a new water-marks dict with only the given nodes.

    I.e. only the nodes that are leaders of at least one topic - the
    ones we will be sending requests to. Removes old nodes, and adds
    empty dicts for new nodes. Values will be populated/updated with
    values we get in the response from each node. Topics/partitions on
    old nodes may disappear briefly before they reappear on their new
    nodes.
    """
    return {node: node_waters.get(node, {}) for node in nodes}


def _send_offset_requests(client, callback, nodes, reset_strategy, label):
    """Send an OffsetRequest for the grouped partitions to each leader.

    `callback` is invoked asynchronously with (node, offsets_response).
    `label` names the water mark in log messages. (The log context used
    to carry a stale `topic` variable from an earlier loop - only the
    node is meaningful here.)
    """
    for node, topic_map in nodes.items():
        logging.debug('Requesting %(label)s from %(node)s',
                      {'label': label, 'node': node})

        request = OffsetRequest[0](
            -1,  # replica_id of -1 marks this as a regular consumer request
            [(topic,
              [(partition, reset_strategy, 1)
               for partition in partitions])
             for topic, partitions in topic_map.items()]
        )
        f = client.send(node, request)
        f.add_callback(callback, node)


def fetch_highwater(client, callback):
    """Request high-water (latest) offsets for every known partition.

    Does nothing until topic metadata has been fetched. Responses are
    handled asynchronously by `callback`, called with
    (node, offsets_response).
    """
    try:
        if topics:
            logging.info('Requesting high-water marks')

            nodes = _group_partitions_by_leader()

            global node_highwaters
            node_highwaters = _prune_node_waters(node_highwaters, nodes)

            _send_offset_requests(client, callback, nodes,
                                  OffsetResetStrategy.LATEST,
                                  'high-water marks')

    except Exception:
        logging.exception('Error requesting high-water marks')


def fetch_lowwater(client, callback):
    """Request low-water (earliest) offsets for every known partition.

    Does nothing until topic metadata has been fetched. Responses are
    handled asynchronously by `callback`, called with
    (node, offsets_response).
    """
    try:
        if topics:
            logging.info('Requesting low-water marks')

            nodes = _group_partitions_by_leader()

            global node_lowwaters
            node_lowwaters = _prune_node_waters(node_lowwaters, nodes)

            _send_offset_requests(client, callback, nodes,
                                  OffsetResetStrategy.EARLIEST,
                                  'low-water marks')

    except Exception:
        logging.exception('Error requesting low-water marks')


def update_topics(api_version, metadata):
    """Callback for fetch_topics: rebuild the global `topics` map.

    `metadata` is a MetadataResponse; the tuple layout differs slightly
    between response versions, hence the index constants.
    """
    logging.info('Received topics and partition assignments')

    TOPIC_ERROR = 0
    TOPIC_NAME = 1
    # v1 responses insert an is_internal flag before the partition list.
    TOPIC_PARTITIONS = 2 if api_version == 0 else 3
    PARTITION_ERROR = 0
    PARTITION_NUMBER = 1
    PARTITION_LEADER = 2

    new_topics = {}
    for t in metadata.topics:
        error_code = t[TOPIC_ERROR]
        if error_code:
            error = Errors.for_code(error_code)(t)
            logging.warning('Received error in metadata response at topic level: %s', error)
        else:
            topic = t[TOPIC_NAME]
            partitions = t[TOPIC_PARTITIONS]

            new_partitions = {}
            for p in partitions:
                error_code = p[PARTITION_ERROR]
                if error_code:
                    error = Errors.for_code(error_code)(p)
                    logging.warning('Received error in metadata response at partition level for topic %(topic)s: %(error)s',
                                    {'topic': topic, 'error': error})
                else:
                    partition = p[PARTITION_NUMBER]
                    leader = p[PARTITION_LEADER]
                    logging.debug('Received partition assignment for partition %(partition)s of topic %(topic)s',
                                  {'partition': partition, 'topic': topic})

                    new_partitions[partition] = leader

            new_topics[topic] = new_partitions

    global topics
    topics = new_topics


def _parse_node_offsets(offsets_response, label):
    """Extract topic -> partition -> offset from one node's OffsetResponse.

    Partitions whose entries carry an error code are logged and skipped.
    `label` names the water mark in log messages. (The per-partition
    offsets list is deliberately unpacked into `partition_offsets` so it
    doesn't shadow the response argument, as the original code did.)
    """
    waters = {}
    for topic, partitions in offsets_response.topics:
        for partition, error_code, partition_offsets in partitions:
            if error_code:
                error = Errors.for_code(error_code)((partition, error_code, partition_offsets))
                logging.warning('Received error in offset response for topic %(topic)s: %(error)s',
                                {'topic': topic, 'error': error})
            else:
                logging.debug('Received %(label)s for partition %(partition)s of topic %(topic)s',
                              {'label': label, 'partition': partition, 'topic': topic})

                # Only one offset was requested per partition, so the
                # list holds a single value.
                waters.setdefault(topic, {})[partition] = partition_offsets[0]

    return waters


def update_highwater(node, offsets):
    """Callback for fetch_highwater: record one node's high-water marks."""
    logging.info('Received high-water marks from node {}'.format(node))

    global node_highwaters
    node_highwaters[node] = _parse_node_offsets(offsets, 'high-water marks')


def update_lowwater(node, offsets):
    """Callback for fetch_lowwater: record one node's low-water marks."""
    logging.info('Received low-water marks from node {}'.format(node))

    global node_lowwaters
    node_lowwaters[node] = _parse_node_offsets(offsets, 'low-water marks')


def setup_fetch_jobs(topic_interval, high_water_interval,
                     low_water_interval, client, jobs=None):
    """Register the three periodic fetch jobs on a scheduler jobs list.

    Returns the new jobs list (the input list, if given, is not
    mutated); the caller runs it via scheduler.run_scheduled_jobs.
    """
    if jobs is None:
        jobs = []

    jobs = scheduler.add_scheduled_job(jobs, topic_interval,
                                       fetch_topics, client, update_topics)
    jobs = scheduler.add_scheduled_job(jobs, high_water_interval,
                                       fetch_highwater, client, update_highwater)
    jobs = scheduler.add_scheduled_job(jobs, low_water_interval,
                                       fetch_lowwater, client, update_lowwater)
    return jobs
import logging
from struct import unpack_from, error as struct_error


def read_short(bytes):
    """Read a big-endian int16 off the front of `bytes`.

    Returns (value, remaining_bytes). Raises struct.error if fewer than
    2 bytes are available.
    """
    num = unpack_from('>h', bytes)[0]
    remaining = bytes[2:]
    return (num, remaining)


def read_int(bytes):
    """Read a big-endian int32 off the front of `bytes`.

    Returns (value, remaining_bytes).
    """
    num = unpack_from('>i', bytes)[0]
    remaining = bytes[4:]
    return (num, remaining)


def read_long_long(bytes):
    """Read a big-endian int64 off the front of `bytes`.

    Returns (value, remaining_bytes).
    """
    num = unpack_from('>q', bytes)[0]
    remaining = bytes[8:]
    return (num, remaining)


def read_string(bytes):
    """Read an int16-length-prefixed UTF-8 string off the front of `bytes`.

    Returns (string, remaining_bytes).
    NOTE(review): a negative length (the Kafka protocol's "null string"
    encoding) is not handled specially here - presumably it never occurs
    in these messages; confirm before relying on it.
    """
    length, remaining = read_short(bytes)
    string = remaining[:length].decode('utf-8')
    remaining = remaining[length:]
    return (string, remaining)


def parse_key(bytes):
    """Parse the key of a __consumer_offsets topic message.

    Returns a dict with 'version' plus, for offset commit keys
    (versions 0 and 1), 'group', 'topic' and 'partition', or, for group
    metadata keys (version 2), 'group'. Returns None for unsupported
    versions or unparseable keys.
    """
    try:
        (version, remaining_key) = read_short(bytes)
        key_dict = {'version': version}

        # These two versions are for offset commit messages.
        if version in (0, 1):
            key_dict['group'], remaining_key = read_string(remaining_key)
            key_dict['topic'], remaining_key = read_string(remaining_key)
            key_dict['partition'], remaining_key = read_int(remaining_key)

        # This version is for group metadata messages.
        # (we don't support parsing their values currently)
        elif version == 2:
            key_dict['group'], remaining_key = read_string(remaining_key)

        else:
            logging.error('Can\'t parse __consumer_offsets topic message key with'
                          ' unsupported version %(version)s.',
                          {'version': version})
            return None

        return key_dict

    except struct_error:
        logging.warning('Failed to parse key from __consumer_offsets topic message.'
                        ' Key: %(key_bytes)s',
                        {'key_bytes': bytes},
                        exc_info=True)
        return None


def parse_value(bytes):
    """Parse the value of a __consumer_offsets offset commit message.

    Returns a dict with 'version', 'offset', 'metadata' and the
    version-specific timestamp/epoch fields, or None for unsupported
    versions or unparseable values.
    """
    try:
        # The running buffer is named remaining_value (the original code
        # reused parse_key's remaining_key name here).
        (version, remaining_value) = read_short(bytes)
        value_dict = {'version': version}

        if version == 0:
            value_dict['offset'], remaining_value = read_long_long(remaining_value)
            value_dict['metadata'], remaining_value = read_string(remaining_value)
            value_dict['timestamp'], remaining_value = read_long_long(remaining_value)

        elif version == 1:
            value_dict['offset'], remaining_value = read_long_long(remaining_value)
            value_dict['metadata'], remaining_value = read_string(remaining_value)
            value_dict['commit_timestamp'], remaining_value = read_long_long(remaining_value)
            value_dict['expire_timestamp'], remaining_value = read_long_long(remaining_value)

        elif version == 2:
            value_dict['offset'], remaining_value = read_long_long(remaining_value)
            value_dict['metadata'], remaining_value = read_string(remaining_value)
            value_dict['commit_timestamp'], remaining_value = read_long_long(remaining_value)

        elif version == 3:
            value_dict['offset'], remaining_value = read_long_long(remaining_value)
            value_dict['leader_epoch'], remaining_value = read_int(remaining_value)
            value_dict['metadata'], remaining_value = read_string(remaining_value)
            value_dict['commit_timestamp'], remaining_value = read_long_long(remaining_value)

        else:
            logging.error('Can\'t parse __consumer_offsets topic message value with'
                          ' unsupported version %(version)s.',
                          {'version': version})
            return None

        return value_dict

    except struct_error:
        logging.warning('Failed to parse value from __consumer_offsets topic message.'
                        ' Value: %(value_bytes)s',
                        {'value_bytes': bytes},
                        exc_info=True)
        return None
import time
import logging


def _by_scheduled_time(job):
    """Sort key: a job's scheduled run time (first tuple element)."""
    return job[0]


def add_scheduled_job(jobs, interval, func, *args, **kwargs):
    """Return a copy of `jobs` with a new job appended.

    The job runs `func(*args, **kwargs)` every `interval` seconds,
    starting right away. Jobs are (scheduled_time, interval, func, args,
    kwargs) tuples; the input list is not mutated.
    """
    jobs = jobs.copy()

    # Schedule the new job to run right away
    next_scheduled_time = time.monotonic()
    jobs.append((next_scheduled_time, interval, func, args, kwargs))

    # No real need to sort the jobs here, but nice to
    # if we're going to inspect the jobs list later.
    # Sort on the scheduled time only: comparing whole job tuples would
    # raise TypeError when two jobs tie on time and interval, since
    # functions and dicts aren't orderable in Python 3.
    return sorted(jobs, key=_by_scheduled_time)


def run_scheduled_jobs(jobs):
    """Run every due job in `jobs` and return the new jobs list.

    Run jobs are rescheduled for their next interval (skipping any
    missed slots so the next run is in the future). Exceptions from a
    job are logged and don't prevent other jobs from running, or the
    failing job from being rescheduled.
    """
    if not jobs:
        return jobs

    current_time = time.monotonic()

    # Sort jobs to run in scheduled time order to run them as
    # close to their scheduled time as possible.
    to_run = sorted((job for job in jobs if job[0] <= current_time),
                    key=_by_scheduled_time)

    if not to_run:
        return jobs

    # We'll sort the remaining jobs later, so don't bother now.
    jobs = [job for job in jobs if job[0] > current_time]

    for scheduled_time, interval, func, args, kwargs in to_run:
        try:
            func(*args, **kwargs)
        except Exception:
            logging.exception('Error while running scheduled job; function: %(func_name)s, args: %(args)r, kwargs: %(kwargs)r',
                              {'func_name': func.__name__, 'args': args, 'kwargs': kwargs})

        # Make sure next scheduled time is past the current time,
        # in case we skipped some runs for some reason.
        next_scheduled_time = scheduled_time + interval
        while next_scheduled_time < current_time:
            next_scheduled_time += interval

        jobs.append((next_scheduled_time, interval, func, args, kwargs))

    # No real need to sort the jobs here, but nice to
    # if we're going to inspect the jobs list later.
    return sorted(jobs, key=_by_scheduled_time)
from setuptools import setup, find_packages

setup(
    name='prometheus-kafka-consumer-group-exporter',
    version='0.5.5',
    description='Kafka consumer group Prometheus exporter',
    url='https://github.com/Braedon/prometheus-kafka-consumer-group-exporter',
    author='Braedon Vickers',
    author_email='braedon.vickers@gmail.com',
    license='MIT',
    classifiers=[
        'Development Status :: 4 - Beta',
        'Intended Audience :: Developers',
        'Intended Audience :: System Administrators',
        'Topic :: System :: Monitoring',
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.4',
        'Programming Language :: Python :: 3.5',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
    ],
    keywords='monitoring prometheus exporter kafka consumer group',
    packages=find_packages(),
    # Enforce the supported interpreters advertised in the classifiers
    # above, so pip refuses to install on older Pythons.
    python_requires='>=3.4',
    install_requires=[
        # kafka-python 1.4.5 included a number of bugs and a severe drop
        # in consumer performance. 1.4.6 fixed the bugs, but the performance
        # issues remained. 1.4.7 fixed the performance issues.
        'kafka-python >= 1.3, != 1.4.5, != 1.4.6',
        'jog',
        'prometheus-client >= 0.6.0',
        'javaproperties'
    ],
    entry_points={
        'console_scripts': [
            'prometheus-kafka-consumer-group-exporter=prometheus_kafka_consumer_group_exporter:main',
        ],
    },
)