├── .babelrc.js ├── .gitignore ├── LICENSE ├── README.md ├── app ├── LaunchWindow.jsx ├── MainWindow.jsx └── MetricTray.jsx ├── build ├── 2e0f6edb704eeb6183ad.jpg ├── bundle.js └── bundle.js.LICENSE.txt ├── configs ├── alertmanager │ └── alertmanager_template.yml ├── configGenerator.js ├── docker │ ├── docker_multiple_nodes_template.yml │ └── docker_single_node_template.yml ├── grafana │ ├── provisioning │ │ ├── dashboards │ │ │ └── kafka.yaml │ │ └── datasources │ │ │ └── datasource.yaml │ └── templates │ │ ├── broker_hard_disk_usage.json │ │ ├── broker_jvm_os.json │ │ ├── broker_performance.json │ │ ├── broker_zookeeper.json │ │ ├── client_consumers_fetch_lag.json │ │ ├── cluster_healthcheck.json │ │ ├── cluster_replication.json │ │ ├── panels_template.js │ │ └── topics_logs.json ├── jmx_exporter │ ├── config_kafka_template.yml │ └── metric_list.js └── prometheus │ ├── alert_rules.yml │ └── prometheus_template.yml ├── electron.js ├── package-lock.json ├── package.json ├── src ├── assets │ ├── app-logo.png │ ├── github-logo.jpg │ ├── icon-mac.png │ ├── icon-windows.png │ ├── launch-demo.gif │ ├── main-demo.gif │ └── white-icon.png ├── components │ ├── App.jsx │ ├── BrokerHardDiskUsage.jsx │ ├── BrokerJVMAndOS.jsx │ ├── BrokerPerformance.jsx │ ├── BrokerZookeeper.jsx │ ├── ClusterHealthCheck.jsx │ ├── ClusterReplication.jsx │ ├── GrafanaDash.jsx │ ├── HelpTab.jsx │ ├── Home.jsx │ ├── Launch.jsx │ ├── MetricCard.jsx │ ├── Sidebar.jsx │ ├── TopicsLogs.jsx │ ├── _utils │ │ ├── displayMetrics.js │ │ └── renderMetricPanels.js │ └── alerts.jsx ├── index.html ├── index.js ├── launch.html ├── models │ └── metricURLs.js └── styles │ ├── styles.css │ ├── styles.css.map │ └── styles.scss └── webpack.config.js /.babelrc.js: -------------------------------------------------------------------------------- 1 | // only bundles the necessary material UI elements to reduce bundle size / compilation time 2 | 3 | const plugins = [ 4 | [ 5 | 'babel-plugin-import', 6 | { 7 | libraryName: '@mui/material', 8 | libraryDirectory: '', 9 | camel2DashComponentName: false, 10 | }, 11 | 'core', 12 | ], 13 | [ 14 | 'babel-plugin-import', 15 | { 16 | libraryName: '@mui/icons-material', 17 | libraryDirectory: '', 18 | camel2DashComponentName: false, 19 | }, 20 | 'icons', 21 | ], 22 | ]; 23 | 24 | module.exports = { plugins }; 25 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | node_modules 2 | 3 | # auto-generated config files 4 | configs/grafana/dashboards/* 5 | configs/docker/docker_multiple_nodes.yml 6 | configs/prometheus/prometheus.yml 7 | configs/jmx_exporter/config_kafka1* 8 | configs/alertmanager/alertmanager.yml 9 | configs/docker/docker_single_node.yml -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 OSLabs Beta 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission 
notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 
3 | Logo
4 | 
5 | Intuitive, customizable Kafka cluster metrics
6 | 
7 | Product Website »
8 | 
9 | 
10 | GitHub issues
11 | GitHub last commit
12 | 
13 | 
14 | 
16 | 17 | --- 18 | 19 | ## Table of Contents 🗺️ 20 | 21 | - [About Kaffia](#about) 22 | - [Features](#features) 23 | - [Getting Started](#getting-started) 24 | - [Contributors](#contributors) 25 | - [Become a contributor](#contribute) 26 | - [License](#license) 27 | 28 | --- 29 | 30 | ## About Kaffia 📙 31 | 32 | Apache Kafka is one of the most widely used stream-processing platforms, yet it lacks a free, easy-to-use GUI to monitor key Kafka cluster metrics. Without a comprehensive monitoring tool, the barrier to entry of Apache Kafka remains high, and engineers working with Kafka clusters may miss key issues as they come up. 33 | 34 | That's why we created Kaffia. Kaffia is an open-source, intuitive GUI for Kafka clusters that allows you to tailor Kafka cluster monitoring to your needs and experience level. All you have to do is finish some quick setup, launch the app, input your monitoring and Kafka broker preferences, and let Kaffia handle the rest! 35 | 36 | Read on to see some of our core features and learn how you can get Kaffia up and running on your machine. 37 | 38 | --- 39 | 40 | ## Core Features 😊 41 | 42 | ### Easy setup ✅ 43 | 44 | - Easily configure your Kafka cluster broker count, monitor certain metrics, and sign up for email alerts 45 | - One button launches your cluster, metrics scrapers, visualization service, and alert manager—automatically! 46 | 47 | ### Comprehensive visualization 🔎 48 | 49 | - View key Kafka metrics (broker count, throughput, topic size, etc.) 50 | - Live updates as you produce to and consume from your cluster 51 | - Intuitive GUI to make parsing complex data less cumbersome 52 | 53 | ### Quickly launch and stop your cluster ⚡️ 54 | 55 | - Containerize your cluster in Docker Desktop with the click of a button 56 | - Easily shut down your cluster from the Kaffia dashboard 57 | 58 | --- 59 | 60 | ## Getting Started with Kaffia ⬆️ 61 | 62 | Kaffia automates cluster configuration and launching by creating a Docker application that containerizes everything from Zookeeper to the metrics scraper. Make sure you have [Docker Desktop](https://www.docker.com/products/docker-desktop/) up and running before you launch Kaffia. 63 | 64 | To get started, fork our repository and clone it to your local machine. To install all dependencies, run the following: 65 | 66 | ```sh 67 | npm install 68 | ``` 69 | 70 | After the dependencies install, you're good to go! Just spin up the app by running this command inside of the Kaffia directory: 71 | 72 | ```sh 73 | npm start 74 | ``` 75 | 76 | Once Kaffia is up and running, configuring and monitoring your cluster is simple! Just choose your broker count and metrics from the launch screen, hit submit, and watch your cluster launch automatically in Docker Desktop! 77 | 78 | 79 | 80 | After the cluster launches, you'll be able to navigate throughout the app and view different key metrics that will help you monitor your cluster's health without having to do any setup on your end. Enjoy! 81 | 82 | 83 | 84 | --- 85 | 86 | ## Contributors 👋 87 | 88 | - Liz Blackledge 89 | - Aiden Blinn 90 | - Ritchie Cervantes 91 | - Jonathan Oh 92 | 93 | ### Contribute to Kaffia 💪 94 | 95 | We welcome any and all contributions to Kaffia! You can reach out to one of us on LinkedIn if you have any ideas, or you can fork the repository, make some changes, and submit a pull request. 
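If you'd like to see your changes against live data, it helps to push some traffic through the cluster so the dashboards have something to show. Here's a minimal sketch, assuming the default single-broker setup (which names the broker container `kafka101` and exposes it on `localhost:9092`) and the Kafka CLI tools bundled in the `confluentinc/cp-kafka` image — exact flag names can vary between Kafka versions:

```sh
# create a throwaway topic on the running broker container
docker exec kafka101 kafka-topics --bootstrap-server localhost:9092 \
  --create --topic kaffia-test --partitions 1 --replication-factor 1

# produce a few messages (type lines, then Ctrl+C to stop)
docker exec -it kafka101 kafka-console-producer \
  --bootstrap-server localhost:9092 --topic kaffia-test

# consume them back to drive fetch request activity on the broker
docker exec -it kafka101 kafka-console-consumer \
  --bootstrap-server localhost:9092 --topic kaffia-test --from-beginning
```

After a few Prometheus scrape intervals, the test topic and its traffic should appear on the topics/logs and broker performance dashboards.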
96 | 97 | --- 98 | 99 | ## License 🧐 100 | 101 | MIT License 102 | -------------------------------------------------------------------------------- /app/LaunchWindow.jsx: -------------------------------------------------------------------------------- 1 | /** 2 | * LaunchWindow takes advantage of Electron's built-in 3 | * BrowserWindow to create a window with consistent sizing 4 | * and settings tailored to its intended use. 5 | */ 6 | 7 | const electron = require('electron'); 8 | 9 | const { BrowserWindow, app } = electron; 10 | 11 | class LaunchWindow extends BrowserWindow { 12 | constructor(url) { 13 | super({ 14 | webPreferences: { 15 | nodeIntegration: true, 16 | contextIsolation: false, 17 | backgroundThrottling: false, 18 | }, 19 | height: 700, 20 | width: 500, 21 | frame: true, 22 | autoHideMenuBar: false, 23 | resizable: process.env.NODE_ENV === 'development', 24 | show: true, 25 | }); 26 | this.on('closed', () => app.quit()); 27 | this.loadURL(url); 28 | } 29 | } 30 | 31 | module.exports = LaunchWindow; 32 | -------------------------------------------------------------------------------- /app/MainWindow.jsx: -------------------------------------------------------------------------------- 1 | /** 2 | * MainWindow takes advantage of Electron's built-in 3 | * BrowserWindow to create a window with consistent sizing 4 | * and settings tailored to its intended use. 5 | */ 6 | 7 | const electron = require('electron'); 8 | 9 | const { BrowserWindow, app } = electron; 10 | 11 | class MainWindow extends BrowserWindow { 12 | constructor(url) { 13 | super({ 14 | webPreferences: { 15 | nodeIntegration: true, 16 | contextIsolation: false, 17 | backgroundThrottling: false, 18 | }, 19 | height: 800, 20 | minHeight: 500, 21 | width: 1100, 22 | minWidth: 800, 23 | frame: true, 24 | autoHideMenuBar: false, 25 | resizable: true, 26 | show: true, 27 | }); 28 | this.on('closed', () => app.quit()); 29 | this.loadURL(url); 30 | } 31 | } 32 | 33 | module.exports = MainWindow; 34 | -------------------------------------------------------------------------------- /app/MetricTray.jsx: -------------------------------------------------------------------------------- 1 | /** 2 | * The Metric Tray is a feature for a future release. This tray 3 | * is designed to provide the user a quick glance at core metrics 4 | * by just clicking on an icon in the user's menu without having 5 | * to open the whole app and look at the entire dashboard. 6 | */ 7 | 8 | const electron = require('electron'); 9 | const { Tray, Menu } = electron; 10 | 11 | class MetricTray extends Tray { 12 | constructor(iconPath, popupWindow) { 13 | super(iconPath); 14 | this.popupWindow = popupWindow; 15 | this.on('click', this.onClick); 16 | this.on('right-click', this.onRightClick); 17 | this.setToolTip('Kaffia'); 18 | } 19 | 20 | onClick = (event, bounds) => { 21 | const { x, y } = bounds; 22 | const { height, width } = this.popupWindow.getBounds(); 23 | 24 | if (this.popupWindow.isVisible()) { 25 | this.popupWindow.hide(); 26 | } else { 27 | this.popupWindow.setBounds({ 28 | x: x - width / 2, 29 | y: process.platform === 'darwin' ? 
y : y - height, 30 | height, 31 | width, 32 | }); 33 | this.popupWindow.show(); 34 | } 35 | }; 36 | 37 | onRightClick = (event) => { 38 | const menuConfig = Menu.buildFromTemplate([{ role: 'quit' }]); 39 | this.popUpContextMenu(menuConfig); 40 | }; 41 | } 42 | 43 | module.exports = MetricTray; 44 | -------------------------------------------------------------------------------- /build/2e0f6edb704eeb6183ad.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oslabs-beta/Kaffia/4b585a0b94846f460c865e2f08103d25d42d0020/build/2e0f6edb704eeb6183ad.jpg -------------------------------------------------------------------------------- /build/bundle.js.LICENSE.txt: -------------------------------------------------------------------------------- 1 | /** @license MUI v5.5.2 2 | * 3 | * This source code is licensed under the MIT license found in the 4 | * LICENSE file in the root directory of this source tree. 5 | */ 6 | 7 | /** @license React v17.0.2 8 | * react-jsx-runtime.production.min.js 9 | * 10 | * Copyright (c) Facebook, Inc. and its affiliates. 11 | * 12 | * This source code is licensed under the MIT license found in the 13 | * LICENSE file in the root directory of this source tree. 14 | */ 15 | -------------------------------------------------------------------------------- /configs/alertmanager/alertmanager_template.yml: -------------------------------------------------------------------------------- 1 | global: 2 | resolve_timeout: 1m 3 | route: 4 | receiver: 'gmail-notifications' 5 | receivers: 6 | - name: 'gmail-notifications' 7 | email_configs: 8 | - to: 9 | from: kaffiamonitor@gmail.com 10 | smarthost: smtp.gmail.com:587 11 | auth_username: kaffiamonitor@gmail.com 12 | auth_identity: kaffiamonitor@gmail.com 13 | auth_password: dhfyxdzixrvlhotl 14 | send_resolved: true 15 | -------------------------------------------------------------------------------- /configs/configGenerator.js: -------------------------------------------------------------------------------- 1 | const yaml = require('js-yaml'); 2 | const fs = require('fs'); 3 | const path = require('path'); 4 | 5 | const grafanaPanels = require('./grafana/templates/panels_template.js'); 6 | const jmxMetrics = require('./jmx_exporter/metric_list.js'); 7 | 8 | /** 9 | * dockerConfigGenerator creates a yaml file for a multi-container Docker application 10 | * that uses Grafana, Prometheus, jmx-exporter, and Kafka to create a self-monitoring 11 | * cluster. 
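 *
 * As a rough illustration of the logic below, dockerConfigGenerator(2, 'user@example.com')
 * (example arguments) would extend the multi-node template's grafana / prometheus / zk1
 * services with kafka101 and kafka102 (confluentinc/cp-kafka, broker ids 101 and 102),
 * jmx-kafka101 and jmx-kafka102 (one jmx-prometheus-exporter per broker), plus an
 * alertManager service, which is only added when an email address is supplied.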
12 | * @param brokerCount: user-specified number of Kafka brokers 13 | * @returns void 14 | * 15 | */ 16 | 17 | const dockerConfigGenerator = (brokerCount, email) => { 18 | if (brokerCount === 1) { 19 | try { 20 | const dockerConfig = yaml.load( 21 | fs.readFileSync( 22 | path.join(__dirname, './docker/docker_single_node_template.yml'), 23 | 'utf8' 24 | ) 25 | ); 26 | if (email) { 27 | dockerConfig.services.alertManager = { 28 | image: 'prom/alertmanager:v0.23.0', 29 | restart: 'unless-stopped', 30 | ports: ['9096:9096'], 31 | volumes: ['../alertmanager:/config', 'alertmanager-data:/data'], 32 | command: '--config.file=/config/alertmanager.yml --log.level=debug', 33 | depends_on: ['prometheus'], 34 | }; 35 | } 36 | return fs.writeFileSync( 37 | path.join(__dirname, 'docker/docker_single_node.yml'), 38 | yaml.dump(dockerConfig, { noRefs: true }) 39 | ); 40 | } catch (e) { 41 | return console.log(e); 42 | } 43 | } 44 | try { 45 | // load in the multi-container Docker yaml template 46 | const dockerConfig = yaml.load( 47 | fs.readFileSync( 48 | path.join(__dirname, './docker/docker_multiple_nodes_template.yml'), 49 | 'utf8' 50 | ) 51 | ); 52 | 53 | // define the properties for the jmx-exporter and Kafka cluster services 54 | // that are the same in each cluster 55 | let jmxConfig = { 56 | image: 'sscaling/jmx-prometheus-exporter', 57 | environment: { 58 | CONFIG_YML: '/../jmx_exporter/config.yml', 59 | JVM_OPTS: '-Xmx512M', 60 | }, 61 | }; 62 | let kafkaConfig = { 63 | image: 'confluentinc/cp-kafka:latest', 64 | depends_on: ['zk1'], 65 | environment: { 66 | KAFKA_ZOOKEEPER_CONNECT: 'zookeeper1:2181', 67 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: 68 | 'PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT', 69 | KAFKA_INTER_BROKER_LISTENER_NAME: 'PLAINTEXT', 70 | CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: 'zookeeper1:2181', 71 | CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1, 72 | CONFLUENT_METRICS_ENABLE: 'false', 73 | KAFKA_HEAP_OPTS: '-Xmx512M -Xms512M', 74 | }, 75 | }; 76 | 77 | // generate the unique, cluster-specific properties for the jmx-exporter 78 | // and Kafka services according to the user's preferred broker count 79 | for (let i = 0; i < brokerCount; i++) { 80 | jmxConfig = { 81 | ...jmxConfig, 82 | ports: [`${5556 + i}:5556`], 83 | volumes: [ 84 | `../jmx_exporter/config_kafka10${ 85 | i + 1 86 | }.yml:/../jmx_exporter/config.yml`, 87 | ], 88 | container_name: `jmx-kafka${101 + i}`, 89 | depends_on: [`kafka${101 + i}`], 90 | }; 91 | dockerConfig.services[`jmx-kafka${101 + i}`] = jmxConfig; 92 | 93 | kafkaConfig = { 94 | ...kafkaConfig, 95 | ports: [`909${i + 1}:909${i + 1}`, `999${i + 1}:999${i + 1}`], 96 | container_name: `kafka${101 + i}`, 97 | environment: { 98 | ...kafkaConfig.environment, 99 | KAFKA_BROKER_ID: 101 + i, 100 | KAFKA_JMX_PORT: 9991 + i, 101 | KAFKA_ADVERTISED_LISTENERS: `PLAINTEXT://kafka10${ 102 | i + 1 103 | }:29092,PLAINTEXT_HOST://localhost:909${i + 1}`, 104 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 105 | brokerCount < 3 ? brokerCount : 3, 106 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 107 | brokerCount < 3 ? 
brokerCount : 3, 108 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: `kafka10${i + 1}:29092`, 109 | }, 110 | }; 111 | dockerConfig.services[`kafka${101 + i}`] = kafkaConfig; 112 | } 113 | 114 | if (email) { 115 | dockerConfig.services.alertManager = { 116 | image: 'prom/alertmanager:v0.23.0', 117 | restart: 'unless-stopped', 118 | ports: ['9096:9096'], 119 | volumes: ['../alertmanager:/config', 'alertmanager-data:/data'], 120 | command: '--config.file=/config/alertmanager.yml --log.level=debug', 121 | depends_on: ['prometheus'], 122 | }; 123 | } 124 | // after adding the required services for each broker, save the completed template 125 | // to a filepath that will then be used to launch the Docker app 126 | fs.writeFileSync( 127 | path.join(__dirname, './docker/docker_multiple_nodes.yml'), 128 | yaml.dump(dockerConfig, { noRefs: true }) 129 | ); 130 | } catch (e) { 131 | return console.log(e); 132 | } 133 | }; 134 | 135 | /** 136 | * promConfigGenerator creates a yaml file for a multi-container Docker application 137 | * to enable Prometheus monitoring of the jmx-exporter exposing Kafka metrics 138 | * @param brokerCount: user-specified number of Kafka brokers 139 | * @returns void 140 | * 141 | */ 142 | 143 | const promConfigGenerator = (brokerCount, email) => { 144 | try { 145 | // read in prometheus.yml template file and add jmx-exporter ports depending on 146 | // the user's preferred number of Kafka brokers 147 | const promConfig = yaml.load( 148 | fs.readFileSync( 149 | path.join(__dirname, 'prometheus/prometheus_template.yml'), 150 | 'utf8' 151 | ) 152 | ); 153 | const promTargets = []; 154 | for (let i = 0; i < brokerCount; i++) { 155 | promTargets.push(`jmx-kafka10${i + 1}:5556`); 156 | } 157 | 158 | // add ports to scrape to the yml file and save changes into completed yml file 159 | promConfig.scrape_configs[0].static_configs[0].targets = promTargets; 160 | 161 | if (email) { 162 | promConfig.rule_files[0] = ['alert_rules.yml']; 163 | promConfig.alerting.alertmanagers[0].static_configs[0].targets[0] = [ 164 | 'alertmanager:9096', 165 | ]; 166 | } 167 | 168 | fs.writeFileSync( 169 | path.join(__dirname, 'prometheus/prometheus.yml'), 170 | yaml.dump(promConfig, { noRefs: true }) 171 | ); 172 | } catch (e) { 173 | console.log(e); 174 | } 175 | }; 176 | 177 | /** 178 | * promConfigGenerator creates multiple yaml files for the jmx-exporters to only 179 | * expose the metrics that the user would like to view. Additionally, the method 180 | * constructs Grafana dashboards using those user-specified metrics. 
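 *
 * Sketch of the expected userMetrics shape (assumed from the loop below): keys name the
 * dashboard templates in grafana/templates, and each value is an array of panel keys
 * defined in grafana/templates/panels_template.js (panel keys shown here are placeholders):
 *   {
 *     cluster_healthcheck: ['<panel key>', '<panel key>'],
 *     broker_performance: ['<panel key>'],
 *   }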
181 | * @param brokerCount: user-specified number of Kafka brokers 182 | * @returns void 183 | * 184 | */ 185 | 186 | const jvmGrafanaConfigGenerator = (brokerCount, userMetrics) => { 187 | // create Grafana dashboard folder if it does not already exist 188 | if (!fs.existsSync(path.join(__dirname, 'grafana/dashboards'))) { 189 | fs.mkdirSync(path.join(__dirname, 'grafana/dashboards')); 190 | } 191 | // delete existing dashboards from previous cluster creation, which may have different settings 192 | const existingDashboards = fs.readdirSync( 193 | path.join(__dirname, 'grafana/dashboards') 194 | ); 195 | existingDashboards.forEach((dashboard) => { 196 | fs.unlinkSync(path.join(__dirname, 'grafana/dashboards', dashboard)); 197 | }); 198 | 199 | // load template for jmx-exporter to add specific metrics to whtielist 200 | const config_kafka_template = yaml.load( 201 | fs.readFileSync( 202 | path.join(__dirname, 'jmx_exporter/config_kafka_template.yml'), 203 | 'utf8' 204 | ) 205 | ); 206 | 207 | // loop through each user-selected dashboard to add panels to Prometheus file 208 | for (const dashboard in userMetrics) { 209 | // grab Grafana file corresponding to selected dashboard 210 | const grafanaFile = JSON.parse( 211 | fs.readFileSync( 212 | path.join(__dirname, `grafana/templates/${dashboard}.json`), 213 | 'utf-8' 214 | ) 215 | ); 216 | // add jmx-exporter metric to whitelist to scrape that item 217 | // add panel to Grafana dashboard 218 | for (const panel of userMetrics[dashboard]) { 219 | // jmxMetrics[dashboard][panel].forEach(whitelist.add, whitelist); 220 | grafanaFile.panels.push(...grafanaPanels[dashboard][panel]); 221 | } 222 | fs.writeFileSync( 223 | path.join(__dirname, `./grafana/dashboards/${dashboard}.json`), 224 | JSON.stringify(grafanaFile), 225 | { noRefs: true } 226 | ); 227 | } 228 | // config_kafka_template.whitelistObjectNames = [...whitelist]; 229 | 230 | // save jmx-exporter config files with correct ports 231 | for (let i = 0; i < brokerCount; i++) { 232 | config_kafka_template.hostPort = `kafka10${i + 1}:999${i + 1}`; 233 | fs.writeFileSync( 234 | path.join(__dirname, `./jmx_exporter/config_kafka10${i + 1}.yml`), 235 | yaml.dump(config_kafka_template, { noRefs: true }) 236 | ); 237 | } 238 | }; 239 | 240 | /** 241 | * alertConfigGenerator creates a yaml file to alert the user 242 | * about critical cluster issues using the email provided on the 243 | * launch page form. 
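 *
 * Only the `to:` address of the template's gmail-notifications receiver is overwritten;
 * all other fields are copied from alertmanager_template.yml. For example (illustrative
 * address), alertConfigGenerator('user@example.com') writes an alertmanager.yml whose
 * receiver block looks roughly like:
 *   receivers:
 *     - name: gmail-notifications
 *       email_configs:
 *         - to: user@example.com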
244 | * @param email: user-specified email address 245 | * @returns void 246 | * 247 | */ 248 | 249 | const alertConfigGenerator = (email) => { 250 | try { 251 | // read in alert manager template file and update with email 252 | // provided by user 253 | const alertManager = yaml.load( 254 | fs.readFileSync( 255 | path.join(__dirname, 'alertmanager/alertmanager_template.yml'), 256 | 'utf8' 257 | ) 258 | ); 259 | alertManager.receivers[0].email_configs[0].to = email; 260 | // save updated file to be used when Docker launches 261 | // the necessary containerized services 262 | fs.writeFileSync( 263 | path.join(__dirname, 'alertmanager/alertmanager.yml'), 264 | yaml.dump(alertManager, { noRefs: true }) 265 | ); 266 | } catch (e) { 267 | console.log(e); 268 | } 269 | }; 270 | 271 | module.exports = (brokerCount, metrics, email) => { 272 | // run all three (or four) config methods each time user submits form with preferences 273 | promConfigGenerator(brokerCount, email); 274 | jvmGrafanaConfigGenerator(brokerCount, metrics); 275 | if (email) alertConfigGenerator(email); 276 | dockerConfigGenerator(brokerCount, email); 277 | }; 278 | -------------------------------------------------------------------------------- /configs/docker/docker_multiple_nodes_template.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | grafana: 4 | image: 'grafana/grafana' 5 | ports: 6 | - '3000:3000' 7 | environment: 8 | GF_PATHS_DATA: /var/lib/grafana 9 | GF_SECURITY_ALLOW_EMBEDDING: 'true' 10 | GF_AUTH_ANONYMOUS_ENABLED: 'true' 11 | GF_SMTP_ENABLED: 'true' 12 | GF_SECURITY_ADMIN_PASSWORD: kaffia 13 | volumes: 14 | - ../grafana/provisioning:/etc/grafana/provisioning 15 | - ../grafana/dashboards:/var/lib/grafana/dashboards 16 | container_name: grafana 17 | depends_on: 18 | - prometheus 19 | prometheus: 20 | image: 'prom/prometheus' 21 | ports: 22 | - '9090:9090' 23 | volumes: 24 | - ../prometheus/prometheus.yml:/../prometheus/prometheus.yml 25 | - ../prometheus/alert_rules.yml:/../prometheus/alert_rules.yml 26 | command: '--config.file=/../prometheus/prometheus.yml' 27 | container_name: prometheus 28 | zk1: 29 | image: confluentinc/cp-zookeeper:latest 30 | environment: 31 | ZOOKEEPER_CLIENT_PORT: 2181 32 | ZOOKEEPER_TICK_TIME: 2000 33 | ZOOKEEPER_INIT_LIMIT: 5 34 | ZOOKEEPER_SYNC_LIMIT: 2 35 | ports: 36 | - 2181:2181 37 | container_name: zookeeper1 38 | volumes: 39 | alertmanager-data: 40 | -------------------------------------------------------------------------------- /configs/docker/docker_single_node_template.yml: -------------------------------------------------------------------------------- 1 | version: '3' 2 | services: 3 | grafana: 4 | image: 'grafana/grafana' 5 | ports: 6 | - '3000:3000' 7 | environment: 8 | GF_PATHS_DATA: /var/lib/grafana 9 | GF_SECURITY_ALLOW_EMBEDDING: 'true' 10 | GF_AUTH_ANONYMOUS_ENABLED: 'true' 11 | GF_ALERTING_ENABLED: 'false' 12 | GF_SMTP_ENABLED: 'true' 13 | GF_SECURITY_ADMIN_PASSWORD: kaffia 14 | volumes: 15 | - ../grafana/provisioning:/etc/grafana/provisioning 16 | - ../grafana/dashboards:/var/lib/grafana/dashboards 17 | container_name: grafana 18 | depends_on: 19 | - prometheus 20 | prometheus: 21 | image: 'prom/prometheus' 22 | ports: 23 | - '9090:9090' 24 | volumes: 25 | - ../prometheus/prometheus.yml:/../prometheus/prometheus.yml 26 | - ../prometheus/alert_rules.yml:/../prometheus/alert_rules.yml 27 | command: '--config.file=/../prometheus/prometheus.yml' 28 | container_name: prometheus 29 | zk1: 30 | image: 
confluentinc/cp-zookeeper:latest 31 | environment: 32 | ZOOKEEPER_CLIENT_PORT: 2181 33 | ZOOKEEPER_TICK_TIME: 2000 34 | ZOOKEEPER_INIT_LIMIT: 5 35 | ZOOKEEPER_SYNC_LIMIT: 2 36 | ports: 37 | - 2181:2181 38 | container_name: zookeeper1 39 | jmx-kafka101: 40 | image: 'sscaling/jmx-prometheus-exporter' 41 | ports: 42 | - '5556:5556' 43 | environment: 44 | CONFIG_YML: '/../jmx_exporter/config.yml' 45 | volumes: 46 | - ./../jmx_exporter/config_kafka101.yml:/../jmx_exporter/config.yml 47 | container_name: jmx-kafka101 48 | depends_on: 49 | - kafka101 50 | kafka101: 51 | image: confluentinc/cp-kafka:latest 52 | depends_on: 53 | - zk1 54 | ports: 55 | - 9092:9092 56 | - 9991:9991 57 | container_name: kafka101 58 | environment: 59 | KAFKA_BROKER_ID: 101 60 | KAFKA_JMX_PORT: 9991 61 | KAFKA_ZOOKEEPER_CONNECT: zookeeper1:2181 62 | KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://kafka101:29092,PLAINTEXT_HOST://localhost:9092 63 | KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT 64 | KAFKA_INTER_BROKER_LISTENER_NAME: PLAINTEXT 65 | KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1 66 | KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1 67 | KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1 68 | CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: kafka101:29092 69 | CONFLUENT_METRICS_REPORTER_ZOOKEEPER_CONNECT: zookeeper1:2181 70 | CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1 71 | CONFLUENT_METRICS_ENABLE: 'false' 72 | KAFKA_HEAP_OPTS: '-Xmx512M -Xms512M' 73 | volumes: 74 | alertmanager-data: 75 | -------------------------------------------------------------------------------- /configs/grafana/provisioning/dashboards/kafka.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | # an unique provider name. Required 5 | - name: 'default' 6 | # Org id. Default to 1 7 | orgId: 1 8 | # name of the dashboard folder. 9 | folder: '' 10 | # folder UID. will be automatically generated if not specified 11 | folderUid: '' 12 | # provider type. Default to 'file' 13 | type: file 14 | # disable dashboard deletion 15 | disableDeletion: false 16 | # how often Grafana will scan for changed dashboards 17 | updateIntervalSeconds: 5 18 | # allow updating provisioned dashboards from the UI 19 | allowUiUpdates: false 20 | options: 21 | # path to dashboard files on disk. Required when using the 'file' type 22 | path: /var/lib/grafana/dashboards 23 | -------------------------------------------------------------------------------- /configs/grafana/provisioning/datasources/datasource.yaml: -------------------------------------------------------------------------------- 1 | # config file version 2 | apiVersion: 1 3 | 4 | # list of datasources that should be deleted from the database 5 | deleteDatasources: 6 | - name: Prometheus 7 | orgId: 1 8 | 9 | # list of datasources to insert/update depending 10 | # what's available in the database 11 | datasources: 12 | # name of the datasource. Required 13 | - name: Prometheus 14 | # datasource type. Required 15 | type: prometheus 16 | # access mode. proxy or direct (Server or Browser in the UI). Required 17 | access: direct 18 | # org id. 
will default to orgId 1 if not specified 19 | orgId: 1 20 | # url 21 | url: http://localhost:9090 22 | # database password, if used 23 | password: 24 | # database user, if used 25 | user: 26 | # database name, if used 27 | database: 28 | # enable/disable basic auth 29 | basicAuth: 30 | # basic auth username 31 | basicAuthUser: 32 | # basic auth password 33 | basicAuthPassword: 34 | # enable/disable with credentials headers 35 | withCredentials: 36 | # mark as default datasource. Max one per org 37 | isDefault: true 38 | # fields that will be converted to json and stored in json_data 39 | jsonData: 40 | graphiteVersion: '1.1' 41 | tlsAuth: true 42 | tlsAuthWithCACert: true 43 | # json object of data that will be encrypted. 44 | secureJsonData: 45 | tlsCACert: '...' 46 | tlsClientCert: '...' 47 | tlsClientKey: '...' 48 | version: 1 49 | # allow users to edit datasources from the UI. 50 | editable: true 51 | -------------------------------------------------------------------------------- /configs/grafana/templates/broker_hard_disk_usage.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "Prometheus", 5 | "label": "Prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "5.2.1" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "5.0.0" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "prometheus", 28 | "name": "Prometheus", 29 | "version": "5.0.0" 30 | } 31 | ], 32 | "annotations": { 33 | "list": [ 34 | { 35 | "builtIn": 1, 36 | "datasource": "-- Grafana --", 37 | "enable": true, 38 | "hide": true, 39 | "iconColor": "rgba(0, 211, 255, 1)", 40 | "name": "Annotations & Alerts", 41 | "type": "dashboard" 42 | } 43 | ] 44 | }, 45 | "editable": true, 46 | "gnetId": null, 47 | "graphTooltip": 0, 48 | "id": null, 49 | "iteration": 1540328526402, 50 | "links": [], 51 | "panels": [], 52 | "refresh": "5s", 53 | "schemaVersion": 16, 54 | "style": "dark", 55 | "tags": ["kafka", "topics", "disk"], 56 | "templating": { 57 | "list": [ 58 | { 59 | "allValue": null, 60 | "current": {}, 61 | "datasource": "Prometheus", 62 | "hide": 0, 63 | "includeAll": false, 64 | "label": "Env", 65 | "multi": false, 66 | "name": "env", 67 | "options": [], 68 | "query": "label_values(kafka_log_size, env)", 69 | "refresh": 1, 70 | "regex": "", 71 | "sort": 0, 72 | "tagValuesQuery": "", 73 | "tags": [], 74 | "tagsQuery": "", 75 | "type": "query", 76 | "useTags": false 77 | } 78 | ] 79 | }, 80 | "time": { 81 | "from": "now-1h", 82 | "to": "now" 83 | }, 84 | "timepicker": { 85 | "refresh_intervals": [ 86 | "5s", 87 | "10s", 88 | "30s", 89 | "1m", 90 | "5m", 91 | "15m", 92 | "30m", 93 | "1h", 94 | "2h", 95 | "1d" 96 | ], 97 | "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] 98 | }, 99 | "timezone": "", 100 | "title": "Kafka / Hard Disk Usage", 101 | "uid": "zApgMBbik", 102 | "version": 1 103 | } 104 | -------------------------------------------------------------------------------- /configs/grafana/templates/broker_jvm_os.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "prometheus", 5 | "label": "Prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": 
"Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "5.2.1" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "5.0.0" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "prometheus", 28 | "name": "Prometheus", 29 | "version": "5.0.0" 30 | } 31 | ], 32 | "annotations": { 33 | "list": [ 34 | { 35 | "builtIn": 1, 36 | "datasource": "-- Grafana --", 37 | "enable": true, 38 | "hide": true, 39 | "iconColor": "rgba(0, 211, 255, 1)", 40 | "name": "Annotations & Alerts", 41 | "type": "dashboard" 42 | } 43 | ] 44 | }, 45 | "description": "Java Virtual Machine Memory and OS metrics", 46 | "editable": true, 47 | "gnetId": null, 48 | "graphTooltip": 0, 49 | "id": null, 50 | "iteration": 1540307470986, 51 | "links": [], 52 | "panels": [], 53 | "refresh": "5s", 54 | "schemaVersion": 16, 55 | "style": "dark", 56 | "tags": [], 57 | "templating": { 58 | "list": [ 59 | { 60 | "allValue": null, 61 | "current": {}, 62 | "datasource": "Prometheus", 63 | "hide": 0, 64 | "includeAll": false, 65 | "label": "Env", 66 | "multi": false, 67 | "name": "env", 68 | "options": [], 69 | "query": "label_values(kafka_jvm_os_openfiledescriptorcount, env)", 70 | "refresh": 1, 71 | "regex": "", 72 | "sort": 0, 73 | "tagValuesQuery": "", 74 | "tags": [], 75 | "tagsQuery": "", 76 | "type": "query", 77 | "useTags": false 78 | } 79 | ] 80 | }, 81 | "time": { 82 | "from": "now-1h", 83 | "to": "now" 84 | }, 85 | "timepicker": { 86 | "refresh_intervals": [ 87 | "5s", 88 | "10s", 89 | "30s", 90 | "1m", 91 | "5m", 92 | "15m", 93 | "30m", 94 | "1h", 95 | "2h", 96 | "1d" 97 | ], 98 | "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] 99 | }, 100 | "timezone": "", 101 | "title": "Kafka / Brokers JVM & OS", 102 | "uid": "AdG9A1xmk", 103 | "version": 3 104 | } 105 | -------------------------------------------------------------------------------- /configs/grafana/templates/broker_performance.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "Prometheus", 5 | "label": "prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "5.2.1" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "5.0.0" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "prometheus", 28 | "name": "Prometheus", 29 | "version": "5.0.0" 30 | }, 31 | { 32 | "type": "panel", 33 | "id": "singlestat", 34 | "name": "Singlestat", 35 | "version": "5.0.0" 36 | } 37 | ], 38 | "annotations": { 39 | "list": [ 40 | { 41 | "builtIn": 1, 42 | "datasource": "-- Grafana --", 43 | "enable": true, 44 | "hide": true, 45 | "iconColor": "rgba(0, 211, 255, 1)", 46 | "name": "Annotations & Alerts", 47 | "type": "dashboard" 48 | } 49 | ] 50 | }, 51 | "editable": true, 52 | "gnetId": null, 53 | "graphTooltip": 0, 54 | "id": null, 55 | "iteration": 1532113327583, 56 | "links": [], 57 | "panels": [ 58 | { 59 | "collapsed": false, 60 | "gridPos": { 61 | "h": 1, 62 | "w": 24, 63 | "x": 0, 64 | "y": 0 65 | }, 66 | "id": 17, 67 | "panels": [], 68 | "repeat": "instance", 69 | "title": "Request Latency : $request ($instance)", 70 | "type": "row" 71 | } 72 | ], 73 | "refresh": "5s", 74 | "schemaVersion": 16, 75 | "style": "dark", 76 | 
"tags": ["kafka", "broker", "troubleshoot"], 77 | "templating": { 78 | "list": [ 79 | { 80 | "allValue": null, 81 | "current": {}, 82 | "datasource": "Prometheus", 83 | "hide": 0, 84 | "includeAll": false, 85 | "label": "Envrionment", 86 | "multi": false, 87 | "name": "env", 88 | "options": [], 89 | "query": "label_values(kafka_server_brokerstate, env)", 90 | "refresh": 1, 91 | "regex": "", 92 | "sort": 0, 93 | "tagValuesQuery": "", 94 | "tags": [], 95 | "tagsQuery": "", 96 | "type": "query", 97 | "useTags": false 98 | }, 99 | { 100 | "allValue": null, 101 | "current": {}, 102 | "datasource": "Prometheus", 103 | "hide": 0, 104 | "includeAll": true, 105 | "label": "Broker Host", 106 | "multi": true, 107 | "name": "instance", 108 | "options": [], 109 | "query": "label_values(kafka_network_request_metrics_time_ms{env=\"$env\"}, instance)", 110 | "refresh": 1, 111 | "regex": "", 112 | "sort": 0, 113 | "tagValuesQuery": "", 114 | "tags": [], 115 | "tagsQuery": "", 116 | "type": "query", 117 | "useTags": false 118 | }, 119 | { 120 | "allValue": null, 121 | "current": {}, 122 | "datasource": "Prometheus", 123 | "hide": 0, 124 | "includeAll": false, 125 | "label": "Request Type", 126 | "multi": false, 127 | "name": "request", 128 | "options": [], 129 | "query": "label_values(kafka_network_request_metrics_time_ms, request)", 130 | "refresh": 1, 131 | "regex": "", 132 | "sort": 0, 133 | "tagValuesQuery": "", 134 | "tags": [], 135 | "tagsQuery": "", 136 | "type": "query", 137 | "useTags": false 138 | } 139 | ] 140 | }, 141 | "time": { 142 | "from": "now-1h", 143 | "to": "now" 144 | }, 145 | "timepicker": { 146 | "refresh_intervals": [ 147 | "5s", 148 | "10s", 149 | "30s", 150 | "1m", 151 | "5m", 152 | "15m", 153 | "30m", 154 | "1h", 155 | "2h", 156 | "1d" 157 | ], 158 | "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] 159 | }, 160 | "timezone": "browser", 161 | "title": "Kafka Broker / Performance & Latency", 162 | "uid": "aRNaJwOmk", 163 | "version": 8 164 | } 165 | -------------------------------------------------------------------------------- /configs/grafana/templates/broker_zookeeper.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "Prometheus", 5 | "label": "Prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "5.2.1" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "5.0.0" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "prometheus", 28 | "name": "Prometheus", 29 | "version": "5.0.0" 30 | }, 31 | { 32 | "type": "panel", 33 | "id": "text", 34 | "name": "Text", 35 | "version": "5.0.0" 36 | } 37 | ], 38 | "annotations": { 39 | "list": [ 40 | { 41 | "builtIn": 1, 42 | "datasource": "-- Grafana --", 43 | "enable": true, 44 | "hide": true, 45 | "iconColor": "rgba(0, 211, 255, 1)", 46 | "name": "Annotations & Alerts", 47 | "type": "dashboard" 48 | } 49 | ] 50 | }, 51 | "editable": true, 52 | "gnetId": null, 53 | "graphTooltip": 0, 54 | "id": null, 55 | "iteration": 1539721769054, 56 | "links": [], 57 | "panels": [], 58 | "refresh": "5s", 59 | "schemaVersion": 16, 60 | "style": "dark", 61 | "tags": ["kafka", "zookeeper"], 62 | "templating": { 63 | "list": [ 64 | { 65 | "allValue": null, 66 | "current": {}, 67 | "datasource": "Prometheus", 68 | "hide": 
0, 69 | "includeAll": false, 70 | "label": "Env", 71 | "multi": false, 72 | "name": "env", 73 | "options": [], 74 | "query": "label_values(kafka_server_brokerstate, env)", 75 | "refresh": 1, 76 | "regex": "", 77 | "sort": 0, 78 | "tagValuesQuery": "", 79 | "tags": [], 80 | "tagsQuery": "", 81 | "type": "query", 82 | "useTags": false 83 | } 84 | ] 85 | }, 86 | "time": { 87 | "from": "now-30m", 88 | "to": "now" 89 | }, 90 | "timepicker": { 91 | "refresh_intervals": [ 92 | "5s", 93 | "10s", 94 | "30s", 95 | "1m", 96 | "5m", 97 | "15m", 98 | "30m", 99 | "1h", 100 | "2h", 101 | "1d" 102 | ], 103 | "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] 104 | }, 105 | "timezone": "browser", 106 | "title": "Kafka Broker / Zookeeper Connection", 107 | "uid": "142Xi34mk", 108 | "version": 1 109 | } 110 | -------------------------------------------------------------------------------- /configs/grafana/templates/client_consumers_fetch_lag.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "Prometheus", 5 | "label": "prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "5.2.1" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "5.0.0" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "prometheus", 28 | "name": "Prometheus", 29 | "version": "5.0.0" 30 | } 31 | ], 32 | "annotations": { 33 | "list": [ 34 | { 35 | "builtIn": 1, 36 | "datasource": "-- Grafana --", 37 | "enable": true, 38 | "hide": true, 39 | "iconColor": "rgba(0, 211, 255, 1)", 40 | "name": "Annotations & Alerts", 41 | "type": "dashboard" 42 | } 43 | ] 44 | }, 45 | "description": "", 46 | "editable": true, 47 | "gnetId": null, 48 | "graphTooltip": 0, 49 | "id": null, 50 | "iteration": 1532113236152, 51 | "links": [], 52 | "panels": [], 53 | "refresh": "5s", 54 | "schemaVersion": 16, 55 | "style": "dark", 56 | "tags": ["kafka", "clients"], 57 | "templating": { 58 | "list": [ 59 | { 60 | "allValue": null, 61 | "current": {}, 62 | "datasource": "Prometheus", 63 | "hide": 0, 64 | "includeAll": false, 65 | "label": "env", 66 | "multi": false, 67 | "name": "env", 68 | "options": [], 69 | "query": "label_values(kafka_consumer_fetch_manager_metrics_records_consumed_rate, env)", 70 | "refresh": 1, 71 | "regex": "", 72 | "sort": 0, 73 | "tagValuesQuery": "", 74 | "tags": [], 75 | "tagsQuery": "", 76 | "type": "query", 77 | "useTags": false 78 | }, 79 | { 80 | "allValue": null, 81 | "current": {}, 82 | "datasource": "Prometheus", 83 | "hide": 0, 84 | "includeAll": false, 85 | "label": "topic", 86 | "multi": false, 87 | "name": "topic", 88 | "options": [], 89 | "query": "label_values(kafka_consumer_fetch_manager_metrics_records_consumed_rate, topic)", 90 | "refresh": 1, 91 | "regex": "", 92 | "sort": 0, 93 | "tagValuesQuery": "", 94 | "tags": [], 95 | "tagsQuery": "", 96 | "type": "query", 97 | "useTags": false 98 | } 99 | ] 100 | }, 101 | "time": { 102 | "from": "now-1h", 103 | "to": "now" 104 | }, 105 | "timepicker": { 106 | "refresh_intervals": [ 107 | "5s", 108 | "10s", 109 | "30s", 110 | "1m", 111 | "5m", 112 | "15m", 113 | "30m", 114 | "1h", 115 | "2h", 116 | "1d" 117 | ], 118 | "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] 119 | }, 120 | "timezone": "browser", 121 | "title": 
"Kafka Consumers / Fetch Rate & Records Lag", 122 | "uid": "Ni95Y3dmz", 123 | "version": 20 124 | } 125 | -------------------------------------------------------------------------------- /configs/grafana/templates/cluster_healthcheck.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": "-- Grafana --", 7 | "enable": true, 8 | "hide": true, 9 | "iconColor": "rgba(0, 211, 255, 1)", 10 | "name": "Annotations & Alerts", 11 | "type": "dashboard" 12 | } 13 | ] 14 | }, 15 | "editable": true, 16 | "gnetId": null, 17 | "graphTooltip": 0, 18 | "iteration": 1540310392981, 19 | "links": [], 20 | "panels": [], 21 | "refresh": "5s", 22 | "schemaVersion": 16, 23 | "style": "dark", 24 | "tags": ["kafka", "broker", "healthcheck"], 25 | "templating": { 26 | "list": [ 27 | { 28 | "allValue": null, 29 | "current": { 30 | "text": "docker", 31 | "value": "docker" 32 | }, 33 | "datasource": "Prometheus", 34 | "hide": 0, 35 | "includeAll": false, 36 | "label": "Environment", 37 | "multi": false, 38 | "name": "env", 39 | "options": [], 40 | "query": "label_values(kafka_server_brokerstate, env)", 41 | "refresh": 1, 42 | "regex": "", 43 | "sort": 0, 44 | "tagValuesQuery": "", 45 | "tags": [], 46 | "tagsQuery": "", 47 | "type": "query", 48 | "useTags": false 49 | } 50 | ] 51 | }, 52 | "time": { 53 | "from": "now-1h", 54 | "to": "now" 55 | }, 56 | "timepicker": { 57 | "refresh_intervals": [ 58 | "5s", 59 | "10s", 60 | "30s", 61 | "1m", 62 | "5m", 63 | "15m", 64 | "30m", 65 | "1h", 66 | "2h", 67 | "1d" 68 | ], 69 | "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] 70 | }, 71 | "timezone": "browser", 72 | "title": "Kafka Cluster / Global HealthCheck", 73 | "uid": "e-6AJQOik", 74 | "version": 1 75 | } 76 | -------------------------------------------------------------------------------- /configs/grafana/templates/cluster_replication.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "Prometheus", 5 | "label": "Prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "5.2.1" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "5.0.0" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "prometheus", 28 | "name": "Prometheus", 29 | "version": "5.0.0" 30 | } 31 | ], 32 | "annotations": { 33 | "list": [ 34 | { 35 | "builtIn": 1, 36 | "datasource": "-- Grafana --", 37 | "enable": true, 38 | "hide": true, 39 | "iconColor": "rgba(0, 211, 255, 1)", 40 | "name": "Annotations & Alerts", 41 | "type": "dashboard" 42 | } 43 | ] 44 | }, 45 | "editable": true, 46 | "gnetId": null, 47 | "graphTooltip": 0, 48 | "id": null, 49 | "iteration": 1540330422283, 50 | "links": [], 51 | "panels": [], 52 | "refresh": "5s", 53 | "schemaVersion": 16, 54 | "style": "dark", 55 | "tags": ["kafka", "broker", "replication"], 56 | "templating": { 57 | "list": [ 58 | { 59 | "allValue": null, 60 | "current": {}, 61 | "datasource": "Prometheus", 62 | "hide": 0, 63 | "includeAll": false, 64 | "label": "Env", 65 | "multi": false, 66 | "name": "env", 67 | "options": [], 68 | "query": "label_values(kafka_server_brokerstate, env)", 69 | "refresh": 1, 70 | "regex": "", 71 | "sort": 0, 72 | "tagValuesQuery": "", 73 | 
"tags": [], 74 | "tagsQuery": "", 75 | "type": "query", 76 | "useTags": false 77 | } 78 | ] 79 | }, 80 | "time": { 81 | "from": "now-1h", 82 | "to": "now" 83 | }, 84 | "timepicker": { 85 | "refresh_intervals": [ 86 | "5s", 87 | "10s", 88 | "30s", 89 | "1m", 90 | "5m", 91 | "15m", 92 | "30m", 93 | "1h", 94 | "2h", 95 | "1d" 96 | ], 97 | "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d"] 98 | }, 99 | "timezone": "browser", 100 | "title": "Kafka Cluster / Replication", 101 | "uid": "E9bXmq4ik", 102 | "version": 2 103 | } 104 | -------------------------------------------------------------------------------- /configs/grafana/templates/topics_logs.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "Prometheus", 5 | "label": "Prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__requires": [ 13 | { 14 | "type": "grafana", 15 | "id": "grafana", 16 | "name": "Grafana", 17 | "version": "5.2.1" 18 | }, 19 | { 20 | "type": "panel", 21 | "id": "graph", 22 | "name": "Graph", 23 | "version": "5.0.0" 24 | }, 25 | { 26 | "type": "datasource", 27 | "id": "prometheus", 28 | "name": "Prometheus", 29 | "version": "5.0.0" 30 | }, 31 | { 32 | "type": "panel", 33 | "id": "singlestat", 34 | "name": "Singlestat", 35 | "version": "5.0.0" 36 | }, 37 | { 38 | "type": "panel", 39 | "id": "table", 40 | "name": "Table", 41 | "version": "5.0.0" 42 | } 43 | ], 44 | "annotations": { 45 | "list": [ 46 | { 47 | "builtIn": 1, 48 | "datasource": "-- Grafana --", 49 | "enable": true, 50 | "hide": true, 51 | "iconColor": "rgba(0, 211, 255, 1)", 52 | "name": "Annotations & Alerts", 53 | "type": "dashboard" 54 | } 55 | ] 56 | }, 57 | "editable": true, 58 | "gnetId": null, 59 | "graphTooltip": 0, 60 | "id": null, 61 | "iteration": 1540331963544, 62 | "links": [], 63 | "panels": [ 64 | { 65 | "collapsed": false, 66 | "gridPos": { 67 | "h": 1, 68 | "w": 24, 69 | "x": 0, 70 | "y": 0 71 | }, 72 | "id": 4, 73 | "panels": [], 74 | "repeat": "topic", 75 | "title": "$topic", 76 | "type": "row" 77 | } 78 | ], 79 | "refresh": "5s", 80 | "schemaVersion": 16, 81 | "style": "dark", 82 | "tags": ["kafka", "topics"], 83 | "templating": { 84 | "list": [ 85 | { 86 | "allValue": null, 87 | "current": {}, 88 | "datasource": "Prometheus", 89 | "hide": 0, 90 | "includeAll": false, 91 | "label": "Cluster", 92 | "multi": false, 93 | "name": "env", 94 | "options": [], 95 | "query": "label_values(kafka_log_size, env)", 96 | "refresh": 2, 97 | "regex": "", 98 | "sort": 0, 99 | "tagValuesQuery": "", 100 | "tags": [], 101 | "tagsQuery": "", 102 | "type": "query", 103 | "useTags": false 104 | }, 105 | { 106 | "allValue": null, 107 | "current": {}, 108 | "datasource": "Prometheus", 109 | "hide": 0, 110 | "includeAll": false, 111 | "label": "Topics", 112 | "multi": true, 113 | "name": "topic", 114 | "options": [], 115 | "query": "label_values(kafka_log_size, topic)", 116 | "refresh": 2, 117 | "regex": "", 118 | "sort": 5, 119 | "tagValuesQuery": "", 120 | "tags": [], 121 | "tagsQuery": "", 122 | "type": "query", 123 | "useTags": false 124 | } 125 | ] 126 | }, 127 | "time": { 128 | "from": "now-1h", 129 | "to": "now" 130 | }, 131 | "timepicker": { 132 | "refresh_intervals": [ 133 | "5s", 134 | "10s", 135 | "30s", 136 | "1m", 137 | "5m", 138 | "15m", 139 | "30m", 140 | "1h", 141 | "2h", 142 | "1d" 143 | ], 144 | "time_options": ["5m", "15m", "1h", "6h", "12h", "24h", "2d", 
"7d", "30d"] 145 | }, 146 | "timezone": "", 147 | "title": "Kafka Topics / Logs", 148 | "uid": "1vR8sjAmk", 149 | "version": 1 150 | } 151 | -------------------------------------------------------------------------------- /configs/jmx_exporter/config_kafka_template.yml: -------------------------------------------------------------------------------- 1 | hostPort: kafka101:9991 2 | lowercaseOutputName: true 3 | lowercaseOutputLabelNames: true 4 | whitelistObjectNames: 5 | - java.lang:* 6 | - kafka.cluster:* 7 | - kafka.controller:* 8 | - kafka.log:* 9 | - kafka.server:type=KafkaServer,name=BrokerState 10 | - kafka.server:type=KafkaRequestHandlerPool,* 11 | - kafka.server:type=BrokerTopicMetrics,* 12 | - kafka.server:type=FetcherLagMetrics,* 13 | - kafka.server:type=FetcherStats,* 14 | - kafka.server:type=Request,* 15 | - kafka.server:type=Fetch,* 16 | - kafka.server:type=Produce,* 17 | - kafka.server:type=ReplicaManager,* 18 | - kafka.server:type=ReplicaFetcherManager,* 19 | - kafka.server:type=SessionExpireListener,* 20 | - kafka.server:type=controller-channel-metrics,* 21 | - kafka.server:type=socket-server-metrics,* 22 | - kafka.network:type=RequestChannel,* 23 | - kafka.network:type=Processor,* 24 | - kafka.network:type=SocketServer,* 25 | - kafka.network:type=RequestMetrics,* 26 | - kafka.coordinator.group:* 27 | blacklistObjectNames: 28 | - java.lang:type=ClassLoading,* 29 | - java.lang:type=Compilation,* 30 | - java.lang:type=MemoryManager,* 31 | - kafka.utils:* 32 | - kafka.controller:type=ControllerChannelManager,name=QueueSize,* 33 | - kafka.log:type=Log,name=LogEndOffset,* 34 | - kafka.log:type=Log,name=LogStartOffset,* 35 | - kafka.cluster:type=Partition,name=InSyncReplicasCount,* 36 | - kafka.cluster:type=Partition,name=LastStableOffsetLag,* 37 | - kafka.cluster:type=Partition,name=ReplicasCounts,* 38 | - kafka.cluster:type=Partition,name=UnderReplicated,* 39 | - kafka.server:type=BrokerTopicMetrics,name=TotalFetchRequestsPerSec,* 40 | - kafka.server:type=BrokerTopicMetrics,name=TotalProduceRequestsPerSec,* 41 | - kafka.server:type=BrokerTopicMetrics,name=FailedProduceRequestsPerSec,* 42 | - kafka.server:type=BrokerTopicMetrics,name=FailedFetchRequestsPerSec,* 43 | - kafka.server:type=BrokerTopicMetrics,name=BytesRejectedPerSec,* 44 | rules: 45 | #------------------------------------------------------------------------------------------------------- 46 | # KafkaServers : State of broker server 47 | # 48 | # - BrokerState 49 | #------------------------------------------------------------------------------------------------------- 50 | - pattern: kafka.server<>Value 51 | name: kafka_server_brokerstate 52 | labels: 53 | service: kafka-broker 54 | env: cluster-demo 55 | 56 | #------------------------------------------------------------------------------------------------------- 57 | # Partition : Number of partitions for each broker 58 | # - InSyncReplicasCount 59 | # - LastStableOffsetLag 60 | # - ReplicasCount 61 | # - UnderReplicated 62 | #------------------------------------------------------------------------------------------------------- 63 | - pattern: kafka.cluster<>Value 64 | name: kafka_cluster_partition_$1 65 | labels: 66 | topic: $2 67 | partition: $3 68 | service: kafka-broker 69 | env: cluster-demo 70 | #------------------------------------------------------------------------------------------------------- 71 | # KafkaController : 72 | # 73 | # - ActiveControllerCount, OfflinePartitionsCount, PreferredReplicaImbalanceCount 74 | 
#------------------------------------------------------------------------------------------------------- 75 | - pattern: kafka.controller<>Value 76 | name: kafka_controller_$1 77 | labels: 78 | service: kafka-broker 79 | env: cluster-demo 80 | 81 | #------------------------------------------------------------------------------------------------------- 82 | # ControllerStats : The event that is currently being processed by the elected broker controller. 83 | # 84 | # - LeaderElectionRateAndtimeMs, UncleanLeaderElectionsPerSec, AutoLeaderBalanceRateAndTimeMs, ManualLeaderBalanceRateAndTimeMs 85 | # - ControllerChangeRateAndTimeMs, 86 | # - TopicChangeRateAndTimeMs, TopicDeletionRateAndTimeMs, PartitionReassignmentRateAndTimeMs 87 | # - IsrChangeRateAndTimeMs 88 | #------------------------------------------------------------------------------------------------------- 89 | - pattern: kafka.controller<>(OneMinuteRate|Mean|75thPercentile|99thPercentile) 90 | name: kafka_controller_stats_$1 91 | labels: 92 | aggregate: $2 93 | service: kafka-broker 94 | env: cluster-demo 95 | #------------------------------------------------------------------------------------------------------- 96 | # Coordinator : GroupMetadataManager 97 | # 98 | # - NumGroups, NumOffsets 99 | #------------------------------------------------------------------------------------------------------- 100 | - pattern: kafka.coordinator.group<>(Value) 101 | name: kafka_coordinator_group_metadata_manager_$1 102 | labels: 103 | service: kafka-broker 104 | env: cluster-demo 105 | #------------------------------------------------------------------------------------------------------- 106 | # Logs : 107 | # 108 | # - LogEndOffset, LogStartOffset, NumLogSegments, Size 109 | #------------------------------------------------------------------------------------------------------- 110 | - pattern: kafka.log<>Value 111 | name: kafka_log_$1 112 | labels: 113 | topic: $2 114 | partition: $3 115 | service: kafka-broker 116 | env: cluster-demo 117 | 118 | #------------------------------------------------------------------------------------------------------- 119 | # LogCleaner : 120 | #------------------------------------------------------------------------------------------------------- 121 | - pattern: kafka.log<>(Value) 122 | name: kafka_log_cleaner_recopy_percent 123 | labels: 124 | service: kafka-broker 125 | env: cluster-demo 126 | 127 | - pattern: kafka.log<>(Value) 128 | name: kafka_log_cleaner_max_clean_time_secs 129 | labels: 130 | service: kafka-broker 131 | env: cluster-demo 132 | 133 | - pattern: kafka.log<>(Value) 134 | name: kafka_log_cleaner_max_buffer_utilization_percent 135 | labels: 136 | service: kafka-broker 137 | env: cluster-demo 138 | 139 | #------------------------------------------------------------------------------------------------------- 140 | # LogCleanerManager : 141 | #------------------------------------------------------------------------------------------------------- 142 | - pattern: kafka.log<>(Value) 143 | name: kafka_log_cleaner_manager_max_dirty_percent 144 | labels: 145 | service: kafka-broker 146 | env: cluster-demo 147 | 148 | #------------------------------------------------------------------------------------------------------- 149 | # LogFlushStats : 150 | #------------------------------------------------------------------------------------------------------- 151 | - pattern: kafka.log<>(\w+) 152 | name: kafka_log_flush_stats_rate_and_time_ms 153 | labels: 154 | aggregate: $1 155 | service: 
kafka-broker 156 | env: cluster-demo 157 | #------------------------------------------------------------------------------------------------------- 158 | # KafkaRequestHandlerPool : Latency 159 | # 160 | # - KafkaRequestHandlerPool 161 | #------------------------------------------------------------------------------------------------------- 162 | - pattern: kafka.server<>(\w+) 163 | name: kafka_server_request_handler_avg_idle_percent 164 | labels: 165 | aggregate: $1 166 | service: kafka-broker 167 | env: cluster-demo 168 | 169 | #------------------------------------------------------------------------------------------------------- 170 | # Network Socket Server : Latency 171 | # 172 | # - NetworkProcessorAvgIdlePercent 173 | #------------------------------------------------------------------------------------------------------- 174 | - pattern: kafka.network<>(Value) 175 | name: kafka_network_socket_server_processor_avg_idle_percent 176 | labels: 177 | service: kafka-broker 178 | env: cluster-demo 179 | 180 | #------------------------------------------------------------------------------------------------------- 181 | # Network Processor : Latency 182 | # 183 | # - IdlePercent 184 | #------------------------------------------------------------------------------------------------------- 185 | - pattern: kafka.network<>(Value) 186 | name: kafka_network_processor_idle_percent 187 | labels: 188 | processor: $1 189 | service: kafka-broker 190 | env: cluster-demo 191 | 192 | #------------------------------------------------------------------------------------------------------- 193 | # Network KafkaRequestChannel : 194 | # 195 | # - RequestQueueSize, ResponseQueueSize 196 | #------------------------------------------------------------------------------------------------------- 197 | - pattern: kafka.network<>Value 198 | name: kafka_network_request_channel_queue_size 199 | labels: 200 | queue: $1 201 | service: kafka-broker 202 | env: cluster-demo 203 | 204 | #------------------------------------------------------------------------------------------------------- 205 | # Network KafkaRequest : 206 | # 207 | # - RequestPerSec, 208 | #------------------------------------------------------------------------------------------------------- 209 | - pattern: kafka.network<>(OneMinuteRate|Mean) 210 | name: kafka_network_request_per_sec 211 | labels: 212 | request: $1 213 | version: $2 214 | aggregate: $3 215 | service: kafka-broker 216 | env: cluster-demo 217 | 218 | #------------------------------------------------------------------------------------------------------- 219 | # Network KafkaRequestMetrics : 220 | # 221 | # - LocalTimeMs, RemoteTimeMs, 222 | # - RequestQueueTimeMs, 223 | # - ResponseQueueTimeMs, ResponseSendTimeMs 224 | # - ThrottleTimeMs 225 | # - TotalTimeMs 226 | #------------------------------------------------------------------------------------------------------- 227 | - pattern: kafka.network<>(OneMinuteRate|Mean|75thPercentile|99thPercentile) 228 | name: kafka_network_request_metrics_time_ms 229 | labels: 230 | scope: $1 231 | request: $2 232 | aggregate: $3 233 | service: kafka-broker 234 | env: cluster-demo 235 | 236 | #------------------------------------------------------------------------------------------------------- 237 | # KafkaServer / BrokerTopicMetrics - I/O metrics : 238 | # 239 | # - BytesInPerSec, BytesOutPerSec, BytesRejectedPerSec, 240 | # - FailedFetchRequestsPerSec, FailedProduceRequestsPerSec,MessagesInPerSec, 241 | # - TotalFetchRequestPerSec, 
TotalProduceRequestPerSec, ReplicationBytesInPerSec, ReplicationBytesOutPerSec 242 | #------------------------------------------------------------------------------------------------------- 243 | - pattern: kafka.server<>(OneMinute)Rate 244 | name: kafka_server_broker_topic_metrics_$1_rate 245 | labels: 246 | aggregate: $2 247 | service: kafka-broker 248 | env: cluster-demo 249 | 250 | - pattern: kafka.server<>(OneMinute)Rate 251 | name: kafka_server_broker_topic_metrics_$1_rate 252 | labels: 253 | topic: $2 254 | aggregate: $3 255 | service: kafka-broker 256 | env: cluster-demo 257 | #------------------------------------------------------------------------------------------------------- 258 | # KafkaServer / DelayedFetchMetrics : 259 | #------------------------------------------------------------------------------------------------------- 260 | - pattern: kafka.server<>([^,]+)Rate 261 | name: kafka_server_delayed_fetch_expires_per_sec 262 | labels: 263 | fetcher_type: $1 264 | aggregate: $2 265 | service: kafka-broker 266 | env: cluster-demo 267 | #------------------------------------------------------------------------------------------------------- 268 | # KafkaServer / DelayedOperationPurgatory : 269 | #------------------------------------------------------------------------------------------------------- 270 | - pattern: kafka.server<>Value 271 | name: kafka_server_delayed_operation_purgatory_$1 272 | labels: 273 | operation: $2 274 | service: kafka-broker 275 | env: cluster-demo 276 | #------------------------------------------------------------------------------------------------------- 277 | # FetcherLagMetrics : Lag in number of messages per follower replica 278 | #------------------------------------------------------------------------------------------------------- 279 | - pattern: kafka.server<>Value 280 | name: kafka_server_fetcher_lag_$1 281 | labels: 282 | client_id: $2 283 | topic: $3 284 | partition: $4 285 | service: kafka-broker 286 | env: cluster-demo 287 | #------------------------------------------------------------------------------------------------------- 288 | # FetcherStats : Replica Fetcher Thread stats 289 | # - BytesPerSec / RequestsPerSec 290 | #------------------------------------------------------------------------------------------------------- 291 | - pattern: kafka.server<>([^,]+)Rate 292 | name: kafka_server_fetcher_stats_$1 293 | labels: 294 | client_id: $2 295 | broker_host: $3 296 | broker_port: $4 297 | aggregate: $5 298 | service: kafka-broker 299 | env: cluster-demo 300 | #------------------------------------------------------------------------------------------------------- 301 | # KafkaServer Request : 302 | # - request-time - Tracking request-time per user/client-id 303 | # - throttle-time - Tracking average throttle-time per user/client-id 304 | #------------------------------------------------------------------------------------------------------- 305 | - pattern: kafka.server<>(request-time|throttle-time) 306 | name: kafka_server_request_$2 307 | labels: 308 | client_id: $1 309 | service: kafka-broker 310 | env: cluster-demo 311 | #------------------------------------------------------------------------------------------------------- 312 | # KafkaServer Fetcher/Producer : 313 | #------------------------------------------------------------------------------------------------------- 314 | - pattern: kafka.server<>(byte-rate|throttle-time) 315 | name: kafka_server_fetch_client_$2 316 | labels: 317 | client_id: $1 318 | service: kafka-broker 319 
| env: cluster-demo 320 | 321 | - pattern: kafka.server<>(byte-rate|throttle-time) 322 | name: kafka_server_produce_client_$2 323 | labels: 324 | client_id: $1 325 | service: kafka-broker 326 | env: cluster-demo 327 | #------------------------------------------------------------------------------------------------------- 328 | # ReplicaManager : 329 | # - IsrExpandsPerSec, IsrShrinksPerSec, FailedIsrUpdatesPerSec 330 | # - LeaderCount, PartitionCount, UnderReplicatedPartitions) 331 | #------------------------------------------------------------------------------------------------------- 332 | - pattern: kafka.server<>([^,]+)Rate 333 | name: kafka_server_replica_manager_$1 334 | labels: 335 | aggregate: $2 336 | service: kafka-broker 337 | env: cluster-demo 338 | 339 | - pattern: kafka.server<>(Value) 340 | name: kafka_server_replica_manager_$1 341 | labels: 342 | service: kafka-broker 343 | env: cluster-demo 344 | 345 | #------------------------------------------------------------------------------------------------------- 346 | # ReplicaFetcherManager : 347 | # - MaxLag, MinFetchRate 348 | #------------------------------------------------------------------------------------------------------- 349 | - pattern: kafka.server<>(Value) 350 | name: kafka_server_replica_fetcher_manager_$1_value 351 | labels: 352 | client_id: $2 353 | service: kafka-broker 354 | env: cluster-demo 355 | 356 | #------------------------------------------------------------------------------------------------------- 357 | # Zookeeper / SessionExpireListener : 358 | #------------------------------------------------------------------------------------------------------- 359 | - pattern: kafka.server<>([^,]+)Rate 360 | name: kafka_zookeeper_session_expire_listener_$1 361 | labels: 362 | aggregate: $2 363 | service: kafka-broker 364 | env: cluster-demo 365 | 366 | #------------------------------------------------------------------------------------------------------- 367 | # ControllerChannelMetrics: 368 | #------------------------------------------------------------------------------------------------------- 369 | - pattern: kafka.server<>(\w*) 370 | name: kafka_server_controller_channel_metrics_$2 371 | labels: 372 | broker_id: $1 373 | service: kafka-broker 374 | env: cluster-demo 375 | 376 | #------------------------------------------------------------------------------------------------------- 377 | # KafkaServer / Socket Server Metrics 378 | #------------------------------------------------------------------------------------------------------- 379 | - pattern: kafka.server<>(\w*) 380 | name: kafka_server_socket_server_metrics_$2 381 | labels: 382 | network_processor: $1 383 | service: kafka-broker 384 | env: cluster-demo 385 | #------------------------------------------------------------------------------------------------------- 386 | # 387 | # Broker / JVM 388 | #------------------------------------------------------------------------------------------------------- 389 | # JVM GarbageCollector 390 | # 391 | - pattern: 'java.lang<>CollectionCount' 392 | name: kafka_jvm_gc_collection_count 393 | labels: 394 | name: $1 395 | service: kafka-broker 396 | env: cluster-demo 397 | 398 | - pattern: 'java.lang<>CollectionTime' 399 | name: kafka_jvm_gc_collection_time 400 | labels: 401 | name: $1 402 | service: kafka-broker 403 | env: cluster-demo 404 | 405 | - pattern: java.lang 406 | name: kafka_jvm_last_gc_duration 407 | labels: 408 | name: $1 409 | service: kafka-broker 410 | env: cluster-demo 411 | attrNameSnakeCase: 
true 412 | 413 | - pattern: 'java.lang(\w+)' 414 | name: kafka_jvm_last_gc_memory_usage_$4 415 | labels: 416 | name: $1 417 | space: $2 418 | type: $4 419 | service: kafka-broker 420 | env: cluster-demo 421 | attrNameSnakeCase: true 422 | 423 | # JVM Memory 424 | - pattern: java.lang(\w*) 425 | name: kafka_jvm_heap_usage 426 | labels: 427 | type: $1 428 | service: kafka-broker 429 | env: cluster-demo 430 | attrNameSnakeCase: true 431 | 432 | - pattern: java.lang(\w*) 433 | name: kafka_jvm_non_heap_usage 434 | labels: 435 | type: $1 436 | service: kafka-broker 437 | env: cluster-demo 438 | attrNameSnakeCase: true 439 | 440 | - pattern: 'java.lang(\w*)' 441 | name: kafka_jvm_memory_pool_collection_usage 442 | labels: 443 | name: $1 444 | type: $2 445 | service: kafka-broker 446 | env: cluster-demo 447 | - pattern: 'java.lang(\w*)' 448 | name: kafka_jvm_memory_pool_usage 449 | labels: 450 | name: $1 451 | type: $2 452 | service: kafka-broker 453 | env: cluster-demo 454 | - pattern: 'java.lang(\w*)' 455 | name: kafka_jvm_memory_pool_peak_usage 456 | labels: 457 | name: $1 458 | type: $2 459 | service: kafka-broker 460 | env: cluster-demo 461 | 462 | # JVM Thread 463 | - pattern: java.lang<>(\w*thread_count) 464 | name: kafka_jvm_$1 465 | labels: 466 | service: kafka-broker 467 | env: cluster-demo 468 | attrNameSnakeCase: true 469 | 470 | # Operating System 471 | - pattern: java.lang<>(\w*) 472 | name: kafka_jvm_os_$1 473 | labels: 474 | service: kafka-broker 475 | env: cluster-demo 476 | -------------------------------------------------------------------------------- /configs/jmx_exporter/metric_list.js: -------------------------------------------------------------------------------- 1 | // list of metrics user can choose from with their corresponding 2 | // jmx-exporter metrics to be exposed to prometheus 3 | module.exports = { 4 | broker_hard_disk_usage: { 5 | global_topics_size: ['kafka.log:*', 'kafka.cluster:*'], 6 | log_size_per_broker: ['kafka.log:*'], 7 | }, 8 | broker_jvm_os: { 9 | memory_usage: ['java.lang:*'], 10 | garbage_collection: ['java.lang:*'], 11 | cpu_usage: ['java.lang:*'], 12 | open_file_descriptors: ['java.lang:*'], 13 | available_memory: ['java.lang:*'], 14 | }, 15 | broker_performance: { 16 | request_total_time: ['kafka.network:type=RequestMetrics,*'], 17 | idle_percent: [ 18 | 'kafka.network:type=SocketServer,*', 19 | 'kafka.server:type=KafkaRequestHandlerPool,*', 20 | ], 21 | request_rate: ['kafka.network:type=RequestMetrics,*'], 22 | queue_size: ['kafka.network:type=RequestChannel,*'], 23 | queue_time: ['kafka.network:type=RequestMetrics,*'], 24 | throttling: ['kafka.network:type=RequestMetrics,*'], 25 | }, 26 | broker_zookeeper: { 27 | zookeeper_metrics: [ 28 | 'kafka.server:type=SessionExpireListener,*', 29 | 'kafka.server:type=KafkaServer,name=BrokerState', 30 | ], 31 | }, 32 | cluster_healthcheck: { 33 | core_healthcheck: [ 34 | 'kafka.controller:*', 35 | 'kafka.server:type=ReplicaManager,*', 36 | 'kafka.cluster:*', 37 | 'kafka.server:type=KafkaServer,name=BrokerState', 38 | 'kafka.network:type=RequestMetrics,*', 39 | ], 40 | throughput_io: ['kafka.server:type=BrokerTopicMetrics,*'], 41 | isr_count_change: ['kafka.server:type=ReplicaManager,*'], 42 | leaders_partitions: [ 43 | 'kafka.controller:*', 44 | 'kafka.server:type=ReplicaManager,*', 45 | ], 46 | }, 47 | cluster_replication: { 48 | replication_io: ['kafka.server:type=BrokerTopicMetrics,*'], 49 | replication_lag: ['kafka.server:type=ReplicaFetcherManager,*'], 50 | replica_fetcher: 
['kafka.server:type=FetcherStats,*'], 51 | }, 52 | topics_logs: { 53 | log_info: [ 54 | 'kafka.log:*', 55 | 'kafka.cluster:*', 56 | 'kafka.server:type=BrokerTopicMetrics,*', 57 | ], 58 | }, 59 | }; 60 | -------------------------------------------------------------------------------- /configs/prometheus/alert_rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | # Broker_Hard_Disk_Usage 3 | - name: Broker_Hard_Disk_Usage 4 | rules: 5 | - alert: Global_Topics_Size_With_Replication 6 | expr: sum(kafka_log_size{env="$env"}) by (topic) < 1 # need to change the alert condition 7 | for: 1m # telling Prometheus to send alert only when expression holds true for 1 minute 8 | labels: 9 | severity: high 10 | annotations: 11 | summary: 'Alert with high severity.' 12 | 13 | - alert: Global_Topics_Size_Without_Replication 14 | expr: sum(sum(kafka_log_size{env="$env"}) by (topic, partition) / sum(kafka_cluster_partition_replicascount{env="$env"}) by(partition, topic)) by (topic) < 1 15 | for: 1m 16 | labels: 17 | severity: high 18 | annotations: 19 | summary: 'Alert with high severity.' 20 | 21 | - alert: Log_Size_Per_Broker_With_Replicas 22 | expr: sum(kafka_log_size{env="$env"}) by (instance) < 1 23 | for: 1m 24 | labels: 25 | severity: high 26 | annotations: 27 | summary: 'Alert with high severity.' 28 | 29 | - alert: Log_Size_Per_Broker_Without_Replicas 30 | expr: sum(kafka_log_size{env="$env"}) by (instance) < 1 31 | for: 1m 32 | labels: 33 | severity: high 34 | annotations: 35 | summary: 'Alert with high severity.' 36 | 37 | # Broker_Jvm_Os 38 | - name: Broker_Jvm_Os 39 | rules: 40 | - alert: JVM_Memory_Used_Non-Heap 41 | expr: kafka_jvm_non_heap_usage{env="$env", type="used"} < 1000 42 | for: 1m 43 | labels: 44 | severity: high 45 | annotations: 46 | summary: 'Alert with high severity.' 47 | 48 | - alert: JVM_Memory_Used_Heap 49 | expr: kafka_jvm_heap_usage{env="$env", type="used"} < 1000 50 | for: 1m 51 | labels: 52 | severity: high 53 | annotations: 54 | summary: 'Alert with high severity.' 55 | 56 | - alert: GC_Process_Time 57 | expr: kafka_jvm_gc_collection_time{env="$env"} < 1 58 | for: 1m 59 | labels: 60 | severity: high 61 | annotations: 62 | summary: 'Alert with high severity.' 63 | 64 | - alert: CPU 65 | expr: kafka_jvm_os_systemcpuload{env="$env"} > 1 #it stays under 1 66 | for: 1m 67 | labels: 68 | severity: high 69 | annotations: 70 | summary: 'Alert with high severity.' 71 | 72 | - alert: Open_File_Descriptors 73 | expr: kafka_jvm_os_openfiledescriptorcount{env="$env"} < 1 74 | for: 1m 75 | labels: 76 | severity: high 77 | annotations: 78 | summary: 'Alert with high severity.' 79 | 80 | - alert: Free_Physical_Memory 81 | expr: kafka_jvm_os_freephysicalmemorysize{env="$env"} < 10000 82 | for: 1m 83 | labels: 84 | severity: high 85 | annotations: 86 | summary: 'Alert with high severity.' 87 | 88 | - alert: Available_Virtual_Memory 89 | expr: kafka_jvm_os_committedvirtualmemorysize{env="$env"} < 10000 90 | for: 1m 91 | labels: 92 | severity: high 93 | annotations: 94 | summary: 'Alert with high severity.' 95 | 96 | # Broker_Performance 97 | - name: Broker_Performance 98 | rules: 99 | - alert: Request_Total_Average_Time 100 | expr: kafka_network_request_metrics_time_ms{instance=~"$instance", request=~"$request", aggregate=~"Mean", scope=~"Total",env="$env"} < 1 101 | for: 1m 102 | labels: 103 | severity: high 104 | annotations: 105 | summary: 'Alert with high severity.' 
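# --- Illustrative sketch (not one of the generated rules) ---------------------------------------
# The "$env", "$instance" and "$request" placeholders above are template variables; a concrete
# Prometheus rule needs literal label values. With the bundled jmx_exporter config, which stamps
# every metric with env: cluster-demo, a filled-in version of the rule above could look like the
# commented example below. The Produce request type is taken from the rules further down; the
# 100 ms threshold and 5m duration are illustrative assumptions only.
#
# - alert: Request_Total_Average_Time_Example
#   expr: kafka_network_request_metrics_time_ms{request="Produce", aggregate="Mean", scope="Total", env="cluster-demo"} > 100
#   for: 5m
#   labels:
#     severity: high
#   annotations:
#     summary: 'Average total Produce request time has exceeded 100 ms for 5 minutes.'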
106 | 107 | - alert: Request_Total_99th_Percentile_Time 108 | expr: kafka_network_request_metrics_time_ms{instance=~"$instance", request=~"$request", aggregate=~"99thPercentile", scope=~"Total",env="$env"} < 1 109 | for: 1m 110 | labels: 111 | severity: high 112 | annotations: 113 | summary: 'Alert with high severity.' 114 | 115 | - alert: Request_Total_75th_Percentile_Time 116 | expr: kafka_network_request_metrics_time_ms{instance=~"$instance", request=~"$request", aggregate=~"75thPercentile", scope=~"Total",env="$env"} < 1 117 | for: 1m 118 | labels: 119 | severity: high 120 | annotations: 121 | summary: 'Alert with high severity.' 122 | 123 | - alert: Network_Socket_Avg_Idle_Percent 124 | expr: kafka_network_socket_server_processor_avg_idle_percent{instance=~"$instance",env="$env"} > 1 #should always be under 1 125 | for: 1m 126 | labels: 127 | severity: high 128 | annotations: 129 | summary: 'Alert with high severity.' 130 | 131 | - alert: Request_Handler_Avg_Idle_Percent 132 | expr: kafka_server_request_handler_avg_idle_percent{instance=~"$instance", aggregate=~"OneMinuteRate",env="$env"} < 1 133 | for: 1m 134 | labels: 135 | severity: high 136 | annotations: 137 | summary: 'Alert with high severity.' 138 | 139 | - alert: Requests_Per_Sec 140 | expr: kafka_network_request_per_sec{aggregate=~"OneMinuteRate",instance=~"$instance",request=~"$request",env="$env"} < 1 141 | for: 1m 142 | labels: 143 | severity: high 144 | annotations: 145 | summary: 'Alert with high severity.' 146 | 147 | - alert: Response_Queue_Size 148 | expr: kafka_network_request_channel_queue_size{instance=~"$instance", queue=~"Response",env="$env"} < 1 149 | for: 1m 150 | labels: 151 | severity: high 152 | annotations: 153 | summary: 'Alert with high severity.' 154 | 155 | - alert: Request_Queue_Size 156 | expr: kafka_network_request_channel_queue_size{instance=~"$instance", queue=~"Request", env="$env"} < 1 157 | for: 1m 158 | labels: 159 | severity: high 160 | annotations: 161 | summary: 'Alert with high severity.' 162 | 163 | # Broker_Zooeeper 164 | - name: broker_zookeeper 165 | rules: 166 | - alert: Zookeeper_Sync_Connects_Per_Sec 167 | expr: kafka_zookeeper_session_expire_listener_zookeepersyncconnectspersec{aggregate="OneMinute", env="$env"} <= 0 168 | for: 1m 169 | labels: 170 | severity: high 171 | annotations: 172 | summary: 'Alert with high severity.' 173 | 174 | - alert: Zookeeper_Disconnects_Per_Sec 175 | expr: kafka_zookeeper_session_expire_listener_zookeeperdisconnectspersec{env="$env", aggregate="OneMinute"} >= 1 #must be 0 all the time 176 | for: 1m 177 | labels: 178 | severity: high 179 | annotations: 180 | summary: 'Alert with high severity.' 181 | 182 | - alert: Zookeeper_Read_Only_Connects_Per_Sec 183 | expr: kafka_zookeeper_session_expire_listener_zookeeperreadonlyconnectspersec{env="$env", aggregate="OneMinute"} >= 1 #must be 0 all the time 184 | for: 1m 185 | labels: 186 | severity: high 187 | annotations: 188 | summary: 'Alert with high severity.' 189 | 190 | - alert: Zookeeper_Expires_Per_Sec 191 | expr: kafka_zookeeper_session_expire_listener_zookeeperexpirespersec{env="$env", aggregate="OneMinute"} >= 1 #must be 0 all the time 192 | for: 1m 193 | labels: 194 | severity: high 195 | annotations: 196 | summary: 'Alert with high severity.' 
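# Illustrative ad-hoc query (a sketch, not an alert): cluster-wide ZooKeeper disconnect rate
# summed across brokers, built on the same metric as the Zookeeper_Disconnects_Per_Sec rule
# above. The env value cluster-demo is the one set by the bundled jmx_exporter config.
#
#   sum(kafka_zookeeper_session_expire_listener_zookeeperdisconnectspersec{aggregate="OneMinute", env="cluster-demo"})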
197 | 198 | - alert: Zookeeper_SASL_Authentications_Per_Sec 199 | expr: kafka_zookeeper_session_expire_listener_zookeeperauthfailurespersec{env="$env", aggregate="OneMinute"} >= 1 #must be 0 all the time 200 | for: 1m 201 | labels: 202 | severity: high 203 | annotations: 204 | summary: 'Alert with high severity.' 205 | 206 | - alert: Zookeeper_Auth_Failures_Per_Sec 207 | expr: kafka_zookeeper_session_expire_listener_zookeeperauthfailurespersec{env="$env", aggregate="OneMinute"} >= 1 #must be 0 all the time 208 | for: 1m 209 | labels: 210 | severity: high 211 | annotations: 212 | summary: 'Alert with high severity.' 213 | 214 | #cluster_healthcheck 215 | - name: cluster_healthcheck 216 | rules: 217 | - alert: Active_Controller 218 | expr: sum(kafka_controller_activecontrollercount{env="$env"}) < 1 #alert if there's any inactive controller counts. 219 | for: 1m 220 | labels: 221 | severity: high 222 | annotations: 223 | summary: 'Alert with high severity.' 224 | 225 | - alert: Offline_Partitions_Count 226 | expr: sum(kafka_controller_offlinepartitionscount{env="$env"}) > 0 #alert if there's any offline partitions 227 | for: 1m 228 | labels: 229 | severity: high 230 | annotations: 231 | summary: 'Alert with high severity.' 232 | 233 | - alert: Under_Replicated_Partitions 234 | expr: sum(kafka_server_replica_manager_underreplicatedpartitions{env="$env"}) > 0 #alert if there's any offline partitions 235 | for: 1m 236 | labels: 237 | severity: high 238 | annotations: 239 | summary: 'Alert with high severity.' 240 | 241 | - alert: Partitions_Under_Min_ISR 242 | expr: sum(kafka_cluster_partition_underminisr{env="$env"}) < 1 # the minimum number of in-sync replicas that must be available for the producer to successfully send records to a partition must be at least 1 243 | for: 1m 244 | labels: 245 | severity: high 246 | annotations: 247 | summary: 'Alert with high severity.' 248 | 249 | - alert: Preferred_Replica_Imbalance 250 | expr: sum(kafka_controller_preferredreplicaimbalancecount{env="$env"}) > 1 #imbalance count should be lower 251 | for: 1m 252 | labels: 253 | severity: high 254 | annotations: 255 | summary: 'Alert with high severity.' 256 | 257 | - alert: Brokers_Running_Greater_than_and_equal_to_three # Brokers_Running(>=3) 258 | expr: count((kafka_server_brokerstate{env="$env"}) == 3 or (kafka_server_brokerstate{env="$env"}) == 4) < 3 259 | for: 1m 260 | labels: 261 | severity: high 262 | annotations: 263 | summary: 'Alert with high severity.' 264 | 265 | - alert: Produce_Req_per_Sec 266 | expr: sum(kafka_network_request_per_sec{aggregate=~"OneMinuteRate",request=~"Produce",env="$env"}) > 1 267 | for: 1m 268 | labels: 269 | severity: high 270 | annotations: 271 | summary: 'Alert with high severity.' 272 | 273 | - alert: Fetch_Consumer_Req_per_Sec 274 | expr: sum(kafka_network_request_per_sec{aggregate=~"OneMinuteRate",request=~"FetchConsumer",env="$env"}) > 1 275 | for: 1m 276 | labels: 277 | severity: high 278 | annotations: 279 | summary: 'Alert with high severity.' 280 | 281 | - alert: Metadata_Req_per_Sec 282 | expr: sum(kafka_network_request_per_sec{aggregate=~"OneMinuteRate",request=~"Metadata",env="$env"}) > 1 283 | for: 1m 284 | labels: 285 | severity: high 286 | annotations: 287 | summary: 'Alert with high severity.' 
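# Illustrative recording rule (a sketch, not generated by this repo): pre-aggregating the
# one-minute request rate by request type, using the same kafka_network_request_per_sec metric
# as the *_Req_per_Sec rules above. The group and record names are made-up examples.
#
# - name: kafka_request_rate_records
#   rules:
#     - record: kafka:request_per_sec:sum_by_request
#       expr: sum by (request) (kafka_network_request_per_sec{aggregate=~"OneMinuteRate", env="cluster-demo"})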
288 | 289 | - alert: Offset_Commit_Req_per_Sec 290 | expr: sum(kafka_network_request_per_sec{aggregate=~"OneMinuteRate",request=~"OffsetCommit",env="$env"}) > 1 291 | for: 1m 292 | labels: 293 | severity: high 294 | annotations: 295 | summary: 'Alert with high severity.' 296 | 297 | - alert: Offset_Commit_Req_per_Sec 298 | expr: sum(kafka_network_request_per_sec{aggregate=~"OneMinuteRate",request=~"OffsetCommit",env="$env"}) > 1 299 | for: 1m 300 | labels: 301 | severity: high 302 | annotations: 303 | summary: 'Alert with high severity.' 304 | 305 | - alert: Messages_In_per_Topics 306 | expr: sum(kafka_server_broker_topic_metrics_messagesinpersec_rate{topic!=""}) by(topic) > 1 307 | for: 1m 308 | labels: 309 | severity: high 310 | annotations: 311 | summary: 'Alert with high severity.' 312 | 313 | - alert: Messages_In_per_sec_less_than_one 314 | expr: sum(kafka_server_broker_topic_metrics_messagesinpersec_rate{topic=""}) by (broker) > 1 315 | for: 1m 316 | labels: 317 | severity: high 318 | annotations: 319 | summary: 'Alert with high severity.' 320 | 321 | - alert: Bytes_In_per_Sec 322 | expr: sum(kafka_server_broker_topic_metrics_bytesinpersec_rate{topic=""}) by (broker) > 1 323 | for: 1m 324 | labels: 325 | severity: high 326 | annotations: 327 | summary: 'Alert with high severity.' 328 | 329 | - alert: Bytes_Out_per_Sec 330 | expr: sum(kafka_server_broker_topic_metrics_bytesoutpersec_rate{topic=""}) by (broker) > 1 331 | for: 1m 332 | labels: 333 | severity: high 334 | annotations: 335 | summary: 'Alert with high severity.' 336 | 337 | - alert: In-Sync_Replica_Expands_Rate 338 | expr: kafka_server_replica_manager_isrexpandspersec{aggregate="OneMinute",env="$env"} > 1 339 | for: 1m 340 | labels: 341 | severity: high 342 | annotations: 343 | summary: 'Alert with high severity.' 344 | 345 | - alert: In-Sync_Replica_Shrinks_Rate 346 | expr: kafka_server_replica_manager_isrshrinkspersec{aggregate="OneMinute",env="$env"} > 1 347 | for: 1m 348 | labels: 349 | severity: high 350 | annotations: 351 | summary: 'Alert with high severity.' 352 | 353 | - alert: Leader_Election_Rate_ms_1MinuteRate_Mean 354 | expr: kafka_controller_stats_leaderelectionrateandtimems{aggregate=~"Mean",env="$env"} > 1 355 | for: 1m 356 | labels: 357 | severity: high 358 | annotations: 359 | summary: 'Alert with high severity.' 360 | 361 | - alert: Leader_Election_Rate_ms_1MinuteRate_75th_Percentile 362 | expr: kafka_controller_stats_leaderelectionrateandtimems{aggregate=~"75thPercentile",env="$env"} > 1 363 | for: 1m 364 | labels: 365 | severity: high 366 | annotations: 367 | summary: 'Alert with high severity.' 368 | 369 | - alert: Leader_Election_Rate_ms_1MinuteRate_99th_Percentile 370 | expr: kafka_controller_stats_leaderelectionrateandtimems{aggregate=~"99thPercentile",env="$env"} > 1 371 | for: 1m 372 | labels: 373 | severity: high 374 | annotations: 375 | summary: 'Alert with high severity.' 376 | 377 | - alert: Leader_Election_Rate_ms_1MinuteRate_OneMinuteRate 378 | expr: kafka_controller_stats_leaderelectionrateandtimems{aggregate="OneMinuteRate", env="$env"} > 1 379 | for: 1m 380 | labels: 381 | severity: high 382 | annotations: 383 | summary: 'Alert with high severity.' 384 | 385 | - alert: Unclean_Leader_Election_Rate_less_than_0 386 | expr: kafka_controller_stats_uncleanleaderelectionspersec{aggregate="OneMinuteRate",env="$env"} >= 0 387 | for: 1m 388 | labels: 389 | severity: high 390 | annotations: 391 | summary: 'Alert with high severity.' 
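# Illustrative variant (sketch only): to fire only when an unclean leader election actually
# occurs, rather than whenever the metric is present, the comparison would be strictly greater
# than zero. The duration and severity below are assumptions, not part of the generated rules.
#
# - alert: Unclean_Leader_Election_Occurred
#   expr: kafka_controller_stats_uncleanleaderelectionspersec{aggregate="OneMinuteRate", env="cluster-demo"} > 0
#   for: 1m
#   labels:
#     severity: high
#   annotations:
#     summary: 'At least one unclean leader election occurred in the last minute.'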
392 | 393 | - alert: Leader_Count_per_Broker 394 | expr: kafka_server_replica_manager_leadercount{env="$env"} < 1 #there must be at least one leader per broker. 395 | for: 1m 396 | labels: 397 | severity: high 398 | annotations: 399 | summary: 'Alert with high severity.' 400 | 401 | - alert: Number_of_Partitions_per_Broker 402 | expr: kafka_server_replica_manager_partitioncount{env="$env"} < 1 #there must be at least one partition per broker. 403 | for: 1m 404 | labels: 405 | severity: high 406 | annotations: 407 | summary: 'Alert with high severity.' 408 | 409 | # Cluster_Replication 410 | - name: Cluster_Replication 411 | rules: 412 | - alert: Replication_BytesIn_Sec 413 | expr: kafka_server_broker_topic_metrics_replicationbytesinpersec_rate{aggregate="OneMinute",env="$env"} > 1 414 | for: 1m 415 | labels: 416 | severity: high 417 | annotations: 418 | summary: 'Alert with high severity.' 419 | 420 | - alert: Replication_BytesOut_per_Sec 421 | expr: kafka_server_broker_topic_metrics_replicationbytesoutpersec_rate{aggregate="OneMinute",env="$env"} > 1 422 | for: 1m 423 | labels: 424 | severity: high 425 | annotations: 426 | summary: 'Alert with high severity.' 427 | 428 | - alert: Replication_Max_Lag_per_Broker 429 | expr: kafka_server_replica_fetcher_manager_maxlag_value{env="$env"} > 1 430 | for: 1m 431 | labels: 432 | severity: high 433 | annotations: 434 | summary: 'Alert with high severity.' 435 | 436 | - alert: Fetch_Bytes_per_sec_per_Replication_thread 437 | expr: kafka_server_fetcher_stats_bytespersec{env="$env", aggregate="OneMinute"} > 1 438 | for: 1m 439 | labels: 440 | severity: high 441 | annotations: 442 | summary: 'Alert with high severity.' 443 | 444 | - alert: Fetch_Requests_per_sec_Per_Replication_thread 445 | expr: kafka_server_fetcher_stats_requestspersec{env="$env", aggregate="OneMinute"} > 1 446 | for: 1m 447 | labels: 448 | severity: high 449 | annotations: 450 | summary: 'Alert with high severity.' 451 | #Total Log Size 452 | - name: Total_Log_Size 453 | rules: 454 | - alert: Replication_BytesIn_per_Sec 455 | expr: sum(kafka_log_size{env="$env", topic="$topic"}) < 1000 456 | for: 1m 457 | labels: 458 | severity: high 459 | annotations: 460 | summary: 'Alert with high severity.' 461 | 462 | - alert: Total_Number_of_Replicas 463 | expr: count_values("partition", kafka_log_numlogsegments{topic="$topic"}) < 1 464 | for: 1m 465 | labels: 466 | severity: high 467 | annotations: 468 | summary: 'Alert with high severity.' 469 | 470 | - alert: Number_of_Partitions_Under_Min_ISR 471 | expr: sum(kafka_cluster_partition_underminisr{env="$env", topic="$topic"}) > 1 472 | for: 1m 473 | labels: 474 | severity: high 475 | annotations: 476 | summary: 'Alert with high severity.' 477 | 478 | - alert: Under_Replicated_Partitions 479 | expr: sum(kafka_cluster_partition_underreplicated{env="$env", topic="$topic"}) > 1 480 | for: 1m 481 | labels: 482 | severity: high 483 | annotations: 484 | summary: 'Alert with high severity.' 485 | 486 | - alert: Number_of_Segment_Files_Per_Broker 487 | expr: kafka_log_numlogsegments{topic="$topic", env="$env"} > 1 488 | for: 1m 489 | labels: 490 | severity: high 491 | annotations: 492 | summary: 'Alert with high severity.' 493 | 494 | - alert: Messages_In_Per_Sec_OneMinuteRate 495 | expr: kafka_server_broker_topic_metrics_messagesinpersec_rate{topic="$topic", env="$env", aggregate="OneMinute"} > 0.00000000001 496 | for: 1m 497 | labels: 498 | severity: high 499 | annotations: 500 | summary: 'Alert with high severity.' 
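# Illustrative ad-hoc query (sketch): total on-disk size of a single topic across all brokers
# and partitions, using the kafka_log_size metric that the rules in this group build on. The
# topic name "orders" is a placeholder assumption.
#
#   sum(kafka_log_size{env="cluster-demo", topic="orders"})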
501 | 502 | - alert: BytesInPerSec_OneMinuteRate 503 | expr: kafka_server_broker_topic_metrics_bytesinpersec_rate{topic="$topic", env="$env", aggregate="OneMinute"} > 0.000000001 504 | for: 1m 505 | labels: 506 | severity: high 507 | annotations: 508 | summary: 'Alert with high severity.' 509 | 510 | - alert: BytesOutPerSec_OneMinuteRate 511 | expr: kafka_server_broker_topic_metrics_bytesoutpersec_rate{topic="$topic", env="$env", aggregate="OneMinute"} > 1 512 | for: 1m 513 | labels: 514 | severity: high 515 | annotations: 516 | summary: 'Alert with high severity.' 517 | -------------------------------------------------------------------------------- /configs/prometheus/prometheus_template.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 15s 3 | evaluation_interval: 15s 4 | alerting: 5 | alertmanagers: 6 | - static_configs: 7 | - targets: 8 | - null 9 | rule_files: 10 | - null 11 | scrape_configs: 12 | - job_name: kafka 13 | static_configs: 14 | - targets: 15 | -------------------------------------------------------------------------------- /electron.js: -------------------------------------------------------------------------------- 1 | const { app, ipcMain, Menu, shell } = require('electron'); 2 | const { exec } = require('child_process'); 3 | const configGenerator = require('./configs/configGenerator.js'); 4 | 5 | const LaunchWindow = require('./app/LaunchWindow.jsx'); 6 | const MainWindow = require('./app/MainWindow.jsx'); 7 | 8 | let mainWindow; 9 | let launchWindow; 10 | let preferences; 11 | 12 | app.on('ready', () => { 13 | const mainMenu = Menu.buildFromTemplate(menuTemplate); 14 | Menu.setApplicationMenu(mainMenu); 15 | 16 | launchWindow = new LaunchWindow(`file://${__dirname}/src/launch.html`); 17 | launchWindow.on('show', () => { 18 | setTimeout(() => { 19 | launchWindow.focus(); 20 | }, 500); 21 | }); 22 | launchWindow.show(); 23 | }); 24 | 25 | // build app menu 26 | const menuTemplate = [ 27 | { 28 | label: 'File', 29 | submenu: [ 30 | { 31 | role: 'quit', 32 | }, 33 | ], 34 | }, 35 | ]; 36 | 37 | // handle specific menu issue for Macs 38 | if (process.platform === 'darwin') { 39 | menuTemplate.unshift({ 40 | label: '', 41 | }); 42 | } 43 | 44 | // add dev tools to menu if in development mode 45 | if (process.env.NODE_ENV === 'development') { 46 | menuTemplate.push({ 47 | label: 'Developer', 48 | submenu: [{ role: 'reload' }, { role: 'toggleDevTools' }], 49 | }); 50 | } 51 | 52 | // enter the main app once the Docker container has launched 53 | function enterApp() { 54 | // close launch window and show main window 55 | launchWindow.hide(); 56 | mainWindow = new MainWindow(`file://${__dirname}/src/index.html`); 57 | mainWindow.show(); 58 | } 59 | 60 | ipcMain.on('preferences:submit', (_, userPreferences) => { 61 | // generate Prometheus, Docker, Grafana, etc. 
config files based on user input 62 | preferences = userPreferences; 63 | const { brokers, metrics, email } = userPreferences; 64 | configGenerator(brokers, metrics, email); 65 | if (brokers === 1) { 66 | dockerExec( 67 | './configs/docker/docker_single_node.yml up -d --remove-orphans' 68 | ); 69 | } else 70 | dockerExec( 71 | './configs/docker/docker_multiple_nodes.yml up -d --remove-orphans' 72 | ); 73 | 74 | // continue to check for a running cluster every second before launching the main app 75 | function checkForCluster() { 76 | exec('docker logs grafana', (err, stdout, stderr) => { 77 | if (stderr.includes('Cannot connect to the Docker daemon')) { 78 | return launchWindow.webContents.send('docker:closed'); 79 | } 80 | if (!stderr.includes('Error: No such container: grafana')) { 81 | return enterApp(); 82 | } 83 | setTimeout(checkForCluster, 1000); 84 | }); 85 | } 86 | checkForCluster(); 87 | }); 88 | 89 | // send preferences to React side when page is rendered 90 | ipcMain.on('app:rendered', () => { 91 | mainWindow.webContents.send('preferences:send', preferences); 92 | }); 93 | 94 | ipcMain.on('app:quit', () => { 95 | app.quit(); 96 | }); 97 | 98 | // run command to shut down cluster when user presses shutdown button 99 | ipcMain.on('cluster:shutdown', () => { 100 | if (preferences.brokers === 1) { 101 | dockerExec('./configs/docker/docker_single_node.yml down'); 102 | } else dockerExec('./configs/docker/docker_multiple_nodes.yml down'); 103 | mainWindow.hide(); 104 | launchWindow = new LaunchWindow(`file://${__dirname}/src/launch.html`); 105 | }); 106 | 107 | // execute command to start up or shut down Docker app 108 | function dockerExec(path) { 109 | const dockerCommand = 'docker-compose -p kaffia-cluster -f ' + path; 110 | exec(dockerCommand, (err, stdout, stderr) => { 111 | if (err) { 112 | console.log(err); 113 | } 114 | if (stderr) { 115 | console.log(stderr); 116 | } 117 | console.log(stdout); 118 | }); 119 | } 120 | 121 | // open GitHub in external browser when user clicks link on Home page 122 | ipcMain.on('github:launch', () => { 123 | shell.openExternal('https://github.com/oslabs-beta/Kaffia'); 124 | }); 125 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "kaffia", 3 | "version": "1.0.0", 4 | "description": "Apache Kafka visualization and deployment app", 5 | "main": "electron.js", 6 | "scripts": { 7 | "start": "cross-env NODE_ENV=production electron .", 8 | "build": "webpack", 9 | "dev": "concurrently \"cross-env NODE_ENV=development electron .\" \"cross-env NODE_ENV=development webpack-dev-server\"", 10 | "dev-server": "cross-env NODE_ENV=development webpack-dev-server", 11 | "dev-app": "cross-env NODE_ENV=development electron ." 
12 | }, 13 | "repository": { 14 | "type": "git", 15 | "url": "git+https://github.com/aidenblinn/kaffia.git" 16 | }, 17 | "author": "", 18 | "license": "ISC", 19 | "bugs": { 20 | "url": "https://github.com/aidenblinn/kaffia/issues" 21 | }, 22 | "homepage": "https://github.com/aidenblinn/kaffia#readme", 23 | "dependencies": { 24 | "@babel/core": "^7.17.8", 25 | "@babel/preset-env": "^7.16.11", 26 | "@babel/preset-react": "^7.16.7", 27 | "@emotion/react": "^11.8.2", 28 | "@emotion/styled": "^11.8.1", 29 | "@fontsource/roboto": "^4.5.5", 30 | "@mui/icons-material": "^5.5.1", 31 | "@mui/material": "^5.5.2", 32 | "@mui/styles": "^5.6.0", 33 | "babel-loader": "^8.2.3", 34 | "babel-plugin-import": "^1.13.3", 35 | "concurrently": "^7.0.0", 36 | "cross-env": "^7.0.3", 37 | "css-loader": "^6.7.1", 38 | "electron": "^17.4.0", 39 | "js-yaml": "^4.1.0", 40 | "react": "^17.0.2", 41 | "react-copy-to-clipboard": "^5.0.4", 42 | "react-dom": "^17.0.2", 43 | "react-router-dom": "^6.2.2", 44 | "sass": "^1.49.9", 45 | "sass-loader": "^12.6.0", 46 | "style-loader": "^3.3.1", 47 | "webpack": "^5.70.0", 48 | "webpack-cli": "^4.9.2", 49 | "webpack-dev-server": "^4.7.4" 50 | }, 51 | "devDependencies": { 52 | "url-loader": "^4.1.1" 53 | } 54 | } 55 | -------------------------------------------------------------------------------- /src/assets/app-logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oslabs-beta/Kaffia/4b585a0b94846f460c865e2f08103d25d42d0020/src/assets/app-logo.png -------------------------------------------------------------------------------- /src/assets/github-logo.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oslabs-beta/Kaffia/4b585a0b94846f460c865e2f08103d25d42d0020/src/assets/github-logo.jpg -------------------------------------------------------------------------------- /src/assets/icon-mac.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oslabs-beta/Kaffia/4b585a0b94846f460c865e2f08103d25d42d0020/src/assets/icon-mac.png -------------------------------------------------------------------------------- /src/assets/icon-windows.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oslabs-beta/Kaffia/4b585a0b94846f460c865e2f08103d25d42d0020/src/assets/icon-windows.png -------------------------------------------------------------------------------- /src/assets/launch-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oslabs-beta/Kaffia/4b585a0b94846f460c865e2f08103d25d42d0020/src/assets/launch-demo.gif -------------------------------------------------------------------------------- /src/assets/main-demo.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oslabs-beta/Kaffia/4b585a0b94846f460c865e2f08103d25d42d0020/src/assets/main-demo.gif -------------------------------------------------------------------------------- /src/assets/white-icon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/oslabs-beta/Kaffia/4b585a0b94846f460c865e2f08103d25d42d0020/src/assets/white-icon.png -------------------------------------------------------------------------------- /src/components/App.jsx: 
-------------------------------------------------------------------------------- 1 | import React, { Component } from 'react'; 2 | import { HashRouter, Route, Routes, useLocation } from 'react-router-dom'; 3 | import { ipcRenderer } from 'electron'; 4 | 5 | import BrokerHardDiskUsage from './BrokerHardDiskUsage'; 6 | import BrokerJvmAndOs from './BrokerJvmAndOs'; 7 | import BrokerPerformance from './BrokerPerformance'; 8 | import BrokerZookeeper from './BrokerZookeeper'; 9 | import ClusterHealthCheck from './ClusterHealthCheck'; 10 | import ClusterReplication from './ClusterReplication'; 11 | import TopicLogs from './TopicsLogs'; 12 | import HelpTab from './HelpTab'; 13 | import Home from './Home'; 14 | import Sidebar from './Sidebar'; 15 | 16 | import links from '../models/metricUrls'; 17 | 18 | class App extends Component { 19 | constructor() { 20 | super(); 21 | this.state = { 22 | links, 23 | }; 24 | } 25 | 26 | componentDidMount() { 27 | ipcRenderer.send('app:rendered'); 28 | ipcRenderer.on('preferences:send', (event, preferences) => { 29 | this.setState((prevState) => { 30 | return { 31 | ...prevState, 32 | preferences, 33 | }; 34 | }); 35 | }); 36 | } 37 | 38 | render() { 39 | let broker_hard_disk_usage, 40 | broker_jvm_os, 41 | broker_performance, 42 | broker_zookeeper, 43 | cluster_healthcheck, 44 | cluster_replication, 45 | topics_logs; 46 | if (this.state.preferences) { 47 | ({ 48 | broker_hard_disk_usage, 49 | broker_jvm_os, 50 | broker_performance, 51 | broker_zookeeper, 52 | cluster_healthcheck, 53 | cluster_replication, 54 | topics_logs, 55 | } = this.state.preferences.metrics); 56 | } 57 | 58 | return ( 59 | 60 | {this.state.preferences && ( 61 | <> 62 | 63 |
70 | 71 | } /> 72 | 80 | } 81 | /> 82 | 90 | } 91 | /> 92 | 100 | } 101 | /> 102 | 110 | } 111 | /> 112 | 120 | } 121 | /> 122 | 130 | } 131 | /> 132 | } 136 | /> 137 | } /> 138 | 139 |
140 | 141 | )} 142 |
143 | ); 144 | } 145 | } 146 | 147 | export default App; 148 | -------------------------------------------------------------------------------- /src/components/BrokerHardDiskUsage.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import Typography from '@mui/material/Typography'; 3 | import renderMetricPanels from './_utils/renderMetricPanels'; 4 | 5 | function BrokerHardDiskUsage(props) { 6 | const { metrics, metricURLs } = props; 7 | 8 | return ( 9 | <> 10 | Broker Hard Disk Usage 11 | {metrics && renderMetricPanels(metrics, metricURLs)} 12 | 13 | ); 14 | } 15 | 16 | export default BrokerHardDiskUsage; 17 | -------------------------------------------------------------------------------- /src/components/BrokerJVMAndOS.jsx: -------------------------------------------------------------------------------- 1 | import Typography from '@mui/material/Typography'; 2 | import React from 'react'; 3 | import renderMetricPanels from './_utils/renderMetricPanels'; 4 | 5 | function BrokerJVMAndOS(props) { 6 | const {metrics, metricURLs} = props; 7 | 8 | return ( 9 | <> 10 | Broker JVM and OS 11 | {metrics && renderMetricPanels(metrics, metricURLs)} 12 | 13 | ); 14 | } 15 | 16 | export default BrokerJVMAndOS; 17 | -------------------------------------------------------------------------------- /src/components/BrokerPerformance.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import Typography from '@mui/material/Typography'; 3 | import renderMetricPanels from './_utils/renderMetricPanels'; 4 | 5 | function BrokerPerformance(props) { 6 | const {metrics, metricURLs} = props; 7 | 8 | return ( 9 | <> 10 | Broker Performance 11 | {metrics && renderMetricPanels(metrics, metricURLs)} 12 | 13 | ); 14 | } 15 | 16 | export default BrokerPerformance; 17 | -------------------------------------------------------------------------------- /src/components/BrokerZookeeper.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import Typography from '@mui/material/Typography'; 3 | import renderMetricPanels from './_utils/renderMetricPanels'; 4 | 5 | function BrokerZookeeper(props) { 6 | const {metrics, metricURLs} = props; 7 | 8 | return ( 9 | <> 10 | Broker Zookeeper 11 | {metrics && renderMetricPanels(metrics, metricURLs)} 12 | 13 | ); 14 | } 15 | 16 | export default BrokerZookeeper; 17 | -------------------------------------------------------------------------------- /src/components/ClusterHealthCheck.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import Typography from '@mui/material/Typography'; 3 | import renderMetricPanels from './_utils/renderMetricPanels'; 4 | 5 | function ClusterHealthCheck(props) { 6 | const {metrics, metricURLs} = props; 7 | 8 | return ( 9 | <> 10 | Cluster Health Check 11 | {metrics && renderMetricPanels(metrics, metricURLs)} 12 | 13 | ); 14 | } 15 | 16 | export default ClusterHealthCheck; 17 | -------------------------------------------------------------------------------- /src/components/ClusterReplication.jsx: -------------------------------------------------------------------------------- 1 | import React from 'react'; 2 | import Typography from '@mui/material/Typography'; 3 | import renderMetricPanels from './_utils/renderMetricPanels'; 4 | 5 | function ClusterReplication(props) { 6 | const {metrics, 
metricURLs} = props; 7 | 8 | return ( 9 | <> 10 | Cluster Replication 11 | {metrics && renderMetricPanels(metrics, metricURLs)} 12 | 13 | ); 14 | } 15 | 16 | export default ClusterReplication; 17 | -------------------------------------------------------------------------------- /src/components/GrafanaDash.jsx: -------------------------------------------------------------------------------- 1 | import { Card, Container } from '@mui/material'; 2 | 3 | import React from 'react'; 4 | 5 | function GrafChart() { 6 | return ( 7 | 12 | 13 |