├── alert-generation ├── templates │ ├── grafana-export-header-template.yml │ ├── prometheus-export-template.yml │ ├── grafana-export-group-template.yml │ └── grafana-export-rule-template.yml ├── Dockerfile ├── generate.py └── README.md ├── .gitignore ├── docs └── images │ ├── Kafka Consume1.png │ ├── Kafka Consume2.png │ ├── Kafka Produce1.png │ ├── Kafka Produce2.png │ ├── Kafka Produce3.png │ ├── Ops Dashboard.png │ ├── Topic Metrics.png │ ├── Consumer Offsets.png │ └── Grafana Alerting.png ├── grafana-dashboards ├── src │ ├── .gitignore │ ├── jsonnetfile.json │ ├── Makefile │ ├── README.md │ ├── dashboards │ │ ├── serverless.jsonnet │ │ ├── serverless-schema-registry.jsonnet │ │ ├── serverless-quota.jsonnet │ │ ├── serverless-rpcn.jsonnet │ │ └── serverless-overview.jsonnet │ └── lib │ │ └── common.libsonnet ├── Kafka-Topic-Metrics.json ├── redpanda-data-transforms.json └── Kafka-Consumer-Offsets.json ├── cloud ├── images │ └── prometheus-info-screen.png ├── config │ ├── grafana │ │ └── provisioning │ │ │ ├── datasources │ │ │ └── prometheus.yaml │ │ │ └── dashboards │ │ │ └── redpanda.yaml │ └── prometheus.yml ├── docker-compose.yml └── README.md ├── demo ├── config │ ├── grafana │ │ └── provisioning │ │ │ ├── datasources │ │ │ └── prometheus.yaml │ │ │ └── dashboards │ │ │ └── redpanda.yaml │ ├── alertmanager │ │ └── alertmanager.yml │ ├── prometheus │ │ ├── prometheus.yml │ │ └── alert-rules.yml │ ├── jmx_exporter │ │ └── kafka.yml │ └── alert-definitions.yml ├── README.md └── docker-compose.yml ├── README.md └── LICENSE /alert-generation/templates/grafana-export-header-template.yml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | groups: [] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | shelf/ 3 | workspace.xml 4 | .idea/ 5 | venv/ 6 | .DS_Store 
7 | 8 | -------------------------------------------------------------------------------- /alert-generation/templates/prometheus-export-template.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: alert.rules 3 | rules: [] -------------------------------------------------------------------------------- /docs/images/Kafka Consume1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/observability/HEAD/docs/images/Kafka Consume1.png -------------------------------------------------------------------------------- /docs/images/Kafka Consume2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/observability/HEAD/docs/images/Kafka Consume2.png -------------------------------------------------------------------------------- /docs/images/Kafka Produce1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/observability/HEAD/docs/images/Kafka Produce1.png -------------------------------------------------------------------------------- /docs/images/Kafka Produce2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/observability/HEAD/docs/images/Kafka Produce2.png -------------------------------------------------------------------------------- /docs/images/Kafka Produce3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/observability/HEAD/docs/images/Kafka Produce3.png -------------------------------------------------------------------------------- /docs/images/Ops Dashboard.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/redpanda-data/observability/HEAD/docs/images/Ops Dashboard.png -------------------------------------------------------------------------------- /docs/images/Topic Metrics.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/observability/HEAD/docs/images/Topic Metrics.png -------------------------------------------------------------------------------- /grafana-dashboards/src/.gitignore: -------------------------------------------------------------------------------- 1 | # Vendored dependencies 2 | vendor/ 3 | 4 | # Lock file 5 | jsonnetfile.lock.json 6 | -------------------------------------------------------------------------------- /alert-generation/templates/grafana-export-group-template.yml: -------------------------------------------------------------------------------- 1 | orgId: 1 2 | name: "" 3 | folder: "" 4 | interval: 1m 5 | rules: [] -------------------------------------------------------------------------------- /docs/images/Consumer Offsets.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/observability/HEAD/docs/images/Consumer Offsets.png -------------------------------------------------------------------------------- /docs/images/Grafana Alerting.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/observability/HEAD/docs/images/Grafana Alerting.png -------------------------------------------------------------------------------- /cloud/images/prometheus-info-screen.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/redpanda-data/observability/HEAD/cloud/images/prometheus-info-screen.png -------------------------------------------------------------------------------- /alert-generation/Dockerfile: 
-------------------------------------------------------------------------------- 1 | FROM python:3.9 2 | 3 | ADD generate.py . 4 | ADD templates /templates 5 | 6 | RUN pip install PyYAML 7 | 8 | CMD ["python", "./generate.py"] -------------------------------------------------------------------------------- /cloud/config/grafana/provisioning/datasources/prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: prometheus 5 | type: prometheus 6 | access: proxy 7 | httpMethod: POST 8 | url: http://prometheus:9090 9 | isDefault: true -------------------------------------------------------------------------------- /demo/config/grafana/provisioning/datasources/prometheus.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | datasources: 4 | - name: prometheus 5 | uid: P1809F7CD0C75ACF3 6 | type: prometheus 7 | access: proxy 8 | httpMethod: POST 9 | url: http://prometheus:9090 10 | isDefault: true -------------------------------------------------------------------------------- /cloud/config/grafana/provisioning/dashboards/redpanda.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'redpanda' 5 | orgId: 1 6 | folder: '' 7 | folderUid: '' 8 | type: file 9 | disableDeletion: false 10 | updateIntervalSeconds: 10 11 | allowUiUpdates: false 12 | options: 13 | path: /var/lib/grafana/dashboards -------------------------------------------------------------------------------- /demo/config/grafana/provisioning/dashboards/redpanda.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: 1 2 | 3 | providers: 4 | - name: 'redpanda' 5 | orgId: 1 6 | folder: '' 7 | folderUid: '' 8 | type: file 9 | disableDeletion: false 10 | updateIntervalSeconds: 10 11 | allowUiUpdates: false 12 | options: 13 | path: 
/var/lib/grafana/dashboards -------------------------------------------------------------------------------- /grafana-dashboards/src/jsonnetfile.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/grafana/grafonnet", 8 | "subdir": "gen/grafonnet-latest" 9 | } 10 | }, 11 | "version": "main" 12 | } 13 | ], 14 | "legacyImports": false 15 | } 16 | -------------------------------------------------------------------------------- /demo/config/alertmanager/alertmanager.yml: -------------------------------------------------------------------------------- 1 | 2 | route: 3 | receiver: 'mailhog' 4 | repeat_interval: 4h 5 | group_by: [ alertname ] 6 | 7 | 8 | receivers: 9 | - name: 'mailhog' 10 | email_configs: 11 | - smarthost: 'mailhog:1025' 12 | from: 'alertmanager@observability-demo.redpanda.com' 13 | to: 'recipient@observability-demo.redpanda.com' 14 | require_tls: false -------------------------------------------------------------------------------- /grafana-dashboards/src/Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: all install clean build 2 | 3 | JSONNET_BIN := jsonnet 4 | JB_BIN := jb 5 | OUTPUT_DIR := .. 6 | VENDOR_DIR := vendor 7 | 8 | all: install build 9 | 10 | install: 11 | @echo "Installing grafonnet dependencies..." 12 | @$(JB_BIN) install 13 | 14 | build: 15 | @echo "Building dashboards..." 16 | @$(JSONNET_BIN) -J $(VENDOR_DIR) -o $(OUTPUT_DIR)/Redpanda-Serverless-Dashboard.json dashboards/serverless.jsonnet 17 | @echo "Dashboard generated successfully!" 
18 | -------------------------------------------------------------------------------- /demo/config/prometheus/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 10s 3 | evaluation_interval: 10s 4 | 5 | scrape_configs: 6 | - job_name: redpanda 7 | static_configs: 8 | - targets: 9 | - redpanda-0:9644 10 | - redpanda-1:9644 11 | - redpanda-2:9644 12 | metrics_path: /public_metrics 13 | - job_name: connect 14 | static_configs: 15 | - targets: 16 | - connect:9010 17 | metrics_path: / 18 | 19 | alerting: 20 | alertmanagers: 21 | - scheme: http 22 | static_configs: 23 | - targets: 24 | - alertmanager:9093 25 | 26 | rule_files: 27 | - /etc/prometheus/alert-rules.yml -------------------------------------------------------------------------------- /cloud/config/prometheus.yml: -------------------------------------------------------------------------------- 1 | global: 2 | scrape_interval: 10s 3 | evaluation_interval: 10s 4 | 5 | scrape_configs: 6 | # The job name is added as a label `job=` to any timeseries scraped from this config. 
7 | # Replace the section below with what you've copied from `Prometheus YAML` 8 | # in the "Redpanda Cloud > Overview > How to connect" Screen 9 | - job_name: <> 10 | static_configs: 11 | - targets: 12 | - <> 13 | metrics_path: /api/cloud/prometheus/public_metrics 14 | basic_auth: 15 | username: prometheus 16 | password: <> 17 | scheme: https 18 | -------------------------------------------------------------------------------- /cloud/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | grafana: 4 | image: grafana/grafana 5 | container_name: grafana 6 | environment: 7 | - "GF_AUTH_ANONYMOUS_ENABLED=true" 8 | - "GF_AUTH_ANONYMOUS_ORG_ROLE=Admin" 9 | - "GF_AUTH_ANONYMOUS_HIDE_VERSION=true" 10 | volumes: 11 | # Mount provisioning configuration 12 | - "./config/grafana/provisioning:/etc/grafana/provisioning" 13 | # Mount dashboards 14 | - "../grafana-dashboards:/var/lib/grafana/dashboards" 15 | ports: 16 | - 3000:3000 17 | 18 | prometheus: 19 | image: prom/prometheus 20 | container_name: prometheus 21 | # Mount prometheus configuration 22 | volumes: 23 | - "./config/prometheus.yml:/etc/prometheus/prometheus.yml" 24 | ports: 25 | - 9090:9090 26 | -------------------------------------------------------------------------------- /grafana-dashboards/src/README.md: -------------------------------------------------------------------------------- 1 | # Grafana Dashboards Generator 2 | 3 | This directory contains Jsonnet source code for generating Grafana dashboards using [Grafonnet](https://grafana.github.io/grafonnet/). 
4 | 5 | ## Prerequisites 6 | 7 | Install the following tools: 8 | 9 | - **jsonnet**: `brew install jsonnet` (macOS) or see [jsonnet.org](https://jsonnet.org/) 10 | - **jsonnet-bundler (jb)**: `go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest` 11 | 12 | ## Build Dashboards 13 | 14 | ```bash 15 | # Install dependencies 16 | make install 17 | 18 | # Generate JSON dashboards 19 | make build 20 | ``` 21 | 22 | Generated dashboards will be created in the parent directory (`grafana-dashboards/`). 23 | 24 | ## Adding New Dashboards 25 | 26 | Create a new `.jsonnet` file in `dashboards/` and add a build target in the `Makefile`: 27 | 28 | See the `serverless.jsonnet` example for code organization. 29 | 30 | ## Importing Dashboards 31 | 32 | The generated dashboards use datasource variables instead of hardcoded UIDs, making them portable across Grafana instances: 33 | 34 | 1. In Grafana, go to **Dashboards → Import** 35 | 2. Upload the JSON file 36 | 3. Select your Prometheus datasource when prompted 37 | 38 | ## Resources 39 | 40 | - [Grafonnet Documentation](https://grafana.github.io/grafonnet/) 41 | - [Jsonnet Tutorial](https://jsonnet.org/learning/tutorial.html) 42 | -------------------------------------------------------------------------------- /grafana-dashboards/src/dashboards/serverless.jsonnet: -------------------------------------------------------------------------------- 1 | local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local common = import '../lib/common.libsonnet'; 3 | 4 | // Import dashboard sections 5 | local overview = import './serverless-overview.jsonnet'; 6 | local quota = import './serverless-quota.jsonnet'; 7 | local schemaRegistry = import './serverless-schema-registry.jsonnet'; 8 | local rpcn = import './serverless-rpcn.jsonnet'; 9 | 10 | local datasource = { type: 'prometheus', uid: '${DS_PROMETHEUS}' }; 11 | 12 | // Dashboard variables 13 | local variables = [ 14 | 
common.datasourceVariable(), 15 | common.dataClusterVariable(datasource), 16 | ]; 17 | 18 | // Build the complete panel list (flat structure with rows as separators) 19 | // Concatenate all sections and apply auto-layout in one pass 20 | local allPanels = common.layoutPanels( 21 | // Overview section 22 | [overview.row] + overview.panels(datasource) + 23 | // Quota section 24 | [quota.row] + quota.panels(datasource) + 25 | // Schema Registry section 26 | [schemaRegistry.row] + schemaRegistry.panels(datasource) + 27 | // Redpanda Connect section 28 | [rpcn.row] + rpcn.panels(datasource) 29 | ); 30 | 31 | // Build the dashboard 32 | grafonnet.dashboard.new('Redpanda Serverless Dashboard') 33 | + grafonnet.dashboard.withUid('redpanda-serverless') 34 | + grafonnet.dashboard.withTags([]) 35 | + grafonnet.dashboard.withTimezone('') 36 | + grafonnet.dashboard.withEditable(true) 37 | + grafonnet.dashboard.withRefresh('5s') 38 | + grafonnet.dashboard.time.withFrom('now-15m') 39 | + grafonnet.dashboard.time.withTo('now') 40 | + grafonnet.dashboard.graphTooltip.withSharedCrosshair() 41 | + grafonnet.dashboard.withVariables(variables) 42 | + grafonnet.dashboard.withPanels(allPanels) 43 | + { __requires: common.dashboardRequirements() } 44 | -------------------------------------------------------------------------------- /alert-generation/templates/grafana-export-rule-template.yml: -------------------------------------------------------------------------------- 1 | uid: 2 | title: 3 | condition: C 4 | data: 5 | - refId: A 6 | relativeTimeRange: 7 | from: 900 8 | to: 0 9 | datasourceUid: 10 | model: 11 | datasource: 12 | type: prometheus 13 | uid: 14 | exemplar: true 15 | expr: 16 | hide: false 17 | interval: "" 18 | intervalFactor: 1 19 | intervalMs: 15000 20 | legendFormat: Nodes Up 21 | maxDataPoints: 100 22 | refId: A 23 | step: 40 24 | - refId: B 25 | relativeTimeRange: 26 | from: 900 27 | to: 0 28 | datasourceUid: __expr__ 29 | model: 30 | conditions: 31 | - evaluator: 32 
| params: [] 33 | type: gt 34 | operator: 35 | type: and 36 | query: 37 | params: 38 | - B 39 | reducer: 40 | params: [] 41 | type: last 42 | type: query 43 | datasource: 44 | type: __expr__ 45 | uid: __expr__ 46 | expression: A 47 | hide: false 48 | intervalMs: 1000 49 | maxDataPoints: 43200 50 | reducer: last 51 | refId: B 52 | settings: 53 | mode: replaceNN 54 | replaceWithValue: 0 55 | type: reduce 56 | - refId: C 57 | relativeTimeRange: 58 | from: 900 59 | to: 0 60 | datasourceUid: __expr__ 61 | model: 62 | conditions: 63 | - evaluator: 64 | params: 65 | - 5 66 | type: lt 67 | operator: 68 | type: and 69 | query: 70 | params: 71 | - C 72 | reducer: 73 | params: [] 74 | type: last 75 | type: query 76 | datasource: 77 | type: __expr__ 78 | uid: __expr__ 79 | expression: B 80 | hide: false 81 | intervalMs: 1000 82 | maxDataPoints: 43200 83 | refId: C 84 | type: threshold 85 | noDataState: NoData 86 | execErrState: Error 87 | for: 88 | annotations: 89 | labels: 90 | isPaused: false -------------------------------------------------------------------------------- /demo/README.md: -------------------------------------------------------------------------------- 1 | # Sandbox 2 | This example contains a fully dockerized sandbox with: 3 | * A 3-node Redpanda cluster 4 | * Redpanda Console 5 | * Prometheus 6 | * Alert Manager (from Prometheus) 7 | * [Alert Generation](../alert-generation) 8 | * Grafana 9 | * Mailhog 10 | * Kafka Connect. Used here to showcase the Kafka producer/consumer dashboards 11 | * Owl Shop. A mock e-commerce application that generates data. 
12 | 13 | ## Setup 14 | To get started it is as simple as running the docker compose file: 15 | 16 | ```commandline 17 | $ cd demo 18 | $ docker-compose up -d 19 | [+] Running 9/9 20 | ⠿ Network demo_default Created 21 | ⠿ Container grafana Started 22 | ⠿ Container prometheus Started 23 | ⠿ Container redpanda-1 Started 24 | ⠿ Container redpanda-0 Started 25 | ⠿ Container redpanda-2 Started 26 | ⠿ Container demo-owl-shop-1 Started 27 | ⠿ Container demo-console-1 Started 28 | ⠿ Container connect Started 29 | ``` 30 | 31 | You can check the status of the Redpanda cluster using the following command: 32 | ```commandline 33 | $ docker exec -it redpanda-0 rpk cluster status 34 | CLUSTER 35 | ======= 36 | redpanda.initializing 37 | 38 | BROKERS 39 | ======= 40 | ID HOST PORT 41 | 0* redpanda-0 29092 42 | 1 redpanda-1 29093 43 | 2 redpanda-2 29094 44 | 45 | ``` 46 | Once the bootstrap is complete, you should see all three nodes running and the cluster's UUID displayed. 47 | ```commandline 48 | $ docker exec -it redpanda-0 rpk cluster status 49 | CLUSTER 50 | ======= 51 | redpanda.initializing 52 | 53 | BROKERS 54 | ======= 55 | ID HOST PORT 56 | 0* redpanda-0 29092 57 | 1 redpanda-1 29093 58 | 2 redpanda-2 29094 59 | 60 | ``` 61 | You should now be able to open the following URIs in your browser for each service: 62 | - Redpanda Console: [http://localhost:8080/](http://localhost:8080/) 63 | - Prometheus: [http://localhost:9090](http://localhost:9090) 64 | - Grafana: [http://localhost:3000](http://localhost:3000) 65 | - Mailhog: [http://localhost:8025](http://localhost:8025) 66 | 67 | Once you log into Grafana, click on the Dashboards icon on the left and select Browse. From there, you should be able to 68 | see the imported dashboards described above. 
69 | -------------------------------------------------------------------------------- /grafana-dashboards/src/dashboards/serverless-schema-registry.jsonnet: -------------------------------------------------------------------------------- 1 | local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local common = import '../lib/common.libsonnet'; 3 | local sectionName = 'Schema Registry'; 4 | 5 | { 6 | row:: 7 | grafonnet.panel.row.new(sectionName) 8 | + grafonnet.panel.row.gridPos.withW(24) 9 | + grafonnet.panel.row.gridPos.withH(1), 10 | 11 | panels(datasource):: [ 12 | // Section header 13 | common.sectionHeaderPanel(sectionName), 14 | 15 | // Schemas 16 | common.statPanel( 17 | 'Schemas', 18 | [ 19 | common.prometheusQuery( 20 | 'sum(redpanda_schema_registry_cache_schema_count{redpanda_cloud_data_cluster_name=~"${data_cluster}"})', 21 | 'Total Schemas' 22 | ), 23 | ] 24 | ) 25 | + grafonnet.panel.stat.gridPos.withW(3) 26 | + grafonnet.panel.stat.gridPos.withH(6) 27 | + grafonnet.panel.stat.standardOptions.withUnit('none'), 28 | 29 | // Schema Registry Request Rate 30 | common.timeseriesPanel( 31 | 'Schema Registry Request Rate', 32 | [ 33 | common.prometheusQuery( 34 | 'sum by (listener) (rate(redpanda_schema_registry_request_latency_seconds_count{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))', 35 | 'Request Rate ({{listener}})' 36 | ), 37 | ] 38 | ) 39 | + grafonnet.panel.timeSeries.gridPos.withW(7) 40 | + grafonnet.panel.timeSeries.gridPos.withH(6) 41 | + grafonnet.panel.timeSeries.standardOptions.withUnit('reqps') 42 | + grafonnet.panel.timeSeries.standardOptions.withDecimals(2), 43 | 44 | // Schema Registry P95 Latency 45 | common.timeseriesPanel( 46 | 'Schema Registry P95 Latency', 47 | [ 48 | common.prometheusQuery( 49 | 'histogram_quantile(0.95, sum by (le, listener) 
"""Generate Grafana and Prometheus alert files from one alert-definition YAML file.

Configuration is read from environment variables:

* ``ALERT_DEFINITIONS_YAML_FILE_LOCATION`` - input file; a mapping with a ``rules`` list.
* ``GRAFANA_ALERTS_YAML_FILE_LOCATION`` - where the Grafana alert export is written.
* ``PROMETHEUS_ALERTS_YAML_FILE_LOCATION`` - where the Prometheus rule file is written.
* ``GRAFANA_DATASOURCE_UID`` (optional) - datasource UID stamped into every Grafana
  rule; defaults to ``DEFAULT_DATASOURCE_UID``.
"""
import copy
import os
from pathlib import Path

try:
    import yaml
except ImportError:
    # The module is called ``yaml`` but the package is ``PyYAML``; defer the failure so
    # it can be reported with an actionable message (see _require_yaml).
    yaml = None


# UID used when GRAFANA_DATASOURCE_UID is not set. It matches the `uid` of the
# Prometheus datasource provisioned in demo/config/grafana/provisioning/datasources.
DEFAULT_DATASOURCE_UID = "P1809F7CD0C75ACF3"

# Alert-definition comparison name -> PromQL comparison operator.
_COMPARISON_OPERATORS = {"lt": "<", "gt": ">"}

# Keys of an alert definition that must not appear in a Prometheus rule.
_NON_PROMETHEUS_KEYS = ("uid", "folder", "evaluation_group", "comparison", "threshold")


def _require_yaml():
    """Raise a readable error when PyYAML could not be imported."""
    if yaml is None:
        raise RuntimeError("PyYAML is required to generate alerts: pip install PyYAML")


def _require_env(name):
    """Return the value of environment variable `name`, or raise if it is unset or empty."""
    value = os.environ.get(name)
    if not value:
        raise RuntimeError("Environment variable {} must be set".format(name))
    return value


def _template_path(name):
    """Locate template `name`.

    The working-directory-relative `templates/` folder is tried first (the original
    behaviour); the folder next to this script is the fallback so the tool also works
    when started from another directory.
    """
    cwd_candidate = Path("templates") / name
    if cwd_candidate.exists():
        return cwd_candidate
    return Path(__file__).resolve().parent / "templates" / name


def _load_yaml(path):
    """Parse the YAML document stored at `path` with the safe loader."""
    _require_yaml()
    return yaml.safe_load(Path(path).read_text(encoding="utf-8"))


def _dump_yaml(document, path):
    """Write `document` as YAML to `path`, creating missing parent directories."""
    _require_yaml()
    target = Path(path)
    # The output folder may not exist yet on a fresh checkout.
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, 'w', encoding="utf-8") as outfile:
        yaml.dump(document, outfile, width=500)


def _build_grafana_rule(template, alert, datasource_uid):
    """Return a Grafana rule built from a private copy of `template` and one alert definition."""
    rule = copy.deepcopy(template)
    rule["uid"] = alert["uid"]
    rule["title"] = alert["alert"]
    rule["for"] = alert["for"]
    rule["labels"] = alert["labels"]
    rule["annotations"] = alert["annotations"]

    # data[0] is the Prometheus query (refId A in the template).
    query = rule["data"][0]
    query["model"]["expr"] = alert["expr"]
    query["datasourceUid"] = datasource_uid
    query["model"]["datasource"]["uid"] = datasource_uid

    # data[2] is the threshold expression (refId C in the template).
    evaluator = rule["data"][2]["model"]["conditions"][0]["evaluator"]
    evaluator["type"] = alert["comparison"]
    # Replace the whole list so an empty `params: []` in the template also works.
    evaluator["params"] = [alert["threshold"]]
    return rule


def _to_prometheus_rule(alert):
    """Return the Prometheus rule for one alert definition without mutating `alert`."""
    rule = {key: value for key, value in alert.items() if key not in _NON_PROMETHEUS_KEYS}
    rule["expr"] = "({}) {} {}".format(
        alert["expr"], convert_comparison(alert["comparison"]), alert["threshold"]
    )
    return rule


def generate_grafana_alerts():
    """Write the Grafana alert export, one group per `folder` found in the definitions.

    Raises:
        RuntimeError: a required environment variable is unset or PyYAML is missing.
        KeyError: an alert definition lacks a field used to build a rule.
    """
    # Validate configuration up front so nothing is half-processed on a bad setup.
    definitions_path = _require_env('ALERT_DEFINITIONS_YAML_FILE_LOCATION')
    output_path = _require_env('GRAFANA_ALERTS_YAML_FILE_LOCATION')
    datasource_uid = os.environ.get('GRAFANA_DATASOURCE_UID') or DEFAULT_DATASOURCE_UID

    alerts = _load_yaml(definitions_path)["rules"]
    # Templates are parsed once and deep-copied per use instead of re-read per alert.
    rule_template = _load_yaml(_template_path("grafana-export-rule-template.yml"))
    group_template = _load_yaml(_template_path("grafana-export-group-template.yml"))

    # dict keeps insertion order, so groups appear in first-seen folder order.
    rules_by_folder = {}
    for alert in alerts:
        rule = _build_grafana_rule(rule_template, alert, datasource_uid)
        rules_by_folder.setdefault(alert["folder"], []).append(rule)

    groups = []
    for folder, folder_rules in rules_by_folder.items():
        group = copy.deepcopy(group_template)
        group["folder"] = folder
        # NOTE(review): the group name is the folder; `evaluation_group` from the
        # definitions is not used here - confirm that is intended.
        group["name"] = folder
        group["rules"].extend(folder_rules)
        groups.append(group)

    export = _load_yaml(_template_path("grafana-export-header-template.yml"))
    export["groups"] = groups
    _dump_yaml(export, output_path)


def convert_comparison(comparison):
    """Translate a comparison name (`"lt"` or `"gt"`) into its PromQL operator.

    Raises:
        ValueError: `comparison` is not a known comparison name.
    """
    try:
        return _COMPARISON_OPERATORS[comparison]
    except (KeyError, TypeError):
        raise ValueError(
            "Don't know how to convert unknown comparison function: {}".format(comparison)
        ) from None


def generate_prometheus_alerts():
    """Write the Prometheus rule file, placing every alert in the template's single group.

    Raises:
        RuntimeError: a required environment variable is unset or PyYAML is missing.
        KeyError: an alert definition lacks `expr`, `comparison` or `threshold`.
        ValueError: an alert uses an unknown comparison name.
    """
    definitions_path = _require_env('ALERT_DEFINITIONS_YAML_FILE_LOCATION')
    output_path = _require_env('PROMETHEUS_ALERTS_YAML_FILE_LOCATION')

    alerts = _load_yaml(definitions_path)["rules"]
    export = _load_yaml(_template_path("prometheus-export-template.yml"))
    export["groups"][0]["rules"] = [_to_prometheus_rule(alert) for alert in alerts]
    _dump_yaml(export, output_path)


def main():
    """Generate both alert files."""
    generate_grafana_alerts()
    generate_prometheus_alerts()


# Guarded so importing the module (for reuse or testing) does not write any files.
if __name__ == "__main__":
    main()
-------------------------------------------------------------------------------- /cloud/README.md: -------------------------------------------------------------------------------- 1 | # Redpanda Cloud 2 | 3 | Redpanda Cloud exposes an OpenMetrics endpoint that can be scraped using Prometheus or other 4 | compatible tools. This example shows how to set up Prometheus and Grafana to work with 5 | clusters in Redpanda Cloud. 6 | 7 | --- 8 | 9 | **NOTE**: This is for instructional purposes only and not meant for production use. For production, 10 | we recommend deploying Prometheus and Grafana standalone or as a managed service. You can then 11 | directly import the dashboards provided in the [grafana-dashboards](../grafana-dashboards) 12 | directory into Grafana. 13 | 14 | --- 15 | 16 | ## Setup 17 | 18 | To get started, clone this repository into a local directory named `observability`. 19 | 20 | ## Browse the project 21 | 22 | Navigate to the cloud example: 23 | 24 | ``` 25 | cd observability/cloud 26 | ``` 27 | 28 | In there you will see a config directory with the following structure: 29 | 30 | ``` 31 | config 32 | ├── grafana 33 | │   └── provisioning 34 | │   ├── dashboards 35 | │   │   └── redpanda.yaml 36 | │   └── datasources 37 | │   └── prometheus.yaml 38 | └── prometheus.yml 39 | ``` 40 | 41 | This directory is mounted by Docker to provide configuration and other files needed by the 42 | Prometheus and Grafana containers. You will need to modify `config/prometheus.yml` such that 43 | it points to the information provided by Redpanda Cloud. 44 | 45 | ## Go to Redpanda Cloud 46 | 47 | The Redpanda Cloud Overview page contains details on how to configure Prometheus to scrape 48 | metrics from the cluster. To view this information, go to **Overview > How to connect > Prometheus**. 
49 | You should see something like the following: 50 | 51 | ![Prometheus Endpoint Info](images/prometheus-info-screen.png) 52 | 53 | Click on the icon to the right to copy the contents of **Prometheus YAML** into your clipboard. 54 | 55 | ## Edit `config/prometheus.yml` 56 | 57 | Open up `config/prometheus.yml` and replace everything under the `-- REPLACE BELOW --` line with 58 | the clipboard contents that you just copied. Alternatively, you can simply replace the target 59 | hostname and password -- both pieces of information are also provided in the Redpanda Cloud UI. 60 | 61 | You should end up with a configuration that looks something like this: 62 | 63 | ``` 64 | global: 65 | scrape_interval: 10s 66 | evaluation_interval: 10s 67 | 68 | scrape_configs: 69 | # The job name is added as a label `job=` to any timeseries scraped from this config. 70 | # Replace the section below with what you've copied from `Prometheus YAML` 71 | # in the "Redpanda Cloud > Overview > How to connect" Screen 72 | # ---------------- REPLACE BELOW ---------------------- 73 | - job_name: redpandaCloud-clustername 74 | static_configs: 75 | - targets: 76 | - somehostname.cloud.redpanda.com 77 | metrics_path: /api/cloud/prometheus/public_metrics 78 | basic_auth: 79 | username: prometheus 80 | password: REDACTED 81 | scheme: https 82 | ``` 83 | 84 | ## Run Docker Compose 85 | 86 | Bring up Prometheus and Grafana using Docker Compose: 87 | 88 | ``` 89 | docker compose up -d 90 | ``` 91 | 92 | Point your browser to [localhost:3000](http://localhost:3000) to view the dashboards in Grafana. 93 | 94 | 95 | -------------------------------------------------------------------------------- /alert-generation/README.md: -------------------------------------------------------------------------------- 1 | # Alert Generation 2 | 3 | This is a sub-project of [Observability](https://github.com/redpanda-data/observability), which exists to help 4 | create alert definition files for Prometheus and Grafana. 
5 | 6 | ### I just want to add my own alerts - do I need this? 7 | 8 | Probably not. All you need to do is modify the [alert definition file](../demo/config/alert-definitions.yml) with your 9 | alerts and run `docker-compose up` - everything should work from there. If it doesn't, please open an issue! 10 | 11 | ### What is the format of the alert definition file? 12 | 13 | It's YAML. It's a custom structure, but it's very close to the format used by 14 | [Prometheus](../demo/config/prometheus/alert-rules.yml). The similarity is intentional, but it differs so that we can 15 | also generate Grafana alerts relatively easily. 16 | 17 | ### What does the project do? 18 | 19 | It reads a YAML [alert definition file](../demo/config/alert-definitions.yml) and writes 20 | out [alerts.yml](../demo/config/grafana/provisioning/alerting/alerts.yml) for Grafana and [alert-rules.yml](../demo/config/prometheus/alert-rules.yml) for Prometheus. It does that using a number of templates found in `alert-generation/templates`. 21 | 22 | ### How is it configured? 23 | 24 | Environment variables - see the command line example below. This helps when running the command in Docker / Kubernetes 25 | environments. 26 | 27 | # Usage 28 | 29 | ## Command line 30 | 31 | ```shell 32 | # This is the alert definitions input file for processing 33 | export ALERT_DEFINITIONS_YAML_FILE_LOCATION=path/to/alert-definitions.yml 34 | 35 | # This is where the Grafana alerts file should be written 36 | export GRAFANA_ALERTS_YAML_FILE_LOCATION=path/to/grafana/alerts.yml 37 | 38 | # This is where the Prometheus alerts file should be written 39 | export PROMETHEUS_ALERTS_YAML_FILE_LOCATION=path/to/prometheus/alert-rules.yml 40 | 41 | # This runs the alert generation process 42 | python3 generate.py 43 | ``` 44 | 45 | ## Docker Compose 46 | 47 | So that the demo works with a simple `docker-compose up`, the alert generation is already integrated into the 48 | [docker-compose.yml](../demo/docker-compose.yml). 
When docker compose is started, a pre-built image 49 | [pmwrp/alert-generation:0.1](https://hub.docker.com/r/pmwrp/alert-generation) is run, which performs the alert 50 | generation process. This writes out new alert definitions for both Prometheus and Grafana into the 51 | `../config` folder, which are used by their respective containers. 52 | 53 | ```yaml 54 | version: '3.7' 55 | services: 56 | # ... other services here 57 | generate-alert-configs: 58 | image: pmwrp/alert-generation:0.1 59 | environment: 60 | - "ALERT_DEFINITIONS_YAML_FILE_LOCATION=/config/alert-definitions.yml" 61 | - "GRAFANA_ALERTS_YAML_FILE_LOCATION=/config/grafana/provisioning/alerting/alerts.yml" 62 | - "PROMETHEUS_ALERTS_YAML_FILE_LOCATION=/config/prometheus/alert-rules.yml" 63 | volumes: 64 | - "./config:/config" 65 | # ... other services here 66 | ``` 67 | 68 | (This docker-compose.yaml snippet runs the alert generation process over the current [alert definition file](../demo/config/alert-definitions.yml) and writes out 69 | [alerts.yml](../demo/config/grafana/provisioning/alerting/alerts.yml) for Grafana and [alert-rules.yml](../demo/config/prometheus/alert-rules.yml) for Prometheus.) 
-------------------------------------------------------------------------------- /grafana-dashboards/src/dashboards/serverless-quota.jsonnet: -------------------------------------------------------------------------------- 1 | local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local common = import '../lib/common.libsonnet'; 3 | local sectionName = 'Resource Quota (Soft Limits)'; 4 | 5 | { 6 | row:: 7 | grafonnet.panel.row.new(sectionName) 8 | + grafonnet.panel.row.gridPos.withW(24) 9 | + grafonnet.panel.row.gridPos.withH(1), 10 | 11 | panels(datasource):: [ 12 | // Section header 13 | common.sectionHeaderPanel(sectionName), 14 | 15 | // Partition Quota 16 | common.gaugePanel( 17 | 'Partition Quota', 18 | [ 19 | common.prometheusQuery( 20 | '(sum(redpanda_cluster_partitions{redpanda_cloud_data_cluster_name=~"${data_cluster}"}) / sum(redpanda_serverless_resource_limit{resource="partitions", redpanda_cloud_data_cluster_name=~"${data_cluster}"})) * 100', 21 | 'Partition Usage' 22 | ), 23 | ] 24 | ) 25 | + grafonnet.panel.gauge.gridPos.withW(5) 26 | + grafonnet.panel.gauge.gridPos.withH(5), 27 | 28 | // Topic Quota 29 | common.gaugePanel( 30 | 'Topic Quota', 31 | [ 32 | common.prometheusQuery( 33 | '(sum (redpanda_cluster_topics{redpanda_cloud_data_cluster_name=~"${data_cluster}"}) / sum(redpanda_serverless_resource_limit{resource="topics", redpanda_cloud_data_cluster_name=~"${data_cluster}"})) * 100', 34 | 'Topic Usage' 35 | ), 36 | ] 37 | ) 38 | + grafonnet.panel.gauge.gridPos.withW(4) 39 | + grafonnet.panel.gauge.gridPos.withH(5), 40 | 41 | // Ingress Quota 42 | common.gaugePanel( 43 | 'Ingress Quota', 44 | [ 45 | common.prometheusQuery( 46 | '(sum (rate(redpanda_serverless_ingress_bytes_total{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[5m])) / sum (redpanda_serverless_resource_limit{resource="ingress", redpanda_cloud_data_cluster_name=~"${data_cluster}"})) * 100', 47 | 'Ingress Rate Usage' 48 | ), 49 | ] 50 
| ) 51 | + grafonnet.panel.gauge.gridPos.withW(4) 52 | + grafonnet.panel.gauge.gridPos.withH(5), 53 | 54 | // Egress Quota 55 | common.gaugePanel( 56 | 'Egress Quota', 57 | [ 58 | common.prometheusQuery( 59 | '(sum (rate(redpanda_serverless_egress_bytes_total{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[5m])) / sum (redpanda_serverless_resource_limit{resource="egress", redpanda_cloud_data_cluster_name=~"${data_cluster}"})) * 100', 60 | 'Egress Rate Usage' 61 | ), 62 | ] 63 | ) 64 | + grafonnet.panel.gauge.gridPos.withW(4) 65 | + grafonnet.panel.gauge.gridPos.withH(5), 66 | 67 | // Connection Quota 68 | common.gaugePanel( 69 | 'Connection Quota', 70 | [ 71 | common.prometheusQuery( 72 | '(sum (redpanda_serverless_connections_active{redpanda_cloud_data_cluster_name=~"${data_cluster}"}) / sum(redpanda_serverless_resource_limit{resource="connections", redpanda_cloud_data_cluster_name=~"${data_cluster}"})) * 100', 73 | 'Connection Limit' 74 | ), 75 | ] 76 | ) 77 | + grafonnet.panel.gauge.gridPos.withW(4) 78 | + grafonnet.panel.gauge.gridPos.withH(5), 79 | ], 80 | } 81 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Monitoring 2 | This repository contains examples for configuring monitoring of Redpanda using Prometheus and Grafana. 3 | 4 | > [!NOTE] 5 | > This version is designed to work with the `public_metrics` endpoint that was introduced in Redpanda 22.2 and is provided by Redpanda Cloud. If you are running an older version of Redpanda please use the [legacy version](../../tree/legacy_metrics) of these dashboards. 
6 | 7 | > [!IMPORTANT] 8 | > The latest version of the [Redpanda Ops Dashboard](grafana-dashboards/Redpanda-Ops-Dashboard.json) includes the newer Kafka handler latency metric, specifically `redpanda_kafka_handler_latency_seconds_bucket{handler="produce"}` and `redpanda_kafka_handler_latency_seconds_bucket{handler="fetch"}`. This metric is only available from Redpanda versions `23.1.19+` and `23.2.12+`. 9 | 10 | ## Grafana Dashboards 11 | The [grafana-dashboards](grafana-dashboards) folder contains sample dashboards that can be imported directly into a 12 | Grafana or Grafana Cloud instance. 13 | 14 | The following dashboards are provided as examples: 15 | 16 | - [Redpanda Ops Dashboard](grafana-dashboards/Redpanda-Ops-Dashboard.json) - Provides an overview of KPIs for a Redpanda 17 | cluster with health indicators. This is suitable for ops or SRE to monitor on a daily or continuous basis. 18 | - [Kafka Topic Metrics](grafana-dashboards/Kafka-Topic-Metrics.json) - Provides throughput, read/write rates, and 19 | on-disk sizes of each/all topics. 20 | - [Kafka Consumer Offsets](grafana-dashboards/Kafka-Consumer-Offsets.json) - Metrics and KPIs that provide details 21 | of topic consumers and how far they are lagging behind the end of the log. 22 | - [Redpanda Dashboard](grafana-dashboards/Redpanda-Default-Dashboard.json) - The default dashboard that would be generated by 23 | the [`rpk generate grafana-dashboard` command]( 24 | https://docs.redpanda.com/docs/platform/reference/rpk/rpk-generate/rpk-generate-grafana-dashboard/). 25 | _Note: This is considered deprecated in favour of the above dashboards._ 26 | - [Kafka Consumer Metrics](grafana-dashboards/Kafka-Consumer-Metrics.json) - Allows for monitoring of Java Kafka 27 | consumers, using the [Prometheus JMX Exporter](https://github.com/prometheus/jmx_exporter) and the 28 | [Kafka Sample Configuration](https://github.com/prometheus/jmx_exporter/blob/master/example_configs/kafka-2_0_0.yml). 
29 | 30 | ## Screenshots 31 | ![](docs/images/Ops%20Dashboard.png) 32 | ![](docs/images/Consumer%20Offsets.png) 33 | ![](docs/images/Topic%20Metrics.png) 34 | ![](docs/images/Kafka%20Consume1.png) 35 | ![](docs/images/Kafka%20Consume2.png) 36 | ![](docs/images/Kafka%20Produce1.png) 37 | ![](docs/images/Kafka%20Produce2.png) 38 | ![](docs/images/Kafka%20Produce3.png) 39 | 40 | # Alerting 41 | This repository contains examples for configuring alerting using [Prometheus](https://prometheus.io/) (via [Alert Manager](https://prometheus.io/docs/alerting/latest/alertmanager/)) or [Grafana](https://grafana.com/). 42 | 43 | Grafana can use Alert Manager as a source of alerts, or it can manage alerts independently of Prometheus. 44 | 45 | 46 | ## Screenshots 47 | ![](docs/images/Grafana%20Alerting.png) 48 | 49 | # Examples 50 | 51 | ### Sandbox Environment 52 | The [demo](demo) folder includes a full dockerized sandbox, that will spin up a three-node Redpanda cluster, an instance 53 | of Redpanda Console, a Prometheus instance and a Grafana instance with Prometheus and Grafana configured with the 54 | dashboards in the [grafana-dashboards](grafana-dashboards) folder. 
55 | 56 | ### Redpanda Cloud 57 | The [cloud](cloud) folder has a Docker Compose file that will bring up Prometheus and Grafana, with instructions on 58 | how to scrape the Prometheus endpoint exposed by your Redpanda Cloud cluster 59 | 60 | -------------------------------------------------------------------------------- /demo/config/jmx_exporter/kafka.yml: -------------------------------------------------------------------------------- 1 | lowercaseOutputName: true 2 | 3 | rules: 4 | # Special cases and very specific rules 5 | - pattern : kafka.server<>Value 6 | name: kafka_server_$1_$2 7 | type: GAUGE 8 | labels: 9 | clientId: "$3" 10 | topic: "$4" 11 | partition: "$5" 12 | - pattern : kafka.server<>Value 13 | name: kafka_server_$1_$2 14 | type: GAUGE 15 | labels: 16 | clientId: "$3" 17 | broker: "$4:$5" 18 | - pattern : kafka.coordinator.(\w+)<>Value 19 | name: kafka_coordinator_$1_$2_$3 20 | type: GAUGE 21 | 22 | # Generic per-second counters with 0-2 key/value pairs 23 | - pattern: kafka.(\w+)<>Count 24 | name: kafka_$1_$2_$3_total 25 | type: COUNTER 26 | labels: 27 | "$4": "$5" 28 | "$6": "$7" 29 | - pattern: kafka.(\w+)<>Count 30 | name: kafka_$1_$2_$3_total 31 | type: COUNTER 32 | labels: 33 | "$4": "$5" 34 | - pattern: kafka.(\w+)<>Count 35 | name: kafka_$1_$2_$3_total 36 | type: COUNTER 37 | 38 | # Quota specific rules 39 | - pattern: kafka.server<>([a-z-]+) 40 | name: kafka_server_quota_$4 41 | type: GAUGE 42 | labels: 43 | resource: "$1" 44 | user: "$2" 45 | clientId: "$3" 46 | - pattern: kafka.server<>([a-z-]+) 47 | name: kafka_server_quota_$3 48 | type: GAUGE 49 | labels: 50 | resource: "$1" 51 | clientId: "$2" 52 | - pattern: kafka.server<>([a-z-]+) 53 | name: kafka_server_quota_$3 54 | type: GAUGE 55 | labels: 56 | resource: "$1" 57 | user: "$2" 58 | 59 | # Generic gauges with 0-2 key/value pairs 60 | - pattern: kafka.(\w+)<>Value 61 | name: kafka_$1_$2_$3 62 | type: GAUGE 63 | labels: 64 | "$4": "$5" 65 | "$6": "$7" 66 | - pattern: 
kafka.(\w+)<>Value 67 | name: kafka_$1_$2_$3 68 | type: GAUGE 69 | labels: 70 | "$4": "$5" 71 | - pattern: kafka.(\w+)<>Value 72 | name: kafka_$1_$2_$3 73 | type: GAUGE 74 | 75 | # Emulate Prometheus 'Summary' metrics for the exported 'Histogram's. 76 | # 77 | # Note that these are missing the '_sum' metric! 78 | - pattern: kafka.(\w+)<>Count 79 | name: kafka_$1_$2_$3_count 80 | type: COUNTER 81 | labels: 82 | "$4": "$5" 83 | "$6": "$7" 84 | - pattern: kafka.(\w+)<>(\d+)thPercentile 85 | name: kafka_$1_$2_$3 86 | type: GAUGE 87 | labels: 88 | "$4": "$5" 89 | "$6": "$7" 90 | quantile: "0.$8" 91 | - pattern: kafka.(\w+)<>Count 92 | name: kafka_$1_$2_$3_count 93 | type: COUNTER 94 | labels: 95 | "$4": "$5" 96 | - pattern: kafka.(\w+)<>(\d+)thPercentile 97 | name: kafka_$1_$2_$3 98 | type: GAUGE 99 | labels: 100 | "$4": "$5" 101 | quantile: "0.$6" 102 | - pattern: kafka.(\w+)<>Count 103 | name: kafka_$1_$2_$3_count 104 | type: COUNTER 105 | - pattern: kafka.(\w+)<>(\d+)thPercentile 106 | name: kafka_$1_$2_$3 107 | type: GAUGE 108 | labels: 109 | quantile: "0.$4" 110 | 111 | # Generic gauges for MeanRate Percent 112 | # Ex) kafka.server<>MeanRate 113 | - pattern: kafka.(\w+)<>MeanRate 114 | name: kafka_$1_$2_$3_percent 115 | type: GAUGE 116 | - pattern: kafka.(\w+)<>Value 117 | name: kafka_$1_$2_$3_percent 118 | type: GAUGE 119 | - pattern: kafka.(\w+)<>Value 120 | name: kafka_$1_$2_$3_percent 121 | type: GAUGE 122 | labels: 123 | "$4": "$5" 124 | 125 | -------------------------------------------------------------------------------- /grafana-dashboards/src/dashboards/serverless-rpcn.jsonnet: -------------------------------------------------------------------------------- 1 | local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local common = import '../lib/common.libsonnet'; 3 | local sectionName = 'Redpanda Connect'; 4 | 5 | { 6 | row:: 7 | grafonnet.panel.row.new(sectionName) 8 | + grafonnet.panel.row.gridPos.withW(24) 9 | + 
grafonnet.panel.row.gridPos.withH(1), 10 | 11 | panels(datasource):: [ 12 | // Section header 13 | common.sectionHeaderPanel(sectionName), 14 | 15 | // Running Pipelines 16 | common.statPanel( 17 | 'Running Pipelines', 18 | [ 19 | common.prometheusQuery( 20 | 'count(sum by (pipeline_id) (input_connection_up{redpanda_cloud_data_cluster_name=~"${data_cluster}"}))', 21 | 'Running Pipelines' 22 | ), 23 | ] 24 | ) 25 | + grafonnet.panel.stat.gridPos.withW(3) 26 | + grafonnet.panel.stat.gridPos.withH(7) 27 | + grafonnet.panel.stat.standardOptions.withUnit('none'), 28 | 29 | // Input Throughput 30 | common.timeseriesPanel( 31 | 'Input Throughput', 32 | [ 33 | common.prometheusQuery( 34 | 'sum by (pipeline_id) (rate(input_received{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))', 35 | '{{pipeline_id}}' 36 | ), 37 | ] 38 | ) 39 | + grafonnet.panel.timeSeries.gridPos.withW(10) 40 | + grafonnet.panel.timeSeries.gridPos.withH(7) 41 | + grafonnet.panel.timeSeries.standardOptions.withUnit('recps') 42 | + grafonnet.panel.timeSeries.panelOptions.withDescription('Message rate for data flowing into Redpanda Connect pipelines'), 43 | 44 | // Output Throughput 45 | common.timeseriesPanel( 46 | 'Output Throughput', 47 | [ 48 | common.prometheusQuery( 49 | 'sum by (pipeline_id) (rate(output_sent{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))', 50 | '{{pipeline_id}}' 51 | ), 52 | ] 53 | ) 54 | + grafonnet.panel.timeSeries.gridPos.withW(11) 55 | + grafonnet.panel.timeSeries.gridPos.withH(7) 56 | + grafonnet.panel.timeSeries.standardOptions.withUnit('recps') 57 | + grafonnet.panel.timeSeries.standardOptions.color.withMode('continuous-BlPu') 58 | + grafonnet.panel.timeSeries.panelOptions.withDescription('Message rate for data flowing out of Redpanda Connect pipelines'), 59 | 60 | {}, // Line break 61 | 62 | // Input Connections Down 63 | common.timeseriesPanel( 64 | 'Input Connections Down', 65 | [ 66 | common.prometheusQuery( 67 | 
'sum(input_connection_failed{redpanda_cloud_data_cluster_name=~"${data_cluster}"}) OR sum(input_connection_lost{redpanda_cloud_data_cluster_name=~"${data_cluster}"})', 68 | 'Total Input Down' 69 | ), 70 | ], 71 | { fillOpacity: 0 } 72 | ) 73 | + grafonnet.panel.timeSeries.gridPos.withW(6) 74 | + grafonnet.panel.timeSeries.gridPos.withH(5) 75 | + grafonnet.panel.timeSeries.standardOptions.withUnit('none'), 76 | 77 | // Output Connections Down 78 | common.timeseriesPanel( 79 | 'Output Connections Down', 80 | [ 81 | common.prometheusQuery( 82 | 'sum(output_connection_failed{redpanda_cloud_data_cluster_name=~"${data_cluster}"}) OR sum(output_connection_lost{redpanda_cloud_data_cluster_name=~"${data_cluster}"})', 83 | 'Total Output Down' 84 | ), 85 | ], 86 | { fillOpacity: 0 } 87 | ) 88 | + grafonnet.panel.timeSeries.gridPos.withW(6) 89 | + grafonnet.panel.timeSeries.gridPos.withH(5) 90 | + grafonnet.panel.timeSeries.standardOptions.withUnit('none'), 91 | 92 | // Processor Errors 93 | common.timeseriesPanel( 94 | 'Processor Errors', 95 | [ 96 | common.prometheusQuery( 97 | 'sum(processor_error{redpanda_cloud_data_cluster_name=~"${data_cluster}"})', 98 | 'Total Processor Error' 99 | ), 100 | ], 101 | { fillOpacity: 0 } 102 | ) 103 | + grafonnet.panel.timeSeries.gridPos.withW(6) 104 | + grafonnet.panel.timeSeries.gridPos.withH(5) 105 | + grafonnet.panel.timeSeries.standardOptions.withUnit('none'), 106 | 107 | // Output Errors 108 | common.timeseriesPanel( 109 | 'Output Errors', 110 | [ 111 | common.prometheusQuery( 112 | 'sum(output_error{redpanda_cloud_data_cluster_name=~"${data_cluster}"})', 113 | 'Total Output Errors' 114 | ), 115 | ], 116 | { fillOpacity: 0 } 117 | ) 118 | + grafonnet.panel.timeSeries.gridPos.withW(6) 119 | + grafonnet.panel.timeSeries.gridPos.withH(5) 120 | + grafonnet.panel.timeSeries.standardOptions.withUnit('none'), 121 | 122 | {}, // Line break 123 | 124 | // Input P90 Latency 125 | common.timeseriesPanel( 126 | 'Input P90 Latency (ms)', 127 | [ 
128 | common.prometheusQuery( 129 | 'avg by (pipeline_id) (input_latency_ns{redpanda_cloud_data_cluster_name=~"${data_cluster}", quantile="0.9"}) / 1000000', 130 | '{{pipeline_id}}' 131 | ), 132 | common.prometheusQuery( 133 | 'histogram_quantile(0.95, sum by (le) (rate(input_latency_ns_bucket{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))) / 1000000', 134 | 'Input P95 Latency' 135 | ), 136 | ] 137 | ) 138 | + grafonnet.panel.timeSeries.gridPos.withW(12) 139 | + grafonnet.panel.timeSeries.gridPos.withH(8) 140 | + grafonnet.panel.timeSeries.standardOptions.withUnit('ms') 141 | + grafonnet.panel.timeSeries.panelOptions.withDescription('P95 latency for messages traveling through the connector input stages'), 142 | 143 | // Output P90 Latency 144 | common.timeseriesPanel( 145 | 'Output P90 Latency (ms)', 146 | [ 147 | common.prometheusQuery( 148 | 'avg by (pipeline_id) (output_latency_ns{redpanda_cloud_data_cluster_name=~"${data_cluster}", quantile="0.9"}) / 1000000', 149 | '{{pipeline_id}}' 150 | ), 151 | common.prometheusQuery( 152 | 'histogram_quantile(0.95, sum by (le) (rate(output_latency_ns_bucket{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))) / 1000000', 153 | 'Output P95 Latency' 154 | ), 155 | ] 156 | ) 157 | + grafonnet.panel.timeSeries.gridPos.withW(12) 158 | + grafonnet.panel.timeSeries.gridPos.withH(8) 159 | + grafonnet.panel.timeSeries.standardOptions.withUnit('ms') 160 | + grafonnet.panel.timeSeries.standardOptions.color.withMode('continuous-BlPu') 161 | + grafonnet.panel.timeSeries.panelOptions.withDescription('P95 latency for messages traveling through the connector output stages'), 162 | ], 163 | } 164 | -------------------------------------------------------------------------------- /demo/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '3.7' 2 | services: 3 | mailhog: 4 | image: mailhog/mailhog 5 | container_name: mailhog 6 | 
hostname: mailhog 7 | ports: 8 | - 8025:8025 9 | generate-alert-configs: 10 | image: pmwrp/alert-generation:0.1 11 | environment: 12 | - "ALERT_DEFINITIONS_YAML_FILE_LOCATION=/config/alert-definitions.yml" 13 | - "GRAFANA_ALERTS_YAML_FILE_LOCATION=/config/grafana/provisioning/alerting/alerts.yml" 14 | - "PROMETHEUS_ALERTS_YAML_FILE_LOCATION=/config/prometheus/alert-rules.yml" 15 | volumes: 16 | - "./config:/config" 17 | redpanda0: 18 | image: docker.redpanda.com/redpandadata/redpanda:latest 19 | container_name: redpanda-0 20 | command: 21 | - redpanda 22 | - start 23 | - --smp 24 | - '1' 25 | - --reserve-memory 26 | - 0M 27 | - --overprovisioned 28 | - --node-id 29 | - '0' 30 | - --kafka-addr 31 | - PLAINTEXT://0.0.0.0:29092,OUTSIDE://0.0.0.0:9092 32 | - --advertise-kafka-addr 33 | - PLAINTEXT://redpanda-0:29092,OUTSIDE://localhost:9092 34 | - --pandaproxy-addr 35 | - PLAINTEXT://0.0.0.0:28082,OUTSIDE://0.0.0.0:8082 36 | - --advertise-pandaproxy-addr 37 | - PLAINTEXT://redpanda-0:28082,OUTSIDE://localhost:8082 38 | - --rpc-addr 0.0.0.0:33145 39 | - --advertise-rpc-addr redpanda-0:33145 40 | ports: 41 | - 8081:8081 42 | - 8082:8082 43 | - 9092:9092 44 | - 9642:9644 45 | - 28082:28082 46 | redpanda1: 47 | image: docker.redpanda.com/redpandadata/redpanda:latest 48 | container_name: redpanda-1 49 | command: 50 | - redpanda 51 | - start 52 | - --smp 53 | - '1' 54 | - --reserve-memory 55 | - 0M 56 | - --overprovisioned 57 | - --node-id 58 | - '1' 59 | - --seeds 60 | - redpanda-0:33145 61 | - --kafka-addr 62 | - PLAINTEXT://0.0.0.0:29093,OUTSIDE://0.0.0.0:9093 63 | - --advertise-kafka-addr 64 | - PLAINTEXT://redpanda-1:29093,OUTSIDE://localhost:9093 65 | - --pandaproxy-addr 66 | - PLAINTEXT://0.0.0.0:28083,OUTSIDE://0.0.0.0:8083 67 | - --advertise-pandaproxy-addr 68 | - PLAINTEXT://redpanda-1:28083,OUTSIDE://localhost:8083 69 | - --rpc-addr 0.0.0.0:33146 70 | - --advertise-rpc-addr redpanda-1:33146 71 | ports: 72 | - 8083:8083 73 | - 9093:9093 74 | - 9643:9644 75 | 
redpanda2: 76 | image: docker.redpanda.com/redpandadata/redpanda:latest 77 | container_name: redpanda-2 78 | command: 79 | - redpanda 80 | - start 81 | - --smp 82 | - '1' 83 | - --reserve-memory 84 | - 0M 85 | - --overprovisioned 86 | - --node-id 87 | - '2' 88 | - --seeds 89 | - redpanda-0:33145 90 | - --kafka-addr 91 | - PLAINTEXT://0.0.0.0:29094,OUTSIDE://0.0.0.0:9094 92 | - --advertise-kafka-addr 93 | - PLAINTEXT://redpanda-2:29094,OUTSIDE://localhost:9094 94 | - --pandaproxy-addr 95 | - PLAINTEXT://0.0.0.0:28084,OUTSIDE://0.0.0.0:8084 96 | - --advertise-pandaproxy-addr 97 | - PLAINTEXT://redpanda-2:28084,OUTSIDE://localhost:8084 98 | - --rpc-addr 0.0.0.0:33147 99 | - --advertise-rpc-addr redpanda-2:33147 100 | ports: 101 | - 8084:8084 102 | - 9094:9094 103 | - 9644:9644 104 | console: 105 | image: docker.redpanda.com/redpandadata/console:latest 106 | restart: on-failure 107 | entrypoint: /bin/sh 108 | command: -c "echo \"$$CONSOLE_CONFIG_FILE\" > /tmp/config.yml; /app/console" 109 | environment: 110 | CONFIG_FILEPATH: /tmp/config.yml 111 | CONSOLE_CONFIG_FILE: | 112 | kafka: 113 | brokers: ["redpanda-0:29092","redpanda-1:29093","redpanda-2:29094"] 114 | schemaRegistry: 115 | enabled: true 116 | urls: ["http://redpanda-0:8081"] 117 | redpanda: 118 | adminApi: 119 | enabled: true 120 | urls: ["http://redpanda-0:9644"] 121 | connect: 122 | enabled: true 123 | clusters: 124 | - name: datagen 125 | url: http://connect:8085 126 | ports: 127 | - "8080:8080" 128 | depends_on: 129 | - redpanda0 130 | - redpanda1 131 | - redpanda2 132 | grafana: 133 | image: grafana/grafana 134 | container_name: grafana 135 | environment: 136 | - "GF_AUTH_ANONYMOUS_ENABLED=true" 137 | - "GF_AUTH_ANONYMOUS_ORG_ROLE=Admin" 138 | - "GF_AUTH_ANONYMOUS_HIDE_VERSION=true" 139 | volumes: 140 | # Mount provisioning configuration 141 | - "./config/grafana/provisioning:/etc/grafana/provisioning" 142 | # Mount dashboards 143 | - "../grafana-dashboards:/var/lib/grafana/dashboards" 144 | ports: [ 
"3000:3000" ] 145 | prometheus: 146 | image: prom/prometheus 147 | container_name: prometheus 148 | # Mount prometheus configuration 149 | volumes: [ "./config/prometheus:/etc/prometheus" ] 150 | ports: 151 | - "9090:9090" 152 | alertmanager: 153 | image: prom/alertmanager 154 | container_name: alertmanager 155 | hostname: alertmanager 156 | ports: 157 | - "9099:9093" 158 | volumes: 159 | - "./config/alertmanager:/config" 160 | command: --config.file=/config/alertmanager.yml --log.level=info 161 | connect: 162 | image: cnfldemos/cp-server-connect-datagen:0.5.0-6.2.0 163 | hostname: connect 164 | container_name: connect 165 | depends_on: 166 | - redpanda0 167 | - redpanda1 168 | - redpanda2 169 | ports: 170 | - "8085:8085" 171 | - "9010:9010" 172 | environment: 173 | CONNECT_BOOTSTRAP_SERVERS: 'redpanda-0:29092,redpanda-1:29093,redpanda-2:29094' 174 | CONNECT_REST_ADVERTISED_HOST_NAME: connect 175 | CONNECT_REST_PORT: 8085 176 | CONNECT_GROUP_ID: compose-connect-group 177 | CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs 178 | CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1 179 | CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000 180 | CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets 181 | CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1 182 | CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status 183 | CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1 184 | CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter 185 | CONNECT_VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter 186 | CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://redpanda-0:8081 187 | CONNECT_PLUGIN_PATH: "/usr/share/java,/usr/share/confluent-hub-components" 188 | CONNECT_LOG4J_LOGGERS: org.apache.zookeeper=ERROR,org.I0Itec.zkclient=ERROR,org.reflections=ERROR 189 | CLASSPATH: "/usr/share/java/cp-base-new/jmx_prometheus_javaagent-0.14.0.jar" 190 | KAFKA_OPTS: "-javaagent:/usr/share/java/cp-base-new/jmx_prometheus_javaagent-0.14.0.jar=9010:/etc/kafka-connect/kafka.yml" 191 | volumes: [ 
"./config/jmx_exporter/kafka.yml:/etc/kafka-connect/kafka.yml" ] 192 | 193 | # OwlShop is a service that simulates an ecommerce shop that has producers and consumers. It will setup 194 | # a few Kafka topics, produces to them and also consumes the data again via consumer groups. 195 | owl-shop: 196 | #image: quay.io/cloudhut/owl-shop:v1.2.0 197 | image: quay.io/cloudhut/owl-shop:latest 198 | environment: 199 | - SHOP_KAFKA_BROKERS=redpanda-0:29092,redpanda-1:29093,redpanda-2:29094 200 | - SHOP_KAFKA_TOPICREPLICATIONFACTOR=1 201 | - SHOP_TRAFFIC_INTERVAL_RATE=1 202 | - SHOP_TRAFFIC_INTERVAL_DURATION=0.1s 203 | depends_on: 204 | - redpanda0 205 | - redpanda1 206 | - redpanda2 207 | 208 | -------------------------------------------------------------------------------- /grafana-dashboards/src/dashboards/serverless-overview.jsonnet: -------------------------------------------------------------------------------- 1 | local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local common = import '../lib/common.libsonnet'; 3 | local sectionName = 'Redpanda Serverless Overview'; 4 | 5 | { 6 | row:: 7 | grafonnet.panel.row.new(sectionName) 8 | + grafonnet.panel.row.gridPos.withW(24) 9 | + grafonnet.panel.row.gridPos.withH(1), 10 | 11 | panels(datasource):: [ 12 | // Section header 13 | common.sectionHeaderPanel(sectionName), 14 | 15 | // Redpanda Version 16 | common.statPanel( 17 | 'Redpanda Version', 18 | [ 19 | common.prometheusQuery( 20 | 'topk(1, last_over_time(redpanda_application_build{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[5m]))', 21 | '__auto' 22 | ), 23 | ], 24 | { textMode: 'name', graphMode: 'none' } 25 | ) 26 | + grafonnet.panel.stat.gridPos.withW(6) 27 | + grafonnet.panel.stat.gridPos.withH(3) 28 | + grafonnet.panel.stat.options.withWideLayout(true) 29 | + grafonnet.panel.stat.standardOptions.withDisplayName('${__field.labels.redpanda_version}'), 30 | 31 | // Throughput 32 | common.timeseriesPanel( 33 | 
'Throughput', 34 | [ 35 | common.prometheusQuery( 36 | 'sum by (listener) (rate(redpanda_serverless_ingress_bytes_total{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))', 37 | 'Ingress ({{listener}})' 38 | ), 39 | common.prometheusQuery( 40 | 'sum by (listener) (rate(redpanda_serverless_egress_bytes_total{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))', 41 | 'Egress ({{listener}})' 42 | ), 43 | ] 44 | ) 45 | + grafonnet.panel.timeSeries.gridPos.withW(9) 46 | + grafonnet.panel.timeSeries.gridPos.withH(8) 47 | + grafonnet.panel.timeSeries.standardOptions.withUnit('Bps') 48 | + grafonnet.panel.timeSeries.panelOptions.withDescription('Data transferred from the clients to the Serverless Cluster (Ingress) and from the Serverless Cluster to the clients (Egress)'), 49 | 50 | // Records 51 | common.timeseriesPanel( 52 | 'Records', 53 | [ 54 | common.prometheusQuery( 55 | 'sum(rate(redpanda_kafka_records_produced_total{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))', 56 | 'Produced' 57 | ), 58 | common.prometheusQuery( 59 | 'sum(rate(redpanda_kafka_records_fetched_total{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))', 60 | 'Fetched' 61 | ), 62 | ] 63 | ) 64 | + grafonnet.panel.timeSeries.gridPos.withW(9) 65 | + grafonnet.panel.timeSeries.gridPos.withH(8) 66 | + grafonnet.panel.timeSeries.standardOptions.withUnit('recps') 67 | + grafonnet.panel.timeSeries.panelOptions.withDescription('Number of records produced or fetched from the Serverless Cluster'), 68 | 69 | {}, // Line break 70 | 71 | // Topics 72 | common.statPanel( 73 | 'Topics', 74 | [ 75 | common.prometheusQuery( 76 | 'sum(redpanda_cluster_topics{redpanda_cloud_data_cluster_name=~"${data_cluster}"})', 77 | 'Topics count' 78 | ), 79 | ] 80 | ) 81 | + grafonnet.panel.stat.gridPos.withW(3) 82 | + grafonnet.panel.stat.gridPos.withH(5) 83 | + grafonnet.panel.stat.standardOptions.withUnit('none'), 84 | 85 | // Partitions 
86 | common.statPanel( 87 | 'Partitions', 88 | [ 89 | common.prometheusQuery( 90 | 'sum(redpanda_cluster_partitions{redpanda_cloud_data_cluster_name=~"${data_cluster}"})', 91 | 'Partition count' 92 | ), 93 | ], 94 | { colorMode: 'value', graphMode: 'area' } 95 | ) 96 | + grafonnet.panel.stat.gridPos.withW(3) 97 | + grafonnet.panel.stat.gridPos.withH(5) 98 | + grafonnet.panel.stat.standardOptions.withUnit('none') 99 | + grafonnet.panel.stat.standardOptions.thresholds.withSteps([ 100 | { color: 'green', value: 0 }, 101 | { color: 'yellow', value: 4500 }, 102 | { color: 'red', value: 5000 }, 103 | ]), 104 | 105 | {}, // Line break 106 | 107 | // Consumers per Group 108 | common.timeseriesPanel( 109 | 'Consumers per Group', 110 | [ 111 | common.prometheusQuery( 112 | 'sum(redpanda_kafka_consumer_group_consumers{redpanda_cloud_data_cluster_name=~"${data_cluster}"}) by (redpanda_group)', 113 | '{{redpanda_group}}' 114 | ), 115 | ] 116 | ) 117 | + grafonnet.panel.timeSeries.gridPos.withW(7) 118 | + grafonnet.panel.timeSeries.gridPos.withH(7) 119 | + grafonnet.panel.timeSeries.standardOptions.withUnit('none'), 120 | 121 | // Lag Sum per Consumer Group 122 | common.timeseriesPanel( 123 | 'Lag Sum per Consumer Group', 124 | [ 125 | common.prometheusQuery( 126 | 'sum(redpanda_kafka_consumer_group_lag_sum{redpanda_cloud_data_cluster_name=~"${data_cluster}"}) by (redpanda_group)', 127 | '{{redpanda_group}}' 128 | ), 129 | ], 130 | { fillOpacity: 27 } 131 | ) 132 | + grafonnet.panel.timeSeries.gridPos.withW(8) 133 | + grafonnet.panel.timeSeries.gridPos.withH(7) 134 | + grafonnet.panel.timeSeries.standardOptions.withUnit('short') 135 | + grafonnet.panel.timeSeries.standardOptions.color.withMode('continuous-reds'), 136 | 137 | // Most Active Topics 138 | common.barChartPanel( 139 | 'Most active topics', 140 | [ 141 | common.prometheusQuery( 142 | 'topk(6, sum by (redpanda_topic) (\n 
rate(redpanda_kafka_records_produced_total{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]) + \n rate(redpanda_kafka_records_fetched_total{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval])\n))', 143 | '__auto', 144 | { instant: true, range: false, exemplar: false, format: 'table' } 145 | ), 146 | ] 147 | ) 148 | + grafonnet.panel.barChart.gridPos.withW(9) 149 | + grafonnet.panel.barChart.gridPos.withH(7) 150 | + grafonnet.panel.barChart.standardOptions.withUnit('ops') 151 | + grafonnet.panel.barChart.standardOptions.color.withMode('continuous-BlPu') 152 | + grafonnet.panel.barChart.options.withShowValue('never') 153 | + grafonnet.panel.barChart.panelOptions.withDescription('Total produce and fetch activities') 154 | + grafonnet.panel.barChart.queryOptions.withTransformations([ 155 | grafonnet.panel.barChart.transformation.withId('filterFieldsByName') 156 | + grafonnet.panel.barChart.transformation.withOptions({ 157 | include: { 158 | names: ['redpanda_topic', 'Value'], 159 | }, 160 | }), 161 | ]), 162 | 163 | {}, // Line break 164 | 165 | // Active Connections 166 | common.timeseriesPanel( 167 | 'Active Connections', 168 | [ 169 | common.prometheusQuery( 170 | 'sum by (listener) (redpanda_serverless_connections_active{redpanda_cloud_data_cluster_name=~"${data_cluster}"})', 171 | 'Active Connections ({{listener}})' 172 | ), 173 | ], 174 | { fillOpacity: 0 } 175 | ) 176 | + grafonnet.panel.timeSeries.gridPos.withW(7) 177 | + grafonnet.panel.timeSeries.gridPos.withH(6) 178 | + grafonnet.panel.timeSeries.standardOptions.withUnit('none'), 179 | 180 | // New Connections Rate 181 | common.timeseriesPanel( 182 | 'New Connections Rate', 183 | [ 184 | common.prometheusQuery( 185 | 'sum by (listener) (rate(redpanda_serverless_connections_created_total{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))', 186 | 'New Connections Rate ({{listener}})' 187 | ), 188 | ], 189 | { fillOpacity: 0 } 190 | ) 191 | + 
grafonnet.panel.timeSeries.gridPos.withW(8) 192 | + grafonnet.panel.timeSeries.gridPos.withH(6) 193 | + grafonnet.panel.timeSeries.standardOptions.withUnit('reqps'), 194 | 195 | // Avg Connection Duration 196 | common.timeseriesPanel( 197 | 'Avg Connection Duration', 198 | [ 199 | common.prometheusQuery( 200 | 'sum by (listener)(rate(redpanda_serverless_connections_duration_seconds_sum{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval])) / sum by (listener)(rate(redpanda_serverless_connections_duration_seconds_count{redpanda_cloud_data_cluster_name=~"${data_cluster}"}[$__rate_interval]))', 201 | 'Avg Duration ({{listener}})' 202 | ), 203 | ], 204 | { fillOpacity: 20 } 205 | ) 206 | + grafonnet.panel.timeSeries.gridPos.withW(9) 207 | + grafonnet.panel.timeSeries.gridPos.withH(6) 208 | + grafonnet.panel.timeSeries.standardOptions.withUnit('s') 209 | + grafonnet.panel.timeSeries.standardOptions.withDecimals(2), 210 | ], 211 | } 212 | -------------------------------------------------------------------------------- /grafana-dashboards/src/lib/common.libsonnet: -------------------------------------------------------------------------------- 1 | // Common utilities and helpers for dashboards 2 | local grafonnet = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | 4 | { 5 | // Dashboard requirements (Grafana version and panel types) 6 | dashboardRequirements():: [ 7 | { 8 | type: 'panel', 9 | id: 'barchart', 10 | name: 'Bar chart', 11 | version: '', 12 | }, 13 | { 14 | type: 'panel', 15 | id: 'gauge', 16 | name: 'Gauge', 17 | version: '', 18 | }, 19 | { 20 | type: 'grafana', 21 | id: 'grafana', 22 | name: 'Grafana', 23 | version: '9.3.6', 24 | }, 25 | { 26 | type: 'datasource', 27 | id: 'prometheus', 28 | name: 'Prometheus', 29 | version: '1.0.0', 30 | }, 31 | { 32 | type: 'panel', 33 | id: 'stat', 34 | name: 'Stat', 35 | version: '', 36 | }, 37 | { 38 | type: 'panel', 39 | id: 'timeseries', 40 | name: 'Time series', 41 | 
version: '', 42 | }, 43 | ], 44 | 45 | // Common datasource variable for Prometheus 46 | datasourceVariable():: 47 | grafonnet.dashboard.variable.datasource.new('DS_PROMETHEUS', 'prometheus') 48 | + grafonnet.dashboard.variable.datasource.generalOptions.withLabel('Data Source'), 49 | 50 | // Common data cluster variable 51 | dataClusterVariable(datasource):: 52 | grafonnet.dashboard.variable.query.new('data_cluster') 53 | + grafonnet.dashboard.variable.query.withDatasource(datasource.type, datasource.uid) 54 | + grafonnet.dashboard.variable.query.queryTypes.withLabelValues('redpanda_cloud_data_cluster_name') 55 | + grafonnet.dashboard.variable.query.generalOptions.withLabel('Data cluster') 56 | + grafonnet.dashboard.variable.query.selectionOptions.withMulti(true) 57 | + grafonnet.dashboard.variable.query.refresh.onLoad(), 58 | 59 | // Create a timeseries panel with common settings 60 | timeseriesPanel(title, targets, options={}):: 61 | local defaults = { 62 | transparent: true, 63 | fillOpacity: 27, 64 | lineWidth: 1, 65 | showPoints: 'auto', 66 | legendDisplayMode: 'list', 67 | legendPlacement: 'bottom', 68 | tooltipMode: 'single', 69 | }; 70 | local settings = defaults + options; 71 | 72 | grafonnet.panel.timeSeries.new(title) 73 | + grafonnet.panel.timeSeries.queryOptions.withTargets(targets) 74 | + grafonnet.panel.timeSeries.options.legend.withDisplayMode(settings.legendDisplayMode) 75 | + grafonnet.panel.timeSeries.options.legend.withPlacement(settings.legendPlacement) 76 | + grafonnet.panel.timeSeries.options.legend.withShowLegend(true) 77 | + grafonnet.panel.timeSeries.options.tooltip.withMode(settings.tooltipMode) 78 | + grafonnet.panel.timeSeries.standardOptions.withMin(0) 79 | + grafonnet.panel.timeSeries.fieldConfig.defaults.custom.withFillOpacity(settings.fillOpacity) 80 | + grafonnet.panel.timeSeries.fieldConfig.defaults.custom.withLineWidth(settings.lineWidth) 81 | + 
grafonnet.panel.timeSeries.fieldConfig.defaults.custom.withShowPoints(settings.showPoints) 82 | + grafonnet.panel.timeSeries.panelOptions.withTransparent(settings.transparent), 83 | 84 | // Create a stat panel with common settings 85 | statPanel(title, targets, options={}):: 86 | local defaults = { 87 | transparent: true, 88 | colorMode: 'value', 89 | graphMode: 'area', 90 | textMode: 'auto', 91 | }; 92 | local settings = defaults + options; 93 | 94 | grafonnet.panel.stat.new(title) 95 | + grafonnet.panel.stat.queryOptions.withTargets(targets) 96 | + grafonnet.panel.stat.options.withColorMode(settings.colorMode) 97 | + grafonnet.panel.stat.options.withGraphMode(settings.graphMode) 98 | + grafonnet.panel.stat.options.withTextMode(settings.textMode) 99 | + grafonnet.panel.stat.options.reduceOptions.withCalcs(['lastNotNull']) 100 | + grafonnet.panel.stat.standardOptions.withMin(0) 101 | + grafonnet.panel.stat.panelOptions.withTransparent(settings.transparent), 102 | 103 | // Create a gauge panel with common settings 104 | gaugePanel(title, targets, options={}):: 105 | local defaults = { 106 | transparent: true, 107 | min: 0, 108 | thresholds: [ 109 | { color: 'green', value: 0 }, 110 | { color: 'yellow', value: 75 }, 111 | { color: 'red', value: 90 }, 112 | ], 113 | unit: 'percent', 114 | decimals: 2, 115 | }; 116 | local settings = defaults + options; 117 | 118 | grafonnet.panel.gauge.new(title) 119 | + grafonnet.panel.gauge.queryOptions.withTargets(targets) 120 | + grafonnet.panel.gauge.options.reduceOptions.withCalcs(['lastNotNull']) 121 | + grafonnet.panel.gauge.options.withShowThresholdLabels(false) 122 | + grafonnet.panel.gauge.options.withShowThresholdMarkers(true) 123 | + grafonnet.panel.gauge.standardOptions.withMin(settings.min) 124 | + grafonnet.panel.gauge.standardOptions.withUnit(settings.unit) 125 | + grafonnet.panel.gauge.standardOptions.withDecimals(settings.decimals) 126 | + grafonnet.panel.gauge.standardOptions.thresholds.withMode('absolute') 127 | + 
grafonnet.panel.gauge.standardOptions.thresholds.withSteps(settings.thresholds) 128 | + grafonnet.panel.gauge.panelOptions.withTransparent(settings.transparent), 129 | 130 | // Create a bar chart panel 131 | barChartPanel(title, targets, options={}):: 132 | local defaults = { 133 | transparent: true, 134 | orientation: 'horizontal', 135 | showLegend: false, 136 | }; 137 | local settings = defaults + options; 138 | 139 | grafonnet.panel.barChart.new(title) 140 | + grafonnet.panel.barChart.queryOptions.withTargets(targets) 141 | + grafonnet.panel.barChart.options.withOrientation(settings.orientation) 142 | + grafonnet.panel.barChart.options.legend.withShowLegend(settings.showLegend) 143 | + grafonnet.panel.barChart.options.legend.withPlacement('bottom') 144 | + grafonnet.panel.barChart.standardOptions.withMin(0) 145 | + grafonnet.panel.barChart.panelOptions.withTransparent(settings.transparent), 146 | 147 | // Helper to create Prometheus query target 148 | prometheusQuery(expr, legendFormat='__auto', options={}):: 149 | local defaults = { 150 | datasource: { type: 'prometheus', uid: '${DS_PROMETHEUS}' }, 151 | editorMode: 'code', 152 | range: true, 153 | instant: false, 154 | exemplar: true, 155 | format: null, 156 | }; 157 | local settings = defaults + options; 158 | 159 | grafonnet.query.prometheus.new(settings.datasource.uid, expr) 160 | + grafonnet.query.prometheus.withLegendFormat(legendFormat) 161 | + grafonnet.query.prometheus.withEditorMode(settings.editorMode) 162 | + grafonnet.query.prometheus.withRange(settings.range) 163 | + grafonnet.query.prometheus.withInstant(settings.instant) 164 | + grafonnet.query.prometheus.withExemplar(settings.exemplar) 165 | + (if settings.format != null then { format: settings.format } else {}), 166 | 167 | // Create an HTML section header panel 168 | sectionHeaderPanel(title, options={}):: 169 | local defaults = { 170 | color: '#87CEEB', 171 | x: 0, 172 | y: 0, 173 | w: 24, 174 | h: 2, 175 | }; 176 | local settings = defaults 
+ options; 177 | local htmlContent = '

<h2 style="color: %(color)s; margin: 0;">%(title)s</h2>

' % { color: settings.color, title: title }; 178 | 179 | grafonnet.panel.text.new('') 180 | + grafonnet.panel.text.options.withContent(htmlContent) 181 | + grafonnet.panel.text.options.withMode('html') 182 | + grafonnet.panel.text.panelOptions.withTransparent(true) 183 | + grafonnet.panel.text.gridPos.withX(settings.x) 184 | + grafonnet.panel.text.gridPos.withY(settings.y) 185 | + grafonnet.panel.text.gridPos.withW(settings.w) 186 | + grafonnet.panel.text.gridPos.withH(settings.h) 187 | + { datasource: { type: 'datasource', uid: 'grafana' } }, 188 | 189 | // Auto-layout utilities 190 | local allMax = function(arr, def) std.foldl(std.max, arr, def), 191 | 192 | local getPos = function(parent, child) 193 | local 194 | cX0 = child.gridPos.w + parent.__nextX, 195 | mustWrap = cX0 > 24, 196 | combinedHeight = allMax(std.map(function(p) p.gridPos.y + p.gridPos.h, parent.panels), 0), 197 | cX = if mustWrap then 0 else parent.__nextX, 198 | cY = if mustWrap then combinedHeight else parent.__nextY, 199 | nX = cX + child.gridPos.w, 200 | nY = cY; 201 | { cX: cX, cY: cY, nX: nX, nY: nY }, 202 | 203 | local addPanel = function(parent, child) 204 | if child == {} 205 | then 206 | local 207 | nextY = getPos(parent, { gridPos: { w: 24 } }).nY; 208 | // Wrap to next line without creating spacer panels 209 | parent { 210 | __nextX: 0, 211 | __nextY: nextY, 212 | } 213 | else 214 | local 215 | coords = getPos(parent, child), 216 | child1 = child { gridPos+: { x: coords.cX, y: coords.cY } }; 217 | parent { 218 | __nextX: coords.nX, 219 | __nextY: coords.nY, 220 | panels+: [child1], 221 | }, 222 | 223 | // Layout panels automatically starting from yOffset 224 | layoutPanels(panels, yOffset=0):: 225 | local 226 | initialState = { 227 | __nextX: 0, 228 | __nextY: yOffset, 229 | panels: [], 230 | }, 231 | finalState = std.foldl(addPanel, panels, initialState); 232 | finalState.panels, 233 | } 234 | -------------------------------------------------------------------------------- 
/demo/config/prometheus/alert-rules.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: alert.rules 3 | rules: 4 | - alert: Brokers are down 5 | annotations: 6 | summary: The number of active brokers has been too low for more than 1 minute. 7 | expr: (max(max_over_time(redpanda_cluster_brokers{job='redpanda'}[30d])) - sum(up{job='redpanda'})) > 0 8 | for: 1m 9 | labels: 10 | severity: critical 11 | - alert: Brokers are down (alternative) 12 | annotations: 13 | summary: The number of active brokers has been too low for more than 1 minute. 14 | expr: (max_over_time(count(redpanda_application_uptime_seconds_total)[30d:]) - (count(redpanda_application_uptime_seconds_total) or on () vector(0))) > 0 15 | for: 1m 16 | labels: 17 | severity: critical 18 | - alert: Storage is degraded 19 | annotations: 20 | summary: Redpanda is alerting that storage is degraded for more than 1 minute, resulting in writes being rejected. 21 | expr: (redpanda_storage_disk_free_space_alert) > 1 22 | for: 1m 23 | labels: 24 | severity: critical 25 | - alert: Storage - there is less than 1 GiB of free space 26 | annotations: 27 | summary: There is less than 1 GiB free space available for more than 1 minute. 28 | expr: (redpanda_storage_disk_free_bytes) < 1073741824 29 | for: 1m 30 | labels: 31 | severity: critical 32 | - alert: Leaderless partitions 33 | annotations: 34 | summary: There are leaderless partitions for more than 1 minute, so some data may be unavailable. 35 | expr: (redpanda_cluster_unavailable_partitions) > 0 36 | for: 1m 37 | labels: 38 | severity: critical 39 | - alert: Low memory - there is less than 1 GiB of memory 40 | annotations: 41 | summary: There is less than 1 GiB memory available for more than 1 minute. 
42 | expr: (redpanda_memory_available_memory) < 1073741824 43 | for: 1m 44 | labels: 45 | severity: critical 46 | - alert: Storage - low space 47 | annotations: 48 | summary: Redpanda is alerting that space is too low for over 5 minutes. 49 | expr: (redpanda_storage_disk_free_space_alert) > 0 50 | for: 5m 51 | labels: 52 | severity: high 53 | - alert: Under-replicated partitions 54 | annotations: 55 | summary: There have been under-replicated partitions for over 5 minutes. 56 | expr: (redpanda_kafka_under_replicated_replicas) > 0 57 | for: 5m 58 | labels: 59 | severity: high 60 | - alert: Storage space is predicted to be less than 1 GiB in 30 minutes 61 | annotations: 62 | summary: Storage space has been consistently predicted to be less than 1 GiB (in 30 minutes), for over 5 minutes. 63 | expr: (predict_linear(redpanda_storage_disk_free_bytes[1h], 1800)) < 1073741824 64 | for: 5m 65 | labels: 66 | severity: high 67 | - alert: Memory is predicted to be less than 1 GiB in one hour 68 | annotations: 69 | summary: Memory has been consistently predicted to be less than 1 GiB (in one hour), for over 5 minutes. 70 | expr: (predict_linear(redpanda_memory_available_memory[30m], 1800)) < 1073741824 71 | for: 5m 72 | labels: 73 | severity: high 74 | - alert: More than 1% of Schema Registry requests results in an error 75 | annotations: 76 | summary: More than 1% of Schema Registry requests results in an error, for over 5 minutes. 77 | expr: (100 * (sum by (instance) (rate(redpanda_schema_registry_request_errors_total[5m])) / sum by (instance) (rate(redpanda_schema_registry_request_latency_seconds_count[5m])))) > 1 78 | for: 5m 79 | labels: 80 | severity: high 81 | - alert: More than 1% of Kafka RPC requests results in an error 82 | annotations: 83 | summary: More than 1% of Kafka RPC requests results in an error, for over 5 minutes.
84 | expr: (100 * (sum by (instance) (rate(redpanda_rpc_request_errors_total{redpanda_server="kafka"}[5m])) / sum by (instance) (rate(redpanda_rpc_request_latency_seconds_count{redpanda_server="kafka"}[5m])))) > 1 85 | for: 5m 86 | labels: 87 | severity: high 88 | - alert: More than 1% of internal RPC requests results in an error 89 | annotations: 90 | summary: More than 1% of internal RPC requests results in an error, for over 5 minutes. 91 | expr: (100 * (sum by (instance) (rate(redpanda_rpc_request_errors_total{redpanda_server="internal"}[5m])) / sum by (instance) (rate(redpanda_rpc_request_latency_seconds_count{redpanda_server="internal"}[5m])))) > 1 92 | for: 5m 93 | labels: 94 | severity: high 95 | - alert: More than 1% of REST requests results in an error 96 | annotations: 97 | summary: More than 1% of REST requests results in an error, for over 5 minutes. 98 | expr: (100 * (sum by (instance) (rate(redpanda_rest_proxy_request_errors_total[5m])) / sum by (instance) (rate(redpanda_rest_proxy_request_latency_seconds_count[5m])))) > 1 99 | for: 5m 100 | labels: 101 | severity: high 102 | - alert: Raft leadership is continually changing 103 | annotations: 104 | summary: Raft leadership is continually changing, rather than settling into a stable distribution, for over 5 minutes. 105 | expr: (rate(redpanda_raft_leadership_changes[1m])) > 0 106 | for: 5m 107 | labels: 108 | severity: high 109 | - alert: Kafka produce latency (p95) is too high (new handler metric) 110 | annotations: 111 | summary: Kafka produce latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes. 
112 | expr: (histogram_quantile(0.95, sum by(le) (rate(redpanda_kafka_handler_latency_seconds_bucket{handler="produce"}[5m])))) > 0.1 113 | for: 5m 114 | labels: 115 | severity: high 116 | - alert: Kafka consume latency (p95) is too high (new handler metric) 117 | annotations: 118 | summary: Kafka consume latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes. 119 | expr: (histogram_quantile(0.95, sum by(le) (rate(redpanda_kafka_handler_latency_seconds_bucket{handler="fetch"}[5m])))) > 0.1 120 | for: 5m 121 | labels: 122 | severity: high 123 | - alert: Kafka produce latency (p95) is too high 124 | annotations: 125 | summary: Kafka produce latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes. 126 | expr: (histogram_quantile(0.95, sum by(le) (rate(redpanda_kafka_request_latency_seconds_bucket{redpanda_request="produce"}[5m])))) > 0.1 127 | for: 5m 128 | labels: 129 | severity: high 130 | - alert: Kafka consume latency (p95) is too high 131 | annotations: 132 | summary: Kafka consume latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes. 133 | expr: (histogram_quantile(0.95, sum by(le) (rate(redpanda_kafka_request_latency_seconds_bucket{redpanda_request="consume"}[5m])))) > 0.1 134 | for: 5m 135 | labels: 136 | severity: high 137 | - alert: Internal RPC request latency (p95) is too high 138 | annotations: 139 | summary: Internal RPC request latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes. 140 | expr: (histogram_quantile(0.95, sum by(le) (rate(redpanda_rpc_request_latency_seconds_bucket{redpanda_server="internal"}[5m])))) > 0.1 141 | for: 5m 142 | labels: 143 | severity: high 144 | - alert: REST request latency (p95) is too high 145 | annotations: 146 | summary: REST request latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes. 
147 | expr: (histogram_quantile(0.95, sum by(le) (rate(redpanda_rest_proxy_request_latency_seconds_bucket[5m])))) > 0.1 148 | for: 5m 149 | labels: 150 | severity: high 151 | - alert: Schema Registry request latency (p95) is too high 152 | annotations: 153 | summary: Schema Registry request latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes. 154 | expr: (histogram_quantile(0.95, sum by(le) (rate(redpanda_schema_registry_request_latency_seconds_bucket[5m])))) > 0.1 155 | for: 5m 156 | labels: 157 | severity: high 158 | - alert: Storage - there is less than 10 GiB of free space 159 | annotations: 160 | summary: There is less than 10 GiB free space available for more than 5 minutes. 161 | expr: (redpanda_storage_disk_free_bytes) < 10737418240 162 | for: 5m 163 | labels: 164 | severity: medium 165 | - alert: Schema Registry errors are increasing 166 | annotations: 167 | summary: Schema Registry errors are increasing for more than 5 minutes. 168 | expr: (increase(redpanda_schema_registry_request_errors_total[1m])) > 0 169 | for: 5m 170 | labels: 171 | severity: medium 172 | - alert: Kafka RPC errors are increasing 173 | annotations: 174 | summary: Kafka RPC errors are increasing for more than 5 minutes. 175 | expr: (increase(redpanda_rpc_request_errors_total{redpanda_server="kafka"}[1m])) > 0 176 | for: 5m 177 | labels: 178 | severity: medium 179 | - alert: Internal RPC errors are increasing 180 | annotations: 181 | summary: Internal RPC errors are increasing for more than 5 minutes. 182 | expr: (increase(redpanda_rpc_request_errors_total{redpanda_server="internal"}[1m])) > 0 183 | for: 5m 184 | labels: 185 | severity: medium 186 | - alert: REST Proxy 3xx errors are increasing 187 | annotations: 188 | summary: REST Proxy 3xx errors are increasing for more than 5 minutes.
189 | expr: (increase(redpanda_rest_proxy_request_errors_total{redpanda_status="3xx"}[1m])) > 0 190 | for: 5m 191 | labels: 192 | severity: medium 193 | - alert: REST Proxy 4xx errors are increasing 194 | annotations: 195 | summary: REST Proxy 4xx errors are increasing for more than 5 minutes. 196 | expr: (increase(redpanda_rest_proxy_request_errors_total{redpanda_status="4xx"}[1m])) > 0 197 | for: 5m 198 | labels: 199 | severity: medium 200 | - alert: REST Proxy 5xx errors are increasing 201 | annotations: 202 | summary: REST Proxy 5xx errors are increasing for more than 5 minutes. 203 | expr: (increase(redpanda_rest_proxy_request_errors_total{redpanda_status="5xx"}[1m])) > 0 204 | for: 5m 205 | labels: 206 | severity: medium 207 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 
25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. 
If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. 
You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. 
(Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /demo/config/alert-definitions.yml: -------------------------------------------------------------------------------- 1 | rules: 2 | - alert: Brokers are down 3 | uid: c6f9ef08-5b6e-49b6-8b10-d55f40fbf719 4 | folder: Redpanda Critical 5 | evaluation_group: critical 6 | expr: max(max_over_time(redpanda_cluster_brokers{job='redpanda'}[30d])) - sum(up{job='redpanda'}) 7 | comparison: gt 8 | threshold: 0 9 | for: 1m 10 | labels: 11 | severity: critical 12 | annotations: 13 | summary: "The number of active brokers has been too low for more than 1 minute." 
14 | - alert: Brokers are down (alternative) 15 | uid: a32ac38d-8553-4708-a227-956720856af0 16 | folder: Redpanda Critical 17 | evaluation_group: critical 18 | expr: max_over_time(count(redpanda_application_uptime_seconds_total)[30d:]) - (count(redpanda_application_uptime_seconds_total) or on () vector(0)) 19 | comparison: gt 20 | threshold: 0 21 | for: 1m 22 | labels: 23 | severity: critical 24 | annotations: 25 | summary: "The number of active brokers has been too low for more than 1 minute." 26 | - alert: Storage is degraded 27 | uid: 6a936f22-882d-421a-8e3d-4039387e4670 28 | folder: Redpanda Critical 29 | evaluation_group: critical 30 | expr: redpanda_storage_disk_free_space_alert 31 | comparison: gt 32 | threshold: 1 33 | for: 1m 34 | labels: 35 | severity: critical 36 | annotations: 37 | summary: "Redpanda is alerting that storage is degraded for more than 1 minute, resulting in writes being rejected." 38 | - alert: Storage - there is less than 1 GiB of free space 39 | uid: 6c741284-d04f-4dd4-80b9-cdba8916e936 40 | folder: Redpanda Critical 41 | evaluation_group: critical 42 | expr: redpanda_storage_disk_free_bytes 43 | comparison: lt 44 | threshold: 1073741824 45 | for: 1m 46 | labels: 47 | severity: critical 48 | annotations: 49 | summary: "There is less than 1 GiB free space available for more than 1 minute." 50 | - alert: Leaderless partitions 51 | uid: 64622530-7b51-4ba3-9463-d6ea9b50903a 52 | folder: Redpanda Critical 53 | evaluation_group: critical 54 | expr: redpanda_cluster_unavailable_partitions 55 | comparison: gt 56 | threshold: 0 57 | for: 1m 58 | labels: 59 | severity: critical 60 | annotations: 61 | summary: "There are leaderless partitions for more than 1 minute, so some data may be unavailable." 
62 | - alert: Low memory - there is less than 1 GiB of memory 63 | uid: a9530700-4b13-4264-9fd3-6fa042f74c6d 64 | folder: Redpanda Critical 65 | evaluation_group: critical 66 | expr: redpanda_memory_available_memory 67 | comparison: lt 68 | threshold: 1073741824 69 | for: 1m 70 | labels: 71 | severity: critical 72 | annotations: 73 | summary: "There is less than 1 GiB memory available for more than 1 minute." 74 | - alert: Storage - low space 75 | uid: 38e89a48-249a-4709-bbcc-37698e474979 76 | folder: Redpanda Severe 77 | evaluation_group: severe 78 | expr: redpanda_storage_disk_free_space_alert 79 | comparison: gt 80 | threshold: 0 81 | for: 5m 82 | labels: 83 | severity: high 84 | annotations: 85 | summary: "Redpanda is alerting that space is too low for over 5 minutes." 86 | - alert: Under-replicated partitions 87 | uid: 0887a359-e688-424d-8da0-de97d15dad63 88 | folder: Redpanda Severe 89 | evaluation_group: severe 90 | expr: redpanda_kafka_under_replicated_replicas 91 | comparison: gt 92 | threshold: 0 93 | for: 5m 94 | labels: 95 | severity: high 96 | annotations: 97 | summary: "There have been under-replicated partitions for over 5 minutes." 98 | - alert: Storage space is predicted to be less than 1 GiB in 30 minutes 99 | uid: d6e142d2-cbfa-47ba-97f2-a025c1e859b4 100 | folder: Redpanda Severe 101 | evaluation_group: severe 102 | expr: predict_linear(redpanda_storage_disk_free_bytes[1h], 1800) 103 | comparison: lt 104 | threshold: 1073741824 105 | for: 5m 106 | labels: 107 | severity: high 108 | annotations: 109 | summary: "Storage space has been consistently predicted to be less than 1 GiB (in 30 minutes), for over 5 minutes."
110 | - alert: Memory is predicted to be less than 1 GiB in one hour 111 | uid: f7ae9b13-8019-45b2-a3fd-395f1619a1ce 112 | folder: Redpanda Severe 113 | evaluation_group: severe 114 | expr: predict_linear(redpanda_memory_available_memory[30m], 3600) 115 | comparison: lt 116 | threshold: 1073741824 117 | for: 5m 118 | labels: 119 | severity: high 120 | annotations: 121 | summary: "Memory has been consistently predicted to be less than 1 GiB (in one hour), for over 5 minutes." 122 | - alert: More than 1% of Schema Registry requests results in an error 123 | uid: 38720ef1-db87-470b-aa40-ba1c01c03ec1 124 | folder: Redpanda Severe 125 | evaluation_group: severe 126 | expr: 100 * (sum by (instance) (rate(redpanda_schema_registry_request_errors_total[5m])) / sum by (instance) (rate(redpanda_schema_registry_request_latency_seconds_count[5m]))) 127 | comparison: gt 128 | threshold: 1 129 | for: 5m 130 | labels: 131 | severity: high 132 | annotations: 133 | summary: "More than 1% of Schema Registry requests results in an error, for over 5 minutes." 134 | - alert: More than 1% of Kafka RPC requests results in an error 135 | uid: 1fa48d99-c597-4f9f-85b5-82f7bca1bd18 136 | folder: Redpanda Severe 137 | evaluation_group: severe 138 | expr: 100 * (sum by (instance) (rate(redpanda_rpc_request_errors_total{redpanda_server="kafka"}[5m])) / sum by (instance) (rate(redpanda_rpc_request_latency_seconds_count{redpanda_server="kafka"}[5m]))) 139 | comparison: gt 140 | threshold: 1 141 | for: 5m 142 | labels: 143 | severity: high 144 | annotations: 145 | summary: "More than 1% of Kafka RPC requests results in an error, for over 5 minutes."
146 | - alert: More than 1% of internal RPC requests results in an error 147 | uid: b3d9a6e2-5c47-4e1f-8a90-2f6c1d7e4b58 148 | folder: Redpanda Severe 149 | evaluation_group: severe 150 | expr: 100 * (sum by (instance) (rate(redpanda_rpc_request_errors_total{redpanda_server="internal"}[5m])) / sum by (instance) (rate(redpanda_rpc_request_latency_seconds_count{redpanda_server="internal"}[5m]))) 151 | comparison: gt 152 | threshold: 1 153 | for: 5m 154 | labels: 155 | severity: high 156 | annotations: 157 | summary: "More than 1% of internal RPC requests results in an error, for over 5 minutes." 158 | - alert: More than 1% of REST requests results in an error 159 | uid: 15337901-3519-4cd8-a9a8-da2f3b05cf3f 160 | folder: Redpanda Severe 161 | evaluation_group: severe 162 | expr: 100 * (sum by (instance) (rate(redpanda_rest_proxy_request_errors_total[5m])) / sum by (instance) (rate(redpanda_rest_proxy_request_latency_seconds_count[5m]))) 163 | comparison: gt 164 | threshold: 1 165 | for: 5m 166 | labels: 167 | severity: high 168 | annotations: 169 | summary: "More than 1% of REST requests results in an error, for over 5 minutes." 170 | - alert: Raft leadership is continually changing 171 | uid: 68230e62-122f-43ad-96bc-8fd8cabf9b75 172 | folder: Redpanda Severe 173 | evaluation_group: severe 174 | expr: rate(redpanda_raft_leadership_changes[1m]) 175 | comparison: gt 176 | threshold: 0 177 | for: 5m 178 | labels: 179 | severity: high 180 | annotations: 181 | summary: "Raft leadership is continually changing, rather than settling into a stable distribution, for over 5 minutes."
182 | - alert: Kafka produce latency (p95) is too high (new handler metric) 183 | uid: 9569032d-93c1-4724-bf99-8d5a707823ad 184 | folder: Redpanda Severe 185 | evaluation_group: severe 186 | expr: histogram_quantile(0.95, sum by(le) (rate(redpanda_kafka_handler_latency_seconds_bucket{handler="produce"}[5m]))) 187 | comparison: gt 188 | threshold: 0.1 189 | for: 5m 190 | labels: 191 | severity: high 192 | annotations: 193 | summary: "Kafka produce latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes." 194 | - alert: Kafka consume latency (p95) is too high (new handler metric) 195 | uid: bcc70d16-a944-4347-86ca-1e5dfe33ad4d 196 | folder: Redpanda Severe 197 | evaluation_group: severe 198 | expr: histogram_quantile(0.95, sum by(le) (rate(redpanda_kafka_handler_latency_seconds_bucket{handler="fetch"}[5m]))) 199 | comparison: gt 200 | threshold: 0.1 201 | for: 5m 202 | labels: 203 | severity: high 204 | annotations: 205 | summary: "Kafka consume latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes." 206 | - alert: Kafka produce latency (p95) is too high 207 | uid: 5e98342f-cd16-4dc6-96b5-dcc4c1831c21 208 | folder: Redpanda Severe 209 | evaluation_group: severe 210 | expr: histogram_quantile(0.95, sum by(le) (rate(redpanda_kafka_request_latency_seconds_bucket{redpanda_request="produce"}[5m]))) 211 | comparison: gt 212 | threshold: 0.1 213 | for: 5m 214 | labels: 215 | severity: high 216 | annotations: 217 | summary: "Kafka produce latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes." 
218 | - alert: Kafka consume latency (p95) is too high 219 | uid: 0ffa0b3a-5b6d-4247-aa0a-e3d012dbea9c 220 | folder: Redpanda Severe 221 | evaluation_group: severe 222 | expr: histogram_quantile(0.95, sum by(le) (rate(redpanda_kafka_request_latency_seconds_bucket{redpanda_request="consume"}[5m]))) 223 | comparison: gt 224 | threshold: 0.1 225 | for: 5m 226 | labels: 227 | severity: high 228 | annotations: 229 | summary: "Kafka consume latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes." 230 | - alert: Internal RPC request latency (p95) is too high 231 | uid: 66d78acb-1a09-49e2-8fcb-080f6180af28 232 | folder: Redpanda Severe 233 | evaluation_group: severe 234 | expr: histogram_quantile(0.95, sum by(le) (rate(redpanda_rpc_request_latency_seconds_bucket{redpanda_server="internal"}[5m]))) 235 | comparison: gt 236 | threshold: 0.1 237 | for: 5m 238 | labels: 239 | severity: high 240 | annotations: 241 | summary: "Internal RPC request latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes." 242 | - alert: REST request latency (p95) is too high 243 | uid: 4eb69c6f-15c7-47b2-a15b-caf12b43dd24 244 | folder: Redpanda Severe 245 | evaluation_group: severe 246 | expr: histogram_quantile(0.95, sum by(le) (rate(redpanda_rest_proxy_request_latency_seconds_bucket[5m]))) 247 | comparison: gt 248 | threshold: 0.1 249 | for: 5m 250 | labels: 251 | severity: high 252 | annotations: 253 | summary: "REST request latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes." 
254 | - alert: Schema Registry request latency (p95) is too high 255 | uid: 26861082-1112-4a0f-9929-a0cac66e568b 256 | folder: Redpanda Severe 257 | evaluation_group: severe 258 | expr: histogram_quantile(0.95, sum by(le) (rate(redpanda_schema_registry_request_latency_seconds_bucket[5m]))) 259 | comparison: gt 260 | threshold: 0.1 261 | for: 5m 262 | labels: 263 | severity: high 264 | annotations: 265 | summary: "Schema Registry request latency (95th percentile) is more than 100 milliseconds per request, for over 5 minutes." 266 | - alert: Storage - there is less than 10 GiB of free space 267 | uid: 9f83f219-2d53-4c15-80c4-2ee4637eadbc 268 | folder: Redpanda Moderate 269 | evaluation_group: moderate 270 | expr: redpanda_storage_disk_free_bytes 271 | comparison: lt 272 | threshold: 10737418240 273 | for: 5m 274 | labels: 275 | severity: medium 276 | annotations: 277 | summary: "There is less than 10 GiB free space available for more than 5 minutes." 278 | - alert: Schema Registry errors are increasing 279 | uid: dd76c0ad-72d1-4a4e-811d-0bca0a71cbcf 280 | folder: Redpanda Moderate 281 | evaluation_group: moderate 282 | expr: increase(redpanda_schema_registry_request_errors_total[1m]) 283 | comparison: gt 284 | threshold: 0 285 | for: 5m 286 | labels: 287 | severity: medium 288 | annotations: 289 | summary: "Schema Registry errors are increasing for more than 5 minutes." 290 | - alert: Kafka RPC errors are increasing 291 | uid: 8b788eff-bef7-42e1-8474-1f4704a06bb7 292 | folder: Redpanda Moderate 293 | evaluation_group: moderate 294 | expr: increase(redpanda_rpc_request_errors_total{redpanda_server="kafka"}[1m]) 295 | comparison: gt 296 | threshold: 0 297 | for: 5m 298 | labels: 299 | severity: medium 300 | annotations: 301 | summary: "Kafka RPC errors are increasing for more than 5 minutes."
302 | - alert: Internal RPC errors are increasing 303 | uid: c6fea529-b424-474b-b370-9b44fb67e5e2 304 | folder: Redpanda Moderate 305 | evaluation_group: moderate 306 | expr: increase(redpanda_rpc_request_errors_total{redpanda_server="internal"}[1m]) 307 | comparison: gt 308 | threshold: 0 309 | for: 5m 310 | labels: 311 | severity: medium 312 | annotations: 313 | summary: "Internal RPC errors are increasing for more than 5 minutes." 314 | - alert: REST Proxy 3xx errors are increasing 315 | uid: 067b4155-d360-4333-9557-c2c0b5fbed9a 316 | folder: Redpanda Moderate 317 | evaluation_group: moderate 318 | expr: increase(redpanda_rest_proxy_request_errors_total{redpanda_status="3xx"}[1m]) 319 | comparison: gt 320 | threshold: 0 321 | for: 5m 322 | labels: 323 | severity: medium 324 | annotations: 325 | summary: "REST Proxy 3xx errors are increasing for more than 5 minutes." 326 | - alert: REST Proxy 4xx errors are increasing 327 | uid: 30277d3f-f223-4220-a95b-847fe2378e49 328 | folder: Redpanda Moderate 329 | evaluation_group: moderate 330 | expr: increase(redpanda_rest_proxy_request_errors_total{redpanda_status="4xx"}[1m]) 331 | comparison: gt 332 | threshold: 0 333 | for: 5m 334 | labels: 335 | severity: medium 336 | annotations: 337 | summary: "REST Proxy 4xx errors are increasing for more than 5 minutes." 338 | - alert: REST Proxy 5xx errors are increasing 339 | uid: b7cfb1a4-ee6c-4562-ba44-074c535f5ef6 340 | folder: Redpanda Moderate 341 | evaluation_group: moderate 342 | expr: increase(redpanda_rest_proxy_request_errors_total{redpanda_status="5xx"}[1m]) 343 | comparison: gt 344 | threshold: 0 345 | for: 5m 346 | labels: 347 | severity: medium 348 | annotations: 349 | summary: "REST Proxy 5xx errors are increasing for more than 5 minutes." 
-------------------------------------------------------------------------------- /grafana-dashboards/Kafka-Topic-Metrics.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "DS_PROMETHEUS", 5 | "label": "Prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__elements": {}, 13 | "__requires": [ 14 | { 15 | "type": "grafana", 16 | "id": "grafana", 17 | "name": "Grafana", 18 | "version": "9.3.6" 19 | }, 20 | { 21 | "type": "panel", 22 | "id": "graph", 23 | "name": "Graph (old)", 24 | "version": "" 25 | }, 26 | { 27 | "type": "datasource", 28 | "id": "prometheus", 29 | "name": "Prometheus", 30 | "version": "1.0.0" 31 | }, 32 | { 33 | "type": "panel", 34 | "id": "table", 35 | "name": "Table", 36 | "version": "" 37 | } 38 | ], 39 | "annotations": { 40 | "list": [ 41 | { 42 | "builtIn": 1, 43 | "datasource": { 44 | "type": "datasource", 45 | "uid": "grafana" 46 | }, 47 | "enable": true, 48 | "hide": true, 49 | "iconColor": "rgba(0, 211, 255, 1)", 50 | "name": "Annotations & Alerts", 51 | "target": { 52 | "limit": 100, 53 | "matchAny": false, 54 | "tags": [], 55 | "type": "dashboard" 56 | }, 57 | "type": "dashboard" 58 | } 59 | ] 60 | }, 61 | "editable": true, 62 | "fiscalYearStartMonth": 0, 63 | "graphTooltip": 0, 64 | "id": null, 65 | "links": [], 66 | "liveNow": false, 67 | "panels": [ 68 | { 69 | "datasource": { 70 | "type": "prometheus", 71 | "uid": "${DS_PROMETHEUS}" 72 | }, 73 | "fieldConfig": { 74 | "defaults": { 75 | "custom": { 76 | "align": "auto", 77 | "displayMode": "auto", 78 | "inspect": false 79 | }, 80 | "mappings": [], 81 | "thresholds": { 82 | "mode": "absolute", 83 | "steps": [ 84 | { 85 | "color": "green", 86 | "value": null 87 | }, 88 | { 89 | "color": "red", 90 | "value": 80 91 | } 92 | ] 93 | }, 94 | "unit": "none" 95 | }, 96 | "overrides": [ 97 | { 98 | "matcher": { 99 | "id": 
"byName", 100 | "options": "Skew (%)" 101 | }, 102 | "properties": [ 103 | { 104 | "id": "unit", 105 | "value": "percent" 106 | }, 107 | { 108 | "id": "mappings", 109 | "value": [ 110 | { 111 | "options": { 112 | "NaN": { 113 | "index": 0, 114 | "text": "-" 115 | } 116 | }, 117 | "type": "value" 118 | } 119 | ] 120 | }, 121 | { 122 | "id": "custom.width", 123 | "value": 185 124 | }, 125 | { 126 | "id": "color", 127 | "value": { 128 | "mode": "thresholds" 129 | } 130 | }, 131 | { 132 | "id": "thresholds", 133 | "value": { 134 | "mode": "absolute", 135 | "steps": [ 136 | { 137 | "color": "green", 138 | "value": null 139 | }, 140 | { 141 | "color": "red", 142 | "value": 0.1 143 | } 144 | ] 145 | } 146 | } 147 | ] 148 | }, 149 | { 150 | "matcher": { 151 | "id": "byName", 152 | "options": "Partitions" 153 | }, 154 | "properties": [ 155 | { 156 | "id": "custom.width", 157 | "value": 139 158 | } 159 | ] 160 | }, 161 | { 162 | "matcher": { 163 | "id": "byName", 164 | "options": "Max Offset" 165 | }, 166 | "properties": [ 167 | { 168 | "id": "custom.width", 169 | "value": 149 170 | } 171 | ] 172 | }, 173 | { 174 | "matcher": { 175 | "id": "byRegexp", 176 | "options": ".*Throughput" 177 | }, 178 | "properties": [ 179 | { 180 | "id": "unit", 181 | "value": "Bps" 182 | } 183 | ] 184 | } 185 | ] 186 | }, 187 | "gridPos": { 188 | "h": 7, 189 | "w": 24, 190 | "x": 0, 191 | "y": 0 192 | }, 193 | "id": 8, 194 | "options": { 195 | "footer": { 196 | "fields": "", 197 | "reducer": [ 198 | "sum" 199 | ], 200 | "show": false 201 | }, 202 | "frameIndex": 0, 203 | "showHeader": true, 204 | "sortBy": [] 205 | }, 206 | "pluginVersion": "9.3.6", 207 | "targets": [ 208 | { 209 | "datasource": { 210 | "type": "prometheus", 211 | "uid": "${DS_PROMETHEUS}" 212 | }, 213 | "editorMode": "code", 214 | "exemplar": false, 215 | "expr": "(sum(redpanda_kafka_max_offset{topic!=\"__consumer_offsets\", redpanda_namespace=\"kafka\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\", 
redpanda_topic=~\"[[redpanda_topic]]\"}) by ([[aggr_criteria]])) ", 216 | "format": "table", 217 | "hide": false, 218 | "instant": true, 219 | "interval": "", 220 | "key": "Q-2f2af91b-9266-4c03-8a75-09b37685e65d-0", 221 | "range": false, 222 | "refId": "MaxOffset" 223 | }, 224 | { 225 | "datasource": { 226 | "type": "prometheus", 227 | "uid": "${DS_PROMETHEUS}" 228 | }, 229 | "editorMode": "code", 230 | "exemplar": false, 231 | "expr": "100 * abs((stddev by (redpanda_topic) (sum(redpanda_kafka_max_offset{redpanda_namespace=\"kafka\",redpanda_cloud_data_cluster_name=~\"\", redpanda_topic=~\"[[redpanda_topic]]\"}) by (redpanda_topic,redpanda_partition))) / (avg by (redpanda_topic) ((sum(redpanda_kafka_max_offset{redpanda_namespace=\"kafka\",redpanda_cloud_data_cluster_name=~\"\", redpanda_topic=~\"[[redpanda_topic]]\"}) by (redpanda_topic,redpanda_partition)))))", 232 | "format": "table", 233 | "hide": false, 234 | "instant": true, 235 | "range": false, 236 | "refId": "Skew" 237 | }, 238 | { 239 | "datasource": { 240 | "type": "prometheus", 241 | "uid": "${DS_PROMETHEUS}" 242 | }, 243 | "editorMode": "code", 244 | "exemplar": false, 245 | "expr": "max by(redpanda_topic)(redpanda_kafka_partitions{redpanda_namespace=\"kafka\",redpanda_cloud_data_cluster_name=~\"\", redpanda_topic=~\"[[redpanda_topic]]\"})", 246 | "format": "table", 247 | "hide": false, 248 | "instant": true, 249 | "range": false, 250 | "refId": "Partitions" 251 | }, 252 | { 253 | "datasource": { 254 | "type": "prometheus", 255 | "uid": "${DS_PROMETHEUS}" 256 | }, 257 | "editorMode": "code", 258 | "exemplar": false, 259 | "expr": "max by(redpanda_topic)(redpanda_kafka_replicas{redpanda_namespace=\"kafka\",redpanda_cloud_data_cluster_name=~\"\", redpanda_topic=~\"[[redpanda_topic]]\"})", 260 | "format": "table", 261 | "hide": false, 262 | "instant": true, 263 | "range": false, 264 | "refId": "Replicas" 265 | }, 266 | { 267 | "datasource": { 268 | "type": "prometheus", 269 | "uid": "${DS_PROMETHEUS}" 270 
| }, 271 | "editorMode": "code", 272 | "exemplar": false, 273 | "expr": "sum(rate(redpanda_kafka_request_bytes_total{redpanda_topic=~\"[[redpanda_topic]]\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\",redpanda_request=\"produce\"}[1m])) by (redpanda_topic)", 274 | "format": "table", 275 | "hide": false, 276 | "instant": true, 277 | "range": false, 278 | "refId": "ProduceThroughput" 279 | }, 280 | { 281 | "datasource": { 282 | "type": "prometheus", 283 | "uid": "${DS_PROMETHEUS}" 284 | }, 285 | "editorMode": "code", 286 | "exemplar": false, 287 | "expr": "sum(rate(redpanda_kafka_request_bytes_total{redpanda_topic=~\"[[redpanda_topic]]\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\",redpanda_request=\"consume\"}[1m])) by (redpanda_topic)", 288 | "format": "table", 289 | "hide": false, 290 | "instant": true, 291 | "range": false, 292 | "refId": "ConsumeThroughput" 293 | } 294 | ], 295 | "title": "Topic Summary", 296 | "transformations": [ 297 | { 298 | "id": "seriesToColumns", 299 | "options": { 300 | "byField": "redpanda_topic" 301 | } 302 | }, 303 | { 304 | "id": "organize", 305 | "options": { 306 | "excludeByName": { 307 | "Time": true, 308 | "Time 1": true, 309 | "Time 2": true, 310 | "Time 3": false, 311 | "Time 4": false, 312 | "redpanda_request": false 313 | }, 314 | "indexByName": { 315 | "Time 1": 1, 316 | "Time 2": 2, 317 | "Time 3": 7, 318 | "Time 4": 8, 319 | "Value #MaxOffset": 5, 320 | "Value #Partitions": 4, 321 | "Value #Replicas": 3, 322 | "Value #Skew": 6, 323 | "redpanda_topic": 0 324 | }, 325 | "renameByName": { 326 | "Time 1": "", 327 | "Value #A": "Topic Size", 328 | "Value #B": "Topic Skew (%age)", 329 | "Value #ConsumeThroughput": "Consume Throughput", 330 | "Value #MaxOffset": "Max Offset", 331 | "Value #Partitions": "Partitions", 332 | "Value #ProduceThroughput": "Produce Throughput", 333 | "Value #Replicas": "Replicas", 334 | "Value #Skew": "Skew (%)", 335 | "Value #Throughput": "Throughput", 336 | "redpanda_partition":
"Partition", 337 | "redpanda_topic": "Topic" 338 | } 339 | } 340 | } 341 | ], 342 | "type": "table" 343 | }, 344 | { 345 | "aliasColors": {}, 346 | "bars": false, 347 | "dashLength": 10, 348 | "dashes": false, 349 | "datasource": { 350 | "type": "prometheus", 351 | "uid": "${DS_PROMETHEUS}" 352 | }, 353 | "fill": 1, 354 | "fillGradient": 0, 355 | "gridPos": { 356 | "h": 7, 357 | "w": 24, 358 | "x": 0, 359 | "y": 7 360 | }, 361 | "hiddenSeries": false, 362 | "id": 2, 363 | "legend": { 364 | "avg": false, 365 | "current": false, 366 | "max": false, 367 | "min": false, 368 | "show": true, 369 | "total": false, 370 | "values": false 371 | }, 372 | "lines": true, 373 | "linewidth": 1, 374 | "nullPointMode": "null", 375 | "options": { 376 | "alertThreshold": true 377 | }, 378 | "percentage": false, 379 | "pluginVersion": "9.3.6", 380 | "pointradius": 2, 381 | "points": false, 382 | "renderer": "flot", 383 | "seriesOverrides": [], 384 | "spaceLength": 10, 385 | "stack": false, 386 | "steppedLine": false, 387 | "targets": [ 388 | { 389 | "datasource": { 390 | "type": "prometheus", 391 | "uid": "${DS_PROMETHEUS}" 392 | }, 393 | "editorMode": "code", 394 | "exemplar": true, 395 | "expr": "sum by([[aggr_criteria]]) (rate(redpanda_kafka_max_offset{redpanda_topic=~\"[[redpanda_topic]]\", redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}[1m]))", 396 | "interval": "", 397 | "legendFormat": "{{redpanda_topic}}:{{redpanda_partition}}", 398 | "range": true, 399 | "refId": "A" 400 | } 401 | ], 402 | "thresholds": [], 403 | "timeRegions": [], 404 | "title": "Records Received per Topic", 405 | "tooltip": { 406 | "shared": true, 407 | "sort": 0, 408 | "value_type": "individual" 409 | }, 410 | "type": "graph", 411 | "xaxis": { 412 | "mode": "time", 413 | "show": true, 414 | "values": [] 415 | }, 416 | "yaxes": [ 417 | { 418 | "$$hashKey": "object:24", 419 | "format": "short", 420 | "logBase": 1, 421 | "min": "0", 422 | "show": true 423 | }, 424 | { 425 | "$$hashKey": "object:25", 
426 | "format": "short", 427 | "logBase": 1, 428 | "show": true 429 | } 430 | ], 431 | "yaxis": { 432 | "align": false 433 | } 434 | }, 435 | { 436 | "aliasColors": {}, 437 | "bars": false, 438 | "dashLength": 10, 439 | "dashes": false, 440 | "datasource": { 441 | "type": "prometheus", 442 | "uid": "${DS_PROMETHEUS}" 443 | }, 444 | "fill": 1, 445 | "fillGradient": 0, 446 | "gridPos": { 447 | "h": 8, 448 | "w": 12, 449 | "x": 0, 450 | "y": 14 451 | }, 452 | "hiddenSeries": false, 453 | "id": 7, 454 | "legend": { 455 | "avg": false, 456 | "current": false, 457 | "max": false, 458 | "min": false, 459 | "show": true, 460 | "total": false, 461 | "values": false 462 | }, 463 | "lines": true, 464 | "linewidth": 1, 465 | "nullPointMode": "null", 466 | "options": { 467 | "alertThreshold": true 468 | }, 469 | "percentage": false, 470 | "pluginVersion": "9.3.6", 471 | "pointradius": 2, 472 | "points": false, 473 | "renderer": "flot", 474 | "seriesOverrides": [], 475 | "spaceLength": 10, 476 | "stack": false, 477 | "steppedLine": false, 478 | "targets": [ 479 | { 480 | "datasource": { 481 | "type": "prometheus", 482 | "uid": "${DS_PROMETHEUS}" 483 | }, 484 | "editorMode": "code", 485 | "exemplar": true, 486 | "expr": "sum(irate(redpanda_kafka_request_bytes_total{redpanda_topic=~\"[[redpanda_topic]]\", redpanda_request=\"produce\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}[1m])) by ([[aggr_criteria]])", 487 | "interval": "", 488 | "legendFormat": "{{redpanda_topic}}:{{redpanda_partition}}", 489 | "range": true, 490 | "refId": "A" 491 | } 492 | ], 493 | "thresholds": [], 494 | "timeRegions": [], 495 | "title": "Write Throughput per Topic", 496 | "tooltip": { 497 | "shared": true, 498 | "sort": 0, 499 | "value_type": "individual" 500 | }, 501 | "type": "graph", 502 | "xaxis": { 503 | "mode": "time", 504 | "show": true, 505 | "values": [] 506 | }, 507 | "yaxes": [ 508 | { 509 | "$$hashKey": "object:225", 510 | "format": "Bps", 511 | "logBase": 1, 512 | "min": "0", 513 
| "show": true 514 | }, 515 | { 516 | "$$hashKey": "object:226", 517 | "format": "short", 518 | "logBase": 1, 519 | "show": true 520 | } 521 | ], 522 | "yaxis": { 523 | "align": false 524 | } 525 | }, 526 | { 527 | "aliasColors": {}, 528 | "bars": false, 529 | "dashLength": 10, 530 | "dashes": false, 531 | "datasource": { 532 | "type": "prometheus", 533 | "uid": "${DS_PROMETHEUS}" 534 | }, 535 | "fill": 1, 536 | "fillGradient": 0, 537 | "gridPos": { 538 | "h": 8, 539 | "w": 12, 540 | "x": 12, 541 | "y": 14 542 | }, 543 | "hiddenSeries": false, 544 | "id": 6, 545 | "legend": { 546 | "avg": false, 547 | "current": false, 548 | "max": false, 549 | "min": false, 550 | "show": true, 551 | "total": false, 552 | "values": false 553 | }, 554 | "lines": true, 555 | "linewidth": 1, 556 | "nullPointMode": "null", 557 | "options": { 558 | "alertThreshold": true 559 | }, 560 | "percentage": false, 561 | "pluginVersion": "9.3.6", 562 | "pointradius": 2, 563 | "points": false, 564 | "renderer": "flot", 565 | "seriesOverrides": [], 566 | "spaceLength": 10, 567 | "stack": false, 568 | "steppedLine": false, 569 | "targets": [ 570 | { 571 | "datasource": { 572 | "type": "prometheus", 573 | "uid": "${DS_PROMETHEUS}" 574 | }, 575 | "editorMode": "code", 576 | "exemplar": true, 577 | "expr": "sum(irate(redpanda_kafka_request_bytes_total{redpanda_topic=~\"[[redpanda_topic]]\",redpanda_request=\"consume\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}[1m])) by ([[aggr_criteria]])", 578 | "interval": "", 579 | "legendFormat": "{{redpanda_topic}}:{{redpanda_partition}}", 580 | "range": true, 581 | "refId": "A" 582 | } 583 | ], 584 | "thresholds": [], 585 | "timeRegions": [], 586 | "title": "Read Throughput per Topic", 587 | "tooltip": { 588 | "shared": true, 589 | "sort": 0, 590 | "value_type": "individual" 591 | }, 592 | "type": "graph", 593 | "xaxis": { 594 | "mode": "time", 595 | "show": true, 596 | "values": [] 597 | }, 598 | "yaxes": [ 599 | { 600 | "$$hashKey":
"object:225", 601 | "format": "Bps", 602 | "logBase": 1, 603 | "min": "0", 604 | "show": true 605 | }, 606 | { 607 | "$$hashKey": "object:226", 608 | "format": "short", 609 | "logBase": 1, 610 | "show": true 611 | } 612 | ], 613 | "yaxis": { 614 | "align": false 615 | } 616 | } 617 | ], 618 | "refresh": "5s", 619 | "schemaVersion": 37, 620 | "style": "dark", 621 | "tags": [], 622 | "templating": { 623 | "list": [ 624 | { 625 | "current": { 626 | "selected": false, 627 | "text": "Prometheus", 628 | "value": "Prometheus" 629 | }, 630 | "hide": 0, 631 | "includeAll": false, 632 | "label": "Data Source", 633 | "multi": false, 634 | "name": "DS_PROMETHEUS", 635 | "options": [], 636 | "query": "prometheus", 637 | "refresh": 1, 638 | "regex": "", 639 | "skipUrlSync": false, 640 | "type": "datasource" 641 | }, 642 | { 643 | "current": {}, 644 | "datasource": { 645 | "type": "prometheus", 646 | "uid": "${DS_PROMETHEUS}" 647 | }, 648 | "definition": "label_values(vectorized_cloud_data_cluster_name)", 649 | "hide": 0, 650 | "includeAll": false, 651 | "label": "Data cluster", 652 | "multi": false, 653 | "name": "data_cluster", 654 | "options": [], 655 | "query": { 656 | "query": "label_values(vectorized_cloud_data_cluster_name)", 657 | "refId": "StandardVariableQuery" 658 | }, 659 | "refresh": 1, 660 | "regex": "", 661 | "skipUrlSync": false, 662 | "sort": 0, 663 | "type": "query" 664 | }, 665 | { 666 | "allValue": "", 667 | "current": {}, 668 | "datasource": { 669 | "type": "prometheus", 670 | "uid": "${DS_PROMETHEUS}" 671 | }, 672 | "definition": "label_values(redpanda_kafka_request_bytes_total{redpanda_namespace='kafka', redpanda_topic !~ 'controller|group'}, redpanda_topic)", 673 | "hide": 0, 674 | "includeAll": true, 675 | "label": "Topic", 676 | "multi": true, 677 | "name": "redpanda_topic", 678 | "options": [], 679 | "query": { 680 | "query": "label_values(redpanda_kafka_request_bytes_total{redpanda_namespace='kafka', redpanda_topic !~ 'controller|group'}, 
redpanda_topic)", 681 | "refId": "StandardVariableQuery" 682 | }, 683 | "refresh": 2, 684 | "regex": "", 685 | "skipUrlSync": false, 686 | "sort": 0, 687 | "tagValuesQuery": "", 688 | "tagsQuery": "", 689 | "type": "query", 690 | "useTags": false 691 | }, 692 | { 693 | "current": { 694 | "selected": true, 695 | "text": "Topic", 696 | "value": "redpanda_topic" 697 | }, 698 | "hide": 0, 699 | "includeAll": false, 700 | "label": "Aggregate by", 701 | "multi": false, 702 | "name": "aggr_criteria", 703 | "options": [ 704 | { 705 | "selected": true, 706 | "text": "Topic", 707 | "value": "redpanda_topic" 708 | }, 709 | { 710 | "selected": false, 711 | "text": "Topic,Partition", 712 | "value": "redpanda_topic,redpanda_partition" 713 | } 714 | ], 715 | "query": "Topic : redpanda_topic,Topic\\,Partition : redpanda_topic\\,redpanda_partition", 716 | "queryValue": "", 717 | "skipUrlSync": false, 718 | "type": "custom" 719 | }, 720 | { 721 | "datasource": { 722 | "type": "prometheus", 723 | "uid": "${DS_PROMETHEUS}" 724 | }, 725 | "filters": [], 726 | "hide": 0, 727 | "name": "Filters", 728 | "skipUrlSync": false, 729 | "type": "adhoc" 730 | } 731 | ] 732 | }, 733 | "time": { 734 | "from": "now-5m", 735 | "to": "now" 736 | }, 737 | "timepicker": {}, 738 | "timezone": "", 739 | "title": "Kafka Topic Metrics", 740 | "uid": "Nxwln29Mz", 741 | "version": 1, 742 | "weekStart": "" 743 | } 744 | -------------------------------------------------------------------------------- /grafana-dashboards/redpanda-data-transforms.json: -------------------------------------------------------------------------------- 1 | { 2 | "annotations": { 3 | "list": [ 4 | { 5 | "builtIn": 1, 6 | "datasource": { 7 | "type": "grafana", 8 | "uid": "-- Grafana --" 9 | }, 10 | "enable": true, 11 | "hide": true, 12 | "iconColor": "rgba(0, 211, 255, 1)", 13 | "name": "Annotations & Alerts", 14 | "target": { 15 | "limit": 100, 16 | "matchAny": false, 17 | "tags": [], 18 | "type": "dashboard" 19 | }, 20 | "type":
"dashboard" 21 | } 22 | ] 23 | }, 24 | "editable": true, 25 | "fiscalYearStartMonth": 0, 26 | "graphTooltip": 0, 27 | "id": 2, 28 | "links": [], 29 | "liveNow": false, 30 | "panels": [ 31 | { 32 | "datasource": {}, 33 | "fieldConfig": { 34 | "defaults": { 35 | "color": { 36 | "mode": "palette-classic" 37 | }, 38 | "custom": { 39 | "axisBorderShow": false, 40 | "axisCenteredZero": false, 41 | "axisColorMode": "text", 42 | "axisLabel": "", 43 | "axisPlacement": "auto", 44 | "barAlignment": 0, 45 | "drawStyle": "line", 46 | "fillOpacity": 0, 47 | "gradientMode": "none", 48 | "hideFrom": { 49 | "legend": false, 50 | "tooltip": false, 51 | "viz": false 52 | }, 53 | "insertNulls": false, 54 | "lineInterpolation": "linear", 55 | "lineWidth": 1, 56 | "pointSize": 5, 57 | "scaleDistribution": { 58 | "type": "linear" 59 | }, 60 | "showPoints": "auto", 61 | "spanNulls": false, 62 | "stacking": { 63 | "group": "A", 64 | "mode": "none" 65 | }, 66 | "thresholdsStyle": { 67 | "mode": "off" 68 | } 69 | }, 70 | "mappings": [], 71 | "thresholds": { 72 | "mode": "absolute", 73 | "steps": [ 74 | { 75 | "color": "green", 76 | "value": null 77 | }, 78 | { 79 | "color": "red", 80 | "value": 80 81 | } 82 | ] 83 | }, 84 | "unit": "short" 85 | }, 86 | "overrides": [] 87 | }, 88 | "gridPos": { 89 | "h": 8, 90 | "w": 12, 91 | "x": 0, 92 | "y": 0 93 | }, 94 | "id": 16, 95 | "options": { 96 | "legend": { 97 | "calcs": [], 98 | "displayMode": "list", 99 | "placement": "bottom", 100 | "showLegend": true 101 | }, 102 | "tooltip": { 103 | "mode": "single", 104 | "sort": "none" 105 | } 106 | }, 107 | "targets": [ 108 | { 109 | "datasource": { 110 | "type": "prometheus", 111 | "uid": "grafanacloud-prom" 112 | }, 113 | "disableTextWrap": false, 114 | "editorMode": "builder", 115 | "expr": "sum by(function_name) (redpanda_transform_lag)", 116 | "fullMetaSearch": false, 117 | "includeNullMetadata": true, 118 | "instant": false, 119 | "legendFormat": "{{function_name}}", 120 | "range": true, 121 |
"refId": "A", 122 | "useBackend": false 123 | } 124 | ], 125 | "title": "Lag", 126 | "type": "timeseries" 127 | }, 128 | { 129 | "datasource": {}, 130 | "fieldConfig": { 131 | "defaults": { 132 | "color": { 133 | "mode": "palette-classic" 134 | }, 135 | "custom": { 136 | "axisBorderShow": false, 137 | "axisCenteredZero": false, 138 | "axisColorMode": "text", 139 | "axisLabel": "", 140 | "axisPlacement": "auto", 141 | "barAlignment": 0, 142 | "drawStyle": "line", 143 | "fillOpacity": 0, 144 | "gradientMode": "none", 145 | "hideFrom": { 146 | "legend": false, 147 | "tooltip": false, 148 | "viz": false 149 | }, 150 | "insertNulls": false, 151 | "lineInterpolation": "linear", 152 | "lineWidth": 1, 153 | "pointSize": 5, 154 | "scaleDistribution": { 155 | "type": "linear" 156 | }, 157 | "showPoints": "auto", 158 | "spanNulls": false, 159 | "stacking": { 160 | "group": "A", 161 | "mode": "none" 162 | }, 163 | "thresholdsStyle": { 164 | "mode": "off" 165 | } 166 | }, 167 | "mappings": [], 168 | "thresholds": { 169 | "mode": "absolute", 170 | "steps": [ 171 | { 172 | "color": "green", 173 | "value": null 174 | }, 175 | { 176 | "color": "red", 177 | "value": 80 178 | } 179 | ] 180 | }, 181 | "unit": "decbytes" 182 | }, 183 | "overrides": [] 184 | }, 185 | "gridPos": { 186 | "h": 8, 187 | "w": 12, 188 | "x": 12, 189 | "y": 0 190 | }, 191 | "id": 1, 192 | "options": { 193 | "legend": { 194 | "calcs": [], 195 | "displayMode": "list", 196 | "placement": "bottom", 197 | "showLegend": true 198 | }, 199 | "tooltip": { 200 | "mode": "single", 201 | "sort": "none" 202 | } 203 | }, 204 | "targets": [ 205 | { 206 | "datasource": { 207 | "type": "prometheus", 208 | "uid": "P1809F7CD0C75ACF3" 209 | }, 210 | "disableTextWrap": false, 211 | "editorMode": "builder", 212 | "exemplar": false, 213 | "expr": "sum by(function_name) (rate(redpanda_transform_write_bytes[$__rate_interval]))", 214 | "fullMetaSearch": false, 215 | "includeNullMetadata": true, 216 | "instant": false, 217 | "key":
"Q-eff1bc59-6f94-4e27-891a-803c7ec7f8f5-0", 218 | "legendFormat": "write-{{function_name}}", 219 | "range": true, 220 | "refId": "B", 221 | "useBackend": false 222 | } 223 | ], 224 | "title": "Egress Throughput", 225 | "type": "timeseries" 226 | }, 227 | { 228 | "datasource": {}, 229 | "fieldConfig": { 230 | "defaults": { 231 | "color": { 232 | "mode": "palette-classic" 233 | }, 234 | "custom": { 235 | "axisBorderShow": false, 236 | "axisCenteredZero": false, 237 | "axisColorMode": "text", 238 | "axisLabel": "", 239 | "axisPlacement": "auto", 240 | "barAlignment": 0, 241 | "drawStyle": "line", 242 | "fillOpacity": 0, 243 | "gradientMode": "none", 244 | "hideFrom": { 245 | "legend": false, 246 | "tooltip": false, 247 | "viz": false 248 | }, 249 | "insertNulls": false, 250 | "lineInterpolation": "linear", 251 | "lineWidth": 1, 252 | "pointSize": 5, 253 | "scaleDistribution": { 254 | "type": "linear" 255 | }, 256 | "showPoints": "auto", 257 | "spanNulls": false, 258 | "stacking": { 259 | "group": "A", 260 | "mode": "none" 261 | }, 262 | "thresholdsStyle": { 263 | "mode": "off" 264 | } 265 | }, 266 | "mappings": [], 267 | "thresholds": { 268 | "mode": "absolute", 269 | "steps": [ 270 | { 271 | "color": "green", 272 | "value": null 273 | }, 274 | { 275 | "color": "red", 276 | "value": 80 277 | } 278 | ] 279 | }, 280 | "unit": "s" 281 | }, 282 | "overrides": [] 283 | }, 284 | "gridPos": { 285 | "h": 8, 286 | "w": 12, 287 | "x": 0, 288 | "y": 8 289 | }, 290 | "id": 5, 291 | "options": { 292 | "legend": { 293 | "calcs": [], 294 | "displayMode": "list", 295 | "placement": "bottom", 296 | "showLegend": true 297 | }, 298 | "tooltip": { 299 | "mode": "single", 300 | "sort": "none" 301 | } 302 | }, 303 | "targets": [ 304 | { 305 | "datasource": { 306 | "type": "prometheus", 307 | "uid": "P1809F7CD0C75ACF3" 308 | }, 309 | "disableTextWrap": false, 310 | "editorMode": "builder", 311 | "expr": "histogram_quantile(0.99, sum by(le, function_name)
(rate(redpanda_transform_execution_latency_sec_bucket[$__rate_interval])))", 312 | "fullMetaSearch": false, 313 | "hide": false, 314 | "includeNullMetadata": true, 315 | "legendFormat": "P99 {{function_name}}", 316 | "range": true, 317 | "refId": "B", 318 | "useBackend": false 319 | }, 320 | { 321 | "datasource": { 322 | "type": "prometheus", 323 | "uid": "P1809F7CD0C75ACF3" 324 | }, 325 | "disableTextWrap": false, 326 | "editorMode": "builder", 327 | "expr": "histogram_quantile(0.95, sum by(le, function_name) (rate(redpanda_transform_execution_latency_sec_bucket[$__rate_interval])))", 328 | "fullMetaSearch": false, 329 | "includeNullMetadata": true, 330 | "legendFormat": "P95 {{function_name}}", 331 | "range": true, 332 | "refId": "A", 333 | "useBackend": false 334 | }, 335 | { 336 | "datasource": { 337 | "type": "prometheus", 338 | "uid": "P1809F7CD0C75ACF3" 339 | }, 340 | "disableTextWrap": false, 341 | "editorMode": "builder", 342 | "expr": "histogram_quantile(0.5, sum by(le, function_name) (rate(redpanda_transform_execution_latency_sec_bucket[$__rate_interval])))", 343 | "fullMetaSearch": false, 344 | "hide": false, 345 | "includeNullMetadata": true, 346 | "legendFormat": "P50 {{function_name}}", 347 | "range": true, 348 | "refId": "C", 349 | "useBackend": false 350 | } 351 | ], 352 | "title": "Execution Latency", 353 | "type": "timeseries" 354 | }, 355 | { 356 | "datasource": {}, 357 | "fieldConfig": { 358 | "defaults": { 359 | "color": { 360 | "mode": "palette-classic" 361 | }, 362 | "custom": { 363 | "axisBorderShow": false, 364 | "axisCenteredZero": false, 365 | "axisColorMode": "text", 366 | "axisLabel": "", 367 | "axisPlacement": "auto", 368 | "barAlignment": 0, 369 | "drawStyle": "line", 370 | "fillOpacity": 0, 371 | "gradientMode": "none", 372 | "hideFrom": { 373 | "legend": false, 374 | "tooltip": false, 375 | "viz": false 376 | }, 377 | "insertNulls": false, 378 | "lineInterpolation": "linear", 379 | "lineWidth": 1, 380 | "pointSize": 5, 381 | 
"scaleDistribution": { 382 | "type": "linear" 383 | }, 384 | "showPoints": "auto", 385 | "spanNulls": false, 386 | "stacking": { 387 | "group": "A", 388 | "mode": "none" 389 | }, 390 | "thresholdsStyle": { 391 | "mode": "off" 392 | } 393 | }, 394 | "mappings": [], 395 | "thresholds": { 396 | "mode": "absolute", 397 | "steps": [ 398 | { 399 | "color": "green", 400 | "value": null 401 | }, 402 | { 403 | "color": "red", 404 | "value": 80 405 | } 406 | ] 407 | }, 408 | "unit": "decbytes" 409 | }, 410 | "overrides": [] 411 | }, 412 | "gridPos": { 413 | "h": 8, 414 | "w": 12, 415 | "x": 12, 416 | "y": 8 417 | }, 418 | "id": 17, 419 | "options": { 420 | "legend": { 421 | "calcs": [], 422 | "displayMode": "list", 423 | "placement": "bottom", 424 | "showLegend": true 425 | }, 426 | "tooltip": { 427 | "mode": "single", 428 | "sort": "none" 429 | } 430 | }, 431 | "targets": [ 432 | { 433 | "datasource": { 434 | "type": "prometheus", 435 | "uid": "P1809F7CD0C75ACF3" 436 | }, 437 | "disableTextWrap": false, 438 | "editorMode": "builder", 439 | "exemplar": false, 440 | "expr": "sum by(function_name) (rate(redpanda_transform_read_bytes[$__rate_interval]))", 441 | "fullMetaSearch": false, 442 | "includeNullMetadata": true, 443 | "instant": false, 444 | "key": "Q-eff1bc59-6f94-4e27-891a-803c7ec7f8f5-0", 445 | "legendFormat": "read-{{function_name}}", 446 | "range": true, 447 | "refId": "A", 448 | "useBackend": false 449 | } 450 | ], 451 | "title": "Ingress Throughput", 452 | "type": "timeseries" 453 | }, 454 | { 455 | "datasource": { 456 | "type": "prometheus", 457 | "uid": "PBFA97CFB590B2093" 458 | }, 459 | "fieldConfig": { 460 | "defaults": { 461 | "color": { 462 | "mode": "palette-classic" 463 | }, 464 | "custom": { 465 | "axisBorderShow": false, 466 | "axisCenteredZero": false, 467 | "axisColorMode": "text", 468 | "axisLabel": "", 469 | "axisPlacement": "auto", 470 | "barAlignment": 0, 471 | "drawStyle": "line", 472 | "fillOpacity": 0, 473 | "gradientMode": "none", 474 | 
"hideFrom": { 475 | "legend": false, 476 | "tooltip": false, 477 | "viz": false 478 | }, 479 | "insertNulls": false, 480 | "lineInterpolation": "linear", 481 | "lineWidth": 1, 482 | "pointSize": 5, 483 | "scaleDistribution": { 484 | "type": "linear" 485 | }, 486 | "showPoints": "auto", 487 | "spanNulls": false, 488 | "stacking": { 489 | "group": "A", 490 | "mode": "none" 491 | }, 492 | "thresholdsStyle": { 493 | "mode": "off" 494 | } 495 | }, 496 | "fieldMinMax": false, 497 | "mappings": [], 498 | "thresholds": { 499 | "mode": "absolute", 500 | "steps": [ 501 | { 502 | "color": "green", 503 | "value": null 504 | }, 505 | { 506 | "color": "red", 507 | "value": 80 508 | } 509 | ] 510 | }, 511 | "unit": "none" 512 | }, 513 | "overrides": [ 514 | { 515 | "matcher": { 516 | "id": "byRegexp", 517 | "options": "errored-/.*/" 518 | }, 519 | "properties": [ 520 | { 521 | "id": "color", 522 | "value": { 523 | "fixedColor": "red", 524 | "mode": "fixed" 525 | } 526 | } 527 | ] 528 | }, 529 | { 530 | "matcher": { 531 | "id": "byRegexp", 532 | "options": "active-/.*/" 533 | }, 534 | "properties": [ 535 | { 536 | "id": "color", 537 | "value": { 538 | "fixedColor": "green", 539 | "mode": "fixed" 540 | } 541 | } 542 | ] 543 | }, 544 | { 545 | "matcher": { 546 | "id": "byRegexp", 547 | "options": "inactive-/.*/" 548 | }, 549 | "properties": [ 550 | { 551 | "id": "color", 552 | "value": { 553 | "fixedColor": "blue", 554 | "mode": "fixed" 555 | } 556 | } 557 | ] 558 | } 559 | ] 560 | }, 561 | "gridPos": { 562 | "h": 8, 563 | "w": 12, 564 | "x": 0, 565 | "y": 16 566 | }, 567 | "id": 3, 568 | "options": { 569 | "legend": { 570 | "calcs": [], 571 | "displayMode": "list", 572 | "placement": "bottom", 573 | "showLegend": true 574 | }, 575 | "tooltip": { 576 | "mode": "single", 577 | "sort": "none" 578 | } 579 | }, 580 | "targets": [ 581 | { 582 | "datasource": { 583 | "type": "prometheus", 584 | "uid": "P1809F7CD0C75ACF3" 585 | }, 586 | "disableTextWrap": false, 587 | "editorMode": "code", 
588 | "expr": "sum by(function_name, state) (redpanda_transform_state) > 0 ", 589 | "fullMetaSearch": false, 590 | "includeNullMetadata": true, 591 | "legendFormat": "{{state}}-{{function_name}}", 592 | "range": true, 593 | "refId": "A", 594 | "useBackend": false 595 | } 596 | ], 597 | "title": "Transforms Running", 598 | "type": "timeseries" 599 | }, 600 | { 601 | "datasource": { 602 | "type": "prometheus", 603 | "uid": "PBFA97CFB590B2093" 604 | }, 605 | "fieldConfig": { 606 | "defaults": { 607 | "color": { 608 | "mode": "palette-classic" 609 | }, 610 | "custom": { 611 | "axisBorderShow": false, 612 | "axisCenteredZero": false, 613 | "axisColorMode": "text", 614 | "axisLabel": "", 615 | "axisPlacement": "auto", 616 | "barAlignment": 0, 617 | "drawStyle": "line", 618 | "fillOpacity": 0, 619 | "gradientMode": "none", 620 | "hideFrom": { 621 | "legend": false, 622 | "tooltip": false, 623 | "viz": false 624 | }, 625 | "insertNulls": false, 626 | "lineInterpolation": "linear", 627 | "lineWidth": 1, 628 | "pointSize": 5, 629 | "scaleDistribution": { 630 | "type": "linear" 631 | }, 632 | "showPoints": "auto", 633 | "spanNulls": false, 634 | "stacking": { 635 | "group": "A", 636 | "mode": "none" 637 | }, 638 | "thresholdsStyle": { 639 | "mode": "off" 640 | } 641 | }, 642 | "mappings": [], 643 | "thresholds": { 644 | "mode": "absolute", 645 | "steps": [ 646 | { 647 | "color": "green", 648 | "value": null 649 | }, 650 | { 651 | "color": "red", 652 | "value": 80 653 | } 654 | ] 655 | }, 656 | "unit": "bytes" 657 | }, 658 | "overrides": [] 659 | }, 660 | "gridPos": { 661 | "h": 8, 662 | "w": 12, 663 | "x": 12, 664 | "y": 16 665 | }, 666 | "id": 7, 667 | "options": { 668 | "legend": { 669 | "calcs": [ 670 | "last" 671 | ], 672 | "displayMode": "list", 673 | "placement": "bottom", 674 | "showLegend": true 675 | }, 676 | "tooltip": { 677 | "mode": "single", 678 | "sort": "none" 679 | } 680 | }, 681 | "targets": [ 682 | { 683 | "datasource": { 684 | "type": "prometheus", 685 | 
"uid": "P1809F7CD0C75ACF3" 686 | }, 687 | "disableTextWrap": false, 688 | "editorMode": "code", 689 | "expr": "sum by(function_name) (redpanda_wasm_engine_memory_usage) ", 690 | "fullMetaSearch": false, 691 | "includeNullMetadata": true, 692 | "legendFormat": "Usage-{{function_name}}", 693 | "range": true, 694 | "refId": "A", 695 | "useBackend": false 696 | }, 697 | { 698 | "datasource": { 699 | "type": "prometheus", 700 | "uid": "P1809F7CD0C75ACF3" 701 | }, 702 | "disableTextWrap": false, 703 | "editorMode": "builder", 704 | "expr": "sum by(function_name) (redpanda_wasm_engine_max_memory)", 705 | "fullMetaSearch": false, 706 | "hide": false, 707 | "includeNullMetadata": true, 708 | "legendFormat": "Max-{{function_name}}", 709 | "range": true, 710 | "refId": "B", 711 | "useBackend": false 712 | } 713 | ], 714 | "title": "Memory Usage", 715 | "type": "timeseries" 716 | }, 717 | { 718 | "datasource": { 719 | "type": "prometheus", 720 | "uid": "PBFA97CFB590B2093" 721 | }, 722 | "fieldConfig": { 723 | "defaults": { 724 | "color": { 725 | "mode": "palette-classic" 726 | }, 727 | "custom": { 728 | "axisBorderShow": false, 729 | "axisCenteredZero": false, 730 | "axisColorMode": "text", 731 | "axisLabel": "", 732 | "axisPlacement": "auto", 733 | "barAlignment": 0, 734 | "drawStyle": "line", 735 | "fillOpacity": 0, 736 | "gradientMode": "none", 737 | "hideFrom": { 738 | "legend": false, 739 | "tooltip": false, 740 | "viz": false 741 | }, 742 | "insertNulls": false, 743 | "lineInterpolation": "linear", 744 | "lineWidth": 1, 745 | "pointSize": 5, 746 | "scaleDistribution": { 747 | "type": "linear" 748 | }, 749 | "showPoints": "auto", 750 | "spanNulls": false, 751 | "stacking": { 752 | "group": "A", 753 | "mode": "none" 754 | }, 755 | "thresholdsStyle": { 756 | "mode": "off" 757 | } 758 | }, 759 | "mappings": [], 760 | "thresholds": { 761 | "mode": "absolute", 762 | "steps": [ 763 | { 764 | "color": "green", 765 | "value": null 766 | }, 767 | { 768 | "color": "red", 769 | 
"value": 80 770 | } 771 | ] 772 | }, 773 | "unit": "s" 774 | }, 775 | "overrides": [] 776 | }, 777 | "gridPos": { 778 | "h": 8, 779 | "w": 12, 780 | "x": 0, 781 | "y": 24 782 | }, 783 | "id": 15, 784 | "options": { 785 | "legend": { 786 | "calcs": [], 787 | "displayMode": "list", 788 | "placement": "bottom", 789 | "showLegend": true 790 | }, 791 | "tooltip": { 792 | "mode": "single", 793 | "sort": "none" 794 | } 795 | }, 796 | "targets": [ 797 | { 798 | "datasource": { 799 | "type": "prometheus", 800 | "uid": "P1809F7CD0C75ACF3" 801 | }, 802 | "disableTextWrap": false, 803 | "editorMode": "code", 804 | "expr": "sum by(function_name) (rate(redpanda_wasm_engine_cpu_seconds_total[$__rate_interval])) > 0 ", 805 | "fullMetaSearch": false, 806 | "includeNullMetadata": true, 807 | "legendFormat": "{{function_name}}", 808 | "range": true, 809 | "refId": "A", 810 | "useBackend": false 811 | } 812 | ], 813 | "title": "CPU Usage", 814 | "type": "timeseries" 815 | }, 816 | { 817 | "datasource": { 818 | "type": "prometheus", 819 | "uid": "PBFA97CFB590B2093" 820 | }, 821 | "fieldConfig": { 822 | "defaults": { 823 | "color": { 824 | "mode": "palette-classic" 825 | }, 826 | "custom": { 827 | "axisBorderShow": false, 828 | "axisCenteredZero": false, 829 | "axisColorMode": "text", 830 | "axisLabel": "", 831 | "axisPlacement": "auto", 832 | "barAlignment": 0, 833 | "drawStyle": "line", 834 | "fillOpacity": 0, 835 | "gradientMode": "none", 836 | "hideFrom": { 837 | "legend": false, 838 | "tooltip": false, 839 | "viz": false 840 | }, 841 | "insertNulls": false, 842 | "lineInterpolation": "linear", 843 | "lineWidth": 1, 844 | "pointSize": 5, 845 | "scaleDistribution": { 846 | "type": "linear" 847 | }, 848 | "showPoints": "auto", 849 | "spanNulls": false, 850 | "stacking": { 851 | "group": "A", 852 | "mode": "none" 853 | }, 854 | "thresholdsStyle": { 855 | "mode": "off" 856 | } 857 | }, 858 | "mappings": [], 859 | "thresholds": { 860 | "mode": "absolute", 861 | "steps": [ 862 | { 863 | 
"color": "green", 864 | "value": null 865 | }, 866 | { 867 | "color": "red", 868 | "value": 80 869 | } 870 | ] 871 | }, 872 | "unit": "ops" 873 | }, 874 | "overrides": [] 875 | }, 876 | "gridPos": { 877 | "h": 8, 878 | "w": 12, 879 | "x": 12, 880 | "y": 24 881 | }, 882 | "id": 18, 883 | "options": { 884 | "legend": { 885 | "calcs": [], 886 | "displayMode": "list", 887 | "placement": "bottom", 888 | "showLegend": true 889 | }, 890 | "tooltip": { 891 | "mode": "single", 892 | "sort": "none" 893 | } 894 | }, 895 | "targets": [ 896 | { 897 | "datasource": { 898 | "type": "prometheus", 899 | "uid": "grafanacloud-prom" 900 | }, 901 | "disableTextWrap": false, 902 | "editorMode": "code", 903 | "expr": "sum by(function_name) (rate(redpanda_transform_execution_latency_sec_count[$__rate_interval])) ", 904 | "fullMetaSearch": false, 905 | "includeNullMetadata": false, 906 | "instant": false, 907 | "legendFormat": "{{function_name}}", 908 | "range": true, 909 | "refId": "A", 910 | "useBackend": false 911 | } 912 | ], 913 | "title": "Records Transformed", 914 | "type": "timeseries" 915 | }, 916 | { 917 | "datasource": { 918 | "type": "prometheus", 919 | "uid": "PBFA97CFB590B2093" 920 | }, 921 | "fieldConfig": { 922 | "defaults": { 923 | "color": { 924 | "mode": "palette-classic" 925 | }, 926 | "custom": { 927 | "axisBorderShow": false, 928 | "axisCenteredZero": false, 929 | "axisColorMode": "text", 930 | "axisLabel": "", 931 | "axisPlacement": "auto", 932 | "barAlignment": 0, 933 | "drawStyle": "line", 934 | "fillOpacity": 0, 935 | "gradientMode": "none", 936 | "hideFrom": { 937 | "legend": false, 938 | "tooltip": false, 939 | "viz": false 940 | }, 941 | "insertNulls": false, 942 | "lineInterpolation": "linear", 943 | "lineWidth": 1, 944 | "pointSize": 5, 945 | "scaleDistribution": { 946 | "type": "linear" 947 | }, 948 | "showPoints": "auto", 949 | "spanNulls": false, 950 | "stacking": { 951 | "group": "A", 952 | "mode": "none" 953 | }, 954 | "thresholdsStyle": { 955 | "mode": 
"off" 956 | } 957 | }, 958 | "mappings": [], 959 | "thresholds": { 960 | "mode": "absolute", 961 | "steps": [ 962 | { 963 | "color": "green", 964 | "value": null 965 | }, 966 | { 967 | "color": "red", 968 | "value": 80 969 | } 970 | ] 971 | } 972 | }, 973 | "overrides": [] 974 | }, 975 | "gridPos": { 976 | "h": 8, 977 | "w": 12, 978 | "x": 0, 979 | "y": 32 980 | }, 981 | "id": 13, 982 | "options": { 983 | "legend": { 984 | "calcs": [], 985 | "displayMode": "list", 986 | "placement": "bottom", 987 | "showLegend": true 988 | }, 989 | "tooltip": { 990 | "mode": "single", 991 | "sort": "none" 992 | } 993 | }, 994 | "targets": [ 995 | { 996 | "datasource": { 997 | "type": "prometheus", 998 | "uid": "P1809F7CD0C75ACF3" 999 | }, 1000 | "disableTextWrap": false, 1001 | "editorMode": "code", 1002 | "expr": "sum by(function_name) (rate(redpanda_transform_failures[$__rate_interval])) > 0 ", 1003 | "fullMetaSearch": false, 1004 | "includeNullMetadata": true, 1005 | "legendFormat": "Failures-{{function_name}}", 1006 | "range": true, 1007 | "refId": "A", 1008 | "useBackend": false 1009 | } 1010 | ], 1011 | "title": "Transform Failures", 1012 | "type": "timeseries" 1013 | }, 1014 | { 1015 | "datasource": { 1016 | "type": "prometheus", 1017 | "uid": "PBFA97CFB590B2093" 1018 | }, 1019 | "fieldConfig": { 1020 | "defaults": { 1021 | "color": { 1022 | "mode": "thresholds" 1023 | }, 1024 | "mappings": [], 1025 | "thresholds": { 1026 | "mode": "absolute", 1027 | "steps": [ 1028 | { 1029 | "color": "green", 1030 | "value": null 1031 | } 1032 | ] 1033 | }, 1034 | "unit": "decbytes" 1035 | }, 1036 | "overrides": [] 1037 | }, 1038 | "gridPos": { 1039 | "h": 8, 1040 | "w": 12, 1041 | "x": 12, 1042 | "y": 32 1043 | }, 1044 | "id": 11, 1045 | "options": { 1046 | "colorMode": "value", 1047 | "graphMode": "area", 1048 | "justifyMode": "auto", 1049 | "orientation": "auto", 1050 | "reduceOptions": { 1051 | "calcs": [ 1052 | "lastNotNull" 1053 | ], 1054 | "fields": "", 1055 | "values": false 1056 | 
}, 1057 | "showPercentChange": false, 1058 | "textMode": "auto", 1059 | "wideLayout": true 1060 | }, 1061 | "pluginVersion": "10.2.2", 1062 | "targets": [ 1063 | { 1064 | "datasource": { 1065 | "type": "prometheus", 1066 | "uid": "P1809F7CD0C75ACF3" 1067 | }, 1068 | "disableTextWrap": false, 1069 | "editorMode": "code", 1070 | "expr": "max(redpanda_wasm_binary_executable_memory_usage) ", 1071 | "fullMetaSearch": false, 1072 | "includeNullMetadata": true, 1073 | "legendFormat": "{{instance}}", 1074 | "range": true, 1075 | "refId": "A", 1076 | "useBackend": false 1077 | } 1078 | ], 1079 | "title": "Executable Memory", 1080 | "type": "stat" 1081 | } 1082 | ], 1083 | "refresh": "", 1084 | "schemaVersion": 38, 1085 | "tags": [], 1086 | "templating": { 1087 | "list": [] 1088 | }, 1089 | "time": { 1090 | "from": "now-15m", 1091 | "to": "now" 1092 | }, 1093 | "timepicker": {}, 1094 | "timezone": "", 1095 | "title": "Wasm", 1096 | "uid": "gDIn6ERIk", 1097 | "version": 2, 1098 | "weekStart": "" 1099 | } -------------------------------------------------------------------------------- /grafana-dashboards/Kafka-Consumer-Offsets.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ 3 | { 4 | "name": "DS_PROMETHEUS", 5 | "label": "Prometheus", 6 | "description": "", 7 | "type": "datasource", 8 | "pluginId": "prometheus", 9 | "pluginName": "Prometheus" 10 | } 11 | ], 12 | "__elements": {}, 13 | "__requires": [ 14 | { 15 | "type": "grafana", 16 | "id": "grafana", 17 | "name": "Grafana", 18 | "version": "9.3.6" 19 | }, 20 | { 21 | "type": "panel", 22 | "id": "graph", 23 | "name": "Graph (old)", 24 | "version": "" 25 | }, 26 | { 27 | "type": "datasource", 28 | "id": "prometheus", 29 | "name": "Prometheus", 30 | "version": "1.0.0" 31 | }, 32 | { 33 | "type": "panel", 34 | "id": "stat", 35 | "name": "Stat", 36 | "version": "" 37 | }, 38 | { 39 | "type": "panel", 40 | "id": "table", 41 | "name": "Table", 42 | "version": "" 43 | }, 
44 | { 45 | "type": "panel", 46 | "id": "timeseries", 47 | "name": "Time series", 48 | "version": "" 49 | } 50 | ], 51 | "annotations": { 52 | "list": [ 53 | { 54 | "builtIn": 1, 55 | "datasource": { 56 | "type": "datasource", 57 | "uid": "grafana" 58 | }, 59 | "enable": true, 60 | "hide": true, 61 | "iconColor": "rgba(0, 211, 255, 1)", 62 | "name": "Annotations & Alerts", 63 | "target": { 64 | "limit": 100, 65 | "matchAny": false, 66 | "tags": [], 67 | "type": "dashboard" 68 | }, 69 | "type": "dashboard" 70 | } 71 | ] 72 | }, 73 | "editable": true, 74 | "fiscalYearStartMonth": 0, 75 | "graphTooltip": 1, 76 | "id": null, 77 | "links": [], 78 | "liveNow": false, 79 | "panels": [ 80 | { 81 | "datasource": { 82 | "type": "prometheus", 83 | "uid": "${DS_PROMETHEUS}" 84 | }, 85 | "fieldConfig": { 86 | "defaults": { 87 | "color": { 88 | "mode": "palette-classic" 89 | }, 90 | "custom": { 91 | "axisCenteredZero": false, 92 | "axisColorMode": "text", 93 | "axisLabel": "", 94 | "axisPlacement": "auto", 95 | "barAlignment": 0, 96 | "drawStyle": "line", 97 | "fillOpacity": 10, 98 | "gradientMode": "none", 99 | "hideFrom": { 100 | "legend": false, 101 | "tooltip": false, 102 | "viz": false 103 | }, 104 | "lineInterpolation": "linear", 105 | "lineWidth": 1, 106 | "pointSize": 5, 107 | "scaleDistribution": { 108 | "type": "linear" 109 | }, 110 | "showPoints": "never", 111 | "spanNulls": false, 112 | "stacking": { 113 | "group": "A", 114 | "mode": "none" 115 | }, 116 | "thresholdsStyle": { 117 | "mode": "off" 118 | } 119 | }, 120 | "mappings": [], 121 | "min": 0, 122 | "thresholds": { 123 | "mode": "absolute", 124 | "steps": [ 125 | { 126 | "color": "green", 127 | "value": null 128 | }, 129 | { 130 | "color": "red", 131 | "value": 80 132 | } 133 | ] 134 | }, 135 | "unit": "short" 136 | }, 137 | "overrides": [] 138 | }, 139 | "gridPos": { 140 | "h": 7, 141 | "w": 12, 142 | "x": 0, 143 | "y": 0 144 | }, 145 | "id": 11, 146 | "options": { 147 | "legend": { 148 | "calcs": [], 149 | 
"displayMode": "list", 150 | "placement": "bottom", 151 | "showLegend": true 152 | }, 153 | "tooltip": { 154 | "mode": "multi", 155 | "sort": "none" 156 | } 157 | }, 158 | "pluginVersion": "", 159 | "targets": [ 160 | { 161 | "datasource": { 162 | "type": "prometheus", 163 | "uid": "${DS_PROMETHEUS}" 164 | }, 165 | "editorMode": "code", 166 | "exemplar": true, 167 | "expr": "sum by([[aggr_criteria]] , redpanda_group) (max by(redpanda_namespace, redpanda_topic, redpanda_partition)(redpanda_kafka_max_offset{redpanda_namespace=\"kafka\", redpanda_topic=~\"[[redpanda_topic]]\"}) - on(redpanda_topic, redpanda_partition) group_right max by(redpanda_group, redpanda_topic, redpanda_partition)(redpanda_kafka_consumer_group_committed_offset{redpanda_group=~\"[[redpanda_group]]\"}) +1)", 168 | "interval": "", 169 | "legendFormat": "Group: {{redpanda_group}}, Topic: {{redpanda_topic}}, Partition: {{redpanda_partition}}", 170 | "range": true, 171 | "refId": "A" 172 | } 173 | ], 174 | "title": "Consumer Group Offset Lag", 175 | "type": "timeseries" 176 | }, 177 | { 178 | "datasource": { 179 | "type": "prometheus", 180 | "uid": "${DS_PROMETHEUS}" 181 | }, 182 | "fieldConfig": { 183 | "defaults": { 184 | "color": { 185 | "mode": "thresholds" 186 | }, 187 | "mappings": [], 188 | "thresholds": { 189 | "mode": "absolute", 190 | "steps": [ 191 | { 192 | "color": "green", 193 | "value": null 194 | }, 195 | { 196 | "color": "red", 197 | "value": 80 198 | } 199 | ] 200 | } 201 | }, 202 | "overrides": [] 203 | }, 204 | "gridPos": { 205 | "h": 7, 206 | "w": 3, 207 | "x": 12, 208 | "y": 0 209 | }, 210 | "id": 19, 211 | "options": { 212 | "colorMode": "value", 213 | "graphMode": "none", 214 | "justifyMode": "auto", 215 | "orientation": "auto", 216 | "reduceOptions": { 217 | "calcs": [ 218 | "lastNotNull" 219 | ], 220 | "fields": "", 221 | "values": false 222 | }, 223 | "textMode": "auto" 224 | }, 225 | "pluginVersion": "9.3.6", 226 | "targets": [ 227 | { 228 | "datasource": { 229 | "type": 
"prometheus", 230 | "uid": "${DS_PROMETHEUS}" 231 | }, 232 | "expr": "count(count by(redpanda_group)(redpanda_kafka_consumer_group_committed_offset{redpanda_topic=~\"[[redpanda_topic]]\", redpanda_group=~\"[[redpanda_group]]\", redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}))", 233 | "refId": "A" 234 | } 235 | ], 236 | "title": "Consumer Groups", 237 | "type": "stat" 238 | }, 239 | { 240 | "datasource": { 241 | "type": "prometheus", 242 | "uid": "${DS_PROMETHEUS}" 243 | }, 244 | "fieldConfig": { 245 | "defaults": { 246 | "color": { 247 | "mode": "thresholds" 248 | }, 249 | "mappings": [], 250 | "min": 0, 251 | "thresholds": { 252 | "mode": "percentage", 253 | "steps": [ 254 | { 255 | "color": "green", 256 | "value": null 257 | } 258 | ] 259 | }, 260 | "unit": "short" 261 | }, 262 | "overrides": [] 263 | }, 264 | "gridPos": { 265 | "h": 7, 266 | "w": 3, 267 | "x": 15, 268 | "y": 0 269 | }, 270 | "id": 23, 271 | "options": { 272 | "colorMode": "value", 273 | "graphMode": "area", 274 | "justifyMode": "auto", 275 | "orientation": "auto", 276 | "reduceOptions": { 277 | "calcs": [ 278 | "lastNotNull" 279 | ], 280 | "fields": "", 281 | "values": false 282 | }, 283 | "textMode": "auto" 284 | }, 285 | "pluginVersion": "9.3.6", 286 | "targets": [ 287 | { 288 | "datasource": { 289 | "type": "prometheus", 290 | "uid": "${DS_PROMETHEUS}" 291 | }, 292 | "editorMode": "code", 293 | "exemplar": true, 294 | "expr": "count(count(redpanda_kafka_max_offset{redpanda_topic=~\"[[redpanda_topic]]\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}) by (redpanda_topic, redpanda_partition))", 295 | "interval": "", 296 | "legendFormat": "group: {{group}}, redpanda_topic: {{redpanda_topic}}, partition: {{partition}}", 297 | "range": true, 298 | "refId": "A" 299 | } 300 | ], 301 | "title": "Partitions", 302 | "type": "stat" 303 | }, 304 | { 305 | "datasource": { 306 | "type": "prometheus", 307 | "uid": "${DS_PROMETHEUS}" 308 | }, 309 | "fieldConfig": { 310 | "defaults": { 311 | 
"color": { 312 | "mode": "thresholds" 313 | }, 314 | "mappings": [], 315 | "min": 0, 316 | "thresholds": { 317 | "mode": "percentage", 318 | "steps": [ 319 | { 320 | "color": "green", 321 | "value": null 322 | }, 323 | { 324 | "color": "red", 325 | "value": 80 326 | } 327 | ] 328 | }, 329 | "unit": "short" 330 | }, 331 | "overrides": [] 332 | }, 333 | "gridPos": { 334 | "h": 7, 335 | "w": 3, 336 | "x": 18, 337 | "y": 0 338 | }, 339 | "id": 25, 340 | "options": { 341 | "colorMode": "value", 342 | "graphMode": "area", 343 | "justifyMode": "auto", 344 | "orientation": "auto", 345 | "reduceOptions": { 346 | "calcs": [ 347 | "lastNotNull" 348 | ], 349 | "fields": "", 350 | "values": false 351 | }, 352 | "textMode": "auto" 353 | }, 354 | "pluginVersion": "9.3.6", 355 | "targets": [ 356 | { 357 | "datasource": { 358 | "type": "prometheus", 359 | "uid": "${DS_PROMETHEUS}" 360 | }, 361 | "editorMode": "code", 362 | "exemplar": true, 363 | "expr": "sum(max by(redpanda_namespace, redpanda_topic, redpanda_partition)(redpanda_kafka_max_offset{redpanda_namespace=\"kafka\", redpanda_topic=~\"[[redpanda_topic]]\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}) - on(redpanda_topic, redpanda_partition) group_right max by(redpanda_group, redpanda_topic, redpanda_partition)(redpanda_kafka_consumer_group_committed_offset{redpanda_group=~\"[[redpanda_group]]\", redpanda_topic=~\"[[redpanda_topic]]\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}) + 1)", 364 | "interval": "", 365 | "legendFormat": "group: {{group}}, redpanda_topic: {{redpanda_topic}}, partition: {{partition}}", 366 | "range": true, 367 | "refId": "A" 368 | } 369 | ], 370 | "title": "Consumer Group Offset Lag", 371 | "type": "stat" 372 | }, 373 | { 374 | "datasource": { 375 | "type": "prometheus", 376 | "uid": "${DS_PROMETHEUS}" 377 | }, 378 | "description": "Only applies when topics are being written to (to understand the write-rate and compute the offset against that)", 379 | "fieldConfig": { 380 | 
"defaults": { 381 | "color": { 382 | "mode": "thresholds" 383 | }, 384 | "mappings": [], 385 | "min": 0, 386 | "thresholds": { 387 | "mode": "percentage", 388 | "steps": [ 389 | { 390 | "color": "green", 391 | "value": null 392 | }, 393 | { 394 | "color": "red", 395 | "value": 80 396 | } 397 | ] 398 | }, 399 | "unit": "s" 400 | }, 401 | "overrides": [] 402 | }, 403 | "gridPos": { 404 | "h": 7, 405 | "w": 3, 406 | "x": 21, 407 | "y": 0 408 | }, 409 | "id": 24, 410 | "options": { 411 | "colorMode": "value", 412 | "graphMode": "area", 413 | "justifyMode": "auto", 414 | "orientation": "auto", 415 | "reduceOptions": { 416 | "calcs": [ 417 | "lastNotNull" 418 | ], 419 | "fields": "", 420 | "values": false 421 | }, 422 | "textMode": "auto" 423 | }, 424 | "pluginVersion": "9.3.6", 425 | "targets": [ 426 | { 427 | "datasource": { 428 | "type": "prometheus", 429 | "uid": "${DS_PROMETHEUS}" 430 | }, 431 | "editorMode": "code", 432 | "exemplar": true, 433 | "expr": "avg\n(\n (max by(redpanda_namespace, redpanda_topic, redpanda_partition)(redpanda_kafka_max_offset{redpanda_namespace=\"kafka\", redpanda_topic!=\"__consumer_offsets\"}) - on (redpanda_topic, redpanda_partition) group_right max by(redpanda_group,redpanda_topic, redpanda_partition) (redpanda_kafka_consumer_group_committed_offset{redpanda_group=~\"[[redpanda_group]]\",redpanda_topic=~\"[[redpanda_topic]]\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"})\n )\n / on (redpanda_topic, redpanda_partition) group_right\n max by(redpanda_namespace, redpanda_topic, redpanda_partition) (rate(redpanda_kafka_max_offset{redpanda_topic=~\"[[redpanda_topic]]\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}[$__rate_interval])!=0) \n)", 434 | "interval": "", 435 | "legendFormat": "group: {{group}}, redpanda_topic: {{redpanda_topic}}, partition: {{partition}}", 436 | "range": true, 437 | "refId": "A" 438 | } 439 | ], 440 | "title": "Offset Lag (time)", 441 | "type": "stat" 442 | }, 443 | { 444 | "datasource": { 445 | 
"type": "prometheus", 446 | "uid": "${DS_PROMETHEUS}" 447 | }, 448 | "fieldConfig": { 449 | "defaults": { 450 | "color": { 451 | "mode": "thresholds" 452 | }, 453 | "custom": { 454 | "align": "auto", 455 | "displayMode": "auto", 456 | "filterable": false, 457 | "inspect": false 458 | }, 459 | "mappings": [], 460 | "min": 0, 461 | "thresholds": { 462 | "mode": "absolute", 463 | "steps": [ 464 | { 465 | "color": "green", 466 | "value": null 467 | }, 468 | { 469 | "color": "red", 470 | "value": 80 471 | } 472 | ] 473 | }, 474 | "unit": "short" 475 | }, 476 | "overrides": [ 477 | { 478 | "matcher": { 479 | "id": "byName", 480 | "options": "partition" 481 | }, 482 | "properties": [ 483 | { 484 | "id": "custom.width", 485 | "value": 79 486 | } 487 | ] 488 | }, 489 | { 490 | "matcher": { 491 | "id": "byName", 492 | "options": "group" 493 | }, 494 | "properties": [ 495 | { 496 | "id": "custom.width", 497 | "value": 166 498 | } 499 | ] 500 | }, 501 | { 502 | "matcher": { 503 | "id": "byName", 504 | "options": "redpanda_topic" 505 | }, 506 | "properties": [ 507 | { 508 | "id": "custom.width", 509 | "value": 128 510 | } 511 | ] 512 | }, 513 | { 514 | "matcher": { 515 | "id": "byName", 516 | "options": "Value" 517 | }, 518 | "properties": [ 519 | { 520 | "id": "custom.width", 521 | "value": 52 522 | } 523 | ] 524 | }, 525 | { 526 | "matcher": { 527 | "id": "byName", 528 | "options": "redpanda_partition" 529 | }, 530 | "properties": [ 531 | { 532 | "id": "custom.width", 533 | "value": 110 534 | } 535 | ] 536 | }, 537 | { 538 | "matcher": { 539 | "id": "byName", 540 | "options": "redpanda_group" 541 | }, 542 | "properties": [ 543 | { 544 | "id": "custom.width", 545 | "value": 117 546 | } 547 | ] 548 | }, 549 | { 550 | "matcher": { 551 | "id": "byName", 552 | "options": "Lag" 553 | }, 554 | "properties": [ 555 | { 556 | "id": "custom.width", 557 | "value": 140 558 | } 559 | ] 560 | } 561 | ] 562 | }, 563 | "gridPos": { 564 | "h": 8, 565 | "w": 7, 566 | "x": 0, 567 | "y": 7 568 | 
}, 569 | "id": 15, 570 | "options": { 571 | "footer": { 572 | "enablePagination": false, 573 | "fields": "", 574 | "reducer": [ 575 | "sum" 576 | ], 577 | "show": false 578 | }, 579 | "showHeader": true, 580 | "sortBy": [] 581 | }, 582 | "pluginVersion": "9.3.6", 583 | "targets": [ 584 | { 585 | "datasource": { 586 | "type": "prometheus", 587 | "uid": "${DS_PROMETHEUS}" 588 | }, 589 | "editorMode": "code", 590 | "exemplar": false, 591 | "expr": "sum by([[aggr_criteria]] , redpanda_group) (max by(redpanda_namespace, redpanda_topic, redpanda_partition)(redpanda_kafka_max_offset{redpanda_namespace=\"kafka\", redpanda_topic=~\"[[redpanda_topic]]\"}) - on(redpanda_topic, redpanda_partition) group_right max by(redpanda_group, redpanda_topic, redpanda_partition)(redpanda_kafka_consumer_group_committed_offset{redpanda_group=~\"[[redpanda_group]]\"}) +1)", 592 | "format": "table", 593 | "instant": true, 594 | "interval": "", 595 | "legendFormat": "__auto", 596 | "range": false, 597 | "refId": "A" 598 | } 599 | ], 600 | "title": "Consumer Group Offset Lag", 601 | "transformations": [ 602 | { 603 | "id": "organize", 604 | "options": { 605 | "excludeByName": { 606 | "Time": true, 607 | "instance": true, 608 | "job": true, 609 | "shard": true 610 | }, 611 | "indexByName": { 612 | "Time": 1, 613 | "Value": 4, 614 | "redpanda_group": 2, 615 | "redpanda_partition": 3, 616 | "redpanda_topic": 0 617 | }, 618 | "renameByName": { 619 | "Value": "Lag", 620 | "redpanda_group": "Group", 621 | "redpanda_partition": "Partition", 622 | "redpanda_topic": "Topic" 623 | } 624 | } 625 | } 626 | ], 627 | "type": "table" 628 | }, 629 | { 630 | "datasource": { 631 | "type": "prometheus", 632 | "uid": "${DS_PROMETHEUS}" 633 | }, 634 | "fieldConfig": { 635 | "defaults": { 636 | "color": { 637 | "mode": "thresholds" 638 | }, 639 | "custom": { 640 | "align": "auto", 641 | "displayMode": "auto", 642 | "inspect": false 643 | }, 644 | "mappings": [], 645 | "thresholds": { 646 | "mode": "absolute", 647 | 
"steps": [ 648 | { 649 | "color": "green", 650 | "value": null 651 | }, 652 | { 653 | "color": "red", 654 | "value": 80 655 | } 656 | ] 657 | } 658 | }, 659 | "overrides": [ 660 | { 661 | "matcher": { 662 | "id": "byName", 663 | "options": "Value" 664 | }, 665 | "properties": [ 666 | { 667 | "id": "custom.width", 668 | "value": 73 669 | } 670 | ] 671 | }, 672 | { 673 | "matcher": { 674 | "id": "byName", 675 | "options": "Group" 676 | }, 677 | "properties": [ 678 | { 679 | "id": "custom.width", 680 | "value": 146 681 | } 682 | ] 683 | }, 684 | { 685 | "matcher": { 686 | "id": "byName", 687 | "options": "Topic" 688 | }, 689 | "properties": [ 690 | { 691 | "id": "custom.width", 692 | "value": 135 693 | } 694 | ] 695 | }, 696 | { 697 | "matcher": { 698 | "id": "byName", 699 | "options": "Consumption Rate" 700 | }, 701 | "properties": [ 702 | { 703 | "id": "custom.width", 704 | "value": 104 705 | } 706 | ] 707 | } 708 | ] 709 | }, 710 | "gridPos": { 711 | "h": 8, 712 | "w": 7, 713 | "x": 7, 714 | "y": 7 715 | }, 716 | "id": 22, 717 | "options": { 718 | "footer": { 719 | "fields": "", 720 | "reducer": [ 721 | "sum" 722 | ], 723 | "show": false 724 | }, 725 | "showHeader": true, 726 | "sortBy": [] 727 | }, 728 | "pluginVersion": "9.3.6", 729 | "targets": [ 730 | { 731 | "datasource": { 732 | "type": "prometheus", 733 | "uid": "${DS_PROMETHEUS}" 734 | }, 735 | "editorMode": "code", 736 | "exemplar": false, 737 | "expr": "sum by(redpanda_group, redpanda_topic) (rate(redpanda_kafka_consumer_group_committed_offset{redpanda_topic=~\"[[redpanda_topic]]\", redpanda_group=~\"[[redpanda_group]]\", redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}[1m]))", 738 | "format": "table", 739 | "instant": true, 740 | "legendFormat": "Group: {{redpanda_group}}", 741 | "range": false, 742 | "refId": "A" 743 | } 744 | ], 745 | "title": "Topic Consumption Rate by Group", 746 | "transformations": [ 747 | { 748 | "id": "organize", 749 | "options": { 750 | "excludeByName": { 751 | "Time": 
true 752 | }, 753 | "indexByName": {}, 754 | "renameByName": { 755 | "Value": "Consumption Rate", 756 | "redpanda_group": "Group", 757 | "redpanda_topic": "Topic" 758 | } 759 | } 760 | } 761 | ], 762 | "type": "table" 763 | }, 764 | { 765 | "datasource": { 766 | "type": "prometheus", 767 | "uid": "${DS_PROMETHEUS}" 768 | }, 769 | "fieldConfig": { 770 | "defaults": { 771 | "color": { 772 | "mode": "palette-classic" 773 | }, 774 | "custom": { 775 | "axisCenteredZero": false, 776 | "axisColorMode": "text", 777 | "axisLabel": "", 778 | "axisPlacement": "auto", 779 | "barAlignment": 0, 780 | "drawStyle": "line", 781 | "fillOpacity": 0, 782 | "gradientMode": "none", 783 | "hideFrom": { 784 | "legend": false, 785 | "tooltip": false, 786 | "viz": false 787 | }, 788 | "lineInterpolation": "linear", 789 | "lineStyle": { 790 | "fill": "solid" 791 | }, 792 | "lineWidth": 1, 793 | "pointSize": 5, 794 | "scaleDistribution": { 795 | "type": "linear" 796 | }, 797 | "showPoints": "never", 798 | "spanNulls": false, 799 | "stacking": { 800 | "group": "A", 801 | "mode": "normal" 802 | }, 803 | "thresholdsStyle": { 804 | "mode": "off" 805 | } 806 | }, 807 | "mappings": [], 808 | "thresholds": { 809 | "mode": "absolute", 810 | "steps": [ 811 | { 812 | "color": "green", 813 | "value": null 814 | }, 815 | { 816 | "color": "red", 817 | "value": 80 818 | } 819 | ] 820 | } 821 | }, 822 | "overrides": [] 823 | }, 824 | "gridPos": { 825 | "h": 8, 826 | "w": 5, 827 | "x": 14, 828 | "y": 7 829 | }, 830 | "id": 20, 831 | "options": { 832 | "legend": { 833 | "calcs": [], 834 | "displayMode": "list", 835 | "placement": "bottom", 836 | "showLegend": true 837 | }, 838 | "tooltip": { 839 | "mode": "single", 840 | "sort": "none" 841 | } 842 | }, 843 | "targets": [ 844 | { 845 | "datasource": { 846 | "type": "prometheus", 847 | "uid": "${DS_PROMETHEUS}" 848 | }, 849 | "editorMode": "code", 850 | "expr": "sum by([[aggr_criteria]] ) 
(rate(redpanda_kafka_max_offset{redpanda_topic=~\"[[redpanda_topic]]\",redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}[$__rate_interval]))", 851 | "legendFormat": "Topic: {{redpanda_topic}}, Partition: {{redpanda_partition}}", 852 | "range": true, 853 | "refId": "A" 854 | } 855 | ], 856 | "title": "Topic Production Rate", 857 | "type": "timeseries" 858 | }, 859 | { 860 | "datasource": { 861 | "type": "prometheus", 862 | "uid": "${DS_PROMETHEUS}" 863 | }, 864 | "fieldConfig": { 865 | "defaults": { 866 | "color": { 867 | "mode": "palette-classic" 868 | }, 869 | "custom": { 870 | "axisCenteredZero": false, 871 | "axisColorMode": "text", 872 | "axisLabel": "", 873 | "axisPlacement": "auto", 874 | "barAlignment": 0, 875 | "drawStyle": "line", 876 | "fillOpacity": 0, 877 | "gradientMode": "none", 878 | "hideFrom": { 879 | "legend": false, 880 | "tooltip": false, 881 | "viz": false 882 | }, 883 | "lineInterpolation": "linear", 884 | "lineWidth": 1, 885 | "pointSize": 5, 886 | "scaleDistribution": { 887 | "type": "linear" 888 | }, 889 | "showPoints": "never", 890 | "spanNulls": false, 891 | "stacking": { 892 | "group": "A", 893 | "mode": "none" 894 | }, 895 | "thresholdsStyle": { 896 | "mode": "off" 897 | } 898 | }, 899 | "mappings": [], 900 | "thresholds": { 901 | "mode": "absolute", 902 | "steps": [ 903 | { 904 | "color": "green", 905 | "value": null 906 | }, 907 | { 908 | "color": "red", 909 | "value": 80 910 | } 911 | ] 912 | } 913 | }, 914 | "overrides": [] 915 | }, 916 | "gridPos": { 917 | "h": 8, 918 | "w": 5, 919 | "x": 19, 920 | "y": 7 921 | }, 922 | "id": 17, 923 | "options": { 924 | "legend": { 925 | "calcs": [], 926 | "displayMode": "list", 927 | "placement": "bottom", 928 | "showLegend": true 929 | }, 930 | "tooltip": { 931 | "mode": "single", 932 | "sort": "none" 933 | } 934 | }, 935 | "targets": [ 936 | { 937 | "datasource": { 938 | "type": "prometheus", 939 | "uid": "${DS_PROMETHEUS}" 940 | }, 941 | "editorMode": "code", 942 | "expr": "sum 
by([[aggr_criteria]], redpanda_group ) (rate(redpanda_kafka_consumer_group_committed_offset{redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}[$__rate_interval]))", 943 | "legendFormat": "group: {{redpanda_group}}, topic: {{redpanda_topic}}, partition: {{redpanda_partition}}", 944 | "range": true, 945 | "refId": "A" 946 | } 947 | ], 948 | "title": "Topic Consumption Rate by Group", 949 | "type": "timeseries" 950 | }, 951 | { 952 | "aliasColors": {}, 953 | "bars": false, 954 | "dashLength": 10, 955 | "dashes": false, 956 | "datasource": { 957 | "type": "prometheus", 958 | "uid": "${DS_PROMETHEUS}" 959 | }, 960 | "fill": 1, 961 | "fillGradient": 0, 962 | "gridPos": { 963 | "h": 8, 964 | "w": 12, 965 | "x": 0, 966 | "y": 15 967 | }, 968 | "hiddenSeries": false, 969 | "id": 2, 970 | "legend": { 971 | "avg": false, 972 | "current": false, 973 | "max": false, 974 | "min": false, 975 | "show": true, 976 | "total": false, 977 | "values": false 978 | }, 979 | "lines": true, 980 | "linewidth": 1, 981 | "nullPointMode": "null", 982 | "options": { 983 | "alertThreshold": true 984 | }, 985 | "percentage": false, 986 | "pluginVersion": "9.3.6", 987 | "pointradius": 2, 988 | "points": false, 989 | "renderer": "flot", 990 | "seriesOverrides": [], 991 | "spaceLength": 10, 992 | "stack": false, 993 | "steppedLine": false, 994 | "targets": [ 995 | { 996 | "datasource": { 997 | "type": "prometheus", 998 | "uid": "${DS_PROMETHEUS}" 999 | }, 1000 | "editorMode": "code", 1001 | "exemplar": true, 1002 | "expr": "max by([[aggr_criteria]]) (rate(redpanda_kafka_max_offset{redpanda_topic=~\"[[redpanda_topic]]\", redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}[$__rate_interval]))", 1003 | "interval": "", 1004 | "legendFormat": "Topic: {{redpanda_topic}}, Partition: {{redpanda_partition}}", 1005 | "range": true, 1006 | "refId": "A" 1007 | } 1008 | ], 1009 | "thresholds": [], 1010 | "timeRegions": [], 1011 | "title": "Log End Offset Rate", 1012 | "tooltip": { 1013 | "shared": true, 
1014 | "sort": 0, 1015 | "value_type": "individual" 1016 | }, 1017 | "type": "graph", 1018 | "xaxis": { 1019 | "mode": "time", 1020 | "show": true, 1021 | "values": [] 1022 | }, 1023 | "yaxes": [ 1024 | { 1025 | "$$hashKey": "object:24", 1026 | "format": "short", 1027 | "logBase": 1, 1028 | "show": true 1029 | }, 1030 | { 1031 | "$$hashKey": "object:25", 1032 | "format": "short", 1033 | "logBase": 1, 1034 | "show": true 1035 | } 1036 | ], 1037 | "yaxis": { 1038 | "align": false 1039 | } 1040 | }, 1041 | { 1042 | "datasource": { 1043 | "type": "prometheus", 1044 | "uid": "${DS_PROMETHEUS}" 1045 | }, 1046 | "fieldConfig": { 1047 | "defaults": { 1048 | "color": { 1049 | "mode": "palette-classic" 1050 | }, 1051 | "custom": { 1052 | "axisCenteredZero": false, 1053 | "axisColorMode": "text", 1054 | "axisLabel": "", 1055 | "axisPlacement": "auto", 1056 | "barAlignment": 0, 1057 | "drawStyle": "line", 1058 | "fillOpacity": 10, 1059 | "gradientMode": "none", 1060 | "hideFrom": { 1061 | "legend": false, 1062 | "tooltip": false, 1063 | "viz": false 1064 | }, 1065 | "lineInterpolation": "linear", 1066 | "lineWidth": 1, 1067 | "pointSize": 5, 1068 | "scaleDistribution": { 1069 | "type": "linear" 1070 | }, 1071 | "showPoints": "never", 1072 | "spanNulls": false, 1073 | "stacking": { 1074 | "group": "A", 1075 | "mode": "normal" 1076 | }, 1077 | "thresholdsStyle": { 1078 | "mode": "off" 1079 | } 1080 | }, 1081 | "mappings": [], 1082 | "thresholds": { 1083 | "mode": "absolute", 1084 | "steps": [ 1085 | { 1086 | "color": "green", 1087 | "value": null 1088 | }, 1089 | { 1090 | "color": "red", 1091 | "value": 80 1092 | } 1093 | ] 1094 | }, 1095 | "unit": "short" 1096 | }, 1097 | "overrides": [] 1098 | }, 1099 | "gridPos": { 1100 | "h": 8, 1101 | "w": 12, 1102 | "x": 12, 1103 | "y": 15 1104 | }, 1105 | "id": 3, 1106 | "options": { 1107 | "legend": { 1108 | "calcs": [], 1109 | "displayMode": "list", 1110 | "placement": "bottom", 1111 | "showLegend": true 1112 | }, 1113 | "tooltip": { 
1114 | "mode": "multi", 1115 | "sort": "none" 1116 | } 1117 | }, 1118 | "pluginVersion": "", 1119 | "targets": [ 1120 | { 1121 | "datasource": { 1122 | "type": "prometheus", 1123 | "uid": "${DS_PROMETHEUS}" 1124 | }, 1125 | "editorMode": "builder", 1126 | "exemplar": true, 1127 | "expr": "max by([[aggr_criteria]], redpanda_group) (rate(redpanda_kafka_consumer_group_committed_offset{redpanda_topic=~\"[[redpanda_topic]]\", redpanda_group=~\"[[redpanda_group]]\", redpanda_cloud_data_cluster_name=~\"[[data_cluster]]\"}[$__rate_interval]))", 1128 | "interval": "", 1129 | "legendFormat": "Group: {{redpanda_group}}, Topic: {{redpanda_topic}}, Partition: {{redpanda_partition}}", 1130 | "range": true, 1131 | "refId": "A" 1132 | } 1133 | ], 1134 | "title": "Consumer Group Offsets Rate", 1135 | "type": "timeseries" 1136 | } 1137 | ], 1138 | "refresh": "30s", 1139 | "schemaVersion": 37, 1140 | "style": "dark", 1141 | "tags": [], 1142 | "templating": { 1143 | "list": [ 1144 | { 1145 | "current": { 1146 | "selected": false, 1147 | "text": "Prometheus", 1148 | "value": "Prometheus" 1149 | }, 1150 | "hide": 0, 1151 | "includeAll": false, 1152 | "label": "Data Source", 1153 | "multi": false, 1154 | "name": "DS_PROMETHEUS", 1155 | "options": [], 1156 | "query": "prometheus", 1157 | "refresh": 1, 1158 | "regex": "", 1159 | "skipUrlSync": false, 1160 | "type": "datasource" 1161 | }, 1162 | { 1163 | "allValue": "", 1164 | "current": {}, 1165 | "datasource": { 1166 | "type": "prometheus", 1167 | "uid": "${DS_PROMETHEUS}" 1168 | }, 1169 | "definition": "label_values(redpanda_kafka_consumer_group_topics{redpanda_topic !~ 'controller|group|__consumer_offsets'}, redpanda_group)", 1170 | "hide": 0, 1171 | "includeAll": true, 1172 | "label": "Group", 1173 | "multi": true, 1174 | "name": "redpanda_group", 1175 | "options": [], 1176 | "query": { 1177 | "query": "label_values(redpanda_kafka_consumer_group_topics{redpanda_topic !~ 'controller|group|__consumer_offsets'}, redpanda_group)", 1178 | 
"refId": "StandardVariableQuery" 1179 | }, 1180 | "refresh": 2, 1181 | "regex": "", 1182 | "skipUrlSync": false, 1183 | "sort": 0, 1184 | "tagValuesQuery": "", 1185 | "tagsQuery": "", 1186 | "type": "query", 1187 | "useTags": false 1188 | }, 1189 | { 1190 | "allValue": "", 1191 | "current": {}, 1192 | "datasource": { 1193 | "type": "prometheus", 1194 | "uid": "${DS_PROMETHEUS}" 1195 | }, 1196 | "definition": "label_values(redpanda_kafka_replicas{redpanda_topic !~ 'controller|group|__consumer_offsets', redpanda_namespace=\"kafka\"}, redpanda_topic)", 1197 | "hide": 0, 1198 | "includeAll": true, 1199 | "label": "Topic", 1200 | "multi": true, 1201 | "name": "redpanda_topic", 1202 | "options": [], 1203 | "query": { 1204 | "query": "label_values(redpanda_kafka_replicas{redpanda_topic !~ 'controller|group|__consumer_offsets', redpanda_namespace=\"kafka\"}, redpanda_topic)", 1205 | "refId": "StandardVariableQuery" 1206 | }, 1207 | "refresh": 2, 1208 | "regex": "", 1209 | "skipUrlSync": false, 1210 | "sort": 0, 1211 | "tagValuesQuery": "", 1212 | "tagsQuery": "", 1213 | "type": "query", 1214 | "useTags": false 1215 | }, 1216 | { 1217 | "current": { 1218 | "selected": true, 1219 | "text": "Topic", 1220 | "value": "redpanda_topic" 1221 | }, 1222 | "hide": 0, 1223 | "includeAll": false, 1224 | "label": "Aggregate by", 1225 | "multi": false, 1226 | "name": "aggr_criteria", 1227 | "options": [ 1228 | { 1229 | "selected": true, 1230 | "text": "Topic", 1231 | "value": "redpanda_topic" 1232 | }, 1233 | { 1234 | "selected": false, 1235 | "text": "Topic,Partition", 1236 | "value": "redpanda_topic,redpanda_partition" 1237 | } 1238 | ], 1239 | "query": "Topic : redpanda_topic,Topic\\,Partition : redpanda_topic\\,redpanda_partition", 1240 | "queryValue": "", 1241 | "skipUrlSync": false, 1242 | "type": "custom" 1243 | }, 1244 | { 1245 | "current": {}, 1246 | "datasource": { 1247 | "type": "prometheus", 1248 | "uid": "${DS_PROMETHEUS}" 1249 | }, 1250 | "definition": 
"label_values(redpanda_cloud_data_cluster_name)", 1251 | "hide": 0, 1252 | "includeAll": false, 1253 | "label": "Data cluster", 1254 | "multi": false, 1255 | "name": "data_cluster", 1256 | "options": [], 1257 | "query": { 1258 | "query": "label_values(redpanda_cloud_data_cluster_name)", 1259 | "refId": "StandardVariableQuery" 1260 | }, 1261 | "refresh": 1, 1262 | "regex": "", 1263 | "skipUrlSync": false, 1264 | "sort": 0, 1265 | "type": "query" 1266 | }, 1267 | { 1268 | "datasource": { 1269 | "type": "prometheus", 1270 | "uid": "${DS_PROMETHEUS}" 1271 | }, 1272 | "filters": [], 1273 | "hide": 0, 1274 | "name": "Filters", 1275 | "skipUrlSync": false, 1276 | "type": "adhoc" 1277 | } 1278 | ] 1279 | }, 1280 | "time": { 1281 | "from": "now-30m", 1282 | "to": "now" 1283 | }, 1284 | "timepicker": {}, 1285 | "timezone": "", 1286 | "title": "Kafka Consumer Offsets", 1287 | "uid": "rtW0EDenk", 1288 | "version": 1, 1289 | "weekStart": "" 1290 | } 1291 | --------------------------------------------------------------------------------