├── prometheus_rules.yaml ├── .gitignore ├── lib ├── rules.jsonnet ├── alerts.jsonnet └── dashboards.jsonnet ├── rules └── rules.libsonnet ├── mixin.libsonnet ├── .vale.ini ├── .lint ├── dashboards_out ├── .lint └── kubernetes-autoscaling-mixin-karpenter-act.json ├── dashboards ├── dashboards.libsonnet ├── cluster-autoscaler │ ├── util.libsonnet │ └── kubernetes-autoscaling-cluster-autoscaler.libsonnet ├── karpenter │ ├── util.libsonnet │ ├── karpenter-activity.libsonnet │ └── karpenter-performance.libsonnet ├── kubernetes │ ├── util.libsonnet │ ├── kubernetes-autoscaling-hpa.libsonnet │ └── kubernetes-autoscaling-pdb.libsonnet └── keda │ ├── util.libsonnet │ ├── keda-scaled-job.libsonnet │ └── keda-scaled-object.libsonnet ├── scripts ├── tools.go └── go.mod ├── jsonnetfile.json ├── .github └── workflows │ └── ci.yml ├── jsonnetfile.lock.json ├── Makefile ├── config.libsonnet ├── prometheus_alerts.yaml ├── tests └── tests.yaml ├── README.md ├── alerts └── alerts.libsonnet └── LICENSE /prometheus_rules.yaml: -------------------------------------------------------------------------------- 1 | "groups": [] 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vendor 2 | tmp 3 | ./dashboards_out/lint 4 | .vale 5 | -------------------------------------------------------------------------------- /lib/rules.jsonnet: -------------------------------------------------------------------------------- 1 | std.manifestYamlDoc((import '../mixin.libsonnet').prometheusRules) 2 | -------------------------------------------------------------------------------- /lib/alerts.jsonnet: -------------------------------------------------------------------------------- 1 | std.manifestYamlDoc((import '../mixin.libsonnet').prometheusAlerts) 2 | -------------------------------------------------------------------------------- /rules/rules.libsonnet: 
-------------------------------------------------------------------------------- 1 | { 2 | prometheusRules+:: { 3 | groups+: [], 4 | }, 5 | } 6 | -------------------------------------------------------------------------------- /mixin.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'alerts/alerts.libsonnet') + 2 | (import 'rules/rules.libsonnet') + 3 | (import 'dashboards/dashboards.libsonnet') + 4 | (import 'config.libsonnet') 5 | -------------------------------------------------------------------------------- /.vale.ini: -------------------------------------------------------------------------------- 1 | StylesPath = .vale/styles 2 | 3 | MinAlertLevel = error 4 | 5 | Packages = Readability, write-good, alex 6 | 7 | [*] 8 | BasedOnStyles = Readability, write-good, alex 9 | -------------------------------------------------------------------------------- /lib/dashboards.jsonnet: -------------------------------------------------------------------------------- 1 | local dashboards = (import '../mixin.libsonnet').grafanaDashboards; 2 | 3 | { 4 | [name]: dashboards[name] 5 | for name in std.objectFields(dashboards) 6 | } 7 | -------------------------------------------------------------------------------- /.lint: -------------------------------------------------------------------------------- 1 | --- 2 | exclusions: 3 | template-job-rule: 4 | reason: Jobs are not set to multi in our case. 5 | target-job-rule: 6 | reason: Some dashboard use recording rules 7 | template-instance-rule: 8 | reason: We don't use instances. 9 | panel-datasource-rule: 10 | reason: Using a datasource for each panel. 
11 | panel-title-description-rule: 12 | reason: TODO(adinhodovic) 13 | target-instance-rule: 14 | -------------------------------------------------------------------------------- /dashboards_out/.lint: -------------------------------------------------------------------------------- 1 | --- 2 | exclusions: 3 | template-job-rule: 4 | reason: Jobs are not set to multi in our case. 5 | target-job-rule: 6 | reason: Some dashboard use recording rules 7 | template-instance-rule: 8 | reason: We don't use instances. 9 | panel-datasource-rule: 10 | reason: Using a datasource for each panel. 11 | panel-title-description-rule: 12 | reason: TODO(adinhodovic) 13 | target-instance-rule: 14 | -------------------------------------------------------------------------------- /dashboards/dashboards.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'kubernetes/kubernetes-autoscaling-pdb.libsonnet') + 2 | (import 'kubernetes/kubernetes-autoscaling-hpa.libsonnet') + 3 | (import 'kubernetes/kubernetes-autoscaling-vpa.libsonnet') + 4 | (import 'cluster-autoscaler/kubernetes-autoscaling-cluster-autoscaler.libsonnet') + 5 | (import 'karpenter/karpenter-overview.libsonnet') + 6 | (import 'karpenter/karpenter-activity.libsonnet') + 7 | (import 'karpenter/karpenter-performance.libsonnet') + 8 | (import 'keda/keda-scaled-object.libsonnet') + 9 | (import 'keda/keda-scaled-job.libsonnet') 10 | -------------------------------------------------------------------------------- /scripts/tools.go: -------------------------------------------------------------------------------- 1 | //go:build tools 2 | // +build tools 3 | 4 | // Package tools tracks dependencies for tools that are used in the build process.
5 | // See https://github.com/golang/go/issues/25922 6 | package tools 7 | 8 | import ( 9 | _ "github.com/Kunde21/markdownfmt/v3/cmd/markdownfmt" 10 | _ "github.com/cloudflare/pint/cmd/pint" 11 | _ "github.com/errata-ai/vale/v3/cmd/vale" 12 | _ "github.com/google/go-jsonnet/cmd/jsonnet" 13 | _ "github.com/google/go-jsonnet/cmd/jsonnet-lint" 14 | _ "github.com/google/go-jsonnet/cmd/jsonnetfmt" 15 | _ "github.com/grafana/dashboard-linter" 16 | _ "github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb" 17 | _ "github.com/prometheus/prometheus/cmd/promtool" 18 | ) 19 | -------------------------------------------------------------------------------- /jsonnetfile.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/adinhodovic/mixin-utils.git", 8 | "subdir": "" 9 | } 10 | }, 11 | "version": "main", 12 | "name": "mm-utils" 13 | }, 14 | { 15 | "source": { 16 | "git": { 17 | "remote": "https://github.com/grafana/grafonnet.git", 18 | "subdir": "gen/grafonnet-latest" 19 | } 20 | }, 21 | "version": "main" 22 | }, 23 | { 24 | "source": { 25 | "git": { 26 | "remote": "https://github.com/jsonnet-libs/docsonnet.git", 27 | "subdir": "doc-util" 28 | } 29 | }, 30 | "version": "master" 31 | }, 32 | { 33 | "source": { 34 | "git": { 35 | "remote": "https://github.com/jsonnet-libs/xtd.git", 36 | "subdir": "" 37 | } 38 | }, 39 | "version": "master" 40 | } 41 | ], 42 | "legacyImports": true 43 | } 44 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | permissions: {} 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | jobs: 9 | matrix: 10 | runs-on: ubuntu-latest 11 | name: ${{ matrix.name }} 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | include: 16 | - name: Lint Alerts 17 
| run: make --always-make alerts-lint 18 | - name: Generate yaml 19 | run: make --always-make generate && git diff --exit-code 20 | - name: Lint Grafana Dashboards 21 | run: make --always-make dashboards-lint 22 | - name: Format Jsonnet 23 | run: make --always-make jsonnet-fmt && git diff --exit-code 24 | - name: Lint Jsonnet 25 | run: make --always-make jsonnet-lint 26 | - name: Format Markdown 27 | run: make --always-make markdownfmt && git diff --exit-code 28 | - name: Lint Markdown 29 | run: make --always-make vale && git diff --exit-code 30 | - name: Lint YAML 31 | run: make --always-make pint-lint 32 | - name: Run unit tests 33 | run: make --always-make test 34 | 35 | steps: 36 | - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 37 | with: 38 | persist-credentials: false 39 | - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 40 | with: 41 | go-version-file: scripts/go.mod 42 | cache-dependency-path: scripts/go.sum 43 | - run: ${{ matrix.run }} 44 | -------------------------------------------------------------------------------- /jsonnetfile.lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/adinhodovic/mixin-utils.git", 8 | "subdir": "" 9 | } 10 | }, 11 | "version": "2d7880588e2b94f547c20ba270594dd7ecdc2ed9", 12 | "sum": "/n6K29u+5LfCLOiSOD8uehfMrd4AQoZCaqYYg3oV6xU=", 13 | "name": "mm-utils" 14 | }, 15 | { 16 | "source": { 17 | "git": { 18 | "remote": "https://github.com/grafana/grafonnet.git", 19 | "subdir": "gen/grafonnet-latest" 20 | } 21 | }, 22 | "version": "7380c9c64fb973f34c3ec46265621a2b0dee0058", 23 | "sum": "V9vAj21qJOc2DlMPDgB1eEjSQU4A+sAA4AXuJ6bd4xc=" 24 | }, 25 | { 26 | "source": { 27 | "git": { 28 | "remote": "https://github.com/grafana/grafonnet.git", 29 | "subdir": "gen/grafonnet-v11.4.0" 30 | } 31 | }, 32 | "version": 
"7380c9c64fb973f34c3ec46265621a2b0dee0058", 33 | "sum": "aVAX09paQYNOoCSKVpuk1exVIyBoMt/C50QJI+Q/3nA=" 34 | }, 35 | { 36 | "source": { 37 | "git": { 38 | "remote": "https://github.com/jsonnet-libs/docsonnet.git", 39 | "subdir": "doc-util" 40 | } 41 | }, 42 | "version": "6ac6c69685b8c29c54515448eaca583da2d88150", 43 | "sum": "BrAL/k23jq+xy9oA7TWIhUx07dsA/QLm3g7ktCwe//U=" 44 | }, 45 | { 46 | "source": { 47 | "git": { 48 | "remote": "https://github.com/jsonnet-libs/xtd.git", 49 | "subdir": "" 50 | } 51 | }, 52 | "version": "4d7f8cb24d613430799f9d56809cc6964f35cea9", 53 | "sum": "hOrwkOx34tOXqoDVnwuI/Uf/dr9HFFSPWpDPOvnEGrk=" 54 | } 55 | ], 56 | "legacyImports": false 57 | } 58 | -------------------------------------------------------------------------------- /dashboards/cluster-autoscaler/util.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local dashboard = g.dashboard; 4 | 5 | local variable = dashboard.variable; 6 | local datasource = variable.datasource; 7 | local query = variable.query; 8 | 9 | { 10 | filters(config):: { 11 | local this = self, 12 | cluster: '%(clusterLabel)s="$cluster"' % config, 13 | job: 'job=~"$job"', 14 | 15 | base: ||| 16 | %(cluster)s, 17 | %(job)s 18 | ||| % this, 19 | }, 20 | 21 | variables(config):: { 22 | local this = self, 23 | 24 | local defaultFilters = $.filters(config), 25 | 26 | datasource: 27 | datasource.new( 28 | 'datasource', 29 | 'prometheus', 30 | ) + 31 | datasource.generalOptions.withLabel('Data source') + 32 | { 33 | current: { 34 | selected: true, 35 | text: config.datasourceName, 36 | value: config.datasourceName, 37 | }, 38 | }, 39 | 40 | cluster: 41 | query.new( 42 | config.clusterLabel, 43 | 'label_values(kube_pod_info{%(kubeStateMetricsSelector)s}, cluster)' % config, 44 | ) + 45 | query.withDatasourceFromVariable(this.datasource) + 46 | query.withSort() + 47 | 
query.generalOptions.withLabel('Cluster') + 48 | query.refresh.onLoad() + 49 | query.refresh.onTime() + 50 | ( 51 | if config.showMultiCluster 52 | then query.generalOptions.showOnDashboard.withLabelAndValue() 53 | else query.generalOptions.showOnDashboard.withNothing() 54 | ), 55 | 56 | job: 57 | query.new( 58 | 'job', 59 | 'label_values(cluster_autoscaler_last_activity{%(cluster)s}, job)' % defaultFilters, 60 | ) + 61 | query.withDatasourceFromVariable(this.datasource) + 62 | query.withSort(1) + 63 | query.generalOptions.withLabel('Job') + 64 | query.refresh.onLoad() + 65 | query.refresh.onTime(), 66 | }, 67 | } 68 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BIN_DIR ?= $(shell pwd)/tmp/bin 2 | 3 | JSONNET_VENDOR=vendor 4 | GRAFANA_DASHBOARD_LINTER_BIN=$(BIN_DIR)/dashboard-linter 5 | JB_BIN=$(BIN_DIR)/jb 6 | JSONNET_BIN=$(BIN_DIR)/jsonnet 7 | JSONNETLINT_BIN=$(BIN_DIR)/jsonnet-lint 8 | JSONNETFMT_BIN=$(BIN_DIR)/jsonnetfmt 9 | MD_FILES = $(shell find . 
\( -type d -name '.vale' -o -type d -name 'vendor' \) -prune -o -type f -name "*.md" -print) 10 | MARKDOWNFMT_BIN=$(BIN_DIR)/markdownfmt 11 | VALE_BIN=$(BIN_DIR)/vale 12 | PROMTOOL_BIN=$(BIN_DIR)/promtool 13 | PINT_BIN=$(BIN_DIR)/pint 14 | TOOLING=$(JB_BIN) $(JSONNETLINT_BIN) $(JSONNET_BIN) $(JSONNETFMT_BIN) $(PROMTOOL_BIN) $(GRAFANA_DASHBOARD_LINTER_BIN) $(MARKDOWNFMT_BIN) $(VALE_BIN) $(PINT_BIN) 15 | JSONNETFMT_ARGS=-n 2 --max-blank-lines 2 --string-style s --comment-style s 16 | SRC_DIR ?=dashboards 17 | OUT_DIR ?=dashboards_out 18 | 19 | .PHONY: all 20 | all: fmt generate lint test 21 | 22 | .PHONY: generate 23 | generate: prometheus_alerts.yaml prometheus_rules.yaml $(OUT_DIR) 24 | 25 | $(JSONNET_VENDOR): $(JB_BIN) jsonnetfile.json 26 | $(JB_BIN) install 27 | 28 | .PHONY: fmt 29 | fmt: jsonnet-fmt markdownfmt 30 | 31 | .PHONY: jsonnet-fmt 32 | jsonnet-fmt: $(JSONNETFMT_BIN) 33 | @find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ 34 | xargs -n 1 -- $(JSONNETFMT_BIN) $(JSONNETFMT_ARGS) -i 35 | 36 | .PHONY: markdownfmt 37 | markdownfmt: $(MARKDOWNFMT_BIN) 38 | @for file in $(MD_FILES); do $(MARKDOWNFMT_BIN) -w -gofmt $$file; done 39 | 40 | prometheus_alerts.yaml: $(JSONNET_BIN) mixin.libsonnet lib/alerts.jsonnet alerts/*.libsonnet 41 | @$(JSONNET_BIN) -J vendor -S lib/alerts.jsonnet > $@ 42 | 43 | prometheus_rules.yaml: $(JSONNET_BIN) mixin.libsonnet lib/rules.jsonnet rules/*.libsonnet 44 | @$(JSONNET_BIN) -J vendor -S lib/rules.jsonnet > $@ 45 | 46 | $(OUT_DIR): $(JSONNET_BIN) $(JSONNET_VENDOR) mixin.libsonnet lib/dashboards.jsonnet $(SRC_DIR)/*.libsonnet 47 | @mkdir -p $(OUT_DIR) 48 | @$(JSONNET_BIN) -J vendor -m $(OUT_DIR) lib/dashboards.jsonnet 49 | 50 | .PHONY: lint 51 | lint: jsonnet-lint alerts-lint dashboards-lint vale pint-lint 52 | 53 | .PHONY: jsonnet-lint 54 | jsonnet-lint: $(JSONNETLINT_BIN) $(JSONNET_VENDOR) 55 | @find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ 56 | xargs -n 1 -- $(JSONNETLINT_BIN) -J vendor 57 | 58 | .PHONY: alerts-lint 59 | alerts-lint: $(PROMTOOL_BIN) prometheus_alerts.yaml prometheus_rules.yaml 60 | @$(PROMTOOL_BIN) check rules prometheus_rules.yaml 61 | @$(PROMTOOL_BIN) check rules prometheus_alerts.yaml 62 | 63 | $(OUT_DIR)/.lint: $(OUT_DIR) 64 | @cp .lint $@ 65 | 66 | .PHONY: dashboards-lint 67 | dashboards-lint: $(GRAFANA_DASHBOARD_LINTER_BIN) $(OUT_DIR)/.lint 68 | # Replace $$interval:$$resolution var with $$__rate_interval to make dashboard-linter happy. 69 | @sed -i -e 's/$$interval:$$resolution/$$__rate_interval/g' $(OUT_DIR)/*.json 70 | @find $(OUT_DIR) -name '*.json' -print0 | xargs -n 1 -0 $(GRAFANA_DASHBOARD_LINTER_BIN) lint --strict 71 | 72 | .PHONY: vale 73 | vale: $(VALE_BIN) 74 | @$(VALE_BIN) sync && \ 75 | $(VALE_BIN) $(MD_FILES) 76 | 77 | .PHONY: pint-lint 78 | pint-lint: generate $(PINT_BIN) 79 | @# Pint will not exit with a non-zero status code if there are linting issues. 80 | @output=$$($(PINT_BIN) -n -o -l WARN lint prometheus_alerts.yaml prometheus_rules.yaml 2>&1); \ 81 | if [ -n "$$output" ]; then \ 82 | echo "\n$$output"; \ 83 | exit 1; \ 84 | fi 85 | 86 | .PHONY: clean 87 | clean: 88 | # Remove all files and directories ignored by git. 89 | git clean -Xfd . 
90 | 91 | .PHONY: test 92 | test: $(PROMTOOL_BIN) prometheus_alerts.yaml prometheus_rules.yaml 93 | @$(PROMTOOL_BIN) test rules tests/*.yaml 94 | 95 | $(BIN_DIR): 96 | mkdir -p $(BIN_DIR) 97 | 98 | $(TOOLING): $(BIN_DIR) 99 | @echo Installing tools from scripts/tools.go 100 | @cd scripts && go list -e -mod=mod -tags tools -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) % 101 | -------------------------------------------------------------------------------- /config.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local annotation = g.dashboard.annotation; 3 | 4 | { 5 | _config+:: { 6 | local this = self, 7 | // Bypasses grafana.com/dashboards validator 8 | bypassDashboardValidation: { 9 | __inputs: [], 10 | __requires: [], 11 | }, 12 | 13 | // Default datasource name 14 | datasourceName: 'default', 15 | 16 | // Opt-in to multiCluster dashboards by overriding this and the clusterLabel.
17 | showMultiCluster: false, 18 | clusterLabel: 'cluster', 19 | 20 | kubeStateMetricsSelector: 'job=~"kube-state-metrics"', 21 | 22 | grafanaUrl: 'https://grafana.com', 23 | 24 | pdbDashboardUid: 'kubernetes-autoscaling-mixin-pdb-jkwq', 25 | hpaDashboardUid: 'kubernetes-autoscaling-mixin-hpa-jkwq', 26 | vpaDashboardUid: 'kubernetes-autoscaling-mixin-vpa-jkwq', 27 | clusterAutoscalerDashboardUid: 'kubernetes-autoscaling-mixin-ca-jkwq', 28 | karpenterOverviewDashboardUid: 'kubernetes-autoscaling-mixin-kover-jkwq', 29 | karpenterActivityDashboardUid: 'kubernetes-autoscaling-mixin-kact-jkwq', 30 | karpenterPerformanceDashboardUid: 'kubernetes-autoscaling-mixin-kperf-jkwq', 31 | kedaScaledObjectDashboardUid: 'kubernetes-autoscaling-mixin-kedaso-jkwq', 32 | kedaScaledJobDashboardUid: 'kubernetes-autoscaling-mixin-kedasj-jkwq', 33 | 34 | vpa: { 35 | enabled: true, 36 | // Optional: If you want to aggregate the VPA by cluster, set it to true; this requires showMultiCluster to be true. 37 | clusterAggregation: false, 38 | // Optional: If your VPA names are not based only on the pod name and include a prefix, set it here.
39 | vpaPrefix: '', 40 | }, 41 | 42 | clusterAutoscaler: { 43 | enabled: true, 44 | clusterAutoscalerSelector: 'job="cluster-autoscaler"', 45 | 46 | nodeCountCapacityThreshold: 75, 47 | 48 | clusterAutoscalerDashboardUrl: '%s/d/%s/kubernetes-autoscaling-cluster-autoscaler' % [this.grafanaUrl, this.clusterAutoscalerDashboardUid], 49 | }, 50 | 51 | karpenter: { 52 | enabled: true, 53 | karpenterSelector: 'job="karpenter"', 54 | 55 | nodepoolCapacityThreshold: 75, 56 | nodeclaimTerminationThreshold: 60 * 20, 57 | 58 | karpenterOverviewDashboardUrl: '%s/d/%s/kubernetes-autoscaling-karpenter-overview' % [this.grafanaUrl, this.karpenterOverviewDashboardUid], 59 | karpenterActivityDashboardUrl: '%s/d/%s/kubernetes-autoscaling-karpenter-activity' % [this.grafanaUrl, this.karpenterActivityDashboardUid], 60 | karpenterPerformanceDashboardUrl: '%s/d/%s/kubernetes-autoscaling-karpenter-performance' % [this.grafanaUrl, this.karpenterPerformanceDashboardUid], 61 | }, 62 | 63 | keda: { 64 | enabled: true, 65 | 66 | kedaScaledObjectDashboardUrl: '%s/d/%s/kubernetes-autoscaling-keda-scaled-object' % [this.grafanaUrl, this.kedaScaledObjectDashboardUid], 67 | kedaScaledJobDashboardUrl: '%s/d/%s/kubernetes-autoscaling-keda-scaled-job' % [this.grafanaUrl, this.kedaScaledJobDashboardUid], 68 | 69 | kedaSelector: 'job="keda-operator"', 70 | 71 | // Default threshold for the KEDA scaler metrics latency, in seconds. 72 | scalerMetricsLatencyThreshold: '5', 73 | // The default threshold for scaled objects to be considered paused for too long. 74 | scaledObjectPausedThreshold: '25h', 75 | 76 | // Used to link to the workload dashboard from the scaled job dashboards. Allows viewing resource usage.
77 | k8sResourcesWorkloadDashboardUid: 'this-needs-to-be-customized', 78 | }, 79 | 80 | tags: ['kubernetes', 'autoscaling', 'kubernetes-autoscaling-mixin'], 81 | 82 | // Custom annotations to display in graphs 83 | annotation: { 84 | enabled: false, 85 | name: 'Custom Annotation', 86 | datasource: '-- Grafana --', 87 | iconColor: 'green', 88 | tags: [], 89 | }, 90 | 91 | customAnnotation:: if $._config.annotation.enabled then 92 | annotation.withName($._config.annotation.name) + 93 | annotation.withIconColor($._config.annotation.iconColor) + 94 | annotation.withHide(false) + 95 | annotation.datasource.withUid($._config.annotation.datasource) + 96 | annotation.target.withMatchAny(true) + 97 | annotation.target.withTags($._config.annotation.tags) + 98 | annotation.target.withType('tags') 99 | else {}, 100 | }, 101 | } 102 | -------------------------------------------------------------------------------- /dashboards/karpenter/util.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local dashboard = g.dashboard; 4 | 5 | local variable = dashboard.variable; 6 | local datasource = variable.datasource; 7 | local query = variable.query; 8 | 9 | { 10 | filters(config):: { 11 | local this = self, 12 | cluster: '%(clusterLabel)s="$cluster"' % config, 13 | job: 'job=~"$job"', 14 | region: 'region=~"$region"', 15 | zone: 'zone=~"$zone"', 16 | arch: 'arch=~"$arch"', 17 | os: 'os=~"$os"', 18 | instanceType: 'instance_type=~"$instance_type"', 19 | capacityType: 'capacity_type=~"$capacity_type"', 20 | nodepool: 'nodepool=~"$nodepool"', 21 | 22 | base: ||| 23 | %(cluster)s, 24 | %(job)s 25 | ||| % this, 26 | 27 | default: ||| 28 | %(base)s, 29 | %(nodepool)s 30 | ||| % this, 31 | 32 | withLocation: ||| 33 | %(default)s, 34 | %(region)s, 35 | %(zone)s 36 | ||| % this, 37 | 38 | full: ||| 39 | %(withLocation)s, 40 | %(arch)s, 41 | %(os)s, 42 | 
%(instanceType)s, 43 | %(capacityType)s 44 | ||| % this, 45 | }, 46 | 47 | variables(config):: { 48 | local this = self, 49 | 50 | local defaultFilters = $.filters(config), 51 | 52 | datasource: 53 | datasource.new( 54 | 'datasource', 55 | 'prometheus', 56 | ) + 57 | datasource.generalOptions.withLabel('Data source') + 58 | { 59 | current: { 60 | selected: true, 61 | text: config.datasourceName, 62 | value: config.datasourceName, 63 | }, 64 | }, 65 | 66 | cluster: 67 | query.new( 68 | config.clusterLabel, 69 | 'label_values(kube_pod_info{%(kubeStateMetricsSelector)s}, cluster)' % config, 70 | ) + 71 | query.withDatasourceFromVariable(this.datasource) + 72 | query.withSort() + 73 | query.generalOptions.withLabel('Cluster') + 74 | query.refresh.onLoad() + 75 | query.refresh.onTime() + 76 | ( 77 | if config.showMultiCluster 78 | then query.generalOptions.showOnDashboard.withLabelAndValue() 79 | else query.generalOptions.showOnDashboard.withNothing() 80 | ), 81 | 82 | job: 83 | query.new( 84 | 'job', 85 | 'label_values(karpenter_nodes_allocatable{%(cluster)s}, job)' % defaultFilters, 86 | ) + 87 | query.withDatasourceFromVariable(this.datasource) + 88 | query.withSort(1) + 89 | query.generalOptions.withLabel('Job') + 90 | query.refresh.onLoad() + 91 | query.refresh.onTime(), 92 | 93 | region: 94 | query.new( 95 | 'region', 96 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s}, region)' % defaultFilters, 97 | ) + 98 | query.withDatasourceFromVariable(this.datasource) + 99 | query.withSort() + 100 | query.generalOptions.withLabel('Region') + 101 | query.selectionOptions.withMulti(true) + 102 | query.selectionOptions.withIncludeAll(true) + 103 | query.refresh.onLoad() + 104 | query.refresh.onTime(), 105 | 106 | zone: 107 | query.new( 108 | 'zone', 109 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s}, zone)' % defaultFilters, 110 | ) + 111 | query.withDatasourceFromVariable(this.datasource) + 112 | query.withSort() + 113 | 
query.generalOptions.withLabel('Zone') + 114 | query.selectionOptions.withMulti(true) + 115 | query.selectionOptions.withIncludeAll(true) + 116 | query.refresh.onLoad() + 117 | query.refresh.onTime(), 118 | 119 | arch: 120 | query.new( 121 | 'arch', 122 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s, %(zone)s}, arch)' % defaultFilters, 123 | ) + 124 | query.withDatasourceFromVariable(this.datasource) + 125 | query.withSort() + 126 | query.generalOptions.withLabel('Architecture') + 127 | query.selectionOptions.withMulti(true) + 128 | query.selectionOptions.withIncludeAll(true) + 129 | query.refresh.onLoad() + 130 | query.refresh.onTime(), 131 | 132 | os: 133 | query.new( 134 | 'os', 135 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s, %(zone)s, %(arch)s}, os)' % defaultFilters, 136 | ) + 137 | query.withDatasourceFromVariable(this.datasource) + 138 | query.withSort(1) + 139 | query.generalOptions.withLabel('Operating System') + 140 | query.selectionOptions.withMulti(true) + 141 | query.selectionOptions.withIncludeAll(true) + 142 | query.refresh.onLoad() + 143 | query.refresh.onTime(), 144 | 145 | instanceType: 146 | query.new( 147 | 'instance_type', 148 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s, %(zone)s, %(arch)s, %(os)s}, instance_type)' % defaultFilters, 149 | ) + 150 | query.withDatasourceFromVariable(this.datasource) + 151 | query.withSort(1) + 152 | query.generalOptions.withLabel('Instance Type') + 153 | query.selectionOptions.withMulti(true) + 154 | query.selectionOptions.withIncludeAll(true) + 155 | query.refresh.onLoad() + 156 | query.refresh.onTime(), 157 | 158 | capacityType: 159 | query.new( 160 | 'capacity_type', 161 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s, %(zone)s, %(arch)s, %(os)s, %(instanceType)s}, capacity_type)' % defaultFilters, 162 | ) + 163 | query.withDatasourceFromVariable(this.datasource) + 164 | 
query.withSort(1) + 165 | query.generalOptions.withLabel('Capacity Type') + 166 | query.selectionOptions.withMulti(true) + 167 | query.selectionOptions.withIncludeAll(true) + 168 | query.refresh.onLoad() + 169 | query.refresh.onTime(), 170 | 171 | nodepool: 172 | query.new( 173 | 'nodepool', 174 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s, %(zone)s, %(arch)s, %(os)s, %(instanceType)s, %(capacityType)s}, nodepool)' % defaultFilters, 175 | ) + 176 | query.withDatasourceFromVariable(this.datasource) + 177 | query.withSort(1) + 178 | query.generalOptions.withLabel('Node Pool') + 179 | query.selectionOptions.withMulti(true) + 180 | query.selectionOptions.withIncludeAll(true) + 181 | query.refresh.onLoad() + 182 | query.refresh.onTime(), 183 | 184 | nodepoolSimple: 185 | query.new( 186 | 'nodepool', 187 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s}, nodepool)' % defaultFilters, 188 | ) + 189 | query.withDatasourceFromVariable(this.datasource) + 190 | query.withSort(1) + 191 | query.generalOptions.withLabel('Node Pool') + 192 | query.selectionOptions.withMulti(true) + 193 | query.selectionOptions.withIncludeAll(true) + 194 | query.refresh.onLoad() + 195 | query.refresh.onTime(), 196 | }, 197 | } 198 | -------------------------------------------------------------------------------- /prometheus_alerts.yaml: -------------------------------------------------------------------------------- 1 | "groups": 2 | - "name": "karpenter" 3 | "rules": 4 | - "alert": "KarpenterCloudProviderErrors" 5 | "annotations": 6 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance" 7 | "description": "The Karpenter provider {{ $labels.provider }} with the controller {{ $labels.controller }} has errors with the method {{ $labels.method }}." 8 | "summary": "Karpenter has Cloud Provider Errors." 
9 | "expr": | 10 | sum( 11 | increase( 12 | karpenter_cloudprovider_errors_total{ 13 | job="karpenter", 14 | controller!~"nodeclaim.termination|node.termination", 15 | error!="NodeClaimNotFoundError" 16 | }[5m] 17 | ) 18 | ) by (cluster, namespace, job, provider, controller, method) > 0 19 | "for": "5m" 20 | "labels": 21 | "severity": "warning" 22 | - "alert": "KarpenterNodeClaimsTerminationDurationHigh" 23 | "annotations": 24 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kact-jkwq/kubernetes-autoscaling-karpenter-activity" 25 | "description": "The average node claim termination duration in Karpenter has exceeded 20 minutes for more than 15 minutes in nodepool {{ $labels.nodepool }}. This may indicate cloud provider issues or improper instance termination handling." 26 | "summary": "Karpenter Node Claims Termination Duration is High." 27 | "expr": | 28 | sum( 29 | rate( 30 | karpenter_nodeclaims_termination_duration_seconds_sum{ 31 | job="karpenter" 32 | }[5m] 33 | ) 34 | ) by (cluster, namespace, job, nodepool) 35 | / 36 | sum( 37 | rate( 38 | karpenter_nodeclaims_termination_duration_seconds_count{ 39 | job="karpenter" 40 | }[5m] 41 | ) 42 | ) by (cluster, namespace, job, nodepool) > 1200 43 | "for": "15m" 44 | "labels": 45 | "severity": "warning" 46 | - "alert": "KarpenterNodepoolNearCapacity" 47 | "annotations": 48 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview" 49 | "description": "The resource {{ $labels.resource_type }} in the Karpenter node pool {{ $labels.nodepool }} is nearing its limit. Consider scaling or adding resources." 50 | "summary": "Karpenter Nodepool near capacity." 
51 | "expr": | 52 | sum ( 53 | karpenter_nodepools_usage{job="karpenter"} 54 | ) by (cluster, namespace, job, nodepool, resource_type) 55 | / 56 | sum ( 57 | karpenter_nodepools_limit{job="karpenter"} 58 | ) by (cluster, namespace, job, nodepool, resource_type) 59 | * 100 > 75 60 | "for": "15m" 61 | "labels": 62 | "severity": "warning" 63 | - "name": "cluster-autoscaler" 64 | "rules": 65 | - "alert": "ClusterAutoscalerNodeCountNearCapacity" 66 | "annotations": 67 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler" 68 | "description": "The node count for the cluster autoscaler job {{ $labels.job }} is reaching max limit. Consider scaling node groups." 69 | "summary": "Cluster Autoscaler Node Count near Capacity." 70 | "expr": | 71 | sum ( 72 | cluster_autoscaler_nodes_count{ 73 | job="cluster-autoscaler" 74 | } 75 | ) by (cluster, namespace, job) 76 | / 77 | sum ( 78 | cluster_autoscaler_max_nodes_count{ 79 | job="cluster-autoscaler" 80 | } 81 | ) by (cluster, namespace, job) 82 | * 100 > 75 83 | "for": "15m" 84 | "labels": 85 | "severity": "warning" 86 | - "alert": "ClusterAutoscalerUnschedulablePods" 87 | "annotations": 88 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler" 89 | "description": "The cluster currently has unschedulable pods, indicating resource shortages. Consider adding more nodes or increasing node group capacity." 
90 | "summary": "Pods Pending Scheduling - Cluster Node Group Scaling Required" 91 | "expr": | 92 | sum ( 93 | cluster_autoscaler_unschedulable_pods_count{ 94 | job="cluster-autoscaler" 95 | } 96 | ) by (cluster, namespace, job) 97 | > 0 98 | "for": "15m" 99 | "labels": 100 | "severity": "warning" 101 | - "name": "keda" 102 | "rules": 103 | - "alert": "KedaScaledJobErrors" 104 | "annotations": 105 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kedasj-jkwq/kubernetes-autoscaling-keda-scaled-job?var-scaled_job={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}" 106 | "description": "KEDA scaled jobs are experiencing errors. Check the scaled job {{ $labels.scaledObject }} in the namespace {{ $labels.exported_namespace }}." 107 | "summary": "Errors detected for KEDA scaled jobs." 108 | "expr": | 109 | sum( 110 | increase( 111 | keda_scaled_job_errors_total{ 112 | job="keda-operator" 113 | }[10m] 114 | ) 115 | ) by (cluster, job, exported_namespace, scaledObject) > 0 116 | "for": "1m" 117 | "labels": 118 | "severity": "warning" 119 | - "alert": "KedaScaledObjectErrors" 120 | "annotations": 121 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}" 122 | "description": "KEDA scaled objects are experiencing errors. Check the scaled object {{ $labels.scaledObject }} in the namespace {{ $labels.exported_namespace }}." 123 | "summary": "Errors detected for KEDA scaled objects." 
124 | "expr": | 125 | sum( 126 | increase( 127 | keda_scaled_object_errors_total{ 128 | job="keda-operator" 129 | }[10m] 130 | ) 131 | ) by (cluster, job, exported_namespace, scaledObject) > 0 132 | "for": "1m" 133 | "labels": 134 | "severity": "warning" 135 | - "alert": "KedaScalerLatencyHigh" 136 | "annotations": 137 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object={{ $labels.scaledObject }}&var-scaler={{ $labels.scaler }}" 138 | "description": "Metric latency for scaler {{ $labels.scaler }} for the object {{ $labels.scaledObject }} has exceeded acceptable limits." 139 | "summary": "High latency for KEDA scaler metrics." 140 | "expr": | 141 | avg( 142 | keda_scaler_metrics_latency_seconds{ 143 | job="keda-operator" 144 | } 145 | ) by (cluster, job, exported_namespace, scaledObject, scaler) > 5 146 | "for": "10m" 147 | "labels": 148 | "severity": "warning" 149 | - "alert": "KedaScaledObjectPaused" 150 | "annotations": 151 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}" 152 | "description": "The scaled object {{ $labels.scaledObject }} in namespace {{ $labels.exported_namespace }} is paused for longer than 25h. This may indicate a configuration issue or manual intervention." 153 | "summary": "KEDA scaled object is paused." 
154 | "expr": | 155 | max( 156 | keda_scaled_object_paused{ 157 | job="keda-operator" 158 | } 159 | ) by (cluster, job, exported_namespace, scaledObject) > 0 160 | "for": "25h" 161 | "labels": 162 | "severity": "warning" 163 | - "alert": "KedaScalerDetailErrors" 164 | "annotations": 165 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaler={{ $labels.scaler }}&var-scaled_object={{ $labels.scaledObject }}" 166 | "description": "Errors have occurred in the KEDA scaler {{ $labels.scaler }}. Investigate the scaler for the {{ $labels.type }} {{ $labels.scaledObject }} in namespace {{ $labels.exported_namespace }}." 167 | "summary": "Errors detected in KEDA scaler." 168 | "expr": | 169 | sum( 170 | increase( 171 | keda_scaler_detail_errors_total{ 172 | job="keda-operator" 173 | }[10m] 174 | ) 175 | ) by (cluster, job, exported_namespace, scaledObject, type, scaler) > 0 176 | "for": "1m" 177 | "labels": 178 | "severity": "warning" 179 | -------------------------------------------------------------------------------- /dashboards/kubernetes/util.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local dashboard = g.dashboard; 4 | 5 | local variable = dashboard.variable; 6 | local datasource = variable.datasource; 7 | local query = variable.query; 8 | 9 | { 10 | filters(config):: { 11 | local this = self, 12 | cluster: '%(clusterLabel)s="$cluster"' % config, 13 | clusterMulti: '%(clusterLabel)s=~"$cluster"' % config, 14 | clusterLabel: config.clusterLabel, 15 | job: 'job=~"$job"', 16 | namespace: 'namespace=~"$namespace"', 17 | container: 'container=~"$container"', 18 | 19 | // PDB 20 | pdb: 'poddisruptionbudget=~"$poddisruptionbudget"', 21 | 22 | // HPA 23 | hpa: 'horizontalpodautoscaler=~"$horizontalpodautoscaler"', 24 | hpaMetricName: 
'metric_name=~"$metric_name"', 25 | hpaMetricTargetType: 'metric_target_type=~"$metric_target_type"', 26 | 27 | // VPA 28 | vpa: 'verticalpodautoscaler=~"$verticalpodautoscaler"', 29 | vpaPrefix: config.vpa.vpaPrefix, 30 | 31 | base: ||| 32 | %(cluster)s, 33 | %(job)s, 34 | %(namespace)s 35 | ||| % this, 36 | 37 | baseMulti: ||| 38 | %(clusterMulti)s, 39 | %(job)s, 40 | %(namespace)s 41 | ||| % this, 42 | 43 | // PDB 44 | withPdb: ||| 45 | %(base)s, 46 | %(pdb)s 47 | ||| % this, 48 | 49 | // HPA 50 | withHpa: ||| 51 | %(base)s, 52 | %(hpa)s 53 | ||| % this, 54 | 55 | withHpaMetricName: ||| 56 | %(withHpa)s, 57 | %(hpaMetricName)s 58 | ||| % this, 59 | 60 | withHpaMetricTargetType: ||| 61 | %(withHpaMetricName)s, 62 | %(hpaMetricTargetType)s 63 | ||| % this, 64 | 65 | // VPA 66 | withVpa: ||| 67 | %(base)s, 68 | %(vpa)s, 69 | %(container)s 70 | ||| % this, 71 | }, 72 | 73 | variables(config):: { 74 | local this = self, 75 | 76 | local defaultFilters = $.filters(config), 77 | 78 | datasource: 79 | datasource.new( 80 | 'datasource', 81 | 'prometheus', 82 | ) + 83 | datasource.generalOptions.withLabel('Data source') + 84 | { 85 | current: { 86 | selected: true, 87 | text: config.datasourceName, 88 | value: config.datasourceName, 89 | }, 90 | }, 91 | 92 | cluster: 93 | query.new( 94 | config.clusterLabel, 95 | 'label_values(kube_pod_info{%(kubeStateMetricsSelector)s}, cluster)' % config, 96 | ) + 97 | query.withDatasourceFromVariable(this.datasource) + 98 | query.withSort() + 99 | query.generalOptions.withLabel('Cluster') + 100 | query.refresh.onLoad() + 101 | query.refresh.onTime() + 102 | query.selectionOptions.withMulti(config.vpa.clusterAggregation) + 103 | ( 104 | if config.showMultiCluster 105 | then query.generalOptions.showOnDashboard.withLabelAndValue() 106 | else query.generalOptions.showOnDashboard.withNothing() 107 | ), 108 | 109 | // PDB 110 | pdbJob: 111 | query.new( 112 | 'job', 113 | 
'label_values(kube_poddisruptionbudget_status_current_healthy{%(cluster)s}, job)' % defaultFilters, 114 | ) + 115 | query.withDatasourceFromVariable(this.datasource) + 116 | query.withSort() + 117 | query.generalOptions.withLabel('Job') + 118 | query.refresh.onLoad() + 119 | query.refresh.onTime(), 120 | 121 | pdbNamespace: 122 | query.new( 123 | 'namespace', 124 | 'label_values(kube_poddisruptionbudget_status_current_healthy{%(cluster)s, %(job)s}, namespace)' % defaultFilters, 125 | ) + 126 | query.withDatasourceFromVariable(this.datasource) + 127 | query.withSort() + 128 | query.generalOptions.withLabel('Namespace') + 129 | query.selectionOptions.withMulti(true) + 130 | query.refresh.onLoad() + 131 | query.refresh.onTime(), 132 | 133 | pdb: 134 | query.new( 135 | 'poddisruptionbudget', 136 | 'label_values(kube_poddisruptionbudget_status_current_healthy{%(cluster)s, %(job)s, %(namespace)s}, poddisruptionbudget)' % defaultFilters, 137 | ) + 138 | query.withDatasourceFromVariable(this.datasource) + 139 | query.withSort() + 140 | query.generalOptions.withLabel('Pod Disruption Budget') + 141 | query.selectionOptions.withMulti(true) + 142 | query.selectionOptions.withIncludeAll(false) + 143 | query.refresh.onLoad() + 144 | query.refresh.onTime(), 145 | 146 | // HPA 147 | hpaJob: 148 | query.new( 149 | 'job', 150 | 'label_values(kube_horizontalpodautoscaler_spec_target_metric{%(cluster)s}, job)' % defaultFilters, 151 | ) + 152 | query.withDatasourceFromVariable(this.datasource) + 153 | query.withSort() + 154 | query.generalOptions.withLabel('Job') + 155 | query.refresh.onLoad() + 156 | query.refresh.onTime(), 157 | 158 | hpaNamespace: 159 | query.new( 160 | 'namespace', 161 | 'label_values(kube_horizontalpodautoscaler_spec_target_metric{%(cluster)s, %(job)s}, namespace)' % defaultFilters, 162 | ) + 163 | query.withDatasourceFromVariable(this.datasource) + 164 | query.withSort() + 165 | query.generalOptions.withLabel('Namespace') + 166 | 
query.selectionOptions.withMulti(true) + 167 | query.refresh.onLoad() + 168 | query.refresh.onTime(), 169 | 170 | hpa: 171 | query.new( 172 | 'horizontalpodautoscaler', 173 | 'label_values(kube_horizontalpodautoscaler_spec_target_metric{%(cluster)s, %(job)s, %(namespace)s}, horizontalpodautoscaler)' % defaultFilters, 174 | ) + 175 | query.withDatasourceFromVariable(this.datasource) + 176 | query.withSort() + 177 | query.generalOptions.withLabel('HPA') + 178 | query.refresh.onLoad() + 179 | query.refresh.onTime(), 180 | 181 | hpaMetricName: 182 | query.new( 183 | 'metric_name', 184 | 'label_values(kube_horizontalpodautoscaler_spec_target_metric{%(cluster)s, %(job)s, %(namespace)s, %(hpa)s}, metric_name)' % defaultFilters, 185 | ) + 186 | query.withDatasourceFromVariable(this.datasource) + 187 | query.withSort() + 188 | query.generalOptions.withLabel('Metric Name') + 189 | query.refresh.onLoad() + 190 | query.refresh.onTime(), 191 | 192 | hpaMetricTargetType: 193 | query.new( 194 | 'metric_target_type', 195 | 'label_values(kube_horizontalpodautoscaler_spec_target_metric{%(cluster)s, %(job)s, %(namespace)s, %(hpa)s, %(hpaMetricName)s}, metric_target_type)' % defaultFilters, 196 | ) + 197 | query.withDatasourceFromVariable(this.datasource) + 198 | query.withSort() + 199 | query.generalOptions.withLabel('Metric Target Type') + 200 | query.selectionOptions.withMulti(true) + 201 | query.selectionOptions.withIncludeAll(true) + 202 | query.refresh.onLoad() + 203 | query.refresh.onTime(), 204 | 205 | // VPA 206 | vpaJob: 207 | query.new( 208 | 'job', 209 | 'label_values(kube_customresource_verticalpodautoscaler_labels{%(cluster)s}, job)' % defaultFilters, 210 | ) + 211 | query.withDatasourceFromVariable(this.datasource) + 212 | query.withSort() + 213 | query.generalOptions.withLabel('Job') + 214 | query.refresh.onLoad() + 215 | query.refresh.onTime(), 216 | 217 | vpaNamespace: 218 | query.new( 219 | 'namespace', 220 | 
'label_values(kube_customresource_verticalpodautoscaler_labels{%(cluster)s, %(job)s}, namespace)' % defaultFilters, 221 | ) + 222 | query.withDatasourceFromVariable(this.datasource) + 223 | query.withSort() + 224 | query.generalOptions.withLabel('Namespace') + 225 | query.selectionOptions.withMulti(true) + 226 | query.refresh.onLoad() + 227 | query.refresh.onTime(), 228 | 229 | vpa: 230 | query.new( 231 | 'verticalpodautoscaler', 232 | 'label_values(kube_customresource_verticalpodautoscaler_labels{%(cluster)s, %(job)s, %(namespace)s}, verticalpodautoscaler)' % defaultFilters, 233 | ) + 234 | query.withDatasourceFromVariable(this.datasource) + 235 | query.withSort() + 236 | query.generalOptions.withLabel('Vertical Pod Autoscaler') + 237 | query.refresh.onLoad() + 238 | query.refresh.onTime(), 239 | 240 | vpaContainer: 241 | query.new( 242 | 'container', 243 | 'label_values(kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{%(cluster)s, %(job)s, %(namespace)s, %(vpa)s}, container)' % defaultFilters, 244 | ) + 245 | query.withDatasourceFromVariable(this.datasource) + 246 | query.withSort() + 247 | query.generalOptions.withLabel('Container') + 248 | query.selectionOptions.withMulti(true) + 249 | query.selectionOptions.withIncludeAll(true) + 250 | query.refresh.onLoad() + 251 | query.refresh.onTime(), 252 | }, 253 | } 254 | -------------------------------------------------------------------------------- /dashboards/keda/util.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local dashboard = g.dashboard; 4 | 5 | local variable = dashboard.variable; 6 | local datasource = variable.datasource; 7 | local query = variable.query; 8 | 9 | { 10 | filters(config):: { 11 | local this = self, 12 | cluster: '%(clusterLabel)s="$cluster"' % config, 13 | job: 'job=~"$job"', 14 | operatorNamespace: 
'namespace=~"$operator_namespace"', 15 | resourceNamespace: 'exported_namespace=~"$resource_namespace"', 16 | scaledObject: 'scaledObject="$scaled_object"', 17 | scaledJob: 'scaledObject="$scaled_job"', 18 | scaler: 'scaler="$scaler"', 19 | metric: 'metric="$metric"', 20 | 21 | base: ||| 22 | %(cluster)s, 23 | %(job)s, 24 | %(operatorNamespace)s 25 | ||| % this, 26 | 27 | withResourceNamespace: ||| 28 | %(base)s, 29 | %(resourceNamespace)s 30 | ||| % this, 31 | 32 | withScaledObject: ||| 33 | %(withResourceNamespace)s, 34 | type="scaledobject", 35 | %(scaledObject)s 36 | ||| % this, 37 | 38 | withScaledJob: ||| 39 | %(withResourceNamespace)s, 40 | type="scaledjob", 41 | %(scaledJob)s 42 | ||| % this, 43 | 44 | withScaledObjectScaler: ||| 45 | %(withScaledObject)s, 46 | %(scaler)s 47 | ||| % this, 48 | 49 | withScaledJobScaler: ||| 50 | %(withScaledJob)s, 51 | %(scaler)s 52 | ||| % this, 53 | 54 | withScaledObjectMetric: ||| 55 | %(withScaledObjectScaler)s, 56 | %(metric)s 57 | ||| % this, 58 | 59 | withScaledJobMetric: ||| 60 | %(withScaledJobScaler)s, 61 | %(metric)s 62 | ||| % this, 63 | }, 64 | 65 | variables(config):: { 66 | local this = self, 67 | 68 | local defaultFilters = $.filters(config), 69 | 70 | datasource: 71 | datasource.new( 72 | 'datasource', 73 | 'prometheus', 74 | ) + 75 | datasource.generalOptions.withLabel('Data source') + 76 | { 77 | current: { 78 | selected: true, 79 | text: config.datasourceName, 80 | value: config.datasourceName, 81 | }, 82 | }, 83 | 84 | cluster: 85 | query.new( 86 | config.clusterLabel, 87 | 'label_values(keda_build_info{}, cluster)' % config, 88 | ) + 89 | query.withDatasourceFromVariable(this.datasource) + 90 | query.withSort() + 91 | query.generalOptions.withLabel('Cluster') + 92 | query.refresh.onLoad() + 93 | query.refresh.onTime() + 94 | ( 95 | if config.showMultiCluster 96 | then query.generalOptions.showOnDashboard.withLabelAndValue() 97 | else query.generalOptions.showOnDashboard.withNothing() 98 | ), 99 | 100 | 
scaledObjectJob: 101 | query.new( 102 | 'job', 103 | 'label_values(keda_scaled_object_paused{%(cluster)s}, job)' % defaultFilters, 104 | ) + 105 | query.withDatasourceFromVariable(this.datasource) + 106 | query.withSort() + 107 | query.generalOptions.withLabel('Job') + 108 | query.selectionOptions.withMulti(true) + 109 | query.selectionOptions.withIncludeAll(true) + 110 | query.refresh.onLoad() + 111 | query.refresh.onTime(), 112 | 113 | scaledJobJob: 114 | query.new( 115 | 'job', 116 | 'label_values(keda_scaled_job_errors_total{%(cluster)s}, job)' % defaultFilters, 117 | ) + 118 | query.withDatasourceFromVariable(this.datasource) + 119 | query.withSort() + 120 | query.generalOptions.withLabel('Job') + 121 | query.selectionOptions.withMulti(true) + 122 | query.selectionOptions.withIncludeAll(true) + 123 | query.refresh.onLoad() + 124 | query.refresh.onTime(), 125 | 126 | scaledObjectOperatorNamespace: 127 | query.new( 128 | 'operator_namespace', 129 | 'label_values(keda_scaled_object_paused{%(cluster)s, %(job)s}, namespace)' % defaultFilters, 130 | ) + 131 | query.withDatasourceFromVariable(this.datasource) + 132 | query.withSort() + 133 | query.generalOptions.withLabel('Operator Namespace') + 134 | query.selectionOptions.withMulti(true) + 135 | query.selectionOptions.withIncludeAll(true) + 136 | query.refresh.onLoad() + 137 | query.refresh.onTime(), 138 | 139 | scaledJobOperatorNamespace: 140 | query.new( 141 | 'operator_namespace', 142 | 'label_values(keda_scaled_job_errors_total{%(cluster)s, %(job)s}, namespace)' % defaultFilters, 143 | ) + 144 | query.withDatasourceFromVariable(this.datasource) + 145 | query.withSort() + 146 | query.generalOptions.withLabel('Operator Namespace') + 147 | query.selectionOptions.withMulti(true) + 148 | query.selectionOptions.withIncludeAll(true) + 149 | query.refresh.onLoad() + 150 | query.refresh.onTime(), 151 | 152 | scaledObjectResourceNamespace: 153 | query.new( 154 | 'resource_namespace', 155 | 
'label_values(keda_scaled_object_paused{%(cluster)s, %(job)s, %(operatorNamespace)s}, exported_namespace)' % defaultFilters, 156 | ) + 157 | query.withDatasourceFromVariable(this.datasource) + 158 | query.withSort() + 159 | query.generalOptions.withLabel('Resource Namespace') + 160 | query.refresh.onLoad() + 161 | query.refresh.onTime(), 162 | 163 | scaledJobResourceNamespace: 164 | query.new( 165 | 'resource_namespace', 166 | 'label_values(keda_scaled_job_errors_total{%(cluster)s, %(job)s, %(operatorNamespace)s}, exported_namespace)' % defaultFilters, 167 | ) + 168 | query.withDatasourceFromVariable(this.datasource) + 169 | query.withSort() + 170 | query.generalOptions.withLabel('Resource Namespace') + 171 | query.refresh.onLoad() + 172 | query.refresh.onTime(), 173 | 174 | scaledObject: 175 | query.new( 176 | 'scaled_object', 177 | 'label_values(keda_scaled_object_paused{%(cluster)s, %(job)s, %(operatorNamespace)s, %(resourceNamespace)s}, scaledObject)' % defaultFilters, 178 | ) + 179 | query.withDatasourceFromVariable(this.datasource) + 180 | query.withSort() + 181 | query.generalOptions.withLabel('Scaled Object') + 182 | query.selectionOptions.withMulti(false) + 183 | query.selectionOptions.withIncludeAll(false) + 184 | query.refresh.onLoad() + 185 | query.refresh.onTime(), 186 | 187 | scaledJob: 188 | query.new( 189 | 'scaled_job', 190 | 'label_values(keda_scaled_job_errors_total{%(cluster)s, %(job)s, %(operatorNamespace)s, %(resourceNamespace)s}, scaledJob)' % defaultFilters, 191 | ) + 192 | query.withDatasourceFromVariable(this.datasource) + 193 | query.withSort() + 194 | query.generalOptions.withLabel('Scaled Job') + 195 | query.selectionOptions.withMulti(false) + 196 | query.selectionOptions.withIncludeAll(false) + 197 | query.refresh.onLoad() + 198 | query.refresh.onTime(), 199 | 200 | scalerForScaledObject: 201 | query.new( 202 | 'scaler', 203 | 'label_values(keda_scaler_active{%(cluster)s, %(job)s, %(operatorNamespace)s, 
exported_namespace="$resource_namespace", type="scaledobject", scaledObject="$scaled_object"}, scaler)' % defaultFilters, 204 | ) + 205 | query.withDatasourceFromVariable(this.datasource) + 206 | query.withSort() + 207 | query.generalOptions.withLabel('Scaler') + 208 | query.selectionOptions.withMulti(false) + 209 | query.selectionOptions.withIncludeAll(false) + 210 | query.refresh.onLoad() + 211 | query.refresh.onTime(), 212 | 213 | scalerForScaledJob: 214 | query.new( 215 | 'scaler', 216 | 'label_values(keda_scaler_active{%(cluster)s, %(job)s, %(operatorNamespace)s, exported_namespace="$resource_namespace", type="scaledjob", scaledObject="$scaled_job"}, scaler)' % defaultFilters, 217 | ) + 218 | query.withDatasourceFromVariable(this.datasource) + 219 | query.withSort() + 220 | query.generalOptions.withLabel('Scaler') + 221 | query.selectionOptions.withMulti(false) + 222 | query.selectionOptions.withIncludeAll(false) + 223 | query.refresh.onLoad() + 224 | query.refresh.onTime(), 225 | 226 | metricForScaledObject: 227 | query.new( 228 | 'metric', 229 | 'label_values(keda_scaler_active{%(cluster)s, %(job)s, %(operatorNamespace)s, exported_namespace="$resource_namespace", type="scaledobject", scaledObject="$scaled_object", scaler="$scaler"}, metric)' % defaultFilters, 230 | ) + 231 | query.withDatasourceFromVariable(this.datasource) + 232 | query.withSort() + 233 | query.generalOptions.withLabel('Metric') + 234 | query.selectionOptions.withMulti(false) + 235 | query.selectionOptions.withIncludeAll(false) + 236 | query.refresh.onLoad() + 237 | query.refresh.onTime(), 238 | 239 | metricForScaledJob: 240 | query.new( 241 | 'metric', 242 | 'label_values(keda_scaler_active{%(cluster)s, %(job)s, %(operatorNamespace)s, exported_namespace="$resource_namespace", type="scaledjob", scaledObject="$scaled_job", scaler="$scaler"}, metric)' % defaultFilters, 243 | ) + 244 | query.withDatasourceFromVariable(this.datasource) + 245 | query.withSort() + 246 | 
query.generalOptions.withLabel('Metric') + 247 | query.selectionOptions.withMulti(false) + 248 | query.selectionOptions.withIncludeAll(false) + 249 | query.refresh.onLoad() + 250 | query.refresh.onTime(), 251 | }, 252 | } 253 | -------------------------------------------------------------------------------- /dashboards/kubernetes/kubernetes-autoscaling-hpa.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | local tablePanel = g.panel.table; 10 | 11 | // Table 12 | local tbQueryOptions = tablePanel.queryOptions; 13 | 14 | { 15 | grafanaDashboards+:: { 16 | 'kubernetes-autoscaling-mixin-hpa.json': 17 | 18 | local defaultVariables = util.variables($._config); 19 | 20 | local variables = [ 21 | defaultVariables.datasource, 22 | defaultVariables.cluster, 23 | defaultVariables.hpaJob, 24 | defaultVariables.hpaNamespace, 25 | defaultVariables.hpa, 26 | defaultVariables.hpaMetricName, 27 | defaultVariables.hpaMetricTargetType, 28 | ]; 29 | 30 | local defaultFilters = util.filters($._config); 31 | local queries = { 32 | desiredReplicas: ||| 33 | round( 34 | sum( 35 | kube_horizontalpodautoscaler_status_desired_replicas{ 36 | %(withHpa)s 37 | } 38 | ) 39 | ) 40 | ||| % defaultFilters, 41 | 42 | currentReplicas: ||| 43 | round( 44 | sum( 45 | kube_horizontalpodautoscaler_status_current_replicas{ 46 | %(withHpa)s 47 | } 48 | ) 49 | ) 50 | ||| % defaultFilters, 51 | 52 | minReplicas: ||| 53 | round( 54 | sum( 55 | kube_horizontalpodautoscaler_spec_min_replicas{ 56 | %(withHpa)s 57 | } 58 | ) 59 | ) 60 | ||| % defaultFilters, 61 | 62 | maxReplicas: ||| 63 | round( 64 | sum( 65 | 
kube_horizontalpodautoscaler_spec_max_replicas{ 66 | %(withHpa)s 67 | } 68 | ) 69 | ) 70 | ||| % defaultFilters, 71 | 72 | metricTargets: ||| 73 | sum( 74 | kube_horizontalpodautoscaler_spec_target_metric{ 75 | %(withHpaMetricName)s 76 | } 77 | ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type) 78 | ||| % defaultFilters, 79 | 80 | usageThreshold: ||| 81 | sum( 82 | kube_horizontalpodautoscaler_spec_target_metric{ 83 | %(withHpaMetricTargetType)s 84 | } 85 | ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type) 86 | ||| % defaultFilters, 87 | 88 | utilization: ||| 89 | sum( 90 | kube_horizontalpodautoscaler_status_target_metric{ 91 | %(withHpaMetricTargetType)s 92 | } 93 | ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type) 94 | ||| % defaultFilters, 95 | }; 96 | 97 | local panels = { 98 | desiredReplicasStat: 99 | mixinUtils.dashboards.statPanel( 100 | 'Desired Replicas', 101 | 'short', 102 | queries.desiredReplicas, 103 | description='The desired number of replicas for the HPA.', 104 | ), 105 | 106 | currentReplicasStat: 107 | mixinUtils.dashboards.statPanel( 108 | 'Current Replicas', 109 | 'short', 110 | queries.currentReplicas, 111 | description='The current number of replicas for the HPA.', 112 | ), 113 | 114 | minReplicasStat: 115 | mixinUtils.dashboards.statPanel( 116 | 'Min Replicas', 117 | 'short', 118 | queries.minReplicas, 119 | description='The minimum number of replicas configured for the HPA.', 120 | ), 121 | 122 | maxReplicasStat: 123 | mixinUtils.dashboards.statPanel( 124 | 'Max Replicas', 125 | 'short', 126 | queries.maxReplicas, 127 | description='The maximum number of replicas configured for the HPA.', 128 | ), 129 | 130 | usageAndThresholdTimeSeries: 131 | mixinUtils.dashboards.timeSeriesPanel( 132 | 'Usage & Threshold', 133 | 'short', 134 | [ 135 | { 136 | expr: queries.utilization, 137 | legend: '{{ metric_target_type }} / {{ metric_name }}', 138 | }, 139 | { 140 
| expr: queries.usageThreshold, 141 | legend: 'Threshold / {{ metric_name }}', 142 | }, 143 | ], 144 | fillOpacity=0, 145 | description='The current utilization and configured threshold for the HPA metric.', 146 | ), 147 | 148 | replicasTimeSeries: 149 | mixinUtils.dashboards.timeSeriesPanel( 150 | 'Replicas', 151 | 'short', 152 | [ 153 | { 154 | expr: queries.desiredReplicas, 155 | legend: 'Desired Replicas', 156 | }, 157 | { 158 | expr: queries.currentReplicas, 159 | legend: 'Current Replicas', 160 | }, 161 | { 162 | expr: queries.minReplicas, 163 | legend: 'Min Replicas', 164 | }, 165 | { 166 | expr: queries.maxReplicas, 167 | legend: 'Max Replicas', 168 | }, 169 | ], 170 | fillOpacity=0, 171 | description='The desired, current, minimum, and maximum replicas for the HPA over time.', 172 | ), 173 | 174 | metricTargetsTable: 175 | mixinUtils.dashboards.tablePanel( 176 | 'Metric Targets', 177 | 'short', 178 | queries.metricTargets, 179 | description='Configured metric targets for the HPA.', 180 | sortBy={ name: 'Horizontal Pod Autoscaler', desc: false }, 181 | transformations=[ 182 | tbQueryOptions.transformation.withId( 183 | 'organize' 184 | ) + 185 | tbQueryOptions.transformation.withOptions( 186 | { 187 | renameByName: { 188 | namespace: 'Namespace', 189 | horizontalpodautoscaler: 'Horizontal Pod Autoscaler', 190 | metric_name: 'Metric Name', 191 | metric_target_type: 'Metric Target Type', 192 | 'Value #A': 'Threshold', 193 | }, 194 | indexByName: { 195 | horizontalpodautoscaler: 0, 196 | namespace: 1, 197 | metric_name: 2, 198 | metric_target_type: 3, 199 | 'Value #A': 4, 200 | }, 201 | excludeByName: { 202 | Time: true, 203 | job: true, 204 | }, 205 | } 206 | ), 207 | ] 208 | ), 209 | }; 210 | 211 | local rows = 212 | [ 213 | row.new('Summary') + 214 | row.gridPos.withX(0) + 215 | row.gridPos.withY(0) + 216 | row.gridPos.withW(24) + 217 | row.gridPos.withH(1), 218 | ] + 219 | grid.makeGrid( 220 | [ 221 | panels.desiredReplicasStat, 222 | 
panels.currentReplicasStat, 223 | panels.minReplicasStat, 224 | panels.maxReplicasStat, 225 | ], 226 | panelWidth=6, 227 | panelHeight=3, 228 | startY=1 229 | ) + 230 | [ 231 | panels.metricTargetsTable + 232 | row.gridPos.withX(0) + 233 | row.gridPos.withY(4) + 234 | row.gridPos.withW(24) + 235 | row.gridPos.withH(8), 236 | row.new('$horizontalpodautoscaler / $metric_name / $metric_target_type') + 237 | row.gridPos.withX(0) + 238 | row.gridPos.withY(12) + 239 | row.gridPos.withW(24) + 240 | row.gridPos.withH(1) + 241 | row.withRepeat('metric_target_type'), 242 | ] + 243 | grid.makeGrid( 244 | [ 245 | panels.usageAndThresholdTimeSeries, 246 | panels.replicasTimeSeries, 247 | ], 248 | panelWidth=24, 249 | panelHeight=6, 250 | startY=13 251 | ); 252 | 253 | mixinUtils.dashboards.bypassDashboardValidation + 254 | dashboard.new( 255 | 'Kubernetes / Autoscaling / Horizontal Pod Autoscaler', 256 | ) + 257 | dashboard.withDescription('A dashboard that monitors Horizontal Pod Autoscalers. %s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 258 | dashboard.withUid($._config.hpaDashboardUid) + 259 | dashboard.withTags($._config.tags + ['kubernetes-core']) + 260 | dashboard.withTimezone('utc') + 261 | dashboard.withEditable(true) + 262 | dashboard.time.withFrom('now-6h') + 263 | dashboard.time.withTo('now') + 264 | dashboard.withVariables(variables) + 265 | dashboard.withLinks( 266 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 267 | ) + 268 | dashboard.withPanels(rows), 269 | }, 270 | } 271 | -------------------------------------------------------------------------------- /dashboards/karpenter/karpenter-activity.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 
'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | { 10 | grafanaDashboards+:: { 11 | 'kubernetes-autoscaling-mixin-karpenter-act.json': 12 | if !$._config.karpenter.enabled then {} else 13 | 14 | local defaultVariables = util.variables($._config); 15 | 16 | local variables = [ 17 | defaultVariables.datasource, 18 | defaultVariables.cluster, 19 | defaultVariables.job, 20 | defaultVariables.nodepoolSimple, 21 | ]; 22 | 23 | local defaultFilters = util.filters($._config); 24 | local queries = { 25 | // Node Activity 26 | nodesCreatedByNodePool: ||| 27 | round( 28 | sum( 29 | increase( 30 | karpenter_nodes_created_total{ 31 | %(base)s, 32 | %(nodepool)s 33 | }[$__rate_interval] 34 | ) 35 | ) by (nodepool) 36 | ) 37 | ||| % defaultFilters, 38 | 39 | nodesTerminatedByNodePool: ||| 40 | round( 41 | sum( 42 | increase( 43 | karpenter_nodes_terminated_total{ 44 | %(base)s, 45 | %(nodepool)s 46 | }[$__rate_interval] 47 | ) 48 | ) by (nodepool) 49 | ) 50 | ||| % defaultFilters, 51 | 52 | nodesVoluntaryDisruptionDecisions: ||| 53 | round( 54 | sum( 55 | increase( 56 | karpenter_voluntary_disruption_decisions_total{ 57 | %(base)s 58 | }[$__rate_interval] 59 | ) 60 | ) by (decision, reason) 61 | ) 62 | ||| % defaultFilters, 63 | 64 | nodesVoluntaryDisruptionEligible: ||| 65 | round( 66 | sum( 67 | karpenter_voluntary_disruption_eligible_nodes{ 68 | %(base)s 69 | } 70 | ) by (reason) 71 | ) 72 | ||| % defaultFilters, 73 | 74 | nodesDisrupted: ||| 75 | round( 76 | sum( 77 | increase( 78 | karpenter_nodeclaims_disrupted_total{ 79 | %(base)s, 80 | %(nodepool)s 81 | }[$__rate_interval] 82 | ) 83 | ) by (nodepool, capacity_type, reason) 84 | ) 85 | ||| % defaultFilters, 86 | 87 | // Pod Activity 88 | podStateByPhase: ||| 89 | round( 90 | sum( 91 | karpenter_pods_state{ 92 | %(base)s 93 | } 94 | ) by (phase) 95 | ) 96 | 
||| % defaultFilters, 97 | 98 | podsStartupP50Duration: ||| 99 | max( 100 | karpenter_pods_startup_duration_seconds{ 101 | %(base)s, 102 | quantile="0.5" 103 | } 104 | ) 105 | ||| % defaultFilters, 106 | 107 | podsStartupP95Duration: ||| 108 | max( 109 | karpenter_pods_startup_duration_seconds{ 110 | %(base)s, 111 | quantile="0.95" 112 | } 113 | ) 114 | ||| % defaultFilters, 115 | 116 | podsStartupP99Duration: ||| 117 | max( 118 | karpenter_pods_startup_duration_seconds{ 119 | %(base)s, 120 | quantile="0.99" 121 | } 122 | ) 123 | ||| % defaultFilters, 124 | }; 125 | 126 | local panels = { 127 | // Node Activity 128 | nodesCreatedByNodePoolTimeSeries: 129 | mixinUtils.dashboards.timeSeriesPanel( 130 | 'Nodes Created by Node Pool', 131 | 'short', 132 | queries.nodesCreatedByNodePool, 133 | '{{ nodepool }}', 134 | description='The number of nodes created by node pool.', 135 | stack='normal' 136 | ), 137 | 138 | nodesTerminatedByNodePoolTimeSeries: 139 | mixinUtils.dashboards.timeSeriesPanel( 140 | 'Nodes Terminated by Node Pool', 141 | 'short', 142 | queries.nodesTerminatedByNodePool, 143 | '{{ nodepool }}', 144 | description='The number of nodes terminated by node pool.', 145 | stack='normal' 146 | ), 147 | 148 | nodesVoluntaryDisruptionDecisionsTimeSeries: 149 | mixinUtils.dashboards.timeSeriesPanel( 150 | 'Node Disruption Decisions by Reason and Decision', 151 | 'short', 152 | queries.nodesVoluntaryDisruptionDecisions, 153 | '{{ decision }} - {{ reason }}', 154 | description='The number of voluntary disruption decisions by reason and decision.', 155 | stack='normal' 156 | ), 157 | 158 | nodesVoluntaryDisruptionEligibleTimeSeries: 159 | mixinUtils.dashboards.timeSeriesPanel( 160 | 'Nodes Eligible for Disruption by Reason', 161 | 'short', 162 | queries.nodesVoluntaryDisruptionEligible, 163 | '{{ reason }}', 164 | description='The number of nodes eligible for voluntary disruption by reason.', 165 | stack='normal' 166 | ), 167 | 168 | nodesDisruptedTimeSeries: 169 | 
mixinUtils.dashboards.timeSeriesPanel( 170 | 'Nodes Disrupted by Node Pool', 171 | 'short', 172 | queries.nodesDisrupted, 173 | '{{ nodepool }} - {{ capacity_type }} - {{ reason }}', 174 | description='The number of nodes disrupted by node pool, capacity type, and reason.', 175 | stack='normal' 176 | ), 177 | 178 | // Pod Activity 179 | podStateByPhaseTimeSeries: 180 | mixinUtils.dashboards.timeSeriesPanel( 181 | 'Pods by Phase', 182 | 'short', 183 | queries.podStateByPhase, 184 | '{{ phase }}', 185 | description='The number of pods by phase.', 186 | stack='normal' 187 | ), 188 | 189 | podStartupDurationTimeSeries: 190 | mixinUtils.dashboards.timeSeriesPanel( 191 | 'Pods Startup Duration', 192 | 's', 193 | [ 194 | { 195 | expr: queries.podsStartupP50Duration, 196 | legend: 'P50', 197 | }, 198 | { 199 | expr: queries.podsStartupP95Duration, 200 | legend: 'P95', 201 | }, 202 | { 203 | expr: queries.podsStartupP99Duration, 204 | legend: 'P99', 205 | }, 206 | ], 207 | description='The duration for pods to start up.', 208 | fillOpacity=0 209 | ), 210 | }; 211 | 212 | local rows = 213 | [ 214 | row.new('Node Pool Activity') + 215 | row.gridPos.withX(0) + 216 | row.gridPos.withY(0) + 217 | row.gridPos.withW(24) + 218 | row.gridPos.withH(1), 219 | ] + 220 | grid.makeGrid( 221 | [ 222 | panels.nodesCreatedByNodePoolTimeSeries, 223 | panels.nodesTerminatedByNodePoolTimeSeries, 224 | ], 225 | panelWidth=12, 226 | panelHeight=6, 227 | startY=1 228 | ) + 229 | grid.makeGrid( 230 | [ 231 | panels.nodesVoluntaryDisruptionDecisionsTimeSeries, 232 | panels.nodesVoluntaryDisruptionEligibleTimeSeries, 233 | ], 234 | panelWidth=12, 235 | panelHeight=6, 236 | startY=7 237 | ) + 238 | grid.makeGrid( 239 | [ 240 | panels.nodesDisruptedTimeSeries, 241 | ], 242 | panelWidth=24, 243 | panelHeight=6, 244 | startY=13 245 | ) + 246 | [ 247 | row.new('Pod Activity') + 248 | row.gridPos.withX(0) + 249 | row.gridPos.withY(19) + 250 | row.gridPos.withW(24) + 251 | row.gridPos.withH(1), 252 | ] + 
253 | grid.makeGrid( 254 | [ 255 | panels.podStateByPhaseTimeSeries, 256 | panels.podStartupDurationTimeSeries, 257 | ], 258 | panelWidth=12, 259 | panelHeight=6, 260 | startY=20 261 | ); 262 | 263 | mixinUtils.dashboards.bypassDashboardValidation + 264 | dashboard.new( 265 | 'Kubernetes / Autoscaling / Karpenter / Activity', 266 | ) + 267 | dashboard.withDescription('A dashboard that monitors Karpenter and focuses on Karpenter deletion/creation activity. %s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 268 | dashboard.withUid($._config.karpenterActivityDashboardUid) + 269 | dashboard.withTags($._config.tags + ['karpenter']) + 270 | dashboard.withTimezone('utc') + 271 | dashboard.withEditable(true) + 272 | dashboard.time.withFrom('now-24h') + 273 | dashboard.time.withTo('now') + 274 | dashboard.withVariables(variables) + 275 | dashboard.withLinks( 276 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 277 | ) + 278 | dashboard.withPanels( 279 | rows 280 | ) + 281 | dashboard.withAnnotations( 282 | mixinUtils.dashboards.annotations($._config, defaultFilters) 283 | ), 284 | }, 285 | } 286 | -------------------------------------------------------------------------------- /tests/tests.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - ../prometheus_alerts.yaml 4 | 5 | tests: 6 | # Karpenter 7 | - interval: 1m 8 | input_series: 9 | - series: 'karpenter_cloudprovider_errors_total{namespace="karpenter", job="karpenter", provider="aws", controller="nodeclaim.disruption", method="Get"}' 10 | values: "1+1x20" 11 | alert_rule_test: 12 | - eval_time: 20m 13 | alertname: KarpenterCloudProviderErrors 14 | exp_alerts: 15 | - exp_labels: 16 | namespace: karpenter 17 | job: karpenter 18 | provider: aws 19 | controller: nodeclaim.disruption 20 | method: Get 21 | severity: 
warning 22 | exp_annotations: 23 | summary: "Karpenter has Cloud Provider Errors." 24 | description: "The Karpenter provider aws with the controller nodeclaim.disruption has errors with the method Get." 25 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance" 26 | - interval: 1m 27 | input_series: 28 | - series: 'karpenter_nodepools_usage{namespace="karpenter", job="karpenter", nodepool="nodepool-a", resource_type="cpu"}' 29 | values: "80x15" 30 | - series: 'karpenter_nodepools_limit{namespace="karpenter", job="karpenter", nodepool="nodepool-a", resource_type="cpu"}' 31 | values: "100x15" 32 | alert_rule_test: 33 | - eval_time: 15m 34 | alertname: KarpenterNodepoolNearCapacity 35 | exp_alerts: 36 | - exp_labels: 37 | namespace: karpenter 38 | job: karpenter 39 | nodepool: nodepool-a 40 | resource_type: cpu 41 | severity: warning 42 | exp_annotations: 43 | summary: "Karpenter Nodepool near capacity." 44 | description: "The resource cpu in the Karpenter node pool nodepool-a is nearing its limit. Consider scaling or adding resources." 
45 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview" 46 | - interval: 1m 47 | input_series: 48 | - series: karpenter_nodeclaims_termination_duration_seconds_sum{namespace="karpenter", job="karpenter", nodepool="nodepool-a"} 49 | values: "0+2400x20" 50 | - series: karpenter_nodeclaims_termination_duration_seconds_count{namespace="karpenter", job="karpenter", nodepool="nodepool-a"} 51 | values: "0+1x20" 52 | - series: karpenter_nodeclaims_termination_duration_seconds_sum{namespace="karpenter", job="karpenter", nodepool="nodepool-b"} 53 | values: "0+60x20" 54 | - series: karpenter_nodeclaims_termination_duration_seconds_count{namespace="karpenter", job="karpenter", nodepool="nodepool-b"} 55 | values: "0+1x20" 56 | alert_rule_test: 57 | - eval_time: 20m 58 | alertname: KarpenterNodeClaimsTerminationDurationHigh 59 | exp_alerts: 60 | - exp_labels: 61 | namespace: karpenter 62 | job: karpenter 63 | nodepool: nodepool-a 64 | severity: warning 65 | exp_annotations: 66 | summary: "Karpenter Node Claims Termination Duration is High." 67 | description: "The average node claim termination duration in Karpenter has exceeded 20 minutes for more than 15 minutes in nodepool nodepool-a. This may indicate cloud provider issues or improper instance termination handling." 
68 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kact-jkwq/kubernetes-autoscaling-karpenter-activity" 69 | # Cluster Autoscaler 70 | - interval: 1m 71 | input_series: 72 | - series: 'cluster_autoscaler_nodes_count{namespace="autoscaler", job="cluster-autoscaler"}' 73 | values: "95x15" 74 | - series: 'cluster_autoscaler_max_nodes_count{namespace="autoscaler", job="cluster-autoscaler"}' 75 | values: "100x15" 76 | alert_rule_test: 77 | - eval_time: 15m 78 | alertname: ClusterAutoscalerNodeCountNearCapacity 79 | exp_alerts: 80 | - exp_labels: 81 | namespace: autoscaler 82 | job: cluster-autoscaler 83 | severity: warning 84 | exp_annotations: 85 | summary: "Cluster Autoscaler Node Count near Capacity." 86 | description: "The node count for the cluster autoscaler job cluster-autoscaler is reaching max limit. Consider scaling node groups." 87 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler" 88 | - interval: 1m 89 | input_series: 90 | - series: 'cluster_autoscaler_unschedulable_pods_count{namespace="autoscaler", job="cluster-autoscaler"}' 91 | values: "1x15" 92 | alert_rule_test: 93 | - eval_time: 15m 94 | alertname: ClusterAutoscalerUnschedulablePods 95 | exp_alerts: 96 | - exp_labels: 97 | namespace: autoscaler 98 | job: cluster-autoscaler 99 | severity: warning 100 | exp_annotations: 101 | summary: "Pods Pending Scheduling - Cluster Node Group Scaling Required" 102 | description: "The cluster currently has unschedulable pods, indicating resource shortages. Consider adding more nodes or increasing node group capacity." 
103 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler" 104 | # KEDA 105 | - interval: 1m 106 | input_series: 107 | - series: 'keda_scaled_job_errors_total{job="keda-operator", exported_namespace="test", scaledObject="test"}' 108 | values: "0+10x15" 109 | alert_rule_test: 110 | - eval_time: 15m 111 | alertname: KedaScaledJobErrors 112 | exp_alerts: 113 | - exp_labels: 114 | job: keda-operator 115 | exported_namespace: test 116 | scaledObject: test 117 | severity: warning 118 | exp_annotations: 119 | summary: "Errors detected for KEDA scaled jobs." 120 | description: "KEDA scaled jobs are experiencing errors. Check the scaled job test in the namespace test." 121 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kedasj-jkwq/kubernetes-autoscaling-keda-scaled-job?var-scaled_job=test&var-resource_namespace=test" 122 | 123 | - interval: 1m 124 | input_series: 125 | - series: 'keda_scaled_object_errors_total{job="keda-operator", exported_namespace="test", scaledObject="test"}' 126 | values: "0+10x15" 127 | alert_rule_test: 128 | - eval_time: 15m 129 | alertname: KedaScaledObjectErrors 130 | exp_alerts: 131 | - exp_labels: 132 | job: keda-operator 133 | exported_namespace: test 134 | scaledObject: test 135 | severity: warning 136 | exp_annotations: 137 | summary: "Errors detected for KEDA scaled objects." 138 | description: "KEDA scaled objects are experiencing errors. Check the scaled object test in the namespace test." 
139 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object=test&var-resource_namespace=test" 140 | 141 | - interval: 1m 142 | input_series: 143 | - series: 'keda_scaler_metrics_latency_seconds{namespace="keda", job="keda-operator", exported_namespace="test", scaler="prometheus", scaledObject="test"}' 144 | values: "10x10" 145 | alert_rule_test: 146 | - eval_time: 10m 147 | alertname: KedaScalerLatencyHigh 148 | exp_alerts: 149 | - exp_labels: 150 | job: keda-operator 151 | exported_namespace: test 152 | scaler: prometheus 153 | scaledObject: test 154 | severity: warning 155 | exp_annotations: 156 | summary: "High latency for KEDA scaler metrics." 157 | description: "Metric latency for scaler prometheus for the object test has exceeded acceptable limits." 158 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object=test&var-scaler=prometheus" 159 | 160 | - interval: 1m 161 | input_series: 162 | - series: 'keda_scaled_object_paused{namespace="keda", job="keda-operator", exported_namespace="test", scaledObject="test"}' 163 | values: "1x1500" 164 | alert_rule_test: 165 | - eval_time: 25h 166 | alertname: KedaScaledObjectPaused 167 | exp_alerts: 168 | - exp_labels: 169 | job: keda-operator 170 | exported_namespace: test 171 | scaledObject: test 172 | severity: warning 173 | exp_annotations: 174 | summary: "KEDA scaled object is paused." 175 | description: "The scaled object test in namespace test is paused for longer than 25h. This may indicate a configuration issue or manual intervention." 
176 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object=test&var-resource_namespace=test" 177 | 178 | - interval: 1m 179 | input_series: 180 | - series: 'keda_scaler_detail_errors_total{metric="s0-prometheus", namespace="keda", exported_namespace="test", job="keda-operator", scaledObject="test", scaler="prometheusScaler",triggerIndex="0",type="scaledjob"}' 181 | values: "0+10x15" 182 | alert_rule_test: 183 | - eval_time: 15m 184 | alertname: KedaScalerDetailErrors 185 | exp_alerts: 186 | - exp_labels: 187 | exported_namespace: test 188 | job: keda-operator 189 | scaledObject: test 190 | scaler: prometheusScaler 191 | type: scaledjob 192 | severity: warning 193 | exp_annotations: 194 | summary: "Errors detected in KEDA scaler." 195 | description: "Errors have occurred in the KEDA scaler prometheusScaler. Investigate the scaler for the scaledjob test in namespace test." 196 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaler=prometheusScaler&var-scaled_object=test" 197 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prometheus Monitoring Mixin for Kubernetes Autoscaling 2 | 3 | A set of Grafana dashboards and Prometheus alerts for Kubernetes Autoscaling using the metrics from Kube-state-metrics, Karpenter, and Cluster-autoscaler. 4 | 5 | This serves as a extension for the [Kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin) and adds monitoring for components that aren't deployed by default in a Kubernetes cluster (VPA, Karpenter, Cluster-Autoscaler). 
6 | 7 | ## Dashboards 8 | 9 | The mixin provides the following dashboards: 10 | 11 | - Kubernetes Autoscaling 12 | - Pod Disruption Budgets 13 | - Horizontal Pod Autoscalers 14 | - Vertical Pod Autoscalers 15 | - Cluster Autoscaler 16 | - Karpenter 17 | - Overview 18 | - Activity 19 | - Performance 20 | - KEDA 21 | - Scaled Objects 22 | - Scaled Jobs 23 | 24 | Generated dashboards also exist in the `./dashboards_out` directory. 25 | 26 | Alerts are created for the following components currently: 27 | 28 | - Karpenter 29 | - Keda 30 | - Cluster Autoscaler 31 | 32 | VPA, Karpenter, Keda, and Cluster Autoscaler are configurable in the `config.libsonnet` file. They can be turned off by setting the `enabled` field to `false`. 33 | 34 | ## How to use 35 | 36 | This mixin is designed to be vendored into the repo with your infrastructure config. To do this, use [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler): 37 | 38 | You then have three options for deploying your dashboards 39 | 40 | 1. Generate the config files and deploy them yourself 41 | 2. Use jsonnet to deploy this mixin along with Prometheus and Grafana 42 | 3. Use prometheus-operator to deploy this mixin 43 | 44 | Or import the dashboard using json in `./dashboards_out`, alternatively import them from the `Grafana.com` dashboard page. 
45 | 46 | ## Generate config files 47 | 48 | You can manually generate the alerts, dashboards, and rules files, but first you must install some tools: 49 | 50 | ```sh 51 | go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb 52 | brew install jsonnet 53 | ``` 54 | 55 | Then, grab the mixin and its dependencies: 56 | 57 | ```sh 58 | git clone https://github.com/adinhodovic/kubernetes-autoscaling-mixin 59 | cd kubernetes-autoscaling-mixin 60 | jb install 61 | ``` 62 | 63 | Finally, build the mixin: 64 | 65 | ```sh 66 | make prometheus_alerts.yaml 67 | make dashboards_out 68 | ``` 69 | 70 | The `prometheus_alerts.yaml` file then needs to be passed to your Prometheus server, and the files in `dashboards_out` need to be imported into your Grafana server. The exact details depend on how you deploy your monitoring stack. 71 | 72 | ### Configuration 73 | 74 | This mixin has its configuration in the `config.libsonnet` file. You can turn off the alerts for VPA, Karpenter, KEDA, and Cluster Autoscaler by setting the `enabled` field to `false`. 75 | 76 | ```jsonnet 77 | { 78 | _config+:: { 79 | vpa+: { 80 | enabled: false, 81 | }, 82 | keda+: { 83 | enabled: false, 84 | }, 85 | karpenter+: { 86 | enabled: false, 87 | }, 88 | clusterAutoscaler+: { 89 | enabled: false, 90 | }, 91 | }, 92 | } 93 | ``` 94 | 95 | The mixin has all components enabled by default and all the dashboards are generated in the `dashboards_out` directory. You can import them into Grafana. 96 | 97 | ### VPA Requirements 98 | 99 | Kube-state-metrics doesn't ship with VPA metrics by default. 
You need to deploy a custom kube-state-metrics with the following configuration: 100 | 101 | Adjust the `ClusterRole` `kube-state-metrics` to include the following rules: 102 | 103 | ```yaml 104 | apiVersion: rbac.authorization.k8s.io/v1 105 | kind: ClusterRole 106 | metadata: 107 | labels: 108 | app.kubernetes.io/component: exporter 109 | app.kubernetes.io/name: kube-state-metrics 110 | app.kubernetes.io/part-of: kube-prometheus 111 | name: kube-state-metrics 112 | rules: 113 | # ... other rules 114 | - apiGroups: 115 | - autoscaling.k8s.io 116 | resources: 117 | - verticalpodautoscalers 118 | verbs: 119 | - list 120 | - watch 121 | - apiGroups: 122 | - apiextensions.k8s.io 123 | resources: 124 | - customresourcedefinitions 125 | verbs: 126 | - list 127 | - watch 128 | ``` 129 | 130 | Adjust the `Deployment` `kube-state-metrics` to include the following extra arguments: 131 | 132 | ```yaml 133 | kind: Deployment 134 | metadata: 135 | labels: 136 | app.kubernetes.io/name: kube-state-metrics 137 | app.kubernetes.io/part-of: kube-prometheus 138 | app.kubernetes.io/version: 2.13.0 139 | name: kube-state-metrics 140 | namespace: monitoring 141 | spec: 142 | ... 143 | containers: 144 | - args: 145 | ... 146 | - --custom-resource-state-config 147 | - | 148 | kind: CustomResourceStateMetrics 149 | spec: 150 | resources: 151 | - groupVersionKind: 152 | group: autoscaling.k8s.io 153 | kind: "VerticalPodAutoscaler" 154 | version: "v1" 155 | labelsFromPath: 156 | verticalpodautoscaler: [metadata, name] 157 | namespace: [metadata, namespace] 158 | target_api_version: [spec, targetRef, apiVersion] 159 | target_kind: [spec, targetRef, kind] 160 | target_name: [spec, targetRef, name] 161 | metrics: 162 | # Labels 163 | - name: "verticalpodautoscaler_labels" 164 | help: "VPA container recommendations. 
Kubernetes labels converted to Prometheus labels" 165 | each: 166 | type: Info 167 | info: 168 | labelsFromPath: 169 | name: [metadata, name] 170 | # Memory Information 171 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_target" 172 | help: "VPA container recommendations for memory. Target resources the VerticalPodAutoscaler recommends for the container." 173 | each: 174 | type: Gauge 175 | gauge: 176 | path: [status, recommendation, containerRecommendations] 177 | valueFrom: [target, memory] 178 | labelsFromPath: 179 | container: [containerName] 180 | commonLabels: 181 | resource: "memory" 182 | unit: "byte" 183 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound" 184 | help: "VPA container recommendations for memory. Minimum resources the container can use before the VerticalPodAutoscaler updater evicts it" 185 | each: 186 | type: Gauge 187 | gauge: 188 | path: [status, recommendation, containerRecommendations] 189 | valueFrom: [lowerBound, memory] 190 | labelsFromPath: 191 | container: [containerName] 192 | commonLabels: 193 | resource: "memory" 194 | unit: "byte" 195 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound" 196 | help: "VPA container recommendations for memory. Maximum resources the container can use before the VerticalPodAutoscaler updater evicts it" 197 | each: 198 | type: Gauge 199 | gauge: 200 | path: [status, recommendation, containerRecommendations] 201 | valueFrom: [upperBound, memory] 202 | labelsFromPath: 203 | container: [containerName] 204 | commonLabels: 205 | resource: "memory" 206 | unit: "byte" 207 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_uncappedtarget" 208 | help: "VPA container recommendations for memory. 
Target resources the VerticalPodAutoscaler recommends for the container ignoring bounds" 209 | each: 210 | type: Gauge 211 | gauge: 212 | path: [status, recommendation, containerRecommendations] 213 | valueFrom: [uncappedTarget, memory] 214 | labelsFromPath: 215 | container: [containerName] 216 | commonLabels: 217 | resource: "memory" 218 | unit: "byte" 219 | # CPU Information 220 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_target" 221 | help: "VPA container recommendations for cpu. Target resources the VerticalPodAutoscaler recommends for the container." 222 | each: 223 | type: Gauge 224 | gauge: 225 | path: [status, recommendation, containerRecommendations] 226 | valueFrom: [target, cpu] 227 | labelsFromPath: 228 | container: [containerName] 229 | commonLabels: 230 | resource: "cpu" 231 | unit: "core" 232 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound" 233 | help: "VPA container recommendations for cpu. Minimum resources the container can use before the VerticalPodAutoscaler updater evicts it" 234 | each: 235 | type: Gauge 236 | gauge: 237 | path: [status, recommendation, containerRecommendations] 238 | valueFrom: [lowerBound, cpu] 239 | labelsFromPath: 240 | container: [containerName] 241 | commonLabels: 242 | resource: "cpu" 243 | unit: "core" 244 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound" 245 | help: "VPA container recommendations for cpu. Maximum resources the container can use before the VerticalPodAutoscaler updater evicts it" 246 | each: 247 | type: Gauge 248 | gauge: 249 | path: [status, recommendation, containerRecommendations] 250 | valueFrom: [upperBound, cpu] 251 | labelsFromPath: 252 | container: [containerName] 253 | commonLabels: 254 | resource: "cpu" 255 | unit: "core" 256 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_uncappedtarget" 257 | help: "VPA container recommendations for cpu. 
Target resources the VerticalPodAutoscaler recommends for the container ignoring bounds" 258 | each: 259 | type: Gauge 260 | gauge: 261 | path: [status, recommendation, containerRecommendations] 262 | valueFrom: [uncappedTarget, cpu] 263 | labelsFromPath: 264 | container: [containerName] 265 | commonLabels: 266 | resource: "cpu" 267 | unit: "core" 268 | ``` 269 | 270 | ## Alerts 271 | 272 | The mixin follows the [monitoring-mixins guidelines](https://github.com/monitoring-mixins/docs#guidelines-for-alert-names-labels-and-annotations) for alerts. 273 | -------------------------------------------------------------------------------- /alerts/alerts.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | local clusterVariableQueryString = if $._config.showMultiCluster then '&var-%(clusterLabel)s={{ $labels.%(clusterLabel)s }}' % $._config else '', 3 | local clusterLabel = { clusterLabel: $._config.clusterLabel }, 4 | prometheusAlerts+:: { 5 | groups+: std.prune([ 6 | if $._config.karpenter.enabled then { 7 | local karpenterConfig = $._config.karpenter + clusterLabel, 8 | name: 'karpenter', 9 | rules: [ 10 | { 11 | alert: 'KarpenterCloudProviderErrors', 12 | expr: ||| 13 | sum( 14 | increase( 15 | karpenter_cloudprovider_errors_total{ 16 | %(karpenterSelector)s, 17 | controller!~"nodeclaim.termination|node.termination", 18 | error!="NodeClaimNotFoundError" 19 | }[5m] 20 | ) 21 | ) by (%(clusterLabel)s, namespace, job, provider, controller, method) > 0 22 | ||| % karpenterConfig, 23 | labels: { 24 | severity: 'warning', 25 | }, 26 | 'for': '5m', 27 | annotations: { 28 | summary: 'Karpenter has Cloud Provider Errors.', 29 | description: 'The Karpenter provider {{ $labels.provider }} with the controller {{ $labels.controller }} has errors with the method {{ $labels.method }}.', 30 | dashboard_url: $._config.karpenter.karpenterPerformanceDashboardUrl + clusterVariableQueryString, 31 | }, 32 | }, 33 | { 34 | alert: 
'KarpenterNodeClaimsTerminationDurationHigh', 35 | expr: ||| 36 | sum( 37 | rate( 38 | karpenter_nodeclaims_termination_duration_seconds_sum{ 39 | %(karpenterSelector)s 40 | }[5m] 41 | ) 42 | ) by (%(clusterLabel)s, namespace, job, nodepool) 43 | / 44 | sum( 45 | rate( 46 | karpenter_nodeclaims_termination_duration_seconds_count{ 47 | %(karpenterSelector)s 48 | }[5m] 49 | ) 50 | ) by (%(clusterLabel)s, namespace, job, nodepool) > %(nodeclaimTerminationThreshold)s 51 | ||| % karpenterConfig, 52 | labels: { 53 | severity: 'warning', 54 | }, 55 | 'for': '15m', 56 | annotations: { 57 | summary: 'Karpenter Node Claims Termination Duration is High.', 58 | description: 'The average node claim termination duration in Karpenter has exceeded %s minutes for more than 15 minutes in nodepool {{ $labels.nodepool }}. This may indicate cloud provider issues or improper instance termination handling.' % std.toString($._config.karpenter.nodeclaimTerminationThreshold / 60), 59 | dashboard_url: $._config.karpenter.karpenterActivityDashboardUrl + clusterVariableQueryString, 60 | }, 61 | }, 62 | { 63 | alert: 'KarpenterNodepoolNearCapacity', 64 | annotations: { 65 | summary: 'Karpenter Nodepool near capacity.', 66 | description: 'The resource {{ $labels.resource_type }} in the Karpenter node pool {{ $labels.nodepool }} is nearing its limit. 
Consider scaling or adding resources.', 67 | dashboard_url: $._config.karpenter.karpenterOverviewDashboardUrl + clusterVariableQueryString, 68 | }, 69 | expr: ||| 70 | sum ( 71 | karpenter_nodepools_usage{%(karpenterSelector)s} 72 | ) by (%(clusterLabel)s, namespace, job, nodepool, resource_type) 73 | / 74 | sum ( 75 | karpenter_nodepools_limit{%(karpenterSelector)s} 76 | ) by (%(clusterLabel)s, namespace, job, nodepool, resource_type) 77 | * 100 > %(nodepoolCapacityThreshold)s 78 | ||| % karpenterConfig, 79 | 'for': '15m', 80 | labels: { 81 | severity: 'warning', 82 | }, 83 | }, 84 | ], 85 | }, 86 | if $._config.clusterAutoscaler.enabled then { 87 | local clusterAutoscalerConfig = $._config.clusterAutoscaler + clusterLabel, 88 | name: 'cluster-autoscaler', 89 | rules: [ 90 | { 91 | alert: 'ClusterAutoscalerNodeCountNearCapacity', 92 | annotations: { 93 | summary: 'Cluster Autoscaler Node Count near Capacity.', 94 | description: 'The node count for the cluster autoscaler job {{ $labels.job }} is reaching max limit. Consider scaling node groups.', 95 | dashboard_url: $._config.clusterAutoscaler.clusterAutoscalerDashboardUrl + clusterVariableQueryString, 96 | }, 97 | expr: ||| 98 | sum ( 99 | cluster_autoscaler_nodes_count{ 100 | %(clusterAutoscalerSelector)s 101 | } 102 | ) by (%(clusterLabel)s, namespace, job) 103 | / 104 | sum ( 105 | cluster_autoscaler_max_nodes_count{ 106 | %(clusterAutoscalerSelector)s 107 | } 108 | ) by (%(clusterLabel)s, namespace, job) 109 | * 100 > %(nodeCountCapacityThreshold)s 110 | ||| % clusterAutoscalerConfig, 111 | 'for': '15m', 112 | labels: { 113 | severity: 'warning', 114 | }, 115 | }, 116 | { 117 | alert: 'ClusterAutoscalerUnschedulablePods', 118 | annotations: { 119 | summary: 'Pods Pending Scheduling - Cluster Node Group Scaling Required', 120 | description: 'The cluster currently has unschedulable pods, indicating resource shortages. 
Consider adding more nodes or increasing node group capacity.', 121 | dashboard_url: $._config.clusterAutoscaler.clusterAutoscalerDashboardUrl + clusterVariableQueryString, 122 | }, 123 | expr: ||| 124 | sum ( 125 | cluster_autoscaler_unschedulable_pods_count{ 126 | %(clusterAutoscalerSelector)s 127 | } 128 | ) by (%(clusterLabel)s, namespace, job) 129 | > 0 130 | ||| % clusterAutoscalerConfig, 131 | 'for': '15m', 132 | labels: { 133 | severity: 'warning', 134 | }, 135 | }, 136 | ], 137 | }, 138 | if $._config.keda.enabled then { 139 | local kedaConfig = $._config.keda + clusterLabel, 140 | name: 'keda', 141 | rules: [ 142 | { 143 | alert: 'KedaScaledJobErrors', 144 | annotations: { 145 | summary: 'Errors detected for KEDA scaled jobs.', 146 | description: 'KEDA scaled jobs are experiencing errors. Check the scaled job {{ $labels.scaledObject }} in the namespace {{ $labels.exported_namespace }}.', 147 | dashboard_url: $._config.keda.kedaScaledJobDashboardUrl + '?var-scaled_job={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}' + clusterVariableQueryString, 148 | }, 149 | expr: ||| 150 | sum( 151 | increase( 152 | keda_scaled_job_errors_total{ 153 | %(kedaSelector)s 154 | }[10m] 155 | ) 156 | ) by (%(clusterLabel)s, job, exported_namespace, scaledObject) > 0 157 | ||| % kedaConfig, 158 | 'for': '1m', 159 | labels: { 160 | severity: 'warning', 161 | }, 162 | }, 163 | { 164 | alert: 'KedaScaledObjectErrors', 165 | annotations: { 166 | summary: 'Errors detected for KEDA scaled objects.', 167 | description: 'KEDA scaled objects are experiencing errors. 
Check the scaled object {{ $labels.scaledObject }} in the namespace {{ $labels.exported_namespace }}.', 168 | dashboard_url: $._config.keda.kedaScaledObjectDashboardUrl + '?var-scaled_object={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}' + clusterVariableQueryString, 169 | }, 170 | expr: ||| 171 | sum( 172 | increase( 173 | keda_scaled_object_errors_total{ 174 | %(kedaSelector)s 175 | }[10m] 176 | ) 177 | ) by (%(clusterLabel)s, job, exported_namespace, scaledObject) > 0 178 | ||| % kedaConfig, 179 | 'for': '1m', 180 | labels: { 181 | severity: 'warning', 182 | }, 183 | }, 184 | { 185 | alert: 'KedaScalerLatencyHigh', 186 | annotations: { 187 | summary: 'High latency for KEDA scaler metrics.', 188 | description: 'Metric latency for scaler {{ $labels.scaler }} for the object {{ $labels.scaledObject }} has exceeded acceptable limits.', 189 | dashboard_url: $._config.keda.kedaScaledObjectDashboardUrl + '?var-scaled_object={{ $labels.scaledObject }}&var-scaler={{ $labels.scaler }}' + clusterVariableQueryString, 190 | }, 191 | expr: ||| 192 | avg( 193 | keda_scaler_metrics_latency_seconds{ 194 | %(kedaSelector)s 195 | } 196 | ) by (%(clusterLabel)s, job, exported_namespace, scaledObject, scaler) > %(scalerMetricsLatencyThreshold)s 197 | ||| % kedaConfig, 198 | 'for': '10m', 199 | labels: { 200 | severity: 'warning', 201 | }, 202 | }, 203 | { 204 | alert: 'KedaScaledObjectPaused', 205 | annotations: { 206 | summary: 'KEDA scaled object is paused.', 207 | description: 'The scaled object {{ $labels.scaledObject }} in namespace {{ $labels.exported_namespace }} is paused for longer than %(scaledObjectPausedThreshold)s. This may indicate a configuration issue or manual intervention.' 
% kedaConfig, 208 | dashboard_url: $._config.keda.kedaScaledObjectDashboardUrl + '?var-scaled_object={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}' + clusterVariableQueryString, 209 | }, 210 | expr: ||| 211 | max( 212 | keda_scaled_object_paused{ 213 | %(kedaSelector)s 214 | } 215 | ) by (%(clusterLabel)s, job, exported_namespace, scaledObject) > 0 216 | ||| % kedaConfig, 217 | 'for': kedaConfig.scaledObjectPausedThreshold, 218 | labels: { 219 | severity: 'warning', 220 | }, 221 | }, 222 | { 223 | alert: 'KedaScalerDetailErrors', 224 | annotations: { 225 | summary: 'Errors detected in KEDA scaler.', 226 | description: 'Errors have occurred in the KEDA scaler {{ $labels.scaler }}. Investigate the scaler for the {{ $labels.type }} {{ $labels.scaledObject }} in namespace {{ $labels.exported_namespace }}.', 227 | dashboard_url: $._config.keda.kedaScaledObjectDashboardUrl + '?var-scaler={{ $labels.scaler }}&var-scaled_object={{ $labels.scaledObject }}' + clusterVariableQueryString, 228 | }, 229 | expr: ||| 230 | sum( 231 | increase( 232 | keda_scaler_detail_errors_total{ 233 | %(kedaSelector)s 234 | }[10m] 235 | ) 236 | ) by (%(clusterLabel)s, job, exported_namespace, scaledObject, type, scaler) > 0 237 | ||| % kedaConfig, 238 | 'for': '1m', 239 | labels: { 240 | severity: 'warning', 241 | }, 242 | }, 243 | ], 244 | }, 245 | ]), 246 | }, 247 | } 248 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /dashboards/keda/keda-scaled-job.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | local tablePanel = g.panel.table; 10 | 11 | // Table 12 | local tbQueryOptions = tablePanel.queryOptions; 13 | local tbPanelOptions = tablePanel.panelOptions; 14 | 15 | { 16 | grafanaDashboards+:: { 17 | 'kubernetes-autoscaling-mixin-keda-sj.json': 18 | if !$._config.keda.enabled then {} else 19 | 20 | local defaultVariables = util.variables($._config); 21 | 22 | local variables = [ 23 | defaultVariables.datasource, 24 | defaultVariables.cluster, 25 | defaultVariables.scaledJobJob, 26 | defaultVariables.scaledJobOperatorNamespace, 27 | defaultVariables.scaledJobResourceNamespace, 28 | 
defaultVariables.scaledJob, 29 | defaultVariables.scalerForScaledJob, 30 | defaultVariables.metricForScaledJob, 31 | ]; 32 | 33 | local defaultFilters = util.filters($._config); 34 | 35 | local queries = { 36 | resourcesRegisteredByNamespace: ||| 37 | sum( 38 | keda_resource_registered_total{ 39 | %(base)s, 40 | type="scaled_job" 41 | } 42 | ) by (exported_namespace, type) 43 | ||| % defaultFilters, 44 | 45 | triggersByType: ||| 46 | sum( 47 | keda_trigger_registered_total{ 48 | %(base)s 49 | } 50 | ) by (type) 51 | ||| % defaultFilters, 52 | 53 | scaledJobsErrors: ||| 54 | sum( 55 | increase( 56 | keda_scaled_job_errors_total{ 57 | %(withResourceNamespace)s 58 | }[$__rate_interval] 59 | ) 60 | ) by (exported_namespace, scaledJob) 61 | ||| % defaultFilters, 62 | 63 | scalerDetailErrors: ||| 64 | sum( 65 | increase( 66 | keda_scaler_detail_errors_total{ 67 | %(withResourceNamespace)s, 68 | type="scaledjob" 69 | }[$__rate_interval] 70 | ) 71 | ) by (exported_namespace, scaledObject, scaler) 72 | ||| % defaultFilters, 73 | 74 | scaleTargetValues: ||| 75 | sum( 76 | keda_scaler_metrics_value{ 77 | %(withResourceNamespace)s, 78 | type="scaledjob" 79 | } 80 | ) by (job, exported_namespace, scaledObject, scaler, metric) 81 | ||| % defaultFilters, 82 | 83 | scaledJobActive: ||| 84 | sum( 85 | keda_scaler_active{ 86 | %(withScaledJob)s 87 | } 88 | ) by (exported_namespace, scaledObject) 89 | ||| % defaultFilters, 90 | 91 | scaledJobDetailError: ||| 92 | sum( 93 | increase( 94 | keda_scaler_detail_errors_total{ 95 | %(withScaledJob)s 96 | }[$__rate_interval] 97 | ) 98 | ) by (exported_namespace, scaledObject) 99 | ||| % defaultFilters, 100 | 101 | scaledJobMetricValue: ||| 102 | avg( 103 | keda_scaler_metrics_value{ 104 | %(withScaledJobMetric)s 105 | } 106 | ) by (exported_namespace, scaledObject, scaler, metric) 107 | ||| % defaultFilters, 108 | 109 | scaledJobMetricLatency: ||| 110 | avg( 111 | keda_scaler_metrics_latency_seconds{ 112 | %(withScaledJobMetric)s 113 | } 114 
| ) by (exported_namespace, scaledObject, scaler, metric) 115 | ||| % defaultFilters, 116 | }; 117 | 118 | local panels = { 119 | resourcesRegisteredTimeSeries: 120 | mixinUtils.dashboards.timeSeriesPanel( 121 | 'Resources Registered by Namespace', 122 | 'short', 123 | queries.resourcesRegisteredByNamespace, 124 | '{{ exported_namespace}} / {{ type }}', 125 | description='The number of scaled job resources registered by namespace.', 126 | stack='normal', 127 | ), 128 | 129 | triggersByTypeTimeSeries: 130 | mixinUtils.dashboards.timeSeriesPanel( 131 | 'Triggers by Type', 132 | 'short', 133 | queries.triggersByType, 134 | '{{ type }}', 135 | description='The number of triggers registered by type.', 136 | stack='normal', 137 | ), 138 | 139 | scaledJobsErrorsTimeSeries: 140 | mixinUtils.dashboards.timeSeriesPanel( 141 | 'Scaled Jobs Errors', 142 | 'short', 143 | queries.scaledJobsErrors, 144 | '{{ scaledJob }}', 145 | description='The rate of errors for scaled jobs.', 146 | stack='normal', 147 | ), 148 | 149 | scalerDetailErrorsTimeSeries: 150 | mixinUtils.dashboards.timeSeriesPanel( 151 | 'Scaler Detail Errors', 152 | 'short', 153 | queries.scalerDetailErrors, 154 | '{{ scaledObject }} / {{ scaler }}', 155 | description='The rate of scaler detail errors.', 156 | stack='normal', 157 | ), 158 | 159 | scaleTargetValuesTable: 160 | mixinUtils.dashboards.tablePanel( 161 | 'Scale Target Values', 162 | 'short', 163 | queries.scaleTargetValues, 164 | description='This table has links to the Workload dashboard for the scaled Job, which can be used to see the current resource usage. 
The Workload dashboard can be found at [kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin) and requires ID customization.', 165 | sortBy={ name: 'Scaled Job', desc: false }, 166 | transformations=[ 167 | tbQueryOptions.transformation.withId( 168 | 'organize' 169 | ) + 170 | tbQueryOptions.transformation.withOptions( 171 | { 172 | renameByName: { 173 | scaledObject: 'Scaled Job', 174 | exported_namespace: 'Resource Namespace', 175 | scaler: 'Scaler', 176 | metric: 'Metric', 177 | value: 'Value', 178 | }, 179 | indexByName: { 180 | scaledObject: 0, 181 | exported_namespace: 1, 182 | scaler: 2, 183 | metric: 3, 184 | value: 4, 185 | }, 186 | excludeByName: { 187 | Time: true, 188 | job: true, 189 | }, 190 | }, 191 | ), 192 | ], 193 | links=[ 194 | tbPanelOptions.link.withTitle('Go to Scaled Job') + 195 | tbPanelOptions.link.withUrl( 196 | '/d/%s/kubernetes-compute-resources-workload?var-namespace=${__data.fields.exported_namespace}&var-type=ScaledJob&var-workload=${__data.fields.scaledObject}' % $._config.keda.k8sResourcesWorkloadDashboardUid 197 | ) + 198 | tbPanelOptions.link.withTargetBlank(true), 199 | ] 200 | ), 201 | 202 | scaledJobActiveTimeSeries: 203 | mixinUtils.dashboards.timeSeriesPanel( 204 | 'Scaled Job Active', 205 | 'short', 206 | queries.scaledJobActive, 207 | '{{ scaledObject }}', 208 | description='Whether the scaled job is active.', 209 | ), 210 | 211 | scaledJobDetailErrorTimeSeries: 212 | mixinUtils.dashboards.timeSeriesPanel( 213 | 'Scaled Job Detail Errors', 214 | 'short', 215 | queries.scaledJobDetailError, 216 | '{{ scaledObject }}', 217 | description='The rate of errors for the selected scaled job.', 218 | ), 219 | 220 | scaledJobMetricValueTimeSeries: 221 | mixinUtils.dashboards.timeSeriesPanel( 222 | 'Scaled Job Metric Value', 223 | 'short', 224 | queries.scaledJobMetricValue, 225 | '{{ scaledObject }} / {{ scaler }} / {{ metric }}', 226 | description='The metric value for the selected scaled job.', 227 | 
stack='normal', 228 | ), 229 | 230 | scaledJobMetricLatencyTimeSeries: 231 | mixinUtils.dashboards.timeSeriesPanel( 232 | 'Scaled Job Metric Latency', 233 | 's', 234 | queries.scaledJobMetricLatency, 235 | '{{ scaledObject }} / {{ scaler }} / {{ metric }}', 236 | description='The metric collection latency for the selected scaled job.', 237 | ), 238 | }; 239 | 240 | local rows = 241 | [ 242 | row.new('Overview') + 243 | row.gridPos.withX(0) + 244 | row.gridPos.withY(0) + 245 | row.gridPos.withW(24) + 246 | row.gridPos.withH(1), 247 | ] + 248 | grid.makeGrid( 249 | [ 250 | panels.resourcesRegisteredTimeSeries, 251 | panels.triggersByTypeTimeSeries, 252 | ], 253 | panelWidth=12, 254 | panelHeight=6, 255 | startY=1 256 | ) + 257 | grid.makeGrid( 258 | [ 259 | panels.scaledJobsErrorsTimeSeries, 260 | panels.scalerDetailErrorsTimeSeries, 261 | ], 262 | panelWidth=12, 263 | panelHeight=6, 264 | startY=7 265 | ) + 266 | grid.makeGrid( 267 | [ 268 | panels.scaleTargetValuesTable, 269 | ], 270 | panelWidth=24, 271 | panelHeight=8, 272 | startY=13 273 | ) + 274 | [ 275 | row.new('Scaled Job $scaled_job / $scaler / $metric') + 276 | row.gridPos.withX(0) + 277 | row.gridPos.withY(21) + 278 | row.gridPos.withW(24) + 279 | row.gridPos.withH(1), 280 | ] + 281 | grid.makeGrid( 282 | [ 283 | panels.scaledJobActiveTimeSeries, 284 | panels.scaledJobDetailErrorTimeSeries, 285 | ], 286 | panelWidth=12, 287 | panelHeight=5, 288 | startY=22 289 | ) + 290 | grid.makeGrid( 291 | [ 292 | panels.scaledJobMetricValueTimeSeries, 293 | panels.scaledJobMetricLatencyTimeSeries, 294 | ], 295 | panelWidth=24, 296 | panelHeight=8, 297 | startY=27 298 | ); 299 | 300 | mixinUtils.dashboards.bypassDashboardValidation + 301 | dashboard.new( 302 | 'Kubernetes / Autoscaling / KEDA / Scaled Job', 303 | ) + 304 | dashboard.withDescription('A dashboard that monitors KEDA Scaled Jobs. 
%s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 305 | dashboard.withUid($._config.kedaScaledJobDashboardUid) + 306 | dashboard.withTags($._config.tags + ['keda']) + 307 | dashboard.withTimezone('utc') + 308 | dashboard.withEditable(true) + 309 | dashboard.time.withFrom('now-6h') + 310 | dashboard.time.withTo('now') + 311 | dashboard.withVariables(variables) + 312 | dashboard.withLinks( 313 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 314 | ) + 315 | dashboard.withPanels(rows), 316 | }, 317 | } 318 | -------------------------------------------------------------------------------- /dashboards/kubernetes/kubernetes-autoscaling-pdb.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | local tablePanel = g.panel.table; 10 | local timeSeriesPanel = g.panel.timeSeries; 11 | 12 | // Table 13 | local tbStandardOptions = tablePanel.standardOptions; 14 | local tbQueryOptions = tablePanel.queryOptions; 15 | local tbOverride = tbStandardOptions.override; 16 | local tbFieldConfig = tablePanel.fieldConfig; 17 | 18 | // Timeseries 19 | local tsStandardOptions = timeSeriesPanel.standardOptions; 20 | local tsOverride = tsStandardOptions.override; 21 | 22 | { 23 | grafanaDashboards+:: { 24 | 'kubernetes-autoscaling-mixin-pdb.json': 25 | 26 | local defaultVariables = util.variables($._config); 27 | 28 | local variables = [ 29 | defaultVariables.datasource, 30 | defaultVariables.cluster, 31 | defaultVariables.pdbJob, 32 | defaultVariables.pdbNamespace, 33 | defaultVariables.pdb, 34 | 
]; 35 | 36 | local defaultFilters = util.filters($._config); 37 | local queries = { 38 | disruptionsAllowed: ||| 39 | round( 40 | sum( 41 | kube_poddisruptionbudget_status_pod_disruptions_allowed{ 42 | %(withPdb)s 43 | } 44 | ) 45 | ) 46 | ||| % defaultFilters, 47 | 48 | desiredHealthy: ||| 49 | round( 50 | sum( 51 | kube_poddisruptionbudget_status_desired_healthy{ 52 | %(withPdb)s 53 | } 54 | ) 55 | ) 56 | ||| % defaultFilters, 57 | 58 | currentlyHealthy: ||| 59 | round( 60 | sum( 61 | kube_poddisruptionbudget_status_current_healthy{ 62 | %(withPdb)s 63 | } 64 | ) 65 | ) 66 | ||| % defaultFilters, 67 | 68 | expectedPods: ||| 69 | round( 70 | sum( 71 | kube_poddisruptionbudget_status_expected_pods{ 72 | %(withPdb)s 73 | } 74 | ) 75 | ) 76 | ||| % defaultFilters, 77 | 78 | disruptionsAllowedNamespace: ||| 79 | round( 80 | sum( 81 | kube_poddisruptionbudget_status_pod_disruptions_allowed{ 82 | %(base)s 83 | } 84 | ) by (job, namespace, poddisruptionbudget) 85 | ) 86 | ||| % defaultFilters, 87 | 88 | desiredHealthyNamespace: ||| 89 | round( 90 | sum( 91 | kube_poddisruptionbudget_status_desired_healthy{ 92 | %(base)s 93 | } 94 | ) by (job, namespace, poddisruptionbudget) 95 | ) 96 | ||| % defaultFilters, 97 | 98 | currentlyHealthyNamespace: ||| 99 | round( 100 | sum( 101 | kube_poddisruptionbudget_status_current_healthy{ 102 | %(base)s 103 | } 104 | ) by (job, namespace, poddisruptionbudget) 105 | ) 106 | ||| % defaultFilters, 107 | 108 | expectedPodsNamespace: ||| 109 | round( 110 | sum( 111 | kube_poddisruptionbudget_status_expected_pods{ 112 | %(base)s 113 | } 114 | ) by (job, namespace, poddisruptionbudget) 115 | ) 116 | ||| % defaultFilters, 117 | }; 118 | 119 | local panels = { 120 | disruptionsAllowedStat: 121 | mixinUtils.dashboards.statPanel( 122 | 'Disruptions Allowed', 123 | 'short', 124 | queries.disruptionsAllowed, 125 | description='The number of pod disruptions allowed for the selected PDB.', 126 | ), 127 | 128 | desiredHealthyStat: 129 | 
mixinUtils.dashboards.statPanel( 130 | 'Desired Healthy', 131 | 'short', 132 | queries.desiredHealthy, 133 | description='The desired number of healthy pods for the selected PDB.', 134 | ), 135 | 136 | currentlyHealthyStat: 137 | mixinUtils.dashboards.statPanel( 138 | 'Currently Healthy', 139 | 'short', 140 | queries.currentlyHealthy, 141 | description='The current number of healthy pods for the selected PDB.', 142 | ), 143 | 144 | expectedPodsStat: 145 | mixinUtils.dashboards.statPanel( 146 | 'Expected Pods', 147 | 'short', 148 | queries.expectedPods, 149 | description='The expected number of pods for the selected PDB.', 150 | ), 151 | 152 | namespaceSummaryTable: 153 | mixinUtils.dashboards.tablePanel( 154 | 'Summary', 155 | 'short', 156 | [ 157 | { 158 | expr: queries.disruptionsAllowedNamespace, 159 | legend: 'Disruptions Allowed', 160 | }, 161 | { 162 | expr: queries.desiredHealthyNamespace, 163 | legend: 'Desired Healthy', 164 | }, 165 | { 166 | expr: queries.currentlyHealthyNamespace, 167 | legend: 'Currently Healthy', 168 | }, 169 | { 170 | expr: queries.expectedPodsNamespace, 171 | legend: 'Expected Pods', 172 | }, 173 | ], 174 | description='Summary of all PDBs in the selected namespace.', 175 | sortBy={ name: 'Pod Disruption Budget', desc: false }, 176 | transformations=[ 177 | tbQueryOptions.transformation.withId('merge'), 178 | tbQueryOptions.transformation.withId('organize') + 179 | tbQueryOptions.transformation.withOptions( 180 | { 181 | renameByName: { 182 | poddisruptionbudget: 'Pod Disruption Budget', 183 | namespace: 'Namespace', 184 | 'Value #A': 'Disruptions Allowed', 185 | 'Value #B': 'Desired Healthy', 186 | 'Value #C': 'Currently Healthy', 187 | 'Value #D': 'Expected Pods', 188 | }, 189 | indexByName: { 190 | namespace: 0, 191 | poddisruptionbudget: 1, 192 | 'Value #A': 2, 193 | 'Value #B': 3, 194 | 'Value #C': 4, 195 | 'Value #D': 5, 196 | }, 197 | excludeByName: { 198 | Time: true, 199 | job: true, 200 | }, 201 | } 202 | ), 203 | ], 204 | 
overrides=[ 205 | tbOverride.byName.new('Disruptions Allowed') + 206 | tbOverride.byName.withPropertiesFromOptions( 207 | tbFieldConfig.defaults.custom.withCellOptions( 208 | { type: 'color-text' } 209 | ) + 210 | tbStandardOptions.thresholds.withMode('absolute') + 211 | tbStandardOptions.thresholds.withSteps([ 212 | tbStandardOptions.threshold.step.withValue(0) + 213 | tbStandardOptions.threshold.step.withColor('red'), 214 | tbStandardOptions.threshold.step.withValue(0.1) + 215 | tbStandardOptions.threshold.step.withColor('green'), 216 | ]) 217 | ), 218 | ], 219 | ), 220 | 221 | statusTimeSeries: 222 | mixinUtils.dashboards.timeSeriesPanel( 223 | 'Status', 224 | 'short', 225 | [ 226 | { 227 | expr: queries.disruptionsAllowed, 228 | legend: 'Disruptions Allowed', 229 | }, 230 | { 231 | expr: queries.desiredHealthy, 232 | legend: 'Desired Healthy', 233 | }, 234 | { 235 | expr: queries.currentlyHealthy, 236 | legend: 'Currently Healthy', 237 | }, 238 | { 239 | expr: queries.expectedPods, 240 | legend: 'Expected Pods', 241 | }, 242 | ], 243 | description='Status metrics for the selected PDB over time.', 244 | fillOpacity=0, 245 | overrides=[ 246 | tsOverride.byName.new('Currently Healthy') + 247 | tsOverride.byName.withPropertiesFromOptions( 248 | tsStandardOptions.color.withMode('fixed') + 249 | tsStandardOptions.color.withFixedColor('yellow') 250 | ), 251 | tsOverride.byName.new('Disruptions Allowed') + 252 | tsOverride.byName.withPropertiesFromOptions( 253 | tsStandardOptions.color.withMode('fixed') + 254 | tsStandardOptions.color.withFixedColor('red') 255 | ), 256 | tsOverride.byName.new('Desired Healthy') + 257 | tsOverride.byName.withPropertiesFromOptions( 258 | tsStandardOptions.color.withMode('fixed') + 259 | tsStandardOptions.color.withFixedColor('green') 260 | ), 261 | tsOverride.byName.new('Expected Pods') + 262 | tsOverride.byName.withPropertiesFromOptions( 263 | tsStandardOptions.color.withMode('fixed') + 264 | 
tsStandardOptions.color.withFixedColor('blue') 265 | ), 266 | ], 267 | ), 268 | }; 269 | 270 | local rows = 271 | [ 272 | row.new('$namespace Namespace Summary') + 273 | row.gridPos.withX(0) + 274 | row.gridPos.withY(0) + 275 | row.gridPos.withW(24) + 276 | row.gridPos.withH(1), 277 | panels.namespaceSummaryTable + 278 | row.gridPos.withX(0) + 279 | row.gridPos.withY(1) + 280 | row.gridPos.withW(24) + 281 | row.gridPos.withH(10), 282 | row.new('$poddisruptionbudget') + 283 | row.gridPos.withX(0) + 284 | row.gridPos.withY(11) + 285 | row.gridPos.withW(24) + 286 | row.gridPos.withH(1) + 287 | row.withRepeat('poddisruptionbudget'), 288 | ] + 289 | grid.makeGrid( 290 | [ 291 | panels.disruptionsAllowedStat, 292 | panels.desiredHealthyStat, 293 | panels.currentlyHealthyStat, 294 | panels.expectedPodsStat, 295 | ], 296 | panelWidth=6, 297 | panelHeight=4, 298 | startY=12 299 | ) + 300 | [ 301 | panels.statusTimeSeries + 302 | row.gridPos.withX(0) + 303 | row.gridPos.withY(16) + 304 | row.gridPos.withW(24) + 305 | row.gridPos.withH(10), 306 | ]; 307 | 308 | mixinUtils.dashboards.bypassDashboardValidation + 309 | dashboard.new( 310 | 'Kubernetes / Autoscaling / Pod Disruption Budget', 311 | ) + 312 | dashboard.withDescription('A dashboard that monitors Kubernetes and focuses on giving an overview for pod disruption budgets.
%s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 313 | dashboard.withUid($._config.pdbDashboardUid) + 314 | dashboard.withTags($._config.tags + ['kubernetes-core']) + 315 | dashboard.withTimezone('utc') + 316 | dashboard.withEditable(true) + 317 | dashboard.time.withFrom('now-6h') + 318 | dashboard.time.withTo('now') + 319 | dashboard.withVariables(variables) + 320 | dashboard.withLinks( 321 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 322 | ) + 323 | dashboard.withPanels(rows) + 324 | dashboard.withAnnotations( 325 | mixinUtils.dashboards.annotations($._config, defaultFilters) 326 | ), 327 | }, 328 | } 329 | -------------------------------------------------------------------------------- /dashboards/cluster-autoscaler/kubernetes-autoscaling-cluster-autoscaler.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | // Gauge panel helpers 10 | local gauge = g.panel.gauge; 11 | local gaStandardOptions = gauge.standardOptions; 12 | 13 | { 14 | grafanaDashboards+:: { 15 | 'kubernetes-autoscaling-mixin-ca.json': 16 | if !$._config.clusterAutoscaler.enabled then {} else 17 | 18 | local defaultVariables = util.variables($._config); 19 | 20 | local variables = [ 21 | defaultVariables.datasource, 22 | defaultVariables.cluster, 23 | defaultVariables.job, 24 | ]; 25 | 26 | local defaultFilters = util.filters($._config); 27 | local queries = { 28 | totalNodes: ||| 29 | round( 30 | sum( 31 | cluster_autoscaler_nodes_count{ 32 | %(base)s 33 | } 34 | ) 35 | ) 36 | ||| % 
defaultFilters, 37 | 38 | maxNodes: ||| 39 | round( 40 | sum( 41 | cluster_autoscaler_max_nodes_count{ 42 | %(base)s 43 | } 44 | ) 45 | ) 46 | ||| % defaultFilters, 47 | 48 | nodeGroups: ||| 49 | round( 50 | sum( 51 | cluster_autoscaler_node_groups_count{ 52 | %(base)s 53 | } 54 | ) 55 | ) 56 | ||| % defaultFilters, 57 | 58 | healthyNodes: ||| 59 | round( 60 | sum( 61 | cluster_autoscaler_nodes_count{ 62 | %(base)s, 63 | state="ready" 64 | } 65 | ) / 66 | sum( 67 | cluster_autoscaler_nodes_count{ 68 | %(base)s 69 | } 70 | ) * 100 71 | ) 72 | ||| % defaultFilters, 73 | 74 | safeToScale: ||| 75 | sum( 76 | cluster_autoscaler_cluster_safe_to_autoscale{ 77 | %(base)s 78 | } 79 | ) 80 | ||| % defaultFilters, 81 | 82 | numberUnscheduledPods: ||| 83 | round( 84 | sum( 85 | cluster_autoscaler_unschedulable_pods_count{ 86 | %(base)s 87 | } 88 | ) 89 | ) 90 | ||| % defaultFilters, 91 | 92 | lastScaleDown: ||| 93 | time() - max( 94 | cluster_autoscaler_last_activity{ 95 | %(base)s, 96 | activity="scaleDown" 97 | } 98 | ) 99 | ||| % defaultFilters, 100 | 101 | lastScaleUp: ||| 102 | time() - max( 103 | cluster_autoscaler_last_activity{ 104 | %(base)s, 105 | activity="scaleUp" 106 | } 107 | ) 108 | ||| % defaultFilters, 109 | 110 | unschedulablePods: ||| 111 | round( 112 | sum( 113 | increase( 114 | cluster_autoscaler_unschedulable_pods_count{ 115 | %(base)s 116 | }[$__rate_interval] 117 | ) 118 | ) by (type) 119 | ) 120 | ||| % defaultFilters, 121 | 122 | evictedPods: ||| 123 | round( 124 | sum( 125 | increase( 126 | cluster_autoscaler_evicted_pods_total{ 127 | %(base)s 128 | }[$__rate_interval] 129 | ) 130 | ) by (eviction_result) 131 | ) 132 | ||| % defaultFilters, 133 | 134 | nodeActivity: ||| 135 | round( 136 | sum( 137 | cluster_autoscaler_nodes_count{ 138 | %(base)s 139 | } 140 | ) by (state) 141 | ) 142 | ||| % defaultFilters, 143 | 144 | unneededNodes: ||| 145 | round( 146 | sum( 147 | cluster_autoscaler_unneeded_nodes_count{ 148 | %(base)s 149 | } 150 | ) 151 | ) 152 
| ||| % defaultFilters, 153 | 154 | scaledUpNodes: ||| 155 | round( 156 | sum( 157 | increase( 158 | cluster_autoscaler_scaled_up_nodes_total{ 159 | %(base)s 160 | }[$__rate_interval] 161 | ) 162 | ) 163 | ) 164 | ||| % defaultFilters, 165 | 166 | scaledDownNodes: ||| 167 | round( 168 | sum( 169 | increase( 170 | cluster_autoscaler_scaled_down_nodes_total{ 171 | %(base)s 172 | }[$__rate_interval] 173 | ) 174 | ) 175 | ) 176 | ||| % defaultFilters, 177 | }; 178 | 179 | local panels = { 180 | totalNodesStat: 181 | mixinUtils.dashboards.statPanel( 182 | 'Total Nodes', 183 | 'short', 184 | queries.totalNodes, 185 | description='The total number of nodes in the cluster.', 186 | ), 187 | 188 | maxNodesStat: 189 | mixinUtils.dashboards.statPanel( 190 | 'Max Nodes', 191 | 'short', 192 | queries.maxNodes, 193 | description='The maximum number of nodes allowed in the cluster.', 194 | ), 195 | 196 | nodeGroupsStat: 197 | mixinUtils.dashboards.statPanel( 198 | 'Node Groups', 199 | 'short', 200 | queries.nodeGroups, 201 | description='The number of node groups in the cluster.', 202 | ), 203 | 204 | healthyNodesGauge: 205 | mixinUtils.dashboards.gaugePanel( 206 | 'Healthy Nodes', 207 | 'percent', 208 | queries.healthyNodes, 209 | description='The percentage of healthy nodes in the cluster.', 210 | min=0, 211 | max=100, 212 | steps=[ 213 | gaStandardOptions.threshold.step.withValue(0) + 214 | gaStandardOptions.threshold.step.withColor('red'), 215 | gaStandardOptions.threshold.step.withValue(50) + 216 | gaStandardOptions.threshold.step.withColor('yellow'), 217 | gaStandardOptions.threshold.step.withValue(80) + 218 | gaStandardOptions.threshold.step.withColor('green'), 219 | ], 220 | ), 221 | 222 | safeToScaleStat: 223 | mixinUtils.dashboards.statPanel( 224 | 'Safe to Scale', 225 | 'short', 226 | queries.safeToScale, 227 | description='Indicates whether it is safe to scale the cluster.', 228 | steps=[ 229 | gaStandardOptions.threshold.step.withValue(0) + 230 | 
gaStandardOptions.threshold.step.withColor('red'), 231 | gaStandardOptions.threshold.step.withValue(0.1) + 232 | gaStandardOptions.threshold.step.withColor('green'), 233 | ], 234 | mappings=[ 235 | gaStandardOptions.mapping.ValueMap.withType() + 236 | gaStandardOptions.mapping.ValueMap.withOptions( 237 | { 238 | '0': { text: 'No', color: 'red' }, 239 | '1': { text: 'Yes', color: 'green' }, 240 | } 241 | ), 242 | ], 243 | ), 244 | 245 | numberUnscheduledPodsStat: 246 | mixinUtils.dashboards.statPanel( 247 | 'Unscheduled Pods', 248 | 'short', 249 | queries.numberUnscheduledPods, 250 | description='The number of unscheduled pods in the cluster.', 251 | ), 252 | 253 | lastScaleDownStat: 254 | mixinUtils.dashboards.statPanel( 255 | 'Last Scale Down', 256 | 's', 257 | queries.lastScaleDown, 258 | description='The timestamp of the last scale down activity.', 259 | ), 260 | 261 | lastScaleUpStat: 262 | mixinUtils.dashboards.statPanel( 263 | 'Last Scale Up', 264 | 's', 265 | queries.lastScaleUp, 266 | description='The timestamp of the last scale up activity.', 267 | ), 268 | 269 | podActivityTimeSeries: 270 | mixinUtils.dashboards.timeSeriesPanel( 271 | 'Pod Activity', 272 | 'short', 273 | [ 274 | { 275 | expr: queries.unschedulablePods, 276 | legend: '{{ type }}', 277 | }, 278 | { 279 | expr: queries.evictedPods, 280 | legend: 'Evicted / {{ eviction_result }}', 281 | }, 282 | ], 283 | description='The activity of pods in the cluster.', 284 | stack='normal' 285 | ), 286 | 287 | nodeActivityTimeSeries: 288 | mixinUtils.dashboards.timeSeriesPanel( 289 | 'Node Activity', 290 | 'short', 291 | queries.nodeActivity, 292 | '{{ state }}', 293 | description='The activity of nodes in the cluster.', 294 | stack='normal' 295 | ), 296 | 297 | autoscalingActivityTimeSeries: 298 | mixinUtils.dashboards.timeSeriesPanel( 299 | 'Autoscaling Activity', 300 | 'short', 301 | [ 302 | { 303 | expr: queries.totalNodes, 304 | legend: 'Total Nodes', 305 | }, 306 | { 307 | expr: 
queries.unneededNodes, 308 | legend: 'Unneeded', 309 | }, 310 | { 311 | expr: queries.scaledUpNodes, 312 | legend: 'Scaled Up', 313 | }, 314 | { 315 | expr: queries.scaledDownNodes, 316 | legend: 'Scaled Down', 317 | }, 318 | ], 319 | description='The autoscaling activity in the cluster.', 320 | fillOpacity=0, 321 | ), 322 | }; 323 | 324 | local rows = 325 | [ 326 | row.new('Summary') + 327 | row.gridPos.withX(0) + 328 | row.gridPos.withY(0) + 329 | row.gridPos.withW(24) + 330 | row.gridPos.withH(1), 331 | ] + 332 | grid.makeGrid( 333 | [ 334 | panels.totalNodesStat, 335 | panels.maxNodesStat, 336 | panels.nodeGroupsStat, 337 | panels.healthyNodesGauge, 338 | panels.safeToScaleStat, 339 | panels.numberUnscheduledPodsStat, 340 | panels.lastScaleDownStat, 341 | panels.lastScaleUpStat, 342 | ], 343 | panelWidth=3, 344 | panelHeight=4, 345 | startY=1 346 | ) + 347 | [ 348 | row.new('Activity') + 349 | row.gridPos.withX(0) + 350 | row.gridPos.withY(5) + 351 | row.gridPos.withW(24) + 352 | row.gridPos.withH(1), 353 | ] + 354 | grid.makeGrid( 355 | [ 356 | panels.podActivityTimeSeries, 357 | panels.nodeActivityTimeSeries, 358 | ], 359 | panelWidth=12, 360 | panelHeight=8, 361 | startY=6 362 | ) + 363 | grid.makeGrid( 364 | [ 365 | panels.autoscalingActivityTimeSeries, 366 | ], 367 | panelWidth=24, 368 | panelHeight=8, 369 | startY=14 370 | ); 371 | 372 | mixinUtils.dashboards.bypassDashboardValidation + 373 | dashboard.new( 374 | 'Kubernetes / Autoscaling / Cluster Autoscaler', 375 | ) + 376 | dashboard.withDescription('A dashboard that monitors the Cluster Autoscaler. 
%s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 377 | dashboard.withUid($._config.clusterAutoscalerDashboardUid) + 378 | dashboard.withTags($._config.tags + ['cluster-autoscaler']) + 379 | dashboard.withTimezone('utc') + 380 | dashboard.withEditable(true) + 381 | dashboard.time.withFrom('now-6h') + 382 | dashboard.time.withTo('now') + 383 | dashboard.withVariables(variables) + 384 | dashboard.withLinks( 385 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 386 | ) + 387 | dashboard.withPanels( 388 | rows 389 | ) + 390 | dashboard.withAnnotations( 391 | mixinUtils.dashboards.annotations($._config, defaultFilters) 392 | ), 393 | }, 394 | } 395 | -------------------------------------------------------------------------------- /dashboards/keda/keda-scaled-object.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | local tablePanel = g.panel.table; 10 | 11 | // Table 12 | local tbQueryOptions = tablePanel.queryOptions; 13 | local tbPanelOptions = tablePanel.panelOptions; 14 | 15 | { 16 | grafanaDashboards+:: { 17 | 'kubernetes-autoscaling-mixin-keda-so.json': 18 | if !$._config.keda.enabled then {} else 19 | 20 | local defaultVariables = util.variables($._config); 21 | 22 | local variables = [ 23 | defaultVariables.datasource, 24 | defaultVariables.cluster, 25 | defaultVariables.scaledObjectJob, 26 | defaultVariables.scaledObjectOperatorNamespace, 27 | defaultVariables.scaledObjectResourceNamespace, 28 | defaultVariables.scaledObject, 29 | 
defaultVariables.scalerForScaledObject, 30 | defaultVariables.metricForScaledObject, 31 | ]; 32 | 33 | local defaultFilters = util.filters($._config); 34 | 35 | local queries = { 36 | resourcesRegisteredByNamespace: ||| 37 | sum( 38 | keda_resource_registered_total{ 39 | %(base)s, 40 | type="scaled_object" 41 | } 42 | ) by (exported_namespace, type) 43 | ||| % defaultFilters, 44 | 45 | triggersByType: ||| 46 | sum( 47 | keda_trigger_registered_total{ 48 | %(base)s 49 | } 50 | ) by (type) 51 | ||| % defaultFilters, 52 | 53 | scaledObjectsErrors: ||| 54 | sum( 55 | increase( 56 | keda_scaled_object_errors_total{ 57 | %(withResourceNamespace)s 58 | }[$__rate_interval] 59 | ) 60 | ) by (exported_namespace, scaledObject) 61 | ||| % defaultFilters, 62 | 63 | scalerDetailErrors: ||| 64 | sum( 65 | increase( 66 | keda_scaler_detail_errors_total{ 67 | %(withResourceNamespace)s, 68 | type="scaledobject" 69 | }[$__rate_interval] 70 | ) 71 | ) by (exported_namespace, scaledObject, scaler) 72 | ||| % defaultFilters, 73 | 74 | scaledObjectsPaused: ||| 75 | sum( 76 | keda_scaled_object_paused{ 77 | %(withResourceNamespace)s 78 | } 79 | ) by (exported_namespace, scaledObject) 80 | > 0 81 | ||| % defaultFilters, 82 | 83 | scaleTargetValues: ||| 84 | sum( 85 | keda_scaler_metrics_value{ 86 | %(withResourceNamespace)s, 87 | type="scaledobject" 88 | } 89 | ) by (job, exported_namespace, scaledObject, scaler, metric) 90 | ||| % defaultFilters, 91 | 92 | scaledObjectPaused: ||| 93 | sum( 94 | keda_scaled_object_paused{ 95 | %(withScaledObject)s 96 | } 97 | ) by (exported_namespace, scaledObject) 98 | ||| % defaultFilters, 99 | 100 | scaledObjectActive: ||| 101 | sum( 102 | keda_scaler_active{ 103 | %(withScaledObject)s 104 | } 105 | ) by (exported_namespace, scaledObject) 106 | ||| % defaultFilters, 107 | 108 | scaledObjectDetailError: ||| 109 | sum( 110 | increase( 111 | keda_scaler_detail_errors_total{ 112 | %(withScaledObject)s 113 | }[$__rate_interval] 114 | ) 115 | ) by 
(exported_namespace, scaledObject) 116 | ||| % defaultFilters, 117 | 118 | scaledObjectMetricValue: ||| 119 | avg( 120 | keda_scaler_metrics_value{ 121 | %(withScaledObjectMetric)s 122 | } 123 | ) by (exported_namespace, scaledObject, scaler, metric) 124 | ||| % defaultFilters, 125 | 126 | scaledObjectMetricLatency: ||| 127 | avg( 128 | keda_scaler_metrics_latency_seconds{ 129 | %(withScaledObjectMetric)s 130 | } 131 | ) by (exported_namespace, scaledObject, scaler, metric) 132 | ||| % defaultFilters, 133 | }; 134 | 135 | local panels = { 136 | resourcesRegisteredTimeSeries: 137 | mixinUtils.dashboards.timeSeriesPanel( 138 | 'Resources Registered by Namespace', 139 | 'short', 140 | queries.resourcesRegisteredByNamespace, 141 | '{{ exported_namespace}} / {{ type }}', 142 | description='The number of scaled object resources registered by namespace.', 143 | stack='normal', 144 | ), 145 | 146 | triggersByTypeTimeSeries: 147 | mixinUtils.dashboards.timeSeriesPanel( 148 | 'Triggers by Type', 149 | 'short', 150 | queries.triggersByType, 151 | '{{ type }}', 152 | description='The number of triggers registered by type.', 153 | stack='normal', 154 | ), 155 | 156 | scaledObjectsErrorsTimeSeries: 157 | mixinUtils.dashboards.timeSeriesPanel( 158 | 'Scaled Objects Errors', 159 | 'short', 160 | queries.scaledObjectsErrors, 161 | '{{ scaledObject }}', 162 | description='The rate of errors for scaled objects.', 163 | stack='normal', 164 | ), 165 | 166 | scalerDetailErrorsTimeSeries: 167 | mixinUtils.dashboards.timeSeriesPanel( 168 | 'Scaler Detail Errors', 169 | 'short', 170 | queries.scalerDetailErrors, 171 | '{{ scaledObject }} / {{ scaler }}', 172 | description='The rate of scaler detail errors.', 173 | stack='normal', 174 | ), 175 | 176 | scaledObjectsPausedTimeSeries: 177 | mixinUtils.dashboards.timeSeriesPanel( 178 | 'Scaled Objects Paused', 179 | 'short', 180 | queries.scaledObjectsPaused, 181 | '{{ scaledObject }}', 182 | description='Scaled objects that are currently 
paused.', 183 | stack='normal', 184 | ), 185 | 186 | scaleTargetValuesTable: 187 | mixinUtils.dashboards.tablePanel( 188 | 'Scale Target Values', 189 | 'short', 190 | queries.scaleTargetValues, 191 | description='This table has links to the HPA for the scaled object, which can be used to see the current scaling status and history. The HPA dashboard can be found at [kubernetes-autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).', 192 | sortBy={ name: 'Scaled Object', desc: false }, 193 | transformations=[ 194 | tbQueryOptions.transformation.withId( 195 | 'organize' 196 | ) + 197 | tbQueryOptions.transformation.withOptions( 198 | { 199 | renameByName: { 200 | scaledObject: 'Scaled Object', 201 | exported_namespace: 'Resource Namespace', 202 | scaler: 'Scaler', 203 | metric: 'Metric', 204 | value: 'Value', 205 | }, 206 | indexByName: { 207 | scaledObject: 0, 208 | exported_namespace: 1, 209 | scaler: 2, 210 | metric: 3, 211 | value: 4, 212 | }, 213 | excludeByName: { 214 | Time: true, 215 | job: true, 216 | }, 217 | } 218 | ), 219 | ], 220 | links=[ 221 | tbPanelOptions.link.withTitle('Go to HPA') + 222 | tbPanelOptions.link.withUrl( 223 | '/d/%s/kubernetes-autoscaling-horizontal-pod-autoscaler?var-namespace=${__data.fields.namespace}&var-hpa=keda-hpa-${__data.fields.scaledObject}&var-metric_name=${__data.fields.metric}' % $._config.hpaDashboardUid 224 | ) + 225 | tbPanelOptions.link.withTargetBlank(true), 226 | ] 227 | ), 228 | 229 | scaledObjectPausedTimeSeries: 230 | mixinUtils.dashboards.timeSeriesPanel( 231 | 'Scaled Object Paused', 232 | 'short', 233 | queries.scaledObjectPaused, 234 | '{{ scaledObject }}', 235 | description='Whether the selected scaled object is paused.', 236 | ), 237 | 238 | scaledObjectActiveTimeSeries: 239 | mixinUtils.dashboards.timeSeriesPanel( 240 | 'Scaled Object Active', 241 | 'short', 242 | queries.scaledObjectActive, 243 | '{{ scaledObject }}', 244 | description='Whether the selected scaled object is 
active.', 245 | ), 246 | 247 | scaledObjectDetailErrorTimeSeries: 248 | mixinUtils.dashboards.timeSeriesPanel( 249 | 'Scaled Object Detail Errors', 250 | 'short', 251 | queries.scaledObjectDetailError, 252 | '{{ scaledObject }}', 253 | description='The rate of errors for the selected scaled object.', 254 | ), 255 | 256 | scaledObjectMetricValueTimeSeries: 257 | mixinUtils.dashboards.timeSeriesPanel( 258 | 'Scaled Object Metric Value', 259 | 'short', 260 | queries.scaledObjectMetricValue, 261 | '{{ scaledObject }} / {{ scaler }} / {{ metric }}', 262 | description='The metric value for the selected scaled object.', 263 | stack='normal', 264 | ), 265 | 266 | scaledObjectMetricLatencyTimeSeries: 267 | mixinUtils.dashboards.timeSeriesPanel( 268 | 'Scaled Object Metric Latency', 269 | 's', 270 | queries.scaledObjectMetricLatency, 271 | '{{ scaledObject }} / {{ scaler }} / {{ metric }}', 272 | description='The metric collection latency for the selected scaled object.', 273 | ), 274 | }; 275 | 276 | local rows = 277 | [ 278 | row.new('Overview') + 279 | row.gridPos.withX(0) + 280 | row.gridPos.withY(0) + 281 | row.gridPos.withW(24) + 282 | row.gridPos.withH(1), 283 | ] + 284 | grid.makeGrid( 285 | [ 286 | panels.resourcesRegisteredTimeSeries, 287 | panels.triggersByTypeTimeSeries, 288 | ], 289 | panelWidth=12, 290 | panelHeight=6, 291 | startY=1 292 | ) + 293 | grid.makeGrid( 294 | [ 295 | panels.scaledObjectsErrorsTimeSeries, 296 | panels.scalerDetailErrorsTimeSeries, 297 | panels.scaledObjectsPausedTimeSeries, 298 | ], 299 | panelWidth=8, 300 | panelHeight=6, 301 | startY=7 302 | ) + 303 | grid.makeGrid( 304 | [ 305 | panels.scaleTargetValuesTable, 306 | ], 307 | panelWidth=24, 308 | panelHeight=8, 309 | startY=13 310 | ) + 311 | [ 312 | row.new('Scaled Object $scaled_object / $scaler / $metric') + 313 | row.gridPos.withX(0) + 314 | row.gridPos.withY(21) + 315 | row.gridPos.withW(24) + 316 | row.gridPos.withH(1), 317 | ] + 318 | grid.makeGrid( 319 | [ 320 | 
panels.scaledObjectPausedTimeSeries, 321 | panels.scaledObjectActiveTimeSeries, 322 | panels.scaledObjectDetailErrorTimeSeries, 323 | ], 324 | panelWidth=8, 325 | panelHeight=5, 326 | startY=22 327 | ) + 328 | grid.makeGrid( 329 | [ 330 | panels.scaledObjectMetricValueTimeSeries, 331 | panels.scaledObjectMetricLatencyTimeSeries, 332 | ], 333 | panelWidth=24, 334 | panelHeight=8, 335 | startY=27 336 | ); 337 | 338 | mixinUtils.dashboards.bypassDashboardValidation + 339 | dashboard.new( 340 | 'Kubernetes / Autoscaling / KEDA / Scaled Object', 341 | ) + 342 | dashboard.withDescription('A dashboard that monitors KEDA Scaled Objects. %s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 343 | dashboard.withUid($._config.kedaScaledObjectDashboardUid) + 344 | dashboard.withTags($._config.tags + ['keda']) + 345 | dashboard.withTimezone('utc') + 346 | dashboard.withEditable(true) + 347 | dashboard.time.withFrom('now-6h') + 348 | dashboard.time.withTo('now') + 349 | dashboard.withVariables(variables) + 350 | dashboard.withLinks( 351 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 352 | ) + 353 | dashboard.withPanels(rows), 354 | }, 355 | } 356 | -------------------------------------------------------------------------------- /scripts/go.mod: -------------------------------------------------------------------------------- 1 | module _ 2 | 3 | go 1.24.0 4 | 5 | toolchain go1.24.1 6 | 7 | require ( 8 | github.com/Kunde21/markdownfmt/v3 v3.1.0 9 | github.com/cloudflare/pint v0.74.6 10 | github.com/errata-ai/vale/v3 v3.12.0 11 | github.com/google/go-jsonnet v0.21.0 12 | github.com/grafana/dashboard-linter v0.0.0-20231114210226-c458893a5731 13 | github.com/jsonnet-bundler/jsonnet-bundler v0.6.0 14 | github.com/prometheus/prometheus v0.304.2 15 | ) 16 | 17 | require ( 18 | atomicgo.dev/cursor v0.2.0 // indirect 19 | atomicgo.dev/keyboard 
v0.2.9 // indirect 20 | atomicgo.dev/schedule v0.1.0 // indirect 21 | cloud.google.com/go v0.115.1 // indirect 22 | cloud.google.com/go/auth v0.16.0 // indirect 23 | cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect 24 | cloud.google.com/go/compute/metadata v0.6.0 // indirect 25 | cloud.google.com/go/iam v1.2.0 // indirect 26 | cloud.google.com/go/storage v1.43.0 // indirect 27 | dario.cat/mergo v1.0.1 // indirect 28 | github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 // indirect 29 | github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.9.0 // indirect 30 | github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.1 // indirect 31 | github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5 v5.7.0 // indirect 32 | github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v4 v4.3.0 // indirect 33 | github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect 34 | github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 // indirect 35 | github.com/BurntSushi/toml v1.4.0 // indirect 36 | github.com/Code-Hex/go-generics-cache v1.5.1 // indirect 37 | github.com/Masterminds/goutils v1.1.1 // indirect 38 | github.com/Masterminds/semver/v3 v3.3.1 // indirect 39 | github.com/Masterminds/sprig/v3 v3.3.0 // indirect 40 | github.com/Microsoft/go-winio v0.6.1 // indirect 41 | github.com/adrg/frontmatter v0.2.0 // indirect 42 | github.com/adrg/strutil v0.3.1 // indirect 43 | github.com/adrg/xdg v0.5.3 // indirect 44 | github.com/agext/levenshtein v1.2.1 // indirect 45 | github.com/alecthomas/kingpin/v2 v2.4.0 // indirect 46 | github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect 47 | github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b // indirect 48 | github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect 49 | github.com/armon/go-metrics v0.4.1 // indirect 50 | github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect 51 | github.com/aws/aws-sdk-go v1.55.7 
// indirect 52 | github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3 // indirect 53 | github.com/benbjohnson/clock v1.3.5 // indirect 54 | github.com/beorn7/perks v1.0.1 // indirect 55 | github.com/bmatcuk/doublestar/v4 v4.7.1 // indirect 56 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 57 | github.com/cncf/xds/go v0.0.0-20250121191232-2f005788dc42 // indirect 58 | github.com/containerd/console v1.0.3 // indirect 59 | github.com/coreos/go-systemd/v22 v22.5.0 // indirect 60 | github.com/d5/tengo/v2 v2.17.0 // indirect 61 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 62 | github.com/dennwc/varint v1.0.0 // indirect 63 | github.com/digitalocean/godo v1.144.0 // indirect 64 | github.com/distribution/reference v0.5.0 // indirect 65 | github.com/docker/docker v28.1.1+incompatible // indirect 66 | github.com/docker/go-connections v0.4.0 // indirect 67 | github.com/docker/go-units v0.5.0 // indirect 68 | github.com/edsrzf/mmap-go v1.2.0 // indirect 69 | github.com/elliotchance/orderedmap/v2 v2.2.0 // indirect 70 | github.com/emicklei/go-restful/v3 v3.11.0 // indirect 71 | github.com/envoyproxy/go-control-plane/envoy v1.32.4 // indirect 72 | github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect 73 | github.com/errata-ai/ini v1.63.0 // indirect 74 | github.com/errata-ai/regexp2 v1.7.0 // indirect 75 | github.com/expr-lang/expr v1.17.0 // indirect 76 | github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb // indirect 77 | github.com/fatih/color v1.18.0 // indirect 78 | github.com/felixge/httpsnoop v1.0.4 // indirect 79 | github.com/fsnotify/fsnotify v1.8.0 // indirect 80 | github.com/fxamacker/cbor/v2 v2.7.0 // indirect 81 | github.com/ghodss/yaml v1.0.0 // indirect 82 | github.com/go-logr/logr v1.4.2 // indirect 83 | github.com/go-logr/stdr v1.2.2 // indirect 84 | github.com/go-openapi/analysis v0.23.0 // indirect 85 | github.com/go-openapi/errors v0.22.0 // indirect 86 | github.com/go-openapi/jsonpointer v0.21.0 
// indirect 87 | github.com/go-openapi/jsonreference v0.21.0 // indirect 88 | github.com/go-openapi/loads v0.22.0 // indirect 89 | github.com/go-openapi/spec v0.21.0 // indirect 90 | github.com/go-openapi/strfmt v0.23.0 // indirect 91 | github.com/go-openapi/swag v0.23.0 // indirect 92 | github.com/go-openapi/validate v0.24.0 // indirect 93 | github.com/go-resty/resty/v2 v2.16.5 // indirect 94 | github.com/go-viper/mapstructure/v2 v2.3.0 // indirect 95 | github.com/go-zookeeper/zk v1.0.4 // indirect 96 | github.com/gobwas/glob v0.2.3 // indirect 97 | github.com/gogo/protobuf v1.3.2 // indirect 98 | github.com/golang-jwt/jwt/v5 v5.2.2 // indirect 99 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 100 | github.com/golang/protobuf v1.5.4 // indirect 101 | github.com/golang/snappy v1.0.0 // indirect 102 | github.com/google/gnostic-models v0.6.8 // indirect 103 | github.com/google/go-cmp v0.7.0 // indirect 104 | github.com/google/go-github/v73 v73.0.0 // indirect 105 | github.com/google/go-querystring v1.1.0 // indirect 106 | github.com/google/gofuzz v1.2.0 // indirect 107 | github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect 108 | github.com/google/s2a-go v0.1.9 // indirect 109 | github.com/google/uuid v1.6.0 // indirect 110 | github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect 111 | github.com/googleapis/gax-go/v2 v2.14.1 // indirect 112 | github.com/gookit/color v1.5.4 // indirect 113 | github.com/gophercloud/gophercloud/v2 v2.7.0 // indirect 114 | github.com/gorilla/websocket v1.5.0 // indirect 115 | github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect 116 | github.com/hashicorp/consul/api v1.32.0 // indirect 117 | github.com/hashicorp/cronexpr v1.1.2 // indirect 118 | github.com/hashicorp/errwrap v1.1.0 // indirect 119 | github.com/hashicorp/go-cleanhttp v0.5.2 // indirect 120 | github.com/hashicorp/go-hclog v1.6.3 // indirect 121 | github.com/hashicorp/go-immutable-radix v1.3.1 
// indirect 122 | github.com/hashicorp/go-multierror v1.1.1 // indirect 123 | github.com/hashicorp/go-retryablehttp v0.7.8 // indirect 124 | github.com/hashicorp/go-rootcerts v1.0.2 // indirect 125 | github.com/hashicorp/go-version v1.7.0 // indirect 126 | github.com/hashicorp/golang-lru v0.6.0 // indirect 127 | github.com/hashicorp/hcl v1.0.0 // indirect 128 | github.com/hashicorp/hcl/v2 v2.24.0 // indirect 129 | github.com/hashicorp/nomad/api v0.0.0-20241218080744-e3ac00f30eec // indirect 130 | github.com/hashicorp/serf v0.10.1 // indirect 131 | github.com/hetznercloud/hcloud-go/v2 v2.21.0 // indirect 132 | github.com/huandu/xstrings v1.5.0 // indirect 133 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 134 | github.com/ionos-cloud/sdk-go/v6 v6.3.3 // indirect 135 | github.com/jdkato/go-tree-sitter-julia v0.1.0 // indirect 136 | github.com/jdkato/twine v0.10.2 // indirect 137 | github.com/jmespath/go-jmespath v0.4.0 // indirect 138 | github.com/josharian/intern v1.0.0 // indirect 139 | github.com/jpillora/backoff v1.0.0 // indirect 140 | github.com/json-iterator/go v1.1.12 // indirect 141 | github.com/klauspost/compress v1.18.0 // indirect 142 | github.com/knadh/koanf/maps v0.1.2 // indirect 143 | github.com/knadh/koanf/providers/confmap v0.1.0 // indirect 144 | github.com/knadh/koanf/v2 v2.1.2 // indirect 145 | github.com/kolo/xmlrpc v0.0.0-20220921171641-a4b6fa1dd06b // indirect 146 | github.com/kylelemons/godebug v1.1.0 // indirect 147 | github.com/linode/linodego v1.49.0 // indirect 148 | github.com/lithammer/fuzzysearch v1.1.8 // indirect 149 | github.com/magiconair/properties v1.8.7 // indirect 150 | github.com/mailru/easyjson v0.7.7 // indirect 151 | github.com/mattn/go-colorable v0.1.13 // indirect 152 | github.com/mattn/go-isatty v0.0.20 // indirect 153 | github.com/mattn/go-runewidth v0.0.16 // indirect 154 | github.com/mdlayher/socket v0.4.1 // indirect 155 | github.com/mdlayher/vsock v1.2.1 // indirect 156 | github.com/miekg/dns v1.1.65 // 
indirect 157 | github.com/mitchellh/copystructure v1.2.0 // indirect 158 | github.com/mitchellh/go-homedir v1.1.0 // indirect 159 | github.com/mitchellh/go-wordwrap v1.0.1 // indirect 160 | github.com/mitchellh/mapstructure v1.5.0 // indirect 161 | github.com/mitchellh/reflectwalk v1.0.2 // indirect 162 | github.com/moby/docker-image-spec v1.3.1 // indirect 163 | github.com/moby/sys/sequential v0.6.0 // indirect 164 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 165 | github.com/modern-go/reflect2 v1.0.2 // indirect 166 | github.com/montanaflynn/stats v0.7.1 // indirect 167 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 168 | github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect 169 | github.com/niklasfasching/go-org v1.7.0 // indirect 170 | github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1 // indirect 171 | github.com/oklog/ulid v1.3.1 // indirect 172 | github.com/oklog/ulid/v2 v2.1.0 // indirect 173 | github.com/olekukonko/tablewriter v0.0.5 // indirect 174 | github.com/open-telemetry/opentelemetry-collector-contrib/internal/exp/metrics v0.124.1 // indirect 175 | github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.124.1 // indirect 176 | github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatocumulativeprocessor v0.124.1 // indirect 177 | github.com/opencontainers/go-digest v1.0.0 // indirect 178 | github.com/opencontainers/image-spec v1.0.2 // indirect 179 | github.com/otiai10/copy v1.14.0 // indirect 180 | github.com/ovh/go-ovh v1.7.0 // indirect 181 | github.com/pelletier/go-toml/v2 v2.2.3 // indirect 182 | github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect 183 | github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e // indirect 184 | github.com/pkg/errors v0.9.1 // indirect 185 | github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect 186 | github.com/pmezard/go-difflib 
v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 187 | github.com/prometheus/alertmanager v0.28.1 // indirect 188 | github.com/prometheus/client_golang v1.22.0 // indirect 189 | github.com/prometheus/client_model v0.6.2 // indirect 190 | github.com/prometheus/common v0.65.0 // indirect 191 | github.com/prometheus/exporter-toolkit v0.14.0 // indirect 192 | github.com/prometheus/otlptranslator v0.0.0-20250320144820-d800c8b0eb07 // indirect 193 | github.com/prometheus/procfs v0.15.1 // indirect 194 | github.com/prometheus/sigv4 v0.1.2 // indirect 195 | github.com/prymitive/current v0.1.1 // indirect 196 | github.com/pterm/pterm v0.12.76 // indirect 197 | github.com/puzpuzpuz/xsync/v3 v3.5.1 // indirect 198 | github.com/remeh/sizedwaitgroup v1.0.0 // indirect 199 | github.com/rivo/uniseg v0.4.7 // indirect 200 | github.com/scaleway/scaleway-sdk-go v1.0.0-beta.33 // indirect 201 | github.com/shopspring/decimal v1.4.0 // indirect 202 | github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82 // indirect 203 | github.com/spf13/afero v1.10.0 // indirect 204 | github.com/spf13/cast v1.7.1 // indirect 205 | github.com/spf13/cobra v1.8.0 // indirect 206 | github.com/spf13/jwalterweatherman v1.1.0 // indirect 207 | github.com/spf13/pflag v1.0.5 // indirect 208 | github.com/spf13/viper v1.16.0 // indirect 209 | github.com/stretchr/testify v1.10.0 // indirect 210 | github.com/subosito/gotenv v1.4.2 // indirect 211 | github.com/tomwright/dasel/v2 v2.8.1 // indirect 212 | github.com/urfave/cli/v3 v3.3.8 // indirect 213 | github.com/vultr/govultr/v2 v2.17.2 // indirect 214 | github.com/x448/float16 v0.8.4 // indirect 215 | github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect 216 | github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect 217 | github.com/xeipuuv/gojsonschema v1.2.0 // indirect 218 | github.com/xhit/go-str2duration/v2 v2.1.0 // indirect 219 | github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // 
indirect 220 | github.com/yuin/goldmark v1.7.8 // indirect 221 | github.com/zclconf/go-cty v1.16.3 // indirect 222 | github.com/zeitlinger/conflate v0.0.0-20230622100834-279724abda8c // indirect 223 | gitlab.com/gitlab-org/api/client-go v0.137.0 // indirect 224 | go.mongodb.org/mongo-driver v1.14.0 // indirect 225 | go.opencensus.io v0.24.0 // indirect 226 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 227 | go.opentelemetry.io/collector/component v1.30.0 // indirect 228 | go.opentelemetry.io/collector/confmap v1.30.0 // indirect 229 | go.opentelemetry.io/collector/confmap/xconfmap v0.124.0 // indirect 230 | go.opentelemetry.io/collector/consumer v1.30.0 // indirect 231 | go.opentelemetry.io/collector/featuregate v1.30.0 // indirect 232 | go.opentelemetry.io/collector/internal/telemetry v0.124.0 // indirect 233 | go.opentelemetry.io/collector/pdata v1.30.0 // indirect 234 | go.opentelemetry.io/collector/pipeline v0.124.0 // indirect 235 | go.opentelemetry.io/collector/processor v1.30.0 // indirect 236 | go.opentelemetry.io/collector/semconv v0.124.0 // indirect 237 | go.opentelemetry.io/contrib/bridges/otelzap v0.10.0 // indirect 238 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect 239 | go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.60.0 // indirect 240 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect 241 | go.opentelemetry.io/otel v1.35.0 // indirect 242 | go.opentelemetry.io/otel/log v0.11.0 // indirect 243 | go.opentelemetry.io/otel/metric v1.35.0 // indirect 244 | go.opentelemetry.io/otel/sdk v1.35.0 // indirect 245 | go.opentelemetry.io/otel/trace v1.35.0 // indirect 246 | go.uber.org/atomic v1.11.0 // indirect 247 | go.uber.org/automaxprocs v1.6.0 // indirect 248 | go.uber.org/goleak v1.3.0 // indirect 249 | go.uber.org/multierr v1.11.0 // indirect 250 | go.uber.org/ratelimit v0.3.1 // indirect 251 | go.uber.org/zap v1.27.0 // indirect 252 
| golang.org/x/crypto v0.38.0 // indirect 253 | golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect 254 | golang.org/x/mod v0.24.0 // indirect 255 | golang.org/x/net v0.40.0 // indirect 256 | golang.org/x/oauth2 v0.30.0 // indirect 257 | golang.org/x/sync v0.14.0 // indirect 258 | golang.org/x/sys v0.33.0 // indirect 259 | golang.org/x/term v0.32.0 // indirect 260 | golang.org/x/text v0.25.0 // indirect 261 | golang.org/x/time v0.12.0 // indirect 262 | golang.org/x/tools v0.32.0 // indirect 263 | google.golang.org/api v0.230.0 // indirect 264 | google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1 // indirect 265 | google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect 266 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e // indirect 267 | google.golang.org/grpc v1.72.0 // indirect 268 | google.golang.org/protobuf v1.36.6 // indirect 269 | gopkg.in/alecthomas/kingpin.v2 v2.2.6 // indirect 270 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 271 | gopkg.in/inf.v0 v0.9.1 // indirect 272 | gopkg.in/ini.v1 v1.67.0 // indirect 273 | gopkg.in/neurosnap/sentences.v1 v1.0.7 // indirect 274 | gopkg.in/yaml.v2 v2.4.0 // indirect 275 | gopkg.in/yaml.v3 v3.0.1 // indirect 276 | k8s.io/api v0.32.3 // indirect 277 | k8s.io/apimachinery v0.32.3 // indirect 278 | k8s.io/client-go v0.32.3 // indirect 279 | k8s.io/klog/v2 v2.130.1 // indirect 280 | k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect 281 | k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect 282 | sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect 283 | sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect 284 | sigs.k8s.io/yaml v1.4.0 // indirect 285 | ) 286 | 287 | // TODO: This could be removed after https://github.com/mholt/archiver/pull/396 merged 288 | replace github.com/mholt/archiver/v3 => github.com/anchore/archiver/v3 v3.5.2 289 | 
-------------------------------------------------------------------------------- /dashboards_out/kubernetes-autoscaling-mixin-karpenter-act.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ ], 3 | "__requires": [ ], 4 | "annotations": { 5 | "list": [ ] 6 | }, 7 | "description": "A dashboard that monitors Karpenter and focuses on Karpenter deletion/creation activity. The dashboards were generated using [kubernetes-autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin). Open issues and create feature requests in the repository.", 8 | "editable": true, 9 | "links": [ 10 | { 11 | "asDropdown": true, 12 | "includeVars": false, 13 | "keepTime": true, 14 | "tags": [ 15 | "kubernetes", 16 | "autoscaling", 17 | "kubernetes-autoscaling-mixin" 18 | ], 19 | "targetBlank": true, 20 | "title": "Kubernetes / Autoscaling", 21 | "type": "dashboards" 22 | } 23 | ], 24 | "panels": [ 25 | { 26 | "collapsed": false, 27 | "gridPos": { 28 | "h": 1, 29 | "w": 24, 30 | "x": 0, 31 | "y": 0 32 | }, 33 | "id": 1, 34 | "title": "Node Pool Activity", 35 | "type": "row" 36 | }, 37 | { 38 | "datasource": { 39 | "type": "prometheus", 40 | "uid": "$datasource" 41 | }, 42 | "description": "The number of nodes created by node pool.", 43 | "fieldConfig": { 44 | "defaults": { 45 | "custom": { 46 | "axisSoftMin": 0, 47 | "fillOpacity": 100, 48 | "lineWidth": 1, 49 | "stacking": { 50 | "mode": "normal" 51 | } 52 | }, 53 | "unit": "short" 54 | }, 55 | "overrides": [ ] 56 | }, 57 | "gridPos": { 58 | "h": 6, 59 | "w": 12, 60 | "x": 0, 61 | "y": 1 62 | }, 63 | "id": 2, 64 | "options": { 65 | "legend": { 66 | "calcs": [ 67 | "mean", 68 | "max" 69 | ], 70 | "displayMode": "table", 71 | "placement": "right", 72 | "showLegend": true, 73 | "sortBy": "Mean", 74 | "sortDesc": true 75 | }, 76 | "tooltip": { 77 | "mode": "multi", 78 | "sort": "desc" 79 | } 80 | }, 81 | "pluginVersion": "v11.4.0", 82 | "targets": [ 83 | { 84 | 
"datasource": { 85 | "type": "prometheus", 86 | "uid": "$datasource" 87 | }, 88 | "exemplar": false, 89 | "expr": "round(\n sum(\n increase(\n karpenter_nodes_created_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool)\n)\n", 90 | "legendFormat": "{{ nodepool }}" 91 | } 92 | ], 93 | "title": "Nodes Created by Node Pool", 94 | "type": "timeseries" 95 | }, 96 | { 97 | "datasource": { 98 | "type": "prometheus", 99 | "uid": "$datasource" 100 | }, 101 | "description": "The number of nodes terminated by node pool.", 102 | "fieldConfig": { 103 | "defaults": { 104 | "custom": { 105 | "axisSoftMin": 0, 106 | "fillOpacity": 100, 107 | "lineWidth": 1, 108 | "stacking": { 109 | "mode": "normal" 110 | } 111 | }, 112 | "unit": "short" 113 | }, 114 | "overrides": [ ] 115 | }, 116 | "gridPos": { 117 | "h": 6, 118 | "w": 12, 119 | "x": 12, 120 | "y": 1 121 | }, 122 | "id": 3, 123 | "options": { 124 | "legend": { 125 | "calcs": [ 126 | "mean", 127 | "max" 128 | ], 129 | "displayMode": "table", 130 | "placement": "right", 131 | "showLegend": true, 132 | "sortBy": "Mean", 133 | "sortDesc": true 134 | }, 135 | "tooltip": { 136 | "mode": "multi", 137 | "sort": "desc" 138 | } 139 | }, 140 | "pluginVersion": "v11.4.0", 141 | "targets": [ 142 | { 143 | "datasource": { 144 | "type": "prometheus", 145 | "uid": "$datasource" 146 | }, 147 | "exemplar": false, 148 | "expr": "round(\n sum(\n increase(\n karpenter_nodes_terminated_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool)\n)\n", 149 | "legendFormat": "{{ nodepool }}" 150 | } 151 | ], 152 | "title": "Nodes Terminated by Node Pool", 153 | "type": "timeseries" 154 | }, 155 | { 156 | "datasource": { 157 | "type": "prometheus", 158 | "uid": "$datasource" 159 | }, 160 | "description": "The number of voluntary disruption decisions by reason and decision.", 161 | "fieldConfig": { 162 | "defaults": { 163 | 
"custom": { 164 | "axisSoftMin": 0, 165 | "fillOpacity": 100, 166 | "lineWidth": 1, 167 | "stacking": { 168 | "mode": "normal" 169 | } 170 | }, 171 | "unit": "short" 172 | }, 173 | "overrides": [ ] 174 | }, 175 | "gridPos": { 176 | "h": 6, 177 | "w": 12, 178 | "x": 0, 179 | "y": 7 180 | }, 181 | "id": 4, 182 | "options": { 183 | "legend": { 184 | "calcs": [ 185 | "mean", 186 | "max" 187 | ], 188 | "displayMode": "table", 189 | "placement": "right", 190 | "showLegend": true, 191 | "sortBy": "Mean", 192 | "sortDesc": true 193 | }, 194 | "tooltip": { 195 | "mode": "multi", 196 | "sort": "desc" 197 | } 198 | }, 199 | "pluginVersion": "v11.4.0", 200 | "targets": [ 201 | { 202 | "datasource": { 203 | "type": "prometheus", 204 | "uid": "$datasource" 205 | }, 206 | "exemplar": false, 207 | "expr": "round(\n sum(\n increase(\n karpenter_voluntary_disruption_decisions_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n ) by (decision, reason)\n)\n", 208 | "legendFormat": "{{ decision }} - {{ reason }}" 209 | } 210 | ], 211 | "title": "Node Disruption Decisions by Reason and Decision", 212 | "type": "timeseries" 213 | }, 214 | { 215 | "datasource": { 216 | "type": "prometheus", 217 | "uid": "$datasource" 218 | }, 219 | "description": "The number of nodes eligible for voluntary disruption by reason.", 220 | "fieldConfig": { 221 | "defaults": { 222 | "custom": { 223 | "axisSoftMin": 0, 224 | "fillOpacity": 100, 225 | "lineWidth": 1, 226 | "stacking": { 227 | "mode": "normal" 228 | } 229 | }, 230 | "unit": "short" 231 | }, 232 | "overrides": [ ] 233 | }, 234 | "gridPos": { 235 | "h": 6, 236 | "w": 12, 237 | "x": 12, 238 | "y": 7 239 | }, 240 | "id": 5, 241 | "options": { 242 | "legend": { 243 | "calcs": [ 244 | "mean", 245 | "max" 246 | ], 247 | "displayMode": "table", 248 | "placement": "right", 249 | "showLegend": true, 250 | "sortBy": "Mean", 251 | "sortDesc": true 252 | }, 253 | "tooltip": { 254 | "mode": "multi", 255 | "sort": "desc" 256 | } 257 | }, 
258 | "pluginVersion": "v11.4.0", 259 | "targets": [ 260 | { 261 | "datasource": { 262 | "type": "prometheus", 263 | "uid": "$datasource" 264 | }, 265 | "exemplar": false, 266 | "expr": "round(\n sum(\n karpenter_voluntary_disruption_eligible_nodes{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }\n ) by (reason)\n)\n", 267 | "legendFormat": "{{ reason }}" 268 | } 269 | ], 270 | "title": "Nodes Eligible for Disruption by Reason", 271 | "type": "timeseries" 272 | }, 273 | { 274 | "datasource": { 275 | "type": "prometheus", 276 | "uid": "$datasource" 277 | }, 278 | "description": "The number of nodes disrupted by node pool, capacity type, and reason.", 279 | "fieldConfig": { 280 | "defaults": { 281 | "custom": { 282 | "axisSoftMin": 0, 283 | "fillOpacity": 100, 284 | "lineWidth": 1, 285 | "stacking": { 286 | "mode": "normal" 287 | } 288 | }, 289 | "unit": "short" 290 | }, 291 | "overrides": [ ] 292 | }, 293 | "gridPos": { 294 | "h": 6, 295 | "w": 24, 296 | "x": 0, 297 | "y": 13 298 | }, 299 | "id": 6, 300 | "options": { 301 | "legend": { 302 | "calcs": [ 303 | "mean", 304 | "max" 305 | ], 306 | "displayMode": "table", 307 | "placement": "right", 308 | "showLegend": true, 309 | "sortBy": "Mean", 310 | "sortDesc": true 311 | }, 312 | "tooltip": { 313 | "mode": "multi", 314 | "sort": "desc" 315 | } 316 | }, 317 | "pluginVersion": "v11.4.0", 318 | "targets": [ 319 | { 320 | "datasource": { 321 | "type": "prometheus", 322 | "uid": "$datasource" 323 | }, 324 | "exemplar": false, 325 | "expr": "round(\n sum(\n increase(\n karpenter_nodeclaims_disrupted_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool, capacity_type, reason)\n)\n", 326 | "legendFormat": "{{ nodepool }} - {{ capacity_type }} - {{ reason }}" 327 | } 328 | ], 329 | "title": "Nodes Disrupted by Node Pool", 330 | "type": "timeseries" 331 | }, 332 | { 333 | "collapsed": false, 334 | "gridPos": { 335 | "h": 1, 336 | "w": 24, 337 | "x": 0, 338 | "y": 
19 339 | }, 340 | "id": 7, 341 | "title": "Pod Activity", 342 | "type": "row" 343 | }, 344 | { 345 | "datasource": { 346 | "type": "prometheus", 347 | "uid": "$datasource" 348 | }, 349 | "description": "The number of pods by phase.", 350 | "fieldConfig": { 351 | "defaults": { 352 | "custom": { 353 | "axisSoftMin": 0, 354 | "fillOpacity": 100, 355 | "lineWidth": 1, 356 | "stacking": { 357 | "mode": "normal" 358 | } 359 | }, 360 | "unit": "short" 361 | }, 362 | "overrides": [ ] 363 | }, 364 | "gridPos": { 365 | "h": 6, 366 | "w": 12, 367 | "x": 0, 368 | "y": 20 369 | }, 370 | "id": 8, 371 | "options": { 372 | "legend": { 373 | "calcs": [ 374 | "mean", 375 | "max" 376 | ], 377 | "displayMode": "table", 378 | "placement": "right", 379 | "showLegend": true, 380 | "sortBy": "Mean", 381 | "sortDesc": true 382 | }, 383 | "tooltip": { 384 | "mode": "multi", 385 | "sort": "desc" 386 | } 387 | }, 388 | "pluginVersion": "v11.4.0", 389 | "targets": [ 390 | { 391 | "datasource": { 392 | "type": "prometheus", 393 | "uid": "$datasource" 394 | }, 395 | "exemplar": false, 396 | "expr": "round(\n sum(\n karpenter_pods_state{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }\n ) by (phase)\n)\n", 397 | "legendFormat": "{{ phase }}" 398 | } 399 | ], 400 | "title": "Pods by Phase", 401 | "type": "timeseries" 402 | }, 403 | { 404 | "datasource": { 405 | "type": "prometheus", 406 | "uid": "$datasource" 407 | }, 408 | "description": "The duration for pods to start up.", 409 | "fieldConfig": { 410 | "defaults": { 411 | "custom": { 412 | "fillOpacity": 0 413 | }, 414 | "unit": "s" 415 | }, 416 | "overrides": [ ] 417 | }, 418 | "gridPos": { 419 | "h": 6, 420 | "w": 12, 421 | "x": 12, 422 | "y": 20 423 | }, 424 | "id": 9, 425 | "options": { 426 | "legend": { 427 | "calcs": [ 428 | "mean", 429 | "max" 430 | ], 431 | "displayMode": "table", 432 | "placement": "right", 433 | "showLegend": true, 434 | "sortBy": "Mean", 435 | "sortDesc": true 436 | }, 437 | "tooltip": { 438 | "mode": "multi", 439 | 
"sort": "desc" 440 | } 441 | }, 442 | "pluginVersion": "v11.4.0", 443 | "targets": [ 444 | { 445 | "datasource": { 446 | "type": "prometheus", 447 | "uid": "$datasource" 448 | }, 449 | "exemplar": false, 450 | "expr": "max(\n karpenter_pods_startup_duration_seconds{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n quantile=\"0.5\"\n }\n)\n", 451 | "legendFormat": "P50" 452 | }, 453 | { 454 | "datasource": { 455 | "type": "prometheus", 456 | "uid": "$datasource" 457 | }, 458 | "exemplar": false, 459 | "expr": "max(\n karpenter_pods_startup_duration_seconds{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n quantile=\"0.95\"\n }\n)\n", 460 | "legendFormat": "P95" 461 | }, 462 | { 463 | "datasource": { 464 | "type": "prometheus", 465 | "uid": "$datasource" 466 | }, 467 | "exemplar": false, 468 | "expr": "max(\n karpenter_pods_startup_duration_seconds{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n quantile=\"0.99\"\n }\n)\n", 469 | "legendFormat": "P99" 470 | } 471 | ], 472 | "title": "Pods Startup Duration", 473 | "type": "timeseries" 474 | } 475 | ], 476 | "schemaVersion": 39, 477 | "tags": [ 478 | "kubernetes", 479 | "autoscaling", 480 | "kubernetes-autoscaling-mixin", 481 | "karpenter" 482 | ], 483 | "templating": { 484 | "list": [ 485 | { 486 | "current": { 487 | "selected": true, 488 | "text": "default", 489 | "value": "default" 490 | }, 491 | "label": "Data source", 492 | "name": "datasource", 493 | "query": "prometheus", 494 | "type": "datasource" 495 | }, 496 | { 497 | "datasource": { 498 | "type": "prometheus", 499 | "uid": "${datasource}" 500 | }, 501 | "hide": 2, 502 | "label": "Cluster", 503 | "name": "cluster", 504 | "query": "label_values(kube_pod_info{job=~\"kube-state-metrics\"}, cluster)", 505 | "refresh": 2, 506 | "sort": 1, 507 | "type": "query" 508 | }, 509 | { 510 | "datasource": { 511 | "type": "prometheus", 512 | "uid": "${datasource}" 513 | }, 514 | "label": "Job", 515 | "name": "job", 516 | "query": 
"label_values(karpenter_nodes_allocatable{cluster=\"$cluster\"}, job)", 517 | "refresh": 2, 518 | "sort": 1, 519 | "type": "query" 520 | }, 521 | { 522 | "datasource": { 523 | "type": "prometheus", 524 | "uid": "${datasource}" 525 | }, 526 | "includeAll": true, 527 | "label": "Node Pool", 528 | "multi": true, 529 | "name": "nodepool", 530 | "query": "label_values(karpenter_nodes_allocatable{cluster=\"$cluster\", job=~\"$job\"}, nodepool)", 531 | "refresh": 2, 532 | "sort": 1, 533 | "type": "query" 534 | } 535 | ] 536 | }, 537 | "time": { 538 | "from": "now-24h", 539 | "to": "now" 540 | }, 541 | "timezone": "utc", 542 | "title": "Kubernetes / Autoscaling / Karpenter / Activity", 543 | "uid": "kubernetes-autoscaling-mixin-kact-jkwq" 544 | } 545 | -------------------------------------------------------------------------------- /dashboards/karpenter/karpenter-performance.libsonnet: --------------------------------------------------------------------------------
// Grafana dashboard: Kubernetes / Autoscaling / Karpenter / Performance.
// Covers cluster-state sync, cloud-provider errors, node-termination and
// pod-startup latency, the interruption queue, the controller work queue,
// and controller reconcile rates/results.
local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet';
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local util = import 'util.libsonnet';

local dashboard = g.dashboard;
local row = g.panel.row;
local grid = g.util.grid;

// Stat panel helpers
local stat = g.panel.stat;
local stStandardOptions = stat.standardOptions;

{
  grafanaDashboards+:: {
    'kubernetes-autoscaling-mixin-karpenter-perf.json':
      // Only emit the dashboard when Karpenter support is enabled in the config.
      if !$._config.karpenter.enabled then {} else

        local defaultVariables = util.variables($._config);

        local variables = [
          defaultVariables.datasource,
          defaultVariables.cluster,
          defaultVariables.job,
        ];

        // `%(base)s` expands to the shared cluster/job label selector.
        local defaultFilters = util.filters($._config);
        local queries = {
          // Summary
          clusterStateSynced: |||
            sum(
              karpenter_cluster_state_synced{
                %(base)s
              }
            ) by (job)
          ||| % defaultFilters,

          clusterStateNodeCount: |||
            sum(
              karpenter_cluster_state_node_count{
                %(base)s
              }
            ) by (job)
          ||| % defaultFilters,

          cloudProviderErrors: |||
            round(
              sum(
                increase(
                  karpenter_cloudprovider_errors_total{
                    %(base)s
                  }[$__rate_interval]
                )
              ) by (job, provider, controller, method, error)
            )
          ||| % defaultFilters,

          // Node Termination -- quantiles come from Karpenter's pre-computed
          // summary metric (`quantile` label), max() picks the worst series.
          nodeTerminationP50Duration: |||
            max(
              karpenter_nodes_termination_duration_seconds{
                %(base)s,
                quantile="0.5"
              }
            )
          ||| % defaultFilters,

          nodeTerminationP95Duration: |||
            max(
              karpenter_nodes_termination_duration_seconds{
                %(base)s,
                quantile="0.95"
              }
            )
          ||| % defaultFilters,

          nodeTerminationP99Duration: |||
            max(
              karpenter_nodes_termination_duration_seconds{
                %(base)s,
                quantile="0.99"
              }
            )
          ||| % defaultFilters,

          // Pod Startup
          podsStartupP50Duration: |||
            max(
              karpenter_pods_startup_duration_seconds{
                %(base)s,
                quantile="0.5"
              }
            )
          ||| % defaultFilters,

          podsStartupP95Duration: |||
            max(
              karpenter_pods_startup_duration_seconds{
                %(base)s,
                quantile="0.95"
              }
            )
          ||| % defaultFilters,

          podsStartupP99Duration: |||
            max(
              karpenter_pods_startup_duration_seconds{
                %(base)s,
                quantile="0.99"
              }
            )
          ||| % defaultFilters,

          // Interruption Queue
          interruptionReceivedMessages: |||
            sum(
              increase(
                karpenter_interruption_received_messages_total{
                  %(base)s
                }[$__rate_interval]
              )
            ) by (job, message_type)
          ||| % defaultFilters,

          interruptionDeletedMessages: |||
            sum(
              increase(
                karpenter_interruption_deleted_messages_total{
                  %(base)s
                }[$__rate_interval]
              )
            ) by (job)
          ||| % defaultFilters,

          // `> 0` drops zero-rate buckets before histogram_quantile.
          // (Renamed from the misspelled `interuption*` keys.)
          interruptionDurationP50: |||
            histogram_quantile(0.50,
              sum(
                irate(
                  karpenter_interruption_message_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          interruptionDurationP95: |||
            histogram_quantile(0.95,
              sum(
                irate(
                  karpenter_interruption_message_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          interruptionDurationP99: |||
            histogram_quantile(0.99,
              sum(
                irate(
                  karpenter_interruption_message_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          // Work Queue
          workQueueDepth: |||
            sum(
              karpenter_workqueue_depth{
                %(base)s
              }
            ) by (job)
          ||| % defaultFilters,

          workQueueInQueueDurationP50: |||
            histogram_quantile(0.50,
              sum(
                irate(
                  karpenter_workqueue_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          workQueueInQueueDurationP95: |||
            histogram_quantile(0.95,
              sum(
                irate(
                  karpenter_workqueue_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          workQueueInQueueDurationP99: |||
            histogram_quantile(0.99,
              sum(
                irate(
                  karpenter_workqueue_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          workQueueWorkDurationP50: |||
            histogram_quantile(0.50,
              sum(
                irate(
                  karpenter_workqueue_work_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          workQueueWorkDurationP95: |||
            histogram_quantile(0.95,
              sum(
                irate(
                  karpenter_workqueue_work_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          workQueueWorkDurationP99: |||
            histogram_quantile(0.99,
              sum(
                irate(
                  karpenter_workqueue_work_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          // Controller
          controllerReconcile: |||
            sum(
              irate(
                controller_runtime_reconcile_total{
                  %(base)s
                }[$__rate_interval]
              )
            ) by (job, controller)
          ||| % defaultFilters,

          controllerResult: |||
            sum(
              irate(
                controller_runtime_reconcile_total{
                  %(base)s
                }[$__rate_interval]
              )
            ) by (job, result)
          ||| % defaultFilters,
        };

        local panels = {
          // Summary
          clusterStateSyncedStat:
            mixinUtils.dashboards.statPanel(
              'Cluster State Synced',
              'short',
              queries.clusterStateSynced,
              description='Indicates whether the cluster state is synced.',
              steps=[
                stStandardOptions.threshold.step.withValue(0) +
                stStandardOptions.threshold.step.withColor('red'),
                stStandardOptions.threshold.step.withValue(0.1) +
                stStandardOptions.threshold.step.withColor('green'),
              ],
              // Render the 0/1 gauge as a human-readable Yes/No.
              mappings=[
                stStandardOptions.mapping.ValueMap.withType() +
                stStandardOptions.mapping.ValueMap.withOptions(
                  {
                    '0': { text: 'No', color: 'red' },
                    '1': { text: 'Yes', color: 'green' },
                  }
                ),
              ],
            ),

          clusterStateNodeCountStat:
            mixinUtils.dashboards.statPanel(
              'Cluster State Node Count',
              'short',
              queries.clusterStateNodeCount,
              description='The number of nodes in the cluster state.',
              steps=[
                stStandardOptions.threshold.step.withValue(0) +
                stStandardOptions.threshold.step.withColor('red'),
                stStandardOptions.threshold.step.withValue(0.1) +
                stStandardOptions.threshold.step.withColor('green'),
              ],
            ),

          cloudProviderErrorsTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Cloud Provider Errors',
              'short',
              queries.cloudProviderErrors,
              '{{ provider }} - {{ controller }} - {{ method }} - {{ error }}',
              description='The number of cloud provider errors over time.',
            ),

          // Node Termination & Pod Startup
          nodeTerminationDurationTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Node Termination Duration',
              's',
              [
                {
                  expr: queries.nodeTerminationP50Duration,
                  legend: 'P50',
                },
                {
                  expr: queries.nodeTerminationP95Duration,
                  legend: 'P95',
                },
                {
                  expr: queries.nodeTerminationP99Duration,
                  legend: 'P99',
                },
              ],
              description='The duration to terminate nodes.',
            ),

          podStartupDurationTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Pods Startup Duration',
              's',
              [
                {
                  expr: queries.podsStartupP50Duration,
                  legend: 'P50',
                },
                {
                  expr: queries.podsStartupP95Duration,
                  legend: 'P95',
                },
                {
                  expr: queries.podsStartupP99Duration,
                  legend: 'P99',
                },
              ],
              description='The duration for pods to start up.',
            ),

          // Interruption Queue
          interruptionReceivedMessagesTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Interruption Received Messages',
              'short',
              queries.interruptionReceivedMessages,
              '{{ message_type }}',
              description='The number of interruption messages received.',
            ),

          interruptionDeletedMessagesTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Interruption Deleted Messages',
              'short',
              queries.interruptionDeletedMessages,
              'Deleted Messages',
              description='The number of interruption messages deleted.',
            ),

          interruptionDurationTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Interruption Duration',
              's',
              [
                {
                  expr: queries.interruptionDurationP50,
                  legend: 'P50',
                },
                {
                  expr: queries.interruptionDurationP95,
                  legend: 'P95',
                },
                {
                  expr: queries.interruptionDurationP99,
                  legend: 'P99',
                },
              ],
              description='The duration for interruption message processing.',
            ),

          // Work Queue
          workQueueDepthTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Work Queue Depth',
              'short',
              queries.workQueueDepth,
              'Queue Depth',
              description='The depth of the work queue.',
            ),

          workQueueInQueueDurationTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Work Queue In Queue Duration',
              's',
              [
                {
                  expr: queries.workQueueInQueueDurationP50,
                  legend: 'P50',
                },
                {
                  expr: queries.workQueueInQueueDurationP95,
                  legend: 'P95',
                },
                {
                  expr: queries.workQueueInQueueDurationP99,
                  legend: 'P99',
                },
              ],
              description='The duration items spend in the work queue.',
            ),

          workQueueWorkDurationTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Work Queue Work Duration',
              's',
              [
                {
                  expr: queries.workQueueWorkDurationP50,
                  legend: 'P50',
                },
                {
                  expr: queries.workQueueWorkDurationP95,
                  legend: 'P95',
                },
                {
                  expr: queries.workQueueWorkDurationP99,
                  legend: 'P99',
                },
              ],
              description='The duration to process work queue items.',
            ),

          // Controller
          controllerReconcileTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Controller Reconcile',
              'ops',
              queries.controllerReconcile,
              '{{ controller }}',
              // Fixed wording (was 'The ops of controller reconciliation.').
              description='The rate of controller reconciliations.',
              stack='normal'
            ),

          controllerResultTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Controller Result',
              'ops',
              queries.controllerResult,
              '{{ result }}',
              description='The result of controller reconciliations.',
              stack='normal'
            ),
        };

        local rows =
          [
            row.new('Summary') +
            row.gridPos.withX(0) +
            row.gridPos.withY(0) +
            row.gridPos.withW(24) +
            row.gridPos.withH(1),
          ] +
          grid.makeGrid(
            [
              panels.clusterStateSyncedStat,
              panels.clusterStateNodeCountStat,
            ],
            panelWidth=3,
            panelHeight=6,
            startY=1
          ) +
          [
            panels.cloudProviderErrorsTimeSeries +
            // The two stat panels above occupy columns 0-6; start at x=6 so
            // 6 + 18 = 24 fills the row. (Was x=12, overflowing the 24-column
            // Grafana grid: 12 + 18 = 30.)
            row.gridPos.withX(6) +
            row.gridPos.withY(1) +
            row.gridPos.withW(18) +
            row.gridPos.withH(6),
          ] +
          grid.makeGrid(
            [
              panels.nodeTerminationDurationTimeSeries,
              panels.podStartupDurationTimeSeries,
            ],
            panelWidth=12,
            panelHeight=6,
            startY=7
          ) +
          [
            row.new('Interruption Queue') +
            row.gridPos.withX(0) +
            row.gridPos.withY(13) +
            row.gridPos.withW(24) +
            row.gridPos.withH(1),
          ] +
          grid.makeGrid(
            [
              panels.interruptionReceivedMessagesTimeSeries,
              panels.interruptionDeletedMessagesTimeSeries,
              panels.interruptionDurationTimeSeries,
            ],
            panelWidth=8,
            panelHeight=6,
            startY=14
          ) +
          [
            row.new('Work Queue') +
            row.gridPos.withX(0) +
            row.gridPos.withY(20) +
            row.gridPos.withW(24) +
            row.gridPos.withH(1),
          ] +
          grid.makeGrid(
            [
              panels.workQueueDepthTimeSeries,
              panels.workQueueInQueueDurationTimeSeries,
              panels.workQueueWorkDurationTimeSeries,
            ],
            panelWidth=8,
            panelHeight=6,
            startY=21
          ) +
          [
            row.new('Controller') +
            row.gridPos.withX(0) +
            row.gridPos.withY(27) +
            row.gridPos.withW(24) +
            row.gridPos.withH(1),
          ] +
          grid.makeGrid(
            [
              panels.controllerReconcileTimeSeries,
              panels.controllerResultTimeSeries,
            ],
            panelWidth=24,
            panelHeight=8,
            startY=28
          );

        mixinUtils.dashboards.bypassDashboardValidation +
        dashboard.new(
          'Kubernetes / Autoscaling / Karpenter / Performance',
        ) +
        dashboard.withDescription('A dashboard that monitors Karpenter performance metrics. %s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) +
        dashboard.withUid($._config.karpenterPerformanceDashboardUid) +
        dashboard.withTags($._config.tags + ['karpenter']) +
        dashboard.withTimezone('utc') +
        dashboard.withEditable(true) +
        dashboard.time.withFrom('now-6h') +
        dashboard.time.withTo('now') +
        dashboard.withVariables(variables) +
        dashboard.withLinks(
          mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true)
        ) +
        dashboard.withPanels(
          rows
        ) +
        dashboard.withAnnotations(
          mixinUtils.dashboards.annotations($._config, defaultFilters)
        ),
  },
}
--------------------------------------------------------------------------------