├── prometheus_rules.yaml ├── .gitignore ├── lib ├── rules.jsonnet ├── alerts.jsonnet └── dashboards.jsonnet ├── rules └── rules.libsonnet ├── mixin.libsonnet ├── .vale.ini ├── .lint ├── dashboards_out ├── .lint └── kubernetes-autoscaling-mixin-karpenter-act.json ├── dashboards ├── dashboards.libsonnet ├── cluster-autoscaler │ ├── util.libsonnet │ └── kubernetes-autoscaling-cluster-autoscaler.libsonnet ├── karpenter │ ├── util.libsonnet │ ├── karpenter-activity.libsonnet │ └── karpenter-performance.libsonnet ├── kubernetes │ ├── util.libsonnet │ ├── kubernetes-autoscaling-hpa.libsonnet │ └── kubernetes-autoscaling-pdb.libsonnet └── keda │ ├── util.libsonnet │ ├── keda-scaled-job.libsonnet │ └── keda-scaled-object.libsonnet ├── scripts ├── tools.go └── go.mod ├── jsonnetfile.json ├── .github └── workflows │ └── ci.yml ├── jsonnetfile.lock.json ├── Makefile ├── config.libsonnet ├── prometheus_alerts.yaml ├── tests └── tests.yaml ├── README.md ├── alerts └── alerts.libsonnet └── LICENSE /prometheus_rules.yaml: -------------------------------------------------------------------------------- 1 | "groups": [] 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vendor 2 | tmp 3 | ./dashboards_out/lint 4 | .vale 5 | -------------------------------------------------------------------------------- /lib/rules.jsonnet: -------------------------------------------------------------------------------- 1 | std.manifestYamlDoc((import '../mixin.libsonnet').prometheusRules) 2 | -------------------------------------------------------------------------------- /lib/alerts.jsonnet: -------------------------------------------------------------------------------- 1 | std.manifestYamlDoc((import '../mixin.libsonnet').prometheusAlerts) 2 | -------------------------------------------------------------------------------- /rules/rules.libsonnet: 
-------------------------------------------------------------------------------- 1 | { 2 | prometheusRules+:: { 3 | groups+: [], 4 | }, 5 | } 6 | -------------------------------------------------------------------------------- /mixin.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'alerts/alerts.libsonnet') + 2 | (import 'rules/rules.libsonnet') + 3 | (import 'dashboards/dashboards.libsonnet') + 4 | (import 'config.libsonnet') 5 | -------------------------------------------------------------------------------- /.vale.ini: -------------------------------------------------------------------------------- 1 | StylesPath = .vale/styles 2 | 3 | MinAlertLevel = error 4 | 5 | Packages = Readability, write-good, alex 6 | 7 | [*] 8 | BasedOnStyles = Readability, write-good, alex 9 | -------------------------------------------------------------------------------- /lib/dashboards.jsonnet: -------------------------------------------------------------------------------- 1 | local dashboards = (import '../mixin.libsonnet').grafanaDashboards; 2 | 3 | { 4 | [name]: dashboards[name] 5 | for name in std.objectFields(dashboards) 6 | } 7 | -------------------------------------------------------------------------------- /.lint: -------------------------------------------------------------------------------- 1 | --- 2 | exclusions: 3 | template-job-rule: 4 | reason: Jobs are not set to multi in our case. 5 | target-job-rule: 6 | reason: Some dashboard use recording rules 7 | template-instance-rule: 8 | reason: We don't use instances. 9 | panel-datasource-rule: 10 | reason: Using a datasource for each panel. 
11 | panel-title-description-rule: 12 | reason: TODO(adinhodovic) 13 | target-instance-rule: 14 | -------------------------------------------------------------------------------- /dashboards_out/.lint: -------------------------------------------------------------------------------- 1 | --- 2 | exclusions: 3 | template-job-rule: 4 | reason: Jobs are not set to multi in our case. 5 | target-job-rule: 6 | reason: Some dashboard use recording rules 7 | template-instance-rule: 8 | reason: We don't use instances. 9 | panel-datasource-rule: 10 | reason: Using a datasource for each panel. 11 | panel-title-description-rule: 12 | reason: TODO(adinhodovic) 13 | target-instance-rule: 14 | -------------------------------------------------------------------------------- /dashboards/dashboards.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'kubernetes/kubernetes-autoscaling-pdb.libsonnet') + 2 | (import 'kubernetes/kubernetes-autoscaling-hpa.libsonnet') + 3 | (import 'kubernetes/kubernetes-autoscaling-vpa.libsonnet') + 4 | (import 'cluster-autoscaler/kubernetes-autoscaling-cluster-autoscaler.libsonnet') + 5 | (import 'karpenter/karpenter-overview.libsonnet') + 6 | (import 'karpenter/karpenter-activity.libsonnet') + 7 | (import 'karpenter/karpenter-performance.libsonnet') + 8 | (import 'keda/keda-scaled-object.libsonnet') + 9 | (import 'keda/keda-scaled-job.libsonnet') 10 | -------------------------------------------------------------------------------- /scripts/tools.go: -------------------------------------------------------------------------------- 1 | //go:build tools 2 | // +build tools 3 | 4 | // Package tools tracks dependencies for tools that are used in the build process.
5 | // See https://github.com/golang/go/issues/25922 6 | package tools 7 | 8 | import ( 9 | _ "github.com/Kunde21/markdownfmt/v3/cmd/markdownfmt" 10 | _ "github.com/cloudflare/pint/cmd/pint" 11 | _ "github.com/errata-ai/vale/v3/cmd/vale" 12 | _ "github.com/google/go-jsonnet/cmd/jsonnet" 13 | _ "github.com/google/go-jsonnet/cmd/jsonnet-lint" 14 | _ "github.com/google/go-jsonnet/cmd/jsonnetfmt" 15 | _ "github.com/grafana/dashboard-linter" 16 | _ "github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb" 17 | _ "github.com/prometheus/prometheus/cmd/promtool" 18 | ) 19 | -------------------------------------------------------------------------------- /jsonnetfile.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/adinhodovic/mixin-utils.git", 8 | "subdir": "" 9 | } 10 | }, 11 | "version": "main", 12 | "name": "mm-utils" 13 | }, 14 | { 15 | "source": { 16 | "git": { 17 | "remote": "https://github.com/grafana/grafonnet.git", 18 | "subdir": "gen/grafonnet-latest" 19 | } 20 | }, 21 | "version": "main" 22 | }, 23 | { 24 | "source": { 25 | "git": { 26 | "remote": "https://github.com/jsonnet-libs/docsonnet.git", 27 | "subdir": "doc-util" 28 | } 29 | }, 30 | "version": "master" 31 | }, 32 | { 33 | "source": { 34 | "git": { 35 | "remote": "https://github.com/jsonnet-libs/xtd.git", 36 | "subdir": "" 37 | } 38 | }, 39 | "version": "master" 40 | } 41 | ], 42 | "legacyImports": true 43 | } 44 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: ci 2 | permissions: {} 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | jobs: 9 | matrix: 10 | runs-on: ubuntu-latest 11 | name: ${{ matrix.name }} 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | include: 16 | - name: Lint Alerts 17 
| run: make --always-make alerts-lint 18 | - name: Generate yaml 19 | run: make --always-make generate && git diff --exit-code 20 | - name: Lint Grafana Dashboards 21 | run: make --always-make dashboards-lint 22 | - name: Format Jsonnet 23 | run: make --always-make jsonnet-fmt && git diff --exit-code 24 | - name: Lint Jsonnet 25 | run: make --always-make jsonnet-lint 26 | - name: Format Markdown 27 | run: make --always-make markdownfmt && git diff --exit-code 28 | - name: Lint Markdown 29 | run: make --always-make vale && git diff --exit-code 30 | - name: Lint YAML 31 | run: make --always-make pint-lint 32 | - name: Run unit tests 33 | run: make --always-make test 34 | 35 | steps: 36 | - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 37 | with: 38 | persist-credentials: false 39 | - uses: actions/setup-go@d35c59abb061a4a6fb18e82ac0862c26744d6ab5 # v5.5.0 40 | with: 41 | go-version-file: scripts/go.mod 42 | cache-dependency-path: scripts/go.sum 43 | - run: ${{ matrix.run }} 44 | -------------------------------------------------------------------------------- /jsonnetfile.lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "dependencies": [ 4 | { 5 | "source": { 6 | "git": { 7 | "remote": "https://github.com/adinhodovic/mixin-utils.git", 8 | "subdir": "" 9 | } 10 | }, 11 | "version": "2d7880588e2b94f547c20ba270594dd7ecdc2ed9", 12 | "sum": "/n6K29u+5LfCLOiSOD8uehfMrd4AQoZCaqYYg3oV6xU=", 13 | "name": "mm-utils" 14 | }, 15 | { 16 | "source": { 17 | "git": { 18 | "remote": "https://github.com/grafana/grafonnet.git", 19 | "subdir": "gen/grafonnet-latest" 20 | } 21 | }, 22 | "version": "7380c9c64fb973f34c3ec46265621a2b0dee0058", 23 | "sum": "V9vAj21qJOc2DlMPDgB1eEjSQU4A+sAA4AXuJ6bd4xc=" 24 | }, 25 | { 26 | "source": { 27 | "git": { 28 | "remote": "https://github.com/grafana/grafonnet.git", 29 | "subdir": "gen/grafonnet-v11.4.0" 30 | } 31 | }, 32 | "version": 
"7380c9c64fb973f34c3ec46265621a2b0dee0058", 33 | "sum": "aVAX09paQYNOoCSKVpuk1exVIyBoMt/C50QJI+Q/3nA=" 34 | }, 35 | { 36 | "source": { 37 | "git": { 38 | "remote": "https://github.com/jsonnet-libs/docsonnet.git", 39 | "subdir": "doc-util" 40 | } 41 | }, 42 | "version": "6ac6c69685b8c29c54515448eaca583da2d88150", 43 | "sum": "BrAL/k23jq+xy9oA7TWIhUx07dsA/QLm3g7ktCwe//U=" 44 | }, 45 | { 46 | "source": { 47 | "git": { 48 | "remote": "https://github.com/jsonnet-libs/xtd.git", 49 | "subdir": "" 50 | } 51 | }, 52 | "version": "4d7f8cb24d613430799f9d56809cc6964f35cea9", 53 | "sum": "hOrwkOx34tOXqoDVnwuI/Uf/dr9HFFSPWpDPOvnEGrk=" 54 | } 55 | ], 56 | "legacyImports": false 57 | } 58 | -------------------------------------------------------------------------------- /dashboards/cluster-autoscaler/util.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local dashboard = g.dashboard; 4 | 5 | local variable = dashboard.variable; 6 | local datasource = variable.datasource; 7 | local query = variable.query; 8 | 9 | { 10 | filters(config):: { 11 | local this = self, 12 | cluster: '%(clusterLabel)s="$cluster"' % config, 13 | job: 'job=~"$job"', 14 | 15 | base: ||| 16 | %(cluster)s, 17 | %(job)s 18 | ||| % this, 19 | }, 20 | 21 | variables(config):: { 22 | local this = self, 23 | 24 | local defaultFilters = $.filters(config), 25 | 26 | datasource: 27 | datasource.new( 28 | 'datasource', 29 | 'prometheus', 30 | ) + 31 | datasource.generalOptions.withLabel('Data source') + 32 | { 33 | current: { 34 | selected: true, 35 | text: config.datasourceName, 36 | value: config.datasourceName, 37 | }, 38 | }, 39 | 40 | cluster: 41 | query.new( 42 | config.clusterLabel, 43 | 'label_values(kube_pod_info{%(kubeStateMetricsSelector)s}, cluster)' % config, 44 | ) + 45 | query.withDatasourceFromVariable(this.datasource) + 46 | query.withSort() + 47 | 
query.generalOptions.withLabel('Cluster') + 48 | query.refresh.onLoad() + 49 | query.refresh.onTime() + 50 | ( 51 | if config.showMultiCluster 52 | then query.generalOptions.showOnDashboard.withLabelAndValue() 53 | else query.generalOptions.showOnDashboard.withNothing() 54 | ), 55 | 56 | job: 57 | query.new( 58 | 'job', 59 | 'label_values(cluster_autoscaler_last_activity{%(cluster)s}, job)' % defaultFilters, 60 | ) + 61 | query.withDatasourceFromVariable(this.datasource) + 62 | query.withSort(1) + 63 | query.generalOptions.withLabel('Job') + 64 | query.refresh.onLoad() + 65 | query.refresh.onTime(), 66 | }, 67 | } 68 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BIN_DIR ?= $(shell pwd)/tmp/bin 2 | 3 | JSONNET_VENDOR=vendor 4 | GRAFANA_DASHBOARD_LINTER_BIN=$(BIN_DIR)/dashboard-linter 5 | JB_BIN=$(BIN_DIR)/jb 6 | JSONNET_BIN=$(BIN_DIR)/jsonnet 7 | JSONNETLINT_BIN=$(BIN_DIR)/jsonnet-lint 8 | JSONNETFMT_BIN=$(BIN_DIR)/jsonnetfmt 9 | MD_FILES = $(shell find . 
\( -type d -name '.vale' -o -type d -name 'vendor' \) -prune -o -type f -name "*.md" -print) 10 | MARKDOWNFMT_BIN=$(BIN_DIR)/markdownfmt 11 | VALE_BIN=$(BIN_DIR)/vale 12 | PROMTOOL_BIN=$(BIN_DIR)/promtool 13 | PINT_BIN=$(BIN_DIR)/pint 14 | TOOLING=$(JB_BIN) $(JSONNETLINT_BIN) $(JSONNET_BIN) $(JSONNETFMT_BIN) $(PROMTOOL_BIN) $(GRAFANA_DASHBOARD_LINTER_BIN) $(MARKDOWNFMT_BIN) $(VALE_BIN) $(PINT_BIN) 15 | JSONNETFMT_ARGS=-n 2 --max-blank-lines 2 --string-style s --comment-style s 16 | SRC_DIR ?=dashboards 17 | OUT_DIR ?=dashboards_out 18 | 19 | .PHONY: all 20 | all: fmt generate lint test 21 | 22 | .PHONY: generate 23 | generate: prometheus_alerts.yaml prometheus_rules.yaml $(OUT_DIR) 24 | 25 | $(JSONNET_VENDOR): $(JB_BIN) jsonnetfile.json 26 | $(JB_BIN) install 27 | 28 | .PHONY: fmt 29 | fmt: jsonnet-fmt markdownfmt 30 | 31 | .PHONY: jsonnet-fmt 32 | jsonnet-fmt: $(JSONNETFMT_BIN) 33 | @find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ 34 | xargs -n 1 -- $(JSONNETFMT_BIN) $(JSONNETFMT_ARGS) -i 35 | 36 | .PHONY: markdownfmt 37 | markdownfmt: $(MARKDOWNFMT_BIN) 38 | @for file in $(MD_FILES); do $(MARKDOWNFMT_BIN) -w -gofmt $$file; done 39 | 40 | prometheus_alerts.yaml: $(JSONNET_BIN) mixin.libsonnet lib/alerts.jsonnet alerts/*.libsonnet 41 | @$(JSONNET_BIN) -J vendor -S lib/alerts.jsonnet > $@ 42 | 43 | prometheus_rules.yaml: $(JSONNET_BIN) mixin.libsonnet lib/rules.jsonnet rules/*.libsonnet 44 | @$(JSONNET_BIN) -J vendor -S lib/rules.jsonnet > $@ 45 | 46 | $(OUT_DIR): $(JSONNET_BIN) $(JSONNET_VENDOR) mixin.libsonnet lib/dashboards.jsonnet $(SRC_DIR)/*.libsonnet 47 | @mkdir -p $(OUT_DIR) 48 | @$(JSONNET_BIN) -J vendor -m $(OUT_DIR) lib/dashboards.jsonnet 49 | 50 | .PHONY: lint 51 | lint: jsonnet-lint alerts-lint dashboards-lint vale pint-lint 52 | 53 | .PHONY: jsonnet-lint 54 | jsonnet-lint: $(JSONNETLINT_BIN) $(JSONNET_VENDOR) 55 | @find . 
-name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ 56 | xargs -n 1 -- $(JSONNETLINT_BIN) -J vendor 57 | 58 | .PHONY: alerts-lint 59 | alerts-lint: $(PROMTOOL_BIN) prometheus_alerts.yaml prometheus_rules.yaml 60 | @$(PROMTOOL_BIN) check rules prometheus_rules.yaml 61 | @$(PROMTOOL_BIN) check rules prometheus_alerts.yaml 62 | 63 | $(OUT_DIR)/.lint: $(OUT_DIR) 64 | @cp .lint $@ 65 | 66 | .PHONY: dashboards-lint 67 | dashboards-lint: $(GRAFANA_DASHBOARD_LINTER_BIN) $(OUT_DIR)/.lint 68 | # Replace $$interval:$$resolution var with $$__rate_interval to make dashboard-linter happy. 69 | @sed -i -e 's/$$interval:$$resolution/$$__rate_interval/g' $(OUT_DIR)/*.json 70 | @find $(OUT_DIR) -name '*.json' -print0 | xargs -n 1 -0 $(GRAFANA_DASHBOARD_LINTER_BIN) lint --strict 71 | 72 | .PHONY: vale 73 | vale: $(VALE_BIN) 74 | @$(VALE_BIN) sync && \ 75 | $(VALE_BIN) $(MD_FILES) 76 | 77 | .PHONY: pint-lint 78 | pint-lint: generate $(PINT_BIN) 79 | @# Pint will not exit with a non-zero status code if there are linting issues. 80 | @output=$$($(PINT_BIN) -n -o -l WARN lint prometheus_alerts.yaml prometheus_rules.yaml 2>&1); \ 81 | if [ -n "$$output" ]; then \ 82 | echo "\n$$output"; \ 83 | exit 1; \ 84 | fi 85 | 86 | .PHONY: clean 87 | clean: 88 | # Remove all files and directories ignored by git. 89 | git clean -Xfd . 
90 | 91 | .PHONY: test 92 | test: $(PROMTOOL_BIN) prometheus_alerts.yaml prometheus_rules.yaml 93 | @$(PROMTOOL_BIN) test rules tests/*.yaml 94 | 95 | $(BIN_DIR): 96 | mkdir -p $(BIN_DIR) 97 | 98 | $(TOOLING): $(BIN_DIR) 99 | @echo Installing tools from scripts/tools.go 100 | @cd scripts && go list -e -mod=mod -tags tools -f '{{ range .Imports }}{{ printf "%s\n" .}}{{end}}' ./ | xargs -tI % go build -mod=mod -o $(BIN_DIR) % 101 | -------------------------------------------------------------------------------- /config.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | local annotation = g.dashboard.annotation; 3 | 4 | { 5 | _config+:: { 6 | local this = self, 7 | // Bypasses grafana.com/dashboards validator 8 | bypassDashboardValidation: { 9 | __inputs: [], 10 | __requires: [], 11 | }, 12 | 13 | // Default datasource name 14 | datasourceName: 'default', 15 | 16 | // Opt-in to multiCluster dashboards by overriding this and the clusterLabel.
17 | showMultiCluster: false, 18 | clusterLabel: 'cluster', 19 | 20 | kubeStateMetricsSelector: 'job=~"kube-state-metrics"', 21 | 22 | grafanaUrl: 'https://grafana.com', 23 | 24 | pdbDashboardUid: 'kubernetes-autoscaling-mixin-pdb-jkwq', 25 | hpaDashboardUid: 'kubernetes-autoscaling-mixin-hpa-jkwq', 26 | vpaDashboardUid: 'kubernetes-autoscaling-mixin-vpa-jkwq', 27 | clusterAutoscalerDashboardUid: 'kubernetes-autoscaling-mixin-ca-jkwq', 28 | karpenterOverviewDashboardUid: 'kubernetes-autoscaling-mixin-kover-jkwq', 29 | karpenterActivityDashboardUid: 'kubernetes-autoscaling-mixin-kact-jkwq', 30 | karpenterPerformanceDashboardUid: 'kubernetes-autoscaling-mixin-kperf-jkwq', 31 | kedaScaledObjectDashboardUid: 'kubernetes-autoscaling-mixin-kedaso-jkwq', 32 | kedaScaledJobDashboardUid: 'kubernetes-autoscaling-mixin-kedasj-jkwq', 33 | 34 | vpa: { 35 | enabled: true, 36 | // Optional: If you want to aggregate the VPA by cluster, set it to true; this requires showMultiCluster to be true. 37 | clusterAggregation: false, 38 | // Optional: If your VPA names are not based only on the pod name and include a prefix, set it here.
39 | vpaPrefix: '', 40 | }, 41 | 42 | clusterAutoscaler: { 43 | enabled: true, 44 | clusterAutoscalerSelector: 'job="cluster-autoscaler"', 45 | 46 | nodeCountCapacityThreshold: 75, 47 | 48 | clusterAutoscalerDashboardUrl: '%s/d/%s/kubernetes-autoscaling-cluster-autoscaler' % [this.grafanaUrl, this.clusterAutoscalerDashboardUid], 49 | }, 50 | 51 | karpenter: { 52 | enabled: true, 53 | karpenterSelector: 'job="karpenter"', 54 | 55 | nodepoolCapacityThreshold: 75, 56 | nodeclaimTerminationThreshold: 60 * 20, 57 | 58 | karpenterOverviewDashboardUrl: '%s/d/%s/kubernetes-autoscaling-karpenter-overview' % [this.grafanaUrl, this.karpenterOverviewDashboardUid], 59 | karpenterActivityDashboardUrl: '%s/d/%s/kubernetes-autoscaling-karpenter-activity' % [this.grafanaUrl, this.karpenterActivityDashboardUid], 60 | karpenterPerformanceDashboardUrl: '%s/d/%s/kubernetes-autoscaling-karpenter-performance' % [this.grafanaUrl, this.karpenterPerformanceDashboardUid], 61 | }, 62 | 63 | keda: { 64 | enabled: true, 65 | 66 | kedaScaledObjectDashboardUrl: '%s/d/%s/kubernetes-autoscaling-keda-scaled-object' % [this.grafanaUrl, this.kedaScaledObjectDashboardUid], 67 | kedaScaledJobDashboardUrl: '%s/d/%s/kubernetes-autoscaling-keda-scaled-job' % [this.grafanaUrl, this.kedaScaledJobDashboardUid], 68 | 69 | kedaSelector: 'job="keda-operator"', 70 | 71 | // Default threshold for the KEDA scaler metrics latency, in seconds. 72 | scalerMetricsLatencyThreshold: '5', 73 | // The default threshold for scaled objects to be considered paused for too long. 74 | scaledObjectPausedThreshold: '25h', 75 | 76 | // Used to link to the workload dashboard from the scaled job dashboards. Allows viewing resource usage.
77 | k8sResourcesWorkloadDashboardUid: 'this-needs-to-be-customized', 78 | }, 79 | 80 | tags: ['kubernetes', 'autoscaling', 'kubernetes-autoscaling-mixin'], 81 | 82 | // Custom annotations to display in graphs 83 | annotation: { 84 | enabled: false, 85 | name: 'Custom Annotation', 86 | datasource: '-- Grafana --', 87 | iconColor: 'green', 88 | tags: [], 89 | }, 90 | 91 | customAnnotation:: if $._config.annotation.enabled then 92 | annotation.withName($._config.annotation.name) + 93 | annotation.withIconColor($._config.annotation.iconColor) + 94 | annotation.withHide(false) + 95 | annotation.datasource.withUid($._config.annotation.datasource) + 96 | annotation.target.withMatchAny(true) + 97 | annotation.target.withTags($._config.annotation.tags) + 98 | annotation.target.withType('tags') 99 | else {}, 100 | }, 101 | } 102 | -------------------------------------------------------------------------------- /dashboards/karpenter/util.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local dashboard = g.dashboard; 4 | 5 | local variable = dashboard.variable; 6 | local datasource = variable.datasource; 7 | local query = variable.query; 8 | 9 | { 10 | filters(config):: { 11 | local this = self, 12 | cluster: '%(clusterLabel)s="$cluster"' % config, 13 | job: 'job=~"$job"', 14 | region: 'region=~"$region"', 15 | zone: 'zone=~"$zone"', 16 | arch: 'arch=~"$arch"', 17 | os: 'os=~"$os"', 18 | instanceType: 'instance_type=~"$instance_type"', 19 | capacityType: 'capacity_type=~"$capacity_type"', 20 | nodepool: 'nodepool=~"$nodepool"', 21 | 22 | base: ||| 23 | %(cluster)s, 24 | %(job)s 25 | ||| % this, 26 | 27 | default: ||| 28 | %(base)s, 29 | %(nodepool)s 30 | ||| % this, 31 | 32 | withLocation: ||| 33 | %(default)s, 34 | %(region)s, 35 | %(zone)s 36 | ||| % this, 37 | 38 | full: ||| 39 | %(withLocation)s, 40 | %(arch)s, 41 | %(os)s, 42 | 
%(instanceType)s, 43 | %(capacityType)s 44 | ||| % this, 45 | }, 46 | 47 | variables(config):: { 48 | local this = self, 49 | 50 | local defaultFilters = $.filters(config), 51 | 52 | datasource: 53 | datasource.new( 54 | 'datasource', 55 | 'prometheus', 56 | ) + 57 | datasource.generalOptions.withLabel('Data source') + 58 | { 59 | current: { 60 | selected: true, 61 | text: config.datasourceName, 62 | value: config.datasourceName, 63 | }, 64 | }, 65 | 66 | cluster: 67 | query.new( 68 | config.clusterLabel, 69 | 'label_values(kube_pod_info{%(kubeStateMetricsSelector)s}, cluster)' % config, 70 | ) + 71 | query.withDatasourceFromVariable(this.datasource) + 72 | query.withSort() + 73 | query.generalOptions.withLabel('Cluster') + 74 | query.refresh.onLoad() + 75 | query.refresh.onTime() + 76 | ( 77 | if config.showMultiCluster 78 | then query.generalOptions.showOnDashboard.withLabelAndValue() 79 | else query.generalOptions.showOnDashboard.withNothing() 80 | ), 81 | 82 | job: 83 | query.new( 84 | 'job', 85 | 'label_values(karpenter_nodes_allocatable{%(cluster)s}, job)' % defaultFilters, 86 | ) + 87 | query.withDatasourceFromVariable(this.datasource) + 88 | query.withSort(1) + 89 | query.generalOptions.withLabel('Job') + 90 | query.refresh.onLoad() + 91 | query.refresh.onTime(), 92 | 93 | region: 94 | query.new( 95 | 'region', 96 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s}, region)' % defaultFilters, 97 | ) + 98 | query.withDatasourceFromVariable(this.datasource) + 99 | query.withSort() + 100 | query.generalOptions.withLabel('Region') + 101 | query.selectionOptions.withMulti(true) + 102 | query.selectionOptions.withIncludeAll(true) + 103 | query.refresh.onLoad() + 104 | query.refresh.onTime(), 105 | 106 | zone: 107 | query.new( 108 | 'zone', 109 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s}, zone)' % defaultFilters, 110 | ) + 111 | query.withDatasourceFromVariable(this.datasource) + 112 | query.withSort() + 113 | 
query.generalOptions.withLabel('Zone') + 114 | query.selectionOptions.withMulti(true) + 115 | query.selectionOptions.withIncludeAll(true) + 116 | query.refresh.onLoad() + 117 | query.refresh.onTime(), 118 | 119 | arch: 120 | query.new( 121 | 'arch', 122 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s, %(zone)s}, arch)' % defaultFilters, 123 | ) + 124 | query.withDatasourceFromVariable(this.datasource) + 125 | query.withSort() + 126 | query.generalOptions.withLabel('Architecture') + 127 | query.selectionOptions.withMulti(true) + 128 | query.selectionOptions.withIncludeAll(true) + 129 | query.refresh.onLoad() + 130 | query.refresh.onTime(), 131 | 132 | os: 133 | query.new( 134 | 'os', 135 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s, %(zone)s, %(arch)s}, os)' % defaultFilters, 136 | ) + 137 | query.withDatasourceFromVariable(this.datasource) + 138 | query.withSort(1) + 139 | query.generalOptions.withLabel('Operating System') + 140 | query.selectionOptions.withMulti(true) + 141 | query.selectionOptions.withIncludeAll(true) + 142 | query.refresh.onLoad() + 143 | query.refresh.onTime(), 144 | 145 | instanceType: 146 | query.new( 147 | 'instance_type', 148 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s, %(zone)s, %(arch)s, %(os)s}, instance_type)' % defaultFilters, 149 | ) + 150 | query.withDatasourceFromVariable(this.datasource) + 151 | query.withSort(1) + 152 | query.generalOptions.withLabel('Instance Type') + 153 | query.selectionOptions.withMulti(true) + 154 | query.selectionOptions.withIncludeAll(true) + 155 | query.refresh.onLoad() + 156 | query.refresh.onTime(), 157 | 158 | capacityType: 159 | query.new( 160 | 'capacity_type', 161 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s, %(zone)s, %(arch)s, %(os)s, %(instanceType)s}, capacity_type)' % defaultFilters, 162 | ) + 163 | query.withDatasourceFromVariable(this.datasource) + 164 | 
query.withSort(1) + 165 | query.generalOptions.withLabel('Capacity Type') + 166 | query.selectionOptions.withMulti(true) + 167 | query.selectionOptions.withIncludeAll(true) + 168 | query.refresh.onLoad() + 169 | query.refresh.onTime(), 170 | 171 | nodepool: 172 | query.new( 173 | 'nodepool', 174 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s, %(region)s, %(zone)s, %(arch)s, %(os)s, %(instanceType)s, %(capacityType)s}, nodepool)' % defaultFilters, 175 | ) + 176 | query.withDatasourceFromVariable(this.datasource) + 177 | query.withSort(1) + 178 | query.generalOptions.withLabel('Node Pool') + 179 | query.selectionOptions.withMulti(true) + 180 | query.selectionOptions.withIncludeAll(true) + 181 | query.refresh.onLoad() + 182 | query.refresh.onTime(), 183 | 184 | nodepoolSimple: 185 | query.new( 186 | 'nodepool', 187 | 'label_values(karpenter_nodes_allocatable{%(cluster)s, %(job)s}, nodepool)' % defaultFilters, 188 | ) + 189 | query.withDatasourceFromVariable(this.datasource) + 190 | query.withSort(1) + 191 | query.generalOptions.withLabel('Node Pool') + 192 | query.selectionOptions.withMulti(true) + 193 | query.selectionOptions.withIncludeAll(true) + 194 | query.refresh.onLoad() + 195 | query.refresh.onTime(), 196 | }, 197 | } 198 | -------------------------------------------------------------------------------- /prometheus_alerts.yaml: -------------------------------------------------------------------------------- 1 | "groups": 2 | - "name": "karpenter" 3 | "rules": 4 | - "alert": "KarpenterCloudProviderErrors" 5 | "annotations": 6 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance" 7 | "description": "The Karpenter provider {{ $labels.provider }} with the controller {{ $labels.controller }} has errors with the method {{ $labels.method }}." 8 | "summary": "Karpenter has Cloud Provider Errors." 
9 | "expr": | 10 | sum( 11 | increase( 12 | karpenter_cloudprovider_errors_total{ 13 | job="karpenter", 14 | controller!~"nodeclaim.termination|node.termination", 15 | error!="NodeClaimNotFoundError" 16 | }[5m] 17 | ) 18 | ) by (cluster, namespace, job, provider, controller, method) > 0 19 | "for": "5m" 20 | "labels": 21 | "severity": "warning" 22 | - "alert": "KarpenterNodeClaimsTerminationDurationHigh" 23 | "annotations": 24 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kact-jkwq/kubernetes-autoscaling-karpenter-activity" 25 | "description": "The average node claim termination duration in Karpenter has exceeded 20 minutes for more than 15 minutes in nodepool {{ $labels.nodepool }}. This may indicate cloud provider issues or improper instance termination handling." 26 | "summary": "Karpenter Node Claims Termination Duration is High." 27 | "expr": | 28 | sum( 29 | rate( 30 | karpenter_nodeclaims_termination_duration_seconds_sum{ 31 | job="karpenter" 32 | }[5m] 33 | ) 34 | ) by (cluster, namespace, job, nodepool) 35 | / 36 | sum( 37 | rate( 38 | karpenter_nodeclaims_termination_duration_seconds_count{ 39 | job="karpenter" 40 | }[5m] 41 | ) 42 | ) by (cluster, namespace, job, nodepool) > 1200 43 | "for": "15m" 44 | "labels": 45 | "severity": "warning" 46 | - "alert": "KarpenterNodepoolNearCapacity" 47 | "annotations": 48 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview" 49 | "description": "The resource {{ $labels.resource_type }} in the Karpenter node pool {{ $labels.nodepool }} is nearing its limit. Consider scaling or adding resources." 50 | "summary": "Karpenter Nodepool near capacity." 
51 | "expr": | 52 | sum ( 53 | karpenter_nodepools_usage{job="karpenter"} 54 | ) by (cluster, namespace, job, nodepool, resource_type) 55 | / 56 | sum ( 57 | karpenter_nodepools_limit{job="karpenter"} 58 | ) by (cluster, namespace, job, nodepool, resource_type) 59 | * 100 > 75 60 | "for": "15m" 61 | "labels": 62 | "severity": "warning" 63 | - "name": "cluster-autoscaler" 64 | "rules": 65 | - "alert": "ClusterAutoscalerNodeCountNearCapacity" 66 | "annotations": 67 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler" 68 | "description": "The node count for the cluster autoscaler job {{ $labels.job }} is reaching max limit. Consider scaling node groups." 69 | "summary": "Cluster Autoscaler Node Count near Capacity." 70 | "expr": | 71 | sum ( 72 | cluster_autoscaler_nodes_count{ 73 | job="cluster-autoscaler" 74 | } 75 | ) by (cluster, namespace, job) 76 | / 77 | sum ( 78 | cluster_autoscaler_max_nodes_count{ 79 | job="cluster-autoscaler" 80 | } 81 | ) by (cluster, namespace, job) 82 | * 100 > 75 83 | "for": "15m" 84 | "labels": 85 | "severity": "warning" 86 | - "alert": "ClusterAutoscalerUnschedulablePods" 87 | "annotations": 88 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler" 89 | "description": "The cluster currently has unschedulable pods, indicating resource shortages. Consider adding more nodes or increasing node group capacity." 
90 | "summary": "Pods Pending Scheduling - Cluster Node Group Scaling Required" 91 | "expr": | 92 | sum ( 93 | cluster_autoscaler_unschedulable_pods_count{ 94 | job="cluster-autoscaler" 95 | } 96 | ) by (cluster, namespace, job) 97 | > 0 98 | "for": "15m" 99 | "labels": 100 | "severity": "warning" 101 | - "name": "keda" 102 | "rules": 103 | - "alert": "KedaScaledJobErrors" 104 | "annotations": 105 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kedasj-jkwq/kubernetes-autoscaling-keda-scaled-job?var-scaled_job={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}" 106 | "description": "KEDA scaled jobs are experiencing errors. Check the scaled job {{ $labels.scaledObject }} in the namespace {{ $labels.exported_namespace }}." 107 | "summary": "Errors detected for KEDA scaled jobs." 108 | "expr": | 109 | sum( 110 | increase( 111 | keda_scaled_job_errors_total{ 112 | job="keda-operator" 113 | }[10m] 114 | ) 115 | ) by (cluster, job, exported_namespace, scaledObject) > 0 116 | "for": "1m" 117 | "labels": 118 | "severity": "warning" 119 | - "alert": "KedaScaledObjectErrors" 120 | "annotations": 121 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}" 122 | "description": "KEDA scaled objects are experiencing errors. Check the scaled object {{ $labels.scaledObject }} in the namespace {{ $labels.exported_namespace }}." 123 | "summary": "Errors detected for KEDA scaled objects." 
124 | "expr": | 125 | sum( 126 | increase( 127 | keda_scaled_object_errors_total{ 128 | job="keda-operator" 129 | }[10m] 130 | ) 131 | ) by (cluster, job, exported_namespace, scaledObject) > 0 132 | "for": "1m" 133 | "labels": 134 | "severity": "warning" 135 | - "alert": "KedaScalerLatencyHigh" 136 | "annotations": 137 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object={{ $labels.scaledObject }}&var-scaler={{ $labels.scaler }}" 138 | "description": "Metric latency for scaler {{ $labels.scaler }} for the object {{ $labels.scaledObject }} has exceeded acceptable limits." 139 | "summary": "High latency for KEDA scaler metrics." 140 | "expr": | 141 | avg( 142 | keda_scaler_metrics_latency_seconds{ 143 | job="keda-operator" 144 | } 145 | ) by (cluster, job, exported_namespace, scaledObject, scaler) > 5 146 | "for": "10m" 147 | "labels": 148 | "severity": "warning" 149 | - "alert": "KedaScaledObjectPaused" 150 | "annotations": 151 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}" 152 | "description": "The scaled object {{ $labels.scaledObject }} in namespace {{ $labels.exported_namespace }} is paused for longer than 25h. This may indicate a configuration issue or manual intervention." 153 | "summary": "KEDA scaled object is paused." 
154 | "expr": | 155 | max( 156 | keda_scaled_object_paused{ 157 | job="keda-operator" 158 | } 159 | ) by (cluster, job, exported_namespace, scaledObject) > 0 160 | "for": "25h" 161 | "labels": 162 | "severity": "warning" 163 | - "alert": "KedaScalerDetailErrors" 164 | "annotations": 165 | "dashboard_url": "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaler={{ $labels.scaler }}&var-scaled_object={{ $labels.scaledObject }}" 166 | "description": "Errors have occurred in the KEDA scaler {{ $labels.scaler }}. Investigate the scaler for the {{ $labels.type }} {{ $labels.scaledObject }} in namespace {{ $labels.exported_namespace }}." 167 | "summary": "Errors detected in KEDA scaler." 168 | "expr": | 169 | sum( 170 | increase( 171 | keda_scaler_detail_errors_total{ 172 | job="keda-operator" 173 | }[10m] 174 | ) 175 | ) by (cluster, job, exported_namespace, scaledObject, type, scaler) > 0 176 | "for": "1m" 177 | "labels": 178 | "severity": "warning" 179 | -------------------------------------------------------------------------------- /dashboards/kubernetes/util.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local dashboard = g.dashboard; 4 | 5 | local variable = dashboard.variable; 6 | local datasource = variable.datasource; 7 | local query = variable.query; 8 | 9 | { 10 | filters(config):: { 11 | local this = self, 12 | cluster: '%(clusterLabel)s="$cluster"' % config, 13 | clusterMulti: '%(clusterLabel)s=~"$cluster"' % config, 14 | clusterLabel: config.clusterLabel, 15 | job: 'job=~"$job"', 16 | namespace: 'namespace=~"$namespace"', 17 | container: 'container=~"$container"', 18 | 19 | // PDB 20 | pdb: 'poddisruptionbudget=~"$poddisruptionbudget"', 21 | 22 | // HPA 23 | hpa: 'horizontalpodautoscaler=~"$horizontalpodautoscaler"', 24 | hpaMetricName: 
'metric_name=~"$metric_name"', 25 | hpaMetricTargetType: 'metric_target_type=~"$metric_target_type"', 26 | 27 | // VPA 28 | vpa: 'verticalpodautoscaler=~"$verticalpodautoscaler"', 29 | vpaPrefix: config.vpa.vpaPrefix, 30 | 31 | base: ||| 32 | %(cluster)s, 33 | %(job)s, 34 | %(namespace)s 35 | ||| % this, 36 | 37 | baseMulti: ||| 38 | %(clusterMulti)s, 39 | %(job)s, 40 | %(namespace)s 41 | ||| % this, 42 | 43 | // PDB 44 | withPdb: ||| 45 | %(base)s, 46 | %(pdb)s 47 | ||| % this, 48 | 49 | // HPA 50 | withHpa: ||| 51 | %(base)s, 52 | %(hpa)s 53 | ||| % this, 54 | 55 | withHpaMetricName: ||| 56 | %(withHpa)s, 57 | %(hpaMetricName)s 58 | ||| % this, 59 | 60 | withHpaMetricTargetType: ||| 61 | %(withHpaMetricName)s, 62 | %(hpaMetricTargetType)s 63 | ||| % this, 64 | 65 | // VPA 66 | withVpa: ||| 67 | %(base)s, 68 | %(vpa)s, 69 | %(container)s 70 | ||| % this, 71 | }, 72 | 73 | variables(config):: { 74 | local this = self, 75 | 76 | local defaultFilters = $.filters(config), 77 | 78 | datasource: 79 | datasource.new( 80 | 'datasource', 81 | 'prometheus', 82 | ) + 83 | datasource.generalOptions.withLabel('Data source') + 84 | { 85 | current: { 86 | selected: true, 87 | text: config.datasourceName, 88 | value: config.datasourceName, 89 | }, 90 | }, 91 | 92 | cluster: 93 | query.new( 94 | config.clusterLabel, 95 | 'label_values(kube_pod_info{%(kubeStateMetricsSelector)s}, cluster)' % config, 96 | ) + 97 | query.withDatasourceFromVariable(this.datasource) + 98 | query.withSort() + 99 | query.generalOptions.withLabel('Cluster') + 100 | query.refresh.onLoad() + 101 | query.refresh.onTime() + 102 | query.selectionOptions.withMulti(config.vpa.clusterAggregation) + 103 | ( 104 | if config.showMultiCluster 105 | then query.generalOptions.showOnDashboard.withLabelAndValue() 106 | else query.generalOptions.showOnDashboard.withNothing() 107 | ), 108 | 109 | // PDB 110 | pdbJob: 111 | query.new( 112 | 'job', 113 | 
'label_values(kube_poddisruptionbudget_status_current_healthy{%(cluster)s}, job)' % defaultFilters, 114 | ) + 115 | query.withDatasourceFromVariable(this.datasource) + 116 | query.withSort() + 117 | query.generalOptions.withLabel('Job') + 118 | query.refresh.onLoad() + 119 | query.refresh.onTime(), 120 | 121 | pdbNamespace: 122 | query.new( 123 | 'namespace', 124 | 'label_values(kube_poddisruptionbudget_status_current_healthy{%(cluster)s, %(job)s}, namespace)' % defaultFilters, 125 | ) + 126 | query.withDatasourceFromVariable(this.datasource) + 127 | query.withSort() + 128 | query.generalOptions.withLabel('Namespace') + 129 | query.selectionOptions.withMulti(true) + 130 | query.refresh.onLoad() + 131 | query.refresh.onTime(), 132 | 133 | pdb: 134 | query.new( 135 | 'poddisruptionbudget', 136 | 'label_values(kube_poddisruptionbudget_status_current_healthy{%(cluster)s, %(job)s, %(namespace)s}, poddisruptionbudget)' % defaultFilters, 137 | ) + 138 | query.withDatasourceFromVariable(this.datasource) + 139 | query.withSort() + 140 | query.generalOptions.withLabel('Pod Disruption Budget') + 141 | query.selectionOptions.withMulti(true) + 142 | query.selectionOptions.withIncludeAll(false) + 143 | query.refresh.onLoad() + 144 | query.refresh.onTime(), 145 | 146 | // HPA 147 | hpaJob: 148 | query.new( 149 | 'job', 150 | 'label_values(kube_horizontalpodautoscaler_spec_target_metric{%(cluster)s}, job)' % defaultFilters, 151 | ) + 152 | query.withDatasourceFromVariable(this.datasource) + 153 | query.withSort() + 154 | query.generalOptions.withLabel('Job') + 155 | query.refresh.onLoad() + 156 | query.refresh.onTime(), 157 | 158 | hpaNamespace: 159 | query.new( 160 | 'namespace', 161 | 'label_values(kube_horizontalpodautoscaler_spec_target_metric{%(cluster)s, %(job)s}, namespace)' % defaultFilters, 162 | ) + 163 | query.withDatasourceFromVariable(this.datasource) + 164 | query.withSort() + 165 | query.generalOptions.withLabel('Namespace') + 166 | 
query.selectionOptions.withMulti(true) + 167 | query.refresh.onLoad() + 168 | query.refresh.onTime(), 169 | 170 | hpa: 171 | query.new( 172 | 'horizontalpodautoscaler', 173 | 'label_values(kube_horizontalpodautoscaler_spec_target_metric{%(cluster)s, %(job)s, %(namespace)s}, horizontalpodautoscaler)' % defaultFilters, 174 | ) + 175 | query.withDatasourceFromVariable(this.datasource) + 176 | query.withSort() + 177 | query.generalOptions.withLabel('HPA') + 178 | query.refresh.onLoad() + 179 | query.refresh.onTime(), 180 | 181 | hpaMetricName: 182 | query.new( 183 | 'metric_name', 184 | 'label_values(kube_horizontalpodautoscaler_spec_target_metric{%(cluster)s, %(job)s, %(namespace)s, %(hpa)s}, metric_name)' % defaultFilters, 185 | ) + 186 | query.withDatasourceFromVariable(this.datasource) + 187 | query.withSort() + 188 | query.generalOptions.withLabel('Metric Name') + 189 | query.refresh.onLoad() + 190 | query.refresh.onTime(), 191 | 192 | hpaMetricTargetType: 193 | query.new( 194 | 'metric_target_type', 195 | 'label_values(kube_horizontalpodautoscaler_spec_target_metric{%(cluster)s, %(job)s, %(namespace)s, %(hpa)s, %(hpaMetricName)s}, metric_target_type)' % defaultFilters, 196 | ) + 197 | query.withDatasourceFromVariable(this.datasource) + 198 | query.withSort() + 199 | query.generalOptions.withLabel('Metric Target Type') + 200 | query.selectionOptions.withMulti(true) + 201 | query.selectionOptions.withIncludeAll(true) + 202 | query.refresh.onLoad() + 203 | query.refresh.onTime(), 204 | 205 | // VPA 206 | vpaJob: 207 | query.new( 208 | 'job', 209 | 'label_values(kube_customresource_verticalpodautoscaler_labels{%(cluster)s}, job)' % defaultFilters, 210 | ) + 211 | query.withDatasourceFromVariable(this.datasource) + 212 | query.withSort() + 213 | query.generalOptions.withLabel('Job') + 214 | query.refresh.onLoad() + 215 | query.refresh.onTime(), 216 | 217 | vpaNamespace: 218 | query.new( 219 | 'namespace', 220 | 
'label_values(kube_customresource_verticalpodautoscaler_labels{%(cluster)s, %(job)s}, namespace)' % defaultFilters, 221 | ) + 222 | query.withDatasourceFromVariable(this.datasource) + 223 | query.withSort() + 224 | query.generalOptions.withLabel('Namespace') + 225 | query.selectionOptions.withMulti(true) + 226 | query.refresh.onLoad() + 227 | query.refresh.onTime(), 228 | 229 | vpa: 230 | query.new( 231 | 'verticalpodautoscaler', 232 | 'label_values(kube_customresource_verticalpodautoscaler_labels{%(cluster)s, %(job)s, %(namespace)s}, verticalpodautoscaler)' % defaultFilters, 233 | ) + 234 | query.withDatasourceFromVariable(this.datasource) + 235 | query.withSort() + 236 | query.generalOptions.withLabel('Vertical Pod Autoscaler') + 237 | query.refresh.onLoad() + 238 | query.refresh.onTime(), 239 | 240 | vpaContainer: 241 | query.new( 242 | 'container', 243 | 'label_values(kube_customresource_verticalpodautoscaler_status_recommendation_containerrecommendations_target{%(cluster)s, %(job)s, %(namespace)s, %(vpa)s}, container)' % defaultFilters, 244 | ) + 245 | query.withDatasourceFromVariable(this.datasource) + 246 | query.withSort() + 247 | query.generalOptions.withLabel('Container') + 248 | query.selectionOptions.withMulti(true) + 249 | query.selectionOptions.withIncludeAll(true) + 250 | query.refresh.onLoad() + 251 | query.refresh.onTime(), 252 | }, 253 | } 254 | -------------------------------------------------------------------------------- /dashboards/keda/util.libsonnet: -------------------------------------------------------------------------------- 1 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 2 | 3 | local dashboard = g.dashboard; 4 | 5 | local variable = dashboard.variable; 6 | local datasource = variable.datasource; 7 | local query = variable.query; 8 | 9 | { 10 | filters(config):: { 11 | local this = self, 12 | cluster: '%(clusterLabel)s="$cluster"' % config, 13 | job: 'job=~"$job"', 14 | operatorNamespace: 
'namespace=~"$operator_namespace"', 15 | resourceNamespace: 'exported_namespace=~"$resource_namespace"', 16 | scaledObject: 'scaledObject="$scaled_object"', 17 | scaledJob: 'scaledObject="$scaled_job"', 18 | scaler: 'scaler="$scaler"', 19 | metric: 'metric="$metric"', 20 | 21 | base: ||| 22 | %(cluster)s, 23 | %(job)s, 24 | %(operatorNamespace)s 25 | ||| % this, 26 | 27 | withResourceNamespace: ||| 28 | %(base)s, 29 | %(resourceNamespace)s 30 | ||| % this, 31 | 32 | withScaledObject: ||| 33 | %(withResourceNamespace)s, 34 | type="scaledobject", 35 | %(scaledObject)s 36 | ||| % this, 37 | 38 | withScaledJob: ||| 39 | %(withResourceNamespace)s, 40 | type="scaledjob", 41 | %(scaledJob)s 42 | ||| % this, 43 | 44 | withScaledObjectScaler: ||| 45 | %(withScaledObject)s, 46 | %(scaler)s 47 | ||| % this, 48 | 49 | withScaledJobScaler: ||| 50 | %(withScaledJob)s, 51 | %(scaler)s 52 | ||| % this, 53 | 54 | withScaledObjectMetric: ||| 55 | %(withScaledObjectScaler)s, 56 | %(metric)s 57 | ||| % this, 58 | 59 | withScaledJobMetric: ||| 60 | %(withScaledJobScaler)s, 61 | %(metric)s 62 | ||| % this, 63 | }, 64 | 65 | variables(config):: { 66 | local this = self, 67 | 68 | local defaultFilters = $.filters(config), 69 | 70 | datasource: 71 | datasource.new( 72 | 'datasource', 73 | 'prometheus', 74 | ) + 75 | datasource.generalOptions.withLabel('Data source') + 76 | { 77 | current: { 78 | selected: true, 79 | text: config.datasourceName, 80 | value: config.datasourceName, 81 | }, 82 | }, 83 | 84 | cluster: 85 | query.new( 86 | config.clusterLabel, 87 | 'label_values(keda_build_info{}, cluster)' % config, 88 | ) + 89 | query.withDatasourceFromVariable(this.datasource) + 90 | query.withSort() + 91 | query.generalOptions.withLabel('Cluster') + 92 | query.refresh.onLoad() + 93 | query.refresh.onTime() + 94 | ( 95 | if config.showMultiCluster 96 | then query.generalOptions.showOnDashboard.withLabelAndValue() 97 | else query.generalOptions.showOnDashboard.withNothing() 98 | ), 99 | 100 | 
scaledObjectJob: 101 | query.new( 102 | 'job', 103 | 'label_values(keda_scaled_object_paused{%(cluster)s}, job)' % defaultFilters, 104 | ) + 105 | query.withDatasourceFromVariable(this.datasource) + 106 | query.withSort() + 107 | query.generalOptions.withLabel('Job') + 108 | query.selectionOptions.withMulti(true) + 109 | query.selectionOptions.withIncludeAll(true) + 110 | query.refresh.onLoad() + 111 | query.refresh.onTime(), 112 | 113 | scaledJobJob: 114 | query.new( 115 | 'job', 116 | 'label_values(keda_scaled_job_errors_total{%(cluster)s}, job)' % defaultFilters, 117 | ) + 118 | query.withDatasourceFromVariable(this.datasource) + 119 | query.withSort() + 120 | query.generalOptions.withLabel('Job') + 121 | query.selectionOptions.withMulti(true) + 122 | query.selectionOptions.withIncludeAll(true) + 123 | query.refresh.onLoad() + 124 | query.refresh.onTime(), 125 | 126 | scaledObjectOperatorNamespace: 127 | query.new( 128 | 'operator_namespace', 129 | 'label_values(keda_scaled_object_paused{%(cluster)s, %(job)s}, namespace)' % defaultFilters, 130 | ) + 131 | query.withDatasourceFromVariable(this.datasource) + 132 | query.withSort() + 133 | query.generalOptions.withLabel('Operator Namespace') + 134 | query.selectionOptions.withMulti(true) + 135 | query.selectionOptions.withIncludeAll(true) + 136 | query.refresh.onLoad() + 137 | query.refresh.onTime(), 138 | 139 | scaledJobOperatorNamespace: 140 | query.new( 141 | 'operator_namespace', 142 | 'label_values(keda_scaled_job_errors_total{%(cluster)s, %(job)s}, namespace)' % defaultFilters, 143 | ) + 144 | query.withDatasourceFromVariable(this.datasource) + 145 | query.withSort() + 146 | query.generalOptions.withLabel('Operator Namespace') + 147 | query.selectionOptions.withMulti(true) + 148 | query.selectionOptions.withIncludeAll(true) + 149 | query.refresh.onLoad() + 150 | query.refresh.onTime(), 151 | 152 | scaledObjectResourceNamespace: 153 | query.new( 154 | 'resource_namespace', 155 | 
'label_values(keda_scaled_object_paused{%(cluster)s, %(job)s, %(operatorNamespace)s}, exported_namespace)' % defaultFilters, 156 | ) + 157 | query.withDatasourceFromVariable(this.datasource) + 158 | query.withSort() + 159 | query.generalOptions.withLabel('Resource Namespace') + 160 | query.refresh.onLoad() + 161 | query.refresh.onTime(), 162 | 163 | scaledJobResourceNamespace: 164 | query.new( 165 | 'resource_namespace', 166 | 'label_values(keda_scaled_job_errors_total{%(cluster)s, %(job)s, %(operatorNamespace)s}, exported_namespace)' % defaultFilters, 167 | ) + 168 | query.withDatasourceFromVariable(this.datasource) + 169 | query.withSort() + 170 | query.generalOptions.withLabel('Resource Namespace') + 171 | query.refresh.onLoad() + 172 | query.refresh.onTime(), 173 | 174 | scaledObject: 175 | query.new( 176 | 'scaled_object', 177 | 'label_values(keda_scaled_object_paused{%(cluster)s, %(job)s, %(operatorNamespace)s, %(resourceNamespace)s}, scaledObject)' % defaultFilters, 178 | ) + 179 | query.withDatasourceFromVariable(this.datasource) + 180 | query.withSort() + 181 | query.generalOptions.withLabel('Scaled Object') + 182 | query.selectionOptions.withMulti(false) + 183 | query.selectionOptions.withIncludeAll(false) + 184 | query.refresh.onLoad() + 185 | query.refresh.onTime(), 186 | 187 | scaledJob: 188 | query.new( 189 | 'scaled_job', 190 | 'label_values(keda_scaled_job_errors_total{%(cluster)s, %(job)s, %(operatorNamespace)s, %(resourceNamespace)s}, scaledJob)' % defaultFilters, 191 | ) + 192 | query.withDatasourceFromVariable(this.datasource) + 193 | query.withSort() + 194 | query.generalOptions.withLabel('Scaled Job') + 195 | query.selectionOptions.withMulti(false) + 196 | query.selectionOptions.withIncludeAll(false) + 197 | query.refresh.onLoad() + 198 | query.refresh.onTime(), 199 | 200 | scalerForScaledObject: 201 | query.new( 202 | 'scaler', 203 | 'label_values(keda_scaler_active{%(cluster)s, %(job)s, %(operatorNamespace)s, 
exported_namespace="$resource_namespace", type="scaledobject", scaledObject="$scaled_object"}, scaler)' % defaultFilters, 204 | ) + 205 | query.withDatasourceFromVariable(this.datasource) + 206 | query.withSort() + 207 | query.generalOptions.withLabel('Scaler') + 208 | query.selectionOptions.withMulti(false) + 209 | query.selectionOptions.withIncludeAll(false) + 210 | query.refresh.onLoad() + 211 | query.refresh.onTime(), 212 | 213 | scalerForScaledJob: 214 | query.new( 215 | 'scaler', 216 | 'label_values(keda_scaler_active{%(cluster)s, %(job)s, %(operatorNamespace)s, exported_namespace="$resource_namespace", type="scaledjob", scaledObject="$scaled_job"}, scaler)' % defaultFilters, 217 | ) + 218 | query.withDatasourceFromVariable(this.datasource) + 219 | query.withSort() + 220 | query.generalOptions.withLabel('Scaler') + 221 | query.selectionOptions.withMulti(false) + 222 | query.selectionOptions.withIncludeAll(false) + 223 | query.refresh.onLoad() + 224 | query.refresh.onTime(), 225 | 226 | metricForScaledObject: 227 | query.new( 228 | 'metric', 229 | 'label_values(keda_scaler_active{%(cluster)s, %(job)s, %(operatorNamespace)s, exported_namespace="$resource_namespace", type="scaledobject", scaledObject="$scaled_object", scaler="$scaler"}, metric)' % defaultFilters, 230 | ) + 231 | query.withDatasourceFromVariable(this.datasource) + 232 | query.withSort() + 233 | query.generalOptions.withLabel('Metric') + 234 | query.selectionOptions.withMulti(false) + 235 | query.selectionOptions.withIncludeAll(false) + 236 | query.refresh.onLoad() + 237 | query.refresh.onTime(), 238 | 239 | metricForScaledJob: 240 | query.new( 241 | 'metric', 242 | 'label_values(keda_scaler_active{%(cluster)s, %(job)s, %(operatorNamespace)s, exported_namespace="$resource_namespace", type="scaledjob", scaledObject="$scaled_job", scaler="$scaler"}, metric)' % defaultFilters, 243 | ) + 244 | query.withDatasourceFromVariable(this.datasource) + 245 | query.withSort() + 246 | 
query.generalOptions.withLabel('Metric') + 247 | query.selectionOptions.withMulti(false) + 248 | query.selectionOptions.withIncludeAll(false) + 249 | query.refresh.onLoad() + 250 | query.refresh.onTime(), 251 | }, 252 | } 253 | -------------------------------------------------------------------------------- /dashboards/kubernetes/kubernetes-autoscaling-hpa.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | local tablePanel = g.panel.table; 10 | 11 | // Table 12 | local tbQueryOptions = tablePanel.queryOptions; 13 | 14 | { 15 | grafanaDashboards+:: { 16 | 'kubernetes-autoscaling-mixin-hpa.json': 17 | 18 | local defaultVariables = util.variables($._config); 19 | 20 | local variables = [ 21 | defaultVariables.datasource, 22 | defaultVariables.cluster, 23 | defaultVariables.hpaJob, 24 | defaultVariables.hpaNamespace, 25 | defaultVariables.hpa, 26 | defaultVariables.hpaMetricName, 27 | defaultVariables.hpaMetricTargetType, 28 | ]; 29 | 30 | local defaultFilters = util.filters($._config); 31 | local queries = { 32 | desiredReplicas: ||| 33 | round( 34 | sum( 35 | kube_horizontalpodautoscaler_status_desired_replicas{ 36 | %(withHpa)s 37 | } 38 | ) 39 | ) 40 | ||| % defaultFilters, 41 | 42 | currentReplicas: ||| 43 | round( 44 | sum( 45 | kube_horizontalpodautoscaler_status_current_replicas{ 46 | %(withHpa)s 47 | } 48 | ) 49 | ) 50 | ||| % defaultFilters, 51 | 52 | minReplicas: ||| 53 | round( 54 | sum( 55 | kube_horizontalpodautoscaler_spec_min_replicas{ 56 | %(withHpa)s 57 | } 58 | ) 59 | ) 60 | ||| % defaultFilters, 61 | 62 | maxReplicas: ||| 63 | round( 64 | sum( 65 | 
kube_horizontalpodautoscaler_spec_max_replicas{ 66 | %(withHpa)s 67 | } 68 | ) 69 | ) 70 | ||| % defaultFilters, 71 | 72 | metricTargets: ||| 73 | sum( 74 | kube_horizontalpodautoscaler_spec_target_metric{ 75 | %(withHpaMetricName)s 76 | } 77 | ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type) 78 | ||| % defaultFilters, 79 | 80 | usageThreshold: ||| 81 | sum( 82 | kube_horizontalpodautoscaler_spec_target_metric{ 83 | %(withHpaMetricTargetType)s 84 | } 85 | ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type) 86 | ||| % defaultFilters, 87 | 88 | utilization: ||| 89 | sum( 90 | kube_horizontalpodautoscaler_status_target_metric{ 91 | %(withHpaMetricTargetType)s 92 | } 93 | ) by (job, namespace, horizontalpodautoscaler, metric_name, metric_target_type) 94 | ||| % defaultFilters, 95 | }; 96 | 97 | local panels = { 98 | desiredReplicasStat: 99 | mixinUtils.dashboards.statPanel( 100 | 'Desired Replicas', 101 | 'short', 102 | queries.desiredReplicas, 103 | description='The desired number of replicas for the HPA.', 104 | ), 105 | 106 | currentReplicasStat: 107 | mixinUtils.dashboards.statPanel( 108 | 'Current Replicas', 109 | 'short', 110 | queries.currentReplicas, 111 | description='The current number of replicas for the HPA.', 112 | ), 113 | 114 | minReplicasStat: 115 | mixinUtils.dashboards.statPanel( 116 | 'Min Replicas', 117 | 'short', 118 | queries.minReplicas, 119 | description='The minimum number of replicas configured for the HPA.', 120 | ), 121 | 122 | maxReplicasStat: 123 | mixinUtils.dashboards.statPanel( 124 | 'Max Replicas', 125 | 'short', 126 | queries.maxReplicas, 127 | description='The maximum number of replicas configured for the HPA.', 128 | ), 129 | 130 | usageAndThresholdTimeSeries: 131 | mixinUtils.dashboards.timeSeriesPanel( 132 | 'Usage & Threshold', 133 | 'short', 134 | [ 135 | { 136 | expr: queries.utilization, 137 | legend: '{{ metric_target_type }} / {{ metric_name }}', 138 | }, 139 | { 140 
| expr: queries.usageThreshold, 141 | legend: 'Threshold / {{ metric_name }}', 142 | }, 143 | ], 144 | fillOpacity=0, 145 | description='The current utilization and configured threshold for the HPA metric.', 146 | ), 147 | 148 | replicasTimeSeries: 149 | mixinUtils.dashboards.timeSeriesPanel( 150 | 'Replicas', 151 | 'short', 152 | [ 153 | { 154 | expr: queries.desiredReplicas, 155 | legend: 'Desired Replicas', 156 | }, 157 | { 158 | expr: queries.currentReplicas, 159 | legend: 'Current Replicas', 160 | }, 161 | { 162 | expr: queries.minReplicas, 163 | legend: 'Min Replicas', 164 | }, 165 | { 166 | expr: queries.maxReplicas, 167 | legend: 'Max Replicas', 168 | }, 169 | ], 170 | fillOpacity=0, 171 | description='The desired, current, minimum, and maximum replicas for the HPA over time.', 172 | ), 173 | 174 | metricTargetsTable: 175 | mixinUtils.dashboards.tablePanel( 176 | 'Metric Targets', 177 | 'short', 178 | queries.metricTargets, 179 | description='Configured metric targets for the HPA.', 180 | sortBy={ name: 'Horizontal Pod Autoscaler', desc: false }, 181 | transformations=[ 182 | tbQueryOptions.transformation.withId( 183 | 'organize' 184 | ) + 185 | tbQueryOptions.transformation.withOptions( 186 | { 187 | renameByName: { 188 | namespace: 'Namespace', 189 | horizontalpodautoscaler: 'Horizontal Pod Autoscaler', 190 | metric_name: 'Metric Name', 191 | metric_target_type: 'Metric Target Type', 192 | 'Value #A': 'Threshold', 193 | }, 194 | indexByName: { 195 | horizontalpodautoscaler: 0, 196 | namespace: 1, 197 | metric_name: 2, 198 | metric_target_type: 3, 199 | 'Value #A': 4, 200 | }, 201 | excludeByName: { 202 | Time: true, 203 | job: true, 204 | }, 205 | } 206 | ), 207 | ] 208 | ), 209 | }; 210 | 211 | local rows = 212 | [ 213 | row.new('Summary') + 214 | row.gridPos.withX(0) + 215 | row.gridPos.withY(0) + 216 | row.gridPos.withW(24) + 217 | row.gridPos.withH(1), 218 | ] + 219 | grid.makeGrid( 220 | [ 221 | panels.desiredReplicasStat, 222 | 
panels.currentReplicasStat, 223 | panels.minReplicasStat, 224 | panels.maxReplicasStat, 225 | ], 226 | panelWidth=6, 227 | panelHeight=3, 228 | startY=1 229 | ) + 230 | [ 231 | panels.metricTargetsTable + 232 | row.gridPos.withX(0) + 233 | row.gridPos.withY(4) + 234 | row.gridPos.withW(24) + 235 | row.gridPos.withH(8), 236 | row.new('$horizontalpodautoscaler / $metric_name / $metric_target_type') + 237 | row.gridPos.withX(0) + 238 | row.gridPos.withY(12) + 239 | row.gridPos.withW(24) + 240 | row.gridPos.withH(1) + 241 | row.withRepeat('metric_target_type'), 242 | ] + 243 | grid.makeGrid( 244 | [ 245 | panels.usageAndThresholdTimeSeries, 246 | panels.replicasTimeSeries, 247 | ], 248 | panelWidth=24, 249 | panelHeight=6, 250 | startY=13 251 | ); 252 | 253 | mixinUtils.dashboards.bypassDashboardValidation + 254 | dashboard.new( 255 | 'Kubernetes / Autoscaling / Horizontal Pod Autoscaler', 256 | ) + 257 | dashboard.withDescription('A dashboard that monitors Horizontal Pod Autoscalers. %s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 258 | dashboard.withUid($._config.hpaDashboardUid) + 259 | dashboard.withTags($._config.tags + ['kubernetes-core']) + 260 | dashboard.withTimezone('utc') + 261 | dashboard.withEditable(true) + 262 | dashboard.time.withFrom('now-6h') + 263 | dashboard.time.withTo('now') + 264 | dashboard.withVariables(variables) + 265 | dashboard.withLinks( 266 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 267 | ) + 268 | dashboard.withPanels(rows), 269 | }, 270 | } 271 | -------------------------------------------------------------------------------- /dashboards/karpenter/karpenter-activity.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 
'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | { 10 | grafanaDashboards+:: { 11 | 'kubernetes-autoscaling-mixin-karpenter-act.json': 12 | if !$._config.karpenter.enabled then {} else 13 | 14 | local defaultVariables = util.variables($._config); 15 | 16 | local variables = [ 17 | defaultVariables.datasource, 18 | defaultVariables.cluster, 19 | defaultVariables.job, 20 | defaultVariables.nodepoolSimple, 21 | ]; 22 | 23 | local defaultFilters = util.filters($._config); 24 | local queries = { 25 | // Node Activity 26 | nodesCreatedByNodePool: ||| 27 | round( 28 | sum( 29 | increase( 30 | karpenter_nodes_created_total{ 31 | %(base)s, 32 | %(nodepool)s 33 | }[$__rate_interval] 34 | ) 35 | ) by (nodepool) 36 | ) 37 | ||| % defaultFilters, 38 | 39 | nodesTerminatedByNodePool: ||| 40 | round( 41 | sum( 42 | increase( 43 | karpenter_nodes_terminated_total{ 44 | %(base)s, 45 | %(nodepool)s 46 | }[$__rate_interval] 47 | ) 48 | ) by (nodepool) 49 | ) 50 | ||| % defaultFilters, 51 | 52 | nodesVoluntaryDisruptionDecisions: ||| 53 | round( 54 | sum( 55 | increase( 56 | karpenter_voluntary_disruption_decisions_total{ 57 | %(base)s 58 | }[$__rate_interval] 59 | ) 60 | ) by (decision, reason) 61 | ) 62 | ||| % defaultFilters, 63 | 64 | nodesVoluntaryDisruptionEligible: ||| 65 | round( 66 | sum( 67 | karpenter_voluntary_disruption_eligible_nodes{ 68 | %(base)s 69 | } 70 | ) by (reason) 71 | ) 72 | ||| % defaultFilters, 73 | 74 | nodesDisrupted: ||| 75 | round( 76 | sum( 77 | increase( 78 | karpenter_nodeclaims_disrupted_total{ 79 | %(base)s, 80 | %(nodepool)s 81 | }[$__rate_interval] 82 | ) 83 | ) by (nodepool, capacity_type, reason) 84 | ) 85 | ||| % defaultFilters, 86 | 87 | // Pod Activity 88 | podStateByPhase: ||| 89 | round( 90 | sum( 91 | karpenter_pods_state{ 92 | %(base)s 93 | } 94 | ) by (phase) 95 | ) 96 | 
||| % defaultFilters, 97 | 98 | podsStartupP50Duration: ||| 99 | max( 100 | karpenter_pods_startup_duration_seconds{ 101 | %(base)s, 102 | quantile="0.5" 103 | } 104 | ) 105 | ||| % defaultFilters, 106 | 107 | podsStartupP95Duration: ||| 108 | max( 109 | karpenter_pods_startup_duration_seconds{ 110 | %(base)s, 111 | quantile="0.95" 112 | } 113 | ) 114 | ||| % defaultFilters, 115 | 116 | podsStartupP99Duration: ||| 117 | max( 118 | karpenter_pods_startup_duration_seconds{ 119 | %(base)s, 120 | quantile="0.99" 121 | } 122 | ) 123 | ||| % defaultFilters, 124 | }; 125 | 126 | local panels = { 127 | // Node Activity 128 | nodesCreatedByNodePoolTimeSeries: 129 | mixinUtils.dashboards.timeSeriesPanel( 130 | 'Nodes Created by Node Pool', 131 | 'short', 132 | queries.nodesCreatedByNodePool, 133 | '{{ nodepool }}', 134 | description='The number of nodes created by node pool.', 135 | stack='normal' 136 | ), 137 | 138 | nodesTerminatedByNodePoolTimeSeries: 139 | mixinUtils.dashboards.timeSeriesPanel( 140 | 'Nodes Terminated by Node Pool', 141 | 'short', 142 | queries.nodesTerminatedByNodePool, 143 | '{{ nodepool }}', 144 | description='The number of nodes terminated by node pool.', 145 | stack='normal' 146 | ), 147 | 148 | nodesVoluntaryDisruptionDecisionsTimeSeries: 149 | mixinUtils.dashboards.timeSeriesPanel( 150 | 'Node Disruption Decisions by Reason and Decision', 151 | 'short', 152 | queries.nodesVoluntaryDisruptionDecisions, 153 | '{{ decision }} - {{ reason }}', 154 | description='The number of voluntary disruption decisions by reason and decision.', 155 | stack='normal' 156 | ), 157 | 158 | nodesVoluntaryDisruptionEligibleTimeSeries: 159 | mixinUtils.dashboards.timeSeriesPanel( 160 | 'Nodes Eligible for Disruption by Reason', 161 | 'short', 162 | queries.nodesVoluntaryDisruptionEligible, 163 | '{{ reason }}', 164 | description='The number of nodes eligible for voluntary disruption by reason.', 165 | stack='normal' 166 | ), 167 | 168 | nodesDisruptedTimeSeries: 169 | 
mixinUtils.dashboards.timeSeriesPanel( 170 | 'Nodes Disrupted by Node Pool', 171 | 'short', 172 | queries.nodesDisrupted, 173 | '{{ nodepool }} - {{ capacity_type }} - {{ reason }}', 174 | description='The number of nodes disrupted by node pool, capacity type, and reason.', 175 | stack='normal' 176 | ), 177 | 178 | // Pod Activity 179 | podStateByPhaseTimeSeries: 180 | mixinUtils.dashboards.timeSeriesPanel( 181 | 'Pods by Phase', 182 | 'short', 183 | queries.podStateByPhase, 184 | '{{ phase }}', 185 | description='The number of pods by phase.', 186 | stack='normal' 187 | ), 188 | 189 | podStartupDurationTimeSeries: 190 | mixinUtils.dashboards.timeSeriesPanel( 191 | 'Pods Startup Duration', 192 | 's', 193 | [ 194 | { 195 | expr: queries.podsStartupP50Duration, 196 | legend: 'P50', 197 | }, 198 | { 199 | expr: queries.podsStartupP95Duration, 200 | legend: 'P95', 201 | }, 202 | { 203 | expr: queries.podsStartupP99Duration, 204 | legend: 'P99', 205 | }, 206 | ], 207 | description='The duration for pods to start up.', 208 | fillOpacity=0 209 | ), 210 | }; 211 | 212 | local rows = 213 | [ 214 | row.new('Node Pool Activity') + 215 | row.gridPos.withX(0) + 216 | row.gridPos.withY(0) + 217 | row.gridPos.withW(24) + 218 | row.gridPos.withH(1), 219 | ] + 220 | grid.makeGrid( 221 | [ 222 | panels.nodesCreatedByNodePoolTimeSeries, 223 | panels.nodesTerminatedByNodePoolTimeSeries, 224 | ], 225 | panelWidth=12, 226 | panelHeight=6, 227 | startY=1 228 | ) + 229 | grid.makeGrid( 230 | [ 231 | panels.nodesVoluntaryDisruptionDecisionsTimeSeries, 232 | panels.nodesVoluntaryDisruptionEligibleTimeSeries, 233 | ], 234 | panelWidth=12, 235 | panelHeight=6, 236 | startY=7 237 | ) + 238 | grid.makeGrid( 239 | [ 240 | panels.nodesDisruptedTimeSeries, 241 | ], 242 | panelWidth=24, 243 | panelHeight=6, 244 | startY=13 245 | ) + 246 | [ 247 | row.new('Pod Activity') + 248 | row.gridPos.withX(0) + 249 | row.gridPos.withY(19) + 250 | row.gridPos.withW(24) + 251 | row.gridPos.withH(1), 252 | ] + 
253 | grid.makeGrid( 254 | [ 255 | panels.podStateByPhaseTimeSeries, 256 | panels.podStartupDurationTimeSeries, 257 | ], 258 | panelWidth=12, 259 | panelHeight=6, 260 | startY=20 261 | ); 262 | 263 | mixinUtils.dashboards.bypassDashboardValidation + 264 | dashboard.new( 265 | 'Kubernetes / Autoscaling / Karpenter / Activity', 266 | ) + 267 | dashboard.withDescription('A dashboard that monitors Karpenter and focuses on Karpenter deletion/creation activity. %s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 268 | dashboard.withUid($._config.karpenterActivityDashboardUid) + 269 | dashboard.withTags($._config.tags + ['karpenter']) + 270 | dashboard.withTimezone('utc') + 271 | dashboard.withEditable(true) + 272 | dashboard.time.withFrom('now-24h') + 273 | dashboard.time.withTo('now') + 274 | dashboard.withVariables(variables) + 275 | dashboard.withLinks( 276 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 277 | ) + 278 | dashboard.withPanels( 279 | rows 280 | ) + 281 | dashboard.withAnnotations( 282 | mixinUtils.dashboards.annotations($._config, defaultFilters) 283 | ), 284 | }, 285 | } 286 | -------------------------------------------------------------------------------- /tests/tests.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - ../prometheus_alerts.yaml 4 | 5 | tests: 6 | # Karpenter 7 | - interval: 1m 8 | input_series: 9 | - series: 'karpenter_cloudprovider_errors_total{namespace="karpenter", job="karpenter", provider="aws", controller="nodeclaim.disruption", method="Get"}' 10 | values: "1+1x20" 11 | alert_rule_test: 12 | - eval_time: 20m 13 | alertname: KarpenterCloudProviderErrors 14 | exp_alerts: 15 | - exp_labels: 16 | namespace: karpenter 17 | job: karpenter 18 | provider: aws 19 | controller: nodeclaim.disruption 20 | method: Get 21 | severity: 
warning 22 | exp_annotations: 23 | summary: "Karpenter has Cloud Provider Errors." 24 | description: "The Karpenter provider aws with the controller nodeclaim.disruption has errors with the method Get." 25 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kperf-jkwq/kubernetes-autoscaling-karpenter-performance" 26 | - interval: 1m 27 | input_series: 28 | - series: 'karpenter_nodepools_usage{namespace="karpenter", job="karpenter", nodepool="nodepool-a", resource_type="cpu"}' 29 | values: "80x15" 30 | - series: 'karpenter_nodepools_limit{namespace="karpenter", job="karpenter", nodepool="nodepool-a", resource_type="cpu"}' 31 | values: "100x15" 32 | alert_rule_test: 33 | - eval_time: 15m 34 | alertname: KarpenterNodepoolNearCapacity 35 | exp_alerts: 36 | - exp_labels: 37 | namespace: karpenter 38 | job: karpenter 39 | nodepool: nodepool-a 40 | resource_type: cpu 41 | severity: warning 42 | exp_annotations: 43 | summary: "Karpenter Nodepool near capacity." 44 | description: "The resource cpu in the Karpenter node pool nodepool-a is nearing its limit. Consider scaling or adding resources." 
45 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kover-jkwq/kubernetes-autoscaling-karpenter-overview" 46 | - interval: 1m 47 | input_series: 48 | - series: karpenter_nodeclaims_termination_duration_seconds_sum{namespace="karpenter", job="karpenter", nodepool="nodepool-a"} 49 | values: "0+2400x20" 50 | - series: karpenter_nodeclaims_termination_duration_seconds_count{namespace="karpenter", job="karpenter", nodepool="nodepool-a"} 51 | values: "0+1x20" 52 | - series: karpenter_nodeclaims_termination_duration_seconds_sum{namespace="karpenter", job="karpenter", nodepool="nodepool-b"} 53 | values: "0+60x20" 54 | - series: karpenter_nodeclaims_termination_duration_seconds_count{namespace="karpenter", job="karpenter", nodepool="nodepool-b"} 55 | values: "0+1x20" 56 | alert_rule_test: 57 | - eval_time: 20m 58 | alertname: KarpenterNodeClaimsTerminationDurationHigh 59 | exp_alerts: 60 | - exp_labels: 61 | namespace: karpenter 62 | job: karpenter 63 | nodepool: nodepool-a 64 | severity: warning 65 | exp_annotations: 66 | summary: "Karpenter Node Claims Termination Duration is High." 67 | description: "The average node claim termination duration in Karpenter has exceeded 20 minutes for more than 15 minutes in nodepool nodepool-a. This may indicate cloud provider issues or improper instance termination handling." 
68 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kact-jkwq/kubernetes-autoscaling-karpenter-activity" 69 | # Cluster Autoscaler 70 | - interval: 1m 71 | input_series: 72 | - series: 'cluster_autoscaler_nodes_count{namespace="autoscaler", job="cluster-autoscaler"}' 73 | values: "95x15" 74 | - series: 'cluster_autoscaler_max_nodes_count{namespace="autoscaler", job="cluster-autoscaler"}' 75 | values: "100x15" 76 | alert_rule_test: 77 | - eval_time: 15m 78 | alertname: ClusterAutoscalerNodeCountNearCapacity 79 | exp_alerts: 80 | - exp_labels: 81 | namespace: autoscaler 82 | job: cluster-autoscaler 83 | severity: warning 84 | exp_annotations: 85 | summary: "Cluster Autoscaler Node Count near Capacity." 86 | description: "The node count for the cluster autoscaler job cluster-autoscaler is reaching max limit. Consider scaling node groups." 87 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler" 88 | - interval: 1m 89 | input_series: 90 | - series: 'cluster_autoscaler_unschedulable_pods_count{namespace="autoscaler", job="cluster-autoscaler"}' 91 | values: "1x15" 92 | alert_rule_test: 93 | - eval_time: 15m 94 | alertname: ClusterAutoscalerUnschedulablePods 95 | exp_alerts: 96 | - exp_labels: 97 | namespace: autoscaler 98 | job: cluster-autoscaler 99 | severity: warning 100 | exp_annotations: 101 | summary: "Pods Pending Scheduling - Cluster Node Group Scaling Required" 102 | description: "The cluster currently has unschedulable pods, indicating resource shortages. Consider adding more nodes or increasing node group capacity." 
103 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-ca-jkwq/kubernetes-autoscaling-cluster-autoscaler" 104 | # KEDA 105 | - interval: 1m 106 | input_series: 107 | - series: 'keda_scaled_job_errors_total{job="keda-operator", exported_namespace="test", scaledObject="test"}' 108 | values: "0+10x15" 109 | alert_rule_test: 110 | - eval_time: 15m 111 | alertname: KedaScaledJobErrors 112 | exp_alerts: 113 | - exp_labels: 114 | job: keda-operator 115 | exported_namespace: test 116 | scaledObject: test 117 | severity: warning 118 | exp_annotations: 119 | summary: "Errors detected for KEDA scaled jobs." 120 | description: "KEDA scaled jobs are experiencing errors. Check the scaled job test in the namespace test." 121 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kedasj-jkwq/kubernetes-autoscaling-keda-scaled-job?var-scaled_job=test&var-resource_namespace=test" 122 | 123 | - interval: 1m 124 | input_series: 125 | - series: 'keda_scaled_object_errors_total{job="keda-operator", exported_namespace="test", scaledObject="test"}' 126 | values: "0+10x15" 127 | alert_rule_test: 128 | - eval_time: 15m 129 | alertname: KedaScaledObjectErrors 130 | exp_alerts: 131 | - exp_labels: 132 | job: keda-operator 133 | exported_namespace: test 134 | scaledObject: test 135 | severity: warning 136 | exp_annotations: 137 | summary: "Errors detected for KEDA scaled objects." 138 | description: "KEDA scaled objects are experiencing errors. Check the scaled object test in the namespace test." 
139 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object=test&var-resource_namespace=test" 140 | 141 | - interval: 1m 142 | input_series: 143 | - series: 'keda_scaler_metrics_latency_seconds{namespace="keda", job="keda-operator", exported_namespace="test", scaler="prometheus", scaledObject="test"}' 144 | values: "10x10" 145 | alert_rule_test: 146 | - eval_time: 10m 147 | alertname: KedaScalerLatencyHigh 148 | exp_alerts: 149 | - exp_labels: 150 | job: keda-operator 151 | exported_namespace: test 152 | scaler: prometheus 153 | scaledObject: test 154 | severity: warning 155 | exp_annotations: 156 | summary: "High latency for KEDA scaler metrics." 157 | description: "Metric latency for scaler prometheus for the object test has exceeded acceptable limits." 158 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object=test&var-scaler=prometheus" 159 | 160 | - interval: 1m 161 | input_series: 162 | - series: 'keda_scaled_object_paused{namespace="keda", job="keda-operator", exported_namespace="test", scaledObject="test"}' 163 | values: "1x1500" 164 | alert_rule_test: 165 | - eval_time: 25h 166 | alertname: KedaScaledObjectPaused 167 | exp_alerts: 168 | - exp_labels: 169 | job: keda-operator 170 | exported_namespace: test 171 | scaledObject: test 172 | severity: warning 173 | exp_annotations: 174 | summary: "KEDA scaled object is paused." 175 | description: "The scaled object test in namespace test is paused for longer than 25h. This may indicate a configuration issue or manual intervention." 
176 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaled_object=test&var-resource_namespace=test" 177 | 178 | - interval: 1m 179 | input_series: 180 | - series: 'keda_scaler_detail_errors_total{metric="s0-prometheus", namespace="keda", exported_namespace="test", job="keda-operator", scaledObject="test", scaler="prometheusScaler",triggerIndex="0",type="scaledjob"}' 181 | values: "0+10x15" 182 | alert_rule_test: 183 | - eval_time: 15m 184 | alertname: KedaScalerDetailErrors 185 | exp_alerts: 186 | - exp_labels: 187 | exported_namespace: test 188 | job: keda-operator 189 | scaledObject: test 190 | scaler: prometheusScaler 191 | type: scaledjob 192 | severity: warning 193 | exp_annotations: 194 | summary: "Errors detected in KEDA scaler." 195 | description: "Errors have occurred in the KEDA scaler prometheusScaler. Investigate the scaler for the scaledjob test in namespace test." 196 | dashboard_url: "https://grafana.com/d/kubernetes-autoscaling-mixin-kedaso-jkwq/kubernetes-autoscaling-keda-scaled-object?var-scaler=prometheusScaler&var-scaled_object=test" 197 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Prometheus Monitoring Mixin for Kubernetes Autoscaling 2 | 3 | A set of Grafana dashboards and Prometheus alerts for Kubernetes Autoscaling using the metrics from Kube-state-metrics, Karpenter, and Cluster-autoscaler. 4 | 5 | This serves as a extension for the [Kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin) and adds monitoring for components that aren't deployed by default in a Kubernetes cluster (VPA, Karpenter, Cluster-Autoscaler). 
6 | 7 | ## Dashboards 8 | 9 | The mixin provides the following dashboards: 10 | 11 | - Kubernetes Autoscaling 12 | - Pod Disruption Budgets 13 | - Horizontal Pod Autoscalers 14 | - Vertical Pod Autoscalers 15 | - Cluster Autoscaler 16 | - Karpenter 17 | - Overview 18 | - Activity 19 | - Performance 20 | - KEDA 21 | - Scaled Objects 22 | - Scaled Jobs 23 | 24 | Generated dashboards also exist in the `./dashboards_out` directory. 25 | 26 | Alerts are created for the following components currently: 27 | 28 | - Karpenter 29 | - Keda 30 | - Cluster Autoscaler 31 | 32 | VPA, Karpenter, Keda, and Cluster Autoscaler are configurable in the `config.libsonnet` file. They can be turned off by setting the `enabled` field to `false`. 33 | 34 | ## How to use 35 | 36 | This mixin is designed to be vendored into the repo with your infrastructure config. To do this, use [jsonnet-bundler](https://github.com/jsonnet-bundler/jsonnet-bundler): 37 | 38 | You then have three options for deploying your dashboards 39 | 40 | 1. Generate the config files and deploy them yourself 41 | 2. Use jsonnet to deploy this mixin along with Prometheus and Grafana 42 | 3. Use prometheus-operator to deploy this mixin 43 | 44 | Or import the dashboard using json in `./dashboards_out`, alternatively import them from the `Grafana.com` dashboard page. 
45 | 46 | ## Generate config files 47 | 48 | You can manually generate the alerts, dashboards, and rules files, but first you must install some tools: 49 | 50 | ```sh 51 | go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb 52 | brew install jsonnet 53 | ``` 54 | 55 | Then, grab the mixin and its dependencies: 56 | 57 | ```sh 58 | git clone https://github.com/adinhodovic/kubernetes-autoscaling-mixin 59 | cd kubernetes-autoscaling-mixin 60 | jb install 61 | ``` 62 | 63 | Finally, build the mixin: 64 | 65 | ```sh 66 | make prometheus_alerts.yaml 67 | make dashboards_out 68 | ``` 69 | 70 | The `prometheus_alerts.yaml` file then needs to be passed to your Prometheus server, and the files in `dashboards_out` need to be imported into your Grafana server. The exact details depend on how you deploy your monitoring stack. 71 | 72 | ### Configuration 73 | 74 | This mixin has its configuration in the `config.libsonnet` file. You can turn off the alerts for VPA, Karpenter, KEDA, and Cluster Autoscaler by setting the `enabled` field to `false`. 75 | 76 | ```jsonnet 77 | { 78 | _config+:: { 79 | vpa+: { 80 | enabled: false, 81 | }, 82 | keda+: { 83 | enabled: false, 84 | }, 85 | karpenter+: { 86 | enabled: false, 87 | }, 88 | clusterAutoscaler+: { 89 | enabled: false, 90 | }, 91 | }, 92 | } 93 | ``` 94 | 95 | The mixin has all components enabled by default and all the dashboards are generated in the `dashboards_out` directory. You can import them into Grafana. 96 | 97 | ### VPA Requirements 98 | 99 | Kube-state-metrics doesn't ship with VPA metrics by default. 
You need to deploy a custom kube-state-metrics with the following configuration: 100 | 101 | Adjust the `ClusterRole` `kube-state-metrics` to include the following rules: 102 | 103 | ```yaml 104 | apiVersion: rbac.authorization.k8s.io/v1 105 | kind: ClusterRole 106 | metadata: 107 | labels: 108 | app.kubernetes.io/component: exporter 109 | app.kubernetes.io/name: kube-state-metrics 110 | app.kubernetes.io/part-of: kube-prometheus 111 | name: kube-state-metrics 112 | rules: 113 | # ... other rules 114 | - apiGroups: 115 | - autoscaling.k8s.io 116 | resources: 117 | - verticalpodautoscalers 118 | verbs: 119 | - list 120 | - watch 121 | - apiGroups: 122 | - apiextensions.k8s.io 123 | resources: 124 | - customresourcedefinitions 125 | verbs: 126 | - list 127 | - watch 128 | ``` 129 | 130 | Adjust the `Deployment` `kube-state-metrics` to include the following extra arguments: 131 | 132 | ```yaml 133 | kind: Deployment 134 | metadata: 135 | labels: 136 | app.kubernetes.io/name: kube-state-metrics 137 | app.kubernetes.io/part-of: kube-prometheus 138 | app.kubernetes.io/version: 2.13.0 139 | name: kube-state-metrics 140 | namespace: monitoring 141 | spec: 142 | ... 143 | containers: 144 | - args: 145 | ... 146 | - --custom-resource-state-config 147 | - | 148 | kind: CustomResourceStateMetrics 149 | spec: 150 | resources: 151 | - groupVersionKind: 152 | group: autoscaling.k8s.io 153 | kind: "VerticalPodAutoscaler" 154 | version: "v1" 155 | labelsFromPath: 156 | verticalpodautoscaler: [metadata, name] 157 | namespace: [metadata, namespace] 158 | target_api_version: [spec, targetRef, apiVersion] 159 | target_kind: [spec, targetRef, kind] 160 | target_name: [spec, targetRef, name] 161 | metrics: 162 | # Labels 163 | - name: "verticalpodautoscaler_labels" 164 | help: "VPA container recommendations. 
Kubernetes labels converted to Prometheus labels" 165 | each: 166 | type: Info 167 | info: 168 | labelsFromPath: 169 | name: [metadata, name] 170 | # Memory Information 171 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_target" 172 | help: "VPA container recommendations for memory. Target resources the VerticalPodAutoscaler recommends for the container." 173 | each: 174 | type: Gauge 175 | gauge: 176 | path: [status, recommendation, containerRecommendations] 177 | valueFrom: [target, memory] 178 | labelsFromPath: 179 | container: [containerName] 180 | commonLabels: 181 | resource: "memory" 182 | unit: "byte" 183 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound" 184 | help: "VPA container recommendations for memory. Minimum resources the container can use before the VerticalPodAutoscaler updater evicts it" 185 | each: 186 | type: Gauge 187 | gauge: 188 | path: [status, recommendation, containerRecommendations] 189 | valueFrom: [lowerBound, memory] 190 | labelsFromPath: 191 | container: [containerName] 192 | commonLabels: 193 | resource: "memory" 194 | unit: "byte" 195 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound" 196 | help: "VPA container recommendations for memory. Maximum resources the container can use before the VerticalPodAutoscaler updater evicts it" 197 | each: 198 | type: Gauge 199 | gauge: 200 | path: [status, recommendation, containerRecommendations] 201 | valueFrom: [upperBound, memory] 202 | labelsFromPath: 203 | container: [containerName] 204 | commonLabels: 205 | resource: "memory" 206 | unit: "byte" 207 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_uncappedtarget" 208 | help: "VPA container recommendations for memory. 
Target resources the VerticalPodAutoscaler recommends for the container ignoring bounds" 209 | each: 210 | type: Gauge 211 | gauge: 212 | path: [status, recommendation, containerRecommendations] 213 | valueFrom: [uncappedTarget, memory] 214 | labelsFromPath: 215 | container: [containerName] 216 | commonLabels: 217 | resource: "memory" 218 | unit: "byte" 219 | # CPU Information 220 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_target" 221 | help: "VPA container recommendations for cpu. Target resources the VerticalPodAutoscaler recommends for the container." 222 | each: 223 | type: Gauge 224 | gauge: 225 | path: [status, recommendation, containerRecommendations] 226 | valueFrom: [target, cpu] 227 | labelsFromPath: 228 | container: [containerName] 229 | commonLabels: 230 | resource: "cpu" 231 | unit: "core" 232 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_lowerbound" 233 | help: "VPA container recommendations for cpu. Minimum resources the container can use before the VerticalPodAutoscaler updater evicts it" 234 | each: 235 | type: Gauge 236 | gauge: 237 | path: [status, recommendation, containerRecommendations] 238 | valueFrom: [lowerBound, cpu] 239 | labelsFromPath: 240 | container: [containerName] 241 | commonLabels: 242 | resource: "cpu" 243 | unit: "core" 244 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_upperbound" 245 | help: "VPA container recommendations for cpu. Maximum resources the container can use before the VerticalPodAutoscaler updater evicts it" 246 | each: 247 | type: Gauge 248 | gauge: 249 | path: [status, recommendation, containerRecommendations] 250 | valueFrom: [upperBound, cpu] 251 | labelsFromPath: 252 | container: [containerName] 253 | commonLabels: 254 | resource: "cpu" 255 | unit: "core" 256 | - name: "verticalpodautoscaler_status_recommendation_containerrecommendations_uncappedtarget" 257 | help: "VPA container recommendations for cpu. 
Target resources the VerticalPodAutoscaler recommends for the container ignoring bounds" 258 | each: 259 | type: Gauge 260 | gauge: 261 | path: [status, recommendation, containerRecommendations] 262 | valueFrom: [uncappedTarget, cpu] 263 | labelsFromPath: 264 | container: [containerName] 265 | commonLabels: 266 | resource: "cpu" 267 | unit: "core" 268 | ``` 269 | 270 | ## Alerts 271 | 272 | The mixin follows the [monitoring-mixins guidelines](https://github.com/monitoring-mixins/docs#guidelines-for-alert-names-labels-and-annotations) for alerts. 273 | -------------------------------------------------------------------------------- /alerts/alerts.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | local clusterVariableQueryString = if $._config.showMultiCluster then '&var-%(clusterLabel)s={{ $labels.%(clusterLabel)s }}' % $._config else '', 3 | local clusterLabel = { clusterLabel: $._config.clusterLabel }, 4 | prometheusAlerts+:: { 5 | groups+: std.prune([ 6 | if $._config.karpenter.enabled then { 7 | local karpenterConfig = $._config.karpenter + clusterLabel, 8 | name: 'karpenter', 9 | rules: [ 10 | { 11 | alert: 'KarpenterCloudProviderErrors', 12 | expr: ||| 13 | sum( 14 | increase( 15 | karpenter_cloudprovider_errors_total{ 16 | %(karpenterSelector)s, 17 | controller!~"nodeclaim.termination|node.termination", 18 | error!="NodeClaimNotFoundError" 19 | }[5m] 20 | ) 21 | ) by (%(clusterLabel)s, namespace, job, provider, controller, method) > 0 22 | ||| % karpenterConfig, 23 | labels: { 24 | severity: 'warning', 25 | }, 26 | 'for': '5m', 27 | annotations: { 28 | summary: 'Karpenter has Cloud Provider Errors.', 29 | description: 'The Karpenter provider {{ $labels.provider }} with the controller {{ $labels.controller }} has errors with the method {{ $labels.method }}.', 30 | dashboard_url: $._config.karpenter.karpenterPerformanceDashboardUrl + clusterVariableQueryString, 31 | }, 32 | }, 33 | { 34 | alert: 
'KarpenterNodeClaimsTerminationDurationHigh', 35 | expr: ||| 36 | sum( 37 | rate( 38 | karpenter_nodeclaims_termination_duration_seconds_sum{ 39 | %(karpenterSelector)s 40 | }[5m] 41 | ) 42 | ) by (%(clusterLabel)s, namespace, job, nodepool) 43 | / 44 | sum( 45 | rate( 46 | karpenter_nodeclaims_termination_duration_seconds_count{ 47 | %(karpenterSelector)s 48 | }[5m] 49 | ) 50 | ) by (%(clusterLabel)s, namespace, job, nodepool) > %(nodeclaimTerminationThreshold)s 51 | ||| % karpenterConfig, 52 | labels: { 53 | severity: 'warning', 54 | }, 55 | 'for': '15m', 56 | annotations: { 57 | summary: 'Karpenter Node Claims Termination Duration is High.', 58 | description: 'The average node claim termination duration in Karpenter has exceeded %s minutes for more than 15 minutes in nodepool {{ $labels.nodepool }}. This may indicate cloud provider issues or improper instance termination handling.' % std.toString($._config.karpenter.nodeclaimTerminationThreshold / 60), 59 | dashboard_url: $._config.karpenter.karpenterActivityDashboardUrl + clusterVariableQueryString, 60 | }, 61 | }, 62 | { 63 | alert: 'KarpenterNodepoolNearCapacity', 64 | annotations: { 65 | summary: 'Karpenter Nodepool near capacity.', 66 | description: 'The resource {{ $labels.resource_type }} in the Karpenter node pool {{ $labels.nodepool }} is nearing its limit. 
Consider scaling or adding resources.', 67 | dashboard_url: $._config.karpenter.karpenterOverviewDashboardUrl + clusterVariableQueryString, 68 | }, 69 | expr: ||| 70 | sum ( 71 | karpenter_nodepools_usage{%(karpenterSelector)s} 72 | ) by (%(clusterLabel)s, namespace, job, nodepool, resource_type) 73 | / 74 | sum ( 75 | karpenter_nodepools_limit{%(karpenterSelector)s} 76 | ) by (%(clusterLabel)s, namespace, job, nodepool, resource_type) 77 | * 100 > %(nodepoolCapacityThreshold)s 78 | ||| % karpenterConfig, 79 | 'for': '15m', 80 | labels: { 81 | severity: 'warning', 82 | }, 83 | }, 84 | ], 85 | }, 86 | if $._config.clusterAutoscaler.enabled then { 87 | local clusterAutoscalerConfig = $._config.clusterAutoscaler + clusterLabel, 88 | name: 'cluster-autoscaler', 89 | rules: [ 90 | { 91 | alert: 'ClusterAutoscalerNodeCountNearCapacity', 92 | annotations: { 93 | summary: 'Cluster Autoscaler Node Count near Capacity.', 94 | description: 'The node count for the cluster autoscaler job {{ $labels.job }} is reaching max limit. Consider scaling node groups.', 95 | dashboard_url: $._config.clusterAutoscaler.clusterAutoscalerDashboardUrl + clusterVariableQueryString, 96 | }, 97 | expr: ||| 98 | sum ( 99 | cluster_autoscaler_nodes_count{ 100 | %(clusterAutoscalerSelector)s 101 | } 102 | ) by (%(clusterLabel)s, namespace, job) 103 | / 104 | sum ( 105 | cluster_autoscaler_max_nodes_count{ 106 | %(clusterAutoscalerSelector)s 107 | } 108 | ) by (%(clusterLabel)s, namespace, job) 109 | * 100 > %(nodeCountCapacityThreshold)s 110 | ||| % clusterAutoscalerConfig, 111 | 'for': '15m', 112 | labels: { 113 | severity: 'warning', 114 | }, 115 | }, 116 | { 117 | alert: 'ClusterAutoscalerUnschedulablePods', 118 | annotations: { 119 | summary: 'Pods Pending Scheduling - Cluster Node Group Scaling Required', 120 | description: 'The cluster currently has unschedulable pods, indicating resource shortages. 
Consider adding more nodes or increasing node group capacity.', 121 | dashboard_url: $._config.clusterAutoscaler.clusterAutoscalerDashboardUrl + clusterVariableQueryString, 122 | }, 123 | expr: ||| 124 | sum ( 125 | cluster_autoscaler_unschedulable_pods_count{ 126 | %(clusterAutoscalerSelector)s 127 | } 128 | ) by (%(clusterLabel)s, namespace, job) 129 | > 0 130 | ||| % clusterAutoscalerConfig, 131 | 'for': '15m', 132 | labels: { 133 | severity: 'warning', 134 | }, 135 | }, 136 | ], 137 | }, 138 | if $._config.keda.enabled then { 139 | local kedaConfig = $._config.keda + clusterLabel, 140 | name: 'keda', 141 | rules: [ 142 | { 143 | alert: 'KedaScaledJobErrors', 144 | annotations: { 145 | summary: 'Errors detected for KEDA scaled jobs.', 146 | description: 'KEDA scaled jobs are experiencing errors. Check the scaled job {{ $labels.scaledObject }} in the namespace {{ $labels.exported_namespace }}.', 147 | dashboard_url: $._config.keda.kedaScaledJobDashboardUrl + '?var-scaled_job={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}' + clusterVariableQueryString, 148 | }, 149 | expr: ||| 150 | sum( 151 | increase( 152 | keda_scaled_job_errors_total{ 153 | %(kedaSelector)s 154 | }[10m] 155 | ) 156 | ) by (%(clusterLabel)s, job, exported_namespace, scaledObject) > 0 157 | ||| % kedaConfig, 158 | 'for': '1m', 159 | labels: { 160 | severity: 'warning', 161 | }, 162 | }, 163 | { 164 | alert: 'KedaScaledObjectErrors', 165 | annotations: { 166 | summary: 'Errors detected for KEDA scaled objects.', 167 | description: 'KEDA scaled objects are experiencing errors. 
Check the scaled object {{ $labels.scaledObject }} in the namespace {{ $labels.exported_namespace }}.', 168 | dashboard_url: $._config.keda.kedaScaledObjectDashboardUrl + '?var-scaled_object={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}' + clusterVariableQueryString, 169 | }, 170 | expr: ||| 171 | sum( 172 | increase( 173 | keda_scaled_object_errors_total{ 174 | %(kedaSelector)s 175 | }[10m] 176 | ) 177 | ) by (%(clusterLabel)s, job, exported_namespace, scaledObject) > 0 178 | ||| % kedaConfig, 179 | 'for': '1m', 180 | labels: { 181 | severity: 'warning', 182 | }, 183 | }, 184 | { 185 | alert: 'KedaScalerLatencyHigh', 186 | annotations: { 187 | summary: 'High latency for KEDA scaler metrics.', 188 | description: 'Metric latency for scaler {{ $labels.scaler }} for the object {{ $labels.scaledObject }} has exceeded acceptable limits.', 189 | dashboard_url: $._config.keda.kedaScaledObjectDashboardUrl + '?var-scaled_object={{ $labels.scaledObject }}&var-scaler={{ $labels.scaler }}' + clusterVariableQueryString, 190 | }, 191 | expr: ||| 192 | avg( 193 | keda_scaler_metrics_latency_seconds{ 194 | %(kedaSelector)s 195 | } 196 | ) by (%(clusterLabel)s, job, exported_namespace, scaledObject, scaler) > %(scalerMetricsLatencyThreshold)s 197 | ||| % kedaConfig, 198 | 'for': '10m', 199 | labels: { 200 | severity: 'warning', 201 | }, 202 | }, 203 | { 204 | alert: 'KedaScaledObjectPaused', 205 | annotations: { 206 | summary: 'KEDA scaled object is paused.', 207 | description: 'The scaled object {{ $labels.scaledObject }} in namespace {{ $labels.exported_namespace }} is paused for longer than %(scaledObjectPausedThreshold)s. This may indicate a configuration issue or manual intervention.' 
% kedaConfig, 208 | dashboard_url: $._config.keda.kedaScaledObjectDashboardUrl + '?var-scaled_object={{ $labels.scaledObject }}&var-resource_namespace={{ $labels.exported_namespace }}' + clusterVariableQueryString, 209 | }, 210 | expr: ||| 211 | max( 212 | keda_scaled_object_paused{ 213 | %(kedaSelector)s 214 | } 215 | ) by (%(clusterLabel)s, job, exported_namespace, scaledObject) > 0 216 | ||| % kedaConfig, 217 | 'for': kedaConfig.scaledObjectPausedThreshold, 218 | labels: { 219 | severity: 'warning', 220 | }, 221 | }, 222 | { 223 | alert: 'KedaScalerDetailErrors', 224 | annotations: { 225 | summary: 'Errors detected in KEDA scaler.', 226 | description: 'Errors have occurred in the KEDA scaler {{ $labels.scaler }}. Investigate the scaler for the {{ $labels.type }} {{ $labels.scaledObject }} in namespace {{ $labels.exported_namespace }}.', 227 | dashboard_url: $._config.keda.kedaScaledObjectDashboardUrl + '?var-scaler={{ $labels.scaler }}&var-scaled_object={{ $labels.scaledObject }}' + clusterVariableQueryString, 228 | }, 229 | expr: ||| 230 | sum( 231 | increase( 232 | keda_scaler_detail_errors_total{ 233 | %(kedaSelector)s 234 | }[10m] 235 | ) 236 | ) by (%(clusterLabel)s, job, exported_namespace, scaledObject, type, scaler) > 0 237 | ||| % kedaConfig, 238 | 'for': '1m', 239 | labels: { 240 | severity: 'warning', 241 | }, 242 | }, 243 | ], 244 | }, 245 | ]), 246 | }, 247 | } 248 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 
11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /dashboards/keda/keda-scaled-job.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | local tablePanel = g.panel.table; 10 | 11 | // Table 12 | local tbQueryOptions = tablePanel.queryOptions; 13 | local tbPanelOptions = tablePanel.panelOptions; 14 | 15 | { 16 | grafanaDashboards+:: { 17 | 'kubernetes-autoscaling-mixin-keda-sj.json': 18 | if !$._config.keda.enabled then {} else 19 | 20 | local defaultVariables = util.variables($._config); 21 | 22 | local variables = [ 23 | defaultVariables.datasource, 24 | defaultVariables.cluster, 25 | defaultVariables.scaledJobJob, 26 | defaultVariables.scaledJobOperatorNamespace, 27 | defaultVariables.scaledJobResourceNamespace, 28 | 
defaultVariables.scaledJob, 29 | defaultVariables.scalerForScaledJob, 30 | defaultVariables.metricForScaledJob, 31 | ]; 32 | 33 | local defaultFilters = util.filters($._config); 34 | 35 | local queries = { 36 | resourcesRegisteredByNamespace: ||| 37 | sum( 38 | keda_resource_registered_total{ 39 | %(base)s, 40 | type="scaled_job" 41 | } 42 | ) by (exported_namespace, type) 43 | ||| % defaultFilters, 44 | 45 | triggersByType: ||| 46 | sum( 47 | keda_trigger_registered_total{ 48 | %(base)s 49 | } 50 | ) by (type) 51 | ||| % defaultFilters, 52 | 53 | scaledJobsErrors: ||| 54 | sum( 55 | increase( 56 | keda_scaled_job_errors_total{ 57 | %(withResourceNamespace)s 58 | }[$__rate_interval] 59 | ) 60 | ) by (exported_namespace, scaledJob) 61 | ||| % defaultFilters, 62 | 63 | scalerDetailErrors: ||| 64 | sum( 65 | increase( 66 | keda_scaler_detail_errors_total{ 67 | %(withResourceNamespace)s, 68 | type="scaledjob" 69 | }[$__rate_interval] 70 | ) 71 | ) by (exported_namespace, scaledObject, scaler) 72 | ||| % defaultFilters, 73 | 74 | scaleTargetValues: ||| 75 | sum( 76 | keda_scaler_metrics_value{ 77 | %(withResourceNamespace)s, 78 | type="scaledjob" 79 | } 80 | ) by (job, exported_namespace, scaledObject, scaler, metric) 81 | ||| % defaultFilters, 82 | 83 | scaledJobActive: ||| 84 | sum( 85 | keda_scaler_active{ 86 | %(withScaledJob)s 87 | } 88 | ) by (exported_namespace, scaledObject) 89 | ||| % defaultFilters, 90 | 91 | scaledJobDetailError: ||| 92 | sum( 93 | increase( 94 | keda_scaler_detail_errors_total{ 95 | %(withScaledJob)s 96 | }[$__rate_interval] 97 | ) 98 | ) by (exported_namespace, scaledObject) 99 | ||| % defaultFilters, 100 | 101 | scaledJobMetricValue: ||| 102 | avg( 103 | keda_scaler_metrics_value{ 104 | %(withScaledJobMetric)s 105 | } 106 | ) by (exported_namespace, scaledObject, scaler, metric) 107 | ||| % defaultFilters, 108 | 109 | scaledJobMetricLatency: ||| 110 | avg( 111 | keda_scaler_metrics_latency_seconds{ 112 | %(withScaledJobMetric)s 113 | } 114 
| ) by (exported_namespace, scaledObject, scaler, metric) 115 | ||| % defaultFilters, 116 | }; 117 | 118 | local panels = { 119 | resourcesRegisteredTimeSeries: 120 | mixinUtils.dashboards.timeSeriesPanel( 121 | 'Resources Registered by Namespace', 122 | 'short', 123 | queries.resourcesRegisteredByNamespace, 124 | '{{ exported_namespace}} / {{ type }}', 125 | description='The number of scaled job resources registered by namespace.', 126 | stack='normal', 127 | ), 128 | 129 | triggersByTypeTimeSeries: 130 | mixinUtils.dashboards.timeSeriesPanel( 131 | 'Triggers by Type', 132 | 'short', 133 | queries.triggersByType, 134 | '{{ type }}', 135 | description='The number of triggers registered by type.', 136 | stack='normal', 137 | ), 138 | 139 | scaledJobsErrorsTimeSeries: 140 | mixinUtils.dashboards.timeSeriesPanel( 141 | 'Scaled Jobs Errors', 142 | 'short', 143 | queries.scaledJobsErrors, 144 | '{{ scaledJob }}', 145 | description='The rate of errors for scaled jobs.', 146 | stack='normal', 147 | ), 148 | 149 | scalerDetailErrorsTimeSeries: 150 | mixinUtils.dashboards.timeSeriesPanel( 151 | 'Scaler Detail Errors', 152 | 'short', 153 | queries.scalerDetailErrors, 154 | '{{ scaledObject }} / {{ scaler }}', 155 | description='The rate of scaler detail errors.', 156 | stack='normal', 157 | ), 158 | 159 | scaleTargetValuesTable: 160 | mixinUtils.dashboards.tablePanel( 161 | 'Scale Target Values', 162 | 'short', 163 | queries.scaleTargetValues, 164 | description='This table has links to the Workload dashboard for the scaled Job, which can be used to see the current resource usage. 
The Workload dashboard can be found at [kubernetes-mixin](https://github.com/kubernetes-monitoring/kubernetes-mixin) and requires ID customization.', 165 | sortBy={ name: 'Scaled Job', desc: false }, 166 | transformations=[ 167 | tbQueryOptions.transformation.withId( 168 | 'organize' 169 | ) + 170 | tbQueryOptions.transformation.withOptions( 171 | { 172 | renameByName: { 173 | scaledObject: 'Scaled Job', 174 | exported_namespace: 'Resource Namespace', 175 | scaler: 'Scaler', 176 | metric: 'Metric', 177 | value: 'Value', 178 | }, 179 | indexByName: { 180 | scaledObject: 0, 181 | exported_namespace: 1, 182 | scaler: 2, 183 | metric: 3, 184 | value: 4, 185 | }, 186 | excludeByName: { 187 | Time: true, 188 | job: true, 189 | }, 190 | }, 191 | ), 192 | ], 193 | links=[ 194 | tbPanelOptions.link.withTitle('Go to Scaled Job') + 195 | tbPanelOptions.link.withUrl( 196 | '/d/%s/kubernetes-compute-resources-workload?var-namespace=${__data.fields.exported_namespace}&var-type=ScaledJob&var-workload=${__data.fields.scaledObject}' % $._config.keda.k8sResourcesWorkloadDashboardUid 197 | ) + 198 | tbPanelOptions.link.withTargetBlank(true), 199 | ] 200 | ), 201 | 202 | scaledJobActiveTimeSeries: 203 | mixinUtils.dashboards.timeSeriesPanel( 204 | 'Scaled Job Active', 205 | 'short', 206 | queries.scaledJobActive, 207 | '{{ scaledObject }}', 208 | description='Whether the scaled job is active.', 209 | ), 210 | 211 | scaledJobDetailErrorTimeSeries: 212 | mixinUtils.dashboards.timeSeriesPanel( 213 | 'Scaled Job Detail Errors', 214 | 'short', 215 | queries.scaledJobDetailError, 216 | '{{ scaledObject }}', 217 | description='The rate of errors for the selected scaled job.', 218 | ), 219 | 220 | scaledJobMetricValueTimeSeries: 221 | mixinUtils.dashboards.timeSeriesPanel( 222 | 'Scaled Job Metric Value', 223 | 'short', 224 | queries.scaledJobMetricValue, 225 | '{{ scaledObject }} / {{ scaler }} / {{ metric }}', 226 | description='The metric value for the selected scaled job.', 227 | 
stack='normal', 228 | ), 229 | 230 | scaledJobMetricLatencyTimeSeries: 231 | mixinUtils.dashboards.timeSeriesPanel( 232 | 'Scaled Job Metric Latency', 233 | 's', 234 | queries.scaledJobMetricLatency, 235 | '{{ scaledObject }} / {{ scaler }} / {{ metric }}', 236 | description='The metric collection latency for the selected scaled job.', 237 | ), 238 | }; 239 | 240 | local rows = 241 | [ 242 | row.new('Overview') + 243 | row.gridPos.withX(0) + 244 | row.gridPos.withY(0) + 245 | row.gridPos.withW(24) + 246 | row.gridPos.withH(1), 247 | ] + 248 | grid.makeGrid( 249 | [ 250 | panels.resourcesRegisteredTimeSeries, 251 | panels.triggersByTypeTimeSeries, 252 | ], 253 | panelWidth=12, 254 | panelHeight=6, 255 | startY=1 256 | ) + 257 | grid.makeGrid( 258 | [ 259 | panels.scaledJobsErrorsTimeSeries, 260 | panels.scalerDetailErrorsTimeSeries, 261 | ], 262 | panelWidth=12, 263 | panelHeight=6, 264 | startY=7 265 | ) + 266 | grid.makeGrid( 267 | [ 268 | panels.scaleTargetValuesTable, 269 | ], 270 | panelWidth=24, 271 | panelHeight=8, 272 | startY=13 273 | ) + 274 | [ 275 | row.new('Scaled Job $scaled_job / $scaler / $metric') + 276 | row.gridPos.withX(0) + 277 | row.gridPos.withY(21) + 278 | row.gridPos.withW(24) + 279 | row.gridPos.withH(1), 280 | ] + 281 | grid.makeGrid( 282 | [ 283 | panels.scaledJobActiveTimeSeries, 284 | panels.scaledJobDetailErrorTimeSeries, 285 | ], 286 | panelWidth=12, 287 | panelHeight=5, 288 | startY=22 289 | ) + 290 | grid.makeGrid( 291 | [ 292 | panels.scaledJobMetricValueTimeSeries, 293 | panels.scaledJobMetricLatencyTimeSeries, 294 | ], 295 | panelWidth=24, 296 | panelHeight=8, 297 | startY=27 298 | ); 299 | 300 | mixinUtils.dashboards.bypassDashboardValidation + 301 | dashboard.new( 302 | 'Kubernetes / Autoscaling / KEDA / Scaled Job', 303 | ) + 304 | dashboard.withDescription('A dashboard that monitors KEDA Scaled Jobs. 
%s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 305 | dashboard.withUid($._config.kedaScaledJobDashboardUid) + 306 | dashboard.withTags($._config.tags + ['keda']) + 307 | dashboard.withTimezone('utc') + 308 | dashboard.withEditable(true) + 309 | dashboard.time.withFrom('now-6h') + 310 | dashboard.time.withTo('now') + 311 | dashboard.withVariables(variables) + 312 | dashboard.withLinks( 313 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 314 | ) + 315 | dashboard.withPanels(rows), 316 | }, 317 | } 318 | -------------------------------------------------------------------------------- /dashboards/kubernetes/kubernetes-autoscaling-pdb.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | local tablePanel = g.panel.table; 10 | local timeSeriesPanel = g.panel.timeSeries; 11 | 12 | // Table 13 | local tbStandardOptions = tablePanel.standardOptions; 14 | local tbQueryOptions = tablePanel.queryOptions; 15 | local tbOverride = tbStandardOptions.override; 16 | local tbFieldConfig = tablePanel.fieldConfig; 17 | 18 | // Timeseries 19 | local tsStandardOptions = timeSeriesPanel.standardOptions; 20 | local tsOverride = tsStandardOptions.override; 21 | 22 | { 23 | grafanaDashboards+:: { 24 | 'kubernetes-autoscaling-mixin-pdb.json': 25 | 26 | local defaultVariables = util.variables($._config); 27 | 28 | local variables = [ 29 | defaultVariables.datasource, 30 | defaultVariables.cluster, 31 | defaultVariables.pdbJob, 32 | defaultVariables.pdbNamespace, 33 | defaultVariables.pdb, 34 | 
]; 35 | 36 | local defaultFilters = util.filters($._config); 37 | local queries = { 38 | disruptionsAllowed: ||| 39 | round( 40 | sum( 41 | kube_poddisruptionbudget_status_pod_disruptions_allowed{ 42 | %(withPdb)s 43 | } 44 | ) 45 | ) 46 | ||| % defaultFilters, 47 | 48 | desiredHealthy: ||| 49 | round( 50 | sum( 51 | kube_poddisruptionbudget_status_desired_healthy{ 52 | %(withPdb)s 53 | } 54 | ) 55 | ) 56 | ||| % defaultFilters, 57 | 58 | currentlyHealthy: ||| 59 | round( 60 | sum( 61 | kube_poddisruptionbudget_status_current_healthy{ 62 | %(withPdb)s 63 | } 64 | ) 65 | ) 66 | ||| % defaultFilters, 67 | 68 | expectedPods: ||| 69 | round( 70 | sum( 71 | kube_poddisruptionbudget_status_expected_pods{ 72 | %(withPdb)s 73 | } 74 | ) 75 | ) 76 | ||| % defaultFilters, 77 | 78 | disruptionsAllowedNamespace: ||| 79 | round( 80 | sum( 81 | kube_poddisruptionbudget_status_pod_disruptions_allowed{ 82 | %(base)s 83 | } 84 | ) by (job, namespace, poddisruptionbudget) 85 | ) 86 | ||| % defaultFilters, 87 | 88 | desiredHealthyNamespace: ||| 89 | round( 90 | sum( 91 | kube_poddisruptionbudget_status_desired_healthy{ 92 | %(base)s 93 | } 94 | ) by (job, namespace, poddisruptionbudget) 95 | ) 96 | ||| % defaultFilters, 97 | 98 | currentlyHealthyNamespace: ||| 99 | round( 100 | sum( 101 | kube_poddisruptionbudget_status_current_healthy{ 102 | %(base)s 103 | } 104 | ) by (job, namespace, poddisruptionbudget) 105 | ) 106 | ||| % defaultFilters, 107 | 108 | expectedPodsNamespace: ||| 109 | round( 110 | sum( 111 | kube_poddisruptionbudget_status_expected_pods{ 112 | %(base)s 113 | } 114 | ) by (job, namespace, poddisruptionbudget) 115 | ) 116 | ||| % defaultFilters, 117 | }; 118 | 119 | local panels = { 120 | disruptionsAllowedStat: 121 | mixinUtils.dashboards.statPanel( 122 | 'Disruptions Allowed', 123 | 'short', 124 | queries.disruptionsAllowed, 125 | description='The number of pod disruptions allowed for the selected PDB.', 126 | ), 127 | 128 | desiredHealthyStat: 129 | 
mixinUtils.dashboards.statPanel( 130 | 'Desired Healthy', 131 | 'short', 132 | queries.desiredHealthy, 133 | description='The desired number of healthy pods for the selected PDB.', 134 | ), 135 | 136 | currentlyHealthyStat: 137 | mixinUtils.dashboards.statPanel( 138 | 'Currently Healthy', 139 | 'short', 140 | queries.currentlyHealthy, 141 | description='The current number of healthy pods for the selected PDB.', 142 | ), 143 | 144 | expectedPodsStat: 145 | mixinUtils.dashboards.statPanel( 146 | 'Expected Pods', 147 | 'short', 148 | queries.expectedPods, 149 | description='The expected number of pods for the selected PDB.', 150 | ), 151 | 152 | namespaceSummaryTable: 153 | mixinUtils.dashboards.tablePanel( 154 | 'Summary', 155 | 'short', 156 | [ 157 | { 158 | expr: queries.disruptionsAllowedNamespace, 159 | legend: 'Disruptions Allowed', 160 | }, 161 | { 162 | expr: queries.desiredHealthyNamespace, 163 | legend: 'Desired Healthy', 164 | }, 165 | { 166 | expr: queries.currentlyHealthyNamespace, 167 | legend: 'Currently Healthy', 168 | }, 169 | { 170 | expr: queries.expectedPodsNamespace, 171 | legend: 'Expected Pods', 172 | }, 173 | ], 174 | description='Summary of all PDBs in the selected namespace.', 175 | sortBy={ name: 'Pod Disruption Budget', desc: false }, 176 | transformations=[ 177 | tbQueryOptions.transformation.withId('merge'), 178 | tbQueryOptions.transformation.withId('organize') + 179 | tbQueryOptions.transformation.withOptions( 180 | { 181 | renameByName: { 182 | poddisruptionbudget: 'Pod Disruption Budget', 183 | namespace: 'Namespace', 184 | 'Value #A': 'Disruptions Allowed', 185 | 'Value #B': 'Desired Healthy', 186 | 'Value #C': 'Currently Healthy', 187 | 'Value #D': 'Expected Pods', 188 | }, 189 | indexByName: { 190 | namespace: 0, 191 | poddisruptionbudget: 1, 192 | 'Value #A': 2, 193 | 'Value #B': 3, 194 | 'Value #C': 4, 195 | 'Value #D': 5, 196 | }, 197 | excludeByName: { 198 | Time: true, 199 | job: true, 200 | }, 201 | } 202 | ), 203 | ], 204 | 
overrides=[ 205 | tbOverride.byName.new('Disruptions Allowed') + 206 | tbOverride.byName.withPropertiesFromOptions( 207 | tbFieldConfig.defaults.custom.withCellOptions( 208 | { type: 'color-text' } 209 | ) + 210 | tbStandardOptions.thresholds.withMode('absolute') + 211 | tbStandardOptions.thresholds.withSteps([ 212 | tbStandardOptions.threshold.step.withValue(0) + 213 | tbStandardOptions.threshold.step.withColor('red'), 214 | tbStandardOptions.threshold.step.withValue(0.1) + 215 | tbStandardOptions.threshold.step.withColor('green'), 216 | ]) 217 | ), 218 | ], 219 | ), 220 | 221 | statusTimeSeries: 222 | mixinUtils.dashboards.timeSeriesPanel( 223 | 'Status', 224 | 'short', 225 | [ 226 | { 227 | expr: queries.disruptionsAllowed, 228 | legend: 'Disruptions Allowed', 229 | }, 230 | { 231 | expr: queries.desiredHealthy, 232 | legend: 'Desired Healthy', 233 | }, 234 | { 235 | expr: queries.currentlyHealthy, 236 | legend: 'Currently Healthy', 237 | }, 238 | { 239 | expr: queries.expectedPods, 240 | legend: 'Expected Pods', 241 | }, 242 | ], 243 | description='Status metrics for the selected PDB over time.', 244 | fillOpacity=0, 245 | overrides=[ 246 | tsOverride.byName.new('Currently Healthy') + 247 | tsOverride.byName.withPropertiesFromOptions( 248 | tsStandardOptions.color.withMode('fixed') + 249 | tsStandardOptions.color.withFixedColor('yellow') 250 | ), 251 | tsOverride.byName.new('Disruptions Allowed') + 252 | tsOverride.byName.withPropertiesFromOptions( 253 | tsStandardOptions.color.withMode('fixed') + 254 | tsStandardOptions.color.withFixedColor('red') 255 | ), 256 | tsOverride.byName.new('Desired Healthy') + 257 | tsOverride.byName.withPropertiesFromOptions( 258 | tsStandardOptions.color.withMode('fixed') + 259 | tsStandardOptions.color.withFixedColor('green') 260 | ), 261 | tsOverride.byName.new('Expected Pods') + 262 | tsOverride.byName.withPropertiesFromOptions( 263 | tsStandardOptions.color.withMode('fixed') + 264 | 
tsStandardOptions.color.withFixedColor('blue') 265 | ), 266 | ], 267 | ), 268 | }; 269 | 270 | local rows = 271 | [ 272 | row.new('$namespace Namespace Summary') + 273 | row.gridPos.withX(0) + 274 | row.gridPos.withY(0) + 275 | row.gridPos.withW(24) + 276 | row.gridPos.withH(1), 277 | panels.namespaceSummaryTable + 278 | row.gridPos.withX(0) + 279 | row.gridPos.withY(1) + 280 | row.gridPos.withW(24) + 281 | row.gridPos.withH(10), 282 | row.new('$poddisruptionbudget') + 283 | row.gridPos.withX(0) + 284 | row.gridPos.withY(11) + 285 | row.gridPos.withW(24) + 286 | row.gridPos.withH(1) + 287 | row.withRepeat('poddisruptionbudget'), 288 | ] + 289 | grid.makeGrid( 290 | [ 291 | panels.disruptionsAllowedStat, 292 | panels.desiredHealthyStat, 293 | panels.currentlyHealthyStat, 294 | panels.expectedPodsStat, 295 | ], 296 | panelWidth=6, 297 | panelHeight=4, 298 | startY=12 299 | ) + 300 | [ 301 | panels.statusTimeSeries + 302 | row.gridPos.withX(0) + 303 | row.gridPos.withY(16) + 304 | row.gridPos.withW(24) + 305 | row.gridPos.withH(10), 306 | ]; 307 | 308 | mixinUtils.dashboards.bypassDashboardValidation + 309 | dashboard.new( 310 | 'Kubernetes / Autoscaling / Pod Disruption Budget', 311 | ) + 312 | dashboard.withDescription('A dashboard that monitors Kubernetes and focuses on giving an overview for pod disruption budgets.
%s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 313 | dashboard.withUid($._config.pdbDashboardUid) + 314 | dashboard.withTags($._config.tags + ['kubernetes-core']) + 315 | dashboard.withTimezone('utc') + 316 | dashboard.withEditable(true) + 317 | dashboard.time.withFrom('now-6h') + 318 | dashboard.time.withTo('now') + 319 | dashboard.withVariables(variables) + 320 | dashboard.withLinks( 321 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 322 | ) + 323 | dashboard.withPanels(rows) + 324 | dashboard.withAnnotations( 325 | mixinUtils.dashboards.annotations($._config, defaultFilters) 326 | ), 327 | }, 328 | } 329 | -------------------------------------------------------------------------------- /dashboards/cluster-autoscaler/kubernetes-autoscaling-cluster-autoscaler.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | // Gauge panel helpers 10 | local gauge = g.panel.gauge; 11 | local gaStandardOptions = gauge.standardOptions; 12 | 13 | { 14 | grafanaDashboards+:: { 15 | 'kubernetes-autoscaling-mixin-ca.json': 16 | if !$._config.clusterAutoscaler.enabled then {} else 17 | 18 | local defaultVariables = util.variables($._config); 19 | 20 | local variables = [ 21 | defaultVariables.datasource, 22 | defaultVariables.cluster, 23 | defaultVariables.job, 24 | ]; 25 | 26 | local defaultFilters = util.filters($._config); 27 | local queries = { 28 | totalNodes: ||| 29 | round( 30 | sum( 31 | cluster_autoscaler_nodes_count{ 32 | %(base)s 33 | } 34 | ) 35 | ) 36 | ||| % 
defaultFilters, 37 | 38 | maxNodes: ||| 39 | round( 40 | sum( 41 | cluster_autoscaler_max_nodes_count{ 42 | %(base)s 43 | } 44 | ) 45 | ) 46 | ||| % defaultFilters, 47 | 48 | nodeGroups: ||| 49 | round( 50 | sum( 51 | cluster_autoscaler_node_groups_count{ 52 | %(base)s 53 | } 54 | ) 55 | ) 56 | ||| % defaultFilters, 57 | 58 | healthyNodes: ||| 59 | round( 60 | sum( 61 | cluster_autoscaler_nodes_count{ 62 | %(base)s, 63 | state="ready" 64 | } 65 | ) / 66 | sum( 67 | cluster_autoscaler_nodes_count{ 68 | %(base)s 69 | } 70 | ) * 100 71 | ) 72 | ||| % defaultFilters, 73 | 74 | safeToScale: ||| 75 | sum( 76 | cluster_autoscaler_cluster_safe_to_autoscale{ 77 | %(base)s 78 | } 79 | ) 80 | ||| % defaultFilters, 81 | 82 | numberUnscheduledPods: ||| 83 | round( 84 | sum( 85 | cluster_autoscaler_unschedulable_pods_count{ 86 | %(base)s 87 | } 88 | ) 89 | ) 90 | ||| % defaultFilters, 91 | 92 | lastScaleDown: ||| 93 | time() - max( 94 | cluster_autoscaler_last_activity{ 95 | %(base)s, 96 | activity="scaleDown" 97 | } 98 | ) 99 | ||| % defaultFilters, 100 | 101 | lastScaleUp: ||| 102 | time() - max( 103 | cluster_autoscaler_last_activity{ 104 | %(base)s, 105 | activity="scaleUp" 106 | } 107 | ) 108 | ||| % defaultFilters, 109 | 110 | unschedulablePods: ||| 111 | round( 112 | sum( 113 | increase( 114 | cluster_autoscaler_unschedulable_pods_count{ 115 | %(base)s 116 | }[$__rate_interval] 117 | ) 118 | ) by (type) 119 | ) 120 | ||| % defaultFilters, 121 | 122 | evictedPods: ||| 123 | round( 124 | sum( 125 | increase( 126 | cluster_autoscaler_evicted_pods_total{ 127 | %(base)s 128 | }[$__rate_interval] 129 | ) 130 | ) by (eviction_result) 131 | ) 132 | ||| % defaultFilters, 133 | 134 | nodeActivity: ||| 135 | round( 136 | sum( 137 | cluster_autoscaler_nodes_count{ 138 | %(base)s 139 | } 140 | ) by (state) 141 | ) 142 | ||| % defaultFilters, 143 | 144 | unneededNodes: ||| 145 | round( 146 | sum( 147 | cluster_autoscaler_unneeded_nodes_count{ 148 | %(base)s 149 | } 150 | ) 151 | ) 152 
| ||| % defaultFilters, 153 | 154 | scaledUpNodes: ||| 155 | round( 156 | sum( 157 | increase( 158 | cluster_autoscaler_scaled_up_nodes_total{ 159 | %(base)s 160 | }[$__rate_interval] 161 | ) 162 | ) 163 | ) 164 | ||| % defaultFilters, 165 | 166 | scaledDownNodes: ||| 167 | round( 168 | sum( 169 | increase( 170 | cluster_autoscaler_scaled_down_nodes_total{ 171 | %(base)s 172 | }[$__rate_interval] 173 | ) 174 | ) 175 | ) 176 | ||| % defaultFilters, 177 | }; 178 | 179 | local panels = { 180 | totalNodesStat: 181 | mixinUtils.dashboards.statPanel( 182 | 'Total Nodes', 183 | 'short', 184 | queries.totalNodes, 185 | description='The total number of nodes in the cluster.', 186 | ), 187 | 188 | maxNodesStat: 189 | mixinUtils.dashboards.statPanel( 190 | 'Max Nodes', 191 | 'short', 192 | queries.maxNodes, 193 | description='The maximum number of nodes allowed in the cluster.', 194 | ), 195 | 196 | nodeGroupsStat: 197 | mixinUtils.dashboards.statPanel( 198 | 'Node Groups', 199 | 'short', 200 | queries.nodeGroups, 201 | description='The number of node groups in the cluster.', 202 | ), 203 | 204 | healthyNodesGauge: 205 | mixinUtils.dashboards.gaugePanel( 206 | 'Healthy Nodes', 207 | 'percent', 208 | queries.healthyNodes, 209 | description='The percentage of healthy nodes in the cluster.', 210 | min=0, 211 | max=100, 212 | steps=[ 213 | gaStandardOptions.threshold.step.withValue(0) + 214 | gaStandardOptions.threshold.step.withColor('red'), 215 | gaStandardOptions.threshold.step.withValue(50) + 216 | gaStandardOptions.threshold.step.withColor('yellow'), 217 | gaStandardOptions.threshold.step.withValue(80) + 218 | gaStandardOptions.threshold.step.withColor('green'), 219 | ], 220 | ), 221 | 222 | safeToScaleStat: 223 | mixinUtils.dashboards.statPanel( 224 | 'Safe to Scale', 225 | 'short', 226 | queries.safeToScale, 227 | description='Indicates whether it is safe to scale the cluster.', 228 | steps=[ 229 | gaStandardOptions.threshold.step.withValue(0) + 230 | 
gaStandardOptions.threshold.step.withColor('red'), 231 | gaStandardOptions.threshold.step.withValue(0.1) + 232 | gaStandardOptions.threshold.step.withColor('green'), 233 | ], 234 | mappings=[ 235 | gaStandardOptions.mapping.ValueMap.withType() + 236 | gaStandardOptions.mapping.ValueMap.withOptions( 237 | { 238 | '0': { text: 'No', color: 'red' }, 239 | '1': { text: 'Yes', color: 'green' }, 240 | } 241 | ), 242 | ], 243 | ), 244 | 245 | numberUnscheduledPodsStat: 246 | mixinUtils.dashboards.statPanel( 247 | 'Unscheduled Pods', 248 | 'short', 249 | queries.numberUnscheduledPods, 250 | description='The number of unscheduled pods in the cluster.', 251 | ), 252 | 253 | lastScaleDownStat: 254 | mixinUtils.dashboards.statPanel( 255 | 'Last Scale Down', 256 | 's', 257 | queries.lastScaleDown, 258 | description='The timestamp of the last scale down activity.', 259 | ), 260 | 261 | lastScaleUpStat: 262 | mixinUtils.dashboards.statPanel( 263 | 'Last Scale Up', 264 | 's', 265 | queries.lastScaleUp, 266 | description='The timestamp of the last scale up activity.', 267 | ), 268 | 269 | podActivityTimeSeries: 270 | mixinUtils.dashboards.timeSeriesPanel( 271 | 'Pod Activity', 272 | 'short', 273 | [ 274 | { 275 | expr: queries.unschedulablePods, 276 | legend: '{{ type }}', 277 | }, 278 | { 279 | expr: queries.evictedPods, 280 | legend: 'Evicted / {{ eviction_result }}', 281 | }, 282 | ], 283 | description='The activity of pods in the cluster.', 284 | stack='normal' 285 | ), 286 | 287 | nodeActivityTimeSeries: 288 | mixinUtils.dashboards.timeSeriesPanel( 289 | 'Node Activity', 290 | 'short', 291 | queries.nodeActivity, 292 | '{{ state }}', 293 | description='The activity of nodes in the cluster.', 294 | stack='normal' 295 | ), 296 | 297 | autoscalingActivityTimeSeries: 298 | mixinUtils.dashboards.timeSeriesPanel( 299 | 'Autoscaling Activity', 300 | 'short', 301 | [ 302 | { 303 | expr: queries.totalNodes, 304 | legend: 'Total Nodes', 305 | }, 306 | { 307 | expr: 
queries.unneededNodes, 308 | legend: 'Unneeded', 309 | }, 310 | { 311 | expr: queries.scaledUpNodes, 312 | legend: 'Scaled Up', 313 | }, 314 | { 315 | expr: queries.scaledDownNodes, 316 | legend: 'Scaled Down', 317 | }, 318 | ], 319 | description='The autoscaling activity in the cluster.', 320 | fillOpacity=0, 321 | ), 322 | }; 323 | 324 | local rows = 325 | [ 326 | row.new('Summary') + 327 | row.gridPos.withX(0) + 328 | row.gridPos.withY(0) + 329 | row.gridPos.withW(24) + 330 | row.gridPos.withH(1), 331 | ] + 332 | grid.makeGrid( 333 | [ 334 | panels.totalNodesStat, 335 | panels.maxNodesStat, 336 | panels.nodeGroupsStat, 337 | panels.healthyNodesGauge, 338 | panels.safeToScaleStat, 339 | panels.numberUnscheduledPodsStat, 340 | panels.lastScaleDownStat, 341 | panels.lastScaleUpStat, 342 | ], 343 | panelWidth=3, 344 | panelHeight=4, 345 | startY=1 346 | ) + 347 | [ 348 | row.new('Activity') + 349 | row.gridPos.withX(0) + 350 | row.gridPos.withY(5) + 351 | row.gridPos.withW(24) + 352 | row.gridPos.withH(1), 353 | ] + 354 | grid.makeGrid( 355 | [ 356 | panels.podActivityTimeSeries, 357 | panels.nodeActivityTimeSeries, 358 | ], 359 | panelWidth=12, 360 | panelHeight=8, 361 | startY=6 362 | ) + 363 | grid.makeGrid( 364 | [ 365 | panels.autoscalingActivityTimeSeries, 366 | ], 367 | panelWidth=24, 368 | panelHeight=8, 369 | startY=14 370 | ); 371 | 372 | mixinUtils.dashboards.bypassDashboardValidation + 373 | dashboard.new( 374 | 'Kubernetes / Autoscaling / Cluster Autoscaler', 375 | ) + 376 | dashboard.withDescription('A dashboard that monitors the Cluster Autoscaler. 
%s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 377 | dashboard.withUid($._config.clusterAutoscalerDashboardUid) + 378 | dashboard.withTags($._config.tags + ['cluster-autoscaler']) + 379 | dashboard.withTimezone('utc') + 380 | dashboard.withEditable(true) + 381 | dashboard.time.withFrom('now-6h') + 382 | dashboard.time.withTo('now') + 383 | dashboard.withVariables(variables) + 384 | dashboard.withLinks( 385 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 386 | ) + 387 | dashboard.withPanels( 388 | rows 389 | ) + 390 | dashboard.withAnnotations( 391 | mixinUtils.dashboards.annotations($._config, defaultFilters) 392 | ), 393 | }, 394 | } 395 | -------------------------------------------------------------------------------- /dashboards/keda/keda-scaled-object.libsonnet: -------------------------------------------------------------------------------- 1 | local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet'; 2 | local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; 3 | local util = import 'util.libsonnet'; 4 | 5 | local dashboard = g.dashboard; 6 | local row = g.panel.row; 7 | local grid = g.util.grid; 8 | 9 | local tablePanel = g.panel.table; 10 | 11 | // Table 12 | local tbQueryOptions = tablePanel.queryOptions; 13 | local tbPanelOptions = tablePanel.panelOptions; 14 | 15 | { 16 | grafanaDashboards+:: { 17 | 'kubernetes-autoscaling-mixin-keda-so.json': 18 | if !$._config.keda.enabled then {} else 19 | 20 | local defaultVariables = util.variables($._config); 21 | 22 | local variables = [ 23 | defaultVariables.datasource, 24 | defaultVariables.cluster, 25 | defaultVariables.scaledObjectJob, 26 | defaultVariables.scaledObjectOperatorNamespace, 27 | defaultVariables.scaledObjectResourceNamespace, 28 | defaultVariables.scaledObject, 29 | 
defaultVariables.scalerForScaledObject, 30 | defaultVariables.metricForScaledObject, 31 | ]; 32 | 33 | local defaultFilters = util.filters($._config); 34 | 35 | local queries = { 36 | resourcesRegisteredByNamespace: ||| 37 | sum( 38 | keda_resource_registered_total{ 39 | %(base)s, 40 | type="scaled_object" 41 | } 42 | ) by (exported_namespace, type) 43 | ||| % defaultFilters, 44 | 45 | triggersByType: ||| 46 | sum( 47 | keda_trigger_registered_total{ 48 | %(base)s 49 | } 50 | ) by (type) 51 | ||| % defaultFilters, 52 | 53 | scaledObjectsErrors: ||| 54 | sum( 55 | increase( 56 | keda_scaled_object_errors_total{ 57 | %(withResourceNamespace)s 58 | }[$__rate_interval] 59 | ) 60 | ) by (exported_namespace, scaledObject) 61 | ||| % defaultFilters, 62 | 63 | scalerDetailErrors: ||| 64 | sum( 65 | increase( 66 | keda_scaler_detail_errors_total{ 67 | %(withResourceNamespace)s, 68 | type="scaledobject" 69 | }[$__rate_interval] 70 | ) 71 | ) by (exported_namespace, scaledObject, scaler) 72 | ||| % defaultFilters, 73 | 74 | scaledObjectsPaused: ||| 75 | sum( 76 | keda_scaled_object_paused{ 77 | %(withResourceNamespace)s 78 | } 79 | ) by (exported_namespace, scaledObject) 80 | > 0 81 | ||| % defaultFilters, 82 | 83 | scaleTargetValues: ||| 84 | sum( 85 | keda_scaler_metrics_value{ 86 | %(withResourceNamespace)s, 87 | type="scaledobject" 88 | } 89 | ) by (job, exported_namespace, scaledObject, scaler, metric) 90 | ||| % defaultFilters, 91 | 92 | scaledObjectPaused: ||| 93 | sum( 94 | keda_scaled_object_paused{ 95 | %(withScaledObject)s 96 | } 97 | ) by (exported_namespace, scaledObject) 98 | ||| % defaultFilters, 99 | 100 | scaledObjectActive: ||| 101 | sum( 102 | keda_scaler_active{ 103 | %(withScaledObject)s 104 | } 105 | ) by (exported_namespace, scaledObject) 106 | ||| % defaultFilters, 107 | 108 | scaledObjectDetailError: ||| 109 | sum( 110 | increase( 111 | keda_scaler_detail_errors_total{ 112 | %(withScaledObject)s 113 | }[$__rate_interval] 114 | ) 115 | ) by 
(exported_namespace, scaledObject) 116 | ||| % defaultFilters, 117 | 118 | scaledObjectMetricValue: ||| 119 | avg( 120 | keda_scaler_metrics_value{ 121 | %(withScaledObjectMetric)s 122 | } 123 | ) by (exported_namespace, scaledObject, scaler, metric) 124 | ||| % defaultFilters, 125 | 126 | scaledObjectMetricLatency: ||| 127 | avg( 128 | keda_scaler_metrics_latency_seconds{ 129 | %(withScaledObjectMetric)s 130 | } 131 | ) by (exported_namespace, scaledObject, scaler, metric) 132 | ||| % defaultFilters, 133 | }; 134 | 135 | local panels = { 136 | resourcesRegisteredTimeSeries: 137 | mixinUtils.dashboards.timeSeriesPanel( 138 | 'Resources Registered by Namespace', 139 | 'short', 140 | queries.resourcesRegisteredByNamespace, 141 | '{{ exported_namespace}} / {{ type }}', 142 | description='The number of scaled object resources registered by namespace.', 143 | stack='normal', 144 | ), 145 | 146 | triggersByTypeTimeSeries: 147 | mixinUtils.dashboards.timeSeriesPanel( 148 | 'Triggers by Type', 149 | 'short', 150 | queries.triggersByType, 151 | '{{ type }}', 152 | description='The number of triggers registered by type.', 153 | stack='normal', 154 | ), 155 | 156 | scaledObjectsErrorsTimeSeries: 157 | mixinUtils.dashboards.timeSeriesPanel( 158 | 'Scaled Objects Errors', 159 | 'short', 160 | queries.scaledObjectsErrors, 161 | '{{ scaledObject }}', 162 | description='The rate of errors for scaled objects.', 163 | stack='normal', 164 | ), 165 | 166 | scalerDetailErrorsTimeSeries: 167 | mixinUtils.dashboards.timeSeriesPanel( 168 | 'Scaler Detail Errors', 169 | 'short', 170 | queries.scalerDetailErrors, 171 | '{{ scaledObject }} / {{ scaler }}', 172 | description='The rate of scaler detail errors.', 173 | stack='normal', 174 | ), 175 | 176 | scaledObjectsPausedTimeSeries: 177 | mixinUtils.dashboards.timeSeriesPanel( 178 | 'Scaled Objects Paused', 179 | 'short', 180 | queries.scaledObjectsPaused, 181 | '{{ scaledObject }}', 182 | description='Scaled objects that are currently 
paused.', 183 | stack='normal', 184 | ), 185 | 186 | scaleTargetValuesTable: 187 | mixinUtils.dashboards.tablePanel( 188 | 'Scale Target Values', 189 | 'short', 190 | queries.scaleTargetValues, 191 | description='This table has links to the HPA for the scaled object, which can be used to see the current scaling status and history. The HPA dashboard can be found at [kubernetes-autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin).', 192 | sortBy={ name: 'Scaled Object', desc: false }, 193 | transformations=[ 194 | tbQueryOptions.transformation.withId( 195 | 'organize' 196 | ) + 197 | tbQueryOptions.transformation.withOptions( 198 | { 199 | renameByName: { 200 | scaledObject: 'Scaled Object', 201 | exported_namespace: 'Resource Namespace', 202 | scaler: 'Scaler', 203 | metric: 'Metric', 204 | value: 'Value', 205 | }, 206 | indexByName: { 207 | scaledObject: 0, 208 | exported_namespace: 1, 209 | scaler: 2, 210 | metric: 3, 211 | value: 4, 212 | }, 213 | excludeByName: { 214 | Time: true, 215 | job: true, 216 | }, 217 | } 218 | ), 219 | ], 220 | links=[ 221 | tbPanelOptions.link.withTitle('Go to HPA') + 222 | tbPanelOptions.link.withUrl( 223 | '/d/%s/kubernetes-autoscaling-horizontal-pod-autoscaler?var-namespace=${__data.fields.namespace}&var-hpa=keda-hpa-${__data.fields.scaledObject}&var-metric_name=${__data.fields.metric}' % $._config.hpaDashboardUid 224 | ) + 225 | tbPanelOptions.link.withTargetBlank(true), 226 | ] 227 | ), 228 | 229 | scaledObjectPausedTimeSeries: 230 | mixinUtils.dashboards.timeSeriesPanel( 231 | 'Scaled Object Paused', 232 | 'short', 233 | queries.scaledObjectPaused, 234 | '{{ scaledObject }}', 235 | description='Whether the selected scaled object is paused.', 236 | ), 237 | 238 | scaledObjectActiveTimeSeries: 239 | mixinUtils.dashboards.timeSeriesPanel( 240 | 'Scaled Object Active', 241 | 'short', 242 | queries.scaledObjectActive, 243 | '{{ scaledObject }}', 244 | description='Whether the selected scaled object is 
active.', 245 | ), 246 | 247 | scaledObjectDetailErrorTimeSeries: 248 | mixinUtils.dashboards.timeSeriesPanel( 249 | 'Scaled Object Detail Errors', 250 | 'short', 251 | queries.scaledObjectDetailError, 252 | '{{ scaledObject }}', 253 | description='The rate of errors for the selected scaled object.', 254 | ), 255 | 256 | scaledObjectMetricValueTimeSeries: 257 | mixinUtils.dashboards.timeSeriesPanel( 258 | 'Scaled Object Metric Value', 259 | 'short', 260 | queries.scaledObjectMetricValue, 261 | '{{ scaledObject }} / {{ scaler }} / {{ metric }}', 262 | description='The metric value for the selected scaled object.', 263 | stack='normal', 264 | ), 265 | 266 | scaledObjectMetricLatencyTimeSeries: 267 | mixinUtils.dashboards.timeSeriesPanel( 268 | 'Scaled Object Metric Latency', 269 | 's', 270 | queries.scaledObjectMetricLatency, 271 | '{{ scaledObject }} / {{ scaler }} / {{ metric }}', 272 | description='The metric collection latency for the selected scaled object.', 273 | ), 274 | }; 275 | 276 | local rows = 277 | [ 278 | row.new('Overview') + 279 | row.gridPos.withX(0) + 280 | row.gridPos.withY(0) + 281 | row.gridPos.withW(24) + 282 | row.gridPos.withH(1), 283 | ] + 284 | grid.makeGrid( 285 | [ 286 | panels.resourcesRegisteredTimeSeries, 287 | panels.triggersByTypeTimeSeries, 288 | ], 289 | panelWidth=12, 290 | panelHeight=6, 291 | startY=1 292 | ) + 293 | grid.makeGrid( 294 | [ 295 | panels.scaledObjectsErrorsTimeSeries, 296 | panels.scalerDetailErrorsTimeSeries, 297 | panels.scaledObjectsPausedTimeSeries, 298 | ], 299 | panelWidth=8, 300 | panelHeight=6, 301 | startY=7 302 | ) + 303 | grid.makeGrid( 304 | [ 305 | panels.scaleTargetValuesTable, 306 | ], 307 | panelWidth=24, 308 | panelHeight=8, 309 | startY=13 310 | ) + 311 | [ 312 | row.new('Scaled Object $scaled_object / $scaler / $metric') + 313 | row.gridPos.withX(0) + 314 | row.gridPos.withY(21) + 315 | row.gridPos.withW(24) + 316 | row.gridPos.withH(1), 317 | ] + 318 | grid.makeGrid( 319 | [ 320 | 
panels.scaledObjectPausedTimeSeries, 321 | panels.scaledObjectActiveTimeSeries, 322 | panels.scaledObjectDetailErrorTimeSeries, 323 | ], 324 | panelWidth=8, 325 | panelHeight=5, 326 | startY=22 327 | ) + 328 | grid.makeGrid( 329 | [ 330 | panels.scaledObjectMetricValueTimeSeries, 331 | panels.scaledObjectMetricLatencyTimeSeries, 332 | ], 333 | panelWidth=24, 334 | panelHeight=8, 335 | startY=27 336 | ); 337 | 338 | mixinUtils.dashboards.bypassDashboardValidation + 339 | dashboard.new( 340 | 'Kubernetes / Autoscaling / KEDA / Scaled Object', 341 | ) + 342 | dashboard.withDescription('A dashboard that monitors KEDA Scaled Objects. %s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) + 343 | dashboard.withUid($._config.kedaScaledObjectDashboardUid) + 344 | dashboard.withTags($._config.tags + ['keda']) + 345 | dashboard.withTimezone('utc') + 346 | dashboard.withEditable(true) + 347 | dashboard.time.withFrom('now-6h') + 348 | dashboard.time.withTo('now') + 349 | dashboard.withVariables(variables) + 350 | dashboard.withLinks( 351 | mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true) 352 | ) + 353 | dashboard.withPanels(rows), 354 | }, 355 | } 356 | -------------------------------------------------------------------------------- /scripts/go.mod: -------------------------------------------------------------------------------- 1 | module _ 2 | 3 | go 1.24.0 4 | 5 | toolchain go1.24.1 6 | 7 | require ( 8 | github.com/Kunde21/markdownfmt/v3 v3.1.0 9 | github.com/cloudflare/pint v0.74.6 10 | github.com/errata-ai/vale/v3 v3.12.0 11 | github.com/google/go-jsonnet v0.21.0 12 | github.com/grafana/dashboard-linter v0.0.0-20231114210226-c458893a5731 13 | github.com/jsonnet-bundler/jsonnet-bundler v0.6.0 14 | github.com/prometheus/prometheus v0.304.2 15 | ) 16 | 17 | require ( 18 | atomicgo.dev/cursor v0.2.0 // indirect 19 | atomicgo.dev/keyboard 
v0.2.9 // indirect 20 | atomicgo.dev/schedule v0.1.0 // indirect 21 | cloud.google.com/go v0.115.1 // indirect 22 | cloud.google.com/go/auth v0.16.0 // indirect 23 | cloud.google.com/go/auth/oauth2adapt v0.2.8 // indirect 24 | cloud.google.com/go/compute/metadata v0.6.0 // indirect 25 | cloud.google.com/go/iam v1.2.0 // indirect 26 | cloud.google.com/go/storage v1.43.0 // indirect 27 | dario.cat/mergo v1.0.1 // indirect 28 | github.com/Azure/azure-sdk-for-go/sdk/azcore v1.18.0 // indirect 29 | github.com/Azure/azure-sdk-for-go/sdk/azidentity v1.9.0 // indirect 30 | github.com/Azure/azure-sdk-for-go/sdk/internal v1.11.1 // indirect 31 | github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/compute/armcompute/v5 v5.7.0 // indirect 32 | github.com/Azure/azure-sdk-for-go/sdk/resourcemanager/network/armnetwork/v4 v4.3.0 // indirect 33 | github.com/Azure/go-ansiterm v0.0.0-20230124172434-306776ec8161 // indirect 34 | github.com/AzureAD/microsoft-authentication-library-for-go v1.4.2 // indirect 35 | github.com/BurntSushi/toml v1.4.0 // indirect 36 | github.com/Code-Hex/go-generics-cache v1.5.1 // indirect 37 | github.com/Masterminds/goutils v1.1.1 // indirect 38 | github.com/Masterminds/semver/v3 v3.3.1 // indirect 39 | github.com/Masterminds/sprig/v3 v3.3.0 // indirect 40 | github.com/Microsoft/go-winio v0.6.1 // indirect 41 | github.com/adrg/frontmatter v0.2.0 // indirect 42 | github.com/adrg/strutil v0.3.1 // indirect 43 | github.com/adrg/xdg v0.5.3 // indirect 44 | github.com/agext/levenshtein v1.2.1 // indirect 45 | github.com/alecthomas/kingpin/v2 v2.4.0 // indirect 46 | github.com/alecthomas/template v0.0.0-20190718012654-fb15b899a751 // indirect 47 | github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b // indirect 48 | github.com/apparentlymart/go-textseg/v15 v15.0.0 // indirect 49 | github.com/armon/go-metrics v0.4.1 // indirect 50 | github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 // indirect 51 | github.com/aws/aws-sdk-go v1.55.7 
// indirect 52 | github.com/bboreham/go-loser v0.0.0-20230920113527-fcc2c21820a3 // indirect 53 | github.com/benbjohnson/clock v1.3.5 // indirect 54 | github.com/beorn7/perks v1.0.1 // indirect 55 | github.com/bmatcuk/doublestar/v4 v4.7.1 // indirect 56 | github.com/cespare/xxhash/v2 v2.3.0 // indirect 57 | github.com/cncf/xds/go v0.0.0-20250121191232-2f005788dc42 // indirect 58 | github.com/containerd/console v1.0.3 // indirect 59 | github.com/coreos/go-systemd/v22 v22.5.0 // indirect 60 | github.com/d5/tengo/v2 v2.17.0 // indirect 61 | github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect 62 | github.com/dennwc/varint v1.0.0 // indirect 63 | github.com/digitalocean/godo v1.144.0 // indirect 64 | github.com/distribution/reference v0.5.0 // indirect 65 | github.com/docker/docker v28.1.1+incompatible // indirect 66 | github.com/docker/go-connections v0.4.0 // indirect 67 | github.com/docker/go-units v0.5.0 // indirect 68 | github.com/edsrzf/mmap-go v1.2.0 // indirect 69 | github.com/elliotchance/orderedmap/v2 v2.2.0 // indirect 70 | github.com/emicklei/go-restful/v3 v3.11.0 // indirect 71 | github.com/envoyproxy/go-control-plane/envoy v1.32.4 // indirect 72 | github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect 73 | github.com/errata-ai/ini v1.63.0 // indirect 74 | github.com/errata-ai/regexp2 v1.7.0 // indirect 75 | github.com/expr-lang/expr v1.17.0 // indirect 76 | github.com/facette/natsort v0.0.0-20181210072756-2cd4dd1e2dcb // indirect 77 | github.com/fatih/color v1.18.0 // indirect 78 | github.com/felixge/httpsnoop v1.0.4 // indirect 79 | github.com/fsnotify/fsnotify v1.8.0 // indirect 80 | github.com/fxamacker/cbor/v2 v2.7.0 // indirect 81 | github.com/ghodss/yaml v1.0.0 // indirect 82 | github.com/go-logr/logr v1.4.2 // indirect 83 | github.com/go-logr/stdr v1.2.2 // indirect 84 | github.com/go-openapi/analysis v0.23.0 // indirect 85 | github.com/go-openapi/errors v0.22.0 // indirect 86 | github.com/go-openapi/jsonpointer v0.21.0 
// indirect 87 | github.com/go-openapi/jsonreference v0.21.0 // indirect 88 | github.com/go-openapi/loads v0.22.0 // indirect 89 | github.com/go-openapi/spec v0.21.0 // indirect 90 | github.com/go-openapi/strfmt v0.23.0 // indirect 91 | github.com/go-openapi/swag v0.23.0 // indirect 92 | github.com/go-openapi/validate v0.24.0 // indirect 93 | github.com/go-resty/resty/v2 v2.16.5 // indirect 94 | github.com/go-viper/mapstructure/v2 v2.3.0 // indirect 95 | github.com/go-zookeeper/zk v1.0.4 // indirect 96 | github.com/gobwas/glob v0.2.3 // indirect 97 | github.com/gogo/protobuf v1.3.2 // indirect 98 | github.com/golang-jwt/jwt/v5 v5.2.2 // indirect 99 | github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect 100 | github.com/golang/protobuf v1.5.4 // indirect 101 | github.com/golang/snappy v1.0.0 // indirect 102 | github.com/google/gnostic-models v0.6.8 // indirect 103 | github.com/google/go-cmp v0.7.0 // indirect 104 | github.com/google/go-github/v73 v73.0.0 // indirect 105 | github.com/google/go-querystring v1.1.0 // indirect 106 | github.com/google/gofuzz v1.2.0 // indirect 107 | github.com/google/pprof v0.0.0-20241210010833-40e02aabc2ad // indirect 108 | github.com/google/s2a-go v0.1.9 // indirect 109 | github.com/google/uuid v1.6.0 // indirect 110 | github.com/googleapis/enterprise-certificate-proxy v0.3.6 // indirect 111 | github.com/googleapis/gax-go/v2 v2.14.1 // indirect 112 | github.com/gookit/color v1.5.4 // indirect 113 | github.com/gophercloud/gophercloud/v2 v2.7.0 // indirect 114 | github.com/gorilla/websocket v1.5.0 // indirect 115 | github.com/grafana/regexp v0.0.0-20240518133315-a468a5bfb3bc // indirect 116 | github.com/hashicorp/consul/api v1.32.0 // indirect 117 | github.com/hashicorp/cronexpr v1.1.2 // indirect 118 | github.com/hashicorp/errwrap v1.1.0 // indirect 119 | github.com/hashicorp/go-cleanhttp v0.5.2 // indirect 120 | github.com/hashicorp/go-hclog v1.6.3 // indirect 121 | github.com/hashicorp/go-immutable-radix v1.3.1 
// indirect 122 | github.com/hashicorp/go-multierror v1.1.1 // indirect 123 | github.com/hashicorp/go-retryablehttp v0.7.8 // indirect 124 | github.com/hashicorp/go-rootcerts v1.0.2 // indirect 125 | github.com/hashicorp/go-version v1.7.0 // indirect 126 | github.com/hashicorp/golang-lru v0.6.0 // indirect 127 | github.com/hashicorp/hcl v1.0.0 // indirect 128 | github.com/hashicorp/hcl/v2 v2.24.0 // indirect 129 | github.com/hashicorp/nomad/api v0.0.0-20241218080744-e3ac00f30eec // indirect 130 | github.com/hashicorp/serf v0.10.1 // indirect 131 | github.com/hetznercloud/hcloud-go/v2 v2.21.0 // indirect 132 | github.com/huandu/xstrings v1.5.0 // indirect 133 | github.com/inconshreveable/mousetrap v1.1.0 // indirect 134 | github.com/ionos-cloud/sdk-go/v6 v6.3.3 // indirect 135 | github.com/jdkato/go-tree-sitter-julia v0.1.0 // indirect 136 | github.com/jdkato/twine v0.10.2 // indirect 137 | github.com/jmespath/go-jmespath v0.4.0 // indirect 138 | github.com/josharian/intern v1.0.0 // indirect 139 | github.com/jpillora/backoff v1.0.0 // indirect 140 | github.com/json-iterator/go v1.1.12 // indirect 141 | github.com/klauspost/compress v1.18.0 // indirect 142 | github.com/knadh/koanf/maps v0.1.2 // indirect 143 | github.com/knadh/koanf/providers/confmap v0.1.0 // indirect 144 | github.com/knadh/koanf/v2 v2.1.2 // indirect 145 | github.com/kolo/xmlrpc v0.0.0-20220921171641-a4b6fa1dd06b // indirect 146 | github.com/kylelemons/godebug v1.1.0 // indirect 147 | github.com/linode/linodego v1.49.0 // indirect 148 | github.com/lithammer/fuzzysearch v1.1.8 // indirect 149 | github.com/magiconair/properties v1.8.7 // indirect 150 | github.com/mailru/easyjson v0.7.7 // indirect 151 | github.com/mattn/go-colorable v0.1.13 // indirect 152 | github.com/mattn/go-isatty v0.0.20 // indirect 153 | github.com/mattn/go-runewidth v0.0.16 // indirect 154 | github.com/mdlayher/socket v0.4.1 // indirect 155 | github.com/mdlayher/vsock v1.2.1 // indirect 156 | github.com/miekg/dns v1.1.65 // 
indirect 157 | github.com/mitchellh/copystructure v1.2.0 // indirect 158 | github.com/mitchellh/go-homedir v1.1.0 // indirect 159 | github.com/mitchellh/go-wordwrap v1.0.1 // indirect 160 | github.com/mitchellh/mapstructure v1.5.0 // indirect 161 | github.com/mitchellh/reflectwalk v1.0.2 // indirect 162 | github.com/moby/docker-image-spec v1.3.1 // indirect 163 | github.com/moby/sys/sequential v0.6.0 // indirect 164 | github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect 165 | github.com/modern-go/reflect2 v1.0.2 // indirect 166 | github.com/montanaflynn/stats v0.7.1 // indirect 167 | github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect 168 | github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect 169 | github.com/niklasfasching/go-org v1.7.0 // indirect 170 | github.com/nsf/jsondiff v0.0.0-20230430225905-43f6cf3098c1 // indirect 171 | github.com/oklog/ulid v1.3.1 // indirect 172 | github.com/oklog/ulid/v2 v2.1.0 // indirect 173 | github.com/olekukonko/tablewriter v0.0.5 // indirect 174 | github.com/open-telemetry/opentelemetry-collector-contrib/internal/exp/metrics v0.124.1 // indirect 175 | github.com/open-telemetry/opentelemetry-collector-contrib/pkg/pdatautil v0.124.1 // indirect 176 | github.com/open-telemetry/opentelemetry-collector-contrib/processor/deltatocumulativeprocessor v0.124.1 // indirect 177 | github.com/opencontainers/go-digest v1.0.0 // indirect 178 | github.com/opencontainers/image-spec v1.0.2 // indirect 179 | github.com/otiai10/copy v1.14.0 // indirect 180 | github.com/ovh/go-ovh v1.7.0 // indirect 181 | github.com/pelletier/go-toml/v2 v2.2.3 // indirect 182 | github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c // indirect 183 | github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e // indirect 184 | github.com/pkg/errors v0.9.1 // indirect 185 | github.com/planetscale/vtprotobuf v0.6.1-0.20240319094008-0393e58bdf10 // indirect 186 | github.com/pmezard/go-difflib 
v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect 187 | github.com/prometheus/alertmanager v0.28.1 // indirect 188 | github.com/prometheus/client_golang v1.22.0 // indirect 189 | github.com/prometheus/client_model v0.6.2 // indirect 190 | github.com/prometheus/common v0.65.0 // indirect 191 | github.com/prometheus/exporter-toolkit v0.14.0 // indirect 192 | github.com/prometheus/otlptranslator v0.0.0-20250320144820-d800c8b0eb07 // indirect 193 | github.com/prometheus/procfs v0.15.1 // indirect 194 | github.com/prometheus/sigv4 v0.1.2 // indirect 195 | github.com/prymitive/current v0.1.1 // indirect 196 | github.com/pterm/pterm v0.12.76 // indirect 197 | github.com/puzpuzpuz/xsync/v3 v3.5.1 // indirect 198 | github.com/remeh/sizedwaitgroup v1.0.0 // indirect 199 | github.com/rivo/uniseg v0.4.7 // indirect 200 | github.com/scaleway/scaleway-sdk-go v1.0.0-beta.33 // indirect 201 | github.com/shopspring/decimal v1.4.0 // indirect 202 | github.com/smacker/go-tree-sitter v0.0.0-20240827094217-dd81d9e9be82 // indirect 203 | github.com/spf13/afero v1.10.0 // indirect 204 | github.com/spf13/cast v1.7.1 // indirect 205 | github.com/spf13/cobra v1.8.0 // indirect 206 | github.com/spf13/jwalterweatherman v1.1.0 // indirect 207 | github.com/spf13/pflag v1.0.5 // indirect 208 | github.com/spf13/viper v1.16.0 // indirect 209 | github.com/stretchr/testify v1.10.0 // indirect 210 | github.com/subosito/gotenv v1.4.2 // indirect 211 | github.com/tomwright/dasel/v2 v2.8.1 // indirect 212 | github.com/urfave/cli/v3 v3.3.8 // indirect 213 | github.com/vultr/govultr/v2 v2.17.2 // indirect 214 | github.com/x448/float16 v0.8.4 // indirect 215 | github.com/xeipuuv/gojsonpointer v0.0.0-20190905194746-02993c407bfb // indirect 216 | github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415 // indirect 217 | github.com/xeipuuv/gojsonschema v1.2.0 // indirect 218 | github.com/xhit/go-str2duration/v2 v2.1.0 // indirect 219 | github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // 
indirect 220 | github.com/yuin/goldmark v1.7.8 // indirect 221 | github.com/zclconf/go-cty v1.16.3 // indirect 222 | github.com/zeitlinger/conflate v0.0.0-20230622100834-279724abda8c // indirect 223 | gitlab.com/gitlab-org/api/client-go v0.137.0 // indirect 224 | go.mongodb.org/mongo-driver v1.14.0 // indirect 225 | go.opencensus.io v0.24.0 // indirect 226 | go.opentelemetry.io/auto/sdk v1.1.0 // indirect 227 | go.opentelemetry.io/collector/component v1.30.0 // indirect 228 | go.opentelemetry.io/collector/confmap v1.30.0 // indirect 229 | go.opentelemetry.io/collector/confmap/xconfmap v0.124.0 // indirect 230 | go.opentelemetry.io/collector/consumer v1.30.0 // indirect 231 | go.opentelemetry.io/collector/featuregate v1.30.0 // indirect 232 | go.opentelemetry.io/collector/internal/telemetry v0.124.0 // indirect 233 | go.opentelemetry.io/collector/pdata v1.30.0 // indirect 234 | go.opentelemetry.io/collector/pipeline v0.124.0 // indirect 235 | go.opentelemetry.io/collector/processor v1.30.0 // indirect 236 | go.opentelemetry.io/collector/semconv v0.124.0 // indirect 237 | go.opentelemetry.io/contrib/bridges/otelzap v0.10.0 // indirect 238 | go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.60.0 // indirect 239 | go.opentelemetry.io/contrib/instrumentation/net/http/httptrace/otelhttptrace v0.60.0 // indirect 240 | go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.60.0 // indirect 241 | go.opentelemetry.io/otel v1.35.0 // indirect 242 | go.opentelemetry.io/otel/log v0.11.0 // indirect 243 | go.opentelemetry.io/otel/metric v1.35.0 // indirect 244 | go.opentelemetry.io/otel/sdk v1.35.0 // indirect 245 | go.opentelemetry.io/otel/trace v1.35.0 // indirect 246 | go.uber.org/atomic v1.11.0 // indirect 247 | go.uber.org/automaxprocs v1.6.0 // indirect 248 | go.uber.org/goleak v1.3.0 // indirect 249 | go.uber.org/multierr v1.11.0 // indirect 250 | go.uber.org/ratelimit v0.3.1 // indirect 251 | go.uber.org/zap v1.27.0 // indirect 252 
| golang.org/x/crypto v0.38.0 // indirect 253 | golang.org/x/exp v0.0.0-20250106191152-7588d65b2ba8 // indirect 254 | golang.org/x/mod v0.24.0 // indirect 255 | golang.org/x/net v0.40.0 // indirect 256 | golang.org/x/oauth2 v0.30.0 // indirect 257 | golang.org/x/sync v0.14.0 // indirect 258 | golang.org/x/sys v0.33.0 // indirect 259 | golang.org/x/term v0.32.0 // indirect 260 | golang.org/x/text v0.25.0 // indirect 261 | golang.org/x/time v0.12.0 // indirect 262 | golang.org/x/tools v0.32.0 // indirect 263 | google.golang.org/api v0.230.0 // indirect 264 | google.golang.org/genproto v0.0.0-20240903143218-8af14fe29dc1 // indirect 265 | google.golang.org/genproto/googleapis/api v0.0.0-20250303144028-a0af3efb3deb // indirect 266 | google.golang.org/genproto/googleapis/rpc v0.0.0-20250414145226-207652e42e2e // indirect 267 | google.golang.org/grpc v1.72.0 // indirect 268 | google.golang.org/protobuf v1.36.6 // indirect 269 | gopkg.in/alecthomas/kingpin.v2 v2.2.6 // indirect 270 | gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect 271 | gopkg.in/inf.v0 v0.9.1 // indirect 272 | gopkg.in/ini.v1 v1.67.0 // indirect 273 | gopkg.in/neurosnap/sentences.v1 v1.0.7 // indirect 274 | gopkg.in/yaml.v2 v2.4.0 // indirect 275 | gopkg.in/yaml.v3 v3.0.1 // indirect 276 | k8s.io/api v0.32.3 // indirect 277 | k8s.io/apimachinery v0.32.3 // indirect 278 | k8s.io/client-go v0.32.3 // indirect 279 | k8s.io/klog/v2 v2.130.1 // indirect 280 | k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect 281 | k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect 282 | sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect 283 | sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect 284 | sigs.k8s.io/yaml v1.4.0 // indirect 285 | ) 286 | 287 | // TODO: This could be removed after https://github.com/mholt/archiver/pull/396 merged 288 | replace github.com/mholt/archiver/v3 => github.com/anchore/archiver/v3 v3.5.2 289 | 
-------------------------------------------------------------------------------- /dashboards_out/kubernetes-autoscaling-mixin-karpenter-act.json: -------------------------------------------------------------------------------- 1 | { 2 | "__inputs": [ ], 3 | "__requires": [ ], 4 | "annotations": { 5 | "list": [ ] 6 | }, 7 | "description": "A dashboard that monitors Karpenter and focuses on Karpenter deletion/creation activity. The dashboards were generated using [kubernetes-autoscaling-mixin](https://github.com/adinhodovic/kubernetes-autoscaling-mixin). Open issues and create feature requests in the repository.", 8 | "editable": true, 9 | "links": [ 10 | { 11 | "asDropdown": true, 12 | "includeVars": false, 13 | "keepTime": true, 14 | "tags": [ 15 | "kubernetes", 16 | "autoscaling", 17 | "kubernetes-autoscaling-mixin" 18 | ], 19 | "targetBlank": true, 20 | "title": "Kubernetes / Autoscaling", 21 | "type": "dashboards" 22 | } 23 | ], 24 | "panels": [ 25 | { 26 | "collapsed": false, 27 | "gridPos": { 28 | "h": 1, 29 | "w": 24, 30 | "x": 0, 31 | "y": 0 32 | }, 33 | "id": 1, 34 | "title": "Node Pool Activity", 35 | "type": "row" 36 | }, 37 | { 38 | "datasource": { 39 | "type": "prometheus", 40 | "uid": "$datasource" 41 | }, 42 | "description": "The number of nodes created by node pool.", 43 | "fieldConfig": { 44 | "defaults": { 45 | "custom": { 46 | "axisSoftMin": 0, 47 | "fillOpacity": 100, 48 | "lineWidth": 1, 49 | "stacking": { 50 | "mode": "normal" 51 | } 52 | }, 53 | "unit": "short" 54 | }, 55 | "overrides": [ ] 56 | }, 57 | "gridPos": { 58 | "h": 6, 59 | "w": 12, 60 | "x": 0, 61 | "y": 1 62 | }, 63 | "id": 2, 64 | "options": { 65 | "legend": { 66 | "calcs": [ 67 | "mean", 68 | "max" 69 | ], 70 | "displayMode": "table", 71 | "placement": "right", 72 | "showLegend": true, 73 | "sortBy": "Mean", 74 | "sortDesc": true 75 | }, 76 | "tooltip": { 77 | "mode": "multi", 78 | "sort": "desc" 79 | } 80 | }, 81 | "pluginVersion": "v11.4.0", 82 | "targets": [ 83 | { 84 | 
"datasource": { 85 | "type": "prometheus", 86 | "uid": "$datasource" 87 | }, 88 | "exemplar": false, 89 | "expr": "round(\n sum(\n increase(\n karpenter_nodes_created_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool)\n)\n", 90 | "legendFormat": "{{ nodepool }}" 91 | } 92 | ], 93 | "title": "Nodes Created by Node Pool", 94 | "type": "timeseries" 95 | }, 96 | { 97 | "datasource": { 98 | "type": "prometheus", 99 | "uid": "$datasource" 100 | }, 101 | "description": "The number of nodes terminated by node pool.", 102 | "fieldConfig": { 103 | "defaults": { 104 | "custom": { 105 | "axisSoftMin": 0, 106 | "fillOpacity": 100, 107 | "lineWidth": 1, 108 | "stacking": { 109 | "mode": "normal" 110 | } 111 | }, 112 | "unit": "short" 113 | }, 114 | "overrides": [ ] 115 | }, 116 | "gridPos": { 117 | "h": 6, 118 | "w": 12, 119 | "x": 12, 120 | "y": 1 121 | }, 122 | "id": 3, 123 | "options": { 124 | "legend": { 125 | "calcs": [ 126 | "mean", 127 | "max" 128 | ], 129 | "displayMode": "table", 130 | "placement": "right", 131 | "showLegend": true, 132 | "sortBy": "Mean", 133 | "sortDesc": true 134 | }, 135 | "tooltip": { 136 | "mode": "multi", 137 | "sort": "desc" 138 | } 139 | }, 140 | "pluginVersion": "v11.4.0", 141 | "targets": [ 142 | { 143 | "datasource": { 144 | "type": "prometheus", 145 | "uid": "$datasource" 146 | }, 147 | "exemplar": false, 148 | "expr": "round(\n sum(\n increase(\n karpenter_nodes_terminated_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool)\n)\n", 149 | "legendFormat": "{{ nodepool }}" 150 | } 151 | ], 152 | "title": "Nodes Terminated by Node Pool", 153 | "type": "timeseries" 154 | }, 155 | { 156 | "datasource": { 157 | "type": "prometheus", 158 | "uid": "$datasource" 159 | }, 160 | "description": "The number of voluntary disruption decisions by reason and decision.", 161 | "fieldConfig": { 162 | "defaults": { 163 | 
"custom": { 164 | "axisSoftMin": 0, 165 | "fillOpacity": 100, 166 | "lineWidth": 1, 167 | "stacking": { 168 | "mode": "normal" 169 | } 170 | }, 171 | "unit": "short" 172 | }, 173 | "overrides": [ ] 174 | }, 175 | "gridPos": { 176 | "h": 6, 177 | "w": 12, 178 | "x": 0, 179 | "y": 7 180 | }, 181 | "id": 4, 182 | "options": { 183 | "legend": { 184 | "calcs": [ 185 | "mean", 186 | "max" 187 | ], 188 | "displayMode": "table", 189 | "placement": "right", 190 | "showLegend": true, 191 | "sortBy": "Mean", 192 | "sortDesc": true 193 | }, 194 | "tooltip": { 195 | "mode": "multi", 196 | "sort": "desc" 197 | } 198 | }, 199 | "pluginVersion": "v11.4.0", 200 | "targets": [ 201 | { 202 | "datasource": { 203 | "type": "prometheus", 204 | "uid": "$datasource" 205 | }, 206 | "exemplar": false, 207 | "expr": "round(\n sum(\n increase(\n karpenter_voluntary_disruption_decisions_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }[$__rate_interval]\n )\n ) by (decision, reason)\n)\n", 208 | "legendFormat": "{{ decision }} - {{ reason }}" 209 | } 210 | ], 211 | "title": "Node Disruption Decisions by Reason and Decision", 212 | "type": "timeseries" 213 | }, 214 | { 215 | "datasource": { 216 | "type": "prometheus", 217 | "uid": "$datasource" 218 | }, 219 | "description": "The number of nodes eligible for voluntary disruption by reason.", 220 | "fieldConfig": { 221 | "defaults": { 222 | "custom": { 223 | "axisSoftMin": 0, 224 | "fillOpacity": 100, 225 | "lineWidth": 1, 226 | "stacking": { 227 | "mode": "normal" 228 | } 229 | }, 230 | "unit": "short" 231 | }, 232 | "overrides": [ ] 233 | }, 234 | "gridPos": { 235 | "h": 6, 236 | "w": 12, 237 | "x": 12, 238 | "y": 7 239 | }, 240 | "id": 5, 241 | "options": { 242 | "legend": { 243 | "calcs": [ 244 | "mean", 245 | "max" 246 | ], 247 | "displayMode": "table", 248 | "placement": "right", 249 | "showLegend": true, 250 | "sortBy": "Mean", 251 | "sortDesc": true 252 | }, 253 | "tooltip": { 254 | "mode": "multi", 255 | "sort": "desc" 256 | } 257 | }, 
258 | "pluginVersion": "v11.4.0", 259 | "targets": [ 260 | { 261 | "datasource": { 262 | "type": "prometheus", 263 | "uid": "$datasource" 264 | }, 265 | "exemplar": false, 266 | "expr": "round(\n sum(\n karpenter_voluntary_disruption_eligible_nodes{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }\n ) by (reason)\n)\n", 267 | "legendFormat": "{{ reason }}" 268 | } 269 | ], 270 | "title": "Nodes Eligible for Disruption by Reason", 271 | "type": "timeseries" 272 | }, 273 | { 274 | "datasource": { 275 | "type": "prometheus", 276 | "uid": "$datasource" 277 | }, 278 | "description": "The number of nodes disrupted by node pool, capacity type, and reason.", 279 | "fieldConfig": { 280 | "defaults": { 281 | "custom": { 282 | "axisSoftMin": 0, 283 | "fillOpacity": 100, 284 | "lineWidth": 1, 285 | "stacking": { 286 | "mode": "normal" 287 | } 288 | }, 289 | "unit": "short" 290 | }, 291 | "overrides": [ ] 292 | }, 293 | "gridPos": { 294 | "h": 6, 295 | "w": 24, 296 | "x": 0, 297 | "y": 13 298 | }, 299 | "id": 6, 300 | "options": { 301 | "legend": { 302 | "calcs": [ 303 | "mean", 304 | "max" 305 | ], 306 | "displayMode": "table", 307 | "placement": "right", 308 | "showLegend": true, 309 | "sortBy": "Mean", 310 | "sortDesc": true 311 | }, 312 | "tooltip": { 313 | "mode": "multi", 314 | "sort": "desc" 315 | } 316 | }, 317 | "pluginVersion": "v11.4.0", 318 | "targets": [ 319 | { 320 | "datasource": { 321 | "type": "prometheus", 322 | "uid": "$datasource" 323 | }, 324 | "exemplar": false, 325 | "expr": "round(\n sum(\n increase(\n karpenter_nodeclaims_disrupted_total{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n nodepool=~\"$nodepool\"\n }[$__rate_interval]\n )\n ) by (nodepool, capacity_type, reason)\n)\n", 326 | "legendFormat": "{{ nodepool }} - {{ capacity_type }} - {{ reason }}" 327 | } 328 | ], 329 | "title": "Nodes Disrupted by Node Pool", 330 | "type": "timeseries" 331 | }, 332 | { 333 | "collapsed": false, 334 | "gridPos": { 335 | "h": 1, 336 | "w": 24, 337 | "x": 0, 338 | "y": 
19 339 | }, 340 | "id": 7, 341 | "title": "Pod Activity", 342 | "type": "row" 343 | }, 344 | { 345 | "datasource": { 346 | "type": "prometheus", 347 | "uid": "$datasource" 348 | }, 349 | "description": "The number of pods by phase.", 350 | "fieldConfig": { 351 | "defaults": { 352 | "custom": { 353 | "axisSoftMin": 0, 354 | "fillOpacity": 100, 355 | "lineWidth": 1, 356 | "stacking": { 357 | "mode": "normal" 358 | } 359 | }, 360 | "unit": "short" 361 | }, 362 | "overrides": [ ] 363 | }, 364 | "gridPos": { 365 | "h": 6, 366 | "w": 12, 367 | "x": 0, 368 | "y": 20 369 | }, 370 | "id": 8, 371 | "options": { 372 | "legend": { 373 | "calcs": [ 374 | "mean", 375 | "max" 376 | ], 377 | "displayMode": "table", 378 | "placement": "right", 379 | "showLegend": true, 380 | "sortBy": "Mean", 381 | "sortDesc": true 382 | }, 383 | "tooltip": { 384 | "mode": "multi", 385 | "sort": "desc" 386 | } 387 | }, 388 | "pluginVersion": "v11.4.0", 389 | "targets": [ 390 | { 391 | "datasource": { 392 | "type": "prometheus", 393 | "uid": "$datasource" 394 | }, 395 | "exemplar": false, 396 | "expr": "round(\n sum(\n karpenter_pods_state{\n cluster=\"$cluster\",\njob=~\"$job\"\n\n }\n ) by (phase)\n)\n", 397 | "legendFormat": "{{ phase }}" 398 | } 399 | ], 400 | "title": "Pods by Phase", 401 | "type": "timeseries" 402 | }, 403 | { 404 | "datasource": { 405 | "type": "prometheus", 406 | "uid": "$datasource" 407 | }, 408 | "description": "The duration for pods to start up.", 409 | "fieldConfig": { 410 | "defaults": { 411 | "custom": { 412 | "fillOpacity": 0 413 | }, 414 | "unit": "s" 415 | }, 416 | "overrides": [ ] 417 | }, 418 | "gridPos": { 419 | "h": 6, 420 | "w": 12, 421 | "x": 12, 422 | "y": 20 423 | }, 424 | "id": 9, 425 | "options": { 426 | "legend": { 427 | "calcs": [ 428 | "mean", 429 | "max" 430 | ], 431 | "displayMode": "table", 432 | "placement": "right", 433 | "showLegend": true, 434 | "sortBy": "Mean", 435 | "sortDesc": true 436 | }, 437 | "tooltip": { 438 | "mode": "multi", 439 | 
"sort": "desc" 440 | } 441 | }, 442 | "pluginVersion": "v11.4.0", 443 | "targets": [ 444 | { 445 | "datasource": { 446 | "type": "prometheus", 447 | "uid": "$datasource" 448 | }, 449 | "exemplar": false, 450 | "expr": "max(\n karpenter_pods_startup_duration_seconds{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n quantile=\"0.5\"\n }\n)\n", 451 | "legendFormat": "P50" 452 | }, 453 | { 454 | "datasource": { 455 | "type": "prometheus", 456 | "uid": "$datasource" 457 | }, 458 | "exemplar": false, 459 | "expr": "max(\n karpenter_pods_startup_duration_seconds{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n quantile=\"0.95\"\n }\n)\n", 460 | "legendFormat": "P95" 461 | }, 462 | { 463 | "datasource": { 464 | "type": "prometheus", 465 | "uid": "$datasource" 466 | }, 467 | "exemplar": false, 468 | "expr": "max(\n karpenter_pods_startup_duration_seconds{\n cluster=\"$cluster\",\njob=~\"$job\"\n,\n quantile=\"0.99\"\n }\n)\n", 469 | "legendFormat": "P99" 470 | } 471 | ], 472 | "title": "Pods Startup Duration", 473 | "type": "timeseries" 474 | } 475 | ], 476 | "schemaVersion": 39, 477 | "tags": [ 478 | "kubernetes", 479 | "autoscaling", 480 | "kubernetes-autoscaling-mixin", 481 | "karpenter" 482 | ], 483 | "templating": { 484 | "list": [ 485 | { 486 | "current": { 487 | "selected": true, 488 | "text": "default", 489 | "value": "default" 490 | }, 491 | "label": "Data source", 492 | "name": "datasource", 493 | "query": "prometheus", 494 | "type": "datasource" 495 | }, 496 | { 497 | "datasource": { 498 | "type": "prometheus", 499 | "uid": "${datasource}" 500 | }, 501 | "hide": 2, 502 | "label": "Cluster", 503 | "name": "cluster", 504 | "query": "label_values(kube_pod_info{job=~\"kube-state-metrics\"}, cluster)", 505 | "refresh": 2, 506 | "sort": 1, 507 | "type": "query" 508 | }, 509 | { 510 | "datasource": { 511 | "type": "prometheus", 512 | "uid": "${datasource}" 513 | }, 514 | "label": "Job", 515 | "name": "job", 516 | "query": 
"label_values(karpenter_nodes_allocatable{cluster=\"$cluster\"}, job)", 517 | "refresh": 2, 518 | "sort": 1, 519 | "type": "query" 520 | }, 521 | { 522 | "datasource": { 523 | "type": "prometheus", 524 | "uid": "${datasource}" 525 | }, 526 | "includeAll": true, 527 | "label": "Node Pool", 528 | "multi": true, 529 | "name": "nodepool", 530 | "query": "label_values(karpenter_nodes_allocatable{cluster=\"$cluster\", job=~\"$job\"}, nodepool)", 531 | "refresh": 2, 532 | "sort": 1, 533 | "type": "query" 534 | } 535 | ] 536 | }, 537 | "time": { 538 | "from": "now-24h", 539 | "to": "now" 540 | }, 541 | "timezone": "utc", 542 | "title": "Kubernetes / Autoscaling / Karpenter / Activity", 543 | "uid": "kubernetes-autoscaling-mixin-kact-jkwq" 544 | } 545 | -------------------------------------------------------------------------------- /dashboards/karpenter/karpenter-performance.libsonnet: --------------------------------------------------------------------------------
// Grafana dashboard: Kubernetes / Autoscaling / Karpenter / Performance.
// Covers cluster-state sync, cloud-provider errors, node-termination and
// pod-startup latency, the interruption queue, the controller work queue,
// and controller reconcile rates/results.
local mixinUtils = import 'github.com/adinhodovic/mixin-utils/utils.libsonnet';
local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
local util = import 'util.libsonnet';

local dashboard = g.dashboard;
local row = g.panel.row;
local grid = g.util.grid;

// Stat panel helpers
local stat = g.panel.stat;
local stStandardOptions = stat.standardOptions;

{
  grafanaDashboards+:: {
    'kubernetes-autoscaling-mixin-karpenter-perf.json':
      // Only emit the dashboard when Karpenter support is enabled in the config.
      if !$._config.karpenter.enabled then {} else

        local defaultVariables = util.variables($._config);

        local variables = [
          defaultVariables.datasource,
          defaultVariables.cluster,
          defaultVariables.job,
        ];

        // `%(base)s` expands to the shared cluster/job label selector.
        local defaultFilters = util.filters($._config);
        local queries = {
          // Summary
          clusterStateSynced: |||
            sum(
              karpenter_cluster_state_synced{
                %(base)s
              }
            ) by (job)
          ||| % defaultFilters,

          clusterStateNodeCount: |||
            sum(
              karpenter_cluster_state_node_count{
                %(base)s
              }
            ) by (job)
          ||| % defaultFilters,

          cloudProviderErrors: |||
            round(
              sum(
                increase(
                  karpenter_cloudprovider_errors_total{
                    %(base)s
                  }[$__rate_interval]
                )
              ) by (job, provider, controller, method, error)
            )
          ||| % defaultFilters,

          // Node Termination -- quantiles come from Karpenter's pre-computed
          // summary metric (`quantile` label), max() picks the worst series.
          nodeTerminationP50Duration: |||
            max(
              karpenter_nodes_termination_duration_seconds{
                %(base)s,
                quantile="0.5"
              }
            )
          ||| % defaultFilters,

          nodeTerminationP95Duration: |||
            max(
              karpenter_nodes_termination_duration_seconds{
                %(base)s,
                quantile="0.95"
              }
            )
          ||| % defaultFilters,

          nodeTerminationP99Duration: |||
            max(
              karpenter_nodes_termination_duration_seconds{
                %(base)s,
                quantile="0.99"
              }
            )
          ||| % defaultFilters,

          // Pod Startup
          podsStartupP50Duration: |||
            max(
              karpenter_pods_startup_duration_seconds{
                %(base)s,
                quantile="0.5"
              }
            )
          ||| % defaultFilters,

          podsStartupP95Duration: |||
            max(
              karpenter_pods_startup_duration_seconds{
                %(base)s,
                quantile="0.95"
              }
            )
          ||| % defaultFilters,

          podsStartupP99Duration: |||
            max(
              karpenter_pods_startup_duration_seconds{
                %(base)s,
                quantile="0.99"
              }
            )
          ||| % defaultFilters,

          // Interruption Queue
          interruptionReceivedMessages: |||
            sum(
              increase(
                karpenter_interruption_received_messages_total{
                  %(base)s
                }[$__rate_interval]
              )
            ) by (job, message_type)
          ||| % defaultFilters,

          interruptionDeletedMessages: |||
            sum(
              increase(
                karpenter_interruption_deleted_messages_total{
                  %(base)s
                }[$__rate_interval]
              )
            ) by (job)
          ||| % defaultFilters,

          // `> 0` drops zero-rate buckets before histogram_quantile.
          // (Renamed from the misspelled `interuption*` keys.)
          interruptionDurationP50: |||
            histogram_quantile(0.50,
              sum(
                irate(
                  karpenter_interruption_message_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          interruptionDurationP95: |||
            histogram_quantile(0.95,
              sum(
                irate(
                  karpenter_interruption_message_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          interruptionDurationP99: |||
            histogram_quantile(0.99,
              sum(
                irate(
                  karpenter_interruption_message_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          // Work Queue
          workQueueDepth: |||
            sum(
              karpenter_workqueue_depth{
                %(base)s
              }
            ) by (job)
          ||| % defaultFilters,

          workQueueInQueueDurationP50: |||
            histogram_quantile(0.50,
              sum(
                irate(
                  karpenter_workqueue_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          workQueueInQueueDurationP95: |||
            histogram_quantile(0.95,
              sum(
                irate(
                  karpenter_workqueue_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          workQueueInQueueDurationP99: |||
            histogram_quantile(0.99,
              sum(
                irate(
                  karpenter_workqueue_queue_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          workQueueWorkDurationP50: |||
            histogram_quantile(0.50,
              sum(
                irate(
                  karpenter_workqueue_work_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          workQueueWorkDurationP95: |||
            histogram_quantile(0.95,
              sum(
                irate(
                  karpenter_workqueue_work_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          workQueueWorkDurationP99: |||
            histogram_quantile(0.99,
              sum(
                irate(
                  karpenter_workqueue_work_duration_seconds_bucket{
                    %(base)s
                  }[$__rate_interval]
                ) > 0
              ) by (job, le)
            )
          ||| % defaultFilters,

          // Controller
          controllerReconcile: |||
            sum(
              irate(
                controller_runtime_reconcile_total{
                  %(base)s
                }[$__rate_interval]
              )
            ) by (job, controller)
          ||| % defaultFilters,

          controllerResult: |||
            sum(
              irate(
                controller_runtime_reconcile_total{
                  %(base)s
                }[$__rate_interval]
              )
            ) by (job, result)
          ||| % defaultFilters,
        };

        local panels = {
          // Summary
          clusterStateSyncedStat:
            mixinUtils.dashboards.statPanel(
              'Cluster State Synced',
              'short',
              queries.clusterStateSynced,
              description='Indicates whether the cluster state is synced.',
              steps=[
                stStandardOptions.threshold.step.withValue(0) +
                stStandardOptions.threshold.step.withColor('red'),
                stStandardOptions.threshold.step.withValue(0.1) +
                stStandardOptions.threshold.step.withColor('green'),
              ],
              // Render the 0/1 gauge as a human-readable Yes/No.
              mappings=[
                stStandardOptions.mapping.ValueMap.withType() +
                stStandardOptions.mapping.ValueMap.withOptions(
                  {
                    '0': { text: 'No', color: 'red' },
                    '1': { text: 'Yes', color: 'green' },
                  }
                ),
              ],
            ),

          clusterStateNodeCountStat:
            mixinUtils.dashboards.statPanel(
              'Cluster State Node Count',
              'short',
              queries.clusterStateNodeCount,
              description='The number of nodes in the cluster state.',
              steps=[
                stStandardOptions.threshold.step.withValue(0) +
                stStandardOptions.threshold.step.withColor('red'),
                stStandardOptions.threshold.step.withValue(0.1) +
                stStandardOptions.threshold.step.withColor('green'),
              ],
            ),

          cloudProviderErrorsTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Cloud Provider Errors',
              'short',
              queries.cloudProviderErrors,
              '{{ provider }} - {{ controller }} - {{ method }} - {{ error }}',
              description='The number of cloud provider errors over time.',
            ),

          // Node Termination & Pod Startup
          nodeTerminationDurationTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Node Termination Duration',
              's',
              [
                {
                  expr: queries.nodeTerminationP50Duration,
                  legend: 'P50',
                },
                {
                  expr: queries.nodeTerminationP95Duration,
                  legend: 'P95',
                },
                {
                  expr: queries.nodeTerminationP99Duration,
                  legend: 'P99',
                },
              ],
              description='The duration to terminate nodes.',
            ),

          podStartupDurationTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Pods Startup Duration',
              's',
              [
                {
                  expr: queries.podsStartupP50Duration,
                  legend: 'P50',
                },
                {
                  expr: queries.podsStartupP95Duration,
                  legend: 'P95',
                },
                {
                  expr: queries.podsStartupP99Duration,
                  legend: 'P99',
                },
              ],
              description='The duration for pods to start up.',
            ),

          // Interruption Queue
          interruptionReceivedMessagesTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Interruption Received Messages',
              'short',
              queries.interruptionReceivedMessages,
              '{{ message_type }}',
              description='The number of interruption messages received.',
            ),

          interruptionDeletedMessagesTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Interruption Deleted Messages',
              'short',
              queries.interruptionDeletedMessages,
              'Deleted Messages',
              description='The number of interruption messages deleted.',
            ),

          interruptionDurationTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Interruption Duration',
              's',
              [
                {
                  expr: queries.interruptionDurationP50,
                  legend: 'P50',
                },
                {
                  expr: queries.interruptionDurationP95,
                  legend: 'P95',
                },
                {
                  expr: queries.interruptionDurationP99,
                  legend: 'P99',
                },
              ],
              description='The duration for interruption message processing.',
            ),

          // Work Queue
          workQueueDepthTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Work Queue Depth',
              'short',
              queries.workQueueDepth,
              'Queue Depth',
              description='The depth of the work queue.',
            ),

          workQueueInQueueDurationTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Work Queue In Queue Duration',
              's',
              [
                {
                  expr: queries.workQueueInQueueDurationP50,
                  legend: 'P50',
                },
                {
                  expr: queries.workQueueInQueueDurationP95,
                  legend: 'P95',
                },
                {
                  expr: queries.workQueueInQueueDurationP99,
                  legend: 'P99',
                },
              ],
              description='The duration items spend in the work queue.',
            ),

          workQueueWorkDurationTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Work Queue Work Duration',
              's',
              [
                {
                  expr: queries.workQueueWorkDurationP50,
                  legend: 'P50',
                },
                {
                  expr: queries.workQueueWorkDurationP95,
                  legend: 'P95',
                },
                {
                  expr: queries.workQueueWorkDurationP99,
                  legend: 'P99',
                },
              ],
              description='The duration to process work queue items.',
            ),

          // Controller
          controllerReconcileTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Controller Reconcile',
              'ops',
              queries.controllerReconcile,
              '{{ controller }}',
              // Fixed wording (was 'The ops of controller reconciliation.').
              description='The rate of controller reconciliations.',
              stack='normal'
            ),

          controllerResultTimeSeries:
            mixinUtils.dashboards.timeSeriesPanel(
              'Controller Result',
              'ops',
              queries.controllerResult,
              '{{ result }}',
              description='The result of controller reconciliations.',
              stack='normal'
            ),
        };

        local rows =
          [
            row.new('Summary') +
            row.gridPos.withX(0) +
            row.gridPos.withY(0) +
            row.gridPos.withW(24) +
            row.gridPos.withH(1),
          ] +
          grid.makeGrid(
            [
              panels.clusterStateSyncedStat,
              panels.clusterStateNodeCountStat,
            ],
            panelWidth=3,
            panelHeight=6,
            startY=1
          ) +
          [
            panels.cloudProviderErrorsTimeSeries +
            // The two stat panels above occupy columns 0-6; start at x=6 so
            // 6 + 18 = 24 fills the row. (Was x=12, overflowing the 24-column
            // Grafana grid: 12 + 18 = 30.)
            row.gridPos.withX(6) +
            row.gridPos.withY(1) +
            row.gridPos.withW(18) +
            row.gridPos.withH(6),
          ] +
          grid.makeGrid(
            [
              panels.nodeTerminationDurationTimeSeries,
              panels.podStartupDurationTimeSeries,
            ],
            panelWidth=12,
            panelHeight=6,
            startY=7
          ) +
          [
            row.new('Interruption Queue') +
            row.gridPos.withX(0) +
            row.gridPos.withY(13) +
            row.gridPos.withW(24) +
            row.gridPos.withH(1),
          ] +
          grid.makeGrid(
            [
              panels.interruptionReceivedMessagesTimeSeries,
              panels.interruptionDeletedMessagesTimeSeries,
              panels.interruptionDurationTimeSeries,
            ],
            panelWidth=8,
            panelHeight=6,
            startY=14
          ) +
          [
            row.new('Work Queue') +
            row.gridPos.withX(0) +
            row.gridPos.withY(20) +
            row.gridPos.withW(24) +
            row.gridPos.withH(1),
          ] +
          grid.makeGrid(
            [
              panels.workQueueDepthTimeSeries,
              panels.workQueueInQueueDurationTimeSeries,
              panels.workQueueWorkDurationTimeSeries,
            ],
            panelWidth=8,
            panelHeight=6,
            startY=21
          ) +
          [
            row.new('Controller') +
            row.gridPos.withX(0) +
            row.gridPos.withY(27) +
            row.gridPos.withW(24) +
            row.gridPos.withH(1),
          ] +
          grid.makeGrid(
            [
              panels.controllerReconcileTimeSeries,
              panels.controllerResultTimeSeries,
            ],
            panelWidth=24,
            panelHeight=8,
            startY=28
          );

        mixinUtils.dashboards.bypassDashboardValidation +
        dashboard.new(
          'Kubernetes / Autoscaling / Karpenter / Performance',
        ) +
        dashboard.withDescription('A dashboard that monitors Karpenter performance metrics. %s' % mixinUtils.dashboards.dashboardDescriptionLink('kubernetes-autoscaling-mixin', 'https://github.com/adinhodovic/kubernetes-autoscaling-mixin')) +
        dashboard.withUid($._config.karpenterPerformanceDashboardUid) +
        dashboard.withTags($._config.tags + ['karpenter']) +
        dashboard.withTimezone('utc') +
        dashboard.withEditable(true) +
        dashboard.time.withFrom('now-6h') +
        dashboard.time.withTo('now') +
        dashboard.withVariables(variables) +
        dashboard.withLinks(
          mixinUtils.dashboards.dashboardLinks('Kubernetes / Autoscaling', $._config, dropdown=true)
        ) +
        dashboard.withPanels(
          rows
        ) +
        dashboard.withAnnotations(
          mixinUtils.dashboards.annotations($._config, defaultFilters)
        ),
  },
}
--------------------------------------------------------------------------------