├── .nancy-ignore ├── test ├── hack │ ├── .gitignore │ └── bin │ │ ├── .gitignore │ │ ├── get-inhibition.sh │ │ ├── template-chart.sh │ │ └── run-pint.sh ├── conf │ ├── providers │ ├── pint │ │ ├── pint-config.hcl │ │ └── pint-all.hcl │ └── promtool_ignore └── tests │ ├── .gitignore │ └── providers │ ├── global │ ├── platform │ │ ├── honeybadger │ │ │ └── alerting-rules │ │ │ │ ├── helm-operations.rules.test.yml │ │ │ │ ├── flux.rules.test.yml │ │ │ │ ├── zot.rules.test.yml │ │ │ │ ├── crsync.rules.test.yml │ │ │ │ └── konfigure-operator.rules.test.yml │ │ ├── shield │ │ │ └── alerting-rules │ │ │ │ ├── general.rules.test.yml │ │ │ │ └── cert-manager.rules.test.yml │ │ └── atlas │ │ │ └── alerting-rules │ │ │ ├── silence-operator.rules.test.yml │ │ │ ├── flux-atlas.rules.test.yml │ │ │ ├── sloth.rules.test.yml │ │ │ ├── logging-pipeline.rules.test.yml │ │ │ └── statefulset.rules.test.yml │ └── kaas │ │ └── tenet │ │ └── alerting-rules │ │ ├── capi-machineset.rules.test.yml │ │ ├── certificate.management-cluster.rules.test.yml │ │ ├── pods.rules.test.yml │ │ ├── capi-machinepool.rules.test.yml │ │ ├── capi-machinedeployment.rules.test.yml │ │ └── capi-machine.rules.test.yml │ ├── capa │ └── kaas │ │ └── phoenix │ │ └── alerting-rules │ │ ├── cluster-crossplane.rules.test.yml │ │ └── capa.inhibition.rules.test.yml │ └── capz │ └── kaas │ └── phoenix │ └── alerting-rules │ └── dns-operator-azure.rules.test.yml ├── loki ├── .gitignore ├── mixin.libsonnet └── update.sh ├── mimir ├── .gitignore ├── update.sh └── mixin.libsonnet ├── .abs └── main.yaml ├── assets └── inhibition-graph.png ├── .nancy-ignore.generated ├── SECURITY.md ├── .cursor └── rules │ └── alert-editing.mdc ├── helm └── prometheus-rules │ ├── values.yaml │ ├── Chart.yaml │ ├── templates │ ├── platform │ │ ├── honeybadger │ │ │ ├── recording-rules │ │ │ │ └── helm-operations.rules.yml │ │ │ └── alerting-rules │ │ │ │ ├── secret.rules.yml │ │ │ │ ├── helm.rules.yml │ │ │ │ ├── external-secrets.rules.yml │ │ │ │ ├── crossplane.rules.yml │ │ │ │ ├── chart.rules.yml │ │ │ │ ├── konfigure-operator.rules.yml │ │ │ │ └── zot.rules.yml │ │ ├── atlas │ │ │ ├── alerting-rules │ │ │ │ ├── inhibit.oncall.rules.yml │ │ │ │ ├── teleport.logs.yml │ │ │ │ ├── mimir.logs.yml │ │ │ │ ├── statefulset.rules.yml │ │ │ │ ├── storage.rules.yml │ │ │ │ ├── flux-atlas.rules.yml │ │ │ │ ├── app-configuration.rules.yml │ │ │ │ ├── sloth.rules.yml │ │ │ │ ├── logging-pipeline.rules.yml │ │ │ │ ├── silence-operator.rules.yml │ │ │ │ ├── keda.rules.yml │ │ │ │ ├── tracing-pipeline.rules.yml │ │ │ │ └── fluentbit.rules.yml │ │ │ └── recording-rules │ │ │ │ └── monitoring.resource-usage-estimation.rules.yml │ │ ├── shield │ │ │ └── alerting-rules │ │ │ │ ├── general.rules.yml │ │ │ │ ├── dex.logs.yml │ │ │ │ ├── cert-manager.rules.yml │ │ │ │ ├── dex.rules.yml │ │ │ │ └── falco.rules.yml │ │ └── cabbage │ │ │ ├── alerting-rules │ │ │ ├── dns.rules.yml │ │ │ ├── kong.rules.yml │ │ │ └── external-dns.rules.yml │ │ │ └── recording-rules │ │ │ └── gs-managed-app-deployment-status.rules.yml │ ├── kaas │ │ ├── tenet │ │ │ └── alerting-rules │ │ │ │ ├── inhibit.kubelet.rules.yml │ │ │ │ ├── net-exporter.rules.yml │ │ │ │ ├── job.rules.yml │ │ │ │ ├── timesync.rules.yml │ │ │ │ ├── inhibit.nodes.rules.yml │ │ │ │ ├── inhibit.capi.rules.yml │ │ │ │ ├── node.memory-pressure.rules.yml │ │ │ │ ├── capi-kubeadmconfig.rules.yml │ │ │ │ ├── capi-machineset.rules.yml │ │ │ │ ├── cluster-autoscaler.rules.yml │ │ │ │ ├── systemd.rules.yml │ │ │ │ ├── fairness.rules.yml │ │ │ │ 
├── pods.core.rules.yml │ │ │ │ ├── certificate.workload-cluster.rules.yml │ │ │ │ ├── pods.rules.yml │ │ │ │ ├── node-exporter.rules.yml │ │ │ │ ├── capi-machine.rules.yml │ │ │ │ ├── capi-machinepool.rules.yml │ │ │ │ ├── capi-machinedeployment.rules.yml │ │ │ │ ├── etcdbackup.rules.yml │ │ │ │ ├── apiserver.management-cluster.rules.yml │ │ │ │ ├── certificate.management-cluster.rules.yml │ │ │ │ ├── capi-kubeadmcontrolplane.rules.yml │ │ │ │ └── vertical-pod-autoscaler.rules.yml │ │ ├── phoenix │ │ │ └── alerting-rules │ │ │ │ ├── irsa.rules.yml │ │ │ │ ├── aws.node.workload-cluster.rules.yml │ │ │ │ ├── nodes.cidrnotavailable.events.logs.yml │ │ │ │ ├── capa.inhibition.rules.yml │ │ │ │ ├── aws-load-balancer-controller.rules.yml │ │ │ │ ├── dns-operator-azure.rules.yml │ │ │ │ ├── aws.workload-cluster.rules.yml │ │ │ │ ├── cloud-provider-controller.rules.yml │ │ │ │ └── cluster-crossplane.rules.yml │ │ └── rocket │ │ │ └── alerting-rules │ │ │ └── blackbox-exporter.cloud-provider-api.rules.yml │ └── _helpers.tpl │ └── values.schema.json ├── .github ├── workflows │ ├── zz_generated.gitleaks.yaml │ ├── zz_generated.validate_changelog.yaml │ ├── zz_generated.check_values_schema.yaml │ ├── zz_generated.run_ossf_scorecard.yaml │ ├── zz_generated.create_release_pr.yaml │ ├── alert_tests.yaml │ ├── update-tempo-mixins.yml │ ├── update-loki-mixins.yml │ ├── update-mimir-mixins.yml │ └── zz_generated.add-team-labels.yaml └── pull_request_template.md ├── CODEOWNERS ├── Makefile ├── scripts ├── find-alerts.sh └── sync-kube-mixin.sh ├── DCO ├── Makefile.gen.app.mk ├── Makefile.custom.mk ├── renovate.json5 └── .circleci └── config.yml /.nancy-ignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/hack/.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | checkLabels/alertmanager.yaml 3 | -------------------------------------------------------------------------------- /loki/.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | dashboards_out/ 3 | jsonnetfile.* 4 | 5 | -------------------------------------------------------------------------------- /mimir/.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | dashboards_out/ 3 | jsonnetfile.* 4 | 5 | -------------------------------------------------------------------------------- /.abs/main.yaml: -------------------------------------------------------------------------------- 1 | replace-chart-version-with-git: true 2 | generate-metadata: true 3 | -------------------------------------------------------------------------------- /test/conf/providers: -------------------------------------------------------------------------------- 1 | capa 2 | capz 3 | cloud-director 4 | proxmox 5 | vsphere 6 | -------------------------------------------------------------------------------- /assets/inhibition-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/giantswarm/prometheus-rules/HEAD/assets/inhibition-graph.png -------------------------------------------------------------------------------- /test/tests/.gitignore: -------------------------------------------------------------------------------- 1 | **/*.rules.yml 2 | **/*.rules.yaml 3 | **/*.rules.test.yml_global 4 | **/*.rules.test.yaml_global 5 | 
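Note on the test layout above: the test/tests/providers tree mirrors helm/prometheus-rules/templates, and every *.rules.test.yml is a promtool unit test whose rule_files entry points at the rendered rule file of the same base name — which is why generated *.rules.yml files are git-ignored inside test/tests. The repository's own runner is test/hack/bin/verify-rules.sh (not shown in this dump); the sketch below only illustrates the idea by hand, and the rendered-output paths and the test/hack/bin/promtool location are assumptions inferred from template-chart.sh and the test/hack/bin/.gitignore, not the real runner.

#!/bin/bash
# Hand-run sketch of a single promtool unit test. Paths under test/hack/output/
# and the promtool binary location are assumptions, not the actual runner logic.
set -euo pipefail

GIT_WORKDIR="$(git rev-parse --show-toplevel)"
provider="capa"

# 1. Render the chart for one provider, using the same flags as template-chart.sh.
helm template "$GIT_WORKDIR/helm/prometheus-rules" \
  --set="managementCluster.provider.flavor=capi" \
  --set="managementCluster.provider.kind=$provider" \
  --set="managementCluster.name=myinstall" \
  --set="managementCluster.pipeline=stable" \
  --output-dir "$GIT_WORKDIR/test/hack/output/helm-chart/$provider"

# 2. The rendered file is a PrometheusRule CR; promtool expects plain rule groups,
#    so extract .spec and place it next to the unit test (promtool resolves
#    rule_files relative to the test file -- hence the *.rules.yml gitignore above).
rendered="$GIT_WORKDIR/test/hack/output/helm-chart/$provider/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-machineset.rules.yml"
tests_dir="$GIT_WORKDIR/test/tests/providers/global/kaas/tenet/alerting-rules"
"$GIT_WORKDIR/test/hack/bin/yq" '.spec' "$rendered" > "$tests_dir/capi-machineset.rules.yml"

# 3. Evaluate the test cases (input_series, alert_rule_test) against the rules.
"$GIT_WORKDIR/test/hack/bin/promtool" test rules "$tests_dir/capi-machineset.rules.test.yml"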
-------------------------------------------------------------------------------- /.nancy-ignore.generated: -------------------------------------------------------------------------------- 1 | # This file is generated by https://github.com/giantswarm/github 2 | # Repository specific ignores should be added to .nancy-ignore 3 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | Please visit for information on reporting security issues. 6 | -------------------------------------------------------------------------------- /test/hack/bin/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except the following 4 | !.gitignore 5 | !verify-rules.sh 6 | !fetch-tools.sh 7 | !template-chart.sh 8 | !get-inhibition.sh 9 | !check-runbooks.sh 10 | -------------------------------------------------------------------------------- /.cursor/rules/alert-editing.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | globs: **/*.rules.yml 3 | alwaysApply: false 4 | --- 5 | # Rules for editing alerting rules 6 | 7 | - **Update tests:** 8 | - When modifying an alerting rule, check if there are any tests for the rule. This is best done by grepping for the alert name. 9 | - If a runbook URL is changed, the according test must be updated. 10 | -------------------------------------------------------------------------------- /helm/prometheus-rules/values.yaml: -------------------------------------------------------------------------------- 1 | name: prometheus-rules 2 | namespace: monitoring 3 | serviceType: managed 4 | managementCluster: 5 | customer: "" 6 | name: "" 7 | pipeline: "" 8 | provider: 9 | kind: "" 10 | flavor: "" 11 | region: "" 12 | 13 | Installation: 14 | V1: 15 | Guest: 16 | Kubernetes: 17 | IngressController: 18 | BaseDomain: "" 19 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.gitleaks.yaml: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. 
Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/ad0a25fbf301b2513e169ec964a8785d28f75be4/pkg/gen/input/workflows/internal/file/gitleaks.yaml.template 6 | # 7 | name: gitleaks 8 | 9 | on: 10 | - pull_request 11 | 12 | jobs: 13 | publish: 14 | uses: giantswarm/github-workflows/.github/workflows/gitleaks.yaml@main 15 | -------------------------------------------------------------------------------- /helm/prometheus-rules/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | description: A Helm chart for Giant Swarm prometheus alerting and recording rules 3 | home: https://github.com/giantswarm/prometheus-rules 4 | icon: https://s.giantswarm.io/app-icons/1/png/default-app-light.png 5 | name: prometheus-rules 6 | appVersion: "0.1.0" 7 | version: "4.89.2" 8 | annotations: 9 | application.giantswarm.io/team: "atlas" 10 | config.giantswarm.io/version: 1.x.x 11 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @giantswarm/team-atlas 2 | /helm/prometheus-rules/templates/kaas/phoenix/ @giantswarm/team-phoenix 3 | /helm/prometheus-rules/templates/kaas/rocket/ @giantswarm/team-rocket 4 | /helm/prometheus-rules/templates/kaas/turtles/ @giantswarm/team-tenet 5 | /helm/prometheus-rules/templates/platform/atlas/ @giantswarm/team-atlas 6 | /helm/prometheus-rules/templates/platform/cabbage/ @giantswarm/team-cabbage 7 | /helm/prometheus-rules/templates/platform/honeybadger/ @giantswarm/team-honeybadger 8 | /helm/prometheus-rules/templates/platform/shield/ @giantswarm/team-shield 9 | 10 | # No owners for changelog 11 | /CHANGELOG.md 12 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.validate_changelog.yaml: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. 
Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/ad0a25fbf301b2513e169ec964a8785d28f75be4/pkg/gen/input/workflows/internal/file/validate_changelog.yaml.template 6 | # 7 | name: Validate changelog 8 | 9 | on: 10 | pull_request: 11 | types: [opened, synchronize, reopened] 12 | paths: 13 | - 'CHANGELOG.md' 14 | 15 | permissions: 16 | contents: read 17 | pull-requests: write 18 | 19 | jobs: 20 | validate-changelog: 21 | uses: giantswarm/github-workflows/.github/workflows/validate-changelog.yaml@main 22 | -------------------------------------------------------------------------------- /loki/mixin.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'loki-mixin/mixin-ssd.libsonnet') + { 2 | _config+:: { 3 | tags: [ 4 | 'owner:team-atlas', 5 | 'topic:observability', 6 | 'component:loki', 7 | ], 8 | 9 | per_node_label: 'node', 10 | per_cluster_label: 'cluster_id', 11 | 12 | blooms: { 13 | enabled: false, 14 | }, 15 | 16 | canary+: { 17 | enabled: true, 18 | }, 19 | 20 | operational: { 21 | memcached: false, 22 | consul: false, 23 | bigTable: false, 24 | dynamo: false, 25 | gcs: false, 26 | s3: true, 27 | azureBlob: true, 28 | boltDB: false, 29 | }, 30 | }, 31 | } 32 | -------------------------------------------------------------------------------- /test/hack/bin/get-inhibition.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GIT_WORKDIR="$(git rev-parse --show-toplevel)" 4 | REPO_VERSION="$(curl -s https://api.github.com/repos/giantswarm/observability-operator/releases/latest | ./test/hack/bin/yq -r .name)" 5 | 6 | curl -s https://raw.githubusercontent.com/giantswarm/observability-operator/"$REPO_VERSION"/helm/observability-operator/files/alertmanager/alertmanager.yaml.helm-template > "$GIT_WORKDIR"/test/hack/checkLabels/alertmanager.yaml 7 | 8 | # Delete all lines from the beginning of the file up to the inhibit_rules section to avoid issues with Go templates 9 | sed -i '/global:/,/inhibit_rules:/{/inhibit_rules/b a;/^.*/d; :a}' "$GIT_WORKDIR"/test/hack/checkLabels/alertmanager.yaml 10 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml: -------------------------------------------------------------------------------- 1 | # TODO(@giantswarm/team-honeybadger): This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: helm-operations.recording.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: helm-operations.recording 12 | rules: 13 | - expr: "sum by (cluster_id, installation, pipeline, provider, release, event) (helmclient_library_event_total{release!=''})" 14 | record: monitoring:helm:number_of_operations_on_release 15 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.check_values_schema.yaml: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. 
Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/ad0a25fbf301b2513e169ec964a8785d28f75be4/pkg/gen/input/workflows/internal/file/check_values_schema.yaml.template 6 | # 7 | 8 | name: 'Values and schema' 9 | 10 | on: 11 | pull_request: 12 | branches: 13 | - master 14 | - main 15 | paths: 16 | - 'helm/**/values.yaml' # default helm chart values 17 | - 'helm/**/values.schema.json' # schema 18 | - 'helm/**/ci/ci-values.yaml' # overrides for CI (can contain required entries) 19 | 20 | jobs: 21 | check: 22 | uses: giantswarm/github-workflows/.github/workflows/chart-values.yaml@main 23 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: inhibit.oncall.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: inhibit.oncall 12 | rules: 13 | - alert: InhibitionOutsideWorkingHours 14 | annotations: 15 | description: '{{`Fires outside working hours.`}}' 16 | expr: (hour() <= 7 or hour() >= 16) or (day_of_week() > 5 or day_of_week() < 1) 17 | labels: 18 | area: platform 19 | outside_working_hours: "true" 20 | team: atlas 21 | topic: monitoring 22 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/inhibit.kubelet.rules.yml: -------------------------------------------------------------------------------- 1 | # This rule applies to all clusters 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . 
| nindent 4 }} 8 | name: inhibit.kubelet.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: inhibit.kubelet 13 | rules: 14 | - alert: InhibitionKubeletDown 15 | annotations: 16 | description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}' 17 | expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0 18 | labels: 19 | kubelet_down: "true" 20 | area: kaas 21 | topic: kubernetes 22 | team: tenet 23 | -------------------------------------------------------------------------------- /test/hack/bin/template-chart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | main() { 5 | local GIT_WORKDIR 6 | GIT_WORKDIR="$(git rev-parse --show-toplevel)" 7 | 8 | local -a providers 9 | mapfile -t providers <"$GIT_WORKDIR/test/conf/providers" 10 | 11 | rm -rf "$GIT_WORKDIR"/test/hack/output/helm-chart/ 12 | 13 | for provider in "${providers[@]}"; do 14 | echo "Templating chart for provider: $provider" 15 | 16 | helm template \ 17 | "$GIT_WORKDIR"/helm/prometheus-rules \ 18 | --set="managementCluster.provider.flavor=capi" \ 19 | --set="managementCluster.provider.kind=$provider" \ 20 | --set="managementCluster.name=myinstall" \ 21 | --set="managementCluster.pipeline=stable" \ 22 | --output-dir "$GIT_WORKDIR"/test/hack/output/helm-chart/"$provider" 23 | done 24 | } 25 | 26 | main "$@" 27 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/net-exporter.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: net-exporter.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: net-exporter 12 | rules: 13 | - alert: ClusterNetExporterCPUUsageTooHigh 14 | annotations: 15 | description: '{{`net-exporter cpu usage is too high.`}}' 16 | expr: rate(container_cpu_user_seconds_total{pod=~"net-exporter-.*"}[5m]) > 0.015 17 | for: 5m 18 | labels: 19 | area: kaas 20 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 21 | severity: notify 22 | team: tenet 23 | topic: observability 24 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/job.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: job.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: job 12 | rules: 13 | - alert: ManagementClusterJobFailed 14 | annotations: 15 | description: '{{`Job {{ $labels.namespace }}/{{ $labels.job_name }} is failed.`}}' 16 | expr: kube_job_failed{cluster_type="management_cluster", condition="true"} == 1 17 | for: 15m 18 | labels: 19 | area: kaas 20 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 21 | severity: notify 22 | team: {{ include "providerTeam" . 
}} 23 | topic: managementcluster 24 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/secret.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: secret.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: secret 12 | rules: 13 | - alert: HelmHistorySecretCountTooHigh 14 | annotations: 15 | description: '{{`Helm release Secret count too high.`}}' 16 | expr: sum(kube_secret_info{namespace=~"giantswarm|kube-system|monitoring", secret=~"sh.helm.+"}) by (cluster_id, installation, pipeline, provider) > 1000 17 | for: 15m 18 | labels: 19 | area: platform 20 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 21 | severity: notify 22 | team: honeybadger 23 | topic: releng 24 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml: -------------------------------------------------------------------------------- 1 | # TODO(@giantswarm/team-honeybadger): This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 2 | --- 3 | rule_files: 4 | - helm-operations.rules.yml 5 | 6 | tests: 7 | - interval: 1m 8 | input_series: 9 | - series: 'helmclient_library_event_total{app="chart-operator", cluster_id="gauss", container="chart-operator", event="update_release_from_tarball", namespace="giantswarm", pod="chart-operator-5c7b6f8867-pr44n", release="cilium"}' 10 | values: "0+1x20" 11 | promql_expr_test: 12 | - expr: monitoring:helm:number_of_operations_on_release 13 | eval_time: 10m 14 | exp_samples: 15 | - labels: 'monitoring:helm:number_of_operations_on_release{cluster_id="gauss", event="update_release_from_tarball", release="cilium"}' 16 | value: 10 17 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/timesync.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: timesync.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: timesync 12 | rules: 13 | - alert: ClockOutOfSync 14 | annotations: 15 | description: '{{`Clock is out of sync on {{ $labels.instance }}.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/clock-out-of-sync/ 17 | expr: timestamp(node_time_seconds) - node_time_seconds > 60 18 | for: 30m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: page 23 | team: {{ include "providerTeam" . }} 24 | topic: infrastructure 25 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . 
| nindent 4 }} 6 | name: irsa.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: irsa-crossplane 11 | rules: 12 | - alert: IRSAClaimNotReady 13 | annotations: 14 | description: '{{`IRSAClaim {{ $labels.name }} in Cluster {{ $labels.installation }} is not ready.`}}' 15 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/irsaclaim-not-ready/ 16 | expr: irsaclaim_status_conditions{type="Ready", status="False"} > 0 17 | for: 30m 18 | labels: 19 | area: kaas 20 | cancel_if_kube_state_metrics_down: "true" 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: phoenix 24 | topic: aws 25 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: node.aws.workload-cluster.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: node.aws 12 | rules: 13 | - alert: WorkloadClusterNodeUnexpectedTaintNodeWithImpairedVolumes 14 | annotations: 15 | description: '{{`Node {{ $labels.node }} has unexpected taint NodeWithImpairedVolumes`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/aws-node-taint-nodewithimpairedvolumes/ 17 | expr: kube_node_spec_taint{key="NodeWithImpairedVolumes"} > 0 18 | for: 30m 19 | labels: 20 | area: kaas 21 | severity: notify 22 | team: phoenix 23 | topic: kubernetes 24 | 25 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: kyverno.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: general 12 | rules: 13 | - alert: ShieldComponentRestartingTooOften 14 | annotations: 15 | description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/shield-pod-failing/ 17 | expr: increase(kube_pod_container_status_restarts_total{cluster_type="workload_cluster", pod=~"trivy-.*|kyverno-.*|falco-*|"}[1h]) > 5 18 | for: 30m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: notify 23 | team: shield 24 | topic: security 25 | -------------------------------------------------------------------------------- /test/hack/bin/run-pint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | ## Arguments: 6 | 7 | # 1. config file 8 | # 2. 
team filter (optional) 9 | 10 | 11 | main () { 12 | echo "Running Pint" 13 | 14 | local GIT_WORKDIR 15 | GIT_WORKDIR="$(git rev-parse --show-toplevel)" 16 | 17 | local -a PINT_FILES_LIST 18 | local -a PROVIDERS 19 | 20 | PINT_CONFIG="${1:-test/conf/pint/pint-config.hcl}" 21 | mapfile -t PROVIDERS <"$GIT_WORKDIR/test/conf/providers" 22 | 23 | if [[ "${2:-}" != "" ]]; then 24 | for provider in "${PROVIDERS[@]}"; do 25 | mapfile -t PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" "test/hack/output/generated/$provider/" | grep -v ".test.yml") 26 | done 27 | else 28 | for provider in "${PROVIDERS[@]}"; do 29 | mapfile -t PINT_FILES_LIST < <(find test/hack/output/generated/$provider/ -name "*.rules.yml") 30 | done 31 | fi 32 | 33 | test/hack/bin/pint -c "$PINT_CONFIG" lint "${PINT_FILES_LIST[@]}" 34 | } 35 | 36 | main "$@" 37 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml: -------------------------------------------------------------------------------- 1 | # TODO(@giantswarm/team-honeybadger): This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . | nindent 4 }} 8 | name: helm.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: helm 13 | rules: 14 | - alert: RepeatedHelmOperation 15 | annotations: 16 | description: '{{`Helm release {{ $labels.release }} in cluster {{ $labels.cluster_id }} is being repeated {{ $labels.event }} for {{ $value | printf "%.1f" }} times.`}}' 17 | expr: increase(monitoring:helm:number_of_operations_on_release[15m]) > 5 18 | for: 5m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: notify 23 | team: honeybadger 24 | topic: releng 25 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/inhibit.nodes.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: inhibit.nodes.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: inhibit.nodes 12 | rules: 13 | - alert: InhibitionNodeNotReady 14 | annotations: 15 | description: '{{`Node {{ $labels.node }} is not ready.`}}' 16 | expr: kube_node_status_condition{condition="Ready", status!="true"} > 0 17 | labels: 18 | area: kaas 19 | node_not_ready: "true" 20 | team: tenet 21 | topic: kubernetes 22 | - alert: InhibitionNodeUnschedulable 23 | annotations: 24 | description: '{{`Node {{ $labels.node }} is unschedulable.`}}' 25 | expr: kube_node_spec_unschedulable > 0 26 | labels: 27 | area: kaas 28 | node_unschedulable: "true" 29 | team: tenet 30 | topic: kubernetes 31 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/inhibit.capi.rules.yml: -------------------------------------------------------------------------------- 1 | # This rule applies to all capi clusters 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . | nindent 4 }} 8 | name: inhibit.capi.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: inhibit.capi 13 | rules: 14 | - alert: InhibitionControlPlaneUnhealthy 15 | annotations: 16 | description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}' 17 | expr: |- 18 | capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 19 | or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 20 | or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1 21 | labels: 22 | area: kaas 23 | cluster_control_plane_unhealthy: "true" 24 | team: tenet 25 | topic: status 26 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/teleport.logs.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4}} 6 | name: teleport.audit.logs.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: teleport.audit.logs 11 | rules: 12 | - alert: TeleportAuditLogsMissing 13 | annotations: 14 | description: Teleport audit logs are missing from installation {{`{{ $labels.installation }}`}}. 15 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/teleport-audit-logs-missing/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 16 | expr: |- 17 | absent_over_time({scrape_job="teleport.giantswarm.io"} [7d]) > 0 18 | for: 5m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: atlas 24 | topic: observability 25 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Before adding a new alerting rule to this repository, you should consider creating an SLO rule instead. 2 | SLOs help you both increase the quality of your monitoring and reduce alert noise. 
3 | 4 | * How to create a SLO rule: https://github.com/giantswarm/sloth-rules#how-to-create-a-slo 5 | * Documentation: https://intranet.giantswarm.io/docs/observability/slo-alerting/ 6 | 7 | --- 8 | Towards: https://github.com/giantswarm/... 9 | 10 | This PR ... 11 | 12 | ### Checklist 13 | 14 | - [ ] Update CHANGELOG.md 15 | - [ ] Add [Unit tests](https://github.com/giantswarm/prometheus-rules/#testing) 16 | - [ ] Follow [Alert structure](https://github.com/giantswarm/prometheus-rules/#how-alerts-are-structured) 17 | - [ ] Consider [creating a dashboard](https://docs.giantswarm.io/tutorials/observability/data-exploration/creating-custom-dashboards/) ([guidelines](https://intranet.giantswarm.io/docs/product/ux/guidelines/dashboards/)) (if it does not exist already) to help oncallers monitor the status of the issue. 18 | - [ ] Request review from oncall area, as well as team (e.g: `oncall-kaas-cloud` GitHub group). 19 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.logs.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4}} 6 | name: dex.logs.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: dex.logs 11 | rules: 12 | - alert: DexInvalidClientId 13 | annotations: 14 | description: '{{`Dex in {{ $labels.installation }} reports an invalid client ID.`}}' 15 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/dex-invalid-client-id/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 16 | expr: |- 17 | sum(rate({scrape_job="kubernetes-pods", pod=~"dex.*"} |= `Invalid client_id` | logfmt | err =~ `Invalid client_id \(\"[a-zA-Z0-9+]{20,}.*` [1h])) by (installation, cluster_id) > 0 18 | for: 5m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: shield 24 | topic: dex 25 | -------------------------------------------------------------------------------- /loki/update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Update Loki mixins from upstream 4 | # 5 | # This script is used to update the Loki mixins from the upstream repository. 6 | # 7 | # Usage: 8 | # ./loki/update.sh from the root of the repository 9 | 10 | set -e 11 | 12 | BRANCH="main" 13 | MIXIN_URL=https://github.com/grafana/loki/production/loki-mixin@$BRANCH 14 | OUTPUT_FILE="$(pwd)"/helm/prometheus-rules/templates/platform/atlas/recording-rules/loki-mixins.rules.yml 15 | 16 | cd loki 17 | rm -rf vendor jsonnetfile.* "$OUTPUT_FILE" 18 | 19 | jb init 20 | jb install $MIXIN_URL 21 | mixtool generate rules mixin.libsonnet -r "$OUTPUT_FILE" 22 | 23 | # Remove the initial `groups:` line 24 | sed -i '1d' "$OUTPUT_FILE" 25 | 26 | # Add the PrometheusRule metadata header 27 | sed -i '1i\ 28 | apiVersion: monitoring.coreos.com/v1\ 29 | kind: PrometheusRule\ 30 | metadata:\ 31 | labels:\ 32 | {{- include "labels.common" . 
| nindent 4 }}\ 33 | name: loki.recording.rules\ 34 | namespace: {{ .Values.namespace }}\ 35 | spec:\ 36 | groups:' "$OUTPUT_FILE" 37 | 38 | sed -i 's/cluster_id,/cluster_id, installation, pipeline, provider,/g' "$OUTPUT_FILE" 39 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/nodes.cidrnotavailable.events.logs.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4}} 6 | name: nodes.cidrnotavailable.events.logs.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: nodes.cidrnotavailable.events.logs 11 | rules: 12 | - alert: NodeCIDRNotAvailable 13 | annotations: 14 | description: Node(s) CIDR(s) are not available in the cluster {{`{{ $labels.cluster_id }}`}}. 15 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/cidr-not-available/ 16 | expr: |- 17 | sum ( 18 | count_over_time({scrape_job="kubernetes-events"} |= "CIDRNotAvailable" | logfmt [30m]) 19 | ) by (name, cluster_id, installation, pipeline, provider) > 0 20 | for: 5m 21 | labels: 22 | area: kaas 23 | cancel_if_outside_working_hours: "true" 24 | severity: page 25 | team: phoenix 26 | topic: nodes 27 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.run_ossf_scorecard.yaml: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/ad0a25fbf301b2513e169ec964a8785d28f75be4/pkg/gen/input/workflows/internal/file/run_ossf_scorecard.yaml.template 6 | # 7 | 8 | # This workflow uses actions that are not certified by GitHub. They are provided 9 | # by a third-party and are governed by separate terms of service, privacy 10 | # policy, and support documentation. 11 | 12 | name: Scorecard supply-chain security 13 | on: 14 | # For Branch-Protection check. Only the default branch is supported. See 15 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection 16 | branch_protection_rule: {} 17 | # To guarantee Maintained check is occasionally updated. See 18 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained 19 | schedule: 20 | - cron: '15 15 15 * *' 21 | push: 22 | branches: 23 | - main 24 | - master 25 | workflow_dispatch: {} 26 | 27 | jobs: 28 | analysis: 29 | uses: giantswarm/github-workflows/.github/workflows/ossf-scorecard.yaml@main 30 | secrets: 31 | scorecard_token: ${{ secrets.SCORECARD_TOKEN }} 32 | -------------------------------------------------------------------------------- /test/conf/pint/pint-config.hcl: -------------------------------------------------------------------------------- 1 | rule { 2 | # Disallow spaces in label/annotation keys, they're only allowed in values. 3 | reject ".* +.*" { 4 | label_keys = true 5 | annotation_keys = true 6 | } 7 | 8 | # Disallow URLs in labels, they should go to annotations. 9 | reject "https?://.+" { 10 | label_keys = true 11 | label_values = true 12 | } 13 | 14 | # Ensure that all aggregations are preserving mandatory labels. 15 | aggregate ".+" { 16 | severity = "bug" 17 | keep = ["cluster_id", "installation", "pipeline", "provider"] 18 | } 19 | } 20 | 21 | rule { 22 | # This block will apply to all alerting rules. 
23 | match { 24 | kind = "alerting" 25 | } 26 | 27 | # Each alert must have a 'description' annotation. 28 | annotation "description" { 29 | severity = "bug" 30 | required = true 31 | } 32 | 33 | # Each alert must have an `area' label that's either 'kaas' or 'platform'. 34 | label "area" { 35 | severity = "bug" 36 | value = "(kaas|platform)" 37 | required = true 38 | } 39 | 40 | # Check how many times each alert would fire in the last 1d. 41 | alerts { 42 | range = "1d" 43 | step = "1m" 44 | resolve = "5m" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/node.memory-pressure.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: node.memory-pressure.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: node.memory-pressure 12 | rules: 13 | - alert: ControlPlaneNodeMemoryPressureTaint 14 | annotations: 15 | description: '{{`Control plane node {{ $labels.node }} in {{ $labels.cluster_type }} cluster {{ $labels.installation }}{{ if $labels.cluster_id }}/{{ $labels.cluster_id }}{{ end }} has memory-pressure taint applied, indicating memory issues.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/memory-pressure/ 17 | expr: | 18 | kube_node_spec_taint{key="node.kubernetes.io/memory-pressure", effect="NoSchedule"} > 0 19 | and on (node) kube_node_role{role=~"control-plane|master"} 20 | for: 5m 21 | labels: 22 | area: kaas 23 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 24 | severity: page 25 | team: tenet 26 | topic: kubernetes 27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/6a704f7e2a8b0f09e82b5bab88f17971af849711/pkg/gen/input/makefile/internal/file/Makefile.template 6 | # 7 | 8 | include Makefile.*.mk 9 | 10 | ##@ General 11 | 12 | # The help target prints out all targets with their descriptions organized 13 | # beneath their categories. The categories are represented by '##@' and the 14 | # target descriptions by '##'. The awk commands is responsible for reading the 15 | # entire set of makefiles included in this invocation, looking for lines of the 16 | # file as xyz: ## something, and then pretty-format the target and help. Then, 17 | # if there's a line with ##@ something, that gets pretty-printed as a category. 18 | # More info on the usage of ANSI control characters for terminal formatting: 19 | # https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters 20 | # More info on the awk command: 21 | # http://linuxcommand.org/lc3_adv_awk.php 22 | 23 | .PHONY: help 24 | help: ## Display this help. 
25 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z%\\\/_0-9-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 26 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/rocket/alerting-rules/blackbox-exporter.cloud-provider-api.rules.yml: -------------------------------------------------------------------------------- 1 | # This rule is applied to all management clusters but it is only active if blackbox 2 | # exporter is deployed and configured with a scrape job named 'http-cloud-provider-api' 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PrometheusRule 5 | metadata: 6 | creationTimestamp: null 7 | labels: 8 | {{- include "labels.common" . | nindent 4 }} 9 | name: rocket-onprem-cloud-provider-api 10 | namespace: {{ .Values.namespace }} 11 | spec: 12 | groups: 13 | - name: rocket-onprem-cloud-provider-api 14 | rules: 15 | - alert: OnPremCloudProviderAPIIsDown 16 | annotations: 17 | description: '{{` blackbox-exporter on {{ $labels.cluster_id}} is unable to connect to the on-prem cloud provider API.`}}' 18 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/onprem-cloud-provider-api/ 19 | expr: probe_success{cluster_type="management_cluster",job="prometheus-blackbox-exporter",target="http-cloud-provider-api"} == 0 20 | for: 5m 21 | labels: 22 | area: kaas 23 | severity: page 24 | team: rocket 25 | topic: network 26 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 27 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/external-secrets.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: external-secrets.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: external-secrets 11 | rules: 12 | # This alert is for any deployment being in failed status in the `external-secrets` namespace. 13 | - alert: ExternalSecretsDeploymentNotSatisfied 14 | annotations: 15 | description: '{{`ExternalSecrets related deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=deployment&NAME={{ $labels.deployment }}`}}' 17 | expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", namespace="external-secrets"} > 0 18 | for: 30m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: honeybadger 24 | topic: managementcluster 25 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.create_release_pr.yaml: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. 
Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/ad0a25fbf301b2513e169ec964a8785d28f75be4/pkg/gen/input/workflows/internal/file/create_release_pr.yaml.template 6 | # 7 | name: Create Release PR 8 | on: 9 | push: 10 | branches: 11 | - 'legacy#release#v*.*.*' 12 | - 'main#release#v*.*.*' 13 | - 'main#release#major' 14 | - 'main#release#minor' 15 | - 'main#release#patch' 16 | - 'master#release#v*.*.*' 17 | - 'master#release#major' 18 | - 'master#release#minor' 19 | - 'master#release#patch' 20 | - 'release#v*.*.*' 21 | - 'release#major' 22 | - 'release#minor' 23 | - 'release#patch' 24 | - 'release-v*.*.x#release#v*.*.*' 25 | # "!" negates previous positive patterns so it has to be at the end. 26 | - '!release-v*.x.x#release#v*.*.*' 27 | workflow_call: 28 | inputs: 29 | branch: 30 | required: true 31 | type: string 32 | 33 | jobs: 34 | publish: 35 | uses: giantswarm/github-workflows/.github/workflows/create-release-pr.yaml@main 36 | with: 37 | branch: ${{ inputs.branch }} 38 | secrets: 39 | TAYLORBOT_GITHUB_ACTION: ${{ secrets.TAYLORBOT_GITHUB_ACTION }} 40 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: monitoring.resource-usage-estimation.recording.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: monitoring.resource-usage-estimation.recording 11 | rules: 12 | - expr: (sum(scrape_samples_post_metric_relabeling) by (cluster_id, installation, job, pipeline, provider) / on(cluster_id) group_left sum(cortex_ingester_active_series{container="ingester"}) by (cluster_id)) * on(cluster_id) group_left sum(container_memory_usage_bytes{container="ingester", namespace="mimir"}) by (cluster_id) 13 | record: giantswarm:observability:monitoring:resource_usage_estimation:memory_usage_bytes 14 | - expr: (sum(scrape_samples_post_metric_relabeling) by (cluster_id, installation, job, pipeline, provider) / on(cluster_id) group_left sum(cortex_ingester_active_series{container="ingester"}) by (cluster_id)) * on(cluster_id) group_left sum(container_memory_working_set_bytes{container="ingester", namespace="mimir"}) by (cluster_id) 15 | record: giantswarm:observability:monitoring:resource_usage_estimation:memory_working_set_bytes 16 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - general.rules.yml 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | # Kyverno validating webhooks 8 | - series: 'kube_pod_container_status_restarts_total{cluster_id="golem", cluster_type="workload_cluster", installation="golem", namespace="security", pipeline="stable", pod="trivy-0", provider="capa"}' 9 | values: "0+1x120" 10 | alert_rule_test: 11 | # Trivy pod 12 | - alertname: ShieldComponentRestartingTooOften 13 | eval_time: 91m 14 | exp_alerts: 15 | - exp_labels: 16 | area: platform 17 | cluster_id: golem 18 | cluster_type: workload_cluster 19 | installation: golem 20 | pipeline: stable 21 | provider: capa 22 | severity: notify 23 | team: shield 24 | topic: security 25 | 
namespace: security 26 | pod: trivy-0 27 | cancel_if_outside_working_hours: "false" 28 | exp_annotations: 29 | description: 'Pod security/trivy-0 is restarting too often.' 30 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/shield-pod-failing/ 31 | -------------------------------------------------------------------------------- /mimir/update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Update Mimir mixins from upstream 4 | # 5 | # This script is used to update the Mimir mixins from the upstream repository. 6 | # 7 | # Usage: 8 | # ./mimir/update.sh from the root of the repository 9 | 10 | set -e 11 | 12 | BRANCH="main" 13 | MIXIN_URL=https://github.com/grafana/mimir/operations/mimir-mixin@$BRANCH 14 | OUTPUT_FILE="$(pwd)"/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml 15 | 16 | cd mimir 17 | rm -rf vendor jsonnetfile.* "$OUTPUT_FILE" 18 | 19 | jb init 20 | jb install $MIXIN_URL 21 | mixtool generate rules mixin.libsonnet -r "$OUTPUT_FILE" 22 | 23 | # Remove the initial `groups:` line 24 | sed -i '1d' "$OUTPUT_FILE" 25 | 26 | # Add the PrometheusRule metadata header 27 | sed -i '1i\ 28 | apiVersion: monitoring.coreos.com/v1\ 29 | kind: PrometheusRule\ 30 | metadata:\ 31 | labels:\ 32 | {{- include "labels.common" . | nindent 4 }}\ 33 | name: mimir.recording.rules\ 34 | namespace: {{ .Values.namespace }}\ 35 | spec:\ 36 | groups:' "$OUTPUT_FILE" 37 | 38 | # Add the mimir enabled helm conditional blocks 39 | sed -i '1i{{- if eq .Values.managementCluster.provider.flavor "capi" }}' "$OUTPUT_FILE" 40 | sed -i -e '$a{{- end }}' "$OUTPUT_FILE" 41 | 42 | sed -i 's/cluster_id,/cluster_id, installation, pipeline, provider,/g' "$OUTPUT_FILE" 43 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.logs.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4}} 6 | name: mimir.logs.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: mimir-distributor 11 | rules: 12 | - alert: MimirDistributorMaxInflightPushRequests 13 | annotations: 14 | description: '{{`Mimir distributor is experiencing high rate of "err-mimir-distributor-max-inflight-push-requests" errors. 
Rate: {{ printf "%.2f" $value }} errors/second over the last 10 minutes.`}}' 15 | summary: Mimir distributor max inflight push requests errors 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/mimir/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 17 | expr: |- 18 | sum(rate({service_name="distributor"} |= "err-mimir-distributor-max-inflight-push-requests" [10m])) by (cluster_id, installation, provider, pipeline, namespace) > 0.1 19 | for: 15m 20 | labels: 21 | area: platform 22 | cancel_if_outside_working_hours: "true" 23 | severity: page 24 | team: atlas 25 | topic: observability 26 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.inhibition.rules.yml: -------------------------------------------------------------------------------- 1 | {{- if eq .Values.managementCluster.provider.kind "capa" }} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . | nindent 4 }} 8 | cluster_type: "management_cluster" 9 | name: capa.inhibitions.rules 10 | namespace: {{ .Values.namespace }} 11 | spec: 12 | groups: 13 | - name: capa.inhibitions 14 | rules: 15 | - alert: InhibitionClusterWithoutWorkerNodes 16 | annotations: 17 | description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}' 18 | expr: |- 19 | label_replace( 20 | capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, 21 | "cluster_id", 22 | "$1", 23 | "name", 24 | "(.*)" 25 | ) == 1 26 | unless on (cluster_id) ( 27 | sum(capi_machinepool_spec_replicas{} > 0) by (cluster_id) 28 | ) 29 | and on (cluster_id) ( 30 | capi_cluster_info{infrastructure_reference_kind="AWSCluster"} == 1 31 | ) 32 | labels: 33 | area: kaas 34 | has_worker_nodes: "false" 35 | team: phoenix 36 | topic: status 37 | {{- end }} 38 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: statefulset.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: statefulset 12 | rules: 13 | - alert: StatefulsetNotSatisfiedAtlas 14 | annotations: 15 | description: '{{`Statefulset {{ $labels.namespace}}/{{ $labels.statefulset }} is not satisfied.`}}' 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=statefulset&NAME={{ $labels.statefulset }}`}}' 17 | expr: |- 18 | kube_statefulset_status_replicas{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*|pyroscope.*|tempo.*"} 19 | - kube_statefulset_status_replicas_ready{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*|pyroscope.*|tempo.*"} 20 | > 0 21 | for: 3d 22 | labels: 23 | area: platform 24 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} 25 | severity: page 26 | team: atlas 27 | topic: managementcluster 28 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/crossplane.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: crossplane.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: crossplane 11 | rules: 12 | # This alert is for any deployment being in failed status in the `crossplane` namespace. 13 | # This usually includes Crossplane core components themselves, installed provider(s) and the metrics exporter. 14 | - alert: CrossplaneDeploymentNotSatisfied 15 | annotations: 16 | description: '{{`Crossplane related deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 17 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=deployment&NAME={{ $labels.deployment }}`}}' 18 | expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", namespace="crossplane"} > 0 19 | for: 30m 20 | labels: 21 | area: platform 22 | cancel_if_outside_working_hours: "true" 23 | severity: page 24 | team: honeybadger 25 | topic: managementcluster 26 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/capi-machineset.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - capi-machineset.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'capi_machineset_annotation_paused{paused_value="true",cluster_id="grumpy", name="grumpy-def99", exported_namespace="giantswarm"}' 8 | values: "0+1x75" 9 | - series: 'capi_cluster_info{cluster_id="grumpy", provider="capa"}' 10 | values: "1+0x75" 11 | alert_rule_test: 12 | - alertname: MachineSetPaused 13 | eval_time: 75m 14 | exp_alerts: 15 | - exp_labels: 16 | area: kaas 17 | cancel_if_monitoring_agent_down: "true" 18 | cancel_if_outside_working_hours: "true" 19 | provider: capa 20 | severity: notify 21 | team: phoenix 22 | topic: managementcluster 23 | cluster_id: grumpy 24 | name: grumpy-def99 25 | exported_namespace: giantswarm 26 | paused_value: "true" 27 | exp_annotations: 28 | description: "Machineset giantswarm/grumpy-def99 is paused." 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machineset/ 30 | __dashboardUid__: bdi7iswg81czkcasd 31 | dashboardQueryParams: "orgId=2" 32 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: observability.storage.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: observability.storage 12 | rules: 13 | - alert: ObservabilityStorageSpaceTooLow 14 | annotations: 15 | description: '{{`The free space on the Data Disk for instance: {{ $labels.instance }} and PVC: {{ $labels.persistentvolumeclaim}} was below 10 percent for longer than 1 hour (current value {{ $value | printf "%.2f" }}).`}}' 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/low-disk-space/#persistent-volume?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 17 | expr: kubelet_volume_stats_available_bytes{cluster_type="management_cluster", persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo|grafana-postgresql).*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo).*"} < 0.10 18 | for: 1h 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: atlas 24 | topic: observability 25 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/flux-atlas.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: fluxcd-atlas.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: fluxcd-atlas 12 | rules: 13 | # 14 | # FluxKustomizationFailed 15 | # 16 | # Alerting for GiantSwarm management clusters silences Kustomization CRs. 
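# Illustrative sketch only (mirrors the unit test for this rule, not an authoritative spec):
# the expression below is meant to match a series such as
#   gotk_resource_info{ready="False", customresource_kind="Kustomization",
#                      cluster_type="management_cluster",
#                      exported_namespace="flux-giantswarm", name="silences"}  =>  1
# and the alert fires once such a series stays above 0 for the full 20m "for" window.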
17 | # 18 | - alert: FluxKustomizationFailed 19 | annotations: 20 | description: |- 21 | {{`Flux Kustomization {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}} 22 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/flux-kustomization-failed/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.exported_namespace }}&KUSTOMIZATION_NAME={{ $labels.name }}`}}' 23 | expr: gotk_resource_info{ready="False", customresource_kind="Kustomization", cluster_type="management_cluster", exported_namespace=~".*giantswarm.*", name="silences"} > 0 24 | for: 20m 25 | labels: 26 | area: platform 27 | cancel_if_outside_working_hours: "true" 28 | severity: page 29 | team: atlas 30 | topic: releng 31 | -------------------------------------------------------------------------------- /test/tests/providers/capa/kaas/phoenix/alerting-rules/cluster-crossplane.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - cluster-crossplane.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'crossplane_managed_resource_exists{gvk="cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule", cluster_id="mymc", installation="test-installation"}' 8 | values: "6x20" 9 | - series: 'crossplane_managed_resource_ready{gvk="cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule", cluster_id="mymc", installation="test-installation"}' 10 | values: "5x20" 11 | 12 | alert_rule_test: 13 | - alertname: ClusterCrossplaneResourcesNotReady 14 | eval_time: 20m 15 | exp_alerts: 16 | - exp_labels: 17 | area: kaas 18 | cancel_if_outside_working_hours: "false" 19 | cluster_id: "mymc" 20 | gvk: "cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule" 21 | installation: "test-installation" 22 | severity: page 23 | team: phoenix 24 | exp_annotations: 25 | description: 'Not all managed Crossplane resources of type "cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule" on mymc are ready. This could affect creation or health of workload clusters.' 26 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/cluster-crossplane-resources/?INSTALLATION=test-installation&CLUSTER=mymc 27 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/atlas/alerting-rules/silence-operator.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - silence-operator.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'operatorkit_controller_errors_total{job="monitoring/silence-operator", controller="silence-controller", cluster_type="management_cluster", installation="myinstall", cluster_id="bar"}' 9 | values: "0x30 1+0x20 20x45 20-1x20 0x100" 10 | alert_rule_test: 11 | - alertname: SilenceOperatorReconcileErrors 12 | eval_time: 60m 13 | - alertname: SilenceOperatorReconcileErrors 14 | eval_time: 95m 15 | exp_alerts: 16 | - exp_labels: 17 | job: "monitoring/silence-operator" 18 | area: platform 19 | cancel_if_outside_working_hours: "true" 20 | cluster_id: bar 21 | cluster_type: management_cluster 22 | controller: silence-controller 23 | installation: "myinstall" 24 | severity: "page" 25 | team: "atlas" 26 | topic: "observability" 27 | exp_annotations: 28 | description: "silence-operator controller silence-controller too many reconcile errors." 
29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/operator-not-reconciling/?INSTALLATION=myinstall&CLUSTER=bar 30 | - alertname: SilenceOperatorReconcileErrors 31 | eval_time: 215m 32 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-kubeadmconfig.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . | nindent 4}} 5 | name: capi-kubeadmconfig.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-kubeadmconfig 10 | rules: 11 | - alert: KubeadmConfigNotReady 12 | expr: |- 13 | ( 14 | capi_kubeadmconfig_status_condition{type="Ready", status="False"} 15 | * on(cluster_id) group_left(provider) 16 | sum( 17 | label_replace( 18 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 19 | ) 20 | ) by (cluster_id, provider) 21 | ) > 0 22 | for: 1h 23 | labels: 24 | area: kaas 25 | cancel_if_monitoring_agent_down: "true" 26 | cancel_if_outside_working_hours: "true" 27 | severity: page 28 | team: {{ include "providerTeam" . }} 29 | topic: managementcluster 30 | annotations: 31 | description: |- 32 | {{`KubeadmConfig {{$labels.exported_namespace}}/{{$labels.name}} in cluster {{$labels.cluster_id}} encountered errors while generating a data secret`}} 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-kubeadmconfig/ 34 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-machineset.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . | nindent 4}} 5 | name: capi-machineset.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-machineset 10 | rules: 11 | - alert: MachineSetPaused 12 | expr: |- 13 | ( 14 | capi_machineset_annotation_paused{paused_value="true"} 15 | * on(cluster_id) group_left(provider) 16 | sum( 17 | label_replace( 18 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 19 | ) 20 | ) by (cluster_id, provider) 21 | ) > 0 22 | for: 1h 23 | labels: 24 | area: kaas 25 | cancel_if_monitoring_agent_down: "true" 26 | cancel_if_outside_working_hours: "true" 27 | severity: notify 28 | team: {{ include "providerTeam" . }} 29 | topic: managementcluster 30 | annotations: 31 | description: |- 32 | {{`Machineset {{ $labels.exported_namespace }}/{{ $labels.name }} is paused.`}} 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machineset/ 34 | __dashboardUid__: bdi7iswg81czkcasd 35 | dashboardQueryParams: "orgId=2" 36 | -------------------------------------------------------------------------------- /scripts/find-alerts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Contributed during Xmas 2022 hackathon. 4 | # 5 | 6 | # Example how to run it: 7 | # scripts/find-alerts.sh '.labels.team=="atlas"' '.labels.cancel_if_outside_working_hours=="true"' '.labels.severity=="page"' 8 | # => will report all alerts for team Atlas that page but are canceled out of working hours. 
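# Another example, purely illustrative — the arguments are arbitrary jq select() expressions,
# so any alert label used in this repo can be filtered on the same way:
# scripts/find-alerts.sh '.labels.team=="tenet"' '.labels.severity=="notify"'
# => would list every tenet-owned alert that only notifies instead of paging.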
9 | 10 | # /!\ This script is provided as-is. 11 | # It won't break anything in your files, but parameters management, help, error handling is missing. 12 | # Meaning: no guarantee about the quality of generated output 13 | 14 | # In this place we can file helm-generated rules 15 | rulesFilesDir=test/tests/providers/aws/ 16 | # => prerequisite: have files generated. for instance "make test" starts with generating files. 17 | 18 | # Custom (user-provided) filters 19 | selectQueries=("$@") 20 | 21 | # Build `jq` query from filters given as parameters 22 | selectQueriesString="$(printf "| select(%s)\n" "${selectQueries[@]}")" 23 | 24 | # For each rules file 25 | for rulesFile in "$rulesFilesDir"/*.rules.yml; do 26 | 27 | # Retrieve (in an array) alert names that match the query 28 | mapfile -t alertsList < <( 29 | yq -ojson "$rulesFile" 2>/dev/null \ 30 | | jq '.groups[].rules[] 31 | '"$selectQueriesString"' 32 | | .alert' 2>/dev/null 33 | ) || continue 34 | 35 | # Console output 36 | for alert in "${alertsList[@]}"; do 37 | echo "alert $alert - file $(basename "$rulesFile")" 38 | done 39 | done 40 | -------------------------------------------------------------------------------- /DCO: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 660 York Street, Suite 102, 6 | San Francisco, CA 94110 USA 7 | 8 | Everyone is permitted to copy and distribute verbatim copies of this 9 | license document, but changing it is not allowed. 10 | 11 | 12 | Developer's Certificate of Origin 1.1 13 | 14 | By making a contribution to this project, I certify that: 15 | 16 | (a) The contribution was created in whole or in part by me and I 17 | have the right to submit it under the open source license 18 | indicated in the file; or 19 | 20 | (b) The contribution is based upon previous work that, to the best 21 | of my knowledge, is covered under an appropriate open source 22 | license and I have the right under that license to submit that 23 | work with modifications, whether created in whole or in part 24 | by me, under the same open source license (unless I am 25 | permitted to submit under a different license), as indicated 26 | in the file; or 27 | 28 | (c) The contribution was provided directly to me by some other 29 | person who certified (a), (b) or (c) and I have not modified 30 | it. 31 | 32 | (d) I understand and agree that this project and the contribution 33 | are public and that a record of the contribution (including all 34 | personal information I submit with it, including my sign-off) is 35 | maintained indefinitely and may be redistributed consistent with 36 | this project or the open source license(s) involved. 37 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/cluster-autoscaler.rules.yml: -------------------------------------------------------------------------------- 1 | # This rule applies to all cloud workload clusters 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . 
| nindent 4 }} 8 | name: cluster-autoscaler.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: cluster-autoscaler 13 | rules: 14 | - alert: ClusterAutoscalerUnneededNodes 15 | annotations: 16 | description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has unneeded nodes.`}}' 17 | expr: cluster_autoscaler_unneeded_nodes_count{cluster_type="workload_cluster", provider=~"capa|capz|eks"} > 0 18 | for: 240m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: "true" 22 | cancel_if_cluster_has_no_workers: "true" 23 | severity: notify 24 | team: tenet 25 | topic: cluster-autoscaler 26 | - alert: ClusterAutoscalerFailedScaling 27 | annotations: 28 | description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has failed scaling up {{ $value | printf "%.0f" }} times recently.`}}' 29 | expr: increase(cluster_autoscaler_failed_scale_ups_total[15m]) > 1 and rate(cluster_autoscaler_failed_scale_ups_total[5m]) > 0 30 | labels: 31 | area: kaas 32 | cancel_if_outside_working_hours: "true" 33 | cancel_if_cluster_has_no_workers: "true" 34 | severity: page 35 | team: tenet 36 | topic: cluster-autoscaler 37 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/atlas/alerting-rules/flux-atlas.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - flux-atlas.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'gotk_resource_info{ready="False", job="giantswarm/cluster-api-monitoring", installation="test", cluster_id="test", customresource_kind="Kustomization", cluster_type="management_cluster", exported_namespace="flux-giantswarm", name="silences"}' 9 | values: "1x60" 10 | alert_rule_test: 11 | - alertname: FluxKustomizationFailed 12 | eval_time: 30m 13 | exp_alerts: 14 | - exp_labels: 15 | alertname: "FluxKustomizationFailed" 16 | area: "platform" 17 | cancel_if_outside_working_hours: "true" 18 | cluster_id: "test" 19 | cluster_type: "management_cluster" 20 | customresource_kind: "Kustomization" 21 | exported_namespace: "flux-giantswarm" 22 | installation: "test" 23 | job: "giantswarm/cluster-api-monitoring" 24 | name: "silences" 25 | ready: "False" 26 | severity: "page" 27 | team: "atlas" 28 | topic: "releng" 29 | exp_annotations: 30 | description: "Flux Kustomization silences in ns flux-giantswarm on test/test is stuck in Failed state." 31 | runbook_url: "https://intranet.giantswarm.io/docs/support-and-ops/runbooks/flux-kustomization-failed/?INSTALLATION=test&CLUSTER=test&NAMESPACE=flux-giantswarm&KUSTOMIZATION_NAME=silences" 32 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/cabbage/alerting-rules/dns.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: dns.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: dns 12 | rules: 13 | - alert: DNSErrorRateTooHigh 14 | annotations: 15 | description: '{{`DNS error rate is too high for {{ or $labels.pod $labels.instance }} to {{ $labels.host }}, using {{ $labels.proto }}.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/network-error/ 17 | expr: rate(dns_resolve_error_total[15m]) > 0.015 18 | for: 15m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | cancel_if_cluster_has_no_workers: "true" 23 | severity: page 24 | team: cabbage 25 | topic: network 26 | - alert: DNSCheckErrorRateTooHigh 27 | annotations: 28 | description: '{{`DNS check error rate is too high for {{ or $labels.pod $labels.instance }}.`}}' 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/network-error/ 30 | expr: rate(dns_error_total[15m]) > 0.015 31 | for: 15m 32 | labels: 33 | area: platform 34 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 35 | cancel_if_cluster_has_no_workers: "true" 36 | severity: page 37 | team: cabbage 38 | topic: network 39 | -------------------------------------------------------------------------------- /.github/workflows/alert_tests.yaml: -------------------------------------------------------------------------------- 1 | name: alert-test 2 | run-name: run unit and conformance tests 3 | 4 | on: 5 | pull_request: 6 | # Only run on PRs based on the main branch 7 | branches: 8 | - main 9 | 10 | jobs: 11 | rules-tests: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 15 | with: 16 | fetch-depth: "0" 17 | - name: run rules tests 18 | run: make test-rules 19 | inhibition-tests: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 23 | with: 24 | fetch-depth: "0" 25 | - name: run inhibition tests 26 | run: make test-inhibitions 27 | runbook-tests: 28 | runs-on: ubuntu-latest 29 | steps: 30 | - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 31 | with: 32 | fetch-depth: "0" 33 | - name: Clone intranet repository 34 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 35 | with: 36 | fetch-depth: 1 37 | repository: giantswarm/giantswarm 38 | path: giantswarm 39 | token: ${{ secrets.TAYLORBOT_GITHUB_ACTION }} 40 | - name: run runbook tests 41 | env: 42 | RUNBOOKS_DIR: ./giantswarm 43 | run: make test-ci-runbooks 44 | prometheus-lint: 45 | runs-on: ubuntu-latest 46 | steps: 47 | - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 48 | with: 49 | fetch-depth: "0" 50 | - name: run pint linter 51 | run: make pint 52 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/certificate.management-cluster.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - certificate.management-cluster.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'cert_exporter_secret_not_after{cluster_id="gauss", cluster_type="management_cluster", secretkey="tls.crt", certificatename="capa-serving-cert", exported_namespace="giantswarm", provider="capa"}' 9 | values: "1x20 1x20 0+0x20" 10 | - series: 'cert_exporter_certificate_cr_not_after{cluster_id="gauss", 
cluster_type="management_cluster", name="capa-serving-cert", exported_namespace="giantswarm", provider="capa"}' 11 | values: "1x20 _x20 0+0x20" 12 | alert_rule_test: 13 | - alertname: ManagementClusterCertificateIsMissing 14 | eval_time: 15m 15 | - alertname: ManagementClusterCertificateIsMissing 16 | eval_time: 35m 17 | exp_alerts: 18 | - exp_labels: 19 | area: kaas 20 | cancel_if_outside_working_hours: "true" 21 | certificatename: capa-serving-cert 22 | cluster_id: gauss 23 | exported_namespace: giantswarm 24 | provider: capa 25 | severity: page 26 | team: phoenix 27 | topic: security 28 | exp_annotations: 29 | description: 'Cannot renew Certificate for Secret giantswarm/capa-serving-cert on gauss because it is missing.' 30 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/managed-app-cert-manager/missing-certificate-for-secret/ 31 | - alertname: ManagementClusterCertificateIsMissing 32 | eval_time: 55m 33 | -------------------------------------------------------------------------------- /.github/workflows/update-tempo-mixins.yml: -------------------------------------------------------------------------------- 1 | name: Update Tempo Mixins 2 | 3 | on: 4 | schedule: 5 | # Run on the 1st day of every month at 10:00 UTC 6 | - cron: '0 10 1 * *' 7 | workflow_dispatch: # Allow manual triggering 8 | 9 | permissions: 10 | contents: write 11 | pull-requests: write 12 | 13 | jobs: 14 | update-tempo-mixins: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v6 20 | with: 21 | token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - name: Set up Go 24 | uses: actions/setup-go@v4 25 | with: 26 | go-version: '1.25' 27 | 28 | - name: Update Tempo mixins 29 | run: make update-tempo-mixin 30 | 31 | - name: Check for changes 32 | id: changes 33 | run: | 34 | if git diff --quiet; then 35 | echo "has_changes=false" >> $GITHUB_OUTPUT 36 | else 37 | echo "has_changes=true" >> $GITHUB_OUTPUT 38 | fi 39 | 40 | - name: Create Pull Request 41 | if: steps.changes.outputs.has_changes == 'true' 42 | uses: peter-evans/create-pull-request@v8 43 | with: 44 | token: ${{ secrets.GITHUB_TOKEN }} 45 | commit-message: 'chore: update Tempo mixins from upstream' 46 | title: 'chore: update Tempo mixins from upstream' 47 | body: | 48 | This PR updates the Tempo mixins from grafana/tempo upstream repository. 49 | 50 | This is an automated update that runs monthly. 51 | branch: update-tempo-mixins-${{ github.run_number }} 52 | delete-branch: true 53 | draft: false 54 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/systemd.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: systemd.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: systemd 12 | rules: 13 | ## TODO(@giantswarm/team-tenet): Update those lists when all vintage clusters are gone 14 | - alert: ClusterCriticalSystemdUnitFailed 15 | annotations: 16 | description: '{{`Critical systemd unit {{ $labels.name }} is failed on {{ $labels.instance }}.`}}' 17 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/critical-systemd-unit-failed/ 18 | expr: node_systemd_unit_state{name=~"k8s-addons.service|systemd-networkd.service", state="failed"} == 1 19 | for: 5m 20 | labels: 21 | area: kaas 22 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 23 | severity: page 24 | team: tenet 25 | topic: infrastructure 26 | - alert: ClusterDisabledSystemdUnitActive 27 | annotations: 28 | description: '{{`Disabled Systemd unit {{ $labels.name }} is active on {{ $labels.ip }}.`}}' 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/disabled-systemd-unit-active/ 30 | expr: node_systemd_unit_state{name=~"locksmithd.service|update-engine.service", state="active"} == 1 31 | for: 5m 32 | labels: 33 | area: kaas 34 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 35 | severity: page 36 | team: tenet 37 | topic: infrastructure 38 | -------------------------------------------------------------------------------- /test/conf/promtool_ignore: -------------------------------------------------------------------------------- 1 | kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml 2 | kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml 3 | kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml 4 | kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml 5 | kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml 6 | kaas/phoenix/alerting-rules/irsa.rules.yml 7 | platform/atlas/alerting-rules/deployment.management-cluster.rules.yml 8 | platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml 9 | platform/atlas/alerting-rules/fluentbit.rules.yml 10 | platform/atlas/alerting-rules/inhibit.oncall.rules.yml 11 | platform/atlas/alerting-rules/keda.rules.yml 12 | platform/atlas/alerting-rules/kube-state-metrics.rules.yml 13 | platform/atlas/alerting-rules/prometheus-operator.rules.yml 14 | platform/atlas/alerting-rules/storage.rules.yml 15 | platform/atlas/recording-rules/grafana-cloud.rules.yml 16 | platform/atlas/recording-rules/loki-mixins.rules.yml 17 | platform/atlas/recording-rules/mimir-mixins.rules.yml 18 | platform/cabbage/alerting-rules/coredns.rules.yml 19 | platform/cabbage/alerting-rules/external-dns.rules.yml 20 | platform/cabbage/alerting-rules/ingress-controller.rules.yml 21 | platform/cabbage/alerting-rules/dns.rules.yml 22 | platform/cabbage/recording-rules/gs-managed-app-deployment-status.rules.yml 23 | platform/honeybadger/alerting-rules/chart.rules.yml 24 | platform/honeybadger/alerting-rules/helm.rules.yml 25 | platform/honeybadger/alerting-rules/secret.rules.yml 26 | platform/honeybadger/recording-rules/helm-operations.rules.yml 27 | platform/shield/alerting-rules/falco.rules.yml 28 | platform/shield/alerting-rules/cert-manager.rules.yml 29 | platform/shield/alerting-rules/dex.rules.yml 30 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set 
filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "name" -}} 6 | {{- .Chart.Name | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create chart name and version as used by the chart label. 11 | */}} 12 | {{- define "chart" -}} 13 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 14 | {{- end -}} 15 | 16 | {{/* 17 | Common labels 18 | */}} 19 | {{- define "labels.common" -}} 20 | app.kubernetes.io/name: {{ include "name" . | quote }} 21 | app.kubernetes.io/instance: {{ .Release.Name | quote }} 22 | app.kubernetes.io/managed-by: {{ .Release.Service | quote }} 23 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 24 | application.giantswarm.io/team: {{ index .Chart.Annotations "application.giantswarm.io/team" | default "atlas" | quote }} 25 | helm.sh/chart: {{ include "chart" . | quote }} 26 | giantswarm.io/service-type: {{ .Values.serviceType }} 27 | {{- if or (.Template.Name | hasSuffix "logs.yaml") (.Template.Name | hasSuffix "logs.yml")}} 28 | application.giantswarm.io/prometheus-rule-kind: loki 29 | {{- end }} 30 | observability.giantswarm.io/tenant: giantswarm 31 | {{- end -}} 32 | 33 | {{- define "providerTeam" -}} 34 | '{{`{{ if or (eq .Labels.provider "cloud-director") (eq .Labels.provider "vsphere") }}rocket{{ else }}phoenix{{ end }}`}}' 35 | {{- end -}} 36 | 37 | {{- define "workingHoursOnly" -}} 38 | {{- if eq .Values.managementCluster.pipeline "stable-testing" -}} 39 | "true" 40 | {{- else -}} 41 | "false" 42 | {{- end -}} 43 | {{- end -}} 44 | 45 | {{- define "namespaceNotGiantswarm" -}} 46 | "(([^g]|g[^i]|gi[^a]|gia[^n]|gian[^t]|giant[^s]|giants[^w]|giantsw[^a]|giantswa[^r]|giantswar[^m])*)" 47 | {{- end -}} 48 | -------------------------------------------------------------------------------- /.github/workflows/update-loki-mixins.yml: -------------------------------------------------------------------------------- 1 | name: Update Loki Mixins 2 | 3 | on: 4 | schedule: 5 | # Run on the 1st day of every month at 09:30 UTC 6 | - cron: '30 9 1 * *' 7 | workflow_dispatch: # Allow manual triggering 8 | 9 | permissions: 10 | contents: write 11 | pull-requests: write 12 | 13 | jobs: 14 | update-loki-mixins: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v6 20 | with: 21 | token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - name: Set up Go 24 | uses: actions/setup-go@v4 25 | with: 26 | go-version: '1.25' 27 | 28 | - name: Install tools 29 | run: make install-tools 30 | 31 | - name: Update Loki mixins 32 | run: make update-loki-mixin 33 | 34 | - name: Check for changes 35 | id: changes 36 | run: | 37 | if git diff --quiet; then 38 | echo "has_changes=false" >> $GITHUB_OUTPUT 39 | else 40 | echo "has_changes=true" >> $GITHUB_OUTPUT 41 | fi 42 | 43 | - name: Create Pull Request 44 | if: steps.changes.outputs.has_changes == 'true' 45 | uses: peter-evans/create-pull-request@v8 46 | with: 47 | token: ${{ secrets.GITHUB_TOKEN }} 48 | commit-message: 'chore: update Loki mixins from upstream' 49 | title: 'chore: update Loki mixins from upstream' 50 | body: | 51 | This PR updates the Loki mixins from grafana/loki upstream repository. 52 | 53 | This is an automated update that runs monthly. 
54 | branch: update-loki-mixins-${{ github.run_number }} 55 | delete-branch: true 56 | draft: false 57 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/fairness.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: fairness.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: fairness 11 | rules: 12 | - alert: FlowcontrolRejectedRequests 13 | annotations: 14 | description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: k8s API fairness is rejecting calls in flow schema {{ $labels.flow_schema }}.`}}' 15 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/flowcontrol-rejected-requests/ 16 | expr: (increase(apiserver_flowcontrol_rejected_requests_total[1m]) > 0) 17 | for: 5m 18 | labels: 19 | area: kaas 20 | cancel_if_outside_working_hours: "true" 21 | severity: notify 22 | team: tenet 23 | topic: kubernetes 24 | - alert: FlowcontrolTooManyRequests 25 | annotations: 26 | description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: there are too many API requests for flow schema {{ $labels.flow_schema }}.`}}' 27 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/flowcontrol-rejected-requests/ 28 | expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, installation, pipeline, provider, flow_schema, priority_level) > min by(cluster_id, installation, pipeline, provider, flow_schema, priority_level) (apiserver_flowcontrol_nominal_limit_seats) 29 | for: 15m 30 | labels: 31 | area: kaas 32 | cancel_if_outside_working_hours: "true" 33 | severity: notify 34 | team: tenet 35 | topic: kubernetes 36 | -------------------------------------------------------------------------------- /.github/workflows/update-mimir-mixins.yml: -------------------------------------------------------------------------------- 1 | name: Update Mimir Mixins 2 | 3 | on: 4 | schedule: 5 | # Run on the 1st day of every month at 09:00 UTC 6 | - cron: '0 9 1 * *' 7 | workflow_dispatch: # Allow manual triggering 8 | 9 | permissions: 10 | contents: write 11 | pull-requests: write 12 | 13 | jobs: 14 | update-mimir-mixins: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v6 20 | with: 21 | token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - name: Set up Go 24 | uses: actions/setup-go@v4 25 | with: 26 | go-version: '1.25' 27 | 28 | - name: Install tools 29 | run: make install-tools 30 | 31 | - name: Update Mimir mixins 32 | run: make update-mimir-mixin 33 | 34 | - name: Check for changes 35 | id: changes 36 | run: | 37 | if git diff --quiet; then 38 | echo "has_changes=false" >> $GITHUB_OUTPUT 39 | else 40 | echo "has_changes=true" >> $GITHUB_OUTPUT 41 | fi 42 | 43 | - name: Create Pull Request 44 | if: steps.changes.outputs.has_changes == 'true' 45 | uses: peter-evans/create-pull-request@v8 46 | with: 47 | token: ${{ secrets.GITHUB_TOKEN }} 48 | commit-message: 'chore: update Mimir mixins from upstream' 49 | title: 'chore: update Mimir mixins from upstream' 50 | body: | 51 | This PR updates the Mimir mixins from grafana/mimir upstream repository. 52 | 53 | This is an automated update that runs monthly. 
54 | branch: update-mimir-mixins-${{ github.run_number }} 55 | delete-branch: true 56 | draft: false 57 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/app-configuration.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: atlas-app-configuration.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: atlas-app-configuration 11 | rules: 12 | # Coming from https://gigantic.slack.com/archives/C07A03AN9JM 13 | # This alert ensures our app has no unexpected configmaps. 14 | - alert: ConfigmapUnexpected 15 | annotations: 16 | description: '{{`{{ $labels.configmap }} configmap is not expected.`}}' 17 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/atlas-app-configuration/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 18 | expr: | 19 | kube_configmap_info{cluster_type="management_cluster", configmap=~".*(loki|mimir)-user-values"} > 0 20 | for: 2d 21 | labels: 22 | area: platform 23 | cancel_if_outside_working_hours: "true" 24 | severity: notify 25 | team: atlas 26 | topic: observability 27 | # This alert ensures our app has no unexpected secrets. 28 | - alert: SecretUnexpected 29 | annotations: 30 | description: '{{`{{ $labels.secret }} secret is not expected.`}}' 31 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/atlas-app-configuration/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 32 | expr: | 33 | kube_secret_info{cluster_type="management_cluster", secret=~".*(loki|mimir)-user-values"} > 0 34 | for: 2d 35 | labels: 36 | area: platform 37 | cancel_if_outside_working_hours: "true" 38 | severity: notify 39 | team: atlas 40 | topic: observability 41 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/honeybadger/alerting-rules/flux.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - flux.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'gotk_resource_info{ready="False", job="giantswarm/cluster-api-monitoring", customresource_kind="Kustomization", cluster_type="management_cluster", exported_namespace="flux-giantswarm", name="silences"}' 9 | values: "1x60" 10 | alert_rule_test: 11 | - alertname: FluxKustomizationFailed 12 | eval_time: 30m 13 | exp_alerts: [] 14 | - interval: 1m 15 | input_series: 16 | - series: 'gotk_resource_info{installation="test", job="giantswarm/cluster-api-monitoring", cluster_type="management_cluster", exported_namespace="flux-giantswarm", customresource_kind="Kustomization", name="flux", suspended="true"}' 17 | values: "1x60 0+1x60 1+0x1500" 18 | alert_rule_test: 19 | - alertname: FluxSuspendedForTooLong 20 | eval_time: 1560m 21 | exp_alerts: 22 | - exp_labels: 23 | alertname: "FluxSuspendedForTooLong" 24 | area: platform 25 | cancel_if_outside_working_hours: "true" 26 | cluster_type: "management_cluster" 27 | customresource_kind: "Kustomization" 28 | exported_namespace: "flux-giantswarm" 29 | installation: "test" 30 | job: "giantswarm/cluster-api-monitoring" 31 | name: "flux" 32 | severity: "page" 33 | suspended: "true" 34 | team: "honeybadger" 35 | topic: "releng" 36 | 
exp_annotations: 37 | description: "Flux Kustomization flux in ns flux-giantswarm on test has been suspended for 24h." 38 | runbook_url: "https://intranet.giantswarm.io/docs/support-and-ops/runbooks/flux-suspended/?INSTALLATION=test&CLUSTER=" 39 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/sloth.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: sloth.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: sloth 11 | rules: 12 | - alert: SlothDown 13 | annotations: 14 | description: 'Sloth is down.' 15 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/sloth-down/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 16 | expr: count(up{job="monitoring/sloth"} == 0) by (cluster_id, installation, provider, pipeline) > 0 17 | for: 5m 18 | labels: 19 | area: platform 20 | cancel_if_cluster_control_plane_unhealthy: "true" 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: atlas 24 | topic: observability 25 | # Coming from https://github.com/giantswarm/giantswarm/issues/31133 26 | # This alert ensures sloth container are not restarting too often (flappiness). 27 | - alert: SlothRestartingTooOften 28 | annotations: 29 | description: '{{`Sloth is restarting too often.`}}' 30 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/sloth-down/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 31 | expr: | 32 | increase( 33 | kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="monitoring", container="sloth"}[1h] 34 | ) > 5 35 | for: 5m 36 | labels: 37 | area: platform 38 | cancel_if_cluster_control_plane_unhealthy: "true" 39 | cancel_if_outside_working_hours: "true" 40 | severity: page 41 | team: atlas 42 | topic: observability 43 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/pods.core.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: pods.core.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: pods.core 12 | rules: 13 | - alert: ContainerIsRestartingTooFrequently 14 | annotations: 15 | description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often in cluster {{ $labels.installation }}/{{ $labels.cluster_id }}.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/container-is-restarting-too-often/ 17 | expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"cluster-autoscaler.*|etcd-kubernetes-resources-count-exporter.*"}[1h]), "service", "/", "namespace", "pod") > 10 18 | for: 10m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: "true" 22 | cancel_if_cluster_has_no_workers: "true" 23 | severity: page 24 | team: tenet 25 | topic: kubernetes 26 | - alert: PodPending 27 | annotations: 28 | description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending in cluster {{ $labels.installation }}/{{ $labels.cluster_id }}.`}}' 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/pod-stuck-in-pending/ 30 | expr: kube_pod_status_phase{namespace="kube-system",pod=~"(cluster-autoscaler.*)",phase="Pending"} == 1 31 | for: 15m 32 | labels: 33 | area: kaas 34 | cancel_if_outside_working_hours: "true" 35 | cancel_if_kube_state_metrics_down: "true" 36 | cancel_if_cluster_has_no_workers: "true" 37 | severity: page 38 | team: tenet 39 | 40 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/cabbage/recording-rules/gs-managed-app-deployment-status.rules.yml: -------------------------------------------------------------------------------- 1 | ## Cabbage is the only user of those recording rules 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: gs-managed-app-deployment-status.recording.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: gs-managed-app-deployments.recording 12 | rules: 13 | - expr: | 14 | label_replace( 15 | kube_deployment_status_replicas_available 16 | * on (cluster_id, cluster_type, pod, namespace, deployment) group_left (label_app_kubernetes_io_name) 17 | kube_deployment_labels{label_giantswarm_io_service_type="managed"}, 18 | "managed_app", 19 | "$1", 20 | "label_app_kubernetes_io_name", 21 | "(.*)" 22 | ) 23 | record: managed_app_deployment_status_replicas_available 24 | - expr: | 25 | label_replace( 26 | kube_deployment_status_replicas_unavailable 27 | * on (cluster_id, cluster_type, pod, namespace, deployment) group_left (label_app_kubernetes_io_name) 28 | kube_deployment_labels{label_giantswarm_io_service_type="managed"}, 29 | "managed_app", 30 | "$1", 31 | "label_app_kubernetes_io_name", 32 | "(.*)" 33 | ) 34 | record: managed_app_deployment_status_replicas_unavailable 35 | - expr: | 36 | label_replace( 37 | kube_deployment_spec_replicas 38 | * on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name) 39 | kube_deployment_labels{label_giantswarm_io_service_type="managed"}, 40 | "managed_app", 41 | "$1", 42 | "label_app_kubernetes_io_name", 43 | "(.*)" 44 | ) 45 | record: managed_app_deployment_spec_replicas 46 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/certificate.workload-cluster.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: certificate.workload-cluster.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: certificate.workload-cluster 12 | rules: 13 | - alert: WorkloadClusterCertificateWillExpireInLessThanAMonth 14 | annotations: 15 | description: '{{`Certificate {{ $labels.path }} on {{ $labels.node }} will expire in less than a month.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/renew-certificates/ 17 | dashboardExternalUrl: https://giantswarm.grafana.net/d/a2f4976Zk/certificates 18 | expr: (cert_exporter_not_after{cluster_type="workload_cluster", path!="/etc/kubernetes/ssl/service-account-crt.pem"} - time()) < 4 * 7 * 24 * 60 * 60 19 | for: 5m 20 | labels: 21 | area: kaas 22 | cancel_if_outside_working_hours: "true" 23 | severity: notify 24 | team: teddyfriends 25 | topic: security 26 | - alert: ClusterCertificateExpirationMetricsMissing 27 | annotations: 28 | description: '{{`Certificate metrics are missing for cluster {{ $labels.cluster_id }}.`}}' 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/absent-metrics/ 30 | expr: max(up{cluster_id!="", cluster_type="workload_cluster"}) by (cluster_id, installation, pipeline, provider) unless on (cluster_id) count (cert_exporter_not_after{cluster_type="workload_cluster"}) by (cluster_id, installation, pipeline, provider) > 0 31 | for: 30m 32 | labels: 33 | area: kaas 34 | cancel_if_outside_working_hours: "true" 35 | severity: page 36 | team: {{ include "providerTeam" . 
}} 37 | topic: security 38 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/pods.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: pods.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: pods 12 | rules: 13 | # PodsUnschedulable fires when too many pods are in `unschedulable` status in the `kube-system` namespace 14 | # This is a signal something is wrong with the WC. 15 | - alert: PodsUnschedulable 16 | annotations: 17 | description: '{{`Cluster {{ $labels.cluster_id }} has unschedulable kube-system pods.`}}' 18 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/validate-cluster-health/ 19 | __dashboardUid__: unschedulable-pods 20 | dashboardQueryParams: '{{`orgId=1&var-namespace=kube-system&var-cluster={{ $labels.cluster_id }}`}}' 21 | expr: |- 22 | count( 23 | count_over_time( 24 | # Have a list of unschedulable pods 25 | count( 26 | kube_pod_status_unschedulable{namespace="kube-system"} 27 | ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, pod) 28 | # only keep those that have been unschedulable for more than 10 minutes over the past 30 minutes 29 | [30m:]) > 10 30 | # count per cluster 31 | ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) 32 | # At least 2 pods should be unschedulable for the alert to page. 33 | >= 2 34 | for: 15m 35 | labels: 36 | area: kaas 37 | # Let's start with business hours only, maybe 24x7 in the future 38 | cancel_if_outside_working_hours: "true" 39 | inhibit_cluster_broken: "true" 40 | severity: page 41 | team: {{ include "providerTeam" . }} 42 | topic: workloadcluster 43 | -------------------------------------------------------------------------------- /Makefile.gen.app.mk: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/eea19f200d7cfd27ded22474b787563bbfdb8ec4/pkg/gen/input/makefile/internal/file/Makefile.gen.app.mk.template 6 | # 7 | 8 | ##@ App 9 | 10 | YQ=docker run --rm -u $$(id -u) -v $${PWD}:/workdir mikefarah/yq:4.29.2 11 | HELM_DOCS=docker run --rm -u $$(id -u) -v $${PWD}:/helm-docs jnorwood/helm-docs:v1.11.0 12 | 13 | ifdef APPLICATION 14 | DEPS := $(shell find $(APPLICATION)/charts -maxdepth 2 -name "Chart.yaml" -printf "%h\n") 15 | endif 16 | 17 | .PHONY: lint-chart check-env update-chart helm-docs update-deps $(DEPS) 18 | 19 | lint-chart: IMAGE := giantswarm/helm-chart-testing:v3.0.0-rc.1 20 | lint-chart: check-env ## Runs ct against the default chart. 21 | @echo "====> $@" 22 | rm -rf /tmp/$(APPLICATION)-test 23 | mkdir -p /tmp/$(APPLICATION)-test/helm 24 | cp -a ./helm/$(APPLICATION) /tmp/$(APPLICATION)-test/helm/ 25 | architect helm template --dir /tmp/$(APPLICATION)-test/helm/$(APPLICATION) 26 | docker run -it --rm -v /tmp/$(APPLICATION)-test:/wd --workdir=/wd --name ct $(IMAGE) ct lint --validate-maintainers=false --charts="helm/$(APPLICATION)" 27 | rm -rf /tmp/$(APPLICATION)-test 28 | 29 | update-chart: check-env ## Sync chart with upstream repo. 
30 | @echo "====> $@" 31 | vendir sync 32 | $(MAKE) update-deps 33 | 34 | update-deps: check-env $(DEPS) ## Update Helm dependencies. 35 | cd $(APPLICATION) && helm dependency update 36 | 37 | $(DEPS): check-env ## Update main Chart.yaml with new local dep versions. 38 | dep_name=$(shell basename $@) && \ 39 | new_version=`$(YQ) .version $(APPLICATION)/charts/$$dep_name/Chart.yaml` && \ 40 | $(YQ) -i e "with(.dependencies[]; select(.name == \"$$dep_name\") | .version = \"$$new_version\")" $(APPLICATION)/Chart.yaml 41 | 42 | helm-docs: check-env ## Update $(APPLICATION) README. 43 | $(HELM_DOCS) -c $(APPLICATION) -g $(APPLICATION) 44 | 45 | check-env: 46 | ifndef APPLICATION 47 | $(error APPLICATION is not defined) 48 | endif 49 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml: -------------------------------------------------------------------------------- 1 | # TODO(@giantswarm/team-honeybadger): This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . | nindent 4 }} 8 | name: chart.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: chart 13 | rules: 14 | - alert: ChartOperatorDown 15 | annotations: 16 | description: '{{`ChartOperator ({{ $labels.instance }}) is down.`}}' 17 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/chart-operator-down/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 18 | expr: label_replace(up{job="chart-operator"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 19 | for: 15m 20 | labels: 21 | area: platform 22 | cancel_if_cluster_control_plane_unhealthy: "true" 23 | cancel_if_kubelet_down: "true" 24 | cancel_if_cluster_has_no_workers: "true" 25 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 26 | cancel_if_monitoring_agent_down: "true" 27 | severity: notify 28 | team: honeybadger 29 | topic: releng 30 | - alert: ChartOrphanConfigMap 31 | annotations: 32 | description: '{{`Chart configmaps have not been deleted.`}}' 33 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/chart-operator-orphan-resources/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 34 | expr: chart_operator_configmap_orphan > 0 35 | for: 10m 36 | labels: 37 | area: platform 38 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} 39 | severity: notify 40 | team: honeybadger 41 | topic: releng 42 | -------------------------------------------------------------------------------- /Makefile.custom.mk: -------------------------------------------------------------------------------- 1 | .PHONY: clean-dry-run 2 | clean-dry-run: ## dry run for `make clean` - print all untracked files 3 | @git clean -xnf 4 | 5 | .PHONY: clean 6 | clean: ## Clean the git work dir and remove all untracked files 7 | # clean stage 8 | git clean -xdf -- test/hack/bin test/hack/output test/hack/checkLabels 9 | 10 | ##@ Testing 11 | 12 | .PHONY: test 13 | test: install-tools template-chart test-rules test-inhibitions test-runbooks ## Run all tests 14 | 15 | install-tools: 16 | ./test/hack/bin/fetch-tools.sh 17 | 18 | template-chart: install-tools ## prepare the helm chart 19 | bash ./test/hack/bin/template-chart.sh 20 | 21 | test-rules: install-tools template-chart ## run unit tests for alerting rules 22 | bash test/hack/bin/verify-rules.sh "$(test_filter)" "${rules_type}" 23 | 24 | test-inhibitions: install-tools template-chart ## test whether inhibition labels are well defined 25 | bash test/hack/bin/get-inhibition.sh 26 | cd test/hack/checkLabels; go run main.go 27 | 28 | test-runbooks: install-tools template-chart ## Check if runbooks are valid 29 | bash test/hack/bin/check-runbooks.sh 30 | 31 | test-ci-runbooks: ## Check if runbooks are valid in CI 32 | test/hack/bin/check-runbooks.sh --ci 33 | 34 | pint: install-tools template-chart ## Run pint 35 | GENERATE_ONLY=true bash test/hack/bin/verify-rules.sh 36 | ./test/hack/bin/run-pint.sh test/conf/pint/pint-config.hcl ${PINT_TEAM_FILTER} 37 | 38 | pint-all: install-tools template-chart ## Run pint with extra checks 39 | GENERATE_ONLY=true bash test/hack/bin/verify-rules.sh 40 | ./test/hack/bin/run-pint.sh test/conf/pint/pint-all.hcl ${PINT_TEAM_FILTER} 41 | 42 | ##@ Mixins 43 | update-mimir-mixin: install-tools ## Update Mimir mixins 44 | ./mimir/update.sh 45 | 46 | update-loki-mixin: install-tools ## Update Loki mixins 47 | ./loki/update.sh 48 | 49 | update-tempo-mixin: install-tools ## Update Tempo mixins 50 | ./tempo/update.sh 51 | 52 | update-mixin: update-mimir-mixin update-loki-mixin update-tempo-mixin ## Update all mixins 53 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: logging-pipeline.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: logging-pipeline 11 | rules: 12 | # Any alloy component that uses the loki.write component can throw such errors. 
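# Worked reading of the threshold, with assumed numbers for illustration only:
# if a pod's loki.write component sent 50 requests over the evaluation window and 10 of them
# returned a non-2xx status code, the expression below yields 100 * 10/50 = 20%, which is
# above the 10% threshold, so the 15m "for" countdown starts for that pod.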
13 | - alert: LogForwardingErrors 14 | annotations: 15 | __dashboardUid__: 53c1ecddc3a1d5d4b8d6cd0c23676c31 16 | dashboardQueryParams: "orgId=2" 17 | description: '{{`{{ $value | printf "%.2f" }}% of the requests to Loki are failing for pod {{ $labels.pod }} (threshold 10%)`}}' 18 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/logging-pipeline/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 19 | expr: |- 20 | ( 21 | 100 22 | * 23 | ( 24 | ( 25 | sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, pod) ( 26 | rate ( 27 | loki_write_request_duration_seconds_count{status_code!~"2.."}[5m:] 28 | ) 29 | ) 30 | ) 31 | / 32 | ( 33 | sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, pod) ( 34 | rate ( 35 | loki_write_request_duration_seconds_count[5m:] 36 | ) 37 | ) 38 | ) 39 | ) 40 | ) 41 | > 10 42 | for: 15m 43 | labels: 44 | area: platform 45 | severity: page 46 | team: atlas 47 | topic: observability 48 | cancel_if_outside_working_hours: "true" 49 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: aws-load-balancer-controller.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: aws-load-balancer-controller 12 | rules: 13 | - alert: AWSLoadBalancerControllerAWSAPIErrors 14 | annotations: 15 | description: '{{`AWS load balancer controller pod {{ $labels.namespace}}/{{ $labels.pod }} on {{ $labels.cluster_id}} is throwing {{ $labels.error_code }} errors when contacting AWS API.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/alb-errors/ 17 | expr: sum(increase(aws_api_calls_total{cluster_type="workload_cluster", error_code != "", provider=~"capa|eks"}[20m])) by (cluster_id, error_code, installation, namespace, pipeline, provider, pod) > 0 18 | for: 40m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: page 23 | team: phoenix 24 | topic: alb 25 | - alert: AWSLoadBalancerControllerReconcileErrors 26 | annotations: 27 | description: '{{`AWS load balancer controller pod {{ $labels.namespace }}/{{ $labels.pod }} on {{ $labels.cluster_id }} is throwing errors while reconciling the {{ $labels.controller }} controller.`}}' 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/alb-errors/ 29 | expr: sum(increase(controller_runtime_reconcile_total{cluster_type="workload_cluster", provider=~"capa|eks", result = "error", service="aws-load-balancer-controller"}[20m])) by (cluster_id, controller, installation, namespace, pipeline, provider, pod) > 0 30 | for: 40m 31 | labels: 32 | area: kaas 33 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} 34 | severity: page 35 | team: phoenix 36 | topic: alb 37 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml: -------------------------------------------------------------------------------- 1 | {{- if eq .Values.managementCluster.provider.kind "capz" }} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | labels: {{- include "labels.common" . | nindent 4}} 6 | name: dns-operator-azure.rules 7 | namespace: {{.Values.namespace}} 8 | spec: 9 | groups: 10 | - name: dns-operator-azure 11 | rules: 12 | - alert: ClusterDNSZoneMissing 13 | annotations: 14 | description: |- 15 | {{`No DNS-zone for cluster {{ $labels.exported_namespace}}/{{ $labels.name }} got created yet. Check dns-operator-azure logs in installation/{{ $labels.installation}}.`}} 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dns-operator-azure/ 17 | expr: |- 18 | capi_cluster_status_phase{phase="Provisioned"} 19 | unless on (cluster_id, name) 20 | label_replace(dns_operator_azure_zone_info{type="public"}, "name", "$1", "resource_group", "(.+)") 21 | for: 30m 22 | labels: 23 | area: kaas 24 | cancel_if_outside_working_hours: {{include "workingHoursOnly" .}} 25 | severity: notify 26 | team: phoenix 27 | topic: managementcluster 28 | - alert: AzureDNSOperatorAPIErrorRate 29 | annotations: 30 | description: |- 31 | {{`Error rate for {{ $labels.method }} is high. Check dns-operator-azure logs in installation/{{ $labels.installation }}.`}} 32 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dns-operator-azure/ 33 | expr: |- 34 | sum by (cluster_id, installation, method, pipeline, provider) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0 35 | for: 15m 36 | labels: 37 | area: kaas 38 | cancel_if_outside_working_hours: {{include "workingHoursOnly" .}} 39 | severity: notify 40 | team: phoenix 41 | topic: managementcluster 42 | {{- end }} 43 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/konfigure-operator.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: konfigure-operator.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: konfigure-operator 12 | rules: 13 | - alert: KonfigureOperatorDeploymentNotSatisfied 14 | annotations: 15 | description: '{{`Konfigure Operator deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=deployment&NAME={{ $labels.deployment }}`}}' 17 | expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", namespace="giantswarm", deployment="konfigure-operator"} > 0 18 | for: 30m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: honeybadger 24 | topic: managementcluster 25 | - alert: KonfigurationReconciliationFailed 26 | annotations: 27 | description: |- 28 | {{`{{ $labels.resource_kind }} {{ $labels.resource_name }} in ns {{ $labels.resource_namespace }} on {{ $labels.installation }} is stuck in Failed state.`}} 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/konfigure-operator/ 30 | expr: konfigure_operator_reconcile_condition{condition_type="Ready", condition_status="False"} > 0 31 | for: 10m 32 | labels: 33 | area: platform 34 | cancel_if_outside_working_hours: "true" 35 | severity: page 36 | team: honeybadger 37 | topic: releng 38 | namespace: |- 39 | {{`{{ $labels.exported_namespace }}`}} 40 | -------------------------------------------------------------------------------- /scripts/sync-kube-mixin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | set -o pipefail 6 | 7 | TMPDIR="$(mktemp -d -t 'tmp.XXXXXXXXXX')" 8 | RULESFILE="helm/prometheus-rules/templates/kaas/tenet/recording-rules/kubernetes-mixins.rules.yml" 9 | 10 | trap 'cleanup' EXIT 11 | 12 | function cleanup { 13 | rm -rf "$TMPDIR" 14 | } 15 | 16 | function tune_rules { 17 | # Extra tuning 18 | 19 | # Latest mixins use SLO instead of classic metrics in several places 20 | # but we dropped these SLO metrics 21 | sed -i 's/apiserver_request_slo_duration_seconds/apiserver_request_duration_seconds/g' "$RULESFILE" 22 | sed -i 's/cluster_id/cluster_id, installation, pipeline, provider/g' "$RULESFILE" 23 | } 24 | 25 | function main { 26 | local MIXIN_VER 27 | # make a temporary dir to work in 28 | local MIXIN_REPO="git@github.com:giantswarm/giantswarm-kubernetes-mixin.git" 29 | # clone a branch or tag if provided 30 | local BRANCH="${1:-}" 31 | 32 | if [[ -z "$BRANCH" ]]; then 33 | # clone the mixins repo 34 | echo -e "\nCloning master branch:\n" 35 | git clone --single-branch "$MIXIN_REPO" "$TMPDIR"/mixins 36 | else 37 | # clone the mixins repo branch or tag 38 | echo -e "\nCloning branch or tag '$BRANCH':\n" 39 | git clone --branch "$BRANCH" --single-branch "$MIXIN_REPO" "$TMPDIR"/mixins 40 | fi 41 | 42 | # get the current commit of the mixin repo 43 | cd "$TMPDIR"/mixins 44 | MIXIN_VER="$(git rev-parse HEAD)" 45 | cd - > /dev/null 46 | 47 | 48 | local PRECONTENT='apiVersion: monitoring.coreos.com/v1 49 | kind: PrometheusRule 50 | metadata: 51 | labels: 52 | {{- include "labels.common" . 
| nindent 4 }} 53 | name: kube-mixins.recording.rules 54 | namespace: {{ .Values.namespace }} 55 | spec: 56 | ' 57 | 58 | # copy generated rules file 59 | cp "$TMPDIR"/mixins/files/prometheus-rules/rules.yml "$RULESFILE" 60 | 61 | # prepend K8s objectmeta to the rules file 62 | printf '%s %s' "$PRECONTENT" "$(cat "$RULESFILE")" > "$RULESFILE" 63 | 64 | tune_rules 65 | 66 | echo -e "\nSynced mixin repo at commit: $MIXIN_VER\n" 67 | 68 | # tidy up 69 | cleanup 70 | } 71 | 72 | main "$@" 73 | -------------------------------------------------------------------------------- /test/conf/pint/pint-all.hcl: -------------------------------------------------------------------------------- 1 | rule { 2 | # Disallow spaces in label/annotation keys, they're only allowed in values. 3 | reject ".* +.*" { 4 | label_keys = true 5 | annotation_keys = true 6 | } 7 | 8 | # Disallow URLs in labels, they should go to annotations. 9 | reject "https?://.+" { 10 | label_keys = true 11 | label_values = true 12 | } 13 | 14 | # Ensure that all aggregations are preserving mandatory labels. 15 | aggregate ".+" { 16 | severity = "bug" 17 | keep = ["cluster_id", "installation", "pipeline", "provider"] 18 | } 19 | } 20 | 21 | rule { 22 | # This block will apply to all alerting rules. 23 | match { 24 | kind = "alerting" 25 | } 26 | 27 | # Each alert must have a 'description' annotation. 28 | annotation "description" { 29 | severity = "bug" 30 | required = true 31 | } 32 | 33 | # Each alert must have an `area' label that's either 'kaas' or 'platform'. 34 | label "area" { 35 | severity = "bug" 36 | value = "(kaas|platform)" 37 | required = true 38 | } 39 | 40 | # Each alert must have a 'runbook_url' annotation. 41 | annotation "runbook_url" { 42 | severity = "bug" 43 | required = true 44 | } 45 | 46 | # Each alert should have a 'dashboardUid' annotation. 47 | annotation "__dashboardUid__" { 48 | severity = "warning" 49 | required = true 50 | } 51 | 52 | # Check how many times each alert would fire in the last 1d. 53 | alerts { 54 | range = "1d" 55 | step = "1m" 56 | resolve = "5m" 57 | } 58 | } 59 | 60 | # Rule for regular alerts 61 | rule { 62 | match { 63 | kind = "alerting" 64 | name = "!~Inhibition.*|.*Heartbeat.*" 65 | } 66 | 67 | # Each alert must have a 'severity' label that's either 'page', 'notify' or 'ticket'. 68 | label "severity" { 69 | severity = "bug" 70 | value = "(page|notify|ticket)" 71 | required = true 72 | } 73 | } 74 | 75 | # Rule for inhibition and heartbeat alerts 76 | rule { 77 | match { 78 | kind = "alerting" 79 | name = "~Inhibition.*|.*Heartbeat.*" 80 | } 81 | 82 | label "severity" { 83 | severity = "bug" 84 | value = "none" 85 | required = true 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: aws.workload-cluster.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: aws.workload-cluster 12 | rules: 13 | - alert: WorkloadClusterContainerIsRestartingTooFrequentlyAWS 14 | annotations: 15 | description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/container-is-restarting-too-often/ 17 | ## TODO(@giantswarm/team-phoenix): Review this list once all vintage installations are gone 18 | expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|ebs-(plugin|csi).*|aws-pod-identity-webhook.*|efs-csi-(node|controller).*"}[1h]), "service", "/", "namespace", "pod") > 10 19 | for: 10m 20 | labels: 21 | area: kaas 22 | cancel_if_outside_working_hours: "true" 23 | cancel_if_cluster_has_no_workers: "true" 24 | severity: page 25 | team: phoenix 26 | topic: kubernetes 27 | - alert: WorkloadClusterPodPendingAWS 28 | annotations: 29 | description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}' 30 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/pod-stuck-in-pending/ 31 | ## TODO(@giantswarm/team-phoenix): Review this list once all vintage installations are gone 32 | expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|ebs-(plugin|csi).*|efs-csi-(node|controller).*)", phase="Pending"} == 1 33 | for: 15m 34 | labels: 35 | area: kaas 36 | cancel_if_outside_working_hours: "true" 37 | cancel_if_kube_state_metrics_down: "true" 38 | cancel_if_cluster_has_no_workers: "true" 39 | severity: page 40 | team: phoenix 41 | -------------------------------------------------------------------------------- /test/tests/providers/capa/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - capa.inhibition.rules.yml 4 | 5 | tests: 6 | # Tests for `InhibitionClusterWithoutWorkerNodes` inhibition alert 7 | - interval: 1m 8 | input_series: 9 | - series: 'capi_cluster_status_condition{cluster_id="golem", cluster_type="management_cluster", name="golem", pipeline="testing", status="True", type="ControlPlaneReady"}' 10 | values: "1+0x300" 11 | - series: 'capi_machinepool_spec_replicas{cluster_id="golem", cluster_name="golem", cluster_type="management_cluster", customer="giantswarm", installation="golem", organization="giantswarm", pipeline="testing", provider="capa"}' 12 | values: "_x60 0x60 3x60" 13 | - series: 'capi_cluster_info{infrastructure_reference_kind="AWSCluster", cluster_id="golem"}' 14 | values: "1+0x300" 15 | alert_rule_test: 16 | - alertname: InhibitionClusterWithoutWorkerNodes 17 | eval_time: 30m 18 | exp_alerts: 19 | - exp_labels: 20 | area: kaas 21 | cluster_id: "golem" 22 | cluster_type: "management_cluster" 23 | has_worker_nodes: "false" 24 | name: "golem" 25 | pipeline: "testing" 26 | status: "True" 27 | team: "phoenix" 28 | topic: "status" 29 | type: "ControlPlaneReady" 30 | exp_annotations: 31 | description: "Cluster (golem) has no worker nodes." 
32 | - alertname: InhibitionClusterWithoutWorkerNodes 33 | eval_time: 90m 34 | exp_alerts: 35 | - exp_labels: 36 | area: kaas 37 | cluster_id: "golem" 38 | cluster_type: "management_cluster" 39 | has_worker_nodes: "false" 40 | name: "golem" 41 | pipeline: "testing" 42 | status: "True" 43 | team: "phoenix" 44 | topic: "status" 45 | type: "ControlPlaneReady" 46 | exp_annotations: 47 | description: "Cluster (golem) has no worker nodes." 48 | - alertname: InhibitionClusterWithoutWorkerNodes 49 | eval_time: 150m 50 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cloud-provider-controller.rules.yml: -------------------------------------------------------------------------------- 1 | # This rule applies to CAPI management clusters only 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . | nindent 4 }} 8 | name: cloud-provider-controller.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: cloud-provider-controller 13 | rules: 14 | - alert: FluxHelmReleaseFailed 15 | annotations: 16 | description: |- 17 | {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}} 18 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/flux-helmrelease-failed/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.exported_namespace }}&HELMRELEASE_NAME={{ $labels.name }}`}}' 19 | {{- $components := "(aws-ebs-csi-driver|cloud-provider-aws|azure-cloud-controller-manager|azure-cloud-node-manager|azuredisk-csi-driver|azurefile-csi-driver|cloud-provider-vsphere|cloud-provider-cloud-director)" }} 20 | expr: | 21 | ( 22 | label_replace(gotk_resource_info{ready="False", customresource_kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", exported_namespace!~"org-t-.*", name=~"(.+)-{{ $components }}"}, "cluster_id", "$1", "name", "(.+)-{{ $components }}") 23 | * on(cluster_id) group_left(provider) 24 | sum( 25 | label_replace( 26 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 27 | ) 28 | ) by (cluster_id, provider) 29 | ) > 0 30 | for: 20m 31 | labels: 32 | area: kaas 33 | cancel_if_outside_working_hours: "true" 34 | cancel_if_kube_state_metrics_down: "true" 35 | cancel_if_monitoring_agent_down: "true" 36 | severity: page 37 | team: {{ include "providerTeam" . 
}} 38 | topic: managementcluster 39 | namespace: |- 40 | {{`{{ $labels.exported_namespace }}`}} 41 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.add-team-labels.yaml: -------------------------------------------------------------------------------- 1 | name: Add appropriate labels to issue 2 | 3 | on: 4 | issues: 5 | types: [assigned] 6 | 7 | jobs: 8 | build_user_list: 9 | name: Get yaml config of GS users 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: read 13 | steps: 14 | - name: Get user-mapping 15 | env: 16 | GH_TOKEN: ${{ secrets.ISSUE_AUTOMATION }} 17 | run: | 18 | mkdir -p artifacts 19 | gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" \ 20 | /repos/giantswarm/github/contents/tools/issue-automation/user-mapping.yaml \ 21 | | jq -r '.content' \ 22 | | base64 -d > artifacts/users.yaml 23 | - name: Upload Artifact 24 | uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 25 | with: 26 | name: users 27 | path: artifacts/users.yaml 28 | retention-days: 1 29 | 30 | add_label: 31 | name: Add team label when assigned 32 | runs-on: ubuntu-latest 33 | needs: build_user_list 34 | steps: 35 | - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 36 | id: download-users 37 | with: 38 | name: users 39 | - name: Find team label based on user names 40 | run: | 41 | event_assignee=$(cat $GITHUB_EVENT_PATH | jq -r .assignee.login | tr '[:upper:]' '[:lower:]') 42 | echo "Issue assigned to: ${event_assignee}" 43 | 44 | TEAMS=$(cat ${{steps.download-users.outputs.download-path}}/users.yaml | tr '[:upper:]' '[:lower:]' | yq ".${event_assignee}.teams" -o csv | tr ',' ' ') 45 | 46 | echo "LABEL<<EOF" >> $GITHUB_ENV 47 | for team in ${TEAMS}; do 48 | echo "Team: ${team} | Label: team/${team}" 49 | echo "team/${team}" >> $GITHUB_ENV 50 | done 51 | echo "EOF" >> $GITHUB_ENV 52 | - name: Apply label to issue 53 | if: ${{ env.LABEL != '' && env.LABEL != 'null' && env.LABEL != null }} 54 | uses: actions-ecosystem/action-add-labels@bd52874380e3909a1ac983768df6976535ece7f8 # v1.1.3 55 | with: 56 | github_token: ${{ secrets.ISSUE_AUTOMATION }} 57 | labels: | 58 | ${{ env.LABEL }} 59 | -------------------------------------------------------------------------------- /mimir/mixin.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'mimir-mixin/mixin.libsonnet') + { 2 | _config+:: { 3 | tags: [ 4 | 'owner:team-atlas', 5 | 'topic:observability', 6 | 'component:mimir', 7 | ], 8 | 9 | per_cluster_label: 'cluster_id', 10 | // Not sure why the default is set to instance, but we want to set it to node 11 | per_node_label: 'node', 12 | per_component_loki_label: 'component', 13 | // We marked it as disabled as this should be enabled only if the enterprise gateway is enabled 14 | gateway_enabled: false, 15 | // Whether alerts for experimental ingest storage are enabled. 16 | ingest_storage_enabled: false, 17 | // Disable autoscaling components we do not use 18 | autoscaling_hpa_prefix: 'mimir-', 19 | // Whether autoscaling panels and alerts should be enabled for specific Mimir services. 
20 | autoscaling: { 21 | query_frontend: { 22 | enabled: false, 23 | hpa_name: $._config.autoscaling_hpa_prefix + 'query-frontend', 24 | }, 25 | ruler_query_frontend: { 26 | enabled: false, 27 | hpa_name: $._config.autoscaling_hpa_prefix + 'ruler-query-frontend', 28 | }, 29 | querier: { 30 | enabled: true, 31 | hpa_name: $._config.autoscaling_hpa_prefix + 'querier', 32 | }, 33 | ruler_querier: { 34 | enabled: false, 35 | hpa_name: $._config.autoscaling_hpa_prefix + 'ruler-querier', 36 | }, 37 | store_gateway: { 38 | enabled: false, 39 | hpa_name: $._config.autoscaling_hpa_prefix + 'store-gateway', 40 | }, 41 | distributor: { 42 | enabled: true, 43 | hpa_name: $._config.autoscaling_hpa_prefix + 'distributor', 44 | }, 45 | ruler: { 46 | enabled: false, 47 | hpa_name: $._config.autoscaling_hpa_prefix + 'ruler', 48 | }, 49 | gateway: { 50 | enabled: true, 51 | hpa_name: $._config.autoscaling_hpa_prefix + 'gateway', 52 | }, 53 | ingester: { 54 | enabled: false, 55 | hpa_name: $._config.autoscaling_hpa_prefix + 'ingester', 56 | }, 57 | compactor: { 58 | enabled: false, 59 | hpa_name: $._config.autoscaling_hpa_prefix + 'compactor', 60 | }, 61 | }, 62 | }, 63 | } 64 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/zot.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: zot.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: zot 11 | rules: 12 | - alert: ZotDeploymentNotSatisfied 13 | annotations: 14 | description: '{{`Zot deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 15 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/zot/?CUSTOMER={{ $labels.customer }}&INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 16 | expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster",namespace="zot",deployment="zot-zot"} > 0 17 | for: 30m 18 | labels: 19 | area: platform 20 | cancel_if_outside_working_hours: "true" 21 | severity: page 22 | team: honeybadger 23 | topic: managementcluster 24 | - alert: ZotPersistentVolumeFillingUp 25 | annotations: 26 | description: '{{`The Zot PersistentVolume claimed by {{ $labels.persistentvolumeclaim}} in namespace {{ $labels.namespace }} is at least 80% full and projected to fill up soon.`}}' 27 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/zot/?CUSTOMER={{ $labels.customer }}&INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 28 | expr: |- 29 | ( 30 | kubelet_volume_stats_available_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"} 31 | / 32 | kubelet_volume_stats_capacity_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"} 33 | ) < 0.1 34 | or 35 | predict_linear(kubelet_volume_stats_available_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}[1h], 4 * 3600) < 0.05 36 | for: 1h 37 | labels: 38 | area: platform 39 | cancel_if_outside_working_hours: "true" 40 | severity: page 41 | team: honeybadger 42 | topic: managementcluster 43 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/cabbage/alerting-rules/kong.rules.yml: 
-------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: kong.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: kong 12 | rules: 13 | - alert: KongNonProdDeploymentNotSatisfied 14 | annotations: 15 | description: '{{`Kong Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=deployment&NAME={{ $labels.deployment }}`}}' 17 | expr: managed_app_deployment_status_replicas_available{managed_app=~"kong.*", cluster_id!~"p.*"} / (managed_app_deployment_status_replicas_available{managed_app=~"kong.*", cluster_id!~"p.*"} + managed_app_deployment_status_replicas_unavailable{managed_app=~"kong.*", cluster_id!~"p.*"}) < 0.6 18 | for: 30m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: cabbage 24 | topic: kong 25 | - alert: KongProductionDeploymentNotSatisfied 26 | annotations: 27 | description: '{{`Kong Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 28 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=deployment&NAME={{ $labels.deployment }}`}}' 29 | expr: managed_app_deployment_status_replicas_available{managed_app=~"kong.*", cluster_id=~"p.*"} / (managed_app_deployment_status_replicas_available{managed_app=~"kong.*", cluster_id=~"p.*"} + managed_app_deployment_status_replicas_unavailable{managed_app=~"kong.*", cluster_id=~"p.*"}) < 0.6 30 | for: 30m 31 | labels: 32 | area: platform 33 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 34 | severity: page 35 | team: cabbage 36 | topic: kong 37 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/honeybadger/alerting-rules/zot.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - zot.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'kube_deployment_status_replicas_unavailable{cluster_type="management_cluster",namespace="zot",deployment="zot-zot"}' 9 | values: '_x5 0x10 1x45' 10 | alert_rule_test: 11 | - alertname: ZotDeploymentNotSatisfied 12 | eval_time: 46m 13 | exp_alerts: 14 | - exp_labels: 15 | alertname: "ZotDeploymentNotSatisfied" 16 | area: "platform" 17 | cancel_if_outside_working_hours: "true" 18 | cluster_type: "management_cluster" 19 | deployment: "zot-zot" 20 | namespace: "zot" 21 | severity: "page" 22 | team: "honeybadger" 23 | topic: "managementcluster" 24 | exp_annotations: 25 | description: "Zot deployment zot/zot-zot is not satisfied." 
26 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/zot/?CUSTOMER=&INSTALLATION=&CLUSTER= 27 | - interval: 1m 28 | input_series: 29 | - series: 'kubelet_volume_stats_available_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}' 30 | values: '50x30 20x30 15x30 5x60' 31 | - series: 'kubelet_volume_stats_capacity_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}' 32 | values: '100x150' 33 | alert_rule_test: 34 | - alertname: ZotPersistentVolumeFillingUp 35 | eval_time: 150m 36 | exp_alerts: 37 | - exp_labels: 38 | alertname: "ZotPersistentVolumeFillingUp" 39 | area: "platform" 40 | cancel_if_outside_working_hours: "true" 41 | namespace: "zot" 42 | persistentvolumeclaim: "zot-zot-pvc" 43 | severity: "page" 44 | team: "honeybadger" 45 | topic: "managementcluster" 46 | exp_annotations: 47 | description: "The Zot PersistentVolume claimed by zot-zot-pvc in namespace zot is at least 80% full and projected to fill up soon." 48 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/zot/?CUSTOMER=&INSTALLATION=&CLUSTER= 49 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/shield/alerting-rules/cert-manager.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - cert-manager.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}' 9 | values: "0+0x60" 10 | alert_rule_test: 11 | - alertname: CertManagerDown 12 | eval_time: 15m 13 | exp_alerts: 14 | - exp_labels: 15 | alertname: CertManagerDown 16 | area: platform 17 | cancel_if_kubelet_down: "true" 18 | cancel_if_outside_working_hours: "true" 19 | cluster_id: 12345 20 | cluster_type: workload_cluster 21 | container: cert-manager 22 | customer: giantswarm 23 | instance: 10.0.0.0:1234 24 | ip: 10.0.0.0 25 | job: 12345-prometheus/workload-12345/0 26 | namespace: kube-system 27 | organization: giantswarm 28 | pod: cert-manager-controller-7fcc585578-gnprd 29 | provider: capa 30 | installation: golem 31 | service_priority: highest 32 | severity: page 33 | team: shield 34 | topic: cert-manager 35 | exp_annotations: 36 | description: "cert-manager in namespace kube-system is down." 
37 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/cert-manager-down/ 38 | - interval: 1m 39 | input_series: 40 | - series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}' 41 | values: "1+0x60" 42 | alert_rule_test: 43 | - alertname: CertManagerDown 44 | eval_time: 15m 45 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: external-dns.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: external-dns 12 | rules: 13 | - alert: ExternalDNSCantAccessRegistry 14 | annotations: 15 | description: '{{`external-dns in namespace {{ $labels.namespace }} can''t access registry (cloud service provider DNS service).`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/external-dns-cant-access-registry/ 17 | expr: rate(external_dns_registry_errors_total{provider=~"capa|capz|eks"}[2m]) > 0 18 | for: 15m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: page 23 | team: cabbage 24 | topic: external-dns 25 | - alert: ExternalDNSCantAccessSource 26 | annotations: 27 | description: '{{`external-dns in namespace {{ $labels.namespace }} can''t access source (Service or Ingress resource).`}}' 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/external-dns-cant-access-source/ 29 | expr: rate(external_dns_source_errors_total{provider=~"capa|capz|eks"}[2m]) > 0 30 | for: 15m 31 | labels: 32 | area: platform 33 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 34 | severity: page 35 | team: cabbage 36 | topic: external-dns 37 | - alert: ExternalDNSDown 38 | annotations: 39 | description: '{{`external-dns in namespace {{ $labels.namespace }} is down.`}}' 40 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/external-dns-down/ 41 | expr: label_replace(up{container="external-dns", provider=~"capa|capz|eks"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 42 | for: 15m 43 | labels: 44 | area: platform 45 | cancel_if_outside_working_hours: "true" 46 | cancel_if_kubelet_down: "true" 47 | severity: page 48 | team: cabbage 49 | topic: external-dns 50 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/silence-operator.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . 
| nindent 4 }} 6 | name: silence-operator 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: silence-operator 11 | rules: 12 | - alert: "SilenceOperatorReconcileErrors" 13 | annotations: 14 | description: '{{`silence-operator controller {{ $labels.controller }} too many reconcile errors.`}}' 15 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/operator-not-reconciling/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 16 | expr: | 17 | avg_over_time(operatorkit_controller_errors_total{job="monitoring/silence-operator", cluster_type="management_cluster"}[20m]) > 0 18 | for: 1h 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | installation: {{ .Values.managementCluster.name }} 23 | severity: page 24 | team: atlas 25 | topic: observability 26 | - alert: SilenceOperatorSyncJobHasNotBeenScheduledForTooLong 27 | annotations: 28 | description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}' 29 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/job-has-not-been-scheduled-for-too-long/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 30 | # This alert triggers when the silence operator sync job did not schedule for more than 1 day 31 | # or if the job did not run successfully at least once in the last day 32 | expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync", cluster_type="management_cluster"}) > 86400 33 | or count by (cronjob, cluster_id, installation, namespace, provider, pipeline) (label_replace(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync-.+", cluster_type="management_cluster"}[1d]), "cronjob", "silence-operator-sync", "job_name", "silence-operator-sync-.+") == 1) == 0 34 | labels: 35 | area: platform 36 | severity: page 37 | team: atlas 38 | topic: managementcluster 39 | cancel_if_outside_working_hours: "true" 40 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/keda.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: keda.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: Keda 12 | rules: 13 | - alert: KedaDown 14 | annotations: 15 | description: 'Keda is down.' 
16 | expr: count by (cluster_id, installation, provider, pipeline) (up{container=~"keda-.*"} == 0) > 0 17 | for: 10m 18 | labels: 19 | area: platform 20 | cancel_if_cluster_control_plane_unhealthy: "true" 21 | cancel_if_outside_working_hours: "true" 22 | severity: notify 23 | team: atlas 24 | topic: autoscaling 25 | - alert: KedaScaledObjectErrors 26 | annotations: 27 | description: '{{`Errors detected in scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace}}.`}}' 28 | expr: increase(keda_scaled_object_errors[10m])> 0 29 | for: 15m 30 | labels: 31 | area: platform 32 | cancel_if_cluster_control_plane_unhealthy: "true" 33 | cancel_if_outside_working_hours: "true" 34 | severity: notify 35 | team: atlas 36 | topic: autoscaling 37 | - alert: KedaWebhookScaledObjectValidationErrors 38 | annotations: 39 | description: '{{`Validation errors detected in webhook for scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace}}.`}}' 40 | expr: increase(keda_webhook_scaled_object_validation_errors[10m]) > 0 41 | for: 15m 42 | labels: 43 | area: platform 44 | cancel_if_cluster_control_plane_unhealthy: "true" 45 | cancel_if_outside_working_hours: "true" 46 | severity: notify 47 | team: atlas 48 | topic: autoscaling 49 | - alert: KedaScalerErrors 50 | annotations: 51 | description: '{{`Errors detected in scaler {{ $labels.scaler }} for scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace}}.`}}' 52 | expr: increase(keda_scaler_errors[10m]) > 0 53 | for: 15m 54 | labels: 55 | area: platform 56 | cancel_if_cluster_control_plane_unhealthy: "true" 57 | cancel_if_outside_working_hours: "true" 58 | severity: notify 59 | team: atlas 60 | topic: autoscaling 61 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/pods.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - pods.rules.yml 4 | 5 | tests: 6 | # PodsUnschedulable 7 | - interval: 1m 8 | input_series: 9 | # All is good for 1h, 10 | # Then every hour we have a new pod unschedulable 11 | - series: 'kube_pod_status_unschedulable{app="kube-state-metrics", cluster_id="wc01", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", namespace="kube-system", pipeline="stable", pod="alloy-logs-1", provider="capa", region="us-east-1"}' 12 | values: "_x60 1x1000" 13 | - series: 'kube_pod_status_unschedulable{app="kube-state-metrics", cluster_id="wc01", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", namespace="kube-system", pipeline="stable", pod="alloy-metrics-1", provider="capa", region="us-east-1"}' 14 | values: "_x120 1x1000" 15 | - series: 'kube_pod_status_unschedulable{app="kube-state-metrics", cluster_id="wc01", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", namespace="kube-system", pipeline="stable", pod="alloy-metrics-2", provider="capa", region="us-east-1"}' 16 | values: "_x180 1x1000" 17 | alert_rule_test: 18 | - alertname: PodsUnschedulable 19 | eval_time: 10m 20 | - alertname: PodsUnschedulable 21 | eval_time: 50m 22 | - alertname: PodsUnschedulable 23 | eval_time: 90m 24 | - alertname: PodsUnschedulable 25 | eval_time: 150m 26 | exp_alerts: 27 | - exp_labels: 28 | area: "kaas" 29 | cancel_if_outside_working_hours: "true" 30 | cluster_id: "wc01" 31 | cluster_type: "workload_cluster" 32 | customer: "giantswarm" 33 | installation: 
"testinstall" 34 | inhibit_cluster_broken: "true" 35 | pipeline: "stable" 36 | provider: "capa" 37 | region: "us-east-1" 38 | severity: "page" 39 | team: "phoenix" 40 | topic: "workloadcluster" 41 | exp_annotations: 42 | __dashboardUid__: "unschedulable-pods" 43 | dashboardQueryParams: "orgId=1&var-namespace=kube-system&var-cluster=wc01" 44 | description: 'Cluster wc01 has unschedulable kube-system pods.' 45 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/validate-cluster-health/ 46 | -------------------------------------------------------------------------------- /helm/prometheus-rules/values.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "type": "object", 4 | "properties": { 5 | "Installation": { 6 | "type": "object", 7 | "properties": { 8 | "V1": { 9 | "type": "object", 10 | "properties": { 11 | "Guest": { 12 | "type": "object", 13 | "properties": { 14 | "Kubernetes": { 15 | "type": "object", 16 | "properties": { 17 | "IngressController": { 18 | "type": "object", 19 | "properties": { 20 | "BaseDomain": { 21 | "type": "string" 22 | } 23 | } 24 | } 25 | } 26 | } 27 | } 28 | } 29 | } 30 | } 31 | } 32 | }, 33 | "managementCluster": { 34 | "type": "object", 35 | "properties": { 36 | "customer": { 37 | "type": "string" 38 | }, 39 | "name": { 40 | "type": "string" 41 | }, 42 | "pipeline": { 43 | "type": "string" 44 | }, 45 | "provider": { 46 | "type": "object", 47 | "properties": { 48 | "flavor": { 49 | "type": "string" 50 | }, 51 | "kind": { 52 | "type": "string" 53 | }, 54 | "region": { 55 | "type": "string" 56 | } 57 | } 58 | } 59 | } 60 | }, 61 | "name": { 62 | "type": "string" 63 | }, 64 | "namespace": { 65 | "type": "string" 66 | }, 67 | "serviceType": { 68 | "type": "string" 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/shield/alerting-rules/cert-manager.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: cert-manager.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: cert-manager 12 | rules: 13 | - alert: CertManagerPodHighMemoryUsage 14 | annotations: 15 | description: |- 16 | {{`High memory usage ({{ $value }}) for container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}. 17 | If memory usage value is equal to memory limit value then it is likely the pod will be evicted. 18 | If no limits are set then the pod will burst. 19 | `}} 20 | expr: (sum by (cluster_id, installation, pipeline, provider, pod, namespace, container) (container_memory_working_set_bytes{container=~"(cert-manager|cert-manager-app-controller)"}) / 1024 / 1024 / 1024) >= 0.85 21 | for: 10m 22 | labels: 23 | area: platform 24 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} 25 | severity: notify 26 | team: shield 27 | topic: observability 28 | - alert: CertManagerDown 29 | annotations: 30 | description: '{{`cert-manager in namespace {{ $labels.namespace }} is down.`}}' 31 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/cert-manager-down/ 32 | expr: label_replace(up{container=~"cert-manager(-app-controller)?"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 33 | for: 15m 34 | labels: 35 | area: platform 36 | cancel_if_outside_working_hours: "true" 37 | cancel_if_kubelet_down: "true" 38 | severity: page 39 | team: shield 40 | topic: cert-manager 41 | - alert: CertManagerTooManyCertificateRequests 42 | annotations: 43 | description: '{{`There are too many CertificateRequests in cluster {{ $labels.cluster_id }}.`}}' 44 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/cert-requests-too-many/ 45 | expr: sum by (cluster_id, installation, pipeline, provider) (etcd_kubernetes_resources_count{kind="certificaterequests.cert-manager.io"}) > 10000 46 | for: 15m 47 | labels: 48 | area: platform 49 | cancel_if_outside_working_hours: "true" 50 | severity: notify 51 | team: shield 52 | topic: cert-manager 53 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/honeybadger/alerting-rules/crsync.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - crsync.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'kube_deployment_status_replicas_available{cluster_type="workload_cluster", installation="gazelle", cluster_id="operations", namespace="crsync", deployment="crsync-giantswarm-azurecr-io"}' 9 | values: "1x5 0x9 1x5 0x10" 10 | alert_rule_test: 11 | - alertname: CrsyncDeploymentNotSatisfied 12 | eval_time: 32m 13 | exp_alerts: 14 | - exp_labels: 15 | alertname: "CrsyncDeploymentNotSatisfied" 16 | area: platform 17 | cancel_if_outside_working_hours: "true" 18 | cluster_id: "operations" 19 | cluster_type: "workload_cluster" 20 | deployment: "crsync-giantswarm-azurecr-io" 21 | installation: "gazelle" 22 | namespace: "crsync" 23 | severity: "page" 24 | team: "honeybadger" 25 | topic: "releng" 26 | exp_annotations: 27 | description: "CrSync deployment crsync-giantswarm-azurecr-io is not satisfied in gazelle / operations at the crsync namespace." 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION=gazelle&CLUSTER=operations&NAMESPACE=crsync&KIND=deployment&NAME=crsync-giantswarm-azurecr-io 29 | - interval: 1m 30 | input_series: 31 | - series: 'crsync_sync_tags_total{registry="quay.io", cluster_id="example", repository="giantswarm/example"}' 32 | values: "100x60" 33 | - series: 'crsync_sync_tags_total{registry="docker.io", cluster_id="example", repository="giantswarm/example"}' 34 | values: "95x60" 35 | alert_rule_test: 36 | - alertname: CrsyncTooManyTagsMissing 37 | eval_time: 60m 38 | exp_alerts: 39 | - exp_labels: 40 | alertname: "CrsyncTooManyTagsMissing" 41 | area: platform 42 | cancel_if_outside_working_hours: "true" 43 | cluster_id: "example" 44 | registry: "quay.io" 45 | repository: "giantswarm/example" 46 | severity: "page" 47 | team: "honeybadger" 48 | topic: "releng" 49 | exp_annotations: 50 | description: "Too many tags are not synchronised to registry mirrors." 
51 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/crsync-too-many-tags-missing/ 52 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/capi-machinepool.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - capi-machinepool.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'capi_machinepool_status_phase{phase="Failed", cluster_id="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' 8 | values: "0+3x75" 9 | - series: 'capi_cluster_info{cluster_id="clippaxy", provider="capa"}' 10 | values: "1+0x75" 11 | - series: 'capi_machinepool_annotation_paused{paused_value="true",cluster_id="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' 12 | values: "0+1x75" 13 | - series: 'capi_cluster_info{cluster_id="grumpy", provider="capa"}' 14 | values: "1+0x75" 15 | alert_rule_test: 16 | - alertname: MachinePoolIsNotHealthy 17 | eval_time: 25m 18 | exp_alerts: 19 | - exp_labels: 20 | area: kaas 21 | cancel_if_monitoring_agent_down: "true" 22 | cancel_if_outside_working_hours: "true" 23 | provider: capa 24 | severity: page 25 | phase: Failed 26 | team: phoenix 27 | topic: managementcluster 28 | cluster_id: clippaxy 29 | name: clippaxy-def00 30 | exported_namespace: giantswarm 31 | exp_annotations: 32 | description: "The clusters clippaxy machinepool giantswarm/clippaxy-def00 is not healthy." 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinepool/ 34 | __dashboardUid__: bdi7iswg81czkcasd 35 | dashboardQueryParams: "orgId=2" 36 | - alertname: MachinePoolPaused 37 | eval_time: 75m 38 | exp_alerts: 39 | - exp_labels: 40 | area: kaas 41 | cancel_if_monitoring_agent_down: "true" 42 | cancel_if_outside_working_hours: "true" 43 | provider: capa 44 | severity: notify 45 | team: phoenix 46 | topic: managementcluster 47 | cluster_id: grumpy 48 | name: grumpy-72r5c 49 | exported_namespace: giantswarm 50 | paused_value: "true" 51 | exp_annotations: 52 | description: "The clusters grumpy machinepool giantswarm/grumpy-72r5c is paused." 
53 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinepool/ 54 | __dashboardUid__: bdi7iswg81czkcasd 55 | dashboardQueryParams: "orgId=2" 56 | -------------------------------------------------------------------------------- /renovate.json5: -------------------------------------------------------------------------------- 1 | { 2 | // Base config - https://github.com/giantswarm/renovate-presets/blob/main/default.json5 3 | "extends": [ 4 | "github>giantswarm/renovate-presets:default.json5" 5 | ], 6 | "customManagers": [ 7 | { 8 | "customType": "regex", 9 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 10 | "matchStrings": [ 11 | "ARCHITECT_VERSION=\"(?.*?)\"" 12 | ], 13 | "depNameTemplate": "giantswarm/architect", 14 | "datasourceTemplate": "github-releases", 15 | "versioningTemplate": "semver" 16 | }, 17 | { 18 | "customType": "regex", 19 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 20 | "matchStrings": [ 21 | "HELM_VERSION=\"(?.*?)\"" 22 | ], 23 | "depNameTemplate": "helm/helm", 24 | "datasourceTemplate": "github-releases", 25 | "versioningTemplate": "semver" 26 | }, 27 | { 28 | "customType": "regex", 29 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 30 | "matchStrings": [ 31 | "JQ_VERSION=\"(?.*?)\"" 32 | ], 33 | "depNameTemplate": "jqlang/jq", 34 | "datasourceTemplate": "github-releases", 35 | "versioningTemplate": "semver" 36 | }, 37 | { 38 | "customType": "regex", 39 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 40 | "matchStrings": [ 41 | "LOKITOOL_VERSION=\"(?.*?)\"" 42 | ], 43 | "depNameTemplate": "grafana/loki", 44 | "datasourceTemplate": "github-releases", 45 | "versioningTemplate": "semver" 46 | }, 47 | { 48 | "customType": "regex", 49 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 50 | "matchStrings": [ 51 | "PINT_VERSION=\"(?.*?)\"" 52 | ], 53 | "depNameTemplate": "cloudflare/pint", 54 | "datasourceTemplate": "github-releases", 55 | "versioningTemplate": "semver" 56 | }, 57 | { 58 | "customType": "regex", 59 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 60 | "matchStrings": [ 61 | "PROMETHEUS_VERSION=\"(?.*?)\"" 62 | ], 63 | "depNameTemplate": "prometheus/prometheus", 64 | "datasourceTemplate": "github-releases", 65 | "versioningTemplate": "semver" 66 | }, 67 | { 68 | "customType": "regex", 69 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 70 | "matchStrings": [ 71 | "YQ_VERSION=\"(?.*?)\"" 72 | ], 73 | "depNameTemplate": "mikefarah/yq", 74 | "datasourceTemplate": "github-releases", 75 | "versioningTemplate": "semver" 76 | } 77 | ] 78 | } 79 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: dex.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: dex 12 | rules: 13 | - alert: DexErrorRateHigh 14 | annotations: 15 | description: '{{`Dex running on {{ $labels.cluster_id }} is reporting an increased error rate.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dex-error-rate-high/ 17 | expr: sum(increase(http_requests_total{job="dex", handler!="/token", code=~"^[4]..$|[5]..$", cluster_type="management_cluster"}[5m])) by (cluster_id, installation, pipeline, provider) > 10 18 | for: 30m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: shield 24 | topic: dex 25 | - alert: DexSecretExpired 26 | annotations: 27 | description: '{{`dex-operator failed to renew secret of {{ $labels.app_registration_name }} for {{ $labels.app_owner }} on provider {{ $labels.provider_type }}.`}}' 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dex-operator/ 29 | expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation, cluster_id, pipeline, provider) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster", provider_type!="github"}) - time() < 60*60*12 30 | for: 30m 31 | labels: 32 | area: platform 33 | cancel_if_outside_working_hours: "true" 34 | severity: page 35 | team: shield 36 | topic: dex 37 | - alert: ManagementClusterDexAppMissing 38 | annotations: 39 | description: '{{`dex-operator did not register a dex-app in giantswarm namespace.`}}' 40 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dex-operator/ 41 | expr: absent(dex_operator_idp_secret_expiry_time{app_namespace="giantswarm", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) 42 | for: 30m 43 | labels: 44 | area: platform 45 | cancel_if_outside_working_hours: "true" 46 | cancel_if_metrics_broken: "true" 47 | severity: page 48 | team: shield 49 | topic: dex 50 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/node-exporter.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: node-exporter.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: node-exporter 12 | rules: 13 | - alert: NodeExporterCollectorFailed 14 | annotations: 15 | description: '{{`NodeExporter Collector {{ $labels.collector }} on {{ $labels.instance }} is failed.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/node-exporter-device-error/ 17 | # TODO(@giantswarm/team-atlas): the namespace filter should be removed when this completed https://github.com/giantswarm/roadmap/issues/3791, see https://github.com/giantswarm/prometheus-rules/pull/1491 18 | expr: node_scrape_collector_success{collector!~"conntrack|bonding|hwmon|powersupplyclass|mdadm|nfs|nfsd|tapestats|fibrechannel|nvme|watchdog", namespace="kube-system"} == 0 19 | for: 5m 20 | labels: 21 | area: kaas 22 | cancel_if_outside_working_hours: "true" 23 | severity: page 24 | team: tenet 25 | topic: observability 26 | - name: resource-usage 27 | rules: 28 | # IncorrectResourceUsageData alert detects if the data used in the Grafana Cloud Resource Usage dashboard is incorrect by comparing the dashboard data against data from the kubelet. 29 | - alert: IncorrectResourceUsageData 30 | annotations: 31 | description: '{{`Data used in the Grafana Cloud Resource Usage dashboard is incorrect for cluster {{ $labels.cluster_id }}.`}}' 32 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/resource-usage-dashboard/ 33 | expr: | 34 | quantile_over_time(0.9, aggregation:node:cpu_cores_total[120m:30m]) / on(cluster_id, cluster_type, customer, installation, pipeline, provider, region) quantile_over_time(0.9, (sum(machine_cpu_cores)by(cluster_id, cluster_type, customer, installation, pipeline, provider, region))[120m:30m]) < 0.9 35 | or 36 | quantile_over_time(0.9, aggregation:node:memory_memtotal_bytes_total[120m:30m]) / on(cluster_id, cluster_type, customer, installation, pipeline, provider, region) quantile_over_time(0.9, (sum(machine_memory_bytes)by(cluster_id, cluster_type, customer, installation, pipeline, provider, region))[120m:30m]) < 0.9 37 | for: 1h 38 | labels: 39 | area: kaas 40 | cancel_if_outside_working_hours: "true" 41 | severity: page 42 | team: tenet 43 | topic: observability 44 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/capi-machinedeployment.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - capi-machinedeployment.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'capi_machinedeployment_status_phase{phase="Failed", cluster_id="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' 8 | values: "0+3x75" 9 | - series: 'capi_cluster_info{cluster_id="clippaxy", provider="capa"}' 10 | values: "1+0x75" 11 | - series: 'capi_machinedeployment_annotation_paused{paused_value="true",cluster_id="grumpy", name="grumpy-def99", exported_namespace="giantswarm"}' 12 | values: "0+1x75" 13 | - series: 'capi_cluster_info{cluster_id="grumpy", provider="capa"}' 14 | values: "1+0x75" 15 | alert_rule_test: 16 | - alertname: MachineDeploymentIsNotHealthy 17 | eval_time: 25m 18 | exp_alerts: 19 | - exp_labels: 20 | area: kaas 21 | cancel_if_monitoring_agent_down: "true" 22 | cancel_if_outside_working_hours: "true" 23 | provider: capa 24 | severity: notify 25 | phase: Failed 26 | team: phoenix 27 | topic: managementcluster 28 | cluster_id: clippaxy 29 | 
name: clippaxy-def00 30 | exported_namespace: giantswarm 31 | exp_annotations: 32 | description: "The clusters clippaxy machinedeployment giantswarm/clippaxy-def00 is not healthy." 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinedeployment/ 34 | __dashboardUid__: bdi7iswg81czkcasd 35 | dashboardQueryParams: "orgId=2" 36 | - alertname: MachineDeploymentPaused 37 | eval_time: 75m 38 | exp_alerts: 39 | - exp_labels: 40 | area: kaas 41 | cancel_if_monitoring_agent_down: "true" 42 | cancel_if_outside_working_hours: "true" 43 | provider: capa 44 | severity: notify 45 | team: phoenix 46 | topic: managementcluster 47 | cluster_id: grumpy 48 | name: grumpy-def99 49 | exported_namespace: giantswarm 50 | paused_value: "true" 51 | exp_annotations: 52 | description: "The clusters grumpy machinedeployment giantswarm/grumpy-def99 is paused." 53 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinedeployment/ 54 | __dashboardUid__: bdi7iswg81czkcasd 55 | dashboardQueryParams: "orgId=2" 56 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-machine.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . | nindent 4}} 5 | name: capi-machine.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-machine 10 | rules: 11 | - alert: MachineUnhealthyPhase 12 | annotations: 13 | description: |- 14 | {{`Machine {{ $labels.exported_namespace}}/{{ $labels.name }} stuck in phase {{ $labels.phase }} for more than 30 minutes.`}} 15 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machine/ 16 | __dashboardUid__: bdi7iswg81czkcasd 17 | dashboardQueryParams: "orgId=2" 18 | expr: |- 19 | ( 20 | capi_machine_status_phase{phase!="Running", name!~".*bastion.*"} 21 | * on(cluster_id) group_left(provider) 22 | sum( 23 | label_replace( 24 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 25 | ) 26 | ) by (cluster_id, provider) 27 | ) > 0 28 | for: 30m 29 | labels: 30 | area: kaas 31 | cancel_if_monitoring_agent_down: "true" 32 | cancel_if_outside_working_hours: "true" 33 | severity: page 34 | team: {{ include "providerTeam" . }} 35 | topic: managementcluster 36 | - alert: MachinePaused 37 | expr: |- 38 | ( 39 | capi_machine_annotation_paused{paused_value="true"} 40 | * on(cluster_id) group_left(provider) 41 | sum( 42 | label_replace( 43 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 44 | ) 45 | ) by (cluster_id, provider) 46 | ) > 0 47 | for: 1h 48 | labels: 49 | area: kaas 50 | cancel_if_monitoring_agent_down: "true" 51 | cancel_if_outside_working_hours: "true" 52 | severity: notify 53 | team: {{ include "providerTeam" . 
}} 54 | topic: managementcluster 55 | annotations: 56 | description: |- 57 | {{`Machine {{ $labels.exported_namespace}}/{{ $labels.name }} is paused.`}} 58 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machine/ 59 | __dashboardUid__: bdi7iswg81czkcasd 60 | dashboardQueryParams: "orgId=2" 61 | -------------------------------------------------------------------------------- /test/tests/providers/capz/kaas/phoenix/alerting-rules/dns-operator-azure.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - dns-operator-azure.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'dns_operator_azure_zone_info{controller="dns-operator-azure",resource_group="425bdf54",subscription_id="09be0ac8-38d9-4fe1-aa72-4ce2e8a084d2",tenant_id="4e4e320b-cf45-4fd4-9dd3-ec0046779035",zone="425bdf54.azuretest.gigantic.io",installation="puppy",type="public"}' 8 | values: "1+0x60" 9 | - series: 'capi_cluster_status_phase{name="425bdf54", exported_namespace="org-83dd715d", phase="Provisioned", installation="puppy"}' 10 | values: "1+0x60" 11 | - series: 'capi_cluster_status_phase{name="8e8225b5", exported_namespace="org-31f75bf9", phase="Provisioned", installation="puppy"}' 12 | values: "1+0x60" 13 | - series: 'dns_operator_azure_api_request_errors_total{controller="dns-operator-azure",method="recordSets.CreateOrUpdate",installation="puppy"}' 14 | values: "0+0x10 1+1x20" 15 | - series: 'dns_operator_azure_api_request_errors_total{controller="dns-operator-azure",method="zones.Get",installation="puppy"}' 16 | values: "0+0x10 1+1x10 0+0x10" 17 | alert_rule_test: 18 | - alertname: ClusterDNSZoneMissing 19 | eval_time: 30m 20 | exp_alerts: 21 | - exp_labels: 22 | area: kaas 23 | cancel_if_outside_working_hours: "false" 24 | severity: notify 25 | team: phoenix 26 | topic: managementcluster 27 | phase: Provisioned 28 | exported_namespace: org-31f75bf9 29 | installation: puppy 30 | name: 8e8225b5 31 | exp_annotations: 32 | description: "No DNS-zone for cluster org-31f75bf9/8e8225b5 got created yet. Check dns-operator-azure logs in installation/puppy." 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dns-operator-azure/ 34 | - alertname: AzureDNSOperatorAPIErrorRate 35 | eval_time: 30m 36 | exp_alerts: 37 | - exp_labels: 38 | area: kaas 39 | cancel_if_outside_working_hours: "false" 40 | severity: notify 41 | team: phoenix 42 | topic: managementcluster 43 | installation: puppy 44 | method: recordSets.CreateOrUpdate 45 | exp_annotations: 46 | description: "Error rate for recordSets.CreateOrUpdate is high. Check dns-operator-azure logs in installation/puppy." 47 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dns-operator-azure/ 48 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-machinepool.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . 
| nindent 4}} 5 | name: capi-machinepool.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-machinepool 10 | rules: 11 | - alert: MachinePoolIsNotHealthy 12 | expr: |- 13 | ( 14 | capi_machinepool_status_phase{phase="Failed"} 15 | * on(cluster_id) group_left(provider) 16 | sum( 17 | label_replace( 18 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 19 | ) 20 | ) by (cluster_id, provider) 21 | ) > 0 22 | for: 15m 23 | labels: 24 | area: kaas 25 | cancel_if_monitoring_agent_down: "true" 26 | cancel_if_outside_working_hours: "true" 27 | severity: page 28 | team: {{ include "providerTeam" . }} 29 | topic: managementcluster 30 | annotations: 31 | description: |- 32 | {{`The clusters {{ $labels.cluster_id }} machinepool {{ $labels.exported_namespace }}/{{ $labels.name }} is not healthy.`}} 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinepool/ 34 | __dashboardUid__: bdi7iswg81czkcasd 35 | dashboardQueryParams: "orgId=2" 36 | - alert: MachinePoolPaused 37 | expr: |- 38 | ( 39 | capi_machinepool_annotation_paused{paused_value="true"} 40 | * on(cluster_id) group_left(provider) 41 | sum( 42 | label_replace( 43 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 44 | ) 45 | ) by (cluster_id, provider) 46 | ) > 0 47 | for: 1h 48 | labels: 49 | area: kaas 50 | cancel_if_monitoring_agent_down: "true" 51 | cancel_if_outside_working_hours: "true" 52 | severity: notify 53 | team: {{ include "providerTeam" . }} 54 | topic: managementcluster 55 | annotations: 56 | description: |- 57 | {{`The clusters {{ $labels.cluster_id }} machinepool {{ $labels.exported_namespace }}/{{ $labels.name }} is paused.`}} 58 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinepool/ 59 | __dashboardUid__: bdi7iswg81czkcasd 60 | dashboardQueryParams: "orgId=2" 61 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/capi-machine.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - capi-machine.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'capi_machine_status_phase{cluster_id="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Running"}' 8 | values: "1+0x10 0+0x35" 9 | - series: 'capi_machine_status_phase{cluster_id="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Failed"}' 10 | values: "0+0x10 1+0x35" 11 | - series: 'capi_cluster_info{cluster_id="clippaxy", provider="capa"}' 12 | values: "1+0x45" 13 | - series: 'capi_machine_annotation_paused{paused_value="true",cluster_id="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' 14 | values: "0+1x75" 15 | - series: 'capi_cluster_info{cluster_id="grumpy", provider="capa"}' 16 | values: "1+0x75" 17 | alert_rule_test: 18 | - alertname: MachineUnhealthyPhase 19 | eval_time: 45m 20 | exp_alerts: 21 | - exp_labels: 22 | area: kaas 23 | cancel_if_monitoring_agent_down: "true" 24 | cancel_if_outside_working_hours: "true" 25 | provider: capa 26 | severity: page 27 | team: phoenix 28 | topic: managementcluster 29 | cluster_id: clippaxy 30 | name: clippaxy-72jq5 31 | exported_namespace: giantswarm 32 | phase: Failed 33 | exp_annotations: 34 | description: "Machine giantswarm/clippaxy-72jq5 stuck in phase Failed for more than 30 minutes." 
35 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machine/ 36 | __dashboardUid__: bdi7iswg81czkcasd 37 | dashboardQueryParams: "orgId=2" 38 | - alertname: MachinePaused 39 | eval_time: 75m 40 | exp_alerts: 41 | - exp_labels: 42 | area: kaas 43 | cancel_if_monitoring_agent_down: "true" 44 | cancel_if_outside_working_hours: "true" 45 | provider: capa 46 | severity: notify 47 | team: phoenix 48 | topic: managementcluster 49 | cluster_id: grumpy 50 | name: grumpy-72r5c 51 | exported_namespace: giantswarm 52 | paused_value: "true" 53 | exp_annotations: 54 | description: "Machine giantswarm/grumpy-72r5c is paused." 55 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machine/ 56 | __dashboardUid__: bdi7iswg81czkcasd 57 | dashboardQueryParams: "orgId=2" 58 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-machinedeployment.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . | nindent 4}} 5 | name: capi-machinedeployment.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-machinedeployment 10 | rules: 11 | - alert: MachineDeploymentIsNotHealthy 12 | expr: |- 13 | ( 14 | capi_machinedeployment_status_phase{phase="Failed"} 15 | * on(cluster_id) group_left(provider) 16 | sum( 17 | label_replace( 18 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 19 | ) 20 | ) by (cluster_id, provider) 21 | ) > 0 22 | for: 15m 23 | labels: 24 | area: kaas 25 | cancel_if_monitoring_agent_down: "true" 26 | cancel_if_outside_working_hours: "true" 27 | severity: notify 28 | team: {{ include "providerTeam" . }} 29 | topic: managementcluster 30 | annotations: 31 | description: |- 32 | {{`The clusters {{$labels.cluster_id}} machinedeployment {{$labels.exported_namespace}}/{{$labels.name}} is not healthy.`}} 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinedeployment/ 34 | __dashboardUid__: bdi7iswg81czkcasd 35 | dashboardQueryParams: "orgId=2" 36 | - alert: MachineDeploymentPaused 37 | expr: |- 38 | ( 39 | capi_machinedeployment_annotation_paused{paused_value="true"} 40 | * on(cluster_id) group_left(provider) 41 | sum( 42 | label_replace( 43 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 44 | ) 45 | ) by (cluster_id, provider) 46 | ) > 0 47 | for: 1h 48 | labels: 49 | area: kaas 50 | cancel_if_monitoring_agent_down: "true" 51 | cancel_if_outside_working_hours: "true" 52 | severity: notify 53 | team: {{ include "providerTeam" . 
}} 54 | topic: managementcluster 55 | annotations: 56 | description: |- 57 | {{`The clusters {{$labels.cluster_id}} machinedeployment {{$labels.exported_namespace}}/{{$labels.name}} is paused.`}} 58 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinedeployment/ 59 | __dashboardUid__: bdi7iswg81czkcasd 60 | dashboardQueryParams: "orgId=2" 61 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/etcdbackup.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: etcdbackup.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: etcdbackup 12 | rules: 13 | - alert: ETCDBackupJobFailedOrStuck 14 | annotations: 15 | description: '{{`Job {{ $labels.job }} failed or has not been completed for more than 30 minutes.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/etcd-backup-failed/ 17 | expr: kube_job_failed{cluster_type="management_cluster",condition="true",job=~"etcd-backup.+"} == 1 or kube_pod_status_phase{cluster_type="management_cluster",phase="Pending",pod=~"etcd-backup.+"} == 1 or kube_job_status_succeeded{cluster_type="management_cluster",job=~"etcd-backup.+"} == 0 18 | for: 30m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: tenet 24 | topic: etcd-backup 25 | - alert: LatestETCDBackup2DaysOld 26 | annotations: 27 | description: '{{`Latest successful ETCD backup for {{ $labels.cluster_id }}/{{ $labels.tenant_cluster_id }} was more than 48h ago.`}}' 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/etcd-backup-failed/ 29 | expr: count(label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")) by (cluster_id, installation, pipeline, provider, tenant_cluster_id) > 48 * 60 * 60 unless count((time() - etcd_backup_latest_success{tenant_cluster_id!="Control Plane"}) > 48 * 60 * 60) by (cluster_id, installation, pipeline, provider, tenant_cluster_id) 30 | for: 5m 31 | labels: 32 | area: kaas 33 | cancel_if_outside_working_hours: "true" 34 | severity: page 35 | team: tenet 36 | topic: etcd-backup 37 | - alert: ETCDBackupMetricsMissing 38 | annotations: 39 | description: '{{`ETCD backup metrics are missing`}}' 40 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/etcd-backup-metrics-missing/ 41 | expr: absent(etcd_backup_latest_attempt{cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) 42 | for: 12h 43 | labels: 44 | area: kaas 45 | cancel_if_outside_working_hours: "true" 46 | cancel_if_metrics_broken: "true" 47 | severity: page 48 | team: tenet 49 | topic: etcd-backup 50 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | orbs: 3 | architect: giantswarm/architect@6.11.0 4 | 5 | workflows: 6 | package-and-push-chart-on-tag: 7 | jobs: 8 | - architect/push-to-app-catalog: 9 | context: architect 10 | executor: 
app-build-suite 11 | name: app-catalog 12 | app_catalog: control-plane-catalog 13 | app_catalog_test: control-plane-test-catalog 14 | chart: prometheus-rules 15 | # Trigger job on git tag. 16 | filters: 17 | tags: 18 | only: /^v.*/ 19 | branches: 20 | ignore: 21 | - main 22 | - master 23 | 24 | - architect/push-to-app-collection: 25 | context: architect 26 | name: push-to-capa-app-collection 27 | app_name: prometheus-rules 28 | app_namespace: monitoring 29 | app_collection_repo: capa-app-collection 30 | requires: 31 | - app-catalog 32 | filters: 33 | branches: 34 | ignore: /.*/ 35 | tags: 36 | only: /^v.*/ 37 | 38 | - architect/push-to-app-collection: 39 | context: architect 40 | name: push-to-capz-app-collection 41 | app_name: prometheus-rules 42 | app_namespace: monitoring 43 | app_collection_repo: capz-app-collection 44 | requires: 45 | - app-catalog 46 | filters: 47 | branches: 48 | ignore: /.*/ 49 | tags: 50 | only: /^v.*/ 51 | 52 | - architect/push-to-app-collection: 53 | context: architect 54 | name: push-to-cloud-director-app-collection 55 | app_name: prometheus-rules 56 | app_namespace: monitoring 57 | app_collection_repo: cloud-director-app-collection 58 | requires: 59 | - app-catalog 60 | filters: 61 | branches: 62 | ignore: /.*/ 63 | tags: 64 | only: /^v.*/ 65 | 66 | - architect/push-to-app-collection: 67 | context: architect 68 | name: vsphere-app-collection 69 | app_name: prometheus-rules 70 | app_namespace: monitoring 71 | app_collection_repo: vsphere-app-collection 72 | requires: 73 | - app-catalog 74 | filters: 75 | branches: 76 | ignore: /.*/ 77 | tags: 78 | only: /^v.*/ 79 | 80 | - architect/push-to-app-collection: 81 | context: architect 82 | name: proxmox-app-collection 83 | app_name: prometheus-rules 84 | app_namespace: monitoring 85 | app_collection_repo: proxmox-app-collection 86 | requires: 87 | - app-catalog 88 | filters: 89 | branches: 90 | ignore: /.*/ 91 | tags: 92 | only: /^v.*/ 93 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/tracing-pipeline.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: tracing-pipeline.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: tracing-pipeline 11 | rules: 12 | # This alert will trigger if the failure rate of spans sent by the OTEL exporter exceeds a defined threshold (e.g., 10%). 13 | - alert: OTLPTraceForwardingErrors 14 | annotations: 15 | __dashboardUid__: 9b6d37c8603e19e8922133984faad93d 16 | dashboardQueryParams: "orgId=2" 17 | summary: Alloy OTLP exporter is failing to send spans.
18 | description: '{{`The Alloy OTLP exporter has failed to send {{ printf "%.1f" $value }}% of spans over the last 5 minutes.`}}' 19 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/tracing-pipeline/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 20 | expr: |- 21 | ( 22 | rate(otelcol_exporter_send_failed_spans_total{job="alloy-events"}[5m]) 23 | / 24 | rate(otelcol_exporter_sent_spans_total{job="alloy-events"}[5m]) 25 | ) * 100 26 | >= 10 # Trigger if failure rate exceeds 10% 27 | for: 1h 28 | labels: 29 | area: platform 30 | severity: page 31 | team: atlas 32 | topic: observability 33 | cancel_if_outside_working_hours: "true" 34 | # This alert triggers if the Alloy OTLP exporter fails to enqueue spans at a sustained rate exceeding 100 spans per second over 5 minutes, which could indicate upstream issues or resource constraints. 35 | - alert: OTLPExporterEnqueueFailures 36 | annotations: 37 | summary: Alloy OTLP exporter enqueue failures exceed 100 spans/second over 5 minutes 38 | description: '{{`The Alloy OTLP exporter has failed to enqueue more than 100 spans per second on average over the last 5 minutes, indicating potential upstream issues.`}}' 39 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/tracing-pipeline/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 40 | __dashboardUid__: 9b6d37c8603e19e8922133984faad93d 41 | dashboardQueryParams: "orgId=2" 42 | expr: rate(otelcol_exporter_enqueue_failed_spans_total{job="alloy-events"}[5m]) > 100 43 | for: 1h 44 | labels: 45 | area: platform 46 | severity: page 47 | team: atlas 48 | topic: observability 49 | cancel_if_outside_working_hours: "true" 50 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/atlas/alerting-rules/sloth.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - sloth.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | # For the first 60min: test with 1 pod: none, up, down 9 | - series: 'up{job="monitoring/sloth", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="capa", pipeline="testing"}' 10 | values: "_x20 1+0x20 0+0x20" 11 | alert_rule_test: 12 | - alertname: SlothDown 13 | eval_time: 10m 14 | - alertname: SlothDown 15 | eval_time: 30m 16 | - alertname: SlothDown 17 | eval_time: 50m 18 | exp_alerts: 19 | - exp_labels: 20 | area: platform 21 | cluster_id: gauss 22 | installation: gauss 23 | provider: capa 24 | pipeline: testing 25 | severity: page 26 | team: atlas 27 | topic: observability 28 | cancel_if_cluster_control_plane_unhealthy: "true" 29 | cancel_if_outside_working_hours: "true" 30 | exp_annotations: 31 | description: "Sloth is down." 
32 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/sloth-down/?INSTALLATION=gauss&CLUSTER=gauss 33 | - interval: 1m 34 | input_series: 35 | - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="monitoring", container="sloth", installation="gauss", cluster_id="gauss"}' 36 | values: "0+0x20 0+5x20 100+0x140" # 0 restarts after 20 minutes then we restart 5 times per minute for 20 minutes then we stop restarting for 140 minutes 37 | alert_rule_test: 38 | - alertname: SlothRestartingTooOften 39 | eval_time: 15m # should be OK after 15 minutes 40 | - alertname: SlothRestartingTooOften 41 | eval_time: 85m # After 85 minutes, should fire an alert for the t+85 error 42 | exp_alerts: 43 | - exp_labels: 44 | area: platform 45 | cancel_if_cluster_control_plane_unhealthy: "true" 46 | cancel_if_outside_working_hours: "true" 47 | cluster_id: gauss 48 | cluster_type: management_cluster 49 | container: sloth 50 | installation: gauss 51 | namespace: monitoring 52 | severity: page 53 | team: atlas 54 | topic: observability 55 | exp_annotations: 56 | description: Sloth is restarting too often. 57 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/sloth-down/?INSTALLATION=gauss&CLUSTER=gauss 58 | - alertname: SlothRestartingTooOften 59 | eval_time: 140m # After 140m minutes, all should be back to normal 60 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/apiserver.management-cluster.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: apiserver.management-cluster.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: apiserver 12 | rules: 13 | - alert: ManagementClusterAPIServerAdmissionWebhookErrors 14 | annotations: 15 | description: '{{`Kubernetes API Server {{ $labels.cluster_id }} having admission webhook errors.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/apiserver-admission-webhook-errors/ 17 | expr: label_replace(rate(apiserver_admission_webhook_rejection_count{cluster_type="management_cluster", error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]), "service", "$1", "name", "(.*)") > 1 18 | for: 15m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: page 23 | team: tenet 24 | topic: managementcluster 25 | - alert: ManagementClusterWebhookDurationExceedsTimeout 26 | annotations: 27 | description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/apiserver-admission-webhook-errors/ 29 | expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="management_cluster", name!="apps.app-admission-controller.giantswarm.io"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5 30 | for: 25m 31 | labels: 32 | area: kaas 33 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} 34 | severity: page 35 | team: tenet 36 | topic: managementcluster 37 | 38 | # Kyverno webhooks that may block critical objects 39 | - alert: ManagementClusterWebhookDurationExceedsTimeoutKyvernoCritical 40 | annotations: 41 | description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} takes very long or is timing out.`}}' 42 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/apiserver-admission-webhook-errors/ 43 | expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="management_cluster", name=~".*(kyverno.*fail).*"}[15m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 10 44 | for: 10m 45 | labels: 46 | area: kaas 47 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 48 | severity: page 49 | team: tenet 50 | topic: managementcluster 51 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/certificate.management-cluster.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: certificate.management-cluster.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: certificate.management-cluster 12 | rules: 13 | - alert: ManagementClusterCertificateIsMissing 14 | annotations: 15 | description: '{{`Cannot renew Certificate for Secret {{ $labels.exported_namespace }}/{{ $labels.certificatename }} on {{ $labels.cluster_id }} because it is missing.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/managed-app-cert-manager/missing-certificate-for-secret/ 17 | expr: | 18 | count( 19 | cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename=~"^capa-serving-cert$|^capi-serving-cert$|^capi-kubeadm-bootstrap-serving-cert$|^capi-kubeadm-control-plane-serving-cert$|^capv-serving-cert$|^capmox-serving-cert$|^caip-in-cluster-serving-cert$|^capvcd-serving-cert$|^capz-serving-cert$|^azureserviceoperator-serving-cert$|^aws-pod-identity-webhook$"} 20 | ) by (cluster_id, installation, pipeline, provider, certificatename, exported_namespace) 21 | unless 22 | count( 23 | label_replace( 24 | cert_exporter_certificate_cr_not_after{cluster_type="management_cluster", name=~"^capa-serving-cert$|^capi-serving-cert$|^capi-kubeadm-bootstrap-serving-cert$|^capi-kubeadm-control-plane-serving-cert$|^capv-serving-cert$|^capmox-serving-cert$|^caip-in-cluster-serving-cert$|^capvcd-serving-cert$|^capz-serving-cert$|^azureserviceoperator-serving-cert$|^aws-pod-identity-webhook$"}, 25 | "certificatename", 26 | "$1", 27 | "name", 28 | "(.*)" 29 | ) 30 | ) by (cluster_id, installation, pipeline, provider, certificatename, exported_namespace) 31 | for: 5m 32 | labels: 33 | area: kaas 34 | cancel_if_outside_working_hours: "true" 35 | severity: page 36 | team: {{ include "providerTeam" . 
}} 37 | topic: security 38 | - alert: ManagementClusterCertificateWillExpireInLessThanOneMonth 39 | annotations: 40 | description: '{{`Certificate {{ $labels.path }} on {{ $labels.node }} will expire in less than one month.`}}' 41 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/renew-certificates/ 42 | expr: (cert_exporter_not_after{cluster_type="management_cluster", path!="/etc/kubernetes/ssl/service-account-crt.pem"} - time()) < 4 * 7 * 24 * 60 * 60 43 | for: 5m 44 | labels: 45 | area: kaas 46 | cancel_if_outside_working_hours: "true" 47 | severity: page 48 | team: se 49 | topic: security 50 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-kubeadmcontrolplane.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . | nindent 4}} 5 | name: capi-kubeadmcontrolplane.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-kubeadmcontrolplane 10 | rules: 11 | - alert: KubeadmControlPlaneReplicasMismatch 12 | expr: |- 13 | ( 14 | (capi_kubeadmcontrolplane_spec_replicas != capi_kubeadmcontrolplane_status_replicas_ready) 15 | * on(cluster_id) group_left(provider) 16 | sum( 17 | label_replace( 18 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 19 | ) 20 | ) by (cluster_id, provider) 21 | ) 22 | # 90min at max 3 replicas results in maximum of 30 minutes per control-plane machine. 23 | for: 90m 24 | labels: 25 | area: kaas 26 | cancel_if_monitoring_agent_down: "true" 27 | cancel_if_outside_working_hours: "true" 28 | severity: notify 29 | team: {{ include "providerTeam" . }} 30 | topic: managementcluster 31 | annotations: 32 | description: |- 33 | {{`The clusters {{$labels.cluster_id}} kubeadmcontrolplane {{$labels.exported_namespace}}/{{$labels.name}} does not match the expected number of replicas for longer than 90 minutes.`}} 34 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-kubeadmcontrolplane/ 35 | __dashboardUid__: bdi7iswg81czkcasd 36 | dashboardQueryParams: "orgId=2" 37 | - alert: KubeadmControlPlanePaused 38 | expr: |- 39 | ( 40 | capi_kubeadmcontrolplane_annotation_paused{paused_value="true"} 41 | * on(cluster_id) group_left(provider) 42 | sum( 43 | label_replace( 44 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 45 | ) 46 | ) by (cluster_id, provider) 47 | ) > 0 48 | for: 1h 49 | labels: 50 | area: kaas 51 | cancel_if_monitoring_agent_down: "true" 52 | cancel_if_outside_working_hours: "true" 53 | severity: notify 54 | team: {{ include "providerTeam" . 
}} 55 | topic: managementcluster 56 | annotations: 57 | description: |- 58 | {{`The clusters {{$labels.cluster_id}} kubeadmcontrolplane {{$labels.exported_namespace}}/{{$labels.name}} is paused.`}} 59 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-kubeadmcontrolplane/ 60 | __dashboardUid__: bdi7iswg81czkcasd 61 | dashboardQueryParams: "orgId=2" 62 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/vertical-pod-autoscaler.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: vertical-pod-autoscaler.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: vertical-pod-autoscaler 11 | rules: 12 | - alert: VpaComponentTooManyRestarts 13 | annotations: 14 | description: This pages when one of the vpa's component has restarted too much over the last 10min. 15 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/vpa-component-too-many-restarts/ 16 | expr: | 17 | 1 - sum(increase(kube_pod_container_status_restarts_total{container=~"recommender|updater|admission-controller"}[10m])) by (container, cluster_id, cluster_type, customer, installation, pipeline, provider, region)/100 < 0.98 18 | or 19 | 1 - sum(increase(kube_pod_container_status_restarts_total{container="vertical-pod-autoscaler-app"}[10m])) by (container, cluster_id, cluster_type, customer, installation, pipeline, provider, region)/100 < 0.98 20 | for: 10m 21 | labels: 22 | area: kaas 23 | cancel_if_cluster_control_plane_unhealthy: "true" 24 | cancel_if_outside_working_hours: "true" 25 | severity: notify 26 | team: tenet 27 | topic: autoscaling 28 | - alert: FluxHelmReleaseFailed 29 | annotations: 30 | description: |- 31 | {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}} 32 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/flux-helmrelease-failed/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.exported_namespace }}&HELMRELEASE_NAME={{ $labels.name }}`}}' 33 | {{- $components := "(vertical-pod-autoscaler-crd)" }} 34 | expr: | 35 | ( 36 | label_replace(gotk_resource_info{ready="False", customresource_kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", exported_namespace!~"org-t-.*", name=~"(.+)-{{ $components }}"}, "cluster_id", "$1", "name", "(.+)-{{ $components }}") 37 | * on(cluster_id) group_left(provider) 38 | sum( 39 | label_replace( 40 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 41 | ) 42 | ) by (cluster_id, provider) 43 | ) > 0 44 | for: 20m 45 | labels: 46 | area: kaas 47 | cancel_if_outside_working_hours: "true" 48 | cancel_if_kube_state_metrics_down: "true" 49 | cancel_if_monitoring_agent_down: "true" 50 | severity: page 51 | team: {{ include "providerTeam" . 
}} 52 | topic: autoscaling 53 | namespace: |- 54 | {{`{{ $labels.exported_namespace }}`}} 55 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/honeybadger/alerting-rules/konfigure-operator.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - konfigure-operator.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'kube_deployment_status_replicas_unavailable{installation="exampleinstallation",cluster_id="examplecluster",cluster_type="management_cluster",namespace="giantswarm",deployment="konfigure-operator"}' 9 | values: '_x5 0x10 1x45 0x60' 10 | alert_rule_test: 11 | - alertname: KonfigureOperatorDeploymentNotSatisfied 12 | eval_time: 46m 13 | exp_alerts: 14 | - exp_labels: 15 | alertname: "KonfigureOperatorDeploymentNotSatisfied" 16 | area: "platform" 17 | cancel_if_outside_working_hours: "true" 18 | cluster_id: "examplecluster" 19 | cluster_type: "management_cluster" 20 | deployment: "konfigure-operator" 21 | installation: "exampleinstallation" 22 | namespace: "giantswarm" 23 | severity: "page" 24 | team: "honeybadger" 25 | topic: "managementcluster" 26 | exp_annotations: 27 | description: "Konfigure Operator deployment giantswarm/konfigure-operator is not satisfied." 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION=exampleinstallation&CLUSTER=examplecluster&NAMESPACE=giantswarm&KIND=deployment&NAME=konfigure-operator 29 | - alertname: KonfigureOperatorDeploymentNotSatisfied 30 | eval_time: 100m 31 | exp_alerts: [] 32 | - interval: 1m 33 | input_series: 34 | - series: 'konfigure_operator_reconcile_condition{condition_type="Ready", condition_status="False", resource_kind="ManagementClusterConfiguration", resource_name="test", resource_namespace="giantswarm", installation="example"}' 35 | values: '0x30 1x5 0x20 1x15' 36 | alert_rule_test: 37 | - alertname: KonfigurationReconciliationFailed 38 | eval_time: 35m 39 | exp_alerts: [] 40 | - alertname: KonfigurationReconciliationFailed 41 | eval_time: 70m 42 | exp_alerts: 43 | - exp_labels: 44 | alertname: "KonfigurationReconciliationFailed" 45 | area: "platform" 46 | cancel_if_outside_working_hours: "true" 47 | condition_status: "False" 48 | condition_type: "Ready" 49 | installation: "example" 50 | resource_kind: "ManagementClusterConfiguration" 51 | resource_name: "test" 52 | resource_namespace: "giantswarm" 53 | severity: "page" 54 | team: "honeybadger" 55 | topic: "releng" 56 | exp_annotations: 57 | description: "ManagementClusterConfiguration test in ns giantswarm on example is stuck in Failed state." 58 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/konfigure-operator/ 59 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-crossplane.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: cluster-crossplane.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: cluster-crossplane 12 | rules: 13 | - alert: ClusterCrossplaneResourcesNotReady 14 | annotations: 15 | # Crossplane doesn't offer object names and the objects are stored on the MC, so right 16 | # now (2025-01), we can't make this alert WC-specific. 17 | description: '{{`Not all managed Crossplane resources of type "{{ $labels.gvk }}" on {{ $labels.cluster_id }} are ready. This could affect creation or health of workload clusters.`}}' 18 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/cluster-crossplane-resources/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 19 | # Match critical resources deployed by cluster-aws via aws-nth-crossplane-resources, 20 | # cilium-crossplane-resources, crossplane-fn-irsa, ... 21 | expr: | 22 | ( 23 | crossplane_managed_resource_exists{gvk=~"(iam.aws.upbound.io/.*, Kind=(Role.*|Policy)|sqs.aws.upbound.io/.*, Kind=Queue|sqs.aws.upbound.io/.*, Kind=QueuePolicy|cloudwatchevents.aws.upbound.io/.*, Kind=Rule|cloudwatchevents.aws.upbound.io/.*, Kind=Target|ec2.aws.upbound.io/.*, Kind=SecurityGroup|acm.aws.upbound.io/.*, Kind=Certificate|cloudfront.aws.upbound.io/.*, Kind=.+|iam.aws.upbound.io/.*, Kind=OpenIDConnectProvider|route53.aws.upbound.io/.*, Kind=Record|s3.aws.upbound.io/.*, Kind=Bucket.*)"} != crossplane_managed_resource_ready{gvk=~"(iam.aws.upbound.io/.*, Kind=(Role.*|Policy)|sqs.aws.upbound.io/.*, Kind=Queue|sqs.aws.upbound.io/.*, Kind=QueuePolicy|cloudwatchevents.aws.upbound.io/.*, Kind=Rule|cloudwatchevents.aws.upbound.io/.*, Kind=Target|ec2.aws.upbound.io/.*, Kind=SecurityGroup|acm.aws.upbound.io/.*, Kind=Certificate|cloudfront.aws.upbound.io/.*, Kind=.+|iam.aws.upbound.io/.*, Kind=OpenIDConnectProvider|route53.aws.upbound.io/.*, Kind=Record|s3.aws.upbound.io/.*, Kind=Bucket.*)"} 24 | ) OR 25 | iam_aws_upbound_role_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 OR 26 | sqs_aws_upbound_queue_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 OR 27 | sqs_aws_upbound_queuepolicy_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 OR 28 | cloudwatchevents_aws_upbound_rule_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 OR 29 | cloudwatchevents_aws_upbound_target_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 OR 30 | ec2_aws_upbound_securitygroup_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 31 | for: 15m 32 | labels: 33 | area: kaas 34 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 35 | severity: page 36 | team: phoenix 37 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/fluentbit.rules.yml: -------------------------------------------------------------------------------- 1 | {{- if not (or (eq .Values.managementCluster.provider.kind "vsphere") (eq .Values.managementCluster.provider.kind "cloud-director")) }} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . 
| nindent 4 }} 8 | name: fluentbit.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: fluentbit 13 | rules: 14 | - alert: FluentbitDropRatio 15 | annotations: 16 | description: '{{`Fluentbit ({{ $labels.instance }}) is dropping more than 1% records.`}}' 17 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/fluentbit/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 18 | __dashboardUid__: fluentbit 19 | dashboardQueryParams: "orgId=2" 20 | # Check the ratio of dropped records over the total number of records. 21 | # We only monitor this app on the management cluster so we don't get alerts if the customer misconfigures theirs. 22 | expr: |- 23 | rate( 24 | fluentbit_output_dropped_records_total{cluster_type="management_cluster"}[10m]) 25 | / ( 26 | rate(fluentbit_output_proc_records_total{cluster_type="management_cluster"}[10m]) 27 | + rate(fluentbit_output_dropped_records_total{cluster_type="management_cluster"}[10m]) 28 | ) 29 | > 0.01 30 | for: 20m 31 | labels: 32 | area: platform 33 | cancel_if_outside_working_hours: "true" 34 | severity: page 35 | team: atlas 36 | topic: observability 37 | - alert: FluentbitDown 38 | annotations: 39 | description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}' 40 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/fluentbit/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 41 | __dashboardUid__: fluentbit 42 | dashboardQueryParams: "orgId=2" 43 | expr: sum(up{job="fluent-logshipping-app"}) by (job, cluster_id, installation, provider, pipeline, namespace, node) == 0 44 | for: 15m 45 | labels: 46 | area: platform 47 | cancel_if_outside_working_hours: "true" 48 | severity: page 49 | team: atlas 50 | topic: observability 51 | - alert: FluentbitDaemonSetNotSatisfied 52 | annotations: 53 | description: '{{`Daemonset {{ $labels.namespace}}/{{ $labels.daemonset }} is not satisfied.`}}' 54 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/daemonset-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 55 | __dashboardUid__: fluentbit 56 | dashboardQueryParams: "orgId=2" 57 | expr: kube_daemonset_status_number_unavailable{daemonset="fluent-logshipping-app"} > 0 58 | for: 1h 59 | labels: 60 | area: platform 61 | cancel_if_outside_working_hours: "true" 62 | severity: page 63 | team: atlas 64 | topic: observability 65 | {{- end }} 66 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: falco.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: falco 12 | rules: 13 | - alert: FalcoCriticalAlertFiring 14 | annotations: 15 | description: |- 16 | {{`{{ if eq $labels.k8s_pod_name "" }}The Falco rule {{ $labels.rule }} was triggered on the node {{ $labels.hostname }}. 
17 | {{else}}Pod {{ $labels.k8s_ns_name }}/{{ $labels.k8s_pod_name }} triggered the Falco rule {{ $labels.rule }} on the node {{ $labels.hostname }}.{{ end }}`}} 18 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/falco-alert/ 19 | expr: increase(falco_events{priority=~"0|1|2|3"}[10m] ) > 0 20 | labels: 21 | area: platform 22 | cancel_if_outside_working_hours: "true" 23 | severity: notify 24 | team: shield 25 | topic: security 26 | - alert: FalcoMediumAlertFiring 27 | annotations: 28 | description: |- 29 | {{`{{ if eq $labels.k8s_pod_name "" }}The Falco rule {{ $labels.rule }} was triggered on the node {{ $labels.hostname }}. 30 | {{else}}Pod {{ $labels.k8s_ns_name }}/{{ $labels.k8s_pod_name }} triggered the Falco rule {{ $labels.rule }} on the node {{ $labels.hostname }}.{{ end }}`}} 31 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/falco-alert/ 32 | expr: increase(falco_events{priority=~"4|5"}[10m] ) > 0 33 | labels: 34 | area: platform 35 | severity: notify 36 | team: shield 37 | topic: security 38 | - alert: FalcoInformationalAlert 39 | annotations: 40 | description: |- 41 | {{`{{ if eq $labels.k8s_pod_name "" }}The Falco rule {{ $labels.rule }} was triggered on the node {{ $labels.hostname }}. 42 | {{else}}Pod {{ $labels.k8s_ns_name }}/{{ $labels.k8s_pod_name }} triggered the Falco rule {{ $labels.rule }} on the node {{ $labels.hostname }}.{{ end }}`}} 43 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/falco-alert/ 44 | expr: increase(falco_events{priority="6"}[10m] ) > 0 45 | labels: 46 | area: platform 47 | severity: notify 48 | team: shield 49 | topic: security 50 | - alert: FalcoXZBackdoorAlert 51 | annotations: 52 | description: |- 53 | {{`{{ if eq $labels.k8s_pod_name "" }}The Falco rule {{ $labels.rule }} was triggered on the node {{ $labels.hostname }}. 
{{else}}Pod {{ $labels.k8s_ns_name }}/{{ $labels.k8s_pod_name }} triggered the Falco rule {{ $labels.rule }} on the node {{ $labels.hostname }}.{{ end }}`}} 55 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/falco-alert/ 56 | expr: falco_events{rule="Backdoored library loaded into SSHD (CVE-2024-3094)"} > 0 57 | labels: 58 | area: platform 59 | severity: notify 60 | team: shield 61 | topic: security 62 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - logging-pipeline.rules.yml 4 | 5 | tests: 6 | # Test LogForwardingErrors 7 | - interval: 1m 8 | input_series: 9 | # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 ones that are less than 10% of the total, 500 requests that represent more than 10% of the total, only 500 ones 10 | - series: 'loki_write_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="capa", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' 11 | values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" 12 | - series: 'loki_write_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="capa", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' 13 | values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" 14 | alert_rule_test: 15 | - alertname: LogForwardingErrors 16 | eval_time: 30m 17 | - alertname: LogForwardingErrors 18 | eval_time: 90m 19 | - alertname: LogForwardingErrors 20 | eval_time: 150m 21 | - alertname: LogForwardingErrors 22 | eval_time: 210m 23 | - alertname: LogForwardingErrors 24 | eval_time: 270m 25 | exp_alerts: 26 | - exp_labels: 27 | area: platform 28 | cancel_if_outside_working_hours: "true" 29 | cluster_id: gauss 30 | installation: gauss 31 | pod: alloy-2j7z7 32 | provider: capa 33 | pipeline: testing 34 | severity: page 35 | team: atlas 36 | topic: observability 37 | exp_annotations: 38 | __dashboardUid__: 53c1ecddc3a1d5d4b8d6cd0c23676c31 39 | dashboardQueryParams: orgId=2 40 | description: "14.29% of the requests to Loki are failing for pod alloy-2j7z7 (threshold 10%)" 41 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/logging-pipeline/?INSTALLATION=gauss&CLUSTER=gauss 42 | - alertname: LogForwardingErrors 43 | eval_time: 330m 44 | exp_alerts: 45 | - exp_labels: 46 | area: platform 47 | cancel_if_outside_working_hours: "true" 48 | cluster_id: gauss 49 | installation: gauss 50 | pod: alloy-2j7z7 51 | provider: capa 52 | pipeline: testing 53 | severity: page 54 | team: atlas 55 | topic: observability 56 | exp_annotations: 57 | __dashboardUid__: 53c1ecddc3a1d5d4b8d6cd0c23676c31 58 | dashboardQueryParams: orgId=2 59 | description: "100.00% of the requests to Loki are failing for pod alloy-2j7z7 (threshold 10%)" 60 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/logging-pipeline/?INSTALLATION=gauss&CLUSTER=gauss 61 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/atlas/alerting-rules/statefulset.rules.test.yml:
-------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - statefulset.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'kube_statefulset_status_replicas{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", customer="giantswarm", installation="gauss", namespace="loki", organization="giantswarm", pipeline="testing", region="westeurope", statefulset="loki-write"}' 9 | values: "3+0x5760" # 5760 = 4 days 10 | - series: 'kube_statefulset_status_replicas_ready{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", customer="giantswarm", installation="gauss", namespace="loki", organization="giantswarm", pipeline="testing", region="westeurope", statefulset="loki-write"}' 11 | values: "3+0x60 2+0x4440 3+0x60" # 4440 = 3 days + 2h 12 | alert_rule_test: 13 | - alertname: StatefulsetNotSatisfiedAtlas 14 | eval_time: 60m 15 | - alertname: StatefulsetNotSatisfiedAtlas 16 | eval_time: 4380m # 3 days + 1h 17 | - alertname: StatefulsetNotSatisfiedAtlas 18 | eval_time: 4382m 19 | exp_alerts: 20 | - exp_labels: 21 | app: kube-state-metrics 22 | area: platform 23 | cancel_if_outside_working_hours: "false" 24 | cluster_id: "gauss" 25 | cluster_type: management_cluster 26 | customer: giantswarm 27 | installation: "gauss" 28 | namespace: loki 29 | organization: giantswarm 30 | pipeline: "testing" 31 | region: westeurope 32 | severity: page 33 | statefulset: loki-write 34 | team: atlas 35 | topic: managementcluster 36 | exp_annotations: 37 | description: "Statefulset loki/loki-write is not satisfied." 38 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION=gauss&CLUSTER=gauss&NAMESPACE=loki&KIND=statefulset&NAME=loki-write 39 | - alertname: StatefulsetNotSatisfiedAtlas 40 | eval_time: 4500m # 3 days + 3h 41 | exp_alerts: 42 | - exp_labels: 43 | app: kube-state-metrics 44 | area: platform 45 | cancel_if_outside_working_hours: "false" 46 | cluster_id: "gauss" 47 | cluster_type: management_cluster 48 | customer: giantswarm 49 | installation: "gauss" 50 | namespace: loki 51 | organization: giantswarm 52 | pipeline: "testing" 53 | region: westeurope 54 | severity: page 55 | statefulset: loki-write 56 | team: atlas 57 | topic: managementcluster 58 | exp_annotations: 59 | description: "Statefulset loki/loki-write is not satisfied." 60 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION=gauss&CLUSTER=gauss&NAMESPACE=loki&KIND=statefulset&NAME=loki-write 61 | - alertname: StatefulsetNotSatisfiedAtlas 62 | eval_time: 4502m 63 | --------------------------------------------------------------------------------
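For orientation, each *.rules.test.yml above is a standard Prometheus unit-test file: it lists the rule file it exercises under rule_files, feeds synthetic samples via input_series, and asserts the exact label and annotation set of the alerts expected to fire at each eval_time. A minimal sketch of such a rule/test pair, runnable with promtool, is shown below; the file names, job label, and threshold are illustrative only and do not come from this repository.

# example.rules.yml (hypothetical rule file)
groups:
  - name: example
    rules:
      - alert: ExampleTargetDown
        expr: up{job="example"} == 0
        for: 5m
        labels:
          severity: notify
        annotations:
          description: "Target for job example is down."

# example.rules.test.yml (hypothetical test file)
rule_files:
  - example.rules.yml
tests:
  - interval: 1m
    input_series:
      # The target is down for the whole window.
      - series: 'up{job="example"}'
        values: "0+0x10"
    alert_rule_test:
      - alertname: ExampleTargetDown
        eval_time: 6m  # past the 5m "for" duration, so the alert must be firing
        exp_alerts:
          - exp_labels:
              severity: notify
              job: example
            exp_annotations:
              description: "Target for job example is down."

Such a pair is executed with "promtool test rules example.rules.test.yml" from the directory containing both files; promtool adds the alertname label to the expected alerts itself, so exp_labels only needs the series and rule labels.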