├── .nancy-ignore ├── test ├── hack │ ├── .gitignore │ └── bin │ │ ├── .gitignore │ │ ├── get-inhibition.sh │ │ ├── template-chart.sh │ │ └── run-pint.sh ├── conf │ ├── providers │ ├── pint │ │ ├── pint-config.hcl │ │ └── pint-all.hcl │ └── promtool_ignore └── tests │ ├── .gitignore │ └── providers │ ├── global │ ├── platform │ │ ├── honeybadger │ │ │ └── alerting-rules │ │ │ │ ├── helm-operations.rules.test.yml │ │ │ │ ├── flux.rules.test.yml │ │ │ │ ├── zot.rules.test.yml │ │ │ │ ├── crsync.rules.test.yml │ │ │ │ └── konfigure-operator.rules.test.yml │ │ ├── shield │ │ │ └── alerting-rules │ │ │ │ ├── general.rules.test.yml │ │ │ │ └── cert-manager.rules.test.yml │ │ └── atlas │ │ │ └── alerting-rules │ │ │ ├── silence-operator.rules.test.yml │ │ │ ├── flux-atlas.rules.test.yml │ │ │ ├── sloth.rules.test.yml │ │ │ ├── logging-pipeline.rules.test.yml │ │ │ └── statefulset.rules.test.yml │ └── kaas │ │ └── tenet │ │ └── alerting-rules │ │ ├── capi-machineset.rules.test.yml │ │ ├── certificate.management-cluster.rules.test.yml │ │ ├── pods.rules.test.yml │ │ ├── capi-machinepool.rules.test.yml │ │ ├── capi-machinedeployment.rules.test.yml │ │ └── capi-machine.rules.test.yml │ ├── capa │ └── kaas │ │ └── phoenix │ │ └── alerting-rules │ │ ├── cluster-crossplane.rules.test.yml │ │ └── capa.inhibition.rules.test.yml │ └── capz │ └── kaas │ └── phoenix │ └── alerting-rules │ └── dns-operator-azure.rules.test.yml ├── loki ├── .gitignore ├── mixin.libsonnet └── update.sh ├── mimir ├── .gitignore ├── update.sh └── mixin.libsonnet ├── .abs └── main.yaml ├── assets └── inhibition-graph.png ├── .nancy-ignore.generated ├── SECURITY.md ├── .cursor └── rules │ └── alert-editing.mdc ├── helm └── prometheus-rules │ ├── values.yaml │ ├── Chart.yaml │ ├── templates │ ├── platform │ │ ├── honeybadger │ │ │ ├── recording-rules │ │ │ │ └── helm-operations.rules.yml │ │ │ └── alerting-rules │ │ │ │ ├── secret.rules.yml │ │ │ │ ├── helm.rules.yml │ │ │ │ ├── external-secrets.rules.yml │ │ │ │ ├── crossplane.rules.yml │ │ │ │ ├── chart.rules.yml │ │ │ │ ├── konfigure-operator.rules.yml │ │ │ │ └── zot.rules.yml │ │ ├── atlas │ │ │ ├── alerting-rules │ │ │ │ ├── inhibit.oncall.rules.yml │ │ │ │ ├── teleport.logs.yml │ │ │ │ ├── mimir.logs.yml │ │ │ │ ├── statefulset.rules.yml │ │ │ │ ├── storage.rules.yml │ │ │ │ ├── flux-atlas.rules.yml │ │ │ │ ├── app-configuration.rules.yml │ │ │ │ ├── sloth.rules.yml │ │ │ │ ├── logging-pipeline.rules.yml │ │ │ │ ├── silence-operator.rules.yml │ │ │ │ ├── keda.rules.yml │ │ │ │ ├── tracing-pipeline.rules.yml │ │ │ │ └── fluentbit.rules.yml │ │ │ └── recording-rules │ │ │ │ └── monitoring.resource-usage-estimation.rules.yml │ │ ├── shield │ │ │ └── alerting-rules │ │ │ │ ├── general.rules.yml │ │ │ │ ├── dex.logs.yml │ │ │ │ ├── cert-manager.rules.yml │ │ │ │ ├── dex.rules.yml │ │ │ │ └── falco.rules.yml │ │ └── cabbage │ │ │ ├── alerting-rules │ │ │ ├── dns.rules.yml │ │ │ ├── kong.rules.yml │ │ │ └── external-dns.rules.yml │ │ │ └── recording-rules │ │ │ └── gs-managed-app-deployment-status.rules.yml │ ├── kaas │ │ ├── tenet │ │ │ └── alerting-rules │ │ │ │ ├── inhibit.kubelet.rules.yml │ │ │ │ ├── net-exporter.rules.yml │ │ │ │ ├── job.rules.yml │ │ │ │ ├── timesync.rules.yml │ │ │ │ ├── inhibit.nodes.rules.yml │ │ │ │ ├── inhibit.capi.rules.yml │ │ │ │ ├── node.memory-pressure.rules.yml │ │ │ │ ├── capi-kubeadmconfig.rules.yml │ │ │ │ ├── capi-machineset.rules.yml │ │ │ │ ├── cluster-autoscaler.rules.yml │ │ │ │ ├── systemd.rules.yml │ │ │ │ ├── fairness.rules.yml │ │ │ │ 
├── pods.core.rules.yml │ │ │ │ ├── certificate.workload-cluster.rules.yml │ │ │ │ ├── pods.rules.yml │ │ │ │ ├── node-exporter.rules.yml │ │ │ │ ├── capi-machine.rules.yml │ │ │ │ ├── capi-machinepool.rules.yml │ │ │ │ ├── capi-machinedeployment.rules.yml │ │ │ │ ├── etcdbackup.rules.yml │ │ │ │ ├── apiserver.management-cluster.rules.yml │ │ │ │ ├── certificate.management-cluster.rules.yml │ │ │ │ ├── capi-kubeadmcontrolplane.rules.yml │ │ │ │ └── vertical-pod-autoscaler.rules.yml │ │ ├── phoenix │ │ │ └── alerting-rules │ │ │ │ ├── irsa.rules.yml │ │ │ │ ├── aws.node.workload-cluster.rules.yml │ │ │ │ ├── nodes.cidrnotavailable.events.logs.yml │ │ │ │ ├── capa.inhibition.rules.yml │ │ │ │ ├── aws-load-balancer-controller.rules.yml │ │ │ │ ├── dns-operator-azure.rules.yml │ │ │ │ ├── aws.workload-cluster.rules.yml │ │ │ │ ├── cloud-provider-controller.rules.yml │ │ │ │ └── cluster-crossplane.rules.yml │ │ └── rocket │ │ │ └── alerting-rules │ │ │ └── blackbox-exporter.cloud-provider-api.rules.yml │ └── _helpers.tpl │ └── values.schema.json ├── .github ├── workflows │ ├── zz_generated.gitleaks.yaml │ ├── zz_generated.validate_changelog.yaml │ ├── zz_generated.check_values_schema.yaml │ ├── zz_generated.run_ossf_scorecard.yaml │ ├── zz_generated.create_release_pr.yaml │ ├── alert_tests.yaml │ ├── update-tempo-mixins.yml │ ├── update-loki-mixins.yml │ ├── update-mimir-mixins.yml │ └── zz_generated.add-team-labels.yaml └── pull_request_template.md ├── CODEOWNERS ├── Makefile ├── scripts ├── find-alerts.sh └── sync-kube-mixin.sh ├── DCO ├── Makefile.gen.app.mk ├── Makefile.custom.mk ├── renovate.json5 └── .circleci └── config.yml /.nancy-ignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /test/hack/.gitignore: -------------------------------------------------------------------------------- 1 | output 2 | checkLabels/alertmanager.yaml 3 | -------------------------------------------------------------------------------- /loki/.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | dashboards_out/ 3 | jsonnetfile.* 4 | 5 | -------------------------------------------------------------------------------- /mimir/.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | dashboards_out/ 3 | jsonnetfile.* 4 | 5 | -------------------------------------------------------------------------------- /.abs/main.yaml: -------------------------------------------------------------------------------- 1 | replace-chart-version-with-git: true 2 | generate-metadata: true 3 | -------------------------------------------------------------------------------- /test/conf/providers: -------------------------------------------------------------------------------- 1 | capa 2 | capz 3 | cloud-director 4 | proxmox 5 | vsphere 6 | -------------------------------------------------------------------------------- /assets/inhibition-graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/giantswarm/prometheus-rules/HEAD/assets/inhibition-graph.png -------------------------------------------------------------------------------- /test/tests/.gitignore: -------------------------------------------------------------------------------- 1 | **/*.rules.yml 2 | **/*.rules.yaml 3 | **/*.rules.test.yml_global 4 | **/*.rules.test.yaml_global 5 | 
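Note on the test layout above: the test/tests/providers tree mirrors helm/prometheus-rules/templates, and every *.rules.test.yml is a promtool unit test whose rule_files entry points at the rendered rule file of the same base name — which is why generated *.rules.yml files are git-ignored inside test/tests. The repository's own runner is test/hack/bin/verify-rules.sh (not shown in this dump); the sketch below only illustrates the idea by hand, and the rendered-output paths and the test/hack/bin/promtool location are assumptions inferred from template-chart.sh and the test/hack/bin/.gitignore, not the real runner.

#!/bin/bash
# Hand-run sketch of a single promtool unit test. Paths under test/hack/output/
# and the promtool binary location are assumptions, not the actual runner logic.
set -euo pipefail

GIT_WORKDIR="$(git rev-parse --show-toplevel)"
provider="capa"

# 1. Render the chart for one provider, using the same flags as template-chart.sh.
helm template "$GIT_WORKDIR/helm/prometheus-rules" \
  --set="managementCluster.provider.flavor=capi" \
  --set="managementCluster.provider.kind=$provider" \
  --set="managementCluster.name=myinstall" \
  --set="managementCluster.pipeline=stable" \
  --output-dir "$GIT_WORKDIR/test/hack/output/helm-chart/$provider"

# 2. The rendered file is a PrometheusRule CR; promtool expects plain rule groups,
#    so extract .spec and place it next to the unit test (promtool resolves
#    rule_files relative to the test file -- hence the *.rules.yml gitignore above).
rendered="$GIT_WORKDIR/test/hack/output/helm-chart/$provider/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-machineset.rules.yml"
tests_dir="$GIT_WORKDIR/test/tests/providers/global/kaas/tenet/alerting-rules"
"$GIT_WORKDIR/test/hack/bin/yq" '.spec' "$rendered" > "$tests_dir/capi-machineset.rules.yml"

# 3. Evaluate the test cases (input_series, alert_rule_test) against the rules.
"$GIT_WORKDIR/test/hack/bin/promtool" test rules "$tests_dir/capi-machineset.rules.test.yml"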
-------------------------------------------------------------------------------- /.nancy-ignore.generated: -------------------------------------------------------------------------------- 1 | # This file is generated by https://github.com/giantswarm/github 2 | # Repository specific ignores should be added to .nancy-ignore 3 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security Policy 2 | 3 | ## Reporting a Vulnerability 4 | 5 | Please visit for information on reporting security issues. 6 | -------------------------------------------------------------------------------- /test/hack/bin/.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore everything in this directory 2 | * 3 | # Except the following 4 | !.gitignore 5 | !verify-rules.sh 6 | !fetch-tools.sh 7 | !template-chart.sh 8 | !get-inhibition.sh 9 | !check-runbooks.sh 10 | -------------------------------------------------------------------------------- /.cursor/rules/alert-editing.mdc: -------------------------------------------------------------------------------- 1 | --- 2 | globs: **/*.rules.yml 3 | alwaysApply: false 4 | --- 5 | # Rules for editing alerting rules 6 | 7 | - **Update tests:** 8 | - When modifying an alerting rule, check if there are any tests for the rule. This is best done by grepping for the alert name. 9 | - If a runbook URL is changed, the according test must be updated. 10 | -------------------------------------------------------------------------------- /helm/prometheus-rules/values.yaml: -------------------------------------------------------------------------------- 1 | name: prometheus-rules 2 | namespace: monitoring 3 | serviceType: managed 4 | managementCluster: 5 | customer: "" 6 | name: "" 7 | pipeline: "" 8 | provider: 9 | kind: "" 10 | flavor: "" 11 | region: "" 12 | 13 | Installation: 14 | V1: 15 | Guest: 16 | Kubernetes: 17 | IngressController: 18 | BaseDomain: "" 19 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.gitleaks.yaml: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. 
Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/ad0a25fbf301b2513e169ec964a8785d28f75be4/pkg/gen/input/workflows/internal/file/gitleaks.yaml.template 6 | # 7 | name: gitleaks 8 | 9 | on: 10 | - pull_request 11 | 12 | jobs: 13 | publish: 14 | uses: giantswarm/github-workflows/.github/workflows/gitleaks.yaml@main 15 | -------------------------------------------------------------------------------- /helm/prometheus-rules/Chart.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v2 2 | description: A Helm chart for Giant Swarm prometheus alerting and recording rules 3 | home: https://github.com/giantswarm/prometheus-rules 4 | icon: https://s.giantswarm.io/app-icons/1/png/default-app-light.png 5 | name: prometheus-rules 6 | appVersion: "0.1.0" 7 | version: "4.89.2" 8 | annotations: 9 | application.giantswarm.io/team: "atlas" 10 | config.giantswarm.io/version: 1.x.x 11 | -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @giantswarm/team-atlas 2 | /helm/prometheus-rules/templates/kaas/phoenix/ @giantswarm/team-phoenix 3 | /helm/prometheus-rules/templates/kaas/rocket/ @giantswarm/team-rocket 4 | /helm/prometheus-rules/templates/kaas/turtles/ @giantswarm/team-tenet 5 | /helm/prometheus-rules/templates/platform/atlas/ @giantswarm/team-atlas 6 | /helm/prometheus-rules/templates/platform/cabbage/ @giantswarm/team-cabbage 7 | /helm/prometheus-rules/templates/platform/honeybadger/ @giantswarm/team-honeybadger 8 | /helm/prometheus-rules/templates/platform/shield/ @giantswarm/team-shield 9 | 10 | # No owners for changelog 11 | /CHANGELOG.md 12 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.validate_changelog.yaml: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. 
Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/ad0a25fbf301b2513e169ec964a8785d28f75be4/pkg/gen/input/workflows/internal/file/validate_changelog.yaml.template 6 | # 7 | name: Validate changelog 8 | 9 | on: 10 | pull_request: 11 | types: [opened, synchronize, reopened] 12 | paths: 13 | - 'CHANGELOG.md' 14 | 15 | permissions: 16 | contents: read 17 | pull-requests: write 18 | 19 | jobs: 20 | validate-changelog: 21 | uses: giantswarm/github-workflows/.github/workflows/validate-changelog.yaml@main 22 | -------------------------------------------------------------------------------- /loki/mixin.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'loki-mixin/mixin-ssd.libsonnet') + { 2 | _config+:: { 3 | tags: [ 4 | 'owner:team-atlas', 5 | 'topic:observability', 6 | 'component:loki', 7 | ], 8 | 9 | per_node_label: 'node', 10 | per_cluster_label: 'cluster_id', 11 | 12 | blooms: { 13 | enabled: false, 14 | }, 15 | 16 | canary+: { 17 | enabled: true, 18 | }, 19 | 20 | operational: { 21 | memcached: false, 22 | consul: false, 23 | bigTable: false, 24 | dynamo: false, 25 | gcs: false, 26 | s3: true, 27 | azureBlob: true, 28 | boltDB: false, 29 | }, 30 | }, 31 | } 32 | -------------------------------------------------------------------------------- /test/hack/bin/get-inhibition.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | GIT_WORKDIR="$(git rev-parse --show-toplevel)" 4 | REPO_VERSION="$(curl -s https://api.github.com/repos/giantswarm/observability-operator/releases/latest | ./test/hack/bin/yq -r .name)" 5 | 6 | curl -s https://raw.githubusercontent.com/giantswarm/observability-operator/"$REPO_VERSION"/helm/observability-operator/files/alertmanager/alertmanager.yaml.helm-template > "$GIT_WORKDIR"/test/hack/checkLabels/alertmanager.yaml 7 | 8 | # Delete all lines from the beginning of the file up to the inhibit_rules section to avoid issues with Go templates 9 | sed -i '/global:/,/inhibit_rules:/{/inhibit_rules/b a;/^.*/d; :a}' "$GIT_WORKDIR"/test/hack/checkLabels/alertmanager.yaml 10 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/recording-rules/helm-operations.rules.yml: -------------------------------------------------------------------------------- 1 | # TODO(@giantswarm/team-honeybadger): This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: helm-operations.recording.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: helm-operations.recording 12 | rules: 13 | - expr: "sum by (cluster_id, installation, pipeline, provider, release, event) (helmclient_library_event_total{release!=''})" 14 | record: monitoring:helm:number_of_operations_on_release 15 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.check_values_schema.yaml: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. 
Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/ad0a25fbf301b2513e169ec964a8785d28f75be4/pkg/gen/input/workflows/internal/file/check_values_schema.yaml.template 6 | # 7 | 8 | name: 'Values and schema' 9 | 10 | on: 11 | pull_request: 12 | branches: 13 | - master 14 | - main 15 | paths: 16 | - 'helm/**/values.yaml' # default helm chart values 17 | - 'helm/**/values.schema.json' # schema 18 | - 'helm/**/ci/ci-values.yaml' # overrides for CI (can contain required entries) 19 | 20 | jobs: 21 | check: 22 | uses: giantswarm/github-workflows/.github/workflows/chart-values.yaml@main 23 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/inhibit.oncall.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: inhibit.oncall.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: inhibit.oncall 12 | rules: 13 | - alert: InhibitionOutsideWorkingHours 14 | annotations: 15 | description: '{{`Fires outside working hours.`}}' 16 | expr: (hour() <= 7 or hour() >= 16) or (day_of_week() > 5 or day_of_week() < 1) 17 | labels: 18 | area: platform 19 | outside_working_hours: "true" 20 | team: atlas 21 | topic: monitoring 22 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/inhibit.kubelet.rules.yml: -------------------------------------------------------------------------------- 1 | # This rule applies to all clusters 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . 
| nindent 4 }} 8 | name: inhibit.kubelet.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: inhibit.kubelet 13 | rules: 14 | - alert: InhibitionKubeletDown 15 | annotations: 16 | description: '{{`Kubelet ({{ $labels.instance }}) is down.`}}' 17 | expr: label_replace(up{app="kubelet"}, "ip", "$1", "instance", "(.+):\\d+") == 0 18 | labels: 19 | kubelet_down: "true" 20 | area: kaas 21 | topic: kubernetes 22 | team: tenet 23 | -------------------------------------------------------------------------------- /test/hack/bin/template-chart.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -euo pipefail 3 | 4 | main() { 5 | local GIT_WORKDIR 6 | GIT_WORKDIR="$(git rev-parse --show-toplevel)" 7 | 8 | local -a providers 9 | mapfile -t providers <"$GIT_WORKDIR/test/conf/providers" 10 | 11 | rm -rf "$GIT_WORKDIR"/test/hack/output/helm-chart/ 12 | 13 | for provider in "${providers[@]}"; do 14 | echo "Templating chart for provider: $provider" 15 | 16 | helm template \ 17 | "$GIT_WORKDIR"/helm/prometheus-rules \ 18 | --set="managementCluster.provider.flavor=capi" \ 19 | --set="managementCluster.provider.kind=$provider" \ 20 | --set="managementCluster.name=myinstall" \ 21 | --set="managementCluster.pipeline=stable" \ 22 | --output-dir "$GIT_WORKDIR"/test/hack/output/helm-chart/"$provider" 23 | done 24 | } 25 | 26 | main "$@" 27 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/net-exporter.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: net-exporter.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: net-exporter 12 | rules: 13 | - alert: ClusterNetExporterCPUUsageTooHigh 14 | annotations: 15 | description: '{{`net-exporter cpu usage is too high.`}}' 16 | expr: rate(container_cpu_user_seconds_total{pod=~"net-exporter-.*"}[5m]) > 0.015 17 | for: 5m 18 | labels: 19 | area: kaas 20 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 21 | severity: notify 22 | team: tenet 23 | topic: observability 24 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/job.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: job.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: job 12 | rules: 13 | - alert: ManagementClusterJobFailed 14 | annotations: 15 | description: '{{`Job {{ $labels.namespace }}/{{ $labels.job_name }} is failed.`}}' 16 | expr: kube_job_failed{cluster_type="management_cluster", condition="true"} == 1 17 | for: 15m 18 | labels: 19 | area: kaas 20 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 21 | severity: notify 22 | team: {{ include "providerTeam" . 
}} 23 | topic: managementcluster 24 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/secret.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: secret.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: secret 12 | rules: 13 | - alert: HelmHistorySecretCountTooHigh 14 | annotations: 15 | description: '{{`Helm release Secret count too high.`}}' 16 | expr: sum(kube_secret_info{namespace=~"giantswarm|kube-system|monitoring", secret=~"sh.helm.+"}) by (cluster_id, installation, pipeline, provider) > 1000 17 | for: 15m 18 | labels: 19 | area: platform 20 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 21 | severity: notify 22 | team: honeybadger 23 | topic: releng 24 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/honeybadger/alerting-rules/helm-operations.rules.test.yml: -------------------------------------------------------------------------------- 1 | # TODO(@giantswarm/team-honeybadger): This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 2 | --- 3 | rule_files: 4 | - helm-operations.rules.yml 5 | 6 | tests: 7 | - interval: 1m 8 | input_series: 9 | - series: 'helmclient_library_event_total{app="chart-operator", cluster_id="gauss", container="chart-operator", event="update_release_from_tarball", namespace="giantswarm", pod="chart-operator-5c7b6f8867-pr44n", release="cilium"}' 10 | values: "0+1x20" 11 | promql_expr_test: 12 | - expr: monitoring:helm:number_of_operations_on_release 13 | eval_time: 10m 14 | exp_samples: 15 | - labels: 'monitoring:helm:number_of_operations_on_release{cluster_id="gauss", event="update_release_from_tarball", release="cilium"}' 16 | value: 10 17 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/timesync.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: timesync.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: timesync 12 | rules: 13 | - alert: ClockOutOfSync 14 | annotations: 15 | description: '{{`Clock is out of sync on {{ $labels.instance }}.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/clock-out-of-sync/ 17 | expr: timestamp(node_time_seconds) - node_time_seconds > 60 18 | for: 30m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: page 23 | team: {{ include "providerTeam" . }} 24 | topic: infrastructure 25 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/irsa.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . 
| nindent 4 }} 6 | name: irsa.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: irsa-crossplane 11 | rules: 12 | - alert: IRSAClaimNotReady 13 | annotations: 14 | description: '{{`IRSAClaim {{ $labels.name }} in Cluster {{ $labels.installation }} is not ready.`}}' 15 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/irsaclaim-not-ready/ 16 | expr: irsaclaim_status_conditions{type="Ready", status="False"} > 0 17 | for: 30m 18 | labels: 19 | area: kaas 20 | cancel_if_kube_state_metrics_down: "true" 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: phoenix 24 | topic: aws 25 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: node.aws.workload-cluster.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: node.aws 12 | rules: 13 | - alert: WorkloadClusterNodeUnexpectedTaintNodeWithImpairedVolumes 14 | annotations: 15 | description: '{{`Node {{ $labels.node }} has unexpected taint NodeWithImpairedVolumes`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/aws-node-taint-nodewithimpairedvolumes/ 17 | expr: kube_node_spec_taint{key="NodeWithImpairedVolumes"} > 0 18 | for: 30m 19 | labels: 20 | area: kaas 21 | severity: notify 22 | team: phoenix 23 | topic: kubernetes 24 | 25 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/shield/alerting-rules/general.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: kyverno.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: general 12 | rules: 13 | - alert: ShieldComponentRestartingTooOften 14 | annotations: 15 | description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/shield-pod-failing/ 17 | expr: increase(kube_pod_container_status_restarts_total{cluster_type="workload_cluster", pod=~"trivy-.*|kyverno-.*|falco-*|"}[1h]) > 5 18 | for: 30m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: notify 23 | team: shield 24 | topic: security 25 | -------------------------------------------------------------------------------- /test/hack/bin/run-pint.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -euo pipefail 4 | 5 | ## Arguments: 6 | 7 | # 1. config file 8 | # 2. 
team filter (optional) 9 | 10 | 11 | main () { 12 | echo "Running Pint" 13 | 14 | local GIT_WORKDIR 15 | GIT_WORKDIR="$(git rev-parse --show-toplevel)" 16 | 17 | local -a PINT_FILES_LIST 18 | local -a PROVIDERS 19 | 20 | PINT_CONFIG="${1:-test/conf/pint/pint-config.hcl}" 21 | mapfile -t PROVIDERS <"$GIT_WORKDIR/test/conf/providers" 22 | 23 | if [[ "${2:-}" != "" ]]; then 24 | for provider in "${PROVIDERS[@]}"; do 25 | mapfile -t PINT_FILES_LIST < <(grep -lr "team:.*${PINT_TEAM_FILTER}" "test/hack/output/generated/$provider/" | grep -v ".test.yml") 26 | done 27 | else 28 | for provider in "${PROVIDERS[@]}"; do 29 | mapfile -t PINT_FILES_LIST < <(find test/hack/output/generated/$provider/ -name "*.rules.yml") 30 | done 31 | fi 32 | 33 | test/hack/bin/pint -c "$PINT_CONFIG" lint "${PINT_FILES_LIST[@]}" 34 | } 35 | 36 | main "$@" 37 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/helm.rules.yml: -------------------------------------------------------------------------------- 1 | # TODO(@giantswarm/team-honeybadger): This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . | nindent 4 }} 8 | name: helm.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: helm 13 | rules: 14 | - alert: RepeatedHelmOperation 15 | annotations: 16 | description: '{{`Helm release {{ $labels.release }} in cluster {{ $labels.cluster_id }} is being repeated {{ $labels.event }} for {{ $value | printf "%.1f" }} times.`}}' 17 | expr: increase(monitoring:helm:number_of_operations_on_release[15m]) > 5 18 | for: 5m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: notify 23 | team: honeybadger 24 | topic: releng 25 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/inhibit.nodes.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: inhibit.nodes.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: inhibit.nodes 12 | rules: 13 | - alert: InhibitionNodeNotReady 14 | annotations: 15 | description: '{{`Node {{ $labels.node }} is not ready.`}}' 16 | expr: kube_node_status_condition{condition="Ready", status!="true"} > 0 17 | labels: 18 | area: kaas 19 | node_not_ready: "true" 20 | team: tenet 21 | topic: kubernetes 22 | - alert: InhibitionNodeUnschedulable 23 | annotations: 24 | description: '{{`Node {{ $labels.node }} is unschedulable.`}}' 25 | expr: kube_node_spec_unschedulable > 0 26 | labels: 27 | area: kaas 28 | node_unschedulable: "true" 29 | team: tenet 30 | topic: kubernetes 31 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/inhibit.capi.rules.yml: -------------------------------------------------------------------------------- 1 | # This rule applies to all capi clusters 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . | nindent 4 }} 8 | name: inhibit.capi.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: inhibit.capi 13 | rules: 14 | - alert: InhibitionControlPlaneUnhealthy 15 | annotations: 16 | description: '{{`Control plane of cluster {{ $labels.cluster_id }} is not healthy.`}}' 17 | expr: |- 18 | capi_kubeadmcontrolplane_status_condition{type="ControlPlaneComponentsHealthy", status="False"} == 1 19 | or capi_kubeadmcontrolplane_status_condition{type="EtcdClusterHealthy", status="False"} == 1 20 | or capi_kubeadmcontrolplane_status_condition{type="Available", status="False"} == 1 21 | labels: 22 | area: kaas 23 | cluster_control_plane_unhealthy: "true" 24 | team: tenet 25 | topic: status 26 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/teleport.logs.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4}} 6 | name: teleport.audit.logs.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: teleport.audit.logs 11 | rules: 12 | - alert: TeleportAuditLogsMissing 13 | annotations: 14 | description: Teleport audit logs are missing from installation {{`{{ $labels.installation }}`}}. 15 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/teleport-audit-logs-missing/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 16 | expr: |- 17 | absent_over_time({scrape_job="teleport.giantswarm.io"} [7d]) > 0 18 | for: 5m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: atlas 24 | topic: observability 25 | -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | Before adding a new alerting rule to this repository, you should consider creating an SLO rule instead. 2 | SLOs help you both increase the quality of your monitoring and reduce alert noise. 
3 | 4 | * How to create a SLO rule: https://github.com/giantswarm/sloth-rules#how-to-create-a-slo 5 | * Documentation: https://intranet.giantswarm.io/docs/observability/slo-alerting/ 6 | 7 | --- 8 | Towards: https://github.com/giantswarm/... 9 | 10 | This PR ... 11 | 12 | ### Checklist 13 | 14 | - [ ] Update CHANGELOG.md 15 | - [ ] Add [Unit tests](https://github.com/giantswarm/prometheus-rules/#testing) 16 | - [ ] Follow [Alert structure](https://github.com/giantswarm/prometheus-rules/#how-alerts-are-structured) 17 | - [ ] Consider [creating a dashboard](https://docs.giantswarm.io/tutorials/observability/data-exploration/creating-custom-dashboards/) ([guidelines](https://intranet.giantswarm.io/docs/product/ux/guidelines/dashboards/)) (if it does not exist already) to help oncallers monitor the status of the issue. 18 | - [ ] Request review from oncall area, as well as team (e.g: `oncall-kaas-cloud` GitHub group). 19 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.logs.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4}} 6 | name: dex.logs.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: dex.logs 11 | rules: 12 | - alert: DexInvalidClientId 13 | annotations: 14 | description: '{{`Dex in {{ $labels.installation }} reports an invalid client ID.`}}' 15 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/dex-invalid-client-id/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 16 | expr: |- 17 | sum(rate({scrape_job="kubernetes-pods", pod=~"dex.*"} |= `Invalid client_id` | logfmt | err =~ `Invalid client_id \(\"[a-zA-Z0-9+]{20,}.*` [1h])) by (installation, cluster_id) > 0 18 | for: 5m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: shield 24 | topic: dex 25 | -------------------------------------------------------------------------------- /loki/update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Update Loki mixins from upstream 4 | # 5 | # This script is used to update the Loki mixins from the upstream repository. 6 | # 7 | # Usage: 8 | # ./loki/update.sh from the root of the repository 9 | 10 | set -e 11 | 12 | BRANCH="main" 13 | MIXIN_URL=https://github.com/grafana/loki/production/loki-mixin@$BRANCH 14 | OUTPUT_FILE="$(pwd)"/helm/prometheus-rules/templates/platform/atlas/recording-rules/loki-mixins.rules.yml 15 | 16 | cd loki 17 | rm -rf vendor jsonnetfile.* "$OUTPUT_FILE" 18 | 19 | jb init 20 | jb install $MIXIN_URL 21 | mixtool generate rules mixin.libsonnet -r "$OUTPUT_FILE" 22 | 23 | # Remove the initial `groups:` line 24 | sed -i '1d' "$OUTPUT_FILE" 25 | 26 | # Add the PrometheusRule metadata header 27 | sed -i '1i\ 28 | apiVersion: monitoring.coreos.com/v1\ 29 | kind: PrometheusRule\ 30 | metadata:\ 31 | labels:\ 32 | {{- include "labels.common" . 
| nindent 4 }}\ 33 | name: loki.recording.rules\ 34 | namespace: {{ .Values.namespace }}\ 35 | spec:\ 36 | groups:' "$OUTPUT_FILE" 37 | 38 | sed -i 's/cluster_id,/cluster_id, installation, pipeline, provider,/g' "$OUTPUT_FILE" 39 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/nodes.cidrnotavailable.events.logs.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4}} 6 | name: nodes.cidrnotavailable.events.logs.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: nodes.cidrnotavailable.events.logs 11 | rules: 12 | - alert: NodeCIDRNotAvailable 13 | annotations: 14 | description: Node(s) CIDR(s) are not available in the cluster {{`{{ $labels.cluster_id }}`}}. 15 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/cidr-not-available/ 16 | expr: |- 17 | sum ( 18 | count_over_time({scrape_job="kubernetes-events"} |= "CIDRNotAvailable" | logfmt [30m]) 19 | ) by (name, cluster_id, installation, pipeline, provider) > 0 20 | for: 5m 21 | labels: 22 | area: kaas 23 | cancel_if_outside_working_hours: "true" 24 | severity: page 25 | team: phoenix 26 | topic: nodes 27 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.run_ossf_scorecard.yaml: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/ad0a25fbf301b2513e169ec964a8785d28f75be4/pkg/gen/input/workflows/internal/file/run_ossf_scorecard.yaml.template 6 | # 7 | 8 | # This workflow uses actions that are not certified by GitHub. They are provided 9 | # by a third-party and are governed by separate terms of service, privacy 10 | # policy, and support documentation. 11 | 12 | name: Scorecard supply-chain security 13 | on: 14 | # For Branch-Protection check. Only the default branch is supported. See 15 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection 16 | branch_protection_rule: {} 17 | # To guarantee Maintained check is occasionally updated. See 18 | # https://github.com/ossf/scorecard/blob/main/docs/checks.md#maintained 19 | schedule: 20 | - cron: '15 15 15 * *' 21 | push: 22 | branches: 23 | - main 24 | - master 25 | workflow_dispatch: {} 26 | 27 | jobs: 28 | analysis: 29 | uses: giantswarm/github-workflows/.github/workflows/ossf-scorecard.yaml@main 30 | secrets: 31 | scorecard_token: ${{ secrets.SCORECARD_TOKEN }} 32 | -------------------------------------------------------------------------------- /test/conf/pint/pint-config.hcl: -------------------------------------------------------------------------------- 1 | rule { 2 | # Disallow spaces in label/annotation keys, they're only allowed in values. 3 | reject ".* +.*" { 4 | label_keys = true 5 | annotation_keys = true 6 | } 7 | 8 | # Disallow URLs in labels, they should go to annotations. 9 | reject "https?://.+" { 10 | label_keys = true 11 | label_values = true 12 | } 13 | 14 | # Ensure that all aggregations are preserving mandatory labels. 15 | aggregate ".+" { 16 | severity = "bug" 17 | keep = ["cluster_id", "installation", "pipeline", "provider"] 18 | } 19 | } 20 | 21 | rule { 22 | # This block will apply to all alerting rules. 
23 | match { 24 | kind = "alerting" 25 | } 26 | 27 | # Each alert must have a 'description' annotation. 28 | annotation "description" { 29 | severity = "bug" 30 | required = true 31 | } 32 | 33 | # Each alert must have an `area' label that's either 'kaas' or 'platform'. 34 | label "area" { 35 | severity = "bug" 36 | value = "(kaas|platform)" 37 | required = true 38 | } 39 | 40 | # Check how many times each alert would fire in the last 1d. 41 | alerts { 42 | range = "1d" 43 | step = "1m" 44 | resolve = "5m" 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/node.memory-pressure.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: node.memory-pressure.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: node.memory-pressure 12 | rules: 13 | - alert: ControlPlaneNodeMemoryPressureTaint 14 | annotations: 15 | description: '{{`Control plane node {{ $labels.node }} in {{ $labels.cluster_type }} cluster {{ $labels.installation }}{{ if $labels.cluster_id }}/{{ $labels.cluster_id }}{{ end }} has memory-pressure taint applied, indicating memory issues.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/memory-pressure/ 17 | expr: | 18 | kube_node_spec_taint{key="node.kubernetes.io/memory-pressure", effect="NoSchedule"} > 0 19 | and on (node) kube_node_role{role=~"control-plane|master"} 20 | for: 5m 21 | labels: 22 | area: kaas 23 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 24 | severity: page 25 | team: tenet 26 | topic: kubernetes 27 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/6a704f7e2a8b0f09e82b5bab88f17971af849711/pkg/gen/input/makefile/internal/file/Makefile.template 6 | # 7 | 8 | include Makefile.*.mk 9 | 10 | ##@ General 11 | 12 | # The help target prints out all targets with their descriptions organized 13 | # beneath their categories. The categories are represented by '##@' and the 14 | # target descriptions by '##'. The awk commands is responsible for reading the 15 | # entire set of makefiles included in this invocation, looking for lines of the 16 | # file as xyz: ## something, and then pretty-format the target and help. Then, 17 | # if there's a line with ##@ something, that gets pretty-printed as a category. 18 | # More info on the usage of ANSI control characters for terminal formatting: 19 | # https://en.wikipedia.org/wiki/ANSI_escape_code#SGR_parameters 20 | # More info on the awk command: 21 | # http://linuxcommand.org/lc3_adv_awk.php 22 | 23 | .PHONY: help 24 | help: ## Display this help. 
25 | @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n"} /^[a-zA-Z%\\\/_0-9-]+:.*?##/ { printf " \033[36m%-20s\033[0m %s\n", $$1, $$2 } /^##@/ { printf "\n\033[1m%s\033[0m\n", substr($$0, 5) } ' $(MAKEFILE_LIST) 26 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/rocket/alerting-rules/blackbox-exporter.cloud-provider-api.rules.yml: -------------------------------------------------------------------------------- 1 | # This rule is applied to all management clusters but it is only active if blackbox 2 | # exporter is deployed and configured with a scrape job named 'http-cloud-provider-api' 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PrometheusRule 5 | metadata: 6 | creationTimestamp: null 7 | labels: 8 | {{- include "labels.common" . | nindent 4 }} 9 | name: rocket-onprem-cloud-provider-api 10 | namespace: {{ .Values.namespace }} 11 | spec: 12 | groups: 13 | - name: rocket-onprem-cloud-provider-api 14 | rules: 15 | - alert: OnPremCloudProviderAPIIsDown 16 | annotations: 17 | description: '{{` blackbox-exporter on {{ $labels.cluster_id}} is unable to connect to the on-prem cloud provider API.`}}' 18 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/onprem-cloud-provider-api/ 19 | expr: probe_success{cluster_type="management_cluster",job="prometheus-blackbox-exporter",target="http-cloud-provider-api"} == 0 20 | for: 5m 21 | labels: 22 | area: kaas 23 | severity: page 24 | team: rocket 25 | topic: network 26 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 27 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/external-secrets.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: external-secrets.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: external-secrets 11 | rules: 12 | # This alert is for any deployment being in failed status in the `external-secrets` namespace. 13 | - alert: ExternalSecretsDeploymentNotSatisfied 14 | annotations: 15 | description: '{{`ExternalSecrets related deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=deployment&NAME={{ $labels.deployment }}`}}' 17 | expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", namespace="external-secrets"} > 0 18 | for: 30m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: honeybadger 24 | topic: managementcluster 25 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.create_release_pr.yaml: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. 
Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/ad0a25fbf301b2513e169ec964a8785d28f75be4/pkg/gen/input/workflows/internal/file/create_release_pr.yaml.template 6 | # 7 | name: Create Release PR 8 | on: 9 | push: 10 | branches: 11 | - 'legacy#release#v*.*.*' 12 | - 'main#release#v*.*.*' 13 | - 'main#release#major' 14 | - 'main#release#minor' 15 | - 'main#release#patch' 16 | - 'master#release#v*.*.*' 17 | - 'master#release#major' 18 | - 'master#release#minor' 19 | - 'master#release#patch' 20 | - 'release#v*.*.*' 21 | - 'release#major' 22 | - 'release#minor' 23 | - 'release#patch' 24 | - 'release-v*.*.x#release#v*.*.*' 25 | # "!" negates previous positive patterns so it has to be at the end. 26 | - '!release-v*.x.x#release#v*.*.*' 27 | workflow_call: 28 | inputs: 29 | branch: 30 | required: true 31 | type: string 32 | 33 | jobs: 34 | publish: 35 | uses: giantswarm/github-workflows/.github/workflows/create-release-pr.yaml@main 36 | with: 37 | branch: ${{ inputs.branch }} 38 | secrets: 39 | TAYLORBOT_GITHUB_ACTION: ${{ secrets.TAYLORBOT_GITHUB_ACTION }} 40 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/recording-rules/monitoring.resource-usage-estimation.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: monitoring.resource-usage-estimation.recording.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: monitoring.resource-usage-estimation.recording 11 | rules: 12 | - expr: (sum(scrape_samples_post_metric_relabeling) by (cluster_id, installation, job, pipeline, provider) / on(cluster_id) group_left sum(cortex_ingester_active_series{container="ingester"}) by (cluster_id)) * on(cluster_id) group_left sum(container_memory_usage_bytes{container="ingester", namespace="mimir"}) by (cluster_id) 13 | record: giantswarm:observability:monitoring:resource_usage_estimation:memory_usage_bytes 14 | - expr: (sum(scrape_samples_post_metric_relabeling) by (cluster_id, installation, job, pipeline, provider) / on(cluster_id) group_left sum(cortex_ingester_active_series{container="ingester"}) by (cluster_id)) * on(cluster_id) group_left sum(container_memory_working_set_bytes{container="ingester", namespace="mimir"}) by (cluster_id) 15 | record: giantswarm:observability:monitoring:resource_usage_estimation:memory_working_set_bytes 16 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/shield/alerting-rules/general.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - general.rules.yml 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | # Kyverno validating webhooks 8 | - series: 'kube_pod_container_status_restarts_total{cluster_id="golem", cluster_type="workload_cluster", installation="golem", namespace="security", pipeline="stable", pod="trivy-0", provider="capa"}' 9 | values: "0+1x120" 10 | alert_rule_test: 11 | # Trivy pod 12 | - alertname: ShieldComponentRestartingTooOften 13 | eval_time: 91m 14 | exp_alerts: 15 | - exp_labels: 16 | area: platform 17 | cluster_id: golem 18 | cluster_type: workload_cluster 19 | installation: golem 20 | pipeline: stable 21 | provider: capa 22 | severity: notify 23 | team: shield 24 | topic: security 25 | 
namespace: security 26 | pod: trivy-0 27 | cancel_if_outside_working_hours: "false" 28 | exp_annotations: 29 | description: 'Pod security/trivy-0 is restarting too often.' 30 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/shield-pod-failing/ 31 | -------------------------------------------------------------------------------- /mimir/update.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Update Mimir mixins from upstream 4 | # 5 | # This script is used to update the Mimir mixins from the upstream repository. 6 | # 7 | # Usage: 8 | # ./mimir/update.sh from the root of the repository 9 | 10 | set -e 11 | 12 | BRANCH="main" 13 | MIXIN_URL=https://github.com/grafana/mimir/operations/mimir-mixin@$BRANCH 14 | OUTPUT_FILE="$(pwd)"/helm/prometheus-rules/templates/platform/atlas/recording-rules/mimir-mixins.rules.yml 15 | 16 | cd mimir 17 | rm -rf vendor jsonnetfile.* "$OUTPUT_FILE" 18 | 19 | jb init 20 | jb install $MIXIN_URL 21 | mixtool generate rules mixin.libsonnet -r "$OUTPUT_FILE" 22 | 23 | # Remove the initial `groups:` line 24 | sed -i '1d' "$OUTPUT_FILE" 25 | 26 | # Add the PrometheusRule metadata header 27 | sed -i '1i\ 28 | apiVersion: monitoring.coreos.com/v1\ 29 | kind: PrometheusRule\ 30 | metadata:\ 31 | labels:\ 32 | {{- include "labels.common" . | nindent 4 }}\ 33 | name: mimir.recording.rules\ 34 | namespace: {{ .Values.namespace }}\ 35 | spec:\ 36 | groups:' "$OUTPUT_FILE" 37 | 38 | # Add the mimir enabled helm conditional blocks 39 | sed -i '1i{{- if eq .Values.managementCluster.provider.flavor "capi" }}' "$OUTPUT_FILE" 40 | sed -i -e '$a{{- end }}' "$OUTPUT_FILE" 41 | 42 | sed -i 's/cluster_id,/cluster_id, installation, pipeline, provider,/g' "$OUTPUT_FILE" 43 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/mimir.logs.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4}} 6 | name: mimir.logs.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: mimir-distributor 11 | rules: 12 | - alert: MimirDistributorMaxInflightPushRequests 13 | annotations: 14 | description: '{{`Mimir distributor is experiencing high rate of "err-mimir-distributor-max-inflight-push-requests" errors. 
Rate: {{ printf "%.2f" $value }} errors/second over the last 10 minutes.`}}' 15 | summary: Mimir distributor max inflight push requests errors 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/mimir/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 17 | expr: |- 18 | sum(rate({service_name="distributor"} |= "err-mimir-distributor-max-inflight-push-requests" [10m])) by (cluster_id, installation, provider, pipeline, namespace) > 0.1 19 | for: 15m 20 | labels: 21 | area: platform 22 | cancel_if_outside_working_hours: "true" 23 | severity: page 24 | team: atlas 25 | topic: observability 26 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/capa.inhibition.rules.yml: -------------------------------------------------------------------------------- 1 | {{- if eq .Values.managementCluster.provider.kind "capa" }} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . | nindent 4 }} 8 | cluster_type: "management_cluster" 9 | name: capa.inhibitions.rules 10 | namespace: {{ .Values.namespace }} 11 | spec: 12 | groups: 13 | - name: capa.inhibitions 14 | rules: 15 | - alert: InhibitionClusterWithoutWorkerNodes 16 | annotations: 17 | description: '{{`Cluster ({{ $labels.cluster_id }}) has no worker nodes.`}}' 18 | expr: |- 19 | label_replace( 20 | capi_cluster_status_condition{type="ControlPlaneReady", status="True"}, 21 | "cluster_id", 22 | "$1", 23 | "name", 24 | "(.*)" 25 | ) == 1 26 | unless on (cluster_id) ( 27 | sum(capi_machinepool_spec_replicas{} > 0) by (cluster_id) 28 | ) 29 | and on (cluster_id) ( 30 | capi_cluster_info{infrastructure_reference_kind="AWSCluster"} == 1 31 | ) 32 | labels: 33 | area: kaas 34 | has_worker_nodes: "false" 35 | team: phoenix 36 | topic: status 37 | {{- end }} 38 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/statefulset.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: statefulset.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: statefulset 12 | rules: 13 | - alert: StatefulsetNotSatisfiedAtlas 14 | annotations: 15 | description: '{{`Statefulset {{ $labels.namespace}}/{{ $labels.statefulset }} is not satisfied.`}}' 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=statefulset&NAME={{ $labels.statefulset }}`}}' 17 | expr: |- 18 | kube_statefulset_status_replicas{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*|pyroscope.*|tempo.*"} 19 | - kube_statefulset_status_replicas_ready{cluster_type="management_cluster", statefulset=~"loki.*|mimir.*|pyroscope.*|tempo.*"} 20 | > 0 21 | for: 3d 22 | labels: 23 | area: platform 24 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} 25 | severity: page 26 | team: atlas 27 | topic: managementcluster 28 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/crossplane.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: crossplane.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: crossplane 11 | rules: 12 | # This alert is for any deployment being in failed status in the `crossplane` namespace. 13 | # This usually includes Crossplane core components themselves, installed provider(s) and the metrics exporter. 14 | - alert: CrossplaneDeploymentNotSatisfied 15 | annotations: 16 | description: '{{`Crossplane related deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 17 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=deployment&NAME={{ $labels.deployment }}`}}' 18 | expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", namespace="crossplane"} > 0 19 | for: 30m 20 | labels: 21 | area: platform 22 | cancel_if_outside_working_hours: "true" 23 | severity: page 24 | team: honeybadger 25 | topic: managementcluster 26 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/capi-machineset.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - capi-machineset.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'capi_machineset_annotation_paused{paused_value="true",cluster_id="grumpy", name="grumpy-def99", exported_namespace="giantswarm"}' 8 | values: "0+1x75" 9 | - series: 'capi_cluster_info{cluster_id="grumpy", provider="capa"}' 10 | values: "1+0x75" 11 | alert_rule_test: 12 | - alertname: MachineSetPaused 13 | eval_time: 75m 14 | exp_alerts: 15 | - exp_labels: 16 | area: kaas 17 | cancel_if_monitoring_agent_down: "true" 18 | cancel_if_outside_working_hours: "true" 19 | provider: capa 20 | severity: notify 21 | team: phoenix 22 | topic: managementcluster 23 | cluster_id: grumpy 24 | name: grumpy-def99 25 | exported_namespace: giantswarm 26 | paused_value: "true" 27 | exp_annotations: 28 | description: "Machineset giantswarm/grumpy-def99 is paused." 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machineset/ 30 | __dashboardUid__: bdi7iswg81czkcasd 31 | dashboardQueryParams: "orgId=2" 32 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/storage.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: observability.storage.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: observability.storage 12 | rules: 13 | - alert: ObservabilityStorageSpaceTooLow 14 | annotations: 15 | description: '{{`The free space on the Data Disk for instance: {{ $labels.instance }} and PVC: {{ $labels.persistentvolumeclaim}} was below 10 percent for longer than 1 hour (current value {{ $value | printf "%.2f" }}).`}}' 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/low-disk-space/#persistent-volume?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 17 | expr: kubelet_volume_stats_available_bytes{cluster_type="management_cluster", persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo|grafana-postgresql).*"}/kubelet_volume_stats_capacity_bytes{persistentvolumeclaim=~".*(alertmanager|loki|mimir|prometheus|pyroscope|tempo).*"} < 0.10 18 | for: 1h 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: atlas 24 | topic: observability 25 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/flux-atlas.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: fluxcd-atlas.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: fluxcd-atlas 12 | rules: 13 | # 14 | # FluxKustomizationFailed 15 | # 16 | # Alerting for GiantSwarm management clusters silences Kustomization CRs. 
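# Illustrative sketch only (mirrors the unit test for this rule, not an authoritative spec):
# the expression below is meant to match a series such as
#   gotk_resource_info{ready="False", customresource_kind="Kustomization",
#                      cluster_type="management_cluster",
#                      exported_namespace="flux-giantswarm", name="silences"}  =>  1
# and the alert fires once such a series stays above 0 for the full 20m "for" window.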
17 | # 18 | - alert: FluxKustomizationFailed 19 | annotations: 20 | description: |- 21 | {{`Flux Kustomization {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}} 22 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/flux-kustomization-failed/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.exported_namespace }}&KUSTOMIZATION_NAME={{ $labels.name }}`}}' 23 | expr: gotk_resource_info{ready="False", customresource_kind="Kustomization", cluster_type="management_cluster", exported_namespace=~".*giantswarm.*", name="silences"} > 0 24 | for: 20m 25 | labels: 26 | area: platform 27 | cancel_if_outside_working_hours: "true" 28 | severity: page 29 | team: atlas 30 | topic: releng 31 | -------------------------------------------------------------------------------- /test/tests/providers/capa/kaas/phoenix/alerting-rules/cluster-crossplane.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - cluster-crossplane.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'crossplane_managed_resource_exists{gvk="cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule", cluster_id="mymc", installation="test-installation"}' 8 | values: "6x20" 9 | - series: 'crossplane_managed_resource_ready{gvk="cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule", cluster_id="mymc", installation="test-installation"}' 10 | values: "5x20" 11 | 12 | alert_rule_test: 13 | - alertname: ClusterCrossplaneResourcesNotReady 14 | eval_time: 20m 15 | exp_alerts: 16 | - exp_labels: 17 | area: kaas 18 | cancel_if_outside_working_hours: "false" 19 | cluster_id: "mymc" 20 | gvk: "cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule" 21 | installation: "test-installation" 22 | severity: page 23 | team: phoenix 24 | exp_annotations: 25 | description: 'Not all managed Crossplane resources of type "cloudwatchevents.aws.upbound.io/v1beta1, Kind=Rule" on mymc are ready. This could affect creation or health of workload clusters.' 26 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/cluster-crossplane-resources/?INSTALLATION=test-installation&CLUSTER=mymc 27 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/atlas/alerting-rules/silence-operator.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - silence-operator.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'operatorkit_controller_errors_total{job="monitoring/silence-operator", controller="silence-controller", cluster_type="management_cluster", installation="myinstall", cluster_id="bar"}' 9 | values: "0x30 1+0x20 20x45 20-1x20 0x100" 10 | alert_rule_test: 11 | - alertname: SilenceOperatorReconcileErrors 12 | eval_time: 60m 13 | - alertname: SilenceOperatorReconcileErrors 14 | eval_time: 95m 15 | exp_alerts: 16 | - exp_labels: 17 | job: "monitoring/silence-operator" 18 | area: platform 19 | cancel_if_outside_working_hours: "true" 20 | cluster_id: bar 21 | cluster_type: management_cluster 22 | controller: silence-controller 23 | installation: "myinstall" 24 | severity: "page" 25 | team: "atlas" 26 | topic: "observability" 27 | exp_annotations: 28 | description: "silence-operator controller silence-controller too many reconcile errors." 
29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/operator-not-reconciling/?INSTALLATION=myinstall&CLUSTER=bar 30 | - alertname: SilenceOperatorReconcileErrors 31 | eval_time: 215m 32 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-kubeadmconfig.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . | nindent 4}} 5 | name: capi-kubeadmconfig.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-kubeadmconfig 10 | rules: 11 | - alert: KubeadmConfigNotReady 12 | expr: |- 13 | ( 14 | capi_kubeadmconfig_status_condition{type="Ready", status="False"} 15 | * on(cluster_id) group_left(provider) 16 | sum( 17 | label_replace( 18 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 19 | ) 20 | ) by (cluster_id, provider) 21 | ) > 0 22 | for: 1h 23 | labels: 24 | area: kaas 25 | cancel_if_monitoring_agent_down: "true" 26 | cancel_if_outside_working_hours: "true" 27 | severity: page 28 | team: {{ include "providerTeam" . }} 29 | topic: managementcluster 30 | annotations: 31 | description: |- 32 | {{`KubeadmConfig {{$labels.exported_namespace}}/{{$labels.name}} in cluster {{$labels.cluster_id}} encountered errors while generating a data secret`}} 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-kubeadmconfig/ 34 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-machineset.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . | nindent 4}} 5 | name: capi-machineset.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-machineset 10 | rules: 11 | - alert: MachineSetPaused 12 | expr: |- 13 | ( 14 | capi_machineset_annotation_paused{paused_value="true"} 15 | * on(cluster_id) group_left(provider) 16 | sum( 17 | label_replace( 18 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 19 | ) 20 | ) by (cluster_id, provider) 21 | ) > 0 22 | for: 1h 23 | labels: 24 | area: kaas 25 | cancel_if_monitoring_agent_down: "true" 26 | cancel_if_outside_working_hours: "true" 27 | severity: notify 28 | team: {{ include "providerTeam" . }} 29 | topic: managementcluster 30 | annotations: 31 | description: |- 32 | {{`Machineset {{ $labels.exported_namespace }}/{{ $labels.name }} is paused.`}} 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machineset/ 34 | __dashboardUid__: bdi7iswg81czkcasd 35 | dashboardQueryParams: "orgId=2" 36 | -------------------------------------------------------------------------------- /scripts/find-alerts.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Contributed during Xmas 2022 hackathon. 4 | # 5 | 6 | # Example how to run it: 7 | # scripts/find-alerts.sh '.labels.team=="atlas"' '.labels.cancel_if_outside_working_hours=="true"' '.labels.severity=="page"' 8 | # => will report all alerts for team Atlas that page but are canceled out of working hours. 
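# Another example, purely illustrative — the arguments are arbitrary jq select() expressions,
# so any alert label used in this repo can be filtered on the same way:
# scripts/find-alerts.sh '.labels.team=="tenet"' '.labels.severity=="notify"'
# => would list every tenet-owned alert that only notifies instead of paging.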
9 | 10 | # /!\ This script is provided as-is. 11 | # It won't break anything in your files, but parameters management, help, error handling is missing. 12 | # Meaning: no guarantee about the quality of generated output 13 | 14 | # In this place we can file helm-generated rules 15 | rulesFilesDir=test/tests/providers/aws/ 16 | # => prerequisite: have files generated. for instance "make test" starts with generating files. 17 | 18 | # Custom (user-provided) filters 19 | selectQueries=("$@") 20 | 21 | # Build `jq` query from filters given as parameters 22 | selectQueriesString="$(printf "| select(%s)\n" "${selectQueries[@]}")" 23 | 24 | # For each rules file 25 | for rulesFile in "$rulesFilesDir"/*.rules.yml; do 26 | 27 | # Retrieve (in an array) alert names that match the query 28 | mapfile -t alertsList < <( 29 | yq -ojson "$rulesFile" 2>/dev/null \ 30 | | jq '.groups[].rules[] 31 | '"$selectQueriesString"' 32 | | .alert' 2>/dev/null 33 | ) || continue 34 | 35 | # Console output 36 | for alert in "${alertsList[@]}"; do 37 | echo "alert $alert - file $(basename "$rulesFile")" 38 | done 39 | done 40 | -------------------------------------------------------------------------------- /DCO: -------------------------------------------------------------------------------- 1 | Developer Certificate of Origin 2 | Version 1.1 3 | 4 | Copyright (C) 2004, 2006 The Linux Foundation and its contributors. 5 | 660 York Street, Suite 102, 6 | San Francisco, CA 94110 USA 7 | 8 | Everyone is permitted to copy and distribute verbatim copies of this 9 | license document, but changing it is not allowed. 10 | 11 | 12 | Developer's Certificate of Origin 1.1 13 | 14 | By making a contribution to this project, I certify that: 15 | 16 | (a) The contribution was created in whole or in part by me and I 17 | have the right to submit it under the open source license 18 | indicated in the file; or 19 | 20 | (b) The contribution is based upon previous work that, to the best 21 | of my knowledge, is covered under an appropriate open source 22 | license and I have the right under that license to submit that 23 | work with modifications, whether created in whole or in part 24 | by me, under the same open source license (unless I am 25 | permitted to submit under a different license), as indicated 26 | in the file; or 27 | 28 | (c) The contribution was provided directly to me by some other 29 | person who certified (a), (b) or (c) and I have not modified 30 | it. 31 | 32 | (d) I understand and agree that this project and the contribution 33 | are public and that a record of the contribution (including all 34 | personal information I submit with it, including my sign-off) is 35 | maintained indefinitely and may be redistributed consistent with 36 | this project or the open source license(s) involved. 37 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/cluster-autoscaler.rules.yml: -------------------------------------------------------------------------------- 1 | # This rule applies to all cloud workload clusters 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . 
| nindent 4 }} 8 | name: cluster-autoscaler.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: cluster-autoscaler 13 | rules: 14 | - alert: ClusterAutoscalerUnneededNodes 15 | annotations: 16 | description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has unneeded nodes.`}}' 17 | expr: cluster_autoscaler_unneeded_nodes_count{cluster_type="workload_cluster", provider=~"capa|capz|eks"} > 0 18 | for: 240m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: "true" 22 | cancel_if_cluster_has_no_workers: "true" 23 | severity: notify 24 | team: tenet 25 | topic: cluster-autoscaler 26 | - alert: ClusterAutoscalerFailedScaling 27 | annotations: 28 | description: '{{`Cluster-Autoscaler on {{ $labels.cluster_id }} has failed scaling up {{ $value | printf "%.0f" }} times recently.`}}' 29 | expr: increase(cluster_autoscaler_failed_scale_ups_total[15m]) > 1 and rate(cluster_autoscaler_failed_scale_ups_total[5m]) > 0 30 | labels: 31 | area: kaas 32 | cancel_if_outside_working_hours: "true" 33 | cancel_if_cluster_has_no_workers: "true" 34 | severity: page 35 | team: tenet 36 | topic: cluster-autoscaler 37 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/atlas/alerting-rules/flux-atlas.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - flux-atlas.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'gotk_resource_info{ready="False", job="giantswarm/cluster-api-monitoring", installation="test", cluster_id="test", customresource_kind="Kustomization", cluster_type="management_cluster", exported_namespace="flux-giantswarm", name="silences"}' 9 | values: "1x60" 10 | alert_rule_test: 11 | - alertname: FluxKustomizationFailed 12 | eval_time: 30m 13 | exp_alerts: 14 | - exp_labels: 15 | alertname: "FluxKustomizationFailed" 16 | area: "platform" 17 | cancel_if_outside_working_hours: "true" 18 | cluster_id: "test" 19 | cluster_type: "management_cluster" 20 | customresource_kind: "Kustomization" 21 | exported_namespace: "flux-giantswarm" 22 | installation: "test" 23 | job: "giantswarm/cluster-api-monitoring" 24 | name: "silences" 25 | ready: "False" 26 | severity: "page" 27 | team: "atlas" 28 | topic: "releng" 29 | exp_annotations: 30 | description: "Flux Kustomization silences in ns flux-giantswarm on test/test is stuck in Failed state." 31 | runbook_url: "https://intranet.giantswarm.io/docs/support-and-ops/runbooks/flux-kustomization-failed/?INSTALLATION=test&CLUSTER=test&NAMESPACE=flux-giantswarm&KUSTOMIZATION_NAME=silences" 32 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/cabbage/alerting-rules/dns.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: dns.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: dns 12 | rules: 13 | - alert: DNSErrorRateTooHigh 14 | annotations: 15 | description: '{{`DNS error rate is too high for {{ or $labels.pod $labels.instance }} to {{ $labels.host }}, using {{ $labels.proto }}.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/network-error/ 17 | expr: rate(dns_resolve_error_total[15m]) > 0.015 18 | for: 15m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | cancel_if_cluster_has_no_workers: "true" 23 | severity: page 24 | team: cabbage 25 | topic: network 26 | - alert: DNSCheckErrorRateTooHigh 27 | annotations: 28 | description: '{{`DNS check error rate is too high for {{ or $labels.pod $labels.instance }}.`}}' 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/network-error/ 30 | expr: rate(dns_error_total[15m]) > 0.015 31 | for: 15m 32 | labels: 33 | area: platform 34 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 35 | cancel_if_cluster_has_no_workers: "true" 36 | severity: page 37 | team: cabbage 38 | topic: network 39 | -------------------------------------------------------------------------------- /.github/workflows/alert_tests.yaml: -------------------------------------------------------------------------------- 1 | name: alert-test 2 | run-name: run unit and conformance tests 3 | 4 | on: 5 | pull_request: 6 | # Only run on PRs based on the main branch 7 | branches: 8 | - main 9 | 10 | jobs: 11 | rules-tests: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 15 | with: 16 | fetch-depth: "0" 17 | - name: run rules tests 18 | run: make test-rules 19 | inhibition-tests: 20 | runs-on: ubuntu-latest 21 | steps: 22 | - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 23 | with: 24 | fetch-depth: "0" 25 | - name: run inhibition tests 26 | run: make test-inhibitions 27 | runbook-tests: 28 | runs-on: ubuntu-latest 29 | steps: 30 | - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 31 | with: 32 | fetch-depth: "0" 33 | - name: Clone intranet repository 34 | uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 35 | with: 36 | fetch-depth: 1 37 | repository: giantswarm/giantswarm 38 | path: giantswarm 39 | token: ${{ secrets.TAYLORBOT_GITHUB_ACTION }} 40 | - name: run runbook tests 41 | env: 42 | RUNBOOKS_DIR: ./giantswarm 43 | run: make test-ci-runbooks 44 | prometheus-lint: 45 | runs-on: ubuntu-latest 46 | steps: 47 | - uses: actions/checkout@8e8c483db84b4bee98b60c0593521ed34d9990e8 # v6.0.1 48 | with: 49 | fetch-depth: "0" 50 | - name: run pint linter 51 | run: make pint 52 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/certificate.management-cluster.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - certificate.management-cluster.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'cert_exporter_secret_not_after{cluster_id="gauss", cluster_type="management_cluster", secretkey="tls.crt", certificatename="capa-serving-cert", exported_namespace="giantswarm", provider="capa"}' 9 | values: "1x20 1x20 0+0x20" 10 | - series: 'cert_exporter_certificate_cr_not_after{cluster_id="gauss", 
cluster_type="management_cluster", name="capa-serving-cert", exported_namespace="giantswarm", provider="capa"}' 11 | values: "1x20 _x20 0+0x20" 12 | alert_rule_test: 13 | - alertname: ManagementClusterCertificateIsMissing 14 | eval_time: 15m 15 | - alertname: ManagementClusterCertificateIsMissing 16 | eval_time: 35m 17 | exp_alerts: 18 | - exp_labels: 19 | area: kaas 20 | cancel_if_outside_working_hours: "true" 21 | certificatename: capa-serving-cert 22 | cluster_id: gauss 23 | exported_namespace: giantswarm 24 | provider: capa 25 | severity: page 26 | team: phoenix 27 | topic: security 28 | exp_annotations: 29 | description: 'Cannot renew Certificate for Secret giantswarm/capa-serving-cert on gauss because it is missing.' 30 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/managed-app-cert-manager/missing-certificate-for-secret/ 31 | - alertname: ManagementClusterCertificateIsMissing 32 | eval_time: 55m 33 | -------------------------------------------------------------------------------- /.github/workflows/update-tempo-mixins.yml: -------------------------------------------------------------------------------- 1 | name: Update Tempo Mixins 2 | 3 | on: 4 | schedule: 5 | # Run on the 1st day of every month at 10:00 UTC 6 | - cron: '0 10 1 * *' 7 | workflow_dispatch: # Allow manual triggering 8 | 9 | permissions: 10 | contents: write 11 | pull-requests: write 12 | 13 | jobs: 14 | update-tempo-mixins: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v6 20 | with: 21 | token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - name: Set up Go 24 | uses: actions/setup-go@v4 25 | with: 26 | go-version: '1.25' 27 | 28 | - name: Update Tempo mixins 29 | run: make update-tempo-mixin 30 | 31 | - name: Check for changes 32 | id: changes 33 | run: | 34 | if git diff --quiet; then 35 | echo "has_changes=false" >> $GITHUB_OUTPUT 36 | else 37 | echo "has_changes=true" >> $GITHUB_OUTPUT 38 | fi 39 | 40 | - name: Create Pull Request 41 | if: steps.changes.outputs.has_changes == 'true' 42 | uses: peter-evans/create-pull-request@v8 43 | with: 44 | token: ${{ secrets.GITHUB_TOKEN }} 45 | commit-message: 'chore: update Tempo mixins from upstream' 46 | title: 'chore: update Tempo mixins from upstream' 47 | body: | 48 | This PR updates the Tempo mixins from grafana/tempo upstream repository. 49 | 50 | This is an automated update that runs monthly. 51 | branch: update-tempo-mixins-${{ github.run_number }} 52 | delete-branch: true 53 | draft: false 54 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/systemd.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: systemd.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: systemd 12 | rules: 13 | ## TODO(@giantswarm/team-tenet): Update those lists when all vintage clusters are gone 14 | - alert: ClusterCriticalSystemdUnitFailed 15 | annotations: 16 | description: '{{`Critical systemd unit {{ $labels.name }} is failed on {{ $labels.instance }}.`}}' 17 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/critical-systemd-unit-failed/ 18 | expr: node_systemd_unit_state{name=~"k8s-addons.service|systemd-networkd.service", state="failed"} == 1 19 | for: 5m 20 | labels: 21 | area: kaas 22 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 23 | severity: page 24 | team: tenet 25 | topic: infrastructure 26 | - alert: ClusterDisabledSystemdUnitActive 27 | annotations: 28 | description: '{{`Disabled Systemd unit {{ $labels.name }} is active on {{ $labels.ip }}.`}}' 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/disabled-systemd-unit-active/ 30 | expr: node_systemd_unit_state{name=~"locksmithd.service|update-engine.service", state="active"} == 1 31 | for: 5m 32 | labels: 33 | area: kaas 34 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 35 | severity: page 36 | team: tenet 37 | topic: infrastructure 38 | -------------------------------------------------------------------------------- /test/conf/promtool_ignore: -------------------------------------------------------------------------------- 1 | kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml 2 | kaas/phoenix/alerting-rules/aws.node.workload-cluster.rules.yml 3 | kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml 4 | kaas/phoenix/alerting-rules/capa.management-cluster.rules.yml 5 | kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml 6 | kaas/phoenix/alerting-rules/irsa.rules.yml 7 | platform/atlas/alerting-rules/deployment.management-cluster.rules.yml 8 | platform/atlas/alerting-rules/deployment.workload-cluster.rules.yml 9 | platform/atlas/alerting-rules/fluentbit.rules.yml 10 | platform/atlas/alerting-rules/inhibit.oncall.rules.yml 11 | platform/atlas/alerting-rules/keda.rules.yml 12 | platform/atlas/alerting-rules/kube-state-metrics.rules.yml 13 | platform/atlas/alerting-rules/prometheus-operator.rules.yml 14 | platform/atlas/alerting-rules/storage.rules.yml 15 | platform/atlas/recording-rules/grafana-cloud.rules.yml 16 | platform/atlas/recording-rules/loki-mixins.rules.yml 17 | platform/atlas/recording-rules/mimir-mixins.rules.yml 18 | platform/cabbage/alerting-rules/coredns.rules.yml 19 | platform/cabbage/alerting-rules/external-dns.rules.yml 20 | platform/cabbage/alerting-rules/ingress-controller.rules.yml 21 | platform/cabbage/alerting-rules/dns.rules.yml 22 | platform/cabbage/recording-rules/gs-managed-app-deployment-status.rules.yml 23 | platform/honeybadger/alerting-rules/chart.rules.yml 24 | platform/honeybadger/alerting-rules/helm.rules.yml 25 | platform/honeybadger/alerting-rules/secret.rules.yml 26 | platform/honeybadger/recording-rules/helm-operations.rules.yml 27 | platform/shield/alerting-rules/falco.rules.yml 28 | platform/shield/alerting-rules/cert-manager.rules.yml 29 | platform/shield/alerting-rules/dex.rules.yml 30 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/_helpers.tpl: -------------------------------------------------------------------------------- 1 | {{/* vim: set 
filetype=mustache: */}} 2 | {{/* 3 | Expand the name of the chart. 4 | */}} 5 | {{- define "name" -}} 6 | {{- .Chart.Name | trunc 63 | trimSuffix "-" -}} 7 | {{- end -}} 8 | 9 | {{/* 10 | Create chart name and version as used by the chart label. 11 | */}} 12 | {{- define "chart" -}} 13 | {{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" -}} 14 | {{- end -}} 15 | 16 | {{/* 17 | Common labels 18 | */}} 19 | {{- define "labels.common" -}} 20 | app.kubernetes.io/name: {{ include "name" . | quote }} 21 | app.kubernetes.io/instance: {{ .Release.Name | quote }} 22 | app.kubernetes.io/managed-by: {{ .Release.Service | quote }} 23 | app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} 24 | application.giantswarm.io/team: {{ index .Chart.Annotations "application.giantswarm.io/team" | default "atlas" | quote }} 25 | helm.sh/chart: {{ include "chart" . | quote }} 26 | giantswarm.io/service-type: {{ .Values.serviceType }} 27 | {{- if or (.Template.Name | hasSuffix "logs.yaml") (.Template.Name | hasSuffix "logs.yml")}} 28 | application.giantswarm.io/prometheus-rule-kind: loki 29 | {{- end }} 30 | observability.giantswarm.io/tenant: giantswarm 31 | {{- end -}} 32 | 33 | {{- define "providerTeam" -}} 34 | '{{`{{ if or (eq .Labels.provider "cloud-director") (eq .Labels.provider "vsphere") }}rocket{{ else }}phoenix{{ end }}`}}' 35 | {{- end -}} 36 | 37 | {{- define "workingHoursOnly" -}} 38 | {{- if eq .Values.managementCluster.pipeline "stable-testing" -}} 39 | "true" 40 | {{- else -}} 41 | "false" 42 | {{- end -}} 43 | {{- end -}} 44 | 45 | {{- define "namespaceNotGiantswarm" -}} 46 | "(([^g]|g[^i]|gi[^a]|gia[^n]|gian[^t]|giant[^s]|giants[^w]|giantsw[^a]|giantswa[^r]|giantswar[^m])*)" 47 | {{- end -}} 48 | -------------------------------------------------------------------------------- /.github/workflows/update-loki-mixins.yml: -------------------------------------------------------------------------------- 1 | name: Update Loki Mixins 2 | 3 | on: 4 | schedule: 5 | # Run on the 1st day of every month at 09:30 UTC 6 | - cron: '30 9 1 * *' 7 | workflow_dispatch: # Allow manual triggering 8 | 9 | permissions: 10 | contents: write 11 | pull-requests: write 12 | 13 | jobs: 14 | update-loki-mixins: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v6 20 | with: 21 | token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - name: Set up Go 24 | uses: actions/setup-go@v4 25 | with: 26 | go-version: '1.25' 27 | 28 | - name: Install tools 29 | run: make install-tools 30 | 31 | - name: Update Loki mixins 32 | run: make update-loki-mixin 33 | 34 | - name: Check for changes 35 | id: changes 36 | run: | 37 | if git diff --quiet; then 38 | echo "has_changes=false" >> $GITHUB_OUTPUT 39 | else 40 | echo "has_changes=true" >> $GITHUB_OUTPUT 41 | fi 42 | 43 | - name: Create Pull Request 44 | if: steps.changes.outputs.has_changes == 'true' 45 | uses: peter-evans/create-pull-request@v8 46 | with: 47 | token: ${{ secrets.GITHUB_TOKEN }} 48 | commit-message: 'chore: update Loki mixins from upstream' 49 | title: 'chore: update Loki mixins from upstream' 50 | body: | 51 | This PR updates the Loki mixins from grafana/loki upstream repository. 52 | 53 | This is an automated update that runs monthly. 
54 | branch: update-loki-mixins-${{ github.run_number }} 55 | delete-branch: true 56 | draft: false 57 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/fairness.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: fairness.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: fairness 11 | rules: 12 | - alert: FlowcontrolRejectedRequests 13 | annotations: 14 | description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: k8s API fairness is rejecting calls in flow schema {{ $labels.flow_schema }}.`}}' 15 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/flowcontrol-rejected-requests/ 16 | expr: (increase(apiserver_flowcontrol_rejected_requests_total[1m]) > 0) 17 | for: 5m 18 | labels: 19 | area: kaas 20 | cancel_if_outside_working_hours: "true" 21 | severity: notify 22 | team: tenet 23 | topic: kubernetes 24 | - alert: FlowcontrolTooManyRequests 25 | annotations: 26 | description: '{{`Cluster {{ $labels.installation }}/{{ $labels.cluster_id }}: there are too many API requests for flow schema {{ $labels.flow_schema }}.`}}' 27 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/flowcontrol-rejected-requests/ 28 | expr: sum(irate(apiserver_flowcontrol_dispatched_requests_total[1m])) by (cluster_id, installation, pipeline, provider, flow_schema, priority_level) > min by(cluster_id, installation, pipeline, provider, flow_schema, priority_level) (apiserver_flowcontrol_nominal_limit_seats) 29 | for: 15m 30 | labels: 31 | area: kaas 32 | cancel_if_outside_working_hours: "true" 33 | severity: notify 34 | team: tenet 35 | topic: kubernetes 36 | -------------------------------------------------------------------------------- /.github/workflows/update-mimir-mixins.yml: -------------------------------------------------------------------------------- 1 | name: Update Mimir Mixins 2 | 3 | on: 4 | schedule: 5 | # Run on the 1st day of every month at 09:00 UTC 6 | - cron: '0 9 1 * *' 7 | workflow_dispatch: # Allow manual triggering 8 | 9 | permissions: 10 | contents: write 11 | pull-requests: write 12 | 13 | jobs: 14 | update-mimir-mixins: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - name: Checkout repository 19 | uses: actions/checkout@v6 20 | with: 21 | token: ${{ secrets.GITHUB_TOKEN }} 22 | 23 | - name: Set up Go 24 | uses: actions/setup-go@v4 25 | with: 26 | go-version: '1.25' 27 | 28 | - name: Install tools 29 | run: make install-tools 30 | 31 | - name: Update Mimir mixins 32 | run: make update-mimir-mixin 33 | 34 | - name: Check for changes 35 | id: changes 36 | run: | 37 | if git diff --quiet; then 38 | echo "has_changes=false" >> $GITHUB_OUTPUT 39 | else 40 | echo "has_changes=true" >> $GITHUB_OUTPUT 41 | fi 42 | 43 | - name: Create Pull Request 44 | if: steps.changes.outputs.has_changes == 'true' 45 | uses: peter-evans/create-pull-request@v8 46 | with: 47 | token: ${{ secrets.GITHUB_TOKEN }} 48 | commit-message: 'chore: update Mimir mixins from upstream' 49 | title: 'chore: update Mimir mixins from upstream' 50 | body: | 51 | This PR updates the Mimir mixins from grafana/mimir upstream repository. 52 | 53 | This is an automated update that runs monthly. 
54 | branch: update-mimir-mixins-${{ github.run_number }} 55 | delete-branch: true 56 | draft: false 57 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/app-configuration.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: atlas-app-configuration.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: atlas-app-configuration 11 | rules: 12 | # Coming from https://gigantic.slack.com/archives/C07A03AN9JM 13 | # This alert ensures our app has no unexpected configmaps. 14 | - alert: ConfigmapUnexpected 15 | annotations: 16 | description: '{{`{{ $labels.configmap }} configmap is not expected.`}}' 17 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/atlas-app-configuration/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 18 | expr: | 19 | kube_configmap_info{cluster_type="management_cluster", configmap=~".*(loki|mimir)-user-values"} > 0 20 | for: 2d 21 | labels: 22 | area: platform 23 | cancel_if_outside_working_hours: "true" 24 | severity: notify 25 | team: atlas 26 | topic: observability 27 | # This alert ensures our app has no unexpected secrets. 28 | - alert: SecretUnexpected 29 | annotations: 30 | description: '{{`{{ $labels.secret }} secret is not expected.`}}' 31 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/atlas-app-configuration/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 32 | expr: | 33 | kube_secret_info{cluster_type="management_cluster", secret=~".*(loki|mimir)-user-values"} > 0 34 | for: 2d 35 | labels: 36 | area: platform 37 | cancel_if_outside_working_hours: "true" 38 | severity: notify 39 | team: atlas 40 | topic: observability 41 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/honeybadger/alerting-rules/flux.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - flux.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'gotk_resource_info{ready="False", job="giantswarm/cluster-api-monitoring", customresource_kind="Kustomization", cluster_type="management_cluster", exported_namespace="flux-giantswarm", name="silences"}' 9 | values: "1x60" 10 | alert_rule_test: 11 | - alertname: FluxKustomizationFailed 12 | eval_time: 30m 13 | exp_alerts: [] 14 | - interval: 1m 15 | input_series: 16 | - series: 'gotk_resource_info{installation="test", job="giantswarm/cluster-api-monitoring", cluster_type="management_cluster", exported_namespace="flux-giantswarm", customresource_kind="Kustomization", name="flux", suspended="true"}' 17 | values: "1x60 0+1x60 1+0x1500" 18 | alert_rule_test: 19 | - alertname: FluxSuspendedForTooLong 20 | eval_time: 1560m 21 | exp_alerts: 22 | - exp_labels: 23 | alertname: "FluxSuspendedForTooLong" 24 | area: platform 25 | cancel_if_outside_working_hours: "true" 26 | cluster_type: "management_cluster" 27 | customresource_kind: "Kustomization" 28 | exported_namespace: "flux-giantswarm" 29 | installation: "test" 30 | job: "giantswarm/cluster-api-monitoring" 31 | name: "flux" 32 | severity: "page" 33 | suspended: "true" 34 | team: "honeybadger" 35 | topic: "releng" 36 | 
exp_annotations: 37 | description: "Flux Kustomization flux in ns flux-giantswarm on test has been suspended for 24h." 38 | runbook_url: "https://intranet.giantswarm.io/docs/support-and-ops/runbooks/flux-suspended/?INSTALLATION=test&CLUSTER=" 39 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/sloth.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: sloth.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: sloth 11 | rules: 12 | - alert: SlothDown 13 | annotations: 14 | description: 'Sloth is down.' 15 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/sloth-down/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 16 | expr: count(up{job="monitoring/sloth"} == 0) by (cluster_id, installation, provider, pipeline) > 0 17 | for: 5m 18 | labels: 19 | area: platform 20 | cancel_if_cluster_control_plane_unhealthy: "true" 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: atlas 24 | topic: observability 25 | # Coming from https://github.com/giantswarm/giantswarm/issues/31133 26 | # This alert ensures sloth container are not restarting too often (flappiness). 27 | - alert: SlothRestartingTooOften 28 | annotations: 29 | description: '{{`Sloth is restarting too often.`}}' 30 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/sloth-down/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 31 | expr: | 32 | increase( 33 | kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="monitoring", container="sloth"}[1h] 34 | ) > 5 35 | for: 5m 36 | labels: 37 | area: platform 38 | cancel_if_cluster_control_plane_unhealthy: "true" 39 | cancel_if_outside_working_hours: "true" 40 | severity: page 41 | team: atlas 42 | topic: observability 43 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/pods.core.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: pods.core.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: pods.core 12 | rules: 13 | - alert: ContainerIsRestartingTooFrequently 14 | annotations: 15 | description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often in cluster {{ $labels.installation }}/{{ $labels.cluster_id }}.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/container-is-restarting-too-often/ 17 | expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"cluster-autoscaler.*|etcd-kubernetes-resources-count-exporter.*"}[1h]), "service", "/", "namespace", "pod") > 10 18 | for: 10m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: "true" 22 | cancel_if_cluster_has_no_workers: "true" 23 | severity: page 24 | team: tenet 25 | topic: kubernetes 26 | - alert: PodPending 27 | annotations: 28 | description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending in cluster {{ $labels.installation }}/{{ $labels.cluster_id }}.`}}' 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/pod-stuck-in-pending/ 30 | expr: kube_pod_status_phase{namespace="kube-system",pod=~"(cluster-autoscaler.*)",phase="Pending"} == 1 31 | for: 15m 32 | labels: 33 | area: kaas 34 | cancel_if_outside_working_hours: "true" 35 | cancel_if_kube_state_metrics_down: "true" 36 | cancel_if_cluster_has_no_workers: "true" 37 | severity: page 38 | team: tenet 39 | 40 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/cabbage/recording-rules/gs-managed-app-deployment-status.rules.yml: -------------------------------------------------------------------------------- 1 | ## Cabbage is the only user of those recording rules 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: gs-managed-app-deployment-status.recording.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: gs-managed-app-deployments.recording 12 | rules: 13 | - expr: | 14 | label_replace( 15 | kube_deployment_status_replicas_available 16 | * on (cluster_id, cluster_type, pod, namespace, deployment) group_left (label_app_kubernetes_io_name) 17 | kube_deployment_labels{label_giantswarm_io_service_type="managed"}, 18 | "managed_app", 19 | "$1", 20 | "label_app_kubernetes_io_name", 21 | "(.*)" 22 | ) 23 | record: managed_app_deployment_status_replicas_available 24 | - expr: | 25 | label_replace( 26 | kube_deployment_status_replicas_unavailable 27 | * on (cluster_id, cluster_type, pod, namespace, deployment) group_left (label_app_kubernetes_io_name) 28 | kube_deployment_labels{label_giantswarm_io_service_type="managed"}, 29 | "managed_app", 30 | "$1", 31 | "label_app_kubernetes_io_name", 32 | "(.*)" 33 | ) 34 | record: managed_app_deployment_status_replicas_unavailable 35 | - expr: | 36 | label_replace( 37 | kube_deployment_spec_replicas 38 | * on (pod, namespace, deployment) group_left (label_app_kubernetes_io_name) 39 | kube_deployment_labels{label_giantswarm_io_service_type="managed"}, 40 | "managed_app", 41 | "$1", 42 | "label_app_kubernetes_io_name", 43 | "(.*)" 44 | ) 45 | record: managed_app_deployment_spec_replicas 46 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/certificate.workload-cluster.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: certificate.workload-cluster.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: certificate.workload-cluster 12 | rules: 13 | - alert: WorkloadClusterCertificateWillExpireInLessThanAMonth 14 | annotations: 15 | description: '{{`Certificate {{ $labels.path }} on {{ $labels.node }} will expire in less than a month.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/renew-certificates/ 17 | dashboardExternalUrl: https://giantswarm.grafana.net/d/a2f4976Zk/certificates 18 | expr: (cert_exporter_not_after{cluster_type="workload_cluster", path!="/etc/kubernetes/ssl/service-account-crt.pem"} - time()) < 4 * 7 * 24 * 60 * 60 19 | for: 5m 20 | labels: 21 | area: kaas 22 | cancel_if_outside_working_hours: "true" 23 | severity: notify 24 | team: teddyfriends 25 | topic: security 26 | - alert: ClusterCertificateExpirationMetricsMissing 27 | annotations: 28 | description: '{{`Certificate metrics are missing for cluster {{ $labels.cluster_id }}.`}}' 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/absent-metrics/ 30 | expr: max(up{cluster_id!="", cluster_type="workload_cluster"}) by (cluster_id, installation, pipeline, provider) unless on (cluster_id) count (cert_exporter_not_after{cluster_type="workload_cluster"}) by (cluster_id, installation, pipeline, provider) > 0 31 | for: 30m 32 | labels: 33 | area: kaas 34 | cancel_if_outside_working_hours: "true" 35 | severity: page 36 | team: {{ include "providerTeam" . 
}} 37 | topic: security 38 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/pods.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: pods.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: pods 12 | rules: 13 | # PodsUnschedulable fires when too many pods are in `unschedulable` status in the `kube-system` namespace 14 | # This is a signal something is wrong with the WC. 15 | - alert: PodsUnschedulable 16 | annotations: 17 | description: '{{`Cluster {{ $labels.cluster_id }} has unschedulable kube-system pods.`}}' 18 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/validate-cluster-health/ 19 | __dashboardUid__: unschedulable-pods 20 | dashboardQueryParams: '{{`orgId=1&var-namespace=kube-system&var-cluster={{ $labels.cluster_id }}`}}' 21 | expr: |- 22 | count( 23 | count_over_time( 24 | # Have a list of unschedulable pods 25 | count( 26 | kube_pod_status_unschedulable{namespace="kube-system"} 27 | ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region, pod) 28 | # only keep those that have been unschedulable for more than 10 minutes over the past 30 minutes 29 | [30m:]) > 10 30 | # count per cluster 31 | ) by (cluster_id, cluster_type, customer, installation, pipeline, provider, region) 32 | # At least 2 pods should be unschedulable for the alert to page. 33 | >= 2 34 | for: 15m 35 | labels: 36 | area: kaas 37 | # Let's start with business hours only, maybe 24x7 in the future 38 | cancel_if_outside_working_hours: "true" 39 | inhibit_cluster_broken: "true" 40 | severity: page 41 | team: {{ include "providerTeam" . }} 42 | topic: workloadcluster 43 | -------------------------------------------------------------------------------- /Makefile.gen.app.mk: -------------------------------------------------------------------------------- 1 | # DO NOT EDIT. Generated with: 2 | # 3 | # devctl 4 | # 5 | # https://github.com/giantswarm/devctl/blob/eea19f200d7cfd27ded22474b787563bbfdb8ec4/pkg/gen/input/makefile/internal/file/Makefile.gen.app.mk.template 6 | # 7 | 8 | ##@ App 9 | 10 | YQ=docker run --rm -u $$(id -u) -v $${PWD}:/workdir mikefarah/yq:4.29.2 11 | HELM_DOCS=docker run --rm -u $$(id -u) -v $${PWD}:/helm-docs jnorwood/helm-docs:v1.11.0 12 | 13 | ifdef APPLICATION 14 | DEPS := $(shell find $(APPLICATION)/charts -maxdepth 2 -name "Chart.yaml" -printf "%h\n") 15 | endif 16 | 17 | .PHONY: lint-chart check-env update-chart helm-docs update-deps $(DEPS) 18 | 19 | lint-chart: IMAGE := giantswarm/helm-chart-testing:v3.0.0-rc.1 20 | lint-chart: check-env ## Runs ct against the default chart. 21 | @echo "====> $@" 22 | rm -rf /tmp/$(APPLICATION)-test 23 | mkdir -p /tmp/$(APPLICATION)-test/helm 24 | cp -a ./helm/$(APPLICATION) /tmp/$(APPLICATION)-test/helm/ 25 | architect helm template --dir /tmp/$(APPLICATION)-test/helm/$(APPLICATION) 26 | docker run -it --rm -v /tmp/$(APPLICATION)-test:/wd --workdir=/wd --name ct $(IMAGE) ct lint --validate-maintainers=false --charts="helm/$(APPLICATION)" 27 | rm -rf /tmp/$(APPLICATION)-test 28 | 29 | update-chart: check-env ## Sync chart with upstream repo. 
30 | @echo "====> $@" 31 | vendir sync 32 | $(MAKE) update-deps 33 | 34 | update-deps: check-env $(DEPS) ## Update Helm dependencies. 35 | cd $(APPLICATION) && helm dependency update 36 | 37 | $(DEPS): check-env ## Update main Chart.yaml with new local dep versions. 38 | dep_name=$(shell basename $@) && \ 39 | new_version=`$(YQ) .version $(APPLICATION)/charts/$$dep_name/Chart.yaml` && \ 40 | $(YQ) -i e "with(.dependencies[]; select(.name == \"$$dep_name\") | .version = \"$$new_version\")" $(APPLICATION)/Chart.yaml 41 | 42 | helm-docs: check-env ## Update $(APPLICATION) README. 43 | $(HELM_DOCS) -c $(APPLICATION) -g $(APPLICATION) 44 | 45 | check-env: 46 | ifndef APPLICATION 47 | $(error APPLICATION is not defined) 48 | endif 49 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/chart.rules.yml: -------------------------------------------------------------------------------- 1 | # TODO(@giantswarm/team-honeybadger): This is only used by the chart-operator, let's get rid of it when the chart operator is gone. 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . | nindent 4 }} 8 | name: chart.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: chart 13 | rules: 14 | - alert: ChartOperatorDown 15 | annotations: 16 | description: '{{`ChartOperator ({{ $labels.instance }}) is down.`}}' 17 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/chart-operator-down/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 18 | expr: label_replace(up{job="chart-operator"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 19 | for: 15m 20 | labels: 21 | area: platform 22 | cancel_if_cluster_control_plane_unhealthy: "true" 23 | cancel_if_kubelet_down: "true" 24 | cancel_if_cluster_has_no_workers: "true" 25 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 26 | cancel_if_monitoring_agent_down: "true" 27 | severity: notify 28 | team: honeybadger 29 | topic: releng 30 | - alert: ChartOrphanConfigMap 31 | annotations: 32 | description: '{{`Chart configmaps have not been deleted.`}}' 33 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/chart-operator-orphan-resources/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 34 | expr: chart_operator_configmap_orphan > 0 35 | for: 10m 36 | labels: 37 | area: platform 38 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} 39 | severity: notify 40 | team: honeybadger 41 | topic: releng 42 | -------------------------------------------------------------------------------- /Makefile.custom.mk: -------------------------------------------------------------------------------- 1 | .PHONY: clean-dry-run 2 | clean-dry-run: ## dry run for `make clean` - print all untracked files 3 | @git clean -xnf 4 | 5 | .PHONY: clean 6 | clean: ## Clean the git work dir and remove all untracked files 7 | # clean stage 8 | git clean -xdf -- test/hack/bin test/hack/output test/hack/checkLabels 9 | 10 | ##@ Testing 11 | 12 | .PHONY: test 13 | test: install-tools template-chart test-rules test-inhibitions test-runbooks ## Run all tests 14 | 15 | install-tools: 16 | ./test/hack/bin/fetch-tools.sh 17 | 18 | template-chart: install-tools ## prepare the helm chart 19 | bash ./test/hack/bin/template-chart.sh 20 | 21 | test-rules: install-tools template-chart ## run unit tests for alerting rules 22 | bash test/hack/bin/verify-rules.sh "$(test_filter)" "${rules_type}" 23 | 24 | test-inhibitions: install-tools template-chart ## test whether inhibition labels are well defined 25 | bash test/hack/bin/get-inhibition.sh 26 | cd test/hack/checkLabels; go run main.go 27 | 28 | test-runbooks: install-tools template-chart ## Check if runbooks are valid 29 | bash test/hack/bin/check-runbooks.sh 30 | 31 | test-ci-runbooks: ## Check if runbooks are valid in CI 32 | test/hack/bin/check-runbooks.sh --ci 33 | 34 | pint: install-tools template-chart ## Run pint 35 | GENERATE_ONLY=true bash test/hack/bin/verify-rules.sh 36 | ./test/hack/bin/run-pint.sh test/conf/pint/pint-config.hcl ${PINT_TEAM_FILTER} 37 | 38 | pint-all: install-tools template-chart ## Run pint with extra checks 39 | GENERATE_ONLY=true bash test/hack/bin/verify-rules.sh 40 | ./test/hack/bin/run-pint.sh test/conf/pint/pint-all.hcl ${PINT_TEAM_FILTER} 41 | 42 | ##@ Mixins 43 | update-mimir-mixin: install-tools ## Update Mimir mixins 44 | ./mimir/update.sh 45 | 46 | update-loki-mixin: install-tools ## Update Loki mixins 47 | ./loki/update.sh 48 | 49 | update-tempo-mixin: install-tools ## Update Tempo mixins 50 | ./tempo/update.sh 51 | 52 | update-mixin: update-mimir-mixin update-loki-mixin update-tempo-mixin ## Update all mixins 53 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/logging-pipeline.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: logging-pipeline.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: logging-pipeline 11 | rules: 12 | # Any alloy component that uses the loki.write component can throw such errors. 
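# Worked reading of the threshold, with assumed numbers for illustration only:
# if a pod's loki.write component sent 50 requests over the evaluation window and 10 of them
# returned a non-2xx status code, the expression below yields 100 * 10/50 = 20%, which is
# above the 10% threshold, so the 15m "for" countdown starts for that pod.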
13 | - alert: LogForwardingErrors 14 | annotations: 15 | __dashboardUid__: 53c1ecddc3a1d5d4b8d6cd0c23676c31 16 | dashboardQueryParams: "orgId=2" 17 | description: '{{`{{ $value | printf "%.2f" }}% of the requests to Loki are failing for pod {{ $labels.pod }} (threshold 10%)`}}' 18 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/logging-pipeline/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 19 | expr: |- 20 | ( 21 | 100 22 | * 23 | ( 24 | ( 25 | sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, pod) ( 26 | rate ( 27 | loki_write_request_duration_seconds_count{status_code!~"2.."}[5m:] 28 | ) 29 | ) 30 | ) 31 | / 32 | ( 33 | sum by (cluster_id, installation, provider, pipeline, namespace, job, instance, pod) ( 34 | rate ( 35 | loki_write_request_duration_seconds_count[5m:] 36 | ) 37 | ) 38 | ) 39 | ) 40 | ) 41 | > 10 42 | for: 15m 43 | labels: 44 | area: platform 45 | severity: page 46 | team: atlas 47 | topic: observability 48 | cancel_if_outside_working_hours: "true" 49 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws-load-balancer-controller.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: aws-load-balancer-controller.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: aws-load-balancer-controller 12 | rules: 13 | - alert: AWSLoadBalancerControllerAWSAPIErrors 14 | annotations: 15 | description: '{{`AWS load balancer controller pod {{ $labels.namespace}}/{{ $labels.pod }} on {{ $labels.cluster_id}} is throwing {{ $labels.error_code }} errors when contacting AWS API.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/alb-errors/ 17 | expr: sum(increase(aws_api_calls_total{cluster_type="workload_cluster", error_code != "", provider=~"capa|eks"}[20m])) by (cluster_id, error_code, installation, namespace, pipeline, provider, pod) > 0 18 | for: 40m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: page 23 | team: phoenix 24 | topic: alb 25 | - alert: AWSLoadBalancerControllerReconcileErrors 26 | annotations: 27 | description: '{{`AWS load balancer controller pod {{ $labels.namespace }}/{{ $labels.pod }} on {{ $labels.cluster_id }} is throwing errors while reconciling the {{ $labels.controller }} controller.`}}' 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/alb-errors/ 29 | expr: sum(increase(controller_runtime_reconcile_total{cluster_type="workload_cluster", provider=~"capa|eks", result = "error", service="aws-load-balancer-controller"}[20m])) by (cluster_id, controller, installation, namespace, pipeline, provider, pod) > 0 30 | for: 40m 31 | labels: 32 | area: kaas 33 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} 34 | severity: page 35 | team: phoenix 36 | topic: alb 37 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/dns-operator-azure.rules.yml: -------------------------------------------------------------------------------- 1 | {{- if eq .Values.managementCluster.provider.kind "capz" }} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | labels: {{- include "labels.common" . | nindent 4}} 6 | name: dns-operator-azure.rules 7 | namespace: {{.Values.namespace}} 8 | spec: 9 | groups: 10 | - name: dns-operator-azure 11 | rules: 12 | - alert: ClusterDNSZoneMissing 13 | annotations: 14 | description: |- 15 | {{`No DNS-zone for cluster {{ $labels.exported_namespace}}/{{ $labels.name }} got created yet. Check dns-operator-azure logs in installation/{{ $labels.installation}}.`}} 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dns-operator-azure/ 17 | expr: |- 18 | capi_cluster_status_phase{phase="Provisioned"} 19 | unless on (cluster_id, name) 20 | label_replace(dns_operator_azure_zone_info{type="public"}, "name", "$1", "resource_group", "(.+)") 21 | for: 30m 22 | labels: 23 | area: kaas 24 | cancel_if_outside_working_hours: {{include "workingHoursOnly" .}} 25 | severity: notify 26 | team: phoenix 27 | topic: managementcluster 28 | - alert: AzureDNSOperatorAPIErrorRate 29 | annotations: 30 | description: |- 31 | {{`Error rate for {{ $labels.method }} is high. Check dns-operator-azure logs in installation/{{ $labels.installation }}.`}} 32 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dns-operator-azure/ 33 | expr: |- 34 | sum by (cluster_id, installation, method, pipeline, provider) (rate(dns_operator_azure_api_request_errors_total[5m])) > 0 35 | for: 15m 36 | labels: 37 | area: kaas 38 | cancel_if_outside_working_hours: {{include "workingHoursOnly" .}} 39 | severity: notify 40 | team: phoenix 41 | topic: managementcluster 42 | {{- end }} 43 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/konfigure-operator.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: konfigure-operator.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: konfigure-operator 12 | rules: 13 | - alert: KonfigureOperatorDeploymentNotSatisfied 14 | annotations: 15 | description: '{{`Konfigure Operator deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=deployment&NAME={{ $labels.deployment }}`}}' 17 | expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster", namespace="giantswarm", deployment="konfigure-operator"} > 0 18 | for: 30m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: honeybadger 24 | topic: managementcluster 25 | - alert: KonfigurationReconciliationFailed 26 | annotations: 27 | description: |- 28 | {{`{{ $labels.resource_kind }} {{ $labels.resource_name }} in ns {{ $labels.resource_namespace }} on {{ $labels.installation }} is stuck in Failed state.`}} 29 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/konfigure-operator/ 30 | expr: konfigure_operator_reconcile_condition{condition_type="Ready", condition_status="False"} > 0 31 | for: 10m 32 | labels: 33 | area: platform 34 | cancel_if_outside_working_hours: "true" 35 | severity: page 36 | team: honeybadger 37 | topic: releng 38 | namespace: |- 39 | {{`{{ $labels.exported_namespace }}`}} 40 | -------------------------------------------------------------------------------- /scripts/sync-kube-mixin.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -o errexit 4 | set -o nounset 5 | set -o pipefail 6 | 7 | TMPDIR="$(mktemp -d -t 'tmp.XXXXXXXXXX')" 8 | RULESFILE="helm/prometheus-rules/templates/kaas/tenet/recording-rules/kubernetes-mixins.rules.yml" 9 | 10 | trap 'cleanup' EXIT 11 | 12 | function cleanup { 13 | rm -rf "$TMPDIR" 14 | } 15 | 16 | function tune_rules { 17 | # Extra tuning 18 | 19 | # Latest mixins use SLO instead of classic metrics in several places 20 | # but we dropped these SLO metrics 21 | sed -i 's/apiserver_request_slo_duration_seconds/apiserver_request_duration_seconds/g' "$RULESFILE" 22 | sed -i 's/cluster_id/cluster_id, installation, pipeline, provider/g' "$RULESFILE" 23 | } 24 | 25 | function main { 26 | local MIXIN_VER 27 | # make a temporary dir to work in 28 | local MIXIN_REPO="git@github.com:giantswarm/giantswarm-kubernetes-mixin.git" 29 | # clone a branch or tag if provided 30 | local BRANCH="${1:-}" 31 | 32 | if [[ -z "$BRANCH" ]]; then 33 | # clone the mixins repo 34 | echo -e "\nCloning master branch:\n" 35 | git clone --single-branch "$MIXIN_REPO" "$TMPDIR"/mixins 36 | else 37 | # clone the mixins repo branch or tag 38 | echo -e "\nCloning branch or tag '$BRANCH':\n" 39 | git clone --branch "$BRANCH" --single-branch "$MIXIN_REPO" "$TMPDIR"/mixins 40 | fi 41 | 42 | # get the current commit of the mixin repo 43 | cd "$TMPDIR"/mixins 44 | MIXIN_VER="$(git rev-parse HEAD)" 45 | cd - > /dev/null 46 | 47 | 48 | local PRECONTENT='apiVersion: monitoring.coreos.com/v1 49 | kind: PrometheusRule 50 | metadata: 51 | labels: 52 | {{- include "labels.common" . 
| nindent 4 }} 53 | name: kube-mixins.recording.rules 54 | namespace: {{ .Values.namespace }} 55 | spec: 56 | ' 57 | 58 | # copy generated rules file 59 | cp "$TMPDIR"/mixins/files/prometheus-rules/rules.yml "$RULESFILE" 60 | 61 | # prepend K8s objectmeta to the rules file 62 | printf '%s %s' "$PRECONTENT" "$(cat "$RULESFILE")" > "$RULESFILE" 63 | 64 | tune_rules 65 | 66 | echo -e "\nSynced mixin repo at commit: $MIXIN_VER\n" 67 | 68 | # tidy up 69 | cleanup 70 | } 71 | 72 | main "$@" 73 | -------------------------------------------------------------------------------- /test/conf/pint/pint-all.hcl: -------------------------------------------------------------------------------- 1 | rule { 2 | # Disallow spaces in label/annotation keys, they're only allowed in values. 3 | reject ".* +.*" { 4 | label_keys = true 5 | annotation_keys = true 6 | } 7 | 8 | # Disallow URLs in labels, they should go to annotations. 9 | reject "https?://.+" { 10 | label_keys = true 11 | label_values = true 12 | } 13 | 14 | # Ensure that all aggregations are preserving mandatory labels. 15 | aggregate ".+" { 16 | severity = "bug" 17 | keep = ["cluster_id", "installation", "pipeline", "provider"] 18 | } 19 | } 20 | 21 | rule { 22 | # This block will apply to all alerting rules. 23 | match { 24 | kind = "alerting" 25 | } 26 | 27 | # Each alert must have a 'description' annotation. 28 | annotation "description" { 29 | severity = "bug" 30 | required = true 31 | } 32 | 33 | # Each alert must have an `area' label that's either 'kaas' or 'platform'. 34 | label "area" { 35 | severity = "bug" 36 | value = "(kaas|platform)" 37 | required = true 38 | } 39 | 40 | # Each alert must have a 'runbook_url' annotation. 41 | annotation "runbook_url" { 42 | severity = "bug" 43 | required = true 44 | } 45 | 46 | # Each alert should have a 'dashboardUid' annotation. 47 | annotation "__dashboardUid__" { 48 | severity = "warning" 49 | required = true 50 | } 51 | 52 | # Check how many times each alert would fire in the last 1d. 53 | alerts { 54 | range = "1d" 55 | step = "1m" 56 | resolve = "5m" 57 | } 58 | } 59 | 60 | # Rule for regular alerts 61 | rule { 62 | match { 63 | kind = "alerting" 64 | name = "!~Inhibition.*|.*Heartbeat.*" 65 | } 66 | 67 | # Each alert must have a 'severity' label that's either 'page', 'notify' or 'ticket'. 68 | label "severity" { 69 | severity = "bug" 70 | value = "(page|notify|ticket)" 71 | required = true 72 | } 73 | } 74 | 75 | # Rule for inhibition and heartbeat alerts 76 | rule { 77 | match { 78 | kind = "alerting" 79 | name = "~Inhibition.*|.*Heartbeat.*" 80 | } 81 | 82 | label "severity" { 83 | severity = "bug" 84 | value = "none" 85 | required = true 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/aws.workload-cluster.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: aws.workload-cluster.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: aws.workload-cluster 12 | rules: 13 | - alert: WorkloadClusterContainerIsRestartingTooFrequentlyAWS 14 | annotations: 15 | description: '{{`Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} is restarting too often.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/container-is-restarting-too-often/ 17 | ## TODO(@giantswarm/team-phoenix): Review this list once all vintage installations are gone 18 | expr: label_join(increase(kube_pod_container_status_restarts_total{container=~"aws-node.*|kiam-agent.*|kiam-server.*|ebs-(plugin|csi).*|aws-pod-identity-webhook.*|efs-csi-(node|controller).*"}[1h]), "service", "/", "namespace", "pod") > 10 19 | for: 10m 20 | labels: 21 | area: kaas 22 | cancel_if_outside_working_hours: "true" 23 | cancel_if_cluster_has_no_workers: "true" 24 | severity: page 25 | team: phoenix 26 | topic: kubernetes 27 | - alert: WorkloadClusterPodPendingAWS 28 | annotations: 29 | description: '{{`Pod {{ $labels.namespace }}/{{ $labels.pod }} is stuck in Pending.`}}' 30 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/pod-stuck-in-pending/ 31 | ## TODO(@giantswarm/team-phoenix): Review this list once all vintage installations are gone 32 | expr: kube_pod_status_phase{namespace="kube-system",pod=~"(aws-node.*|kiam-agent.*|kiam-server.*|ebs-(plugin|csi).*|efs-csi-(node|controller).*)", phase="Pending"} == 1 33 | for: 15m 34 | labels: 35 | area: kaas 36 | cancel_if_outside_working_hours: "true" 37 | cancel_if_kube_state_metrics_down: "true" 38 | cancel_if_cluster_has_no_workers: "true" 39 | severity: page 40 | team: phoenix 41 | -------------------------------------------------------------------------------- /test/tests/providers/capa/kaas/phoenix/alerting-rules/capa.inhibition.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - capa.inhibition.rules.yml 4 | 5 | tests: 6 | # Tests for `InhibitionClusterWithoutWorkerNodes` inhibition alert 7 | - interval: 1m 8 | input_series: 9 | - series: 'capi_cluster_status_condition{cluster_id="golem", cluster_type="management_cluster", name="golem", pipeline="testing", status="True", type="ControlPlaneReady"}' 10 | values: "1+0x300" 11 | - series: 'capi_machinepool_spec_replicas{cluster_id="golem", cluster_name="golem", cluster_type="management_cluster", customer="giantswarm", installation="golem", organization="giantswarm", pipeline="testing", provider="capa"}' 12 | values: "_x60 0x60 3x60" 13 | - series: 'capi_cluster_info{infrastructure_reference_kind="AWSCluster", cluster_id="golem"}' 14 | values: "1+0x300" 15 | alert_rule_test: 16 | - alertname: InhibitionClusterWithoutWorkerNodes 17 | eval_time: 30m 18 | exp_alerts: 19 | - exp_labels: 20 | area: kaas 21 | cluster_id: "golem" 22 | cluster_type: "management_cluster" 23 | has_worker_nodes: "false" 24 | name: "golem" 25 | pipeline: "testing" 26 | status: "True" 27 | team: "phoenix" 28 | topic: "status" 29 | type: "ControlPlaneReady" 30 | exp_annotations: 31 | description: "Cluster (golem) has no worker nodes." 
32 | - alertname: InhibitionClusterWithoutWorkerNodes 33 | eval_time: 90m 34 | exp_alerts: 35 | - exp_labels: 36 | area: kaas 37 | cluster_id: "golem" 38 | cluster_type: "management_cluster" 39 | has_worker_nodes: "false" 40 | name: "golem" 41 | pipeline: "testing" 42 | status: "True" 43 | team: "phoenix" 44 | topic: "status" 45 | type: "ControlPlaneReady" 46 | exp_annotations: 47 | description: "Cluster (golem) has no worker nodes." 48 | - alertname: InhibitionClusterWithoutWorkerNodes 49 | eval_time: 150m 50 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cloud-provider-controller.rules.yml: -------------------------------------------------------------------------------- 1 | # This rule applies to CAPI management clusters only 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . | nindent 4 }} 8 | name: cloud-provider-controller.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: cloud-provider-controller 13 | rules: 14 | - alert: FluxHelmReleaseFailed 15 | annotations: 16 | description: |- 17 | {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}} 18 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/flux-helmrelease-failed/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.exported_namespace }}&HELMRELEASE_NAME={{ $labels.name }}`}}' 19 | {{- $components := "(aws-ebs-csi-driver|cloud-provider-aws|azure-cloud-controller-manager|azure-cloud-node-manager|azuredisk-csi-driver|azurefile-csi-driver|cloud-provider-vsphere|cloud-provider-cloud-director)" }} 20 | expr: | 21 | ( 22 | label_replace(gotk_resource_info{ready="False", customresource_kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", exported_namespace!~"org-t-.*", name=~"(.+)-{{ $components }}"}, "cluster_id", "$1", "name", "(.+)-{{ $components }}") 23 | * on(cluster_id) group_left(provider) 24 | sum( 25 | label_replace( 26 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 27 | ) 28 | ) by (cluster_id, provider) 29 | ) > 0 30 | for: 20m 31 | labels: 32 | area: kaas 33 | cancel_if_outside_working_hours: "true" 34 | cancel_if_kube_state_metrics_down: "true" 35 | cancel_if_monitoring_agent_down: "true" 36 | severity: page 37 | team: {{ include "providerTeam" . 
}} 38 | topic: managementcluster 39 | namespace: |- 40 | {{`{{ $labels.exported_namespace }}`}} 41 | -------------------------------------------------------------------------------- /.github/workflows/zz_generated.add-team-labels.yaml: -------------------------------------------------------------------------------- 1 | name: Add appropriate labels to issue 2 | 3 | on: 4 | issues: 5 | types: [assigned] 6 | 7 | jobs: 8 | build_user_list: 9 | name: Get yaml config of GS users 10 | runs-on: ubuntu-latest 11 | permissions: 12 | contents: read 13 | steps: 14 | - name: Get user-mapping 15 | env: 16 | GH_TOKEN: ${{ secrets.ISSUE_AUTOMATION }} 17 | run: | 18 | mkdir -p artifacts 19 | gh api -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" \ 20 | /repos/giantswarm/github/contents/tools/issue-automation/user-mapping.yaml \ 21 | | jq -r '.content' \ 22 | | base64 -d > artifacts/users.yaml 23 | - name: Upload Artifact 24 | uses: actions/upload-artifact@65462800fd760344b1a7b4382951275a0abb4808 # v4.3.3 25 | with: 26 | name: users 27 | path: artifacts/users.yaml 28 | retention-days: 1 29 | 30 | add_label: 31 | name: Add team label when assigned 32 | runs-on: ubuntu-latest 33 | needs: build_user_list 34 | steps: 35 | - uses: actions/download-artifact@65a9edc5881444af0b9093a5e628f2fe47ea3b2e # v4.1.7 36 | id: download-users 37 | with: 38 | name: users 39 | - name: Find team label based on user names 40 | run: | 41 | event_assignee=$(cat $GITHUB_EVENT_PATH | jq -r .assignee.login | tr '[:upper:]' '[:lower:]') 42 | echo "Issue assigned to: ${event_assignee}" 43 | 44 | TEAMS=$(cat ${{steps.download-users.outputs.download-path}}/users.yaml | tr '[:upper:]' '[:lower:]' | yq ".${event_assignee}.teams" -o csv | tr ',' ' ') 45 | 46 | echo "LABEL<<EOF" >> $GITHUB_ENV 47 | for team in ${TEAMS}; do 48 | echo "Team: ${team} | Label: team/${team}" 49 | echo "team/${team}" >> $GITHUB_ENV 50 | done 51 | echo "EOF" >> $GITHUB_ENV 52 | - name: Apply label to issue 53 | if: ${{ env.LABEL != '' && env.LABEL != 'null' && env.LABEL != null }} 54 | uses: actions-ecosystem/action-add-labels@bd52874380e3909a1ac983768df6976535ece7f8 # v1.1.3 55 | with: 56 | github_token: ${{ secrets.ISSUE_AUTOMATION }} 57 | labels: | 58 | ${{ env.LABEL }} 59 | -------------------------------------------------------------------------------- /mimir/mixin.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'mimir-mixin/mixin.libsonnet') + { 2 | _config+:: { 3 | tags: [ 4 | 'owner:team-atlas', 5 | 'topic:observability', 6 | 'component:mimir', 7 | ], 8 | 9 | per_cluster_label: 'cluster_id', 10 | // Not sure why the default is set to instance, but we want to set it to node 11 | per_node_label: 'node', 12 | per_component_loki_label: 'component', 13 | // We marked it as disabled as this should be enabled only if the enterprise gateway is enabled 14 | gateway_enabled: false, 15 | // Whether alerts for experimental ingest storage are enabled. 16 | ingest_storage_enabled: false, 17 | // Disable autoscaling components we do not use 18 | autoscaling_hpa_prefix: 'mimir-', 19 | // Whether autoscaling panels and alerts should be enabled for specific Mimir services. 
20 | autoscaling: { 21 | query_frontend: { 22 | enabled: false, 23 | hpa_name: $._config.autoscaling_hpa_prefix + 'query-frontend', 24 | }, 25 | ruler_query_frontend: { 26 | enabled: false, 27 | hpa_name: $._config.autoscaling_hpa_prefix + 'ruler-query-frontend', 28 | }, 29 | querier: { 30 | enabled: true, 31 | hpa_name: $._config.autoscaling_hpa_prefix + 'querier', 32 | }, 33 | ruler_querier: { 34 | enabled: false, 35 | hpa_name: $._config.autoscaling_hpa_prefix + 'ruler-querier', 36 | }, 37 | store_gateway: { 38 | enabled: false, 39 | hpa_name: $._config.autoscaling_hpa_prefix + 'store-gateway', 40 | }, 41 | distributor: { 42 | enabled: true, 43 | hpa_name: $._config.autoscaling_hpa_prefix + 'distributor', 44 | }, 45 | ruler: { 46 | enabled: false, 47 | hpa_name: $._config.autoscaling_hpa_prefix + 'ruler', 48 | }, 49 | gateway: { 50 | enabled: true, 51 | hpa_name: $._config.autoscaling_hpa_prefix + 'gateway', 52 | }, 53 | ingester: { 54 | enabled: false, 55 | hpa_name: $._config.autoscaling_hpa_prefix + 'ingester', 56 | }, 57 | compactor: { 58 | enabled: false, 59 | hpa_name: $._config.autoscaling_hpa_prefix + 'compactor', 60 | }, 61 | }, 62 | }, 63 | } 64 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/honeybadger/alerting-rules/zot.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: zot.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: zot 11 | rules: 12 | - alert: ZotDeploymentNotSatisfied 13 | annotations: 14 | description: '{{`Zot deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 15 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/zot/?CUSTOMER={{ $labels.customer }}&INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 16 | expr: kube_deployment_status_replicas_unavailable{cluster_type="management_cluster",namespace="zot",deployment="zot-zot"} > 0 17 | for: 30m 18 | labels: 19 | area: platform 20 | cancel_if_outside_working_hours: "true" 21 | severity: page 22 | team: honeybadger 23 | topic: managementcluster 24 | - alert: ZotPersistentVolumeFillingUp 25 | annotations: 26 | description: '{{`The Zot PersistentVolume claimed by {{ $labels.persistentvolumeclaim}} in namespace {{ $labels.namespace }} is at least 80% full and projected to fill up soon.`}}' 27 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/zot/?CUSTOMER={{ $labels.customer }}&INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 28 | expr: |- 29 | ( 30 | kubelet_volume_stats_available_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"} 31 | / 32 | kubelet_volume_stats_capacity_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"} 33 | ) < 0.1 34 | or 35 | predict_linear(kubelet_volume_stats_available_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}[1h], 4 * 3600) < 0.05 36 | for: 1h 37 | labels: 38 | area: platform 39 | cancel_if_outside_working_hours: "true" 40 | severity: page 41 | team: honeybadger 42 | topic: managementcluster 43 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/cabbage/alerting-rules/kong.rules.yml: 
-------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: kong.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: kong 12 | rules: 13 | - alert: KongNonProdDeploymentNotSatisfied 14 | annotations: 15 | description: '{{`Kong Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 16 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=deployment&NAME={{ $labels.deployment }}`}}' 17 | expr: managed_app_deployment_status_replicas_available{managed_app=~"kong.*", cluster_id!~"p.*"} / (managed_app_deployment_status_replicas_available{managed_app=~"kong.*", cluster_id!~"p.*"} + managed_app_deployment_status_replicas_unavailable{managed_app=~"kong.*", cluster_id!~"p.*"}) < 0.6 18 | for: 30m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: cabbage 24 | topic: kong 25 | - alert: KongProductionDeploymentNotSatisfied 26 | annotations: 27 | description: '{{`Kong Deployment {{ $labels.namespace}}/{{ $labels.deployment }} is not satisfied.`}}' 28 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.namespace }}&KIND=deployment&NAME={{ $labels.deployment }}`}}' 29 | expr: managed_app_deployment_status_replicas_available{managed_app=~"kong.*", cluster_id=~"p.*"} / (managed_app_deployment_status_replicas_available{managed_app=~"kong.*", cluster_id=~"p.*"} + managed_app_deployment_status_replicas_unavailable{managed_app=~"kong.*", cluster_id=~"p.*"}) < 0.6 30 | for: 30m 31 | labels: 32 | area: platform 33 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 34 | severity: page 35 | team: cabbage 36 | topic: kong 37 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/honeybadger/alerting-rules/zot.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - zot.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'kube_deployment_status_replicas_unavailable{cluster_type="management_cluster",namespace="zot",deployment="zot-zot"}' 9 | values: '_x5 0x10 1x45' 10 | alert_rule_test: 11 | - alertname: ZotDeploymentNotSatisfied 12 | eval_time: 46m 13 | exp_alerts: 14 | - exp_labels: 15 | alertname: "ZotDeploymentNotSatisfied" 16 | area: "platform" 17 | cancel_if_outside_working_hours: "true" 18 | cluster_type: "management_cluster" 19 | deployment: "zot-zot" 20 | namespace: "zot" 21 | severity: "page" 22 | team: "honeybadger" 23 | topic: "managementcluster" 24 | exp_annotations: 25 | description: "Zot deployment zot/zot-zot is not satisfied." 
26 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/zot/?CUSTOMER=&INSTALLATION=&CLUSTER= 27 | - interval: 1m 28 | input_series: 29 | - series: 'kubelet_volume_stats_available_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}' 30 | values: '50x30 20x30 15x30 5x60' 31 | - series: 'kubelet_volume_stats_capacity_bytes{namespace="zot", persistentvolumeclaim="zot-zot-pvc"}' 32 | values: '100x150' 33 | alert_rule_test: 34 | - alertname: ZotPersistentVolumeFillingUp 35 | eval_time: 150m 36 | exp_alerts: 37 | - exp_labels: 38 | alertname: "ZotPersistentVolumeFillingUp" 39 | area: "platform" 40 | cancel_if_outside_working_hours: "true" 41 | namespace: "zot" 42 | persistentvolumeclaim: "zot-zot-pvc" 43 | severity: "page" 44 | team: "honeybadger" 45 | topic: "managementcluster" 46 | exp_annotations: 47 | description: "The Zot PersistentVolume claimed by zot-zot-pvc in namespace zot is at least 80% full and projected to fill up soon." 48 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/zot/?CUSTOMER=&INSTALLATION=&CLUSTER= 49 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/shield/alerting-rules/cert-manager.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - cert-manager.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}' 9 | values: "0+0x60" 10 | alert_rule_test: 11 | - alertname: CertManagerDown 12 | eval_time: 15m 13 | exp_alerts: 14 | - exp_labels: 15 | alertname: CertManagerDown 16 | area: platform 17 | cancel_if_kubelet_down: "true" 18 | cancel_if_outside_working_hours: "true" 19 | cluster_id: 12345 20 | cluster_type: workload_cluster 21 | container: cert-manager 22 | customer: giantswarm 23 | instance: 10.0.0.0:1234 24 | ip: 10.0.0.0 25 | job: 12345-prometheus/workload-12345/0 26 | namespace: kube-system 27 | organization: giantswarm 28 | pod: cert-manager-controller-7fcc585578-gnprd 29 | provider: capa 30 | installation: golem 31 | service_priority: highest 32 | severity: page 33 | team: shield 34 | topic: cert-manager 35 | exp_annotations: 36 | description: "cert-manager in namespace kube-system is down." 
37 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/cert-manager-down/ 38 | - interval: 1m 39 | input_series: 40 | - series: 'up{cluster_id="12345", cluster_type="workload_cluster", container="cert-manager", customer="giantswarm", installation="golem", instance="10.0.0.0:1234", job="12345-prometheus/workload-12345/0", namespace="kube-system", organization="giantswarm", pod="cert-manager-controller-7fcc585578-gnprd", provider="capa", service_priority="highest"}' 41 | values: "1+0x60" 42 | alert_rule_test: 43 | - alertname: CertManagerDown 44 | eval_time: 15m 45 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/cabbage/alerting-rules/external-dns.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: external-dns.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: external-dns 12 | rules: 13 | - alert: ExternalDNSCantAccessRegistry 14 | annotations: 15 | description: '{{`external-dns in namespace {{ $labels.namespace }} can''t access registry (cloud service provider DNS service).`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/external-dns-cant-access-registry/ 17 | expr: rate(external_dns_registry_errors_total{provider=~"capa|capz|eks"}[2m]) > 0 18 | for: 15m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: page 23 | team: cabbage 24 | topic: external-dns 25 | - alert: ExternalDNSCantAccessSource 26 | annotations: 27 | description: '{{`external-dns in namespace {{ $labels.namespace }} can''t access source (Service or Ingress resource).`}}' 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/external-dns-cant-access-source/ 29 | expr: rate(external_dns_source_errors_total{provider=~"capa|capz|eks"}[2m]) > 0 30 | for: 15m 31 | labels: 32 | area: platform 33 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 34 | severity: page 35 | team: cabbage 36 | topic: external-dns 37 | - alert: ExternalDNSDown 38 | annotations: 39 | description: '{{`external-dns in namespace {{ $labels.namespace }} is down.`}}' 40 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/external-dns-down/ 41 | expr: label_replace(up{container="external-dns", provider=~"capa|capz|eks"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 42 | for: 15m 43 | labels: 44 | area: platform 45 | cancel_if_outside_working_hours: "true" 46 | cancel_if_kubelet_down: "true" 47 | severity: page 48 | team: cabbage 49 | topic: external-dns 50 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/silence-operator.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . 
| nindent 4 }} 6 | name: silence-operator 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: silence-operator 11 | rules: 12 | - alert: "SilenceOperatorReconcileErrors" 13 | annotations: 14 | description: '{{`silence-operator controller {{ $labels.controller }} too many reconcile errors.`}}' 15 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/operator-not-reconciling/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 16 | expr: | 17 | avg_over_time(operatorkit_controller_errors_total{job="monitoring/silence-operator", cluster_type="management_cluster"}[20m]) > 0 18 | for: 1h 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | installation: {{ .Values.managementCluster.name }} 23 | severity: page 24 | team: atlas 25 | topic: observability 26 | - alert: SilenceOperatorSyncJobHasNotBeenScheduledForTooLong 27 | annotations: 28 | description: '{{`CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} has not been scheduled for more than 1 day.`}}' 29 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/job-has-not-been-scheduled-for-too-long/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 30 | # This alert triggers when the silence operator sync job did not schedule for more than 1 day 31 | # or if the job did not run successfully at least once in the last day 32 | expr: (time() - kube_cronjob_status_last_schedule_time{cronjob="silence-operator-sync", cluster_type="management_cluster"}) > 86400 33 | or count by (cronjob, cluster_id, installation, namespace, provider, pipeline) (label_replace(max_over_time(kube_job_status_succeeded{job_name=~"silence-operator-sync-.+", cluster_type="management_cluster"}[1d]), "cronjob", "silence-operator-sync", "job_name", "silence-operator-sync-.+") == 1) == 0 34 | labels: 35 | area: platform 36 | severity: page 37 | team: atlas 38 | topic: managementcluster 39 | cancel_if_outside_working_hours: "true" 40 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/keda.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: keda.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: Keda 12 | rules: 13 | - alert: KedaDown 14 | annotations: 15 | description: 'Keda is down.' 
16 | expr: count by (cluster_id, installation, provider, pipeline) (up{container=~"keda-.*"} == 0) > 0 17 | for: 10m 18 | labels: 19 | area: platform 20 | cancel_if_cluster_control_plane_unhealthy: "true" 21 | cancel_if_outside_working_hours: "true" 22 | severity: notify 23 | team: atlas 24 | topic: autoscaling 25 | - alert: KedaScaledObjectErrors 26 | annotations: 27 | description: '{{`Errors detected in scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace}}.`}}' 28 | expr: increase(keda_scaled_object_errors[10m])> 0 29 | for: 15m 30 | labels: 31 | area: platform 32 | cancel_if_cluster_control_plane_unhealthy: "true" 33 | cancel_if_outside_working_hours: "true" 34 | severity: notify 35 | team: atlas 36 | topic: autoscaling 37 | - alert: KedaWebhookScaledObjectValidationErrors 38 | annotations: 39 | description: '{{`Validation errors detected in webhook for scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace}}.`}}' 40 | expr: increase(keda_webhook_scaled_object_validation_errors[10m]) > 0 41 | for: 15m 42 | labels: 43 | area: platform 44 | cancel_if_cluster_control_plane_unhealthy: "true" 45 | cancel_if_outside_working_hours: "true" 46 | severity: notify 47 | team: atlas 48 | topic: autoscaling 49 | - alert: KedaScalerErrors 50 | annotations: 51 | description: '{{`Errors detected in scaler {{ $labels.scaler }} for scaled object {{ $labels.scaledObject }} in namespace {{ $labels.namespace}}.`}}' 52 | expr: increase(keda_scaler_errors[10m]) > 0 53 | for: 15m 54 | labels: 55 | area: platform 56 | cancel_if_cluster_control_plane_unhealthy: "true" 57 | cancel_if_outside_working_hours: "true" 58 | severity: notify 59 | team: atlas 60 | topic: autoscaling 61 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/pods.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - pods.rules.yml 4 | 5 | tests: 6 | # PodsUnschedulable 7 | - interval: 1m 8 | input_series: 9 | # All is good for 1h, 10 | # Then every hour we have a new pod unschedulable 11 | - series: 'kube_pod_status_unschedulable{app="kube-state-metrics", cluster_id="wc01", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", namespace="kube-system", pipeline="stable", pod="alloy-logs-1", provider="capa", region="us-east-1"}' 12 | values: "_x60 1x1000" 13 | - series: 'kube_pod_status_unschedulable{app="kube-state-metrics", cluster_id="wc01", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", namespace="kube-system", pipeline="stable", pod="alloy-metrics-1", provider="capa", region="us-east-1"}' 14 | values: "_x120 1x1000" 15 | - series: 'kube_pod_status_unschedulable{app="kube-state-metrics", cluster_id="wc01", cluster_type="workload_cluster", customer="giantswarm", installation="testinstall", namespace="kube-system", pipeline="stable", pod="alloy-metrics-2", provider="capa", region="us-east-1"}' 16 | values: "_x180 1x1000" 17 | alert_rule_test: 18 | - alertname: PodsUnschedulable 19 | eval_time: 10m 20 | - alertname: PodsUnschedulable 21 | eval_time: 50m 22 | - alertname: PodsUnschedulable 23 | eval_time: 90m 24 | - alertname: PodsUnschedulable 25 | eval_time: 150m 26 | exp_alerts: 27 | - exp_labels: 28 | area: "kaas" 29 | cancel_if_outside_working_hours: "true" 30 | cluster_id: "wc01" 31 | cluster_type: "workload_cluster" 32 | customer: "giantswarm" 33 | installation: 
"testinstall" 34 | inhibit_cluster_broken: "true" 35 | pipeline: "stable" 36 | provider: "capa" 37 | region: "us-east-1" 38 | severity: "page" 39 | team: "phoenix" 40 | topic: "workloadcluster" 41 | exp_annotations: 42 | __dashboardUid__: "unschedulable-pods" 43 | dashboardQueryParams: "orgId=1&var-namespace=kube-system&var-cluster=wc01" 44 | description: 'Cluster wc01 has unschedulable kube-system pods.' 45 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/validate-cluster-health/ 46 | -------------------------------------------------------------------------------- /helm/prometheus-rules/values.schema.json: -------------------------------------------------------------------------------- 1 | { 2 | "$schema": "http://json-schema.org/schema#", 3 | "type": "object", 4 | "properties": { 5 | "Installation": { 6 | "type": "object", 7 | "properties": { 8 | "V1": { 9 | "type": "object", 10 | "properties": { 11 | "Guest": { 12 | "type": "object", 13 | "properties": { 14 | "Kubernetes": { 15 | "type": "object", 16 | "properties": { 17 | "IngressController": { 18 | "type": "object", 19 | "properties": { 20 | "BaseDomain": { 21 | "type": "string" 22 | } 23 | } 24 | } 25 | } 26 | } 27 | } 28 | } 29 | } 30 | } 31 | } 32 | }, 33 | "managementCluster": { 34 | "type": "object", 35 | "properties": { 36 | "customer": { 37 | "type": "string" 38 | }, 39 | "name": { 40 | "type": "string" 41 | }, 42 | "pipeline": { 43 | "type": "string" 44 | }, 45 | "provider": { 46 | "type": "object", 47 | "properties": { 48 | "flavor": { 49 | "type": "string" 50 | }, 51 | "kind": { 52 | "type": "string" 53 | }, 54 | "region": { 55 | "type": "string" 56 | } 57 | } 58 | } 59 | } 60 | }, 61 | "name": { 62 | "type": "string" 63 | }, 64 | "namespace": { 65 | "type": "string" 66 | }, 67 | "serviceType": { 68 | "type": "string" 69 | } 70 | } 71 | } 72 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/shield/alerting-rules/cert-manager.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: cert-manager.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: cert-manager 12 | rules: 13 | - alert: CertManagerPodHighMemoryUsage 14 | annotations: 15 | description: |- 16 | {{`High memory usage ({{ $value }}) for container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }}. 17 | If memory usage value is equal to memory limit value then it is likely the pod will be evicted. 18 | If no limits are set then the pod will burst. 19 | `}} 20 | expr: (sum by (cluster_id, installation, pipeline, provider, pod, namespace, container) (container_memory_working_set_bytes{container=~"(cert-manager|cert-manager-app-controller)"}) / 1024 / 1024 / 1024) >= 0.85 21 | for: 10m 22 | labels: 23 | area: platform 24 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} 25 | severity: notify 26 | team: shield 27 | topic: observability 28 | - alert: CertManagerDown 29 | annotations: 30 | description: '{{`cert-manager in namespace {{ $labels.namespace }} is down.`}}' 31 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/cert-manager-down/ 32 | expr: label_replace(up{container=~"cert-manager(-app-controller)?"}, "ip", "$1.$2.$3.$4", "node", "ip-(\\d+)-(\\d+)-(\\d+)-(\\d+).*") == 0 33 | for: 15m 34 | labels: 35 | area: platform 36 | cancel_if_outside_working_hours: "true" 37 | cancel_if_kubelet_down: "true" 38 | severity: page 39 | team: shield 40 | topic: cert-manager 41 | - alert: CertManagerTooManyCertificateRequests 42 | annotations: 43 | description: '{{`There are too many CertificateRequests in cluster {{ $labels.cluster_id }}.`}}' 44 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/cert-requests-too-many/ 45 | expr: sum by (cluster_id, installation, pipeline, provider) (etcd_kubernetes_resources_count{kind="certificaterequests.cert-manager.io"}) > 10000 46 | for: 15m 47 | labels: 48 | area: platform 49 | cancel_if_outside_working_hours: "true" 50 | severity: notify 51 | team: shield 52 | topic: cert-manager 53 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/honeybadger/alerting-rules/crsync.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - crsync.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'kube_deployment_status_replicas_available{cluster_type="workload_cluster", installation="gazelle", cluster_id="operations", namespace="crsync", deployment="crsync-giantswarm-azurecr-io"}' 9 | values: "1x5 0x9 1x5 0x10" 10 | alert_rule_test: 11 | - alertname: CrsyncDeploymentNotSatisfied 12 | eval_time: 32m 13 | exp_alerts: 14 | - exp_labels: 15 | alertname: "CrsyncDeploymentNotSatisfied" 16 | area: platform 17 | cancel_if_outside_working_hours: "true" 18 | cluster_id: "operations" 19 | cluster_type: "workload_cluster" 20 | deployment: "crsync-giantswarm-azurecr-io" 21 | installation: "gazelle" 22 | namespace: "crsync" 23 | severity: "page" 24 | team: "honeybadger" 25 | topic: "releng" 26 | exp_annotations: 27 | description: "CrSync deployment crsync-giantswarm-azurecr-io is not satisfied in gazelle / operations at the crsync namespace." 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION=gazelle&CLUSTER=operations&NAMESPACE=crsync&KIND=deployment&NAME=crsync-giantswarm-azurecr-io 29 | - interval: 1m 30 | input_series: 31 | - series: 'crsync_sync_tags_total{registry="quay.io", cluster_id="example", repository="giantswarm/example"}' 32 | values: "100x60" 33 | - series: 'crsync_sync_tags_total{registry="docker.io", cluster_id="example", repository="giantswarm/example"}' 34 | values: "95x60" 35 | alert_rule_test: 36 | - alertname: CrsyncTooManyTagsMissing 37 | eval_time: 60m 38 | exp_alerts: 39 | - exp_labels: 40 | alertname: "CrsyncTooManyTagsMissing" 41 | area: platform 42 | cancel_if_outside_working_hours: "true" 43 | cluster_id: "example" 44 | registry: "quay.io" 45 | repository: "giantswarm/example" 46 | severity: "page" 47 | team: "honeybadger" 48 | topic: "releng" 49 | exp_annotations: 50 | description: "Too many tags are not synchronised to registry mirrors." 
51 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/crsync-too-many-tags-missing/ 52 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/capi-machinepool.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - capi-machinepool.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'capi_machinepool_status_phase{phase="Failed", cluster_id="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' 8 | values: "0+3x75" 9 | - series: 'capi_cluster_info{cluster_id="clippaxy", provider="capa"}' 10 | values: "1+0x75" 11 | - series: 'capi_machinepool_annotation_paused{paused_value="true",cluster_id="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' 12 | values: "0+1x75" 13 | - series: 'capi_cluster_info{cluster_id="grumpy", provider="capa"}' 14 | values: "1+0x75" 15 | alert_rule_test: 16 | - alertname: MachinePoolIsNotHealthy 17 | eval_time: 25m 18 | exp_alerts: 19 | - exp_labels: 20 | area: kaas 21 | cancel_if_monitoring_agent_down: "true" 22 | cancel_if_outside_working_hours: "true" 23 | provider: capa 24 | severity: page 25 | phase: Failed 26 | team: phoenix 27 | topic: managementcluster 28 | cluster_id: clippaxy 29 | name: clippaxy-def00 30 | exported_namespace: giantswarm 31 | exp_annotations: 32 | description: "The clusters clippaxy machinepool giantswarm/clippaxy-def00 is not healthy." 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinepool/ 34 | __dashboardUid__: bdi7iswg81czkcasd 35 | dashboardQueryParams: "orgId=2" 36 | - alertname: MachinePoolPaused 37 | eval_time: 75m 38 | exp_alerts: 39 | - exp_labels: 40 | area: kaas 41 | cancel_if_monitoring_agent_down: "true" 42 | cancel_if_outside_working_hours: "true" 43 | provider: capa 44 | severity: notify 45 | team: phoenix 46 | topic: managementcluster 47 | cluster_id: grumpy 48 | name: grumpy-72r5c 49 | exported_namespace: giantswarm 50 | paused_value: "true" 51 | exp_annotations: 52 | description: "The clusters grumpy machinepool giantswarm/grumpy-72r5c is paused." 
53 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinepool/ 54 | __dashboardUid__: bdi7iswg81czkcasd 55 | dashboardQueryParams: "orgId=2" 56 | -------------------------------------------------------------------------------- /renovate.json5: -------------------------------------------------------------------------------- 1 | { 2 | // Base config - https://github.com/giantswarm/renovate-presets/blob/main/default.json5 3 | "extends": [ 4 | "github>giantswarm/renovate-presets:default.json5" 5 | ], 6 | "customManagers": [ 7 | { 8 | "customType": "regex", 9 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 10 | "matchStrings": [ 11 | "ARCHITECT_VERSION=\"(?.*?)\"" 12 | ], 13 | "depNameTemplate": "giantswarm/architect", 14 | "datasourceTemplate": "github-releases", 15 | "versioningTemplate": "semver" 16 | }, 17 | { 18 | "customType": "regex", 19 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 20 | "matchStrings": [ 21 | "HELM_VERSION=\"(?.*?)\"" 22 | ], 23 | "depNameTemplate": "helm/helm", 24 | "datasourceTemplate": "github-releases", 25 | "versioningTemplate": "semver" 26 | }, 27 | { 28 | "customType": "regex", 29 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 30 | "matchStrings": [ 31 | "JQ_VERSION=\"(?.*?)\"" 32 | ], 33 | "depNameTemplate": "jqlang/jq", 34 | "datasourceTemplate": "github-releases", 35 | "versioningTemplate": "semver" 36 | }, 37 | { 38 | "customType": "regex", 39 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 40 | "matchStrings": [ 41 | "LOKITOOL_VERSION=\"(?.*?)\"" 42 | ], 43 | "depNameTemplate": "grafana/loki", 44 | "datasourceTemplate": "github-releases", 45 | "versioningTemplate": "semver" 46 | }, 47 | { 48 | "customType": "regex", 49 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 50 | "matchStrings": [ 51 | "PINT_VERSION=\"(?.*?)\"" 52 | ], 53 | "depNameTemplate": "cloudflare/pint", 54 | "datasourceTemplate": "github-releases", 55 | "versioningTemplate": "semver" 56 | }, 57 | { 58 | "customType": "regex", 59 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 60 | "matchStrings": [ 61 | "PROMETHEUS_VERSION=\"(?.*?)\"" 62 | ], 63 | "depNameTemplate": "prometheus/prometheus", 64 | "datasourceTemplate": "github-releases", 65 | "versioningTemplate": "semver" 66 | }, 67 | { 68 | "customType": "regex", 69 | "fileMatch": ["test/hack/bin/fetch-tools.sh"], 70 | "matchStrings": [ 71 | "YQ_VERSION=\"(?.*?)\"" 72 | ], 73 | "depNameTemplate": "mikefarah/yq", 74 | "datasourceTemplate": "github-releases", 75 | "versioningTemplate": "semver" 76 | } 77 | ] 78 | } 79 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/shield/alerting-rules/dex.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: dex.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: dex 12 | rules: 13 | - alert: DexErrorRateHigh 14 | annotations: 15 | description: '{{`Dex running on {{ $labels.cluster_id }} is reporting an increased error rate.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dex-error-rate-high/ 17 | expr: sum(increase(http_requests_total{job="dex", handler!="/token", code=~"^[4]..$|[5]..$", cluster_type="management_cluster"}[5m])) by (cluster_id, installation, pipeline, provider) > 10 18 | for: 30m 19 | labels: 20 | area: platform 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: shield 24 | topic: dex 25 | - alert: DexSecretExpired 26 | annotations: 27 | description: '{{`dex-operator failed to renew secret of {{ $labels.app_registration_name }} for {{ $labels.app_owner }} on provider {{ $labels.provider_type }}.`}}' 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dex-operator/ 29 | expr: min by(app_registration_name, app_owner, app_namespace, provider_name, provider_type, installation, cluster_id, pipeline, provider) (aggregation:dex_operator_idp_secret_expiry_time{cluster_type="management_cluster", provider_type!="github"}) - time() < 60*60*12 30 | for: 30m 31 | labels: 32 | area: platform 33 | cancel_if_outside_working_hours: "true" 34 | severity: page 35 | team: shield 36 | topic: dex 37 | - alert: ManagementClusterDexAppMissing 38 | annotations: 39 | description: '{{`dex-operator did not register a dex-app in giantswarm namespace.`}}' 40 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dex-operator/ 41 | expr: absent(dex_operator_idp_secret_expiry_time{app_namespace="giantswarm", cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) 42 | for: 30m 43 | labels: 44 | area: platform 45 | cancel_if_outside_working_hours: "true" 46 | cancel_if_metrics_broken: "true" 47 | severity: page 48 | team: shield 49 | topic: dex 50 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/node-exporter.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: node-exporter.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: node-exporter 12 | rules: 13 | - alert: NodeExporterCollectorFailed 14 | annotations: 15 | description: '{{`NodeExporter Collector {{ $labels.collector }} on {{ $labels.instance }} is failed.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/node-exporter-device-error/ 17 | # TODO(@giantswarm/team-atlas): the namespace filter should be removed when this completed https://github.com/giantswarm/roadmap/issues/3791, see https://github.com/giantswarm/prometheus-rules/pull/1491 18 | expr: node_scrape_collector_success{collector!~"conntrack|bonding|hwmon|powersupplyclass|mdadm|nfs|nfsd|tapestats|fibrechannel|nvme|watchdog", namespace="kube-system"} == 0 19 | for: 5m 20 | labels: 21 | area: kaas 22 | cancel_if_outside_working_hours: "true" 23 | severity: page 24 | team: tenet 25 | topic: observability 26 | - name: resource-usage 27 | rules: 28 | # IncorrectResourceUsageData alert detects if the data used in the Grafana Cloud Resource Usage dashboard is incorrect by comparing the dashboard data against data from the kubelet. 29 | - alert: IncorrectResourceUsageData 30 | annotations: 31 | description: '{{`Data used in the Grafana Cloud Resource Usage dashboard is incorrect for cluster {{ $labels.cluster_id }}.`}}' 32 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/resource-usage-dashboard/ 33 | expr: | 34 | quantile_over_time(0.9, aggregation:node:cpu_cores_total[120m:30m]) / on(cluster_id, cluster_type, customer, installation, pipeline, provider, region) quantile_over_time(0.9, (sum(machine_cpu_cores)by(cluster_id, cluster_type, customer, installation, pipeline, provider, region))[120m:30m]) < 0.9 35 | or 36 | quantile_over_time(0.9, aggregation:node:memory_memtotal_bytes_total[120m:30m]) / on(cluster_id, cluster_type, customer, installation, pipeline, provider, region) quantile_over_time(0.9, (sum(machine_memory_bytes)by(cluster_id, cluster_type, customer, installation, pipeline, provider, region))[120m:30m]) < 0.9 37 | for: 1h 38 | labels: 39 | area: kaas 40 | cancel_if_outside_working_hours: "true" 41 | severity: page 42 | team: tenet 43 | topic: observability 44 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/capi-machinedeployment.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - capi-machinedeployment.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'capi_machinedeployment_status_phase{phase="Failed", cluster_id="clippaxy", name="clippaxy-def00", exported_namespace="giantswarm"}' 8 | values: "0+3x75" 9 | - series: 'capi_cluster_info{cluster_id="clippaxy", provider="capa"}' 10 | values: "1+0x75" 11 | - series: 'capi_machinedeployment_annotation_paused{paused_value="true",cluster_id="grumpy", name="grumpy-def99", exported_namespace="giantswarm"}' 12 | values: "0+1x75" 13 | - series: 'capi_cluster_info{cluster_id="grumpy", provider="capa"}' 14 | values: "1+0x75" 15 | alert_rule_test: 16 | - alertname: MachineDeploymentIsNotHealthy 17 | eval_time: 25m 18 | exp_alerts: 19 | - exp_labels: 20 | area: kaas 21 | cancel_if_monitoring_agent_down: "true" 22 | cancel_if_outside_working_hours: "true" 23 | provider: capa 24 | severity: notify 25 | phase: Failed 26 | team: phoenix 27 | topic: managementcluster 28 | cluster_id: clippaxy 29 | 
name: clippaxy-def00 30 | exported_namespace: giantswarm 31 | exp_annotations: 32 | description: "The clusters clippaxy machinedeployment giantswarm/clippaxy-def00 is not healthy." 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinedeployment/ 34 | __dashboardUid__: bdi7iswg81czkcasd 35 | dashboardQueryParams: "orgId=2" 36 | - alertname: MachineDeploymentPaused 37 | eval_time: 75m 38 | exp_alerts: 39 | - exp_labels: 40 | area: kaas 41 | cancel_if_monitoring_agent_down: "true" 42 | cancel_if_outside_working_hours: "true" 43 | provider: capa 44 | severity: notify 45 | team: phoenix 46 | topic: managementcluster 47 | cluster_id: grumpy 48 | name: grumpy-def99 49 | exported_namespace: giantswarm 50 | paused_value: "true" 51 | exp_annotations: 52 | description: "The clusters grumpy machinedeployment giantswarm/grumpy-def99 is paused." 53 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinedeployment/ 54 | __dashboardUid__: bdi7iswg81czkcasd 55 | dashboardQueryParams: "orgId=2" 56 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-machine.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . | nindent 4}} 5 | name: capi-machine.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-machine 10 | rules: 11 | - alert: MachineUnhealthyPhase 12 | annotations: 13 | description: |- 14 | {{`Machine {{ $labels.exported_namespace}}/{{ $labels.name }} stuck in phase {{ $labels.phase }} for more than 30 minutes.`}} 15 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machine/ 16 | __dashboardUid__: bdi7iswg81czkcasd 17 | dashboardQueryParams: "orgId=2" 18 | expr: |- 19 | ( 20 | capi_machine_status_phase{phase!="Running", name!~".*bastion.*"} 21 | * on(cluster_id) group_left(provider) 22 | sum( 23 | label_replace( 24 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 25 | ) 26 | ) by (cluster_id, provider) 27 | ) > 0 28 | for: 30m 29 | labels: 30 | area: kaas 31 | cancel_if_monitoring_agent_down: "true" 32 | cancel_if_outside_working_hours: "true" 33 | severity: page 34 | team: {{ include "providerTeam" . }} 35 | topic: managementcluster 36 | - alert: MachinePaused 37 | expr: |- 38 | ( 39 | capi_machine_annotation_paused{paused_value="true"} 40 | * on(cluster_id) group_left(provider) 41 | sum( 42 | label_replace( 43 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 44 | ) 45 | ) by (cluster_id, provider) 46 | ) > 0 47 | for: 1h 48 | labels: 49 | area: kaas 50 | cancel_if_monitoring_agent_down: "true" 51 | cancel_if_outside_working_hours: "true" 52 | severity: notify 53 | team: {{ include "providerTeam" . 
}} 54 | topic: managementcluster 55 | annotations: 56 | description: |- 57 | {{`Machine {{ $labels.exported_namespace}}/{{ $labels.name }} is paused.`}} 58 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machine/ 59 | __dashboardUid__: bdi7iswg81czkcasd 60 | dashboardQueryParams: "orgId=2" 61 | -------------------------------------------------------------------------------- /test/tests/providers/capz/kaas/phoenix/alerting-rules/dns-operator-azure.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - dns-operator-azure.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'dns_operator_azure_zone_info{controller="dns-operator-azure",resource_group="425bdf54",subscription_id="09be0ac8-38d9-4fe1-aa72-4ce2e8a084d2",tenant_id="4e4e320b-cf45-4fd4-9dd3-ec0046779035",zone="425bdf54.azuretest.gigantic.io",installation="puppy",type="public"}' 8 | values: "1+0x60" 9 | - series: 'capi_cluster_status_phase{name="425bdf54", exported_namespace="org-83dd715d", phase="Provisioned", installation="puppy"}' 10 | values: "1+0x60" 11 | - series: 'capi_cluster_status_phase{name="8e8225b5", exported_namespace="org-31f75bf9", phase="Provisioned", installation="puppy"}' 12 | values: "1+0x60" 13 | - series: 'dns_operator_azure_api_request_errors_total{controller="dns-operator-azure",method="recordSets.CreateOrUpdate",installation="puppy"}' 14 | values: "0+0x10 1+1x20" 15 | - series: 'dns_operator_azure_api_request_errors_total{controller="dns-operator-azure",method="zones.Get",installation="puppy"}' 16 | values: "0+0x10 1+1x10 0+0x10" 17 | alert_rule_test: 18 | - alertname: ClusterDNSZoneMissing 19 | eval_time: 30m 20 | exp_alerts: 21 | - exp_labels: 22 | area: kaas 23 | cancel_if_outside_working_hours: "false" 24 | severity: notify 25 | team: phoenix 26 | topic: managementcluster 27 | phase: Provisioned 28 | exported_namespace: org-31f75bf9 29 | installation: puppy 30 | name: 8e8225b5 31 | exp_annotations: 32 | description: "No DNS-zone for cluster org-31f75bf9/8e8225b5 got created yet. Check dns-operator-azure logs in installation/puppy." 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dns-operator-azure/ 34 | - alertname: AzureDNSOperatorAPIErrorRate 35 | eval_time: 30m 36 | exp_alerts: 37 | - exp_labels: 38 | area: kaas 39 | cancel_if_outside_working_hours: "false" 40 | severity: notify 41 | team: phoenix 42 | topic: managementcluster 43 | installation: puppy 44 | method: recordSets.CreateOrUpdate 45 | exp_annotations: 46 | description: "Error rate for recordSets.CreateOrUpdate is high. Check dns-operator-azure logs in installation/puppy." 47 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/dns-operator-azure/ 48 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-machinepool.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . 
| nindent 4}} 5 | name: capi-machinepool.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-machinepool 10 | rules: 11 | - alert: MachinePoolIsNotHealthy 12 | expr: |- 13 | ( 14 | capi_machinepool_status_phase{phase="Failed"} 15 | * on(cluster_id) group_left(provider) 16 | sum( 17 | label_replace( 18 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 19 | ) 20 | ) by (cluster_id, provider) 21 | ) > 0 22 | for: 15m 23 | labels: 24 | area: kaas 25 | cancel_if_monitoring_agent_down: "true" 26 | cancel_if_outside_working_hours: "true" 27 | severity: page 28 | team: {{ include "providerTeam" . }} 29 | topic: managementcluster 30 | annotations: 31 | description: |- 32 | {{`The clusters {{ $labels.cluster_id }} machinepool {{ $labels.exported_namespace }}/{{ $labels.name }} is not healthy.`}} 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinepool/ 34 | __dashboardUid__: bdi7iswg81czkcasd 35 | dashboardQueryParams: "orgId=2" 36 | - alert: MachinePoolPaused 37 | expr: |- 38 | ( 39 | capi_machinepool_annotation_paused{paused_value="true"} 40 | * on(cluster_id) group_left(provider) 41 | sum( 42 | label_replace( 43 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 44 | ) 45 | ) by (cluster_id, provider) 46 | ) > 0 47 | for: 1h 48 | labels: 49 | area: kaas 50 | cancel_if_monitoring_agent_down: "true" 51 | cancel_if_outside_working_hours: "true" 52 | severity: notify 53 | team: {{ include "providerTeam" . }} 54 | topic: managementcluster 55 | annotations: 56 | description: |- 57 | {{`The clusters {{ $labels.cluster_id }} machinepool {{ $labels.exported_namespace }}/{{ $labels.name }} is paused.`}} 58 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinepool/ 59 | __dashboardUid__: bdi7iswg81czkcasd 60 | dashboardQueryParams: "orgId=2" 61 | -------------------------------------------------------------------------------- /test/tests/providers/global/kaas/tenet/alerting-rules/capi-machine.rules.test.yml: -------------------------------------------------------------------------------- 1 | rule_files: 2 | - capi-machine.rules.yml 3 | 4 | tests: 5 | - interval: 1m 6 | input_series: 7 | - series: 'capi_machine_status_phase{cluster_id="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Running"}' 8 | values: "1+0x10 0+0x35" 9 | - series: 'capi_machine_status_phase{cluster_id="clippaxy", name="clippaxy-72jq5", exported_namespace="giantswarm", phase="Failed"}' 10 | values: "0+0x10 1+0x35" 11 | - series: 'capi_cluster_info{cluster_id="clippaxy", provider="capa"}' 12 | values: "1+0x45" 13 | - series: 'capi_machine_annotation_paused{paused_value="true",cluster_id="grumpy", name="grumpy-72r5c", exported_namespace="giantswarm"}' 14 | values: "0+1x75" 15 | - series: 'capi_cluster_info{cluster_id="grumpy", provider="capa"}' 16 | values: "1+0x75" 17 | alert_rule_test: 18 | - alertname: MachineUnhealthyPhase 19 | eval_time: 45m 20 | exp_alerts: 21 | - exp_labels: 22 | area: kaas 23 | cancel_if_monitoring_agent_down: "true" 24 | cancel_if_outside_working_hours: "true" 25 | provider: capa 26 | severity: page 27 | team: phoenix 28 | topic: managementcluster 29 | cluster_id: clippaxy 30 | name: clippaxy-72jq5 31 | exported_namespace: giantswarm 32 | phase: Failed 33 | exp_annotations: 34 | description: "Machine giantswarm/clippaxy-72jq5 stuck in phase Failed for more than 30 minutes." 
35 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machine/ 36 | __dashboardUid__: bdi7iswg81czkcasd 37 | dashboardQueryParams: "orgId=2" 38 | - alertname: MachinePaused 39 | eval_time: 75m 40 | exp_alerts: 41 | - exp_labels: 42 | area: kaas 43 | cancel_if_monitoring_agent_down: "true" 44 | cancel_if_outside_working_hours: "true" 45 | provider: capa 46 | severity: notify 47 | team: phoenix 48 | topic: managementcluster 49 | cluster_id: grumpy 50 | name: grumpy-72r5c 51 | exported_namespace: giantswarm 52 | paused_value: "true" 53 | exp_annotations: 54 | description: "Machine giantswarm/grumpy-72r5c is paused." 55 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machine/ 56 | __dashboardUid__: bdi7iswg81czkcasd 57 | dashboardQueryParams: "orgId=2" 58 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-machinedeployment.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . | nindent 4}} 5 | name: capi-machinedeployment.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-machinedeployment 10 | rules: 11 | - alert: MachineDeploymentIsNotHealthy 12 | expr: |- 13 | ( 14 | capi_machinedeployment_status_phase{phase="Failed"} 15 | * on(cluster_id) group_left(provider) 16 | sum( 17 | label_replace( 18 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 19 | ) 20 | ) by (cluster_id, provider) 21 | ) > 0 22 | for: 15m 23 | labels: 24 | area: kaas 25 | cancel_if_monitoring_agent_down: "true" 26 | cancel_if_outside_working_hours: "true" 27 | severity: notify 28 | team: {{ include "providerTeam" . }} 29 | topic: managementcluster 30 | annotations: 31 | description: |- 32 | {{`The clusters {{$labels.cluster_id}} machinedeployment {{$labels.exported_namespace}}/{{$labels.name}} is not healthy.`}} 33 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinedeployment/ 34 | __dashboardUid__: bdi7iswg81czkcasd 35 | dashboardQueryParams: "orgId=2" 36 | - alert: MachineDeploymentPaused 37 | expr: |- 38 | ( 39 | capi_machinedeployment_annotation_paused{paused_value="true"} 40 | * on(cluster_id) group_left(provider) 41 | sum( 42 | label_replace( 43 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 44 | ) 45 | ) by (cluster_id, provider) 46 | ) > 0 47 | for: 1h 48 | labels: 49 | area: kaas 50 | cancel_if_monitoring_agent_down: "true" 51 | cancel_if_outside_working_hours: "true" 52 | severity: notify 53 | team: {{ include "providerTeam" . 
}} 54 | topic: managementcluster 55 | annotations: 56 | description: |- 57 | {{`The clusters {{$labels.cluster_id}} machinedeployment {{$labels.exported_namespace}}/{{$labels.name}} is paused.`}} 58 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-machinedeployment/ 59 | __dashboardUid__: bdi7iswg81czkcasd 60 | dashboardQueryParams: "orgId=2" 61 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/etcdbackup.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: etcdbackup.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: etcdbackup 12 | rules: 13 | - alert: ETCDBackupJobFailedOrStuck 14 | annotations: 15 | description: '{{`Job {{ $labels.job }} failed or has not been completed for more than 30 minutes.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/etcd-backup-failed/ 17 | expr: kube_job_failed{cluster_type="management_cluster",condition="true",job=~"etcd-backup.+"} == 1 or kube_pod_status_phase{cluster_type="management_cluster",phase="Pending",pod=~"etcd-backup.+"} == 1 or kube_job_status_succeeded{cluster_type="management_cluster",job=~"etcd-backup.+"} == 0 18 | for: 30m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: "true" 22 | severity: page 23 | team: tenet 24 | topic: etcd-backup 25 | - alert: LatestETCDBackup2DaysOld 26 | annotations: 27 | description: '{{`Latest successful ETCD backup for {{ $labels.cluster_id }}/{{ $labels.tenant_cluster_id }} was more than 48h ago.`}}' 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/etcd-backup-failed/ 29 | expr: count(label_replace(capi_cluster_created, "tenant_cluster_id", "$1", "name", "(.*)")) by (cluster_id, installation, pipeline, provider, tenant_cluster_id) > 48 * 60 * 60 unless count((time() - etcd_backup_latest_success{tenant_cluster_id!="Control Plane"}) > 48 * 60 * 60) by (cluster_id, installation, pipeline, provider, tenant_cluster_id) 30 | for: 5m 31 | labels: 32 | area: kaas 33 | cancel_if_outside_working_hours: "true" 34 | severity: page 35 | team: tenet 36 | topic: etcd-backup 37 | - alert: ETCDBackupMetricsMissing 38 | annotations: 39 | description: '{{`ETCD backup metrics are missing`}}' 40 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/etcd-backup-metrics-missing/ 41 | expr: absent(etcd_backup_latest_attempt{cluster_type="management_cluster", cluster_id="{{ .Values.managementCluster.name }}", installation="{{ .Values.managementCluster.name }}", provider="{{ .Values.managementCluster.provider.kind }}", pipeline="{{ .Values.managementCluster.pipeline }}"}) 42 | for: 12h 43 | labels: 44 | area: kaas 45 | cancel_if_outside_working_hours: "true" 46 | cancel_if_metrics_broken: "true" 47 | severity: page 48 | team: tenet 49 | topic: etcd-backup 50 | -------------------------------------------------------------------------------- /.circleci/config.yml: -------------------------------------------------------------------------------- 1 | version: 2.1 2 | orbs: 3 | architect: giantswarm/architect@6.11.0 4 | 5 | workflows: 6 | package-and-push-chart-on-tag: 7 | jobs: 8 | - architect/push-to-app-catalog: 9 | context: architect 10 | executor: 
app-build-suite 11 | name: app-catalog 12 | app_catalog: control-plane-catalog 13 | app_catalog_test: control-plane-test-catalog 14 | chart: prometheus-rules 15 | # Trigger job on git tag. 16 | filters: 17 | tags: 18 | only: /^v.*/ 19 | branches: 20 | ignore: 21 | - main 22 | - master 23 | 24 | - architect/push-to-app-collection: 25 | context: architect 26 | name: push-to-capa-app-collection 27 | app_name: prometheus-rules 28 | app_namespace: monitoring 29 | app_collection_repo: capa-app-collection 30 | requires: 31 | - app-catalog 32 | filters: 33 | branches: 34 | ignore: /.*/ 35 | tags: 36 | only: /^v.*/ 37 | 38 | - architect/push-to-app-collection: 39 | context: architect 40 | name: push-to-capz-app-collection 41 | app_name: prometheus-rules 42 | app_namespace: monitoring 43 | app_collection_repo: capz-app-collection 44 | requires: 45 | - app-catalog 46 | filters: 47 | branches: 48 | ignore: /.*/ 49 | tags: 50 | only: /^v.*/ 51 | 52 | - architect/push-to-app-collection: 53 | context: architect 54 | name: push-to-cloud-director-app-collection 55 | app_name: prometheus-rules 56 | app_namespace: monitoring 57 | app_collection_repo: cloud-director-app-collection 58 | requires: 59 | - app-catalog 60 | filters: 61 | branches: 62 | ignore: /.*/ 63 | tags: 64 | only: /^v.*/ 65 | 66 | - architect/push-to-app-collection: 67 | context: architect 68 | name: vsphere-app-collection 69 | app_name: prometheus-rules 70 | app_namespace: monitoring 71 | app_collection_repo: vsphere-app-collection 72 | requires: 73 | - app-catalog 74 | filters: 75 | branches: 76 | ignore: /.*/ 77 | tags: 78 | only: /^v.*/ 79 | 80 | - architect/push-to-app-collection: 81 | context: architect 82 | name: proxmox-app-collection 83 | app_name: prometheus-rules 84 | app_namespace: monitoring 85 | app_collection_repo: proxmox-app-collection 86 | requires: 87 | - app-catalog 88 | filters: 89 | branches: 90 | ignore: /.*/ 91 | tags: 92 | only: /^v.*/ 93 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/tracing-pipeline.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: tracing-pipeline.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: tracing-pipeline 11 | rules: 12 | # This alert will trigger if the failure rate of spans sent by the OTEL exporter exceeds a defined threshold (e.g., 10%). 13 | - alert: OTLPTraceForwardingErrors 14 | annotations: 15 | __dashboardUid__: 9b6d37c8603e19e8922133984faad93d 16 | dashboardQueryParams: "orgId=2" 17 | summary: Alloy OTLP exporter is failing to send spans.
18 | description: '{{`The Alloy OTLP exporter has failed to send {{ printf "%.1f" $value }}% of spans over the last 5 minutes.`}}' 19 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/tracing-pipeline/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 20 | expr: |- 21 | ( 22 | rate(otelcol_exporter_send_failed_spans_total{job="alloy-events"}[5m]) 23 | / 24 | rate(otelcol_exporter_sent_spans_total{job="alloy-events"}[5m]) 25 | ) * 100 26 | >= 10 # Trigger if failure rate exceeds 10% 27 | for: 1h 28 | labels: 29 | area: platform 30 | severity: page 31 | team: atlas 32 | topic: observability 33 | cancel_if_outside_working_hours: "true" 34 | # This alert triggers if the Alloy OTLP exporter fails to enqueue spans at a sustained rate exceeding 100 spans per second over 5 minutes, which could indicate upstream issues or resource constraints. 35 | - alert: OTLPExporterEnqueueFailures 36 | annotations: 37 | summary: Alloy OTLP exporter enqueue failures exceed 100 spans/second over 5 minutes 38 | description: '{{`The Alloy OTLP exporter has failed to enqueue more than 100 spans per second on average over the last 5 minutes, indicating potential upstream issues.`}}' 39 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/tracing-pipeline/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 40 | __dashboardUid__: 9b6d37c8603e19e8922133984faad93d 41 | dashboardQueryParams: "orgId=2" 42 | expr: rate(otelcol_exporter_enqueue_failed_spans_total{job="alloy-events"}[5m]) > 100 43 | for: 1h 44 | labels: 45 | area: platform 46 | severity: page 47 | team: atlas 48 | topic: observability 49 | cancel_if_outside_working_hours: "true" 50 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/atlas/alerting-rules/sloth.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - sloth.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | # For the first 60min: test with 1 pod: none, up, down 9 | - series: 'up{job="monitoring/sloth", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="capa", pipeline="testing"}' 10 | values: "_x20 1+0x20 0+0x20" 11 | alert_rule_test: 12 | - alertname: SlothDown 13 | eval_time: 10m 14 | - alertname: SlothDown 15 | eval_time: 30m 16 | - alertname: SlothDown 17 | eval_time: 50m 18 | exp_alerts: 19 | - exp_labels: 20 | area: platform 21 | cluster_id: gauss 22 | installation: gauss 23 | provider: capa 24 | pipeline: testing 25 | severity: page 26 | team: atlas 27 | topic: observability 28 | cancel_if_cluster_control_plane_unhealthy: "true" 29 | cancel_if_outside_working_hours: "true" 30 | exp_annotations: 31 | description: "Sloth is down." 
32 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/sloth-down/?INSTALLATION=gauss&CLUSTER=gauss 33 | - interval: 1m 34 | input_series: 35 | - series: 'kube_pod_container_status_restarts_total{cluster_type="management_cluster", namespace="monitoring", container="sloth", installation="gauss", cluster_id="gauss"}' 36 | values: "0+0x20 0+5x20 100+0x140" # 0 restarts after 20 minutes then we restart 5 times per minute for 20 minutes then we stop restarting for 140 minutes 37 | alert_rule_test: 38 | - alertname: SlothRestartingTooOften 39 | eval_time: 15m # should be OK after 15 minutes 40 | - alertname: SlothRestartingTooOften 41 | eval_time: 85m # After 85 minutes, should fire an alert for the t+85 error 42 | exp_alerts: 43 | - exp_labels: 44 | area: platform 45 | cancel_if_cluster_control_plane_unhealthy: "true" 46 | cancel_if_outside_working_hours: "true" 47 | cluster_id: gauss 48 | cluster_type: management_cluster 49 | container: sloth 50 | installation: gauss 51 | namespace: monitoring 52 | severity: page 53 | team: atlas 54 | topic: observability 55 | exp_annotations: 56 | description: Sloth is restarting too often. 57 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/sloth-down/?INSTALLATION=gauss&CLUSTER=gauss 58 | - alertname: SlothRestartingTooOften 59 | eval_time: 140m # After 140m minutes, all should be back to normal 60 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/apiserver.management-cluster.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: apiserver.management-cluster.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: apiserver 12 | rules: 13 | - alert: ManagementClusterAPIServerAdmissionWebhookErrors 14 | annotations: 15 | description: '{{`Kubernetes API Server {{ $labels.cluster_id }} having admission webhook errors.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/apiserver-admission-webhook-errors/ 17 | expr: label_replace(rate(apiserver_admission_webhook_rejection_count{cluster_type="management_cluster", error_type=~"calling_webhook_error|apiserver_internal_error"}[5m]), "service", "$1", "name", "(.*)") > 1 18 | for: 15m 19 | labels: 20 | area: kaas 21 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 22 | severity: page 23 | team: tenet 24 | topic: managementcluster 25 | - alert: ManagementClusterWebhookDurationExceedsTimeout 26 | annotations: 27 | description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} is timing out.`}}' 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/apiserver-admission-webhook-errors/ 29 | expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="management_cluster", name!="apps.app-admission-controller.giantswarm.io"}[5m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 5 30 | for: 25m 31 | labels: 32 | area: kaas 33 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . 
}} 34 | severity: page 35 | team: tenet 36 | topic: managementcluster 37 | 38 | # Kyverno webhooks that may block critical objects 39 | - alert: ManagementClusterWebhookDurationExceedsTimeoutKyvernoCritical 40 | annotations: 41 | description: '{{`Kubernetes API Server admission webhook {{ $labels.name }} takes very long or is timing out.`}}' 42 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/apiserver-admission-webhook-errors/ 43 | expr: histogram_quantile(0.95, sum(rate(apiserver_admission_webhook_admission_duration_seconds_bucket{cluster_type="management_cluster", name=~".*(kyverno.*fail).*"}[15m])) by (cluster_id, installation, pipeline, provider, name, job, le)) > 10 44 | for: 10m 45 | labels: 46 | area: kaas 47 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 48 | severity: page 49 | team: tenet 50 | topic: managementcluster 51 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/certificate.management-cluster.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: certificate.management-cluster.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: certificate.management-cluster 12 | rules: 13 | - alert: ManagementClusterCertificateIsMissing 14 | annotations: 15 | description: '{{`Cannot renew Certificate for Secret {{ $labels.exported_namespace }}/{{ $labels.certificatename }} on {{ $labels.cluster_id }} because it is missing.`}}' 16 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/managed-app-cert-manager/missing-certificate-for-secret/ 17 | expr: | 18 | count( 19 | cert_exporter_secret_not_after{cluster_type="management_cluster", secretkey="tls.crt", certificatename=~"^capa-serving-cert$|^capi-serving-cert$|^capi-kubeadm-bootstrap-serving-cert$|^capi-kubeadm-control-plane-serving-cert$|^capv-serving-cert$|^capmox-serving-cert$|^caip-in-cluster-serving-cert$|^capvcd-serving-cert$|^capz-serving-cert$|^azureserviceoperator-serving-cert$|^aws-pod-identity-webhook$"} 20 | ) by (cluster_id, installation, pipeline, provider, certificatename, exported_namespace) 21 | unless 22 | count( 23 | label_replace( 24 | cert_exporter_certificate_cr_not_after{cluster_type="management_cluster", name=~"^capa-serving-cert$|^capi-serving-cert$|^capi-kubeadm-bootstrap-serving-cert$|^capi-kubeadm-control-plane-serving-cert$|^capv-serving-cert$|^capmox-serving-cert$|^caip-in-cluster-serving-cert$|^capvcd-serving-cert$|^capz-serving-cert$|^azureserviceoperator-serving-cert$|^aws-pod-identity-webhook$"}, 25 | "certificatename", 26 | "$1", 27 | "name", 28 | "(.*)" 29 | ) 30 | ) by (cluster_id, installation, pipeline, provider, certificatename, exported_namespace) 31 | for: 5m 32 | labels: 33 | area: kaas 34 | cancel_if_outside_working_hours: "true" 35 | severity: page 36 | team: {{ include "providerTeam" . 
}} 37 | topic: security 38 | - alert: ManagementClusterCertificateWillExpireInLessThanOneMonth 39 | annotations: 40 | description: '{{`Certificate {{ $labels.path }} on {{ $labels.node }} will expire in less than one month.`}}' 41 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/renew-certificates/ 42 | expr: (cert_exporter_not_after{cluster_type="management_cluster", path!="/etc/kubernetes/ssl/service-account-crt.pem"} - time()) < 4 * 7 * 24 * 60 * 60 43 | for: 5m 44 | labels: 45 | area: kaas 46 | cancel_if_outside_working_hours: "true" 47 | severity: page 48 | team: se 49 | topic: security 50 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/capi-kubeadmcontrolplane.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: {{- include "labels.common" . | nindent 4}} 5 | name: capi-kubeadmcontrolplane.rules 6 | namespace: {{.Values.namespace}} 7 | spec: 8 | groups: 9 | - name: capi-kubeadmcontrolplane 10 | rules: 11 | - alert: KubeadmControlPlaneReplicasMismatch 12 | expr: |- 13 | ( 14 | (capi_kubeadmcontrolplane_spec_replicas != capi_kubeadmcontrolplane_status_replicas_ready) 15 | * on(cluster_id) group_left(provider) 16 | sum( 17 | label_replace( 18 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 19 | ) 20 | ) by (cluster_id, provider) 21 | ) 22 | # 90min at max 3 replicas results in maximum of 30 minutes per control-plane machine. 23 | for: 90m 24 | labels: 25 | area: kaas 26 | cancel_if_monitoring_agent_down: "true" 27 | cancel_if_outside_working_hours: "true" 28 | severity: notify 29 | team: {{ include "providerTeam" . }} 30 | topic: managementcluster 31 | annotations: 32 | description: |- 33 | {{`The clusters {{$labels.cluster_id}} kubeadmcontrolplane {{$labels.exported_namespace}}/{{$labels.name}} does not match the expected number of replicas for longer than 90 minutes.`}} 34 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-kubeadmcontrolplane/ 35 | __dashboardUid__: bdi7iswg81czkcasd 36 | dashboardQueryParams: "orgId=2" 37 | - alert: KubeadmControlPlanePaused 38 | expr: |- 39 | ( 40 | capi_kubeadmcontrolplane_annotation_paused{paused_value="true"} 41 | * on(cluster_id) group_left(provider) 42 | sum( 43 | label_replace( 44 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 45 | ) 46 | ) by (cluster_id, provider) 47 | ) > 0 48 | for: 1h 49 | labels: 50 | area: kaas 51 | cancel_if_monitoring_agent_down: "true" 52 | cancel_if_outside_working_hours: "true" 53 | severity: notify 54 | team: {{ include "providerTeam" . 
}} 55 | topic: managementcluster 56 | annotations: 57 | description: |- 58 | {{`The clusters {{$labels.cluster_id}} kubeadmcontrolplane {{$labels.exported_namespace}}/{{$labels.name}} is paused.`}} 59 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/capi-kubeadmcontrolplane/ 60 | __dashboardUid__: bdi7iswg81czkcasd 61 | dashboardQueryParams: "orgId=2" 62 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/tenet/alerting-rules/vertical-pod-autoscaler.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | {{- include "labels.common" . | nindent 4 }} 6 | name: vertical-pod-autoscaler.rules 7 | namespace: {{ .Values.namespace }} 8 | spec: 9 | groups: 10 | - name: vertical-pod-autoscaler 11 | rules: 12 | - alert: VpaComponentTooManyRestarts 13 | annotations: 14 | description: This pages when one of the vpa's component has restarted too much over the last 10min. 15 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/vpa-component-too-many-restarts/ 16 | expr: | 17 | 1 - sum(increase(kube_pod_container_status_restarts_total{container=~"recommender|updater|admission-controller"}[10m])) by (container, cluster_id, cluster_type, customer, installation, pipeline, provider, region)/100 < 0.98 18 | or 19 | 1 - sum(increase(kube_pod_container_status_restarts_total{container="vertical-pod-autoscaler-app"}[10m])) by (container, cluster_id, cluster_type, customer, installation, pipeline, provider, region)/100 < 0.98 20 | for: 10m 21 | labels: 22 | area: kaas 23 | cancel_if_cluster_control_plane_unhealthy: "true" 24 | cancel_if_outside_working_hours: "true" 25 | severity: notify 26 | team: tenet 27 | topic: autoscaling 28 | - alert: FluxHelmReleaseFailed 29 | annotations: 30 | description: |- 31 | {{`Flux HelmRelease {{ $labels.name }} in ns {{ $labels.exported_namespace }} on {{ $labels.installation }}/{{ $labels.cluster_id }} is stuck in Failed state.`}} 32 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/flux-helmrelease-failed/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}&NAMESPACE={{ $labels.exported_namespace }}&HELMRELEASE_NAME={{ $labels.name }}`}}' 33 | {{- $components := "(vertical-pod-autoscaler-crd)" }} 34 | expr: | 35 | ( 36 | label_replace(gotk_resource_info{ready="False", customresource_kind="HelmRelease", cluster_type="management_cluster", exported_namespace!="flux-giantswarm", exported_namespace!~"org-t-.*", name=~"(.+)-{{ $components }}"}, "cluster_id", "$1", "name", "(.+)-{{ $components }}") 37 | * on(cluster_id) group_left(provider) 38 | sum( 39 | label_replace( 40 | capi_cluster_info, "provider", "vsphere", "infrastructure_reference_kind", "VSphereCluster" 41 | ) 42 | ) by (cluster_id, provider) 43 | ) > 0 44 | for: 20m 45 | labels: 46 | area: kaas 47 | cancel_if_outside_working_hours: "true" 48 | cancel_if_kube_state_metrics_down: "true" 49 | cancel_if_monitoring_agent_down: "true" 50 | severity: page 51 | team: {{ include "providerTeam" . 
}} 52 | topic: autoscaling 53 | namespace: |- 54 | {{`{{ $labels.exported_namespace }}`}} 55 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/honeybadger/alerting-rules/konfigure-operator.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - konfigure-operator.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'kube_deployment_status_replicas_unavailable{installation="exampleinstallation",cluster_id="examplecluster",cluster_type="management_cluster",namespace="giantswarm",deployment="konfigure-operator"}' 9 | values: '_x5 0x10 1x45 0x60' 10 | alert_rule_test: 11 | - alertname: KonfigureOperatorDeploymentNotSatisfied 12 | eval_time: 46m 13 | exp_alerts: 14 | - exp_labels: 15 | alertname: "KonfigureOperatorDeploymentNotSatisfied" 16 | area: "platform" 17 | cancel_if_outside_working_hours: "true" 18 | cluster_id: "examplecluster" 19 | cluster_type: "management_cluster" 20 | deployment: "konfigure-operator" 21 | installation: "exampleinstallation" 22 | namespace: "giantswarm" 23 | severity: "page" 24 | team: "honeybadger" 25 | topic: "managementcluster" 26 | exp_annotations: 27 | description: "Konfigure Operator deployment giantswarm/konfigure-operator is not satisfied." 28 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION=exampleinstallation&CLUSTER=examplecluster&NAMESPACE=giantswarm&KIND=deployment&NAME=konfigure-operator 29 | - alertname: KonfigureOperatorDeploymentNotSatisfied 30 | eval_time: 100m 31 | exp_alerts: [] 32 | - interval: 1m 33 | input_series: 34 | - series: 'konfigure_operator_reconcile_condition{condition_type="Ready", condition_status="False", resource_kind="ManagementClusterConfiguration", resource_name="test", resource_namespace="giantswarm", installation="example"}' 35 | values: '0x30 1x5 0x20 1x15' 36 | alert_rule_test: 37 | - alertname: KonfigurationReconciliationFailed 38 | eval_time: 35m 39 | exp_alerts: [] 40 | - alertname: KonfigurationReconciliationFailed 41 | eval_time: 70m 42 | exp_alerts: 43 | - exp_labels: 44 | alertname: "KonfigurationReconciliationFailed" 45 | area: "platform" 46 | cancel_if_outside_working_hours: "true" 47 | condition_status: "False" 48 | condition_type: "Ready" 49 | installation: "example" 50 | resource_kind: "ManagementClusterConfiguration" 51 | resource_name: "test" 52 | resource_namespace: "giantswarm" 53 | severity: "page" 54 | team: "honeybadger" 55 | topic: "releng" 56 | exp_annotations: 57 | description: "ManagementClusterConfiguration test in ns giantswarm on example is stuck in Failed state." 58 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/konfigure-operator/ 59 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/kaas/phoenix/alerting-rules/cluster-crossplane.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . 
| nindent 4 }} 7 | name: cluster-crossplane.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: cluster-crossplane 12 | rules: 13 | - alert: ClusterCrossplaneResourcesNotReady 14 | annotations: 15 | # Crossplane doesn't offer object names and the objects are stored on the MC, so right 16 | # now (2025-01), we can't make this alert WC-specific. 17 | description: '{{`Not all managed Crossplane resources of type "{{ $labels.gvk }}" on {{ $labels.cluster_id }} are ready. This could affect creation or health of workload clusters.`}}' 18 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/cluster-crossplane-resources/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 19 | # Match critical resources deployed by cluster-aws via aws-nth-crossplane-resources, 20 | # cilium-crossplane-resources, crossplane-fn-irsa, ... 21 | expr: | 22 | ( 23 | crossplane_managed_resource_exists{gvk=~"(iam.aws.upbound.io/.*, Kind=(Role.*|Policy)|sqs.aws.upbound.io/.*, Kind=Queue|sqs.aws.upbound.io/.*, Kind=QueuePolicy|cloudwatchevents.aws.upbound.io/.*, Kind=Rule|cloudwatchevents.aws.upbound.io/.*, Kind=Target|ec2.aws.upbound.io/.*, Kind=SecurityGroup|acm.aws.upbound.io/.*, Kind=Certificate|cloudfront.aws.upbound.io/.*, Kind=.+|iam.aws.upbound.io/.*, Kind=OpenIDConnectProvider|route53.aws.upbound.io/.*, Kind=Record|s3.aws.upbound.io/.*, Kind=Bucket.*)"} != crossplane_managed_resource_ready{gvk=~"(iam.aws.upbound.io/.*, Kind=(Role.*|Policy)|sqs.aws.upbound.io/.*, Kind=Queue|sqs.aws.upbound.io/.*, Kind=QueuePolicy|cloudwatchevents.aws.upbound.io/.*, Kind=Rule|cloudwatchevents.aws.upbound.io/.*, Kind=Target|ec2.aws.upbound.io/.*, Kind=SecurityGroup|acm.aws.upbound.io/.*, Kind=Certificate|cloudfront.aws.upbound.io/.*, Kind=.+|iam.aws.upbound.io/.*, Kind=OpenIDConnectProvider|route53.aws.upbound.io/.*, Kind=Record|s3.aws.upbound.io/.*, Kind=Bucket.*)"} 24 | ) OR 25 | iam_aws_upbound_role_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 OR 26 | sqs_aws_upbound_queue_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 OR 27 | sqs_aws_upbound_queuepolicy_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 OR 28 | cloudwatchevents_aws_upbound_rule_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 OR 29 | cloudwatchevents_aws_upbound_target_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 OR 30 | ec2_aws_upbound_securitygroup_ready{status="False", label_giantswarm_io_service_type="managed"} == 1 31 | for: 15m 32 | labels: 33 | area: kaas 34 | cancel_if_outside_working_hours: {{ include "workingHoursOnly" . }} 35 | severity: page 36 | team: phoenix 37 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/atlas/alerting-rules/fluentbit.rules.yml: -------------------------------------------------------------------------------- 1 | {{- if not (or (eq .Values.managementCluster.provider.kind "vsphere") (eq .Values.managementCluster.provider.kind "cloud-director")) }} 2 | apiVersion: monitoring.coreos.com/v1 3 | kind: PrometheusRule 4 | metadata: 5 | creationTimestamp: null 6 | labels: 7 | {{- include "labels.common" . 
| nindent 4 }} 8 | name: fluentbit.rules 9 | namespace: {{ .Values.namespace }} 10 | spec: 11 | groups: 12 | - name: fluentbit 13 | rules: 14 | - alert: FluentbitDropRatio 15 | annotations: 16 | description: '{{`Fluentbit ({{ $labels.instance }}) is dropping more than 1% records.`}}' 17 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/fluentbit/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 18 | __dashboardUid__: fluentbit 19 | dashboardQueryParams: "orgId=2" 20 | # Check the ratio of dropped records over the total number of records. 21 | # We only monitor this app on the management cluster so we don't get alerts if the customer misconfigures theirs. 22 | expr: |- 23 | rate( 24 | fluentbit_output_dropped_records_total{cluster_type="management_cluster"}[10m]) 25 | / ( 26 | rate(fluentbit_output_proc_records_total{cluster_type="management_cluster"}[10m]) 27 | + rate(fluentbit_output_dropped_records_total{cluster_type="management_cluster"}[10m]) 28 | ) 29 | > 0.01 30 | for: 20m 31 | labels: 32 | area: platform 33 | cancel_if_outside_working_hours: "true" 34 | severity: page 35 | team: atlas 36 | topic: observability 37 | - alert: FluentbitDown 38 | annotations: 39 | description: '{{`Fluentbit is down on node ({{ $labels.node }}).`}}' 40 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/runbooks/fluentbit/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 41 | __dashboardUid__: fluentbit 42 | dashboardQueryParams: "orgId=2" 43 | expr: sum(up{job="fluent-logshipping-app"}) by (job, cluster_id, installation, provider, pipeline, namespace, node) == 0 44 | for: 15m 45 | labels: 46 | area: platform 47 | cancel_if_outside_working_hours: "true" 48 | severity: page 49 | team: atlas 50 | topic: observability 51 | - alert: FluentbitDaemonSetNotSatisfied 52 | annotations: 53 | description: '{{`Daemonset {{ $labels.namespace}}/{{ $labels.daemonset }} is not satisfied.`}}' 54 | runbook_url: '{{`https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/daemonset-not-satisfied/?INSTALLATION={{ $labels.installation }}&CLUSTER={{ $labels.cluster_id }}`}}' 55 | __dashboardUid__: fluentbit 56 | dashboardQueryParams: "orgId=2" 57 | expr: kube_daemonset_status_number_unavailable{daemonset="fluent-logshipping-app"} > 0 58 | for: 1h 59 | labels: 60 | area: platform 61 | cancel_if_outside_working_hours: "true" 62 | severity: page 63 | team: atlas 64 | topic: observability 65 | {{- end }} 66 | -------------------------------------------------------------------------------- /helm/prometheus-rules/templates/platform/shield/alerting-rules/falco.rules.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | {{- include "labels.common" . | nindent 4 }} 7 | name: falco.rules 8 | namespace: {{ .Values.namespace }} 9 | spec: 10 | groups: 11 | - name: falco 12 | rules: 13 | - alert: FalcoCriticalAlertFiring 14 | annotations: 15 | description: |- 16 | {{`{{ if eq $labels.k8s_pod_name "" }}The Falco rule {{ $labels.rule }} was triggered on the node {{ $labels.hostname }}. 
17 | {{else}}Pod {{ $labels.k8s_ns_name }}/{{ $labels.k8s_pod_name }} triggered the Falco rule {{ $labels.rule }} on the node {{ $labels.hostname }}.{{ end }}`}} 18 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/falco-alert/ 19 | expr: increase(falco_events{priority=~"0|1|2|3"}[10m] ) > 0 20 | labels: 21 | area: platform 22 | cancel_if_outside_working_hours: "true" 23 | severity: notify 24 | team: shield 25 | topic: security 26 | - alert: FalcoMediumAlertFiring 27 | annotations: 28 | description: |- 29 | {{`{{ if eq $labels.k8s_pod_name "" }}The Falco rule {{ $labels.rule }} was triggered on the node {{ $labels.hostname }}. 30 | {{else}}Pod {{ $labels.k8s_ns_name }}/{{ $labels.k8s_pod_name }} triggered the Falco rule {{ $labels.rule }} on the node {{ $labels.hostname }}.{{ end }}`}} 31 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/falco-alert/ 32 | expr: increase(falco_events{priority=~"4|5"}[10m] ) > 0 33 | labels: 34 | area: platform 35 | severity: notify 36 | team: shield 37 | topic: security 38 | - alert: FalcoInformationalAlert 39 | annotations: 40 | description: |- 41 | {{`{{ if eq $labels.k8s_pod_name "" }}The Falco rule {{ $labels.rule }} was triggered on the node {{ $labels.hostname }}. 42 | {{else}}Pod {{ $labels.k8s_ns_name }}/{{ $labels.k8s_pod_name }} triggered the Falco rule {{ $labels.rule }} on the node {{ $labels.hostname }}.{{ end }}`}} 43 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/falco-alert/ 44 | expr: increase(falco_events{priority="6"}[10m] ) > 0 45 | labels: 46 | area: platform 47 | severity: notify 48 | team: shield 49 | topic: security 50 | - alert: FalcoXZBackdoorAlert 51 | annotations: 52 | description: |- 53 | {{`{{ if eq $labels.k8s_pod_name "" }}The Falco rule {{ $labels.rule }} was triggered on the node {{ $labels.hostname }}. 
{{else}}Pod {{ $labels.k8s_ns_name }}/{{ $labels.k8s_pod_name }} triggered the Falco rule {{ $labels.rule }} on the node {{ $labels.hostname }}.{{ end }}`}} 55 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/ops-recipes/falco-alert/ 56 | expr: falco_events{rule="Backdoored library loaded into SSHD (CVE-2024-3094)"} > 0 57 | labels: 58 | area: platform 59 | severity: notify 60 | team: shield 61 | topic: security 62 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/atlas/alerting-rules/logging-pipeline.rules.test.yml: -------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - logging-pipeline.rules.yml 4 | 5 | tests: 6 | # Test LogForwardingErrors 7 | - interval: 1m 8 | input_series: 9 | # Tests with multiple cases: no metrics, no requests, only status_code 204 ones, 204 ones and 500 ones that are less than 10% of the total, 500 requests that represent more than 10% of the total, only 500 ones 10 | - series: 'loki_write_request_duration_seconds_count{status_code="500", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="capa", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' 11 | values: "_x60 0+0x60 0+0x60 0+50x60 3000+100x60 9000+600x60" 12 | - series: 'loki_write_request_duration_seconds_count{status_code="204", cluster_type="management_cluster", cluster_id="gauss", installation="gauss", provider="capa", pipeline="testing", node="ip-10-0-5-145.eu-west-1.compute.internal", pod="alloy-2j7z7"}' 13 | values: "_x60 0+0x60 0+600x60 36000+600x60 72000+600x60 108000+0x60" 14 | alert_rule_test: 15 | - alertname: LogForwardingErrors 16 | eval_time: 30m 17 | - alertname: LogForwardingErrors 18 | eval_time: 90m 19 | - alertname: LogForwardingErrors 20 | eval_time: 150m 21 | - alertname: LogForwardingErrors 22 | eval_time: 210m 23 | - alertname: LogForwardingErrors 24 | eval_time: 270m 25 | exp_alerts: 26 | - exp_labels: 27 | area: platform 28 | cancel_if_outside_working_hours: "true" 29 | cluster_id: gauss 30 | installation: gauss 31 | pod: alloy-2j7z7 32 | provider: capa 33 | pipeline: testing 34 | severity: page 35 | team: atlas 36 | topic: observability 37 | exp_annotations: 38 | __dashboardUid__: 53c1ecddc3a1d5d4b8d6cd0c23676c31 39 | dashboardQueryParams: orgId=2 40 | description: "14.29% of the requests to Loki are failing for pod alloy-2j7z7 (threshold 10%)" 41 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/logging-pipeline/?INSTALLATION=gauss&CLUSTER=gauss 42 | - alertname: LogForwardingErrors 43 | eval_time: 330m 44 | exp_alerts: 45 | - exp_labels: 46 | area: platform 47 | cancel_if_outside_working_hours: "true" 48 | cluster_id: gauss 49 | installation: gauss 50 | pod: alloy-2j7z7 51 | provider: capa 52 | pipeline: testing 53 | severity: page 54 | team: atlas 55 | topic: observability 56 | exp_annotations: 57 | __dashboardUid__: 53c1ecddc3a1d5d4b8d6cd0c23676c31 58 | dashboardQueryParams: orgId=2 59 | description: "100.00% of the requests to Loki are failing for pod alloy-2j7z7 (threshold 10%)" 60 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/logging-pipeline/?INSTALLATION=gauss&CLUSTER=gauss 61 | -------------------------------------------------------------------------------- /test/tests/providers/global/platform/atlas/alerting-rules/statefulset.rules.test.yml:
-------------------------------------------------------------------------------- 1 | --- 2 | rule_files: 3 | - statefulset.rules.yml 4 | 5 | tests: 6 | - interval: 1m 7 | input_series: 8 | - series: 'kube_statefulset_status_replicas{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", customer="giantswarm", installation="gauss", namespace="loki", organization="giantswarm", pipeline="testing", region="westeurope", statefulset="loki-write"}' 9 | values: "3+0x5760" # 5760 = 4 days 10 | - series: 'kube_statefulset_status_replicas_ready{app="kube-state-metrics", cluster_id="gauss", cluster_type="management_cluster", customer="giantswarm", installation="gauss", namespace="loki", organization="giantswarm", pipeline="testing", region="westeurope", statefulset="loki-write"}' 11 | values: "3+0x60 2+0x4440 3+0x60" # 4440 = 3 days + 2h 12 | alert_rule_test: 13 | - alertname: StatefulsetNotSatisfiedAtlas 14 | eval_time: 60m 15 | - alertname: StatefulsetNotSatisfiedAtlas 16 | eval_time: 4380m # 3 days + 1h 17 | - alertname: StatefulsetNotSatisfiedAtlas 18 | eval_time: 4382m 19 | exp_alerts: 20 | - exp_labels: 21 | app: kube-state-metrics 22 | area: platform 23 | cancel_if_outside_working_hours: "false" 24 | cluster_id: "gauss" 25 | cluster_type: management_cluster 26 | customer: giantswarm 27 | installation: "gauss" 28 | namespace: loki 29 | organization: giantswarm 30 | pipeline: "testing" 31 | region: westeurope 32 | severity: page 33 | statefulset: loki-write 34 | team: atlas 35 | topic: managementcluster 36 | exp_annotations: 37 | description: "Statefulset loki/loki-write is not satisfied." 38 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION=gauss&CLUSTER=gauss&NAMESPACE=loki&KIND=statefulset&NAME=loki-write 39 | - alertname: StatefulsetNotSatisfiedAtlas 40 | eval_time: 4500m # 3 days + 3h 41 | exp_alerts: 42 | - exp_labels: 43 | app: kube-state-metrics 44 | area: platform 45 | cancel_if_outside_working_hours: "false" 46 | cluster_id: "gauss" 47 | cluster_type: management_cluster 48 | customer: giantswarm 49 | installation: "gauss" 50 | namespace: loki 51 | organization: giantswarm 52 | pipeline: "testing" 53 | region: westeurope 54 | severity: page 55 | statefulset: loki-write 56 | team: atlas 57 | topic: managementcluster 58 | exp_annotations: 59 | description: "Statefulset loki/loki-write is not satisfied." 60 | runbook_url: https://intranet.giantswarm.io/docs/support-and-ops/runbooks/deployment-not-satisfied/?INSTALLATION=gauss&CLUSTER=gauss&NAMESPACE=loki&KIND=statefulset&NAME=loki-write 61 | - alertname: StatefulsetNotSatisfiedAtlas 62 | eval_time: 4502m 63 | --------------------------------------------------------------------------------
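For orientation, each *.rules.test.yml above is a standard Prometheus unit-test file: it lists the rule file it exercises under rule_files, feeds synthetic samples via input_series, and asserts the exact label and annotation set of the alerts expected to fire at each eval_time. A minimal sketch of such a rule/test pair, runnable with promtool, is shown below; the file names, job label, and threshold are illustrative only and do not come from this repository.

# example.rules.yml (hypothetical rule file)
groups:
  - name: example
    rules:
      - alert: ExampleTargetDown
        expr: up{job="example"} == 0
        for: 5m
        labels:
          severity: notify
        annotations:
          description: "Target for job example is down."

# example.rules.test.yml (hypothetical test file)
rule_files:
  - example.rules.yml
tests:
  - interval: 1m
    input_series:
      # The target is down for the whole window.
      - series: 'up{job="example"}'
        values: "0+0x10"
    alert_rule_test:
      - alertname: ExampleTargetDown
        eval_time: 6m  # past the 5m "for" duration, so the alert must be firing
        exp_alerts:
          - exp_labels:
              severity: notify
              job: example
            exp_annotations:
              description: "Target for job example is down."

Such a pair is executed with "promtool test rules example.rules.test.yml" from the directory containing both files; promtool adds the alertname label to the expected alerts itself, so exp_labels only needs the series and rule labels.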