├── .bingo ├── .gitignore ├── README.md ├── Variables.mk ├── bingo.mod ├── bingo.sum ├── faillint.mod ├── faillint.sum ├── go.mod ├── goimports.mod ├── goimports.sum ├── gojq.mod ├── gojq.sum ├── gojsontoyaml.mod ├── gojsontoyaml.sum ├── golangci-lint.mod ├── golangci-lint.sum ├── jb.mod ├── jb.sum ├── jsonnet-deps.mod ├── jsonnet-deps.sum ├── jsonnet-lint.mod ├── jsonnet-lint.sum ├── jsonnet.mod ├── jsonnet.sum ├── jsonnetfmt.mod ├── jsonnetfmt.sum ├── promtool.mod ├── promtool.sum ├── variables.env ├── yq.mod └── yq.sum ├── .circleci └── config.yml ├── .github └── workflows │ └── ci.yml ├── .gitignore ├── .gitleaks.toml ├── CODEOWNERS ├── COPYRIGHT ├── LICENSE ├── Makefile ├── README.md ├── build_deploy.sh ├── configuration ├── observatorium │ ├── metric-federation-rules.libsonnet │ ├── queries-ruler.libsonnet │ ├── queries.libsonnet │ ├── rbac.go │ ├── ruler-remote-write.libsonnet │ ├── slo.go │ └── tenants.libsonnet ├── rhelemeter │ └── metrics.json ├── telemeter-rosa │ ├── README.md │ └── metrics.json └── telemeter │ └── metrics.json ├── crds ├── loki.grafana.com_alertingrules.libsonnet ├── loki.grafana.com_recordingrules.libsonnet └── observatorium-logs-crds-template.jsonnet ├── docs ├── observatorium-logs.png ├── observatorium.md ├── readme.md ├── rhelemeter.md ├── sop │ ├── observatorium.md │ ├── remote_write_load_shedding_sop.md │ └── tenant_removal_sop.md ├── telemeter.md └── telemeter.png ├── go.mod ├── go.sum ├── jsonnetfile.json ├── jsonnetfile.lock.json ├── lib └── k.libsonnet ├── loki-operational.json ├── magefiles ├── alertmanager.go ├── cache.go ├── gateway.go ├── lib.go ├── magefile.go ├── operator.go ├── secrets.go ├── servicemonitors.go ├── telemeter_rules.go ├── template.go └── thanos.go ├── mimic.go ├── observability ├── config.libsonnet ├── dashboards │ ├── observatorium-api-logs.libsonnet │ ├── observatorium-api.libsonnet │ ├── observatorium-gubernator.libsonnet │ ├── opentelemetry.libsonnet │ ├── rhobs-instance-utilization-overview.libsonnet │ ├── rules-objstore.libsonnet │ ├── slo.libsonnet │ ├── telemeter-canary.libsonnet │ ├── telemeter.libsonnet │ └── tracing.libsonnet ├── grafana-obs-logs.jsonnet ├── grafana.jsonnet ├── observatorium-logs │ ├── loki-overview.libsonnet │ └── loki-tenant-alerts.libsonnet ├── prometheus_rule_tests │ ├── observatorium-custom-metrics.prometheusrulestests.yaml │ ├── observatorium-tenants.prometheusrulestests.yaml │ ├── rhobs-slos-mst.prometheusrulestests.yaml │ └── rhobs-slos-telemeter.prometheusrulestests.yaml ├── prometheusrules.jsonnet └── utils.jsonnet ├── resources ├── observability │ ├── grafana │ │ ├── observatorium-logs │ │ │ ├── grafana-dashboards-rules-template.yaml │ │ │ └── grafana-dashboards-template.yaml │ │ └── observatorium │ │ │ ├── grafana-dashboard-observatorium-alertmanager-overview.configmap.yaml │ │ │ ├── grafana-dashboard-observatorium-api.configmap.yaml │ │ │ ├── grafana-dashboard-observatorium-gubernator.configmap.yaml │ │ │ ├── grafana-dashboard-observatorium-memcached-memcached-overview.configmap.yaml │ │ │ ├── grafana-dashboard-observatorium-thanos-compact.configmap.yaml │ │ │ ├── grafana-dashboard-observatorium-thanos-overview.configmap.yaml │ │ │ ├── grafana-dashboard-observatorium-thanos-query-frontend.configmap.yaml │ │ │ ├── grafana-dashboard-observatorium-thanos-query.configmap.yaml │ │ │ ├── grafana-dashboard-observatorium-thanos-receive-controller.configmap.yaml │ │ │ ├── grafana-dashboard-observatorium-thanos-receive.configmap.yaml │ │ │ ├── 
grafana-dashboard-observatorium-thanos-rule.configmap.yaml │ │ │ ├── grafana-dashboard-observatorium-thanos-store.configmap.yaml │ │ │ ├── grafana-dashboard-rhobs-instance-utilization-overview.configmap.yaml │ │ │ ├── grafana-dashboard-rules-objstore.configmap.yaml │ │ │ ├── grafana-dashboard-slo-mst-production.configmap.yaml │ │ │ ├── grafana-dashboard-slo-mst-stage.configmap.yaml │ │ │ ├── grafana-dashboard-slo-rhobsp02ue1-production.configmap.yaml │ │ │ ├── grafana-dashboard-slo-telemeter-production.configmap.yaml │ │ │ ├── grafana-dashboard-slo-telemeter-stage.configmap.yaml │ │ │ ├── grafana-dashboard-telemeter-canary.configmap.yaml │ │ │ ├── grafana-dashboard-telemeter.configmap.yaml │ │ │ ├── grafana-dashboard-tracing-jaeger.configmap.yaml │ │ │ └── grafana-dashboard-tracing-otel.configmap.yaml │ └── prometheusrules │ │ ├── observatorium-alertmanager-production.prometheusrules.yaml │ │ ├── observatorium-alertmanager-stage.prometheusrules.yaml │ │ ├── observatorium-custom-metrics-production.prometheusrules.yaml │ │ ├── observatorium-custom-metrics-stage.prometheusrules.yaml │ │ ├── observatorium-custom-metrics.prometheusrulestests.yaml │ │ ├── observatorium-gubernator-production.prometheusrules.yaml │ │ ├── observatorium-gubernator-stage.prometheusrules.yaml │ │ ├── observatorium-http-traffic-production.prometheusrules.yaml │ │ ├── observatorium-http-traffic-stage.prometheusrules.yaml │ │ ├── observatorium-obsctl-reloader-production.prometheusrules.yaml │ │ ├── observatorium-obsctl-reloader-stage.prometheusrules.yaml │ │ ├── observatorium-proactive-monitoring-production.prometheusrules.yaml │ │ ├── observatorium-proactive-monitoring-stage.prometheusrules.yaml │ │ ├── observatorium-tenants-production.prometheusrules.yaml │ │ ├── observatorium-tenants-stage.prometheusrules.yaml │ │ ├── observatorium-tenants.prometheusrulestests.yaml │ │ ├── observatorium-thanos-production.prometheusrules.yaml │ │ ├── observatorium-thanos-stage.prometheusrules.yaml │ │ ├── pyrra │ │ ├── mst-production-api-alerting-availability-slo.yaml │ │ ├── mst-production-api-alerting-notif-availability-slo.yaml │ │ ├── mst-production-api-logs-prom-tail-availability-slo.yaml │ │ ├── mst-production-api-logs-query-availability-slo.yaml │ │ ├── mst-production-api-logs-query-range-availability-slo.yaml │ │ ├── mst-production-api-logs-tail-availability-slo.yaml │ │ ├── mst-production-api-logs-write-availability-slo.yaml │ │ ├── mst-production-api-logs-write-latency-slo.yaml │ │ ├── mst-production-api-metrics-query-availability-slo.yaml │ │ ├── mst-production-api-metrics-query-range-availability-slo.yaml │ │ ├── mst-production-api-metrics-read-100M-latency-slo.yaml │ │ ├── mst-production-api-metrics-read-10M-latency-slo.yaml │ │ ├── mst-production-api-metrics-read-1M-latency-slo.yaml │ │ ├── mst-production-api-metrics-rule-query-availability-slo.yaml │ │ ├── mst-production-api-metrics-rule-read-100M-latency-slo.yaml │ │ ├── mst-production-api-metrics-rule-read-10M-latency-slo.yaml │ │ ├── mst-production-api-metrics-rule-read-1M-latency-slo.yaml │ │ ├── mst-production-api-metrics-write-availability-slo.yaml │ │ ├── mst-production-api-metrics-write-latency-slo.yaml │ │ ├── mst-production-api-rules-raw-read-availability-slo.yaml │ │ ├── mst-production-api-rules-raw-write-availability-slo.yaml │ │ ├── mst-production-api-rules-read-availability-slo.yaml │ │ ├── mst-production-api-rules-sync-availability-slo.yaml │ │ ├── mst-stage-api-alerting-availability-slo.yaml │ │ ├── mst-stage-api-alerting-notif-availability-slo.yaml │ │ 
├── mst-stage-api-logs-prom-tail-availability-slo.yaml │ │ ├── mst-stage-api-logs-query-availability-slo.yaml │ │ ├── mst-stage-api-logs-query-range-availability-slo.yaml │ │ ├── mst-stage-api-logs-tail-availability-slo.yaml │ │ ├── mst-stage-api-logs-write-availability-slo.yaml │ │ ├── mst-stage-api-logs-write-latency-slo.yaml │ │ ├── mst-stage-api-metrics-query-availability-slo.yaml │ │ ├── mst-stage-api-metrics-query-range-availability-slo.yaml │ │ ├── mst-stage-api-metrics-read-100M-latency-slo.yaml │ │ ├── mst-stage-api-metrics-read-10M-latency-slo.yaml │ │ ├── mst-stage-api-metrics-read-1M-latency-slo.yaml │ │ ├── mst-stage-api-metrics-rule-query-availability-slo.yaml │ │ ├── mst-stage-api-metrics-rule-read-100M-latency-slo.yaml │ │ ├── mst-stage-api-metrics-rule-read-10M-latency-slo.yaml │ │ ├── mst-stage-api-metrics-rule-read-1M-latency-slo.yaml │ │ ├── mst-stage-api-metrics-write-availability-slo.yaml │ │ ├── mst-stage-api-metrics-write-latency-slo.yaml │ │ ├── mst-stage-api-rules-raw-read-availability-slo.yaml │ │ ├── mst-stage-api-rules-raw-write-availability-slo.yaml │ │ ├── mst-stage-api-rules-read-availability-slo.yaml │ │ ├── mst-stage-api-rules-sync-availability-slo.yaml │ │ ├── rhelemeter-production-rhobs-rhelemeter-server-metrics-receive-availability-slo.yaml │ │ ├── rhelemeter-production-rhobs-rhelemeter-server-metrics-receive-latency-slo.yaml │ │ ├── rhelemeter-stage-rhobs-rhelemeter-server-metrics-receive-availability-slo.yaml │ │ ├── rhelemeter-stage-rhobs-rhelemeter-server-metrics-receive-latency-slo.yaml │ │ ├── rhobsp02ue1-production-api-alerting-availability-slo.yaml │ │ ├── rhobsp02ue1-production-api-alerting-notif-availability-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-query-availability-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-query-range-availability-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-read-100M-latency-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-read-10M-latency-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-read-1M-latency-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-rule-query-availability-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-rule-read-100M-latency-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-rule-read-10M-latency-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-rule-read-1M-latency-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-write-availability-slo.yaml │ │ ├── rhobsp02ue1-production-api-metrics-write-latency-slo.yaml │ │ ├── rhobsp02ue1-production-api-rules-raw-read-availability-slo.yaml │ │ ├── rhobsp02ue1-production-api-rules-raw-write-availability-slo.yaml │ │ ├── rhobsp02ue1-production-api-rules-read-availability-slo.yaml │ │ ├── rhobsp02ue1-production-api-rules-sync-availability-slo.yaml │ │ ├── telemeter-production-api-alerting-availability-slo.yaml │ │ ├── telemeter-production-api-alerting-notif-availability-slo.yaml │ │ ├── telemeter-production-api-metrics-query-availability-slo.yaml │ │ ├── telemeter-production-api-metrics-query-range-availability-slo.yaml │ │ ├── telemeter-production-api-metrics-read-100M-latency-slo.yaml │ │ ├── telemeter-production-api-metrics-read-10M-latency-slo.yaml │ │ ├── telemeter-production-api-metrics-read-1M-latency-slo.yaml │ │ ├── telemeter-production-api-metrics-rule-query-availability-slo.yaml │ │ ├── telemeter-production-api-metrics-rule-read-100M-latency-slo.yaml │ │ ├── telemeter-production-api-metrics-rule-read-10M-latency-slo.yaml │ │ ├── telemeter-production-api-metrics-rule-read-1M-latency-slo.yaml │ │ ├── 
telemeter-production-api-metrics-write-availability-slo.yaml │ │ ├── telemeter-production-api-metrics-write-latency-slo.yaml │ │ ├── telemeter-production-api-rules-raw-read-availability-slo.yaml │ │ ├── telemeter-production-api-rules-raw-write-availability-slo.yaml │ │ ├── telemeter-production-api-rules-read-availability-slo.yaml │ │ ├── telemeter-production-api-rules-sync-availability-slo.yaml │ │ ├── telemeter-production-rhobs-telemeter-server-metrics-receive-availability-slo.yaml │ │ ├── telemeter-production-rhobs-telemeter-server-metrics-receive-latency-slo.yaml │ │ ├── telemeter-production-rhobs-telemeter-server-metrics-upload-availability-slo.yaml │ │ ├── telemeter-production-rhobs-telemeter-server-metrics-upload-latency-slo.yaml │ │ ├── telemeter-staging-api-alerting-availability-slo.yaml │ │ ├── telemeter-staging-api-alerting-notif-availability-slo.yaml │ │ ├── telemeter-staging-api-metrics-query-availability-slo.yaml │ │ ├── telemeter-staging-api-metrics-query-range-availability-slo.yaml │ │ ├── telemeter-staging-api-metrics-read-100M-latency-slo.yaml │ │ ├── telemeter-staging-api-metrics-read-10M-latency-slo.yaml │ │ ├── telemeter-staging-api-metrics-read-1M-latency-slo.yaml │ │ ├── telemeter-staging-api-metrics-rule-query-availability-slo.yaml │ │ ├── telemeter-staging-api-metrics-rule-read-100M-latency-slo.yaml │ │ ├── telemeter-staging-api-metrics-rule-read-10M-latency-slo.yaml │ │ ├── telemeter-staging-api-metrics-rule-read-1M-latency-slo.yaml │ │ ├── telemeter-staging-api-metrics-write-availability-slo.yaml │ │ ├── telemeter-staging-api-metrics-write-latency-slo.yaml │ │ ├── telemeter-staging-api-rules-raw-read-availability-slo.yaml │ │ ├── telemeter-staging-api-rules-raw-write-availability-slo.yaml │ │ ├── telemeter-staging-api-rules-read-availability-slo.yaml │ │ ├── telemeter-staging-api-rules-sync-availability-slo.yaml │ │ ├── telemeter-staging-rhobs-telemeter-server-metrics-receive-availability-slo.yaml │ │ ├── telemeter-staging-rhobs-telemeter-server-metrics-receive-latency-slo.yaml │ │ ├── telemeter-staging-rhobs-telemeter-server-metrics-upload-availability-slo.yaml │ │ └── telemeter-staging-rhobs-telemeter-server-metrics-upload-latency-slo.yaml │ │ ├── rhobs-logs-mst-production.prometheusrules.yaml │ │ ├── rhobs-logs-mst-stage.prometheusrules.yaml │ │ ├── rhobs-slos-mst-production.prometheusrules.yaml │ │ ├── rhobs-slos-mst-stage.prometheusrules.yaml │ │ ├── rhobs-slos-mst.prometheusrulestests.yaml │ │ ├── rhobs-slos-rhelemeter-production.prometheusrules.yaml │ │ ├── rhobs-slos-rhelemeter-stage.prometheusrules.yaml │ │ ├── rhobs-slos-telemeter-production.prometheusrules.yaml │ │ ├── rhobs-slos-telemeter-stage.prometheusrules.yaml │ │ └── rhobs-slos-telemeter.prometheusrulestests.yaml ├── operations │ ├── bucket-inspect │ │ ├── README.md │ │ ├── cron-job-template.yaml │ │ ├── job-template.yaml │ │ └── s3-secret-template.yaml │ ├── bucket-replicate │ │ ├── README.md │ │ ├── cron-job-template.yaml │ │ ├── job-template.yaml │ │ ├── monitoring-template.yaml │ │ └── s3-secret-template.yaml │ └── rclone-bucket-replicate │ │ ├── README.md │ │ ├── job-template.env │ │ ├── job-template.yaml │ │ ├── monitoring-template.yaml │ │ ├── rclone-config-template.yaml │ │ ├── s3-secret-template.env │ │ └── s3-secret-template.yaml └── services │ ├── alertmanager │ ├── production │ │ ├── alertmanager-template.yaml │ │ └── service-monitor-template.yaml │ └── staging │ │ ├── alertmanager-template.yaml │ │ └── service-monitor-template.yaml │ ├── bundle │ ├── local │ │ ├── operator.yaml │ │ 
└── thanos-operator-crds.yaml │ ├── production │ │ ├── operator.yaml │ │ └── thanos-operator-crds.yaml │ └── staging │ │ ├── operator.yaml │ │ └── thanos-operator-crds.yaml │ ├── memcached │ ├── production │ │ ├── memcached-template.yaml │ │ └── service-monitor-memcached-template.yaml │ └── staging │ │ ├── memcached-template.yaml │ │ └── service-monitor-memcached-template.yaml │ ├── meta-monitoring │ ├── logging-template.yaml │ ├── profiling-template.yaml │ └── tracing-template.yaml │ ├── metric-federation-rule-template.yaml │ ├── objstore │ ├── local │ │ ├── thanos-default-secret.yaml │ │ └── thanos-telemeter-secret.yaml │ ├── production │ │ ├── thanos-default-secret.yaml │ │ └── thanos-telemeter-secret.yaml │ ├── staging │ │ ├── thanos-default-secret.yaml │ │ └── thanos-telemeter-secret.yaml │ └── thanos-object-store-secret.yaml │ ├── observatorium-api │ ├── production │ │ ├── observatorium-api-template.yaml │ │ └── service-monitor-observatorium-api-template.yaml │ └── staging │ │ ├── observatorium-api-template.yaml │ │ └── service-monitor-observatorium-api-template.yaml │ ├── observatorium-logs-template.yaml │ ├── observatorium-metrics-template.yaml │ ├── observatorium-template.yaml │ ├── observatorium-tenants-template.yaml │ ├── observatorium-traces-subscriptions-template.yaml │ ├── observatorium-traces-template.yaml │ ├── redis │ └── staging │ │ └── cache.yaml │ ├── rhelemeter-template.yaml │ ├── rhobs-thanos-operator │ ├── local │ │ ├── rhobs.yaml │ │ └── telemeter-rules.yaml │ ├── production │ │ └── rhobs.yaml │ └── staging │ │ ├── rhobs.yaml │ │ └── telemeter-rules.yaml │ ├── servicemonitors │ ├── local │ │ └── servicemonitors.yaml │ ├── production │ │ └── servicemonitors.yaml │ └── staging │ │ └── servicemonitors.yaml │ └── telemeter-template.yaml ├── services ├── components │ └── loki-caches.libsonnet ├── dex-template.jsonnet ├── metric-federation-rule-template.jsonnet ├── minio-template.jsonnet ├── observatorium-logs-template-overwrites.libsonnet ├── observatorium-logs-template.jsonnet ├── observatorium-logs.libsonnet ├── observatorium-metrics-template-overwrites.libsonnet ├── observatorium-metrics-template.jsonnet ├── observatorium-metrics.libsonnet ├── observatorium-template.jsonnet ├── observatorium-tenants-template.jsonnet ├── observatorium-traces-subscriptions-template.jsonnet ├── observatorium-traces-subscriptions.libsonnet ├── observatorium-traces-template.jsonnet ├── observatorium-traces.libsonnet ├── observatorium.libsonnet ├── prometheus │ ├── remote-write-proxy.libsonnet │ └── remote_write_proxy.conf ├── rhelemeter-template.jsonnet ├── rhelemeter.libsonnet ├── sidecars │ ├── jaeger-agent.libsonnet │ ├── oauth-proxy.libsonnet │ ├── opa-ams.libsonnet │ └── thanos-rule-syncer.libsonnet ├── telemeter-template.jsonnet └── telemeter.libsonnet ├── services_go ├── instances │ └── rhobs │ │ └── rhobs.go ├── observatorium │ ├── api.go │ ├── assets │ │ └── store-auto-shard-relabel-configMap.sh │ ├── cache.go │ ├── encoders.go │ ├── helpers.go │ ├── metrics.go │ ├── observatorium.go │ └── sidecars.go └── services.go ├── synchronize.sh └── tests ├── ci ├── README.md ├── ci_test.sh ├── env │ ├── dex.test.ci.env │ ├── logging.test.ci.env │ ├── minio.test.ci.env │ ├── observatorium-logs.test.ci.env │ ├── observatorium-metric-federation-rule.test.ci.env │ ├── observatorium-metrics.ci.env │ ├── observatorium-parca.test.ci.env │ ├── observatorium.test.ci.env │ ├── rhelemeter.test.ci.env │ └── telemeter.ci.env ├── manifests │ ├── observatorium-up-logs.yaml │ ├── 
observatorium-up-metrics.yaml │ ├── pre-requisites.yaml │ ├── rbac.yaml │ └── test-tenant.yaml └── rhobsci.png ├── deploy ├── README.md ├── env │ ├── logging.test.env │ ├── observatorium-jaeger.test.env │ ├── observatorium-logs.test.env │ ├── observatorium-metric-federation-rule.test.env │ ├── observatorium-metrics.test.env │ ├── observatorium-parca.test.env │ ├── observatorium.test.env │ ├── rhelemeter.test.env │ └── telemeter.test.env ├── launch.sh ├── manifests │ ├── clusterlogforwader.yaml │ ├── clusterlogging.yaml │ ├── dex-template.yaml │ ├── logging-operator.yaml │ ├── loki-operator.yaml │ ├── minio-template.yaml │ ├── observatorium-alertmanager-config-secret.yaml │ ├── observatorium-cluster-role-binding.yaml │ ├── observatorium-cluster-role.yaml │ ├── observatorium-logs-secret.yaml │ ├── observatorium-metrics-thanos-objectstorage-secret-template.yaml │ ├── observatorium-parca-secret.yaml │ ├── observatorium-rhobs-tenant-secret.yaml │ ├── observatorium-rules-objstore-secret.yaml │ ├── observatorium-service-account.yaml │ ├── observatorium-tools-network-policy.yaml │ ├── rhelemeter_certs │ │ ├── ca.crt │ │ ├── tls.crt │ │ └── tls.key │ └── telemeter-token-refersher-oidc-secret.yaml └── testdata │ └── client-info.json └── integration_tests ├── Dockerfile ├── README.md ├── build_deploy.sh ├── framework ├── .bingo │ ├── .gitignore │ ├── README.md │ ├── Variables.mk │ ├── go.mod │ ├── gojsontoyaml.mod │ ├── gojsontoyaml.sum │ ├── jsonnet-lint.mod │ ├── jsonnet-lint.sum │ ├── jsonnet.mod │ ├── jsonnet.sum │ ├── jsonnetfmt.mod │ ├── jsonnetfmt.sum │ └── variables.env ├── Dockerfile ├── Makefile ├── README.md ├── cmd │ └── rhobs-test │ │ └── main.go ├── examples │ └── manifests │ │ ├── dev │ │ ├── test-deployment-faulty.yaml │ │ ├── test-deployment.yaml │ │ ├── test-job.yaml │ │ └── test-rbac.yaml │ │ └── openshift │ │ ├── rhobs-rbac-template.yaml │ │ ├── rhobs-test-job-template.yaml │ │ ├── test-deployment-faulty-template.yaml │ │ └── test-deployment-template.yaml ├── go.mod ├── go.sum ├── integration-test.png ├── jsonnet │ ├── dev-manifests.jsonnet │ ├── job.libsonnet │ ├── ocp-manifests.jsonnet │ ├── rbac.libsonnet │ └── test-deployment.libsonnet └── pkg │ ├── client │ └── client.go │ ├── deployment │ ├── deployment.go │ └── deployment_test.go │ ├── logger │ └── logger.go │ ├── pod │ ├── pod.go │ └── pod_test.go │ ├── service │ ├── service.go │ └── service_test.go │ └── statefulset │ ├── statefulset.go │ └── statefulset_test.go ├── post-deploy-host-metering-job-template.yaml ├── post-deploy-logs-job-template.yaml ├── post-deploy-metrics-job-template.yaml └── runtest.sh /.bingo/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Ignore everything 3 | * 4 | 5 | # But not these files: 6 | !.gitignore 7 | !*.mod 8 | !*.sum 9 | !README.md 10 | !Variables.mk 11 | !variables.env 12 | 13 | *tmp.mod 14 | -------------------------------------------------------------------------------- /.bingo/README.md: -------------------------------------------------------------------------------- 1 | # Project Development Dependencies. 2 | 3 | This directory stores Go modules with pinned, buildable packages that are used within this repository, managed by https://github.com/bwplotka/bingo. 4 | 5 | * Run `bingo get` to install all tools; each has its own module file in this directory. 6 | * Run `bingo get NAME` to install only the tool whose `NAME.mod` module file lives in this directory (e.g. `bingo get promtool`).
7 | * For Makefile: Make sure to put `include .bingo/Variables.mk` in your Makefile, then use the `$(TOOL)` variable, where `TOOL` is the name of the `.bingo/TOOL.mod` file (e.g. `$(PROMTOOL)` for `.bingo/promtool.mod`). 8 | * For shell: Run `source .bingo/variables.env` to source all environment variables for each tool (see the shell sketch after the `.bingo` files below). 9 | * For go: Import `.bingo/variables.go` for variable names. 10 | * See https://github.com/bwplotka/bingo or run `bingo -h` for how to add, remove, or change binary dependencies. 11 | 12 | ## Requirements 13 | 14 | * Go 1.14+ 15 | -------------------------------------------------------------------------------- /.bingo/bingo.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.15 4 | 5 | require github.com/bwplotka/bingo v0.8.0 6 | -------------------------------------------------------------------------------- /.bingo/faillint.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.19 4 | 5 | require github.com/fatih/faillint v1.14.0 6 | -------------------------------------------------------------------------------- /.bingo/go.mod: -------------------------------------------------------------------------------- 1 | module _ // Fake go.mod auto-created by 'bingo' for go -moddir compatibility with non-Go projects. Commit this file, together with other .mod files. -------------------------------------------------------------------------------- /.bingo/goimports.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.19 4 | 5 | require golang.org/x/tools v0.9.3 // cmd/goimports 6 | -------------------------------------------------------------------------------- /.bingo/goimports.sum: -------------------------------------------------------------------------------- 1 | golang.org/x/mod v0.8.0 h1:LUYupSeNrTNCGzR/hVBk2NHZO4hXcVaW1k4Qx7rjPx8= 2 | golang.org/x/mod v0.9.0 h1:KENHtAZL2y3NLMYZeHY9DW8HW8V+kQyJsY/V9JlKvCs= 3 | golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk= 4 | golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 5 | golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU= 6 | golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= 7 | golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= 8 | golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 9 | golang.org/x/tools v0.6.0 h1:BOw41kyTf3PuCW1pVQf8+Cyg8pMlkYB1oo9iJ6D/lKM= 10 | golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 11 | golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4= 12 | golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= 13 | golang.org/x/tools v0.9.3 h1:Gn1I8+64MsuTb/HpH+LmQtNas23LhUVr3rYZ0eKuaMM= 14 | golang.org/x/tools v0.9.3/go.mod h1:owI94Op576fPu3cIGQeHs3joujW/2Oc6MtlxbF5dfNc= 15 | -------------------------------------------------------------------------------- /.bingo/gojq.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo.
DO NOT EDIT 2 | 3 | go 1.15 4 | 5 | require github.com/itchyny/gojq v0.12.12 // cmd/gojq 6 | -------------------------------------------------------------------------------- /.bingo/gojsontoyaml.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.14 4 | 5 | require github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c 6 | -------------------------------------------------------------------------------- /.bingo/gojsontoyaml.sum: -------------------------------------------------------------------------------- 1 | github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c h1:hb6WqfcKQZlNx/vahy51SaIvKnoXD5609Nm0PC4msEM= 2 | github.com/brancz/gojsontoyaml v0.0.0-20200602132005-3697ded27e8c/go.mod h1:+00lOjYXPgMfxHVPvg9GDtc3BX5Xh5aFpB4gMB8gfMo= 3 | github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= 4 | github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= 5 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 6 | gopkg.in/yaml.v2 v2.3.0 h1:clyUAQHOM3G0M3f5vQj7LuJrETvjVot3Z5el9nffUtU= 7 | gopkg.in/yaml.v2 v2.3.0/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 8 | -------------------------------------------------------------------------------- /.bingo/golangci-lint.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.22.9 4 | 5 | toolchain go1.23.1 6 | 7 | require github.com/golangci/golangci-lint v1.63.4 // cmd/golangci-lint 8 | -------------------------------------------------------------------------------- /.bingo/jb.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.15 4 | 5 | require github.com/jsonnet-bundler/jsonnet-bundler v0.5.1 // cmd/jb 6 | -------------------------------------------------------------------------------- /.bingo/jsonnet-deps.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.15 4 | 5 | require github.com/google/go-jsonnet v0.19.1 // cmd/jsonnet-deps 6 | -------------------------------------------------------------------------------- /.bingo/jsonnet-lint.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.16 4 | 5 | require github.com/google/go-jsonnet v0.19.1 // cmd/jsonnet-lint 6 | -------------------------------------------------------------------------------- /.bingo/jsonnet.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.15 4 | 5 | require github.com/google/go-jsonnet v0.19.1 // cmd/jsonnet 6 | -------------------------------------------------------------------------------- /.bingo/jsonnetfmt.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. 
DO NOT EDIT 2 | 3 | go 1.15 4 | 5 | require github.com/google/go-jsonnet v0.19.1 // cmd/jsonnetfmt 6 | -------------------------------------------------------------------------------- /.bingo/promtool.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.14 4 | 5 | replace k8s.io/klog => github.com/simonpasquier/klog-gokit v0.3.0 6 | 7 | replace k8s.io/klog/v2 => github.com/simonpasquier/klog-gokit/v3 v3.0.0 8 | 9 | exclude github.com/linode/linodego v1.0.0 10 | 11 | exclude github.com/grpc-ecosystem/grpc-gateway v1.14.7 12 | 13 | exclude google.golang.org/api v0.30.0 14 | 15 | require github.com/prometheus/prometheus v0.43.0 // cmd/promtool 16 | -------------------------------------------------------------------------------- /.bingo/variables.env: -------------------------------------------------------------------------------- 1 | # Auto generated binary variables helper managed by https://github.com/bwplotka/bingo v0.9. DO NOT EDIT. 2 | # All tools are designed to be built inside $GOBIN. 3 | # These variables will only work once 'bingo get' has been invoked, or if the tools were installed via the Makefile's Variables.mk. 4 | GOBIN=${GOBIN:=$(go env GOBIN)} 5 | 6 | if [ -z "$GOBIN" ]; then 7 | GOBIN="$(go env GOPATH)/bin" 8 | fi 9 | 10 | 11 | BINGO="${GOBIN}/bingo-v0.8.0" 12 | 13 | FAILLINT="${GOBIN}/faillint-v1.14.0" 14 | 15 | GOIMPORTS="${GOBIN}/goimports-v0.9.3" 16 | 17 | GOJQ="${GOBIN}/gojq-v0.12.12" 18 | 19 | GOJSONTOYAML="${GOBIN}/gojsontoyaml-v0.0.0-20200602132005-3697ded27e8c" 20 | 21 | GOLANGCI_LINT="${GOBIN}/golangci-lint-v1.63.4" 22 | 23 | JB="${GOBIN}/jb-v0.5.1" 24 | 25 | JSONNET_DEPS="${GOBIN}/jsonnet-deps-v0.19.1" 26 | 27 | JSONNET_LINT="${GOBIN}/jsonnet-lint-v0.19.1" 28 | 29 | JSONNET="${GOBIN}/jsonnet-v0.19.1" 30 | 31 | JSONNETFMT="${GOBIN}/jsonnetfmt-v0.19.1" 32 | 33 | PROMTOOL="${GOBIN}/promtool-v0.43.0" 34 | 35 | YQ="${GOBIN}/yq-v4.33.1" 36 | 37 | -------------------------------------------------------------------------------- /.bingo/yq.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo.
DO NOT EDIT 2 | 3 | go 1.15 4 | 5 | require github.com/mikefarah/yq/v4 v4.33.1 6 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | push: 5 | branches: [ main ] 6 | pull_request: 7 | branches: [ main ] 8 | workflow_dispatch: 9 | 10 | jobs: 11 | build: 12 | runs-on: ubuntu-latest 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Go 16 | uses: actions/setup-go@v2 17 | with: 18 | go-version: 1.23 19 | - name: Vendor 20 | run: make vendor_jsonnet 21 | - name: Build 22 | run: make grafana manifests prometheusrules 23 | - name: Format 24 | run: make format 25 | - name: Lint 26 | run: make lint 27 | - name: Validate 28 | run: make validate 29 | - name: Diff 30 | run: git diff --exit-code 31 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | vendor/ 2 | vendor_jsonnet/ 3 | .bin 4 | .idea 5 | .envrc 6 | resources/.tmp/ 7 | tmp/ 8 | .fleet 9 | .vscode 10 | .DS_Store 11 | -------------------------------------------------------------------------------- /.gitleaks.toml: -------------------------------------------------------------------------------- 1 | title = "gitleaks config" 2 | [allowlist] 3 | paths=[ 4 | '''dex-template.jsonnet''', 5 | '''observatorium-template.yaml''', 6 | '''dex-template.yaml''', 7 | ] -------------------------------------------------------------------------------- /CODEOWNERS: -------------------------------------------------------------------------------- 1 | - bwplotka 2 | - kakkoyun 3 | - squat 4 | - onprem 5 | - spaparaju -------------------------------------------------------------------------------- /COPYRIGHT: -------------------------------------------------------------------------------- 1 | Copyright (c) The Red Hat Monitoring Team 2 | Licensed under the Apache License 2.0. 
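The `.bingo` files above pin one buildable version per tool, and the CI workflow consumes them through `make`. A minimal shell sketch of the intended developer workflow; the formatting target and file path are illustrative, not commands taken from this repository's Makefile:

```sh
# Build and install every pinned tool into $GOBIN,
# one versioned binary per .bingo/*.mod file (e.g. jsonnet-v0.19.1).
bingo get

# Source the generated helper so each pinned binary is addressable by name.
source .bingo/variables.env

# Invoke the pinned version instead of whatever happens to be on $PATH;
# the jsonnet entrypoint below is an illustrative choice from this repo.
"$JSONNETFMT" -i services/telemeter-template.jsonnet
```

Because every contributor renders and formats with identical tool versions, the `Diff` step in the CI workflow above can simply fail on `git diff --exit-code`.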
-------------------------------------------------------------------------------- /build_deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This is a placeholder file needed by the CI/CD templates 3 | # You can safely ignore this file :) 4 | exit 0 -------------------------------------------------------------------------------- /configuration/observatorium/metric-federation-rules.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | prometheus+:: { 3 | recordingrules+: { 4 | groups+: [ 5 | { 6 | name: 'rhacs.rules', 7 | interval: '1m', 8 | rules: [ 9 | { 10 | record: 'rhacs:rox_central_cluster_metrics_cpu_capacity:avg_over_time1h', 11 | expr: ||| 12 | rhacs:rox_central_cluster_metrics_cpu_capacity:avg_over_time1h 13 | |||, 14 | }, 15 | ], 16 | }, 17 | ], 18 | }, 19 | }, 20 | } 21 | -------------------------------------------------------------------------------- /configuration/observatorium/queries-ruler.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | queries: [ 3 | { 4 | name: 'rule-query-path-sli-1M-samples', 5 | query: 'avg_over_time(avalanche_metric_mmmmm_0_0{tenant_id="0fc2b00e-201b-4c17-b9f2-19d91adc4fd2"}[1h])', 6 | }, 7 | { 8 | name: 'rule-query-path-sli-10M-samples', 9 | query: 'avg_over_time(avalanche_metric_mmmmm_0_0{tenant_id="0fc2b00e-201b-4c17-b9f2-19d91adc4fd2"}[10h])', 10 | }, 11 | { 12 | name: 'rule-query-path-sli-100M-samples', 13 | query: 'avg_over_time(avalanche_metric_mmmmm_0_0{tenant_id="0fc2b00e-201b-4c17-b9f2-19d91adc4fd2"}[100h])', 14 | }, 15 | ], 16 | } 17 | -------------------------------------------------------------------------------- /configuration/observatorium/queries.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | queries: [ 3 | { 4 | name: 'query-path-sli-1M-samples', 5 | query: 'avg_over_time(avalanche_metric_mmmmm_0_0{tenant_id="0fc2b00e-201b-4c17-b9f2-19d91adc4fd2"}[1h])', 6 | }, 7 | { 8 | name: 'query-path-sli-10M-samples', 9 | query: 'avg_over_time(avalanche_metric_mmmmm_0_0{tenant_id="0fc2b00e-201b-4c17-b9f2-19d91adc4fd2"}[10h])', 10 | }, 11 | { 12 | name: 'query-path-sli-100M-samples', 13 | query: 'avg_over_time(avalanche_metric_mmmmm_0_0{tenant_id="0fc2b00e-201b-4c17-b9f2-19d91adc4fd2"}[100h])', 14 | }, 15 | ], 16 | } 17 | -------------------------------------------------------------------------------- /configuration/observatorium/tenants.libsonnet: -------------------------------------------------------------------------------- 1 | { 2 | tenants: [ 3 | { 4 | name: 'rhobs', 5 | id: '770c1124-6ae8-4324-a9d4-9ce08590094b', 6 | oidc: { 7 | clientID: 'test', 8 | clientSecret: 'ZXhhbXBsZS1hcHAtc2VjcmV0', 9 | issuerURL: 'http://dex.dex.svc.cluster.local:5556/dex', 10 | usernameClaim: 'email', 11 | }, 12 | }, 13 | { 14 | name: 'telemeter', 15 | id: 'FB870BF3-9F3A-44FF-9BF7-D7A047A52F43', 16 | oidc: { 17 | clientID: 'test', 18 | clientSecret: 'ZXhhbXBsZS1hcHAtc2VjcmV0', 19 | issuerURL: 'http://dex.dex.svc.cluster.local:5556/dex', 20 | usernameClaim: 'email', 21 | }, 22 | }, 23 | ], 24 | // Collect all tenants in a map for convenient access. 
25 | map:: { 26 | [tenant.name]: tenant 27 | for tenant in self.tenants 28 | }, 29 | } 30 | -------------------------------------------------------------------------------- /configuration/rhelemeter/metrics.json: -------------------------------------------------------------------------------- 1 | [ 2 | "{__name__=\"system_cpu_logical_count\"}" 3 | ] 4 | -------------------------------------------------------------------------------- /configuration/telemeter-rosa/README.md: -------------------------------------------------------------------------------- 1 | # ROSA HCP Billing Metrics 2 | A set of ROSA metrics is federated from OBO into `telemeter-staging`, `telemeter-int` and `telemeter-prod`. These metrics are used for Subwatch billing of ROSA clusters via telemetry. 3 | 4 | The remote-write config can be found [here](https://gitlab.cee.redhat.com/service/osd-fleet-manager/-/blob/main/config/resources/managed-cluster-monitoring-stack.yaml). *Do not modify* without express approval from the ROSA team in the #sd-rosa-hcp channel. -------------------------------------------------------------------------------- /configuration/telemeter-rosa/metrics.json: -------------------------------------------------------------------------------- 1 | [ 2 | "{__name__=\"hostedcluster:hypershift_cluster_vcpus:max\"}" 3 | ] 4 | -------------------------------------------------------------------------------- /crds/observatorium-logs-crds-template.jsonnet: -------------------------------------------------------------------------------- 1 | local ar = (import 'loki.grafana.com_alertingrules.libsonnet'); 2 | local rr = (import 'loki.grafana.com_recordingrules.libsonnet'); 3 | 4 | { 5 | local clusterRole = { 6 | apiVersion: 'rbac.authorization.k8s.io/v1', 7 | kind: 'ClusterRole', 8 | metadata: { 9 | name: 'observatorium-logs-edit', 10 | labels: { 11 | 'managed.openshift.io/aggregate-to-dedicated-admins': 'cluster', 12 | }, 13 | }, 14 | rules: [ 15 | { 16 | apiGroups: ['loki.grafana.com'], 17 | resources: ['alertingrules', 'recordingrules'], 18 | verbs: ['create', 'update', 'delete', 'patch', 'get', 'list', 'watch'], 19 | }, 20 | ], 21 | }, 22 | 23 | local withServedV1Beta1 = function(crd) crd { 24 | spec+: { 25 | conversion:: {}, 26 | versions: [ 27 | v + (if v.name == 'v1beta1' then { 28 | served: true, 29 | } else {}) 30 | for v in super.versions 31 | ], 32 | }, 33 | }, 34 | 35 | apiVersion: 'template.openshift.io/v1', 36 | kind: 'Template', 37 | metadata: { 38 | name: 'observatorium-logs-crds', 39 | }, 40 | objects: [ 41 | withServedV1Beta1(ar), 42 | withServedV1Beta1(rr), 43 | clusterRole, 44 | ], 45 | } 46 | -------------------------------------------------------------------------------- /docs/observatorium-logs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhobs/configuration/6d2b136e48c291c1a415394fed40219188620665/docs/observatorium-logs.png -------------------------------------------------------------------------------- /docs/readme.md: -------------------------------------------------------------------------------- 1 | # Observatorium 2 | 3 | ## Table of Contents 4 | 5 | - [Observatorium](./observatorium.md) 6 | - [Observatorium Logs](./observatorium.md#observatorium-logs) 7 | - [Observatorium Metrics](./observatorium.md#observatorium-metrics) 8 | - [Telemeter](./telemeter.md) 9 | - [Rhelemeter](./rhelemeter.md) 10 | --------------------------------------------------------------------------------
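Jsonnet sources like the CRD template above are rendered into the YAML checked in under `resources/` using the pinned tools. A sketch of that render step under stated assumptions: the real targets and output paths are defined in the repository's Makefile, and the `-J vendor_jsonnet` search path assumes `make vendor_jsonnet` has already run, as in the CI workflow:

```sh
source .bingo/variables.env

# Evaluate the OpenShift Template to JSON, then convert it to YAML.
# The output filename is illustrative; the Makefile owns the real paths.
"$JSONNET" -J vendor_jsonnet crds/observatorium-logs-crds-template.jsonnet \
  | "$GOJSONTOYAML" > observatorium-logs-crds-template.yaml
```

The same pattern applies to the `services/*.jsonnet` entrypoints that produce the templates under `resources/services`.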
/docs/rhelemeter.md: -------------------------------------------------------------------------------- 1 | # rhelemeter 2 | 3 | Rhelemeter is a specialized instance of [Telemeter](telemeter.md). While Telemeter receives metrics from OCP clusters, Rhelemeter 4 | receives metrics from RHEL hosts. 5 | 6 | The source code and more details can be found at [its upstream repository](https://github.com/openshift/telemeter/tree/master/cmd/rhelemeter-server). 7 | -------------------------------------------------------------------------------- /docs/telemeter.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhobs/configuration/6d2b136e48c291c1a415394fed40219188620665/docs/telemeter.png -------------------------------------------------------------------------------- /lib/k.libsonnet: -------------------------------------------------------------------------------- 1 | (import 'github.com/jsonnet-libs/k8s-libsonnet/1.26/main.libsonnet') 2 | -------------------------------------------------------------------------------- /mimic.go: -------------------------------------------------------------------------------- 1 | package main 2 | 3 | import ( 4 | "github.com/bwplotka/mimic" 5 | cfgobservatorium "github.com/rhobs/configuration/configuration/observatorium" 6 | ) 7 | 8 | func main() { 9 | gen := mimic.New() 10 | 11 | defer gen.Generate() 12 | 13 | cfgobservatorium.GenSLO(gen.With("observability", "prometheusrules", "pyrra"), gen.With("observability", "prometheusrules")) 14 | 15 | cfgobservatorium.GenerateRBACFile(gen.With(".tmp", "tenants")) 16 | 17 | } 18 | -------------------------------------------------------------------------------- /observability/observatorium-logs/loki-tenant-alerts.libsonnet: -------------------------------------------------------------------------------- 1 | function(namespace) { 2 | prometheusAlerts+:: { 3 | groups+: [ 4 | { 5 | name: 'loki_tenant_alerts', 6 | rules: [ 7 | { 8 | alert: 'LokiTenantRateLimitWarning', 9 | expr: ||| 10 | sum by (namespace, tenant, reason) (sum_over_time(rate(loki_discarded_samples_total{namespace="%s"}[1m])[30m:1m])) 11 | > 100 12 | ||| % namespace, 13 | 'for': '15m', 14 | labels: { 15 | severity: 'medium', 16 | }, 17 | annotations: { 18 | message: ||| 19 | {{ $labels.tenant }} is experiencing rate limiting for reason '{{ $labels.reason }}': {{ printf "%.0f" $value }} 20 | |||, 21 | }, 22 | }, 23 | ], 24 | }, 25 | ], 26 | }, 27 | } 28 | -------------------------------------------------------------------------------- /observability/utils.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | instanceNamespace(name, metricsNamespace, upNamespace): if name == 'telemeter' then metricsNamespace else upNamespace, 3 | instance_name_filter: '/^rhobs.*|telemeter-prod-01-prometheus|app-sre-stage-01-prometheus/', 4 | } 5 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/observatorium-gubernator-production.prometheusrules.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: /openshift/prometheus-rule-1.yml 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PrometheusRule 5 | metadata: 6 | labels: 7 | prometheus: app-sre 8 | role: alert-rules 9 | name: observatorium-gubernator-production 10 | spec: 11 | groups: 12 | - name: gubernator-absent 13 | rules: 14 | - alert: gubernatorIsDown 15 | annotations: 16 | dashboard: 
https://grafana.app-sre.devshift.net/d/no-dashboard/gubernator-absent?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 17 | message: gubernator has disappeared from Prometheus target discovery. 18 | runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#gubernatorisdown 19 | expr: | 20 | absent(up{job="observatorium-gubernator"} == 1) 21 | for: 5m 22 | labels: 23 | service: telemeter 24 | severity: critical 25 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/observatorium-gubernator-stage.prometheusrules.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: /openshift/prometheus-rule-1.yml 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PrometheusRule 5 | metadata: 6 | labels: 7 | prometheus: app-sre 8 | role: alert-rules 9 | name: observatorium-gubernator-stage 10 | spec: 11 | groups: 12 | - name: gubernator-absent 13 | rules: 14 | - alert: gubernatorIsDown 15 | annotations: 16 | dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/gubernator-absent?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 17 | message: gubernator has disappeared from Prometheus target discovery. 18 | runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#gubernatorisdown 19 | expr: | 20 | absent(up{job="observatorium-gubernator"} == 1) 21 | for: 5m 22 | labels: 23 | service: telemeter 24 | severity: high 25 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/observatorium-http-traffic-production.prometheusrules.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: /openshift/prometheus-rule-1.yml 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PrometheusRule 5 | metadata: 6 | labels: 7 | prometheus: app-sre 8 | role: alert-rules 9 | name: observatorium-http-traffic-production 10 | spec: 11 | groups: 12 | - name: observatorium-http-traffic 13 | rules: 14 | - alert: ObservatoriumHttpTrafficErrorRateHigh 15 | annotations: 16 | dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/observatorium-http-traffic?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 17 | message: Observatorium route {{$labels.route}} is failing to handle {{$value | humanize}}% of requests.
18 | runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#observatoriumhttptrafficerrorratehigh 19 | expr: | 20 | (sum by (route) (rate(haproxy_backend_http_responses_total{route=~"observatorium.*|telemeter.*|infogw.*", code="5xx"} [5m])) / sum by (route) (rate(haproxy_backend_http_responses_total{route=~"observatorium.*|telemeter.*|infogw.*"}[5m]))) * 100 > 25 21 | labels: 22 | service: telemeter 23 | severity: medium 24 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/observatorium-http-traffic-stage.prometheusrules.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: /openshift/prometheus-rule-1.yml 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PrometheusRule 5 | metadata: 6 | labels: 7 | prometheus: app-sre 8 | role: alert-rules 9 | name: observatorium-http-traffic-stage 10 | spec: 11 | groups: 12 | - name: observatorium-http-traffic 13 | rules: 14 | - alert: ObservatoriumHttpTrafficErrorRateHigh 15 | annotations: 16 | dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/observatorium-http-traffic?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 17 | message: Observatorium route {{$labels.route}} is failing to handle {{$value | humanize}}% of requests. 18 | runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#observatoriumhttptrafficerrorratehigh 19 | expr: | 20 | (sum by (route) (rate(haproxy_backend_http_responses_total{route=~"observatorium.*|telemeter.*|infogw.*", code="5xx"} [5m])) / sum by (route) (rate(haproxy_backend_http_responses_total{route=~"observatorium.*|telemeter.*|infogw.*"}[5m]))) * 100 > 25 21 | labels: 22 | service: telemeter 23 | severity: medium 24 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/observatorium-proactive-monitoring-production.prometheusrules.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: /openshift/prometheus-rule-1.yml 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PrometheusRule 5 | metadata: 6 | labels: 7 | prometheus: app-sre 8 | role: alert-rules 9 | name: observatorium-proactive-monitoring-production 10 | spec: 11 | groups: 12 | - name: observatorium-proactive-monitoring 13 | rules: 14 | - alert: ObservatoriumProActiveMetricsQueryErrorRateHigh 15 | annotations: 16 | dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/observatorium-proactive-monitoring?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 17 | message: Observatorium metric queries {{$labels.job}} in {{$labels.namespace}} are failing to handle {{$value | humanize}}% of requests.
18 | runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#observatoriumproactivemetricsqueryerrorratehigh 19 | expr: | 20 | ( sum by (namespace, job, query) (rate(up_custom_query_errors_total[5m])) / sum by (namespace, job, query) (rate(up_custom_query_executed_total[5m]))) * 100 > 25 21 | labels: 22 | service: telemeter 23 | severity: medium 24 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/observatorium-proactive-monitoring-stage.prometheusrules.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | $schema: /openshift/prometheus-rule-1.yml 3 | apiVersion: monitoring.coreos.com/v1 4 | kind: PrometheusRule 5 | metadata: 6 | labels: 7 | prometheus: app-sre 8 | role: alert-rules 9 | name: observatorium-proactive-monitoring-stage 10 | spec: 11 | groups: 12 | - name: observatorium-proactive-monitoring 13 | rules: 14 | - alert: ObservatoriumProActiveMetricsQueryErrorRateHigh 15 | annotations: 16 | dashboard: https://grafana.app-sre.devshift.net/d/no-dashboard/observatorium-proactive-monitoring?orgId=1&refresh=10s&var-datasource={{$externalLabels.cluster}}-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 17 | message: Observatorium metric queries {{$labels.job}} in {{$labels.namespace}} are failing to handle {{$value | humanize}}% of requests. 18 | runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#observatoriumproactivemetricsqueryerrorratehigh 19 | expr: | 20 | ( sum by (namespace, job, query) (rate(up_custom_query_errors_total[5m])) / sum by (namespace, job, query) (rate(up_custom_query_executed_total[5m]))) * 100 > 25 21 | labels: 22 | service: telemeter 23 | severity: medium 24 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/mst-production-api-alerting-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API Thanos Rule is failing to send alerts to Alertmanager and 7 | is burning too much error budget to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIAlertmanagerAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: mst-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-alerting-availability-slo 14 | spec: 15 | alerting: 16 | name: APIAlertmanagerAvailabilityErrorBudgetBurning 17 | description: API Thanos Rule is failing to send alerts to Alertmanager and is burning 18 | too much error budget to guarantee availability SLOs.
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: thanos_alert_sender_alerts_dropped_total{container="thanos-rule", 23 | namespace="observatorium-mst-production", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: thanos_alert_sender_alerts_dropped_total{container="thanos-rule", 27 | namespace="observatorium-mst-production"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/mst-production-api-logs-prom-tail-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API logs /prom_tail is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsPromTailAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: mst-production 12 | service: observatorium-api 13 | name: api-logs-prom-tail-availability-slo 14 | spec: 15 | alerting: 16 | name: APILogsPromTailAvailabilityErrorBudgetBurning 17 | description: API logs /prom_tail is burning too much error budget to guarantee availability 18 | SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="prom_tail", 23 | group="logsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="prom_tail", 27 | group="logsv1"} 28 | target: "95" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/mst-production-api-logs-query-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API logs /query, /labels, or /label_values handler is burning 7 | too much error budget to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsQueryAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: mst-production 12 | service: observatorium-api 13 | name: api-logs-query-availability-slo 14 | spec: 15 | alerting: 16 | name: APILogsQueryAvailabilityErrorBudgetBurning 17 | description: API logs /query, /labels, or /label_values handler is burning too much 18 | error budget to guarantee availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler=~"query|label|labels|label_values", 23 | group="logsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler=~"query|label|labels|label_values", 27 | group="logsv1"} 28 | target: "95" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/mst-production-api-logs-query-range-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API logs /query_range handler is burning too much error budget 7 | to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsQueryRangeAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: mst-production 12 | service: observatorium-api 13 | name: api-logs-query-range-availability-slo 14 | spec: 15 | alerting: 16 | name: APILogsQueryRangeAvailabilityErrorBudgetBurning 17 | description: API logs /query_range handler is burning too much error budget to guarantee 18 | availability SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query_range", 23 | group="logsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query_range", 27 | group="logsv1"} 28 | target: "95" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/mst-production-api-logs-tail-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API logs /tail is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsTailAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: mst-production 12 | service: observatorium-api 13 | name: api-logs-tail-availability-slo 14 | spec: 15 | alerting: 16 | name: APILogsTailAvailabilityErrorBudgetBurning 17 | description: API logs /tail is burning too much error budget to guarantee availability 18 | SLOs. 
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="tail",
          group="logsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="tail",
          group="logsv1"}
  target: "95"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-logs-write-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API logs /push handler is burning too much error budget to
      guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsPushAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    service: observatorium-api
  name: api-logs-write-availability-slo
spec:
  alerting:
    name: APILogsPushAvailabilityErrorBudgetBurning
  description: API logs /push handler is burning too much error budget to guarantee
    availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="push",
          group="logsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="push",
          group="logsv1"}
  target: "95"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-logs-write-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /push handler is burning too much error budget to guarantee
      latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsPushLatencyErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    service: observatorium-api
  name: api-logs-write-latency-slo
spec:
  alerting:
    name: APILogsPushLatencyErrorBudgetBurning
  description: API /push handler is burning too much error budget to guarantee latency
    SLOs.
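  # Illustrative sketch, not part of the generated manifest: a Pyrra "latency"
  # indicator compares one histogram bucket against the matching _count
  # series. With le="5" and target "90", the objective is roughly
  #
  #   sum(rate(http_request_duration_seconds_bucket{..., le="5"}[window]))
  #     / sum(rate(http_request_duration_seconds_count{...}[window])) >= 0.90
  #
  # i.e. at least 90% of successful /push requests must complete within 5s.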
  indicator:
    latency:
      grouping: null
      success:
        metric: http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",
          handler="push", group="logsv1", code=~"^2..$", le="5"}
      total:
        metric: http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",
          handler="push", group="logsv1", code=~"^2..$"}
  target: "90"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-metrics-query-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query handler is burning too much error budget to guarantee
      availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsQueryAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-metrics-query-availability-slo
spec:
  alerting:
    name: APIMetricsQueryAvailabilityErrorBudgetBurning
  description: API /query handler is burning too much error budget to guarantee availability
    SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query",
          group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query",
          group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-metrics-query-range-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query_range handler is burning too much error budget to
      guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsQueryRangeAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-metrics-query-range-availability-slo
spec:
  alerting:
    name: APIMetricsQueryRangeAvailabilityErrorBudgetBurning
  description: API /query_range handler is burning too much error budget to guarantee
    availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query_range",
          group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query_range",
          group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-metrics-read-100M-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query endpoint is burning too much error budget for 100M
      samples, to guarantee latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsReadLatency100MErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-metrics-read-100M-latency-slo
spec:
  alerting:
    name: APIMetricsReadLatency100MErrorBudgetBurning
  description: API /query endpoint is burning too much error budget for 100M samples,
    to guarantee latency SLOs.
  indicator:
    latency:
      grouping: null
      success:
        metric: up_custom_query_duration_seconds_bucket{query="query-path-sli-100M-samples",
          namespace="observatorium-mst-production", http_code=~"^2..$", le="120"}
      total:
        metric: up_custom_query_duration_seconds_count{query="query-path-sli-100M-samples",
          namespace="observatorium-mst-production", http_code=~"^2..$"}
  target: "90"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-metrics-read-1M-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query endpoint is burning too much error budget for 1M
      samples, to guarantee latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsReadLatency1MErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-metrics-read-1M-latency-slo
spec:
  alerting:
    name: APIMetricsReadLatency1MErrorBudgetBurning
  description: API /query endpoint is burning too much error budget for 1M samples,
    to guarantee latency SLOs.
  indicator:
    latency:
      grouping: null
      success:
        metric: up_custom_query_duration_seconds_bucket{query="query-path-sli-1M-samples",
          namespace="observatorium-mst-production", http_code=~"^2..$", le="10"}
      total:
        metric: up_custom_query_duration_seconds_count{query="query-path-sli-1M-samples",
          namespace="observatorium-mst-production", http_code=~"^2..$"}
  target: "90"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-metrics-rule-query-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query handler endpoint for rules evaluation is burning
      too much error budget to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsRulerQueryAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-metrics-rule-query-availability-slo
spec:
  alerting:
    name: APIMetricsRulerQueryAvailabilityErrorBudgetBurning
  description: API /query handler endpoint for rules evaluation is burning too much
    error budget to guarantee availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-ruler-query", handler="query",
          code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-ruler-query", handler="query"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-metrics-write-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /receive handler is burning too much error budget to guarantee
      availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsWriteAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-metrics-write-availability-slo
spec:
  alerting:
    name: APIMetricsWriteAvailabilityErrorBudgetBurning
  description: API /receive handler is burning too much error budget to guarantee
    availability SLOs.
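  # Worked example, not part of the generated manifest: with target "99" over
  # a 28d window, the error budget is 1% of /receive requests. A constant 5xx
  # ratio of 1% spends that budget in exactly 28 days, while a 14.4x burn rate
  # (14.4% of requests failing) spends it in under two days, which is the kind
  # of fast burn the generated multi-window alerts are meant to catch early.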
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="receive",
          group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="receive",
          group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-metrics-write-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /receive handler is burning too much error budget to guarantee
      latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsWriteLatencyErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-metrics-write-latency-slo
spec:
  alerting:
    name: APIMetricsWriteLatencyErrorBudgetBurning
  description: API /receive handler is burning too much error budget to guarantee
    latency SLOs.
  indicator:
    latency:
      grouping: null
      success:
        metric: http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",
          handler="receive", group="metricsv1", code=~"^2..$", le="5"}
      total:
        metric: http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",
          handler="receive", group="metricsv1", code=~"^2..$"}
  target: "90"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-rules-raw-read-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /rules/raw endpoint for reads is burning too much error
      budget to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-rules-raw-read-availability-slo
spec:
  alerting:
    name: APIRulesRawReadAvailabilityErrorBudgetBurning
  description: API /rules/raw endpoint for reads is burning too much error budget
    to guarantee availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules-raw",
          method="GET", group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules-raw",
          method="GET", group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-rules-raw-write-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /rules/raw endpoint for writes is burning too much error
      budget to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-rules-raw-write-availability-slo
spec:
  alerting:
    name: APIRulesRawWriteAvailabilityErrorBudgetBurning
  description: API /rules/raw endpoint for writes is burning too much error budget
    to guarantee availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules-raw",
          method="PUT", group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules-raw",
          method="PUT", group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-rules-read-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /rules endpoint is burning too much error budget to guarantee
      availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesReadAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-rules-read-availability-slo
spec:
  alerting:
    name: APIRulesReadAvailabilityErrorBudgetBurning
  description: API /rules endpoint is burning too much error budget to guarantee availability
    SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules",
          method="GET", group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules",
          method="GET", group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-production-api-rules-sync-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/283e7002d85c08126681241df2fdb22b/mst-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: Thanos Ruler /reload endpoint is burning too much error budget
      to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesSyncAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-production
    pyrra.dev/service: observatorium-api
  name: api-rules-sync-availability-slo
spec:
  alerting:
    name: APIRulesSyncAvailabilityErrorBudgetBurning
  description: Thanos Ruler /reload endpoint is burning too much error budget to guarantee
    availability SLOs.
  indicator:
    ratio:
      errors:
        metric: client_api_requests_total{client="reload", container="thanos-rule-syncer",
          namespace="observatorium-mst-production", code=~"^5..$"}
      grouping: null
      total:
        metric: client_api_requests_total{client="reload", container="thanos-rule-syncer",
          namespace="observatorium-mst-production"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-alerting-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API Thanos Rule is failing to send alerts to Alertmanager and
      is burning too much error budget to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIAlertmanagerAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-alerting-availability-slo
spec:
  alerting:
    name: APIAlertmanagerAvailabilityErrorBudgetBurning
  description: API Thanos Rule is failing to send alerts to Alertmanager and is burning
    too much error budget to guarantee availability SLOs.
  indicator:
    ratio:
      errors:
        metric: thanos_alert_sender_alerts_dropped_total{container="thanos-rule",
          namespace="observatorium-mst-stage", code=~"^5..$"}
      grouping: null
      total:
        metric: thanos_alert_sender_alerts_dropped_total{container="thanos-rule",
          namespace="observatorium-mst-stage"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-logs-prom-tail-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API logs /prom_tail is burning too much error budget to guarantee
      availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsPromTailAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    service: observatorium-api
  name: api-logs-prom-tail-availability-slo
spec:
  alerting:
    name: APILogsPromTailAvailabilityErrorBudgetBurning
  description: API logs /prom_tail is burning too much error budget to guarantee availability
    SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="prom_tail",
          group="logsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="prom_tail",
          group="logsv1"}
  target: "95"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-logs-query-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API logs /query, /labels, or /label_values handler is burning
      too much error budget to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsQueryAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    service: observatorium-api
  name: api-logs-query-availability-slo
spec:
  alerting:
    name: APILogsQueryAvailabilityErrorBudgetBurning
  description: API logs /query, /labels, or /label_values handler is burning too much
    error budget to guarantee availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler=~"query|label|labels|label_values",
          group="logsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler=~"query|label|labels|label_values",
          group="logsv1"}
  target: "95"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-logs-query-range-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API logs /query_range handler is burning too much error budget
      to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsQueryRangeAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    service: observatorium-api
  name: api-logs-query-range-availability-slo
spec:
  alerting:
    name: APILogsQueryRangeAvailabilityErrorBudgetBurning
  description: API logs /query_range handler is burning too much error budget to guarantee
    availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query_range",
          group="logsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query_range",
          group="logsv1"}
  target: "95"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-logs-tail-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API logs /tail is burning too much error budget to guarantee
      availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsTailAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    service: observatorium-api
  name: api-logs-tail-availability-slo
spec:
  alerting:
    name: APILogsTailAvailabilityErrorBudgetBurning
  description: API logs /tail is burning too much error budget to guarantee availability
    SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="tail",
          group="logsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="tail",
          group="logsv1"}
  target: "95"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-logs-write-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API logs /push handler is burning too much error budget to
      guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsPushAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    service: observatorium-api
  name: api-logs-write-availability-slo
spec:
  alerting:
    name: APILogsPushAvailabilityErrorBudgetBurning
  description: API logs /push handler is burning too much error budget to guarantee
    availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="push",
          group="logsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="push",
          group="logsv1"}
  target: "95"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-logs-write-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /push handler is burning too much error budget to guarantee
      latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APILogsPushLatencyErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    service: observatorium-api
  name: api-logs-write-latency-slo
spec:
  alerting:
    name: APILogsPushLatencyErrorBudgetBurning
  description: API /push handler is burning too much error budget to guarantee latency
    SLOs.
  indicator:
    latency:
      grouping: null
      success:
        metric: http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",
          handler="push", group="logsv1", code=~"^2..$", le="5"}
      total:
        metric: http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",
          handler="push", group="logsv1", code=~"^2..$"}
  target: "90"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-metrics-query-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query handler is burning too much error budget to guarantee
      availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsQueryAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-metrics-query-availability-slo
spec:
  alerting:
    name: APIMetricsQueryAvailabilityErrorBudgetBurning
  description: API /query handler is burning too much error budget to guarantee availability
    SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query",
          group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query",
          group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-metrics-query-range-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query_range handler is burning too much error budget to
      guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsQueryRangeAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-metrics-query-range-availability-slo
spec:
  alerting:
    name: APIMetricsQueryRangeAvailabilityErrorBudgetBurning
  description: API /query_range handler is burning too much error budget to guarantee
    availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query_range",
          group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query_range",
          group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-metrics-read-100M-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query endpoint is burning too much error budget for 100M
      samples, to guarantee latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsReadLatency100MErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-metrics-read-100M-latency-slo
spec:
  alerting:
    name: APIMetricsReadLatency100MErrorBudgetBurning
  description: API /query endpoint is burning too much error budget for 100M samples,
    to guarantee latency SLOs.
  indicator:
    latency:
      grouping: null
      success:
        metric: up_custom_query_duration_seconds_bucket{query="query-path-sli-100M-samples",
          namespace="observatorium-mst-stage", http_code=~"^2..$", le="120"}
      total:
        metric: up_custom_query_duration_seconds_count{query="query-path-sli-100M-samples",
          namespace="observatorium-mst-stage", http_code=~"^2..$"}
  target: "90"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-metrics-read-10M-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query endpoint is burning too much error budget for 10M
      samples, to guarantee latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsReadLatency10MErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-metrics-read-10M-latency-slo
spec:
  alerting:
    name: APIMetricsReadLatency10MErrorBudgetBurning
  description: API /query endpoint is burning too much error budget for 10M samples,
    to guarantee latency SLOs.
  indicator:
    latency:
      grouping: null
      success:
        metric: up_custom_query_duration_seconds_bucket{query="query-path-sli-10M-samples",
          namespace="observatorium-mst-stage", http_code=~"^2..$", le="30"}
      total:
        metric: up_custom_query_duration_seconds_count{query="query-path-sli-10M-samples",
          namespace="observatorium-mst-stage", http_code=~"^2..$"}
  target: "90"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-metrics-read-1M-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query endpoint is burning too much error budget for 1M
      samples, to guarantee latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsReadLatency1MErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-metrics-read-1M-latency-slo
spec:
  alerting:
    name: APIMetricsReadLatency1MErrorBudgetBurning
  description: API /query endpoint is burning too much error budget for 1M samples,
    to guarantee latency SLOs.
  indicator:
    latency:
      grouping: null
      success:
        metric: up_custom_query_duration_seconds_bucket{query="query-path-sli-1M-samples",
          namespace="observatorium-mst-stage", http_code=~"^2..$", le="10"}
      total:
        metric: up_custom_query_duration_seconds_count{query="query-path-sli-1M-samples",
          namespace="observatorium-mst-stage", http_code=~"^2..$"}
  target: "90"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-metrics-rule-query-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /query handler endpoint for rules evaluation is burning
      too much error budget to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsRulerQueryAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-metrics-rule-query-availability-slo
spec:
  alerting:
    name: APIMetricsRulerQueryAvailabilityErrorBudgetBurning
  description: API /query handler endpoint for rules evaluation is burning too much
    error budget to guarantee availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-ruler-query", handler="query",
          code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-ruler-query", handler="query"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-metrics-write-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /receive handler is burning too much error budget to guarantee
      availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsWriteAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-metrics-write-availability-slo
spec:
  alerting:
    name: APIMetricsWriteAvailabilityErrorBudgetBurning
  description: API /receive handler is burning too much error budget to guarantee
    availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="receive",
          group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="receive",
          group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-metrics-write-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /receive handler is burning too much error budget to guarantee
      latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsWriteLatencyErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-metrics-write-latency-slo
spec:
  alerting:
    name: APIMetricsWriteLatencyErrorBudgetBurning
  description: API /receive handler is burning too much error budget to guarantee
    latency SLOs.
  indicator:
    latency:
      grouping: null
      success:
        metric: http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api",
          handler="receive", group="metricsv1", code=~"^2..$", le="5"}
      total:
        metric: http_request_duration_seconds_count{job="observatorium-observatorium-mst-api",
          handler="receive", group="metricsv1", code=~"^2..$"}
  target: "90"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-rules-raw-read-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /rules/raw endpoint for reads is burning too much error
      budget to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-rules-raw-read-availability-slo
spec:
  alerting:
    name: APIRulesRawReadAvailabilityErrorBudgetBurning
  description: API /rules/raw endpoint for reads is burning too much error budget
    to guarantee availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules-raw",
          method="GET", group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules-raw",
          method="GET", group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-rules-raw-write-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /rules/raw endpoint for writes is burning too much error
      budget to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-rules-raw-write-availability-slo
spec:
  alerting:
    name: APIRulesRawWriteAvailabilityErrorBudgetBurning
  description: API /rules/raw endpoint for writes is burning too much error budget
    to guarantee availability SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules-raw",
          method="PUT", group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules-raw",
          method="PUT", group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-rules-read-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: API /rules endpoint is burning too much error budget to guarantee
      availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesReadAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-rules-read-availability-slo
spec:
  alerting:
    name: APIRulesReadAvailabilityErrorBudgetBurning
  description: API /rules endpoint is burning too much error budget to guarantee availability
    SLOs.
  indicator:
    ratio:
      errors:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules",
          method="GET", group="metricsv1", code=~"^5..$"}
      grouping: null
      total:
        metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules",
          method="GET", group="metricsv1"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/mst-stage-api-rules-sync-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/92520ea4d6976f30d1618164e186ef9b/mst-stage-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: Thanos Ruler /reload endpoint is burning too much error budget
      to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesSyncAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    instance: mst-stage
    pyrra.dev/service: observatorium-api
  name: api-rules-sync-availability-slo
spec:
  alerting:
    name: APIRulesSyncAvailabilityErrorBudgetBurning
  description: Thanos Ruler /reload endpoint is burning too much error budget to guarantee
    availability SLOs.
  indicator:
    ratio:
      errors:
        metric: client_api_requests_total{client="reload", container="thanos-rule-syncer",
          namespace="observatorium-mst-stage", code=~"^5..$"}
      grouping: null
      total:
        metric: client_api_requests_total{client="reload", container="thanos-rule-syncer",
          namespace="observatorium-mst-stage"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/rhelemeter-production-rhobs-rhelemeter-server-metrics-receive-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d//rhelemeter-production-slos?orgId=1&refresh=10s&var-datasource=&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: Rhelemeter Server /receive is burning too much error budget
      to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#RhelemeterServerMetricsReceiveWriteAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    pyrra.dev/service: telemeter
    route: rhelemeter-server-receive
  name: rhobs-rhelemeter-server-metrics-receive-availability-slo
spec:
  alerting:
    name: RhelemeterServerMetricsReceiveWriteAvailabilityErrorBudgetBurning
  description: Rhelemeter Server /receive is burning too much error budget to guarantee
    availability SLOs.
  indicator:
    ratio:
      errors:
        metric: haproxy_server_http_responses_total{route="rhelemeter-server-metrics-v1-receive",
          code=~"5.."}
      grouping: null
      total:
        metric: haproxy_server_http_responses_total{route="rhelemeter-server-metrics-v1-receive"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/rhelemeter-production-rhobs-rhelemeter-server-metrics-receive-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d//rhelemeter-production-slos?orgId=1&refresh=10s&var-datasource=&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: Rhelemeter Server /receive is burning too much error budget
      to guarantee latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#RhelemeterServerMetricsReceiveWriteLatencyErrorBudgetBurning
  creationTimestamp: null
  labels:
    pyrra.dev/service: telemeter
    route: rhelemeter-server-receive
  name: rhobs-rhelemeter-server-metrics-receive-latency-slo
spec:
  alerting:
    name: RhelemeterServerMetricsReceiveWriteLatencyErrorBudgetBurning
  description: Rhelemeter Server /receive is burning too much error budget to guarantee
    latency SLOs.
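  # Observation, not part of the generated manifest: the rhelemeter
  # availability SLO above counts 5xx responses at the HAProxy edge
  # (haproxy_server_http_responses_total), while the latency SLO below uses
  # server-side histograms (http_request_duration_seconds_*) from the
  # rhelemeter-server job, so the two indicators sample different points in
  # the request path.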
  indicator:
    latency:
      grouping: null
      success:
        metric: http_request_duration_seconds_bucket{job="rhelemeter-server", handler="receive",
          code=~"^2..$", le="5"}
      total:
        metric: http_request_duration_seconds_count{job="rhelemeter-server", handler="receive",
          code=~"^2..$"}
  target: "90"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/rhelemeter-stage-rhobs-rhelemeter-server-metrics-receive-availability-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d//rhelemeter-stage-slos?orgId=1&refresh=10s&var-datasource=&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: Rhelemeter Server /receive is burning too much error budget
      to guarantee availability SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#RhelemeterServerMetricsReceiveWriteAvailabilityErrorBudgetBurning
  creationTimestamp: null
  labels:
    pyrra.dev/service: telemeter
    route: rhelemeter-server-receive
  name: rhobs-rhelemeter-server-metrics-receive-availability-slo
spec:
  alerting:
    name: RhelemeterServerMetricsReceiveWriteAvailabilityErrorBudgetBurning
  description: Rhelemeter Server /receive is burning too much error budget to guarantee
    availability SLOs.
  indicator:
    ratio:
      errors:
        metric: haproxy_server_http_responses_total{route="rhelemeter-server-metrics-v1-receive",
          code=~"5.."}
      grouping: null
      total:
        metric: haproxy_server_http_responses_total{route="rhelemeter-server-metrics-v1-receive"}
  target: "99"
  window: 28d
status: {}
--------------------------------------------------------------------------------
/resources/observability/prometheusrules/pyrra/rhelemeter-stage-rhobs-rhelemeter-server-metrics-receive-latency-slo.yaml:
--------------------------------------------------------------------------------
apiVersion: v1alpha1
kind: ServiceLevelObjective
metadata:
  annotations:
    pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d//rhelemeter-stage-slos?orgId=1&refresh=10s&var-datasource=&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m
    pyrra.dev/message: Rhelemeter Server /receive is burning too much error budget
      to guarantee latency SLOs.
    pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#RhelemeterServerMetricsReceiveWriteLatencyErrorBudgetBurning
  creationTimestamp: null
  labels:
    pyrra.dev/service: telemeter
    route: rhelemeter-server-receive
  name: rhobs-rhelemeter-server-metrics-receive-latency-slo
spec:
  alerting:
    name: RhelemeterServerMetricsReceiveWriteLatencyErrorBudgetBurning
  description: Rhelemeter Server /receive is burning too much error budget to guarantee
    latency SLOs.
19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: http_request_duration_seconds_bucket{job="rhelemeter-server", handler="receive", 24 | code=~"^2..$", le="5"} 25 | total: 26 | metric: http_request_duration_seconds_count{job="rhelemeter-server", handler="receive", 27 | code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/rhobsp02ue1-production-api-metrics-query-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query handler is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsQueryAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: rhobsp02ue1-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-query-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsQueryAvailabilityErrorBudgetBurning 17 | description: API /query handler is burning too much error budget to guarantee availability 18 | SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query", 23 | group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query", 27 | group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/rhobsp02ue1-production-api-metrics-query-range-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query_range handler is burning too much error budget to 7 | guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsQueryRangeAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: rhobsp02ue1-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-query-range-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsQueryRangeAvailabilityErrorBudgetBurning 17 | description: API /query_range handler is burning too much error budget to guarantee 18 | availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query_range", 23 | group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="query_range", 27 | group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/rhobsp02ue1-production-api-metrics-rule-query-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query handler endpoint for rules evaluation is burning 7 | too much error budget to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsRulerQueryAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: rhobsp02ue1-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-rule-query-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsRulerQueryAvailabilityErrorBudgetBurning 17 | description: API /query handler endpoint for rules evaluation is burning too much 18 | error budget to guarantee availability SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-ruler-query", handler="query", 23 | code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-ruler-query", handler="query"} 27 | target: "99" 28 | window: 28d 29 | status: {} 30 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/rhobsp02ue1-production-api-metrics-write-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /receive handler is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsWriteAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: rhobsp02ue1-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-write-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsWriteAvailabilityErrorBudgetBurning 17 | description: API /receive handler is burning too much error budget to guarantee 18 | availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="receive", 23 | group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="receive", 27 | group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/rhobsp02ue1-production-api-metrics-write-latency-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /receive handler is burning too much error budget to guarantee 7 | latency SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsWriteLatencyErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: rhobsp02ue1-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-write-latency-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsWriteLatencyErrorBudgetBurning 17 | description: API /receive handler is burning too much error budget to guarantee 18 | latency SLOs. 19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: http_request_duration_seconds_bucket{job="observatorium-observatorium-mst-api", 24 | handler="receive", group="metricsv1", code=~"^2..$", le="5"} 25 | total: 26 | metric: http_request_duration_seconds_count{job="observatorium-observatorium-mst-api", 27 | handler="receive", group="metricsv1", code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/rhobsp02ue1-production-api-rules-raw-read-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /rules/raw endpoint for reads is burning too much error 7 | budget to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: rhobsp02ue1-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-raw-read-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesRawReadAvailabilityErrorBudgetBurning 17 | description: API /rules/raw endpoint for reads is burning too much error budget 18 | to guarantee availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules-raw", 23 | method="GET", group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules-raw", 27 | method="GET", group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/rhobsp02ue1-production-api-rules-read-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /rules endpoint is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesReadAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: rhobsp02ue1-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-read-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesReadAvailabilityErrorBudgetBurning 17 | description: API /rules endpoint is burning too much error budget to guarantee availability 18 | SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules", 23 | method="GET", group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-mst-api", handler="rules", 27 | method="GET", group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/rhobsp02ue1-production-api-rules-sync-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/7f4df1c2d5518d5c3f2876ca9bb874a8/rhobsp02ue1-production-slos?orgId=1&refresh=10s&var-datasource=rhobsp02ue1-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Thanos Ruler /reload endpoint is burning too much error budget 7 | to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesSyncAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: rhobsp02ue1-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-sync-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesSyncAvailabilityErrorBudgetBurning 17 | description: Thanos Ruler /reload endpoint is burning too much error budget to guarantee 18 | availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: client_api_requests_total{client="reload", container="thanos-rule-syncer", 23 | namespace="observatorium-mst-production", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: client_api_requests_total{client="reload", container="thanos-rule-syncer", 27 | namespace="observatorium-mst-production"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-api-metrics-query-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query handler is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsQueryAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-query-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsQueryAvailabilityErrorBudgetBurning 17 | description: API /query handler is burning too much error budget to guarantee availability 18 | SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="query", 23 | group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="query", 27 | group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-api-metrics-query-range-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query_range handler is burning too much error budget to 7 | guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsQueryRangeAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-query-range-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsQueryRangeAvailabilityErrorBudgetBurning 17 | description: API /query_range handler is burning too much error budget to guarantee 18 | availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="query_range", 23 | group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="query_range", 27 | group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-api-metrics-rule-query-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query handler endpoint for rules evaluation is burning 7 | too much error budget to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsRulerQueryAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-rule-query-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsRulerQueryAvailabilityErrorBudgetBurning 17 | description: API /query handler endpoint for rules evaluation is burning too much 18 | error budget to guarantee availability SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-ruler-query", handler="query", 23 | code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-ruler-query", handler="query"} 27 | target: "99" 28 | window: 28d 29 | status: {} 30 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-api-metrics-write-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /receive handler is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsWriteAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-write-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsWriteAvailabilityErrorBudgetBurning 17 | description: API /receive handler is burning too much error budget to guarantee 18 | availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="receive", 23 | group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="receive", 27 | group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-api-metrics-write-latency-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /receive handler is burning too much error budget to guarantee 7 | latency SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsWriteLatencyErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-write-latency-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsWriteLatencyErrorBudgetBurning 17 | description: API /receive handler is burning too much error budget to guarantee 18 | latency SLOs. 19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: http_request_duration_seconds_bucket{job="observatorium-observatorium-api", 24 | handler="receive", group="metricsv1", code=~"^2..$", le="5"} 25 | total: 26 | metric: http_request_duration_seconds_count{job="observatorium-observatorium-api", 27 | handler="receive", group="metricsv1", code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-api-rules-raw-read-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /rules/raw endpoint for reads is burning too much error 7 | budget to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-raw-read-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesRawReadAvailabilityErrorBudgetBurning 17 | description: API /rules/raw endpoint for reads is burning too much error budget 18 | to guarantee availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules-raw", 23 | method="GET", group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules-raw", 27 | method="GET", group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-api-rules-raw-write-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /rules/raw endpoint for writes is burning too much error 7 | budget to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-raw-write-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesRawWriteAvailabilityErrorBudgetBurning 17 | description: API /rules/raw endpoint for writes is burning too much error budget 18 | to guarantee availability SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules-raw", 23 | method="PUT", group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules-raw", 27 | method="PUT", group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-api-rules-read-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /rules endpoint is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesReadAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-read-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesReadAvailabilityErrorBudgetBurning 17 | description: API /rules endpoint is burning too much error budget to guarantee availability 18 | SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules", 23 | method="GET", group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules", 27 | method="GET", group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-api-rules-sync-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Thanos Ruler /reload endpoint is burning too much error budget 7 | to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesSyncAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-production 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-sync-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesSyncAvailabilityErrorBudgetBurning 17 | description: Thanos Ruler /reload endpoint is burning too much error budget to guarantee 18 | availability SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: client_api_requests_total{client="reload", container="thanos-rule-syncer", 23 | namespace="observatorium-metrics-production", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: client_api_requests_total{client="reload", container="thanos-rule-syncer", 27 | namespace="observatorium-metrics-production"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-rhobs-telemeter-server-metrics-receive-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Telemeter Server /receive is burning too much error budget 7 | to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#TelemeterServerMetricsReceiveWriteAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | pyrra.dev/service: telemeter 12 | route: telemeter-server-receive 13 | name: rhobs-telemeter-server-metrics-receive-availability-slo 14 | spec: 15 | alerting: 16 | name: TelemeterServerMetricsReceiveWriteAvailabilityErrorBudgetBurning 17 | description: Telemeter Server /receive is burning too much error budget to guarantee 18 | availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: haproxy_server_http_responses_total{route="telemeter-server-metrics-v1-receive", 23 | code=~"5.."} 24 | grouping: null 25 | total: 26 | metric: haproxy_server_http_responses_total{route="telemeter-server-metrics-v1-receive"} 27 | target: "99" 28 | window: 28d 29 | status: {} 30 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-rhobs-telemeter-server-metrics-receive-latency-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Telemeter Server /receive is burning too much error budget 7 | to guarantee latency SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#TelemeterServerMetricsReceiveWriteLatencyErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | pyrra.dev/service: telemeter 12 | route: telemeter-server-receive 13 | name: rhobs-telemeter-server-metrics-receive-latency-slo 14 | spec: 15 | alerting: 16 | name: TelemeterServerMetricsReceiveWriteLatencyErrorBudgetBurning 17 | description: Telemeter Server /receive is burning too much error budget to guarantee 18 | latency SLOs. 19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: http_request_duration_seconds_bucket{job="telemeter-server", handler="receive", 24 | code=~"^2..$", le="5"} 25 | total: 26 | metric: http_request_duration_seconds_count{job="telemeter-server", handler="receive", 27 | code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-rhobs-telemeter-server-metrics-upload-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Telemeter Server /upload is burning too much error budget to 7 | guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#TelemeterServerMetricsUploadWriteAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | pyrra.dev/service: telemeter 12 | route: telemeter-server-upload 13 | name: rhobs-telemeter-server-metrics-upload-availability-slo 14 | spec: 15 | alerting: 16 | name: TelemeterServerMetricsUploadWriteAvailabilityErrorBudgetBurning 17 | description: Telemeter Server /upload is burning too much error budget to guarantee 18 | availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: haproxy_server_http_responses_total{route="telemeter-server-upload", 23 | code=~"5.."} 24 | grouping: null 25 | total: 26 | metric: haproxy_server_http_responses_total{route="telemeter-server-upload"} 27 | target: "99" 28 | window: 28d 29 | status: {} 30 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-production-rhobs-telemeter-server-metrics-upload-latency-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/f9fa7677fb4a2669f123f9a0f2234b47/telemeter-production-slos?orgId=1&refresh=10s&var-datasource=telemeter-prod-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Telemeter Server /upload is burning too much error budget to 7 | guarantee latency SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#TelemeterServerMetricsUploadWriteLatencyErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | pyrra.dev/service: telemeter 12 | route: telemeter-server-upload 13 | name: rhobs-telemeter-server-metrics-upload-latency-slo 14 | spec: 15 | alerting: 16 | name: TelemeterServerMetricsUploadWriteLatencyErrorBudgetBurning 17 | description: Telemeter Server /upload is burning too much error budget to guarantee 18 | latency SLOs. 19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: http_request_duration_seconds_bucket{job="telemeter-server", handler="upload", 24 | code=~"^2..$", le="5"} 25 | total: 26 | metric: http_request_duration_seconds_count{job="telemeter-server", handler="upload", 27 | code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-metrics-query-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query handler is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsQueryAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-query-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsQueryAvailabilityErrorBudgetBurning 17 | description: API /query handler is burning too much error budget to guarantee availability 18 | SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="query", 23 | group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="query", 27 | group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-metrics-query-range-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query_range handler is burning too much error budget to 7 | guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsQueryRangeAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-query-range-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsQueryRangeAvailabilityErrorBudgetBurning 17 | description: API /query_range handler is burning too much error budget to guarantee 18 | availability SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="query_range", 23 | group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="query_range", 27 | group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-metrics-read-100M-latency-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query endpoint is burning too much error budget for 100M 7 | samples, to guarantee latency SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsReadLatency100MErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-read-100M-latency-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsReadLatency100MErrorBudgetBurning 17 | description: API /query endpoint is burning too much error budget for 100M samples, 18 | to guarantee latency SLOs. 
19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: up_custom_query_duration_seconds_bucket{query="query-path-sli-100M-samples", 24 | namespace="observatorium-stage", http_code=~"^2..$", le="120"} 25 | total: 26 | metric: up_custom_query_duration_seconds_count{query="query-path-sli-100M-samples", 27 | namespace="observatorium-stage", http_code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-metrics-read-10M-latency-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query endpoint is burning too much error budget for 10M 7 | samples, to guarantee latency SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsReadLatency10MErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-read-10M-latency-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsReadLatency10MErrorBudgetBurning 17 | description: API /query endpoint is burning too much error budget for 10M samples, 18 | to guarantee latency SLOs. 19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: up_custom_query_duration_seconds_bucket{query="query-path-sli-10M-samples", 24 | namespace="observatorium-stage", http_code=~"^2..$", le="30"} 25 | total: 26 | metric: up_custom_query_duration_seconds_count{query="query-path-sli-10M-samples", 27 | namespace="observatorium-stage", http_code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-metrics-read-1M-latency-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query endpoint is burning too much error budget for 1M 7 | samples, to guarantee latency SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsReadLatency1MErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-read-1M-latency-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsReadLatency1MErrorBudgetBurning 17 | description: API /query endpoint is burning too much error budget for 1M samples, 18 | to guarantee latency SLOs. 
19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: up_custom_query_duration_seconds_bucket{query="query-path-sli-1M-samples", 24 | namespace="observatorium-stage", http_code=~"^2..$", le="10"} 25 | total: 26 | metric: up_custom_query_duration_seconds_count{query="query-path-sli-1M-samples", 27 | namespace="observatorium-stage", http_code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-metrics-rule-query-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /query handler endpoint for rules evaluation is burning 7 | too much error budget to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsRulerQueryAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-rule-query-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsRulerQueryAvailabilityErrorBudgetBurning 17 | description: API /query handler endpoint for rules evaluation is burning too much 18 | error budget to guarantee availability SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-ruler-query", handler="query", 23 | code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-ruler-query", handler="query"} 27 | target: "99" 28 | window: 28d 29 | status: {} 30 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-metrics-write-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /receive handler is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsWriteAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-write-availability-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsWriteAvailabilityErrorBudgetBurning 17 | description: API /receive handler is burning too much error budget to guarantee 18 | availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="receive", 23 | group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="receive", 27 | group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-metrics-write-latency-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /receive handler is burning too much error budget to guarantee 7 | latency SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIMetricsWriteLatencyErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-metrics-write-latency-slo 14 | spec: 15 | alerting: 16 | name: APIMetricsWriteLatencyErrorBudgetBurning 17 | description: API /receive handler is burning too much error budget to guarantee 18 | latency SLOs. 19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: http_request_duration_seconds_bucket{job="observatorium-observatorium-api", 24 | handler="receive", group="metricsv1", code=~"^2..$", le="5"} 25 | total: 26 | metric: http_request_duration_seconds_count{job="observatorium-observatorium-api", 27 | handler="receive", group="metricsv1", code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-rules-raw-read-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /rules/raw endpoint for reads is burning too much error 7 | budget to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawReadAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-raw-read-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesRawReadAvailabilityErrorBudgetBurning 17 | description: API /rules/raw endpoint for reads is burning too much error budget 18 | to guarantee availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules-raw", 23 | method="GET", group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules-raw", 27 | method="GET", group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-rules-raw-write-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /rules/raw endpoint for writes is burning too much error 7 | budget to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesRawWriteAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-raw-write-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesRawWriteAvailabilityErrorBudgetBurning 17 | description: API /rules/raw endpoint for writes is burning too much error budget 18 | to guarantee availability SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules-raw", 23 | method="PUT", group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules-raw", 27 | method="PUT", group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-rules-read-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: API /rules endpoint is burning too much error budget to guarantee 7 | availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesReadAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-read-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesReadAvailabilityErrorBudgetBurning 17 | description: API /rules endpoint is burning too much error budget to guarantee availability 18 | SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules", 23 | method="GET", group="metricsv1", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: http_requests_total{job="observatorium-observatorium-api", handler="rules", 27 | method="GET", group="metricsv1"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-api-rules-sync-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Thanos Ruler /reload endpoint is burning too much error budget 7 | to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#APIRulesSyncAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | instance: telemeter-staging 12 | pyrra.dev/service: observatorium-api 13 | name: api-rules-sync-availability-slo 14 | spec: 15 | alerting: 16 | name: APIRulesSyncAvailabilityErrorBudgetBurning 17 | description: Thanos Ruler /reload endpoint is burning too much error budget to guarantee 18 | availability SLOs. 19 | indicator: 20 | ratio: 21 | errors: 22 | metric: client_api_requests_total{client="reload", container="thanos-rule-syncer", 23 | namespace="observatorium-metrics-stage", code=~"^5..$"} 24 | grouping: null 25 | total: 26 | metric: client_api_requests_total{client="reload", container="thanos-rule-syncer", 27 | namespace="observatorium-metrics-stage"} 28 | target: "99" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-rhobs-telemeter-server-metrics-receive-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Telemeter Server /receive is burning too much error budget 7 | to guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#TelemeterServerMetricsReceiveWriteAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | pyrra.dev/service: telemeter 12 | route: telemeter-server-receive 13 | name: rhobs-telemeter-server-metrics-receive-availability-slo 14 | spec: 15 | alerting: 16 | name: TelemeterServerMetricsReceiveWriteAvailabilityErrorBudgetBurning 17 | description: Telemeter Server /receive is burning too much error budget to guarantee 18 | availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: haproxy_server_http_responses_total{route="telemeter-server-metrics-v1-receive", 23 | code=~"5.."} 24 | grouping: null 25 | total: 26 | metric: haproxy_server_http_responses_total{route="telemeter-server-metrics-v1-receive"} 27 | target: "99" 28 | window: 28d 29 | status: {} 30 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-rhobs-telemeter-server-metrics-receive-latency-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Telemeter Server /receive is burning too much error budget 7 | to guarantee latency SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#TelemeterServerMetricsReceiveWriteLatencyErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | pyrra.dev/service: telemeter 12 | route: telemeter-server-receive 13 | name: rhobs-telemeter-server-metrics-receive-latency-slo 14 | spec: 15 | alerting: 16 | name: TelemeterServerMetricsReceiveWriteLatencyErrorBudgetBurning 17 | description: Telemeter Server /receive is burning too much error budget to guarantee 18 | latency SLOs. 19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: http_request_duration_seconds_bucket{job="telemeter-server", handler="receive", 24 | code=~"^2..$", le="5"} 25 | total: 26 | metric: http_request_duration_seconds_count{job="telemeter-server", handler="receive", 27 | code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-rhobs-telemeter-server-metrics-upload-availability-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Telemeter Server /upload is burning too much error budget to 7 | guarantee availability SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#TelemeterServerMetricsUploadWriteAvailabilityErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | pyrra.dev/service: telemeter 12 | route: telemeter-server-upload 13 | name: rhobs-telemeter-server-metrics-upload-availability-slo 14 | spec: 15 | alerting: 16 | name: TelemeterServerMetricsUploadWriteAvailabilityErrorBudgetBurning 17 | description: Telemeter Server /upload is burning too much error budget to guarantee 18 | availability SLOs. 
19 | indicator: 20 | ratio: 21 | errors: 22 | metric: haproxy_server_http_responses_total{route="telemeter-server-upload", 23 | code=~"5.."} 24 | grouping: null 25 | total: 26 | metric: haproxy_server_http_responses_total{route="telemeter-server-upload"} 27 | target: "99" 28 | window: 28d 29 | status: {} 30 | -------------------------------------------------------------------------------- /resources/observability/prometheusrules/pyrra/telemeter-staging-rhobs-telemeter-server-metrics-upload-latency-slo.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1alpha1 2 | kind: ServiceLevelObjective 3 | metadata: 4 | annotations: 5 | pyrra.dev/dashboard: https://grafana.app-sre.devshift.net/d/080e53f245a15445bdf777ae0e66945d/telemeter-staging-slos?orgId=1&refresh=10s&var-datasource=app-sre-stage-01-prometheus&var-namespace={{$labels.namespace}}&var-job=All&var-pod=All&var-interval=5m 6 | pyrra.dev/message: Telemeter Server /upload is burning too much error budget to 7 | guarantee latency SLOs. 8 | pyrra.dev/runbook: https://github.com/rhobs/configuration/blob/main/docs/sop/observatorium.md#TelemeterServerMetricsUploadWriteLatencyErrorBudgetBurning 9 | creationTimestamp: null 10 | labels: 11 | pyrra.dev/service: telemeter 12 | route: telemeter-server-upload 13 | name: rhobs-telemeter-server-metrics-upload-latency-slo 14 | spec: 15 | alerting: 16 | name: TelemeterServerMetricsUploadWriteLatencyErrorBudgetBurning 17 | description: Telemeter Server /upload is burning too much error budget to guarantee 18 | latency SLOs. 19 | indicator: 20 | latency: 21 | grouping: null 22 | success: 23 | metric: http_request_duration_seconds_bucket{job="telemeter-server", handler="upload", 24 | code=~"^2..$", le="5"} 25 | total: 26 | metric: http_request_duration_seconds_count{job="telemeter-server", handler="upload", 27 | code=~"^2..$"} 28 | target: "90" 29 | window: 28d 30 | status: {} 31 | -------------------------------------------------------------------------------- /resources/operations/bucket-inspect/README.md: -------------------------------------------------------------------------------- 1 | # What 2 | 3 | This template deploys [Thanos Bucket Inspect](https://thanos.io/tip/components/tools.md/#bucket-inspect) 4 | as a Kubernetes Job or CronJob. 5 | 6 | # SOP 7 | 8 | Create a Kubernetes Secret that contains the credentials for the target object storage provider, or use the 9 | template provided in this directory for S3-compatible object storage providers.
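If you use the provided template, a minimal sketch looks like this (the parameter names match `s3-secret-template.yaml` in this directory; the angle-bracketed values are placeholders, not real credentials):

```bash
oc process -f s3-secret-template.yaml \
  -p ACCESS_KEY_ID=<access-key> \
  -p SECRET_ACCESS_KEY=<secret-key> \
  -p S3_BUCKET_NAME=<bucket> \
  | oc apply -f -
```

To create the Secret manually: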
10 | 11 | ```yaml 12 | apiVersion: v1 13 | kind: Secret 14 | metadata: 15 | name: thanos-bucket-inspect-config 16 | type: Opaque 17 | stringData: 18 | from-config.yaml: | 19 | # see https://thanos.io/tip/thanos/storage.md/ 20 | ``` 21 | 22 | Process the template and run the Job: 23 | 24 | ```bash 25 | oc process -f job-template.yaml | oc apply -f - 26 | ``` 27 | 28 | Alternatively, you can run it as a CronJob: 29 | ```bash 30 | oc process -f cron-job-template.yaml | oc apply -f - 31 | ``` 32 | -------------------------------------------------------------------------------- /resources/operations/bucket-inspect/s3-secret-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | name: thanos-bucket-inspect-secret 5 | labels: 6 | app.kubernetes.io/name: thanos-bucket-inspect 7 | app.kubernetes.io/part-of: observatorium 8 | description: | 9 | This template creates a Secret that supports Thanos Object Storage inspection for S3. 10 | parameters: 11 | - name: NAMESPACE 12 | description: The namespace where the Secret will be created. 13 | value: 'observatorium-operations' 14 | - name: OBJ_STORE_CONFIG_SECRET_NAME 15 | value: 'thanos-bucket-inspect-config' 16 | - name: ACCESS_KEY_ID 17 | - name: SECRET_ACCESS_KEY 18 | - name: S3_BUCKET_NAME 19 | - name: S3_BUCKET_ENDPOINT 20 | value: s3.us-east-1.amazonaws.com 21 | - name: S3_BUCKET_REGION 22 | value: us-east-1 23 | - name: K8S_SECRET_KEY 24 | value: config.yaml 25 | objects: 26 | - apiVersion: v1 27 | kind: Secret 28 | metadata: 29 | name: ${OBJ_STORE_CONFIG_SECRET_NAME} 30 | namespace: ${NAMESPACE} 31 | labels: 32 | app.kubernetes.io/name: thanos-bucket-inspect 33 | app.kubernetes.io/part-of: observatorium 34 | type: Opaque 35 | stringData: 36 | ${K8S_SECRET_KEY}: | 37 | type: S3 38 | config: 39 | bucket: ${S3_BUCKET_NAME} 40 | region: ${S3_BUCKET_REGION} 41 | access_key: ${ACCESS_KEY_ID} 42 | secret_key: ${SECRET_ACCESS_KEY} 43 | endpoint: ${S3_BUCKET_ENDPOINT} 44 | 45 | 46 | -------------------------------------------------------------------------------- /resources/operations/bucket-replicate/monitoring-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | name: rhobs-thanos-bucket-replicate-pod-monitor 5 | labels: 6 | app.kubernetes.io/name: thanos-bucket-replicate 7 | app.kubernetes.io/part-of: observatorium 8 | parameters: 9 | - name: NAMESPACE 10 | description: The namespace where the running Job will reside. 11 | value: 'observatorium-operations' 12 | - name: NAME 13 | description: The name of the Job.
14 | value: 'thanos-bucket-replicate' 15 | objects: 16 | - apiVersion: monitoring.coreos.com/v1 17 | kind: PodMonitor 18 | metadata: 19 | name: observatorium-operations-thanos-bucket-replicate 20 | labels: 21 | prometheus: app-sre 22 | spec: 23 | namespaceSelector: 24 | matchNames: 25 | - ${NAMESPACE} 26 | selector: 27 | matchLabels: 28 | job-name: ${NAME} 29 | podMetricsEndpoints: 30 | - port: metrics 31 | interval: 30s 32 | path: /metrics 33 | 34 | -------------------------------------------------------------------------------- /resources/operations/bucket-replicate/s3-secret-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | name: thanos-bucket-replicate-secret 5 | labels: 6 | app.kubernetes.io/name: thanos-bucket-replicate-secret 7 | app.kubernetes.io/part-of: observatorium 8 | description: | 9 | This template creates a Secret that supports Thanos Object Storage for S3. 10 | parameters: 11 | - name: NAMESPACE 12 | description: The namespace where the Secret will be created. 13 | value: 'observatorium-operations' 14 | - name: OBJ_STORE_CONFIG_SECRET_NAME 15 | value: 'thanos-bucket-config' 16 | - name: K8S_SECRET_KEY 17 | value: config.yaml 18 | - name: ACCESS_KEY_ID 19 | - name: SECRET_ACCESS_KEY 20 | - name: S3_BUCKET_NAME 21 | - name: S3_BUCKET_ENDPOINT 22 | value: s3.us-east-1.amazonaws.com 23 | - name: S3_BUCKET_REGION 24 | value: us-east-1 25 | objects: 26 | - apiVersion: v1 27 | kind: Secret 28 | metadata: 29 | name: ${OBJ_STORE_CONFIG_SECRET_NAME} 30 | namespace: ${NAMESPACE} 31 | labels: 32 | app.kubernetes.io/name: thanos-bucket-replicate-secret 33 | app.kubernetes.io/part-of: observatorium 34 | type: Opaque 35 | stringData: 36 | ${K8S_SECRET_KEY}: | 37 | type: S3 38 | config: 39 | bucket: ${S3_BUCKET_NAME} 40 | region: ${S3_BUCKET_REGION} 41 | access_key: ${ACCESS_KEY_ID} 42 | secret_key: ${SECRET_ACCESS_KEY} 43 | endpoint: ${S3_BUCKET_ENDPOINT} 44 | 45 | 46 | -------------------------------------------------------------------------------- /resources/operations/rclone-bucket-replicate/README.md: -------------------------------------------------------------------------------- 1 | # What 2 | 3 | This template deploys the [Rclone copy command](https://rclone.org/commands/rclone_copy/) 4 | as a Kubernetes Job. 5 | 6 | # SOP 7 | TBC 8 | 9 | -------------------------------------------------------------------------------- /resources/operations/rclone-bucket-replicate/job-template.env: -------------------------------------------------------------------------------- 1 | OBJ_STORE_CONFIG_SECRET_NAME=rclone-rhobs-testing-secret 2 | NAMESPACE=rclone-test 3 | SRC_ENDPOINT=SRC 4 | SRC_BUCKET=telemeter-thanos-testing 5 | DST_ENDPOINT=DST 6 | DST_BUCKET=replicate -------------------------------------------------------------------------------- /resources/operations/rclone-bucket-replicate/monitoring-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | name: rhobs-rclone-bucket-replicate-pod-monitor 5 | labels: 6 | app.kubernetes.io/name: rclone-bucket-replicate 7 | app.kubernetes.io/part-of: observatorium 8 | parameters: 9 | - name: NAMESPACE 10 | description: The namespace where the running Job will reside. 11 | value: 'observatorium-operations' 12 | - name: NAME 13 | description: The name of the Job.
14 | value: 'rclone-bucket-replicate' 15 | objects: 16 | - apiVersion: monitoring.coreos.com/v1 17 | kind: PodMonitor 18 | metadata: 19 | name: observatorium-operations-rclone-bucket-replicate 20 | labels: 21 | prometheus: app-sre 22 | spec: 23 | namespaceSelector: 24 | matchNames: 25 | - ${NAMESPACE} 26 | selector: 27 | matchLabels: 28 | job-name: ${NAME} 29 | podMetricsEndpoints: 30 | - port: metrics 31 | interval: 30s 32 | path: /metrics 33 | 34 | -------------------------------------------------------------------------------- /resources/operations/rclone-bucket-replicate/s3-secret-template.env: -------------------------------------------------------------------------------- 1 | OBJ_STORE_CONFIG_SECRET_NAME= 2 | NAMESPACE=rclone-test 3 | SOURCE_ACCESS_KEY_ID= 4 | SOURCE_SECRET_ACCESS_KEY=+tm 5 | SOURCE_S3_BUCKET_PROVIDER=AWS 6 | SOURCE_S3_BUCKET_NAME= 7 | SOURCE_S3_BUCKET_ENDPOINT= 8 | SOURCE_S3_BUCKET_REGION= 9 | 10 | TARGET_ACCESS_KEY_ID= 11 | TARGET_SECRET_ACCESS_KEY= 12 | TARGET_S3_BUCKET_PROVIDER= 13 | TARGET_S3_BUCKET_NAME= 14 | TARGET_S3_BUCKET_ENDPOINT= 15 | TARGET_S3_BUCKET_REGION= -------------------------------------------------------------------------------- /resources/services/alertmanager/production/service-monitor-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | creationTimestamp: null 5 | name: alertmanager-service-monitor-rhobs-production 6 | objects: 7 | - apiVersion: monitoring.coreos.com/v1 8 | kind: ServiceMonitor 9 | metadata: 10 | creationTimestamp: null 11 | labels: 12 | app.kubernetes.io/component: alertmanager 13 | app.kubernetes.io/instance: observatorium 14 | app.kubernetes.io/name: alertmanager 15 | app.kubernetes.io/part-of: observatorium 16 | prometheus: app-sre 17 | name: alertmanager 18 | namespace: openshift-customer-monitoring 19 | spec: 20 | endpoints: 21 | - port: http 22 | relabelings: 23 | - action: replace 24 | separator: / 25 | sourceLabels: 26 | - namespace 27 | - pod 28 | targetLabel: instance 29 | namespaceSelector: 30 | matchNames: 31 | - rhobs-production 32 | selector: 33 | matchLabels: 34 | app.kubernetes.io/component: alertmanager 35 | app.kubernetes.io/instance: observatorium 36 | app.kubernetes.io/name: alertmanager 37 | app.kubernetes.io/part-of: observatorium 38 | -------------------------------------------------------------------------------- /resources/services/alertmanager/staging/service-monitor-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | creationTimestamp: null 5 | name: alertmanager-service-monitor-rhobs-stage 6 | objects: 7 | - apiVersion: monitoring.coreos.com/v1 8 | kind: ServiceMonitor 9 | metadata: 10 | creationTimestamp: null 11 | labels: 12 | app.kubernetes.io/component: alertmanager 13 | app.kubernetes.io/instance: observatorium 14 | app.kubernetes.io/name: alertmanager 15 | app.kubernetes.io/part-of: observatorium 16 | prometheus: app-sre 17 | name: alertmanager 18 | namespace: openshift-customer-monitoring 19 | spec: 20 | endpoints: 21 | - port: http 22 | relabelings: 23 | - action: replace 24 | separator: / 25 | sourceLabels: 26 | - namespace 27 | - pod 28 | targetLabel: instance 29 | namespaceSelector: 30 | matchNames: 31 | - rhobs-stage 32 | selector: 33 | matchLabels: 34 | app.kubernetes.io/component: alertmanager 35 | app.kubernetes.io/instance: observatorium 36 | 
app.kubernetes.io/name: alertmanager 37 | app.kubernetes.io/part-of: observatorium 38 | -------------------------------------------------------------------------------- /resources/services/memcached/staging/service-monitor-memcached-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | creationTimestamp: null 5 | name: memcached-service-monitor 6 | objects: 7 | - apiVersion: monitoring.coreos.com/v1 8 | kind: ServiceMonitor 9 | metadata: 10 | creationTimestamp: null 11 | labels: 12 | app.kubernetes.io/component: api-memcached 13 | app.kubernetes.io/instance: rhobs 14 | app.kubernetes.io/name: memcached 15 | app.kubernetes.io/part-of: observatorium 16 | app.kubernetes.io/version: 1.5-316 17 | prometheus: app-sre 18 | name: api-memcached 19 | namespace: openshift-customer-monitoring 20 | spec: 21 | endpoints: 22 | - honorLabels: true 23 | interval: 30s 24 | path: /metrics 25 | port: metrics 26 | namespaceSelector: 27 | matchNames: 28 | - rhobs-stage 29 | selector: 30 | matchLabels: 31 | app.kubernetes.io/component: api-memcached 32 | app.kubernetes.io/instance: rhobs 33 | app.kubernetes.io/name: memcached 34 | app.kubernetes.io/part-of: observatorium 35 | app.kubernetes.io/version: 1.5-316 36 | -------------------------------------------------------------------------------- /resources/services/objstore/local/thanos-default-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | app.kubernetes.io/name: observatorium-mst-thanos-objectstorage 7 | name: observatorium-mst-thanos-objectstorage 8 | namespace: rhobs-local 9 | stringData: 10 | thanos.yaml: |- 11 | type: S3 12 | config: 13 | bucket: thanos 14 | region: us-east-1 15 | access_key: minio 16 | secret_key: minio123 17 | endpoint: minio.observatorium-minio.svc:9000 18 | insecure: true 19 | type: Opaque 20 | -------------------------------------------------------------------------------- /resources/services/objstore/local/thanos-telemeter-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | app.kubernetes.io/name: thanos-objectstorage 7 | name: thanos-objectstorage 8 | namespace: rhobs-local 9 | stringData: 10 | thanos.yaml: |- 11 | type: S3 12 | config: 13 | bucket: thanos 14 | region: us-east-1 15 | access_key: minio 16 | secret_key: minio123 17 | endpoint: minio.observatorium-minio.svc:9000 18 | insecure: true 19 | type: Opaque 20 | -------------------------------------------------------------------------------- /resources/services/objstore/production/thanos-default-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | creationTimestamp: null 5 | name: thanos-default-secret 6 | objects: 7 | - apiVersion: v1 8 | kind: Secret 9 | metadata: 10 | creationTimestamp: null 11 | labels: 12 | app.kubernetes.io/name: observatorium-mst-thanos-objectstorage 13 | name: observatorium-mst-thanos-objectstorage 14 | namespace: rhobs-production 15 | stringData: 16 | thanos.yaml: |- 17 | type: S3 18 | config: 19 | bucket: ${S3_BUCKET_NAME} 20 | region: ${S3_BUCKET_REGION} 21 | access_key: ${ACCESS_KEY_ID} 22 | secret_key: ${SECRET_ACCESS_KEY} 23 | endpoint: 
${S3_BUCKET_ENDPOINT} 24 | type: Opaque 25 | parameters: 26 | - name: S3_BUCKET_NAME 27 | - name: S3_BUCKET_REGION 28 | - name: S3_BUCKET_ENDPOINT 29 | - name: ACCESS_KEY_ID 30 | - name: SECRET_ACCESS_KEY 31 | -------------------------------------------------------------------------------- /resources/services/objstore/production/thanos-telemeter-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | creationTimestamp: null 5 | name: thanos-telemeter-secret 6 | objects: 7 | - apiVersion: v1 8 | kind: Secret 9 | metadata: 10 | creationTimestamp: null 11 | labels: 12 | app.kubernetes.io/name: thanos-objectstorage 13 | name: thanos-objectstorage 14 | namespace: rhobs-production 15 | stringData: 16 | thanos.yaml: |- 17 | type: S3 18 | config: 19 | bucket: ${S3_BUCKET_NAME} 20 | region: ${S3_BUCKET_REGION} 21 | access_key: ${ACCESS_KEY_ID} 22 | secret_key: ${SECRET_ACCESS_KEY} 23 | endpoint: ${S3_BUCKET_ENDPOINT} 24 | type: Opaque 25 | parameters: 26 | - name: S3_BUCKET_NAME 27 | - name: S3_BUCKET_REGION 28 | - name: S3_BUCKET_ENDPOINT 29 | - name: ACCESS_KEY_ID 30 | - name: SECRET_ACCESS_KEY 31 | -------------------------------------------------------------------------------- /resources/services/objstore/staging/thanos-default-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | creationTimestamp: null 5 | name: thanos-default-secret 6 | objects: 7 | - apiVersion: v1 8 | kind: Secret 9 | metadata: 10 | creationTimestamp: null 11 | labels: 12 | app.kubernetes.io/name: observatorium-mst-thanos-objectstorage 13 | name: observatorium-mst-thanos-objectstorage 14 | namespace: rhobs-stage 15 | stringData: 16 | thanos.yaml: |- 17 | type: S3 18 | config: 19 | bucket: ${S3_BUCKET_NAME} 20 | region: ${S3_BUCKET_REGION} 21 | access_key: ${ACCESS_KEY_ID} 22 | secret_key: ${SECRET_ACCESS_KEY} 23 | endpoint: ${S3_BUCKET_ENDPOINT} 24 | type: Opaque 25 | parameters: 26 | - name: S3_BUCKET_NAME 27 | - name: S3_BUCKET_REGION 28 | - name: S3_BUCKET_ENDPOINT 29 | - name: ACCESS_KEY_ID 30 | - name: SECRET_ACCESS_KEY 31 | -------------------------------------------------------------------------------- /resources/services/objstore/staging/thanos-telemeter-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | creationTimestamp: null 5 | name: thanos-telemeter-secret 6 | objects: 7 | - apiVersion: v1 8 | kind: Secret 9 | metadata: 10 | creationTimestamp: null 11 | labels: 12 | app.kubernetes.io/name: thanos-objectstorage 13 | name: thanos-objectstorage 14 | namespace: rhobs-stage 15 | stringData: 16 | thanos.yaml: |- 17 | type: S3 18 | config: 19 | bucket: ${S3_BUCKET_NAME} 20 | region: ${S3_BUCKET_REGION} 21 | access_key: ${ACCESS_KEY_ID} 22 | secret_key: ${SECRET_ACCESS_KEY} 23 | endpoint: ${S3_BUCKET_ENDPOINT} 24 | type: Opaque 25 | parameters: 26 | - name: S3_BUCKET_NAME 27 | - name: S3_BUCKET_REGION 28 | - name: S3_BUCKET_ENDPOINT 29 | - name: ACCESS_KEY_ID 30 | - name: SECRET_ACCESS_KEY 31 | -------------------------------------------------------------------------------- /resources/services/objstore/thanos-object-store-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: 
Template 3 | metadata: 4 | creationTimestamp: null 5 | name: thanos-object-store-secret 6 | objects: 7 | - apiVersion: v1 8 | kind: Secret 9 | metadata: 10 | creationTimestamp: null 11 | labels: 12 | app.kubernetes.io/name: ${SECRET_NAME} 13 | name: ${SECRET_NAME} 14 | namespace: ${NAMESPACE} 15 | stringData: 16 | thanos.yaml: |- 17 | type: S3 18 | config: 19 | bucket: ${S3_BUCKET_NAME} 20 | region: ${S3_BUCKET_REGION} 21 | access_key: ${ACCESS_KEY_ID} 22 | secret_key: ${SECRET_ACCESS_KEY} 23 | endpoint: ${S3_BUCKET_ENDPOINT} 24 | type: Opaque 25 | parameters: 26 | - name: SECRET_NAME 27 | - name: NAMESPACE 28 | - name: S3_BUCKET_NAME 29 | - name: S3_BUCKET_REGION 30 | - name: S3_BUCKET_ENDPOINT 31 | - name: ACCESS_KEY_ID 32 | - name: SECRET_ACCESS_KEY 33 | -------------------------------------------------------------------------------- /resources/services/observatorium-api/production/service-monitor-observatorium-api-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | creationTimestamp: null 5 | name: observatorium-api-service-monitor 6 | objects: 7 | - apiVersion: monitoring.coreos.com/v1 8 | kind: ServiceMonitor 9 | metadata: 10 | creationTimestamp: null 11 | labels: 12 | app.kubernetes.io/component: api 13 | app.kubernetes.io/instance: rhobs 14 | app.kubernetes.io/name: observatorium-api 15 | app.kubernetes.io/part-of: rhobs 16 | app.kubernetes.io/version: 9aada65247a07782465beb500323a0e18d7e3d05 17 | prometheus: app-sre 18 | name: rhobs-gateway 19 | namespace: openshift-customer-monitoring 20 | spec: 21 | endpoints: 22 | - interval: 30s 23 | path: /metrics 24 | port: internal 25 | - interval: 30s 26 | path: /metrics 27 | port: opa-ams-metrics 28 | - interval: 30s 29 | path: /metrics 30 | port: metrics 31 | namespaceSelector: 32 | matchNames: 33 | - rhobs-production 34 | selector: 35 | matchLabels: 36 | app.kubernetes.io/component: api 37 | app.kubernetes.io/instance: rhobs 38 | app.kubernetes.io/name: observatorium-api 39 | app.kubernetes.io/part-of: rhobs 40 | -------------------------------------------------------------------------------- /resources/services/observatorium-api/staging/service-monitor-observatorium-api-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | creationTimestamp: null 5 | name: observatorium-api-service-monitor 6 | objects: 7 | - apiVersion: monitoring.coreos.com/v1 8 | kind: ServiceMonitor 9 | metadata: 10 | creationTimestamp: null 11 | labels: 12 | app.kubernetes.io/component: api 13 | app.kubernetes.io/instance: rhobs 14 | app.kubernetes.io/name: observatorium-api 15 | app.kubernetes.io/part-of: rhobs 16 | app.kubernetes.io/version: 9aada65247a07782465beb500323a0e18d7e3d05 17 | prometheus: app-sre 18 | name: rhobs-gateway 19 | namespace: openshift-customer-monitoring 20 | spec: 21 | endpoints: 22 | - interval: 30s 23 | path: /metrics 24 | port: internal 25 | - interval: 30s 26 | path: /metrics 27 | port: opa-ams-metrics 28 | - interval: 30s 29 | path: /metrics 30 | port: metrics 31 | namespaceSelector: 32 | matchNames: 33 | - rhobs-stage 34 | selector: 35 | matchLabels: 36 | app.kubernetes.io/component: api 37 | app.kubernetes.io/instance: rhobs 38 | app.kubernetes.io/name: observatorium-api 39 | app.kubernetes.io/part-of: rhobs 40 | -------------------------------------------------------------------------------- 
/resources/services/observatorium-tenants-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | annotations: 5 | qontract.recycle: "true" 6 | name: ${SECRET_NAME} 7 | objects: 8 | - apiVersion: v1 9 | kind: Secret 10 | metadata: 11 | name: ${SECRET_NAME} 12 | annotations: 13 | qontract.recycle: "true" 14 | stringData: 15 | client-id: ${CLIENT_ID} 16 | client-secret: ${CLIENT_SECRET} 17 | issuer-url: https://sso.redhat.com/auth/realms/redhat-external 18 | tenants.yaml: | 19 | ${TENANTS} 20 | type: Opaque 21 | parameters: 22 | - name: SECRET_NAME 23 | - name: CLIENT_ID 24 | - name: CLIENT_SECRET 25 | - name: TENANTS 26 | -------------------------------------------------------------------------------- /resources/services/servicemonitors/local/servicemonitors.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: ServiceMonitor 3 | metadata: 4 | creationTimestamp: null 5 | labels: 6 | app.kubernetes.io/component: monitoring 7 | app.kubernetes.io/created-by: thanos-operator 8 | app.kubernetes.io/instance: controller-manager-metrics 9 | app.kubernetes.io/managed-by: rhobs 10 | app.kubernetes.io/name: servicemonitor 11 | app.kubernetes.io/part-of: thanos-operator 12 | prometheus: app-sre 13 | name: thanos-operator-controller-manager-metrics 14 | namespace: openshift-customer-monitoring 15 | spec: 16 | endpoints: 17 | - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token 18 | path: /metrics 19 | port: https 20 | scheme: https 21 | tlsConfig: 22 | ca: {} 23 | cert: {} 24 | insecureSkipVerify: true 25 | namespaceSelector: 26 | matchNames: 27 | - rhobs-local 28 | selector: 29 | matchLabels: 30 | control-plane: controller-manager 31 | -------------------------------------------------------------------------------- /services/observatorium-tenants-template.jsonnet: -------------------------------------------------------------------------------- 1 | { 2 | apiVersion: 'v1', 3 | kind: 'Template', 4 | metadata: { 5 | name: '${SECRET_NAME}', 6 | annotations: { 7 | 'qontract.recycle': 'true', 8 | }, 9 | }, 10 | objects: [ 11 | { 12 | apiVersion: 'v1', 13 | kind: 'Secret', 14 | metadata+: { 15 | name: '${SECRET_NAME}', 16 | annotations: { 17 | 'qontract.recycle': 'true', 18 | }, 19 | }, 20 | type: 'Opaque', 21 | stringData: { 22 | 'client-id': '${CLIENT_ID}', 23 | 'client-secret': '${CLIENT_SECRET}', 24 | 'issuer-url': 'https://sso.redhat.com/auth/realms/redhat-external', 25 | 'tenants.yaml': '${TENANTS}', 26 | }, 27 | }, 28 | ], 29 | parameters: [ 30 | { name: 'SECRET_NAME' }, 31 | { name: 'CLIENT_ID' }, 32 | { name: 'CLIENT_SECRET' }, 33 | { name: 'TENANTS' }, 34 | ], 35 | } 36 | -------------------------------------------------------------------------------- /services/observatorium-traces-subscriptions-template.jsonnet: -------------------------------------------------------------------------------- 1 | local subscriptions = import 'observatorium-traces-subscriptions.libsonnet'; 2 | { 3 | apiVersion: 'template.openshift.io/v1', 4 | kind: 'Template', 5 | metadata: { 6 | name: 'observatorium-traces-subscriptions', 7 | }, 8 | objects: [ 9 | subscriptions.otelcol, 10 | subscriptions.jaeger, 11 | subscriptions.elasticsearch, 12 | ], 13 | parameters: [ 14 | { name: 'OPENTELEMETRY_OPERATOR_VERSION', value: '0.44.1-1' }, 15 | { name: 'OPENTELEMETRY_OPERATOR_NAMESPACE', value: 'openshift-operators' 
}, 16 | { name: 'OPENTELEMETRY_OPERATOR_SOURCE', value: 'redhat-operators' }, 17 | { name: 'JAEGER_OPERATOR_VERSION', value: '1.30.2' }, 18 | { name: 'JAEGER_OPERATOR_NAMESPACE', value: 'openshift-operators' }, 19 | { name: 'JAEGER_OPERATOR_SOURCE', value: 'redhat-operators' }, 20 | { name: 'ELASTICSEARCH_OPERATOR_VERSION', value: '5.4.1-24' }, 21 | { name: 'ELASTICSEARCH_OPERATOR_NAMESPACE', value: 'openshift-operators' }, 22 | { name: 'ELASTICSEARCH_OPERATOR_SOURCE', value: 'redhat-operators' }, 23 | ], 24 | } 25 | -------------------------------------------------------------------------------- /services/observatorium-traces-template.jsonnet: -------------------------------------------------------------------------------- 1 | local obs = import 'observatorium.libsonnet'; 2 | { 3 | apiVersion: 'template.openshift.io/v1', 4 | kind: 'Template', 5 | metadata: { 6 | name: 'observatorium-traces', 7 | }, 8 | objects: [ 9 | obs.elasticsearch, 10 | ] + [ 11 | obs.tracing.manifests[name] { 12 | metadata+: { 13 | }, 14 | } 15 | for name in std.objectFields(obs.tracing.manifests) 16 | ], 17 | parameters: [ 18 | { name: 'NAMESPACE', value: 'observatorium-traces' }, 19 | { name: 'OPENTELEMETRY_COLLECTOR_IMAGE', value: 'ghcr.io/open-telemetry/opentelemetry-collector-releases/opentelemetry-collector-contrib' }, 20 | { name: 'OPENTELEMETRY_COLLECTOR_IMAGE_TAG', value: '0.46.0' }, 21 | { name: 'ELASTICSEARCH_MEMORY', value: '4Gi' }, 22 | { name: 'ELASTICSEARCH_REQUEST_CPU', value: '200m' }, 23 | { name: 'ELASTICSEARCH_NAME', value: 'shared-es' }, 24 | { name: 'ELASTICSEARCH_NODE_COUNT', value: '1' }, 25 | { name: 'ELASTICSEARCH_REDUNDANCY_POLICY', value: 'ZeroRedundancy' }, 26 | ], 27 | } 28 | -------------------------------------------------------------------------------- /services/prometheus/remote_write_proxy.conf: -------------------------------------------------------------------------------- 1 | daemon off; 2 | worker_processes 1; 3 | error_log /dev/stderr; 4 | pid /tmp/nginx.pid; 5 | 6 | events { 7 | worker_connections 1024; 8 | } 9 | 10 | http { 11 | log_format main '$remote_addr - $remote_user [$time_local] "$request" ' 12 | '$status $body_bytes_sent "$http_referer" ' 13 | '"$http_user_agent" "$http_x_forwarded_for"'; 14 | 15 | server { 16 | listen *:%(listen_port)d; 17 | server_name _; 18 | access_log /dev/stdout main; 19 | error_log /dev/stderr; 20 | 21 | location / { 22 | proxy_set_header THANOS-TENANT %(thanos_tenant)s; 23 | proxy_pass %(forward_host)s; 24 | } 25 | } 26 | } 27 | -------------------------------------------------------------------------------- /services_go/observatorium/assets/store-auto-shard-relabel-configMap.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Kubernetes replicas are named with the following convention "<statefulset-name>-<ordinal>". 4 | # This parameter expansion removes all characters until the last hyphen, capturing only the ordinal. 5 | export ORDINAL_INDEX=${HOSTNAME##*-} 6 | # This parameter expansion removes all characters after the last hyphen, capturing only the statefulset name.
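# For example, with a hypothetical pod name HOSTNAME=observatorium-thanos-store-0, the two
# expansions yield ORDINAL_INDEX=0 and STATEFULSET_NAME=observatorium-thanos-store.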
7 | export STATEFULSET_NAME="${HOSTNAME%-*}" 8 | export THANOS_STORE_REPLICAS=$(oc get statefulset ${STATEFULSET_NAME} -n ${NAMESPACE} -o=jsonpath='{.status.replicas}') 9 | 10 | # Logging parameters 11 | echo "generating store hashmod config with ORDINAL_INDEX=${ORDINAL_INDEX} STATEFULSET_NAME=${STATEFULSET_NAME} HOSTNAME=${HOSTNAME} NAMESPACE=${NAMESPACE} THANOS_STORE_REPLICAS=${THANOS_STORE_REPLICAS}" 12 | 13 | cat <<EOF >/tmp/config/hashmod-config.yaml 14 | - action: hashmod 15 | source_labels: 16 | - __block_id 17 | target_label: shard 18 | modulus: ${THANOS_STORE_REPLICAS} 19 | - action: keep 20 | source_labels: 21 | - shard 22 | regex: ${ORDINAL_INDEX} 23 | EOF 24 | -------------------------------------------------------------------------------- /services_go/observatorium/observatorium.go: -------------------------------------------------------------------------------- 1 | package observatorium 2 | 3 | import ( 4 | "github.com/bwplotka/mimic" 5 | ) 6 | 7 | type Observatorium struct { 8 | Cluster string 9 | Instance string // Instance is the name of the observatorium instance 10 | // MetricsInstances is the list of metrics instances in the observatorium instance 11 | // These are the different deployment units to which our tenants are mapped (e.g. default, rhel, telemeter) 12 | MetricsInstances ObservatoriumMetrics 13 | API ObservatoriumAPI 14 | } 15 | 16 | // Manifests generates the manifests for the instance of observatorium. 17 | func (o *Observatorium) Manifests(generator *mimic.Generator) { 18 | o.MetricsInstances.Manifests(generator.With(o.Cluster, o.Instance)) 19 | o.API.Manifests(generator.With(o.Cluster, o.Instance)) 20 | } 21 | -------------------------------------------------------------------------------- /services_go/services.go: -------------------------------------------------------------------------------- 1 | package services 2 | 3 | import ( 4 | "github.com/bwplotka/mimic" 5 | "github.com/rhobs/configuration/services_go/instances/rhobs" 6 | ) 7 | 8 | // Generate generates the manifests for all observatorium instances.
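// A hypothetical caller (for illustration only; the actual entrypoint wiring
// lives in this repo's mimic.go) would look roughly like:
//
//	gen := mimic.New()
//	defer gen.Generate()
//	services.Generate(gen)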
9 | func Generate(gen *mimic.Generator) { 10 | rhobsConfigs := rhobs.ClusterConfigs() 11 | for _, obsCfg := range rhobsConfigs { 12 | obsCfg.Manifests(gen) 13 | } 14 | } 15 | -------------------------------------------------------------------------------- /synchronize.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -e 4 | set -x 5 | set -o pipefail 6 | 7 | tmpdir=$(mktemp -d -t app-interface-XXXXXXXXXX) 8 | echo $tmpdir 9 | 10 | git clone git@gitlab.cee.redhat.com:service/app-interface.git $tmpdir 11 | 12 | cp -r resources/* $tmpdir/resources/ 13 | 14 | cd $tmpdir 15 | 16 | echo -n "Enter a new branch name and press [ENTER] (will be prefixed with synchronize_): " 17 | read branchname 18 | 19 | echo -n "Enter an environment name and press [ENTER] (e.g. stage, production): " 20 | read environment 21 | 22 | branch="synchronize_${branchname}_${environment}" 23 | 24 | git checkout -b $branch 25 | 26 | git add resources/*-${environment}.* 27 | 28 | git commit 29 | 30 | echo -n "Enter your fork URL to push to and press [ENTER] (e.g.: git@gitlab.cee.redhat.com:USERNAME/app-interface.git): " 31 | read fork 32 | 33 | git push $fork $branch 34 | 35 | rm -rf $tmpdir 36 | -------------------------------------------------------------------------------- /tests/ci/env/dex.test.ci.env: -------------------------------------------------------------------------------- 1 | DEX_CPU_REQUEST=30m 2 | DEX_CPU_LIMITS=50m 3 | DEX_MEMORY_REQUEST=100Mi 4 | DEX_MEMORY_LIMITS=100Mi 5 | DEX_STORAGE=0.25Gi 6 | -------------------------------------------------------------------------------- /tests/ci/env/logging.test.ci.env: -------------------------------------------------------------------------------- 1 | NAMESPACE=observatorium-tools 2 | ACCESS_KEY_ID=minio 3 | SECRET_ACCESS_KEY=minio123 4 | S3_BUCKET_NAME=loki 5 | S3_BUCKET_ENDPOINT=http://minio.minio.svc.cluster.local:9000 6 | S3_BUCKET_REGION="" 7 | LOKI_SIZE=1x.extra-small 8 | LOKI_STORAGE_CLASS=kubevirt-hostpath-provisioner 9 | -------------------------------------------------------------------------------- /tests/ci/env/minio.test.ci.env: -------------------------------------------------------------------------------- 1 | MINIO_CPU_REQUEST=50m 2 | MINIO_CPU_LIMITS=50m 3 | MINIO_MEMORY_REQUEST=200Mi 4 | MINIO_MEMORY_LIMITS=200Mi 5 | MINIO_STORAGE=0.25Gi 6 | -------------------------------------------------------------------------------- /tests/ci/env/observatorium-metric-federation-rule.test.ci.env: -------------------------------------------------------------------------------- 1 | STORAGE_CLASS=kubevirt-hostpath-provisioner 2 | THANOS_RULER_CPU_LIMIT=50m 3 | THANOS_RULER_CPU_REQUEST=25m 4 | THANOS_RULER_MEMORY_LIMIT=200Mi 5 | THANOS_RULER_MEMORY_REQUEST=200Mi 6 | THANOS_S3_SECRET=thanos-test-s3 7 | THANOS_QUERIER_NAMESPACE=observatorium-metrics 8 | JAEGER_AGENT_IMAGE=jaegertracing/jaeger-agent 9 | SERVICE_ACCOUNT_NAME=observatorium 10 | THANOS_RULER_PVC_REQUEST=0.25Gi 11 | THANOS_RULER_REPLICAS=1 12 | -------------------------------------------------------------------------------- /tests/ci/env/observatorium-parca.test.ci.env: -------------------------------------------------------------------------------- 1 | IMAGE=ghcr.io/parca-dev/parca 2 | PARCA_CPU_REQUEST=30m 3 | PARCA_MEMORY_REQUEST=500Mi 4 | PARCA_CPU_LIMITS=50m 5 | PARCA_MEMORY_LIMITS=500Mi 6 | ACCESS_KEY_ID=minio 7 | SECRET_ACCESS_KEY=minio123 8 | S3_BUCKET_NAME=parca 9 | S3_BUCKET_ENDPOINT=minio.minio.svc.cluster.local:9000 10 |
S3_BUCKET_REGION=eu-central-1 11 | SD_NAMESPACE_LIST='["observatorium-metrics"]' 12 | -------------------------------------------------------------------------------- /tests/ci/env/observatorium.test.ci.env: -------------------------------------------------------------------------------- 1 | SERVICE_ACCOUNT_NAME=default 2 | JAEGER_AGENT_IMAGE=jaegertracing/jaeger-agent 3 | RULES_OBJSTORE_S3_SECRET=rules-objstore-s3 4 | MANAGED_TENANTS=rhobs 5 | OBSERVATORIUM_URL=http://observatorium-observatorium-api.observatorium.svc.cluster.local:8080 6 | 7 | GUBERNATOR_REPLICAS=1 8 | GUBERNATOR_CPU_LIMIT=50m 9 | GUBERNATOR_CPU_REQUEST=25m 10 | GUBERNATOR_MEMORY_LIMIT=100Mi 11 | GUBERNATOR_MEMORY_REQUEST=50Mi 12 | OBSERVATORIUM_API_REPLICAS=1 13 | OBSERVATORIUM_API_CPU_LIMIT=50m 14 | OBSERVATORIUM_API_CPU_REQUEST=25m 15 | OBSERVATORIUM_API_MEMORY_LIMIT=100Mi 16 | OBSERVATORIUM_API_MEMORY_REQUEST=50Mi 17 | UP_REPLICAS=1 18 | UP_CPU_REQUEST=25m 19 | UP_CPU_LIMIT=50m 20 | UP_MEMORY_REQUEST=50Mi 21 | UP_MEMORY_LIMIT=100Mi 22 | MEMCACHED_CPU_LIMIT=50m 23 | MEMCACHED_CPU_REQUEST=25m 24 | MEMCACHED_EXPORTER_CPU_LIMIT=50m 25 | MEMCACHED_EXPORTER_CPU_REQUEST=25m 26 | MEMCACHED_EXPORTER_MEMORY_LIMIT=100Mi 27 | MEMCACHED_MEMORY_LIMIT=100Mi 28 | MEMCACHED_MEMORY_REQUEST=50Mi 29 | OAUTH_PROXY_CPU_LIMITS=50m 30 | OAUTH_PROXY_CPU_REQUEST=25m 31 | OAUTH_PROXY_MEMORY_LIMITS=100Mi 32 | OAUTH_PROXY_MEMORY_REQUEST=50Mi 33 | OPA_AMS_CPU_LIMIT=50m 34 | OPA_AMS_CPU_REQUEST=25m 35 | OPA_AMS_MEMORY_LIMIT=100Mi 36 | OPA_AMS_MEMORY_REQUEST=50Mi 37 | -------------------------------------------------------------------------------- /tests/ci/env/rhelemeter.test.ci.env: -------------------------------------------------------------------------------- 1 | RHELEMETER_SERVER_CPU_LIMIT=50m 2 | RHELEMETER_SERVER_CPU_REQUEST=25m 3 | RHELEMETER_SERVER_MEMORY_LIMIT=100Mi 4 | RHELEMETER_SERVER_MEMORY_REQUEST=50Mi 5 | RHELEMETER_FORWARD_URL=http://observatorium-observatorium-api.observatorium.svc.cluster.local:8080/api/metrics/v1/test/api/v1/receive 6 | RHELEMETER_OIDC_ISSUER=http://dex.dex.svc.cluster.local:5556/dex 7 | RHELEMETER_CLIENT_ID=test 8 | RHELEMETER_TENANT_ID=test 9 | RHELEMETER_CLIENT_SECRET=ZXhhbXBsZS1hcHAtc2VjcmV0 10 | RHELEMETER_CLIENT_INFO_PSK=ZXhhbXBsZS1hcHAtc2VjcmV0 11 | -------------------------------------------------------------------------------- /tests/ci/env/telemeter.ci.env: -------------------------------------------------------------------------------- 1 | SERVICE_ACCOUNT_NAME=default 2 | TELEMETER_FORWARD_URL=http://observatorium-observatorium-api.observatorium.svc.cluster.local:8080/api/metrics/v1/telemeter/api/v1/receive 3 | 4 | 5 | REPLICAS=1 6 | MEMCACHED_CPU_LIMIT=30m 7 | MEMCACHED_CPU_REQUEST=25m 8 | MEMCACHED_EXPORTER_CPU_LIMIT=50m 9 | MEMCACHED_EXPORTER_CPU_REQUEST=25m 10 | MEMCACHED_EXPORTER_MEMORY_LIMIT=100Mi 11 | MEMCACHED_MEMORY_LIMIT=100Mi 12 | MEMCACHED_MEMORY_REQUEST=50Mi 13 | OAUTH_PROXY_CPU_REQUEST=25m 14 | OAUTH_PROXY_MEMORY_REQUEST=50Mi 15 | OAUTH_PROXY_CPU_LIMITS=50m 16 | OAUTH_PROXY_MEMORY_LIMITS=100Mi 17 | TELEMETER_SERVER_CPU_LIMIT=100m 18 | TELEMETER_SERVER_CPU_REQUEST=25m 19 | TELEMETER_SERVER_MEMORY_LIMIT=100Mi 20 | TELEMETER_SERVER_MEMORY_REQUEST=50Mi 21 | -------------------------------------------------------------------------------- /tests/ci/manifests/rbac.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | data: 3 | rbac.yaml: |- 4 | "roleBindings": 5 | - "name": "test" 6 | "roles": 7 | - "read-write" 8 |
"subjects": 9 | - "kind": "user" 10 | "name": "admin@example.com" 11 | "roles": 12 | - "name": "read-write" 13 | "permissions": 14 | - "read" 15 | - "write" 16 | "resources": 17 | - "logs" 18 | - "metrics" 19 | "tenants": 20 | - "test" 21 | kind: ConfigMap 22 | metadata: 23 | labels: 24 | app.kubernetes.io/component: api 25 | app.kubernetes.io/instance: observatorium 26 | app.kubernetes.io/name: observatorium-api 27 | app.kubernetes.io/part-of: observatorium 28 | name: observatorium-observatorium-api 29 | namespace: observatorium 30 | -------------------------------------------------------------------------------- /tests/ci/manifests/test-tenant.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | labels: 5 | app.kubernetes.io/component: api 6 | app.kubernetes.io/instance: observatorium 7 | app.kubernetes.io/name: observatorium-api 8 | app.kubernetes.io/part-of: observatorium 9 | name: observatorium-observatorium-api 10 | namespace: observatorium 11 | stringData: 12 | client-id: test 13 | client-secret: ZXhhbXBsZS1hcHAtc2VjcmV0 14 | issuer-url: http://dex.dex.svc.cluster.local:5556/dex 15 | tenants.yaml: |- 16 | "tenants": 17 | - "id": "1610b0c3-c509-4592-a256-a1871353dbfa" 18 | "name": "test" 19 | "oidc": 20 | "clientID": "test" 21 | "clientSecret": "ZXhhbXBsZS1hcHAtc2VjcmV0" 22 | "issuerURL": "http://dex.dex.svc.cluster.local:5556/dex" 23 | "usernameClaim": "email" 24 | "rateLimits": 25 | - "endpoint": "/api/metrics/v1/.+/api/v1/receive" 26 | "limit": 1000 27 | "window": "1s" 28 | - "endpoint": "/api/logs/v1/.*" 29 | "limit": 1000 30 | "window": "1s" 31 | -------------------------------------------------------------------------------- /tests/ci/rhobsci.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhobs/configuration/6d2b136e48c291c1a415394fed40219188620665/tests/ci/rhobsci.png -------------------------------------------------------------------------------- /tests/deploy/env/logging.test.env: -------------------------------------------------------------------------------- 1 | NAMESPACE=observatorium-tools 2 | ACCESS_KEY_ID=minio 3 | SECRET_ACCESS_KEY=minio123 4 | S3_BUCKET_NAME=loki 5 | S3_BUCKET_ENDPOINT=http://minio.minio.svc.cluster.local:9000 6 | S3_BUCKET_REGION="" 7 | LOKI_SIZE=1x.extra-small 8 | LOKI_STORAGE_CLASS=gp2-csi 9 | -------------------------------------------------------------------------------- /tests/deploy/env/observatorium-jaeger.test.env: -------------------------------------------------------------------------------- 1 | SERVICE_ACCOUNT_NAME=observatorium 2 | STORAGE_CLASS=gp2-csi 3 | 4 | JAEGER_CPU_REQUEST=100m 5 | JAEGER_MEMORY_REQUEST=100Mi 6 | JAEGER_CPU_LIMITS=200m 7 | JAEGER_MEMORY_LIMITS=200Mi 8 | OAUTH_PROXY_CPU_REQUEST=100m 9 | OAUTH_PROXY_MEMORY_REQUEST=100Mi 10 | OAUTH_PROXY_CPU_LIMITS=200m 11 | OAUTH_PROXY_MEMORY_LIMITS=200Mi 12 | -------------------------------------------------------------------------------- /tests/deploy/env/observatorium-metric-federation-rule.test.env: -------------------------------------------------------------------------------- 1 | STORAGE_CLASS=gp2-csi 2 | THANOS_QUERIER_NAMESPACE=observatorium-metrics 3 | JAEGER_AGENT_IMAGE=jaegertracing/jaeger-agent 4 | THANOS_S3_SECRET=thanos-test-s3 5 | SERVICE_ACCOUNT_NAME=observatorium 6 | 7 | THANOS_RULER_CPU_LIMIT=200m 8 | THANOS_RULER_CPU_REQUEST=100m 9 | THANOS_RULER_MEMORY_LIMIT=200Mi 10 | THANOS_RULER_MEMORY_REQUEST=100Mi 
11 | -------------------------------------------------------------------------------- /tests/deploy/env/observatorium-parca.test.env: -------------------------------------------------------------------------------- 1 | IMAGE=ghcr.io/parca-dev/parca 2 | PARCA_CPU_REQUEST=100m 3 | PARCA_MEMORY_REQUEST=500Mi 4 | PARCA_CPU_LIMITS=200m 5 | PARCA_MEMORY_LIMITS=1Gi 6 | ACCESS_KEY_ID=minio 7 | SECRET_ACCESS_KEY=minio123 8 | S3_BUCKET_NAME=parca 9 | S3_BUCKET_ENDPOINT=minio.minio.svc.cluster.local:9000 10 | S3_BUCKET_REGION=eu-central-1 11 | SD_NAMESPACE_LIST='["observatorium-metrics"]' 12 | -------------------------------------------------------------------------------- /tests/deploy/env/observatorium.test.env: -------------------------------------------------------------------------------- 1 | SERVICE_ACCOUNT_NAME=default 2 | JAEGER_AGENT_IMAGE=jaegertracing/jaeger-agent 3 | RULES_OBJSTORE_S3_SECRET=rules-objstore-s3 4 | MANAGED_TENANTS=rhobs 5 | OBSERVATORIUM_URL=http://observatorium-observatorium-api.observatorium.svc.cluster.local:8080 6 | 7 | GUBERNATOR_REPLICAS=1 8 | OBSERVATORIUM_API_REPLICAS=1 9 | UP_REPLICAS=1 10 | 11 | OBSERVATORIUM_API_CPU_LIMIT=100m 12 | OBSERVATORIUM_API_MEMORY_LIMIT=100Mi 13 | OBSERVATORIUM_API_MEMORY_REQUEST=100Mi 14 | UP_CPU_LIMIT=100m 15 | UP_MEMORY_REQUEST=100Mi 16 | UP_MEMORY_LIMIT=100Mi 17 | -------------------------------------------------------------------------------- /tests/deploy/env/rhelemeter.test.env: -------------------------------------------------------------------------------- 1 | RHELEMETER_SERVER_CPU_LIMIT=200m 2 | RHELEMETER_SERVER_CPU_REQUEST=100m 3 | RHELEMETER_SERVER_MEMORY_LIMIT=200Mi 4 | RHELEMETER_SERVER_MEMORY_REQUEST=100Mi 5 | RHELEMETER_FORWARD_URL=http://observatorium-observatorium-api.observatorium.svc.cluster.local:8080/api/metrics/v1/test/api/v1/receive 6 | RHELEMETER_OIDC_ISSUER=http://dex.dex.svc.cluster.local:5556/dex 7 | RHELEMETER_CLIENT_ID=test 8 | RHELEMETER_TENANT_ID=test 9 | RHELEMETER_CLIENT_SECRET=ZXhhbXBsZS1hcHAtc2VjcmV0 10 | RHELEMETER_CLIENT_INFO_PSK=ZXhhbXBsZS1hcHAtc2VjcmV0 11 | -------------------------------------------------------------------------------- /tests/deploy/env/telemeter.test.env: -------------------------------------------------------------------------------- 1 | SERVICE_ACCOUNT_NAME=default 2 | TELEMETER_FORWARD_URL=http://observatorium-observatorium-api.observatorium.svc.cluster.local:8080/api/metrics/v1/telemeter/api/v1/receive 3 | 4 | REPLICAS=1 5 | 6 | MEMCACHED_CPU_LIMIT=200m 7 | MEMCACHED_CPU_REQUEST=100m 8 | MEMCACHED_EXPORTER_CPU_LIMIT=200m 9 | MEMCACHED_EXPORTER_CPU_REQUEST=100m 10 | MEMCACHED_EXPORTER_MEMORY_LIMIT=200Mi 11 | MEMCACHED_MEMORY_LIMIT=200Mi 12 | MEMCACHED_MEMORY_REQUEST=100Mi 13 | OAUTH_PROXY_CPU_REQUEST=100m 14 | OAUTH_PROXY_MEMORY_REQUEST=100Mi 15 | OAUTH_PROXY_CPU_LIMITS=200m 16 | OAUTH_PROXY_MEMORY_LIMITS=200Mi 17 | TELEMETER_SERVER_CPU_LIMIT=200m 18 | TELEMETER_SERVER_CPU_REQUEST=100m 19 | TELEMETER_SERVER_MEMORY_LIMIT=200Mi 20 | TELEMETER_SERVER_MEMORY_REQUEST=100Mi 21 | -------------------------------------------------------------------------------- /tests/deploy/manifests/clusterlogforwader.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: logging.openshift.io/v1 2 | kind: ClusterLogForwarder 3 | metadata: 4 | name: instance 5 | namespace: openshift-logging 6 | spec: 7 | inputs: 8 | - application: 9 | namespaces: 10 | - observatorium-metrics 11 | - telemeter 12 | - observatorium-logs 13 | - observatorium 14 | name: 
send-observatorium-app-logs 15 | outputs: 16 | - name: loki-app 17 | type: loki 18 | url: https://observatorium-lokistack-gateway-http.observatorium-tools.svc.cluster.local:8080/api/logs/v1/application 19 | pipelines: 20 | - inputRefs: 21 | - send-observatorium-app-logs 22 | name: observatorium-app-logs 23 | outputRefs: 24 | - loki-app 25 | -------------------------------------------------------------------------------- /tests/deploy/manifests/clusterlogging.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: logging.openshift.io/v1 2 | kind: ClusterLogging 3 | metadata: 4 | name: instance 5 | namespace: openshift-logging 6 | spec: 7 | collection: 8 | logs: 9 | fluentd: {} 10 | type: vector 11 | managementState: Managed 12 | -------------------------------------------------------------------------------- /tests/deploy/manifests/logging-operator.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: openshift-logging 5 | annotations: 6 | openshift.io/node-selector: "" 7 | labels: 8 | openshift.io/cluster-monitoring: "true" 9 | --- 10 | apiVersion: operators.coreos.com/v1 11 | kind: OperatorGroup 12 | metadata: 13 | name: cluster-logging 14 | namespace: openshift-logging 15 | spec: 16 | targetNamespaces: 17 | - openshift-logging 18 | --- 19 | apiVersion: operators.coreos.com/v1alpha1 20 | kind: Subscription 21 | metadata: 22 | name: cluster-logging 23 | namespace: openshift-logging 24 | spec: 25 | channel: "stable-5.6" 26 | name: cluster-logging 27 | source: redhat-operators 28 | sourceNamespace: openshift-marketplace 29 | 30 | -------------------------------------------------------------------------------- /tests/deploy/manifests/loki-operator.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Namespace 3 | metadata: 4 | name: openshift-operators-redhat 5 | annotations: 6 | openshift.io/node-selector: "" 7 | labels: 8 | openshift.io/cluster-monitoring: "true" 9 | --- 10 | apiVersion: operators.coreos.com/v1 11 | kind: OperatorGroup 12 | metadata: 13 | name: operator-group 14 | namespace: openshift-operators-redhat 15 | spec: 16 | targetNamespaces: 17 | --- 18 | apiVersion: operators.coreos.com/v1alpha1 19 | kind: Subscription 20 | metadata: 21 | name: loki-operator 22 | namespace: openshift-operators-redhat 23 | spec: 24 | channel: stable-5.6 25 | installPlanApproval: Automatic 26 | name: loki-operator 27 | source: redhat-operators 28 | sourceNamespace: openshift-marketplace 29 | -------------------------------------------------------------------------------- /tests/deploy/manifests/observatorium-alertmanager-config-secret.yaml: -------------------------------------------------------------------------------- 1 | kind: Secret 2 | apiVersion: v1 3 | metadata: 4 | name: alertmanager-config 5 | data: 6 | alertmanager.yaml: >- 7 | Cmdsb2JhbDoKICByZXNvbHZlX3RpbWVvdXQ6IDVtCiAgc2xhY2tfYXBpX3VybDogaHR0cHM6Ly9ob29rcy5zbGFjay5jb20vc2VydmljZXMvVDAyN0YzR0FKL0JGWVBCNTQwWi8xUlU0U2hMZmd4ZEpvMUNCTVpXaXgzRHYKcmVjZWl2ZXJzOgotIG5hbWU6IGRlZmF1bHQKcm91dGU6CiAgZ3JvdXBfaW50ZXJ2YWw6IDVtCiAgZ3JvdXBfd2FpdDogMzBzCiAgcmVjZWl2ZXI6IGRlZmF1bHQKICByZXBlYXRfaW50ZXJ2YWw6IDEyaAogIHJvdXRlczoKICAtIG1hdGNoOgogICAgICB0ZW5hbnRfaWQ6IDBmYzJiMDBlLTIwMWItNGMxNy1iOWYyLTE5ZDkxYWRjNGZkMgp0ZW1wbGF0ZXM6CiAgLSAnKi50bXBsJwo= 8 | type: Opaque 9 | -------------------------------------------------------------------------------- 
/tests/deploy/manifests/observatorium-cluster-role-binding.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRoleBinding 3 | metadata: 4 | name: observatorium 5 | subjects: 6 | - kind: ServiceAccount 7 | name: observatorium 8 | namespace: observatorium-metrics 9 | roleRef: 10 | kind: ClusterRole 11 | name: observatorium 12 | apiGroup: rbac.authorization.k8s.io 13 | -------------------------------------------------------------------------------- /tests/deploy/manifests/observatorium-cluster-role.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: rbac.authorization.k8s.io/v1 2 | kind: ClusterRole 3 | metadata: 4 | name: observatorium 5 | rules: 6 | - apiGroups: 7 | - authentication.k8s.io 8 | resources: 9 | - tokenreviews 10 | verbs: 11 | - create 12 | - apiGroups: 13 | - authorization.k8s.io 14 | resources: 15 | - subjectaccessreviews 16 | verbs: 17 | - create 18 | - apiGroups: 19 | - "" 20 | resourceNames: 21 | - observatorium-metrics 22 | resources: 23 | - namespaces 24 | verbs: 25 | - get 26 | -------------------------------------------------------------------------------- /tests/deploy/manifests/observatorium-logs-secret.yaml: -------------------------------------------------------------------------------- 1 | kind: Secret 2 | apiVersion: v1 3 | metadata: 4 | name: rules-objstore-s3 5 | data: 6 | aws_access_key_id: bWluaW8= 7 | aws_region: ZXUtY2VudHJhbC0x 8 | aws_secret_access_key: bWluaW8xMjM= 9 | bucket: cnVsZXM= 10 | endpoint: bWluaW8ubWluaW8uc3ZjLmNsdXN0ZXIubG9jYWw6OTAwMA== 11 | type: Opaque 12 | --- 13 | kind: Secret 14 | apiVersion: v1 15 | metadata: 16 | name: observatorium-logs-testing-s3 17 | data: 18 | aws_access_key_id: bWluaW8= 19 | aws_region: ZXUtY2VudHJhbC0x 20 | aws_secret_access_key: bWluaW8xMjM= 21 | bucket: bG9raQ== 22 | endpoint: bWluaW8ubWluaW8uc3ZjLmNsdXN0ZXIubG9jYWw6OTAwMA== 23 | type: Opaque 24 | -------------------------------------------------------------------------------- /tests/deploy/manifests/observatorium-metrics-thanos-objectstorage-secret-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | name: minio-secret 5 | objects: 6 | - apiVersion: v1 7 | kind: Secret 8 | metadata: 9 | name: ${THANOS_CONFIG_SECRET} 10 | namespace: ${OBSERVATORIUM_METRICS_NAMESPACE} 11 | stringData: 12 | thanos.yaml: | 13 | type: s3 14 | config: 15 | bucket: thanos 16 | endpoint: minio.${MINIO_NAMESPACE}.svc.cluster.local:9000 17 | insecure: true 18 | access_key: minio 19 | secret_key: minio123 20 | type: Opaque 21 | - apiVersion: v1 22 | kind: Secret 23 | metadata: 24 | name: ${THANOS_S3_SECRET} 25 | namespace: ${OBSERVATORIUM_METRICS_NAMESPACE} 26 | data: 27 | aws_access_key_id: bWluaW8= 28 | aws_secret_access_key: bWluaW8xMjM= 29 | type: Opaque 30 | parameters: 31 | - name: MINIO_NAMESPACE 32 | value: minio 33 | - name: OBSERVATORIUM_METRICS_NAMESPACE 34 | value: observatorium-metrics 35 | - name: THANOS_CONFIG_SECRET 36 | value: thanos-objectstorage 37 | - name: THANOS_S3_SECRET 38 | value: thanos-test-s3 39 | -------------------------------------------------------------------------------- /tests/deploy/manifests/observatorium-parca-secret.yaml: -------------------------------------------------------------------------------- 1 | kind: Secret 2 | apiVersion: v1 3 | metadata: 4 | name: conprof-proxy 5 | data: 
6 | session_secret: NjU2MDlmZTFhNWQ0NDgwMDliZTE3YjYxYTVlNjg5OGU= 7 | type: Opaque 8 | -------------------------------------------------------------------------------- /tests/deploy/manifests/observatorium-rhobs-tenant-secret.yaml: -------------------------------------------------------------------------------- 1 | kind: Secret 2 | apiVersion: v1 3 | metadata: 4 | name: rhobs-tenant 5 | data: 6 | client_id: b2JzZXJ2YXRvcml1bS1yaG9icy10ZXN0aW5n 7 | client_secret: ZjA3OTIxOTctMmNjZS00NTZkLTlmYTItZTM4YTliMTI5NjVh 8 | type: Opaque 9 | -------------------------------------------------------------------------------- /tests/deploy/manifests/observatorium-rules-objstore-secret.yaml: -------------------------------------------------------------------------------- 1 | kind: Secret 2 | apiVersion: v1 3 | metadata: 4 | name: rules-objstore 5 | data: 6 | objstore.yaml: >- 7 | dHlwZTogUzMKY29uZmlnOgogIGJ1Y2tldDogInRlbGVtZXRlci1ydWxlcy1vYmpzdG9yZS10ZXN0aW5nIgogIGVuZHBvaW50OiAiczMudXMtZWFzdC0xLmFtYXpvbmF3cy5jb20iCiAgcmVnaW9uOiAidXMtZWFzdC0xIgogIHRyYWNlOgogICAgZW5hYmxlOiBmYWxzZQo= 8 | type: Opaque 9 | --- 10 | kind: Secret 11 | apiVersion: v1 12 | metadata: 13 | name: rules-objstore-s3 14 | data: 15 | aws_access_key_id: bWluaW8= 16 | aws_region: ZXUtY2VudHJhbC0x 17 | aws_secret_access_key: bWluaW8xMjM= 18 | bucket: cnVsZXM= 19 | endpoint: bWluaW8ubWluaW8uc3ZjLmNsdXN0ZXIubG9jYWw6OTAwMA== 20 | type: Opaque 21 | -------------------------------------------------------------------------------- /tests/deploy/manifests/observatorium-service-account.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: ServiceAccount 3 | metadata: 4 | name: observatorium 5 | -------------------------------------------------------------------------------- /tests/deploy/manifests/observatorium-tools-network-policy.yaml: -------------------------------------------------------------------------------- 1 | kind: NetworkPolicy 2 | apiVersion: networking.k8s.io/v1 3 | metadata: 4 | name: allow-from-ingress-namespace 5 | namespace: observatorium-tools 6 | spec: 7 | podSelector: {} 8 | ingress: 9 | - from: 10 | - namespaceSelector: 11 | matchLabels: 12 | network.openshift.io/policy-group: ingress 13 | policyTypes: 14 | - Ingress 15 | --- 16 | kind: NetworkPolicy 17 | apiVersion: networking.k8s.io/v1 18 | metadata: 19 | name: allow-from-openshift-logging-namespace 20 | namespace: observatorium-tools 21 | spec: 22 | podSelector: {} 23 | ingress: 24 | - from: 25 | - namespaceSelector: 26 | matchLabels: 27 | kubernetes.io/metadata.name: openshift-logging 28 | policyTypes: 29 | - Ingress 30 | --- 31 | kind: NetworkPolicy 32 | apiVersion: networking.k8s.io/v1 33 | metadata: 34 | name: allow-from-same-namespace 35 | namespace: observatorium-tools 36 | spec: 37 | podSelector: {} 38 | ingress: 39 | - from: 40 | - podSelector: {} 41 | policyTypes: 42 | - Ingress 43 | 44 | -------------------------------------------------------------------------------- /tests/deploy/manifests/rhelemeter_certs/ca.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIBDzCBtgIJANecoFgWJpRZMAoGCCqGSM49BAMCMA8xDTALBgNVBAMMBHRlc3Qw 3 | IBcNMjMwNzE5MDc0ODM2WhgPMzAwOTAzMTIwNzQ4MzZaMA8xDTALBgNVBAMMBHRl 4 | c3QwWTATBgcqhkjOPQIBBggqhkjOPQMBBwNCAARYZDE4Kz0ys2KvRo7p3e6/P3Yo 5 | eSkDXJ1DpVWH5+XemuAriGE8pMwij7yTsbmUHHGlnMZNh0Uc+Uiplb5rbeaSMAoG 6 | CCqGSM49BAMCA0gAMEUCIBYKEb0GBppTsRXrVGJqfrzcgqQhpEXWwhg9LQPfiRce 7 | 
AiEAtpGaoRW5KYA30uNZNabK0U9rfrORYLZhN2ovhpm3+Ko= 8 | -----END CERTIFICATE----- 9 | -------------------------------------------------------------------------------- /tests/deploy/manifests/rhelemeter_certs/tls.crt: -------------------------------------------------------------------------------- 1 | -----BEGIN CERTIFICATE----- 2 | MIIBDzCBtgIJANecoFgWJpRZMAoGCCqGSM49BAMCMA8xDTALBgNVBAMMBHRlc3Qw 3 | IBcNMjMwNzE5MDc0ODM2WhgPMzAwOTAzMTIwNzQ4MzZaMA8xDTALBgNVBAMMBHRl 4 | c3QwWTATBgcqhkjOPQIBBggqhkjOPQMBBwNCAARYZDE4Kz0ys2KvRo7p3e6/P3Yo 5 | eSkDXJ1DpVWH5+XemuAriGE8pMwij7yTsbmUHHGlnMZNh0Uc+Uiplb5rbeaSMAoG 6 | CCqGSM49BAMCA0gAMEUCIBYKEb0GBppTsRXrVGJqfrzcgqQhpEXWwhg9LQPfiRce 7 | AiEAtpGaoRW5KYA30uNZNabK0U9rfrORYLZhN2ovhpm3+Ko= 8 | -----END CERTIFICATE----- 9 | -------------------------------------------------------------------------------- /tests/deploy/manifests/rhelemeter_certs/tls.key: -------------------------------------------------------------------------------- 1 | -----BEGIN EC PRIVATE KEY----- 2 | MHcCAQEEIO5yfP9d0RcEzTTeM732EWnGEqWYlvu+JaOEpRXYsHaloAoGCCqGSM49 3 | AwEHoUQDQgAEWGQxOCs9MrNir0aO6d3uvz92KHkpA1ydQ6VVh+fl3prgK4hhPKTM 4 | Io+8k7G5lBxxpZzGTYdFHPlIqZW+a23mkg== 5 | -----END EC PRIVATE KEY----- 6 | -------------------------------------------------------------------------------- /tests/deploy/manifests/telemeter-token-refersher-oidc-secret.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | kind: Secret 3 | metadata: 4 | labels: 5 | k8s-app: telemeter-server 6 | name: token-refresher-oidc 7 | type: Opaque 8 | stringData: 9 | audience: test 10 | clientID: test 11 | clientSecret: ZXhhbXBsZS1hcHAtc2VjcmV0 12 | issuerURL: http://dex.dex.svc.cluster.local:5556/dex 13 | -------------------------------------------------------------------------------- /tests/deploy/testdata/client-info.json: -------------------------------------------------------------------------------- 1 | { 2 | "secret": "super-secret", 3 | "config": { 4 | "secret_header": "x-secret", 5 | "common_name_header": "x-common-name", 6 | "issuer_header": "x-issuer" 7 | } 8 | } 9 | -------------------------------------------------------------------------------- /tests/integration_tests/Dockerfile: -------------------------------------------------------------------------------- 1 | # This is a workaround since `up` images are now built with scratch, 2 | # meaning we cannot execute other commands. This copies the `up` binary 3 | # and runs the tests in a UBI-minimal-based container. 4 | FROM quay.io/observatorium/up:master-2022-07-13-7f0630b as source 5 | 6 | FROM quay.io/app-sre/ubi8-ubi-minimal 7 | 8 | COPY --from=source /usr/bin/up /usr/bin/up 9 | 10 | RUN microdnf update -y &&\ 11 | microdnf install -y curl jq 12 | 13 | COPY ./tests/integration_tests/runtest.sh /tests/runtest.sh 14 | 15 | WORKDIR /tests 16 | ENTRYPOINT ["/bin/sh", "runtest.sh"] 17 | -------------------------------------------------------------------------------- /tests/integration_tests/README.md: -------------------------------------------------------------------------------- 1 | ## RHOBS post-deploy job 2 | 3 | This directory includes the definition of a post-deploy job that is meant to run after each deployment. 4 | It consists of: 5 | 1) A post-deploy OpenShift job template (`post-deploy-job-template.yaml`) that is leveraged by AppSRE Interface.
The usage is defined in a SaaS file [here](https://gitlab.cee.redhat.com/service/app-interface/-/tree/master/data/services/rhobs/observatorium/cicd/saas-post-deploy-test.yaml). 6 | 2) A Dockerfile to run the test in a container. The tests are based on the `up` Docker image, with a few additions that make it possible to `curl` for a bearer token for the test. The Docker image is built and pushed with the help of the AppSRE Interface integration; see the relevant [Jenkins config file](). 7 | 3) The actual test is specified in the `runtest.sh` script. Currently, this is a bare-bones `up` run: the test writes a couple of metric samples and subsequently reads them back. 8 | 9 | To see the exact template usage, check the [SaaS file definition](https://gitlab.cee.redhat.com/service/app-interface/-/tree/master/data/services/rhobs/observatorium/cicd/saas-post-deploy-test.yaml) in AppSRE Interface. The tests are currently set up to run only in `observatorium-stage`. So far, no automatic deployment promotion has been enabled, as we'll first test and assess how the post-deploy job is functioning in the staging environment. -------------------------------------------------------------------------------- /tests/integration_tests/build_deploy.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -exv 4 | 5 | # We need to find the absolute path since CI runs the script from the repo's root directory. 6 | ABSOLUTE_PATH="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" 7 | 8 | IMAGE="quay.io/app-sre/rhobs-e2e" 9 | IMAGE_TAG=$(git rev-parse --short=7 HEAD) 10 | 11 | docker build -t "${IMAGE}:${IMAGE_TAG}" -f "${ABSOLUTE_PATH}/Dockerfile" . 12 | 13 | if [[ -n "$QUAY_USER" && -n "$QUAY_TOKEN" ]]; then 14 | DOCKER_CONF="$PWD/.docker" 15 | mkdir -p "$DOCKER_CONF" 16 | docker --config="$DOCKER_CONF" login -u="$QUAY_USER" -p="$QUAY_TOKEN" quay.io 17 | docker --config="$DOCKER_CONF" push "${IMAGE}:${IMAGE_TAG}" 18 | fi -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Ignore everything 3 | * 4 | 5 | # But not these files: 6 | !.gitignore 7 | !*.mod 8 | !*.sum 9 | !README.md 10 | !Variables.mk 11 | !variables.env 12 | 13 | *tmp.mod 14 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/README.md: -------------------------------------------------------------------------------- 1 | # Project Development Dependencies. 2 | 3 | This directory stores Go modules with pinned buildable packages that are used within this repository, managed by https://github.com/bwplotka/bingo. 4 | 5 | * Run `bingo get` to install all tools, each of which has its own module file in this directory. 6 | * Run `bingo get <tool>` to install only the tool that has its own module file in this directory. 7 | * For Makefile: make sure to put `include .bingo/Variables.mk` in your Makefile, then use the `$(<TOOL>)` variable, where `<tool>` is the `.bingo/<tool>.mod` file. 8 | * For shell: run `source .bingo/variables.env` to source an environment variable for each tool (see the sketch below). 9 | * For Go: import `.bingo/variables.go` for the variable names. 10 | * See https://github.com/bwplotka/bingo or `bingo -h` on how to add, remove or change binary dependencies.
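A minimal sketch of the shell workflow from the list above, assuming the tools were already installed with `bingo get`; the variable names match the generated `variables.env` shown further below:

```sh
# Source one environment variable per pinned tool
# (GOJSONTOYAML, JSONNET, JSONNET_LINT, JSONNETFMT, ...).
source .bingo/variables.env

# Invoke the pinned binaries via their variables instead of whatever is on $PATH.
"$JSONNET" --version
"$JSONNETFMT" -i example.jsonnet   # example.jsonnet is a hypothetical file
```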
11 | 12 | ## Requirements 13 | 14 | * Go 1.14+ 15 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/go.mod: -------------------------------------------------------------------------------- 1 | module _ // Fake go.mod auto-created by 'bingo' for go -moddir compatibility with non-Go projects. Commit this file, together with other .mod files. -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/gojsontoyaml.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. DO NOT EDIT 2 | 3 | go 1.19 4 | 5 | require github.com/brancz/gojsontoyaml v0.1.0 6 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/gojsontoyaml.sum: -------------------------------------------------------------------------------- 1 | github.com/brancz/gojsontoyaml v0.1.0 h1:SdzR3+BCVOqaI42nFGTeaB7/2DgDM4fhuvRLqxatA8M= 2 | github.com/brancz/gojsontoyaml v0.1.0/go.mod h1:+ycZY94+V11XZBUaDEsbLr3hPNS/ZPrDVKKNUg3Sgvg= 3 | github.com/ghodss/yaml v1.0.0 h1:wQHKEahhL6wmXdzwWG11gIVCkOv05bNOh+Rxn0yngAk= 4 | github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= 5 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 6 | gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY= 7 | gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ= 8 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/jsonnet-lint.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. 
DO NOT EDIT 2 | 3 | go 1.19 4 | 5 | require github.com/google/go-jsonnet v0.20.0 // cmd/jsonnet-lint 6 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/jsonnet-lint.sum: -------------------------------------------------------------------------------- 1 | github.com/fatih/color v1.12.0 h1:mRhaKNwANqRgUBGKmnI5ZxEk7QXmjQeCcuYFMX2bfcc= 2 | github.com/fatih/color v1.12.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= 3 | github.com/google/go-jsonnet v0.20.0 h1:WG4TTSARuV7bSm4PMB4ohjxe33IHT5WVTrJSU33uT4g= 4 | github.com/google/go-jsonnet v0.20.0/go.mod h1:VbgWF9JX7ztlv770x/TolZNGGFfiHEVx9G6ca2eUmeA= 5 | github.com/mattn/go-colorable v0.1.8 h1:c1ghPdyEDarC70ftn0y+A/Ee++9zz8ljHG1b13eJ0s8= 6 | github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= 7 | github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY= 8 | github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 9 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 10 | golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 11 | golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U= 12 | golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 13 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 14 | gopkg.in/yaml.v2 v2.2.7 h1:VUgggvou5XRW9mHwD/yXxIYSMtY0zoKQf/v226p2nyo= 15 | gopkg.in/yaml.v2 v2.2.7/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 16 | sigs.k8s.io/yaml v1.1.0 h1:4A07+ZFc2wgJwo8YNlQpr1rVlgUDlxXHhPJciaPY5gs= 17 | sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= 18 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/jsonnet.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. 
DO NOT EDIT 2 | 3 | go 1.19 4 | 5 | require github.com/google/go-jsonnet v0.20.0 // cmd/jsonnet 6 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/jsonnet.sum: -------------------------------------------------------------------------------- 1 | github.com/fatih/color v1.12.0 h1:mRhaKNwANqRgUBGKmnI5ZxEk7QXmjQeCcuYFMX2bfcc= 2 | github.com/fatih/color v1.12.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= 3 | github.com/google/go-jsonnet v0.20.0 h1:WG4TTSARuV7bSm4PMB4ohjxe33IHT5WVTrJSU33uT4g= 4 | github.com/google/go-jsonnet v0.20.0/go.mod h1:VbgWF9JX7ztlv770x/TolZNGGFfiHEVx9G6ca2eUmeA= 5 | github.com/mattn/go-colorable v0.1.8 h1:c1ghPdyEDarC70ftn0y+A/Ee++9zz8ljHG1b13eJ0s8= 6 | github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= 7 | github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY= 8 | github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 9 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 10 | golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 11 | golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U= 12 | golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 13 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 14 | gopkg.in/yaml.v2 v2.2.7 h1:VUgggvou5XRW9mHwD/yXxIYSMtY0zoKQf/v226p2nyo= 15 | gopkg.in/yaml.v2 v2.2.7/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 16 | sigs.k8s.io/yaml v1.1.0 h1:4A07+ZFc2wgJwo8YNlQpr1rVlgUDlxXHhPJciaPY5gs= 17 | sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= 18 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/jsonnetfmt.mod: -------------------------------------------------------------------------------- 1 | module _ // Auto generated by https://github.com/bwplotka/bingo. 
DO NOT EDIT 2 | 3 | go 1.19 4 | 5 | require github.com/google/go-jsonnet v0.20.0 // cmd/jsonnetfmt 6 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/jsonnetfmt.sum: -------------------------------------------------------------------------------- 1 | github.com/fatih/color v1.12.0 h1:mRhaKNwANqRgUBGKmnI5ZxEk7QXmjQeCcuYFMX2bfcc= 2 | github.com/fatih/color v1.12.0/go.mod h1:ELkj/draVOlAH/xkhN6mQ50Qd0MPOk5AAr3maGEBuJM= 3 | github.com/google/go-jsonnet v0.20.0 h1:WG4TTSARuV7bSm4PMB4ohjxe33IHT5WVTrJSU33uT4g= 4 | github.com/google/go-jsonnet v0.20.0/go.mod h1:VbgWF9JX7ztlv770x/TolZNGGFfiHEVx9G6ca2eUmeA= 5 | github.com/mattn/go-colorable v0.1.8 h1:c1ghPdyEDarC70ftn0y+A/Ee++9zz8ljHG1b13eJ0s8= 6 | github.com/mattn/go-colorable v0.1.8/go.mod h1:u6P/XSegPjTcexA+o6vUJrdnUu04hMope9wVRipJSqc= 7 | github.com/mattn/go-isatty v0.0.12 h1:wuysRhFDzyxgEmMf5xjvJ2M9dZoWAXNNr5LSBS7uHXY= 8 | github.com/mattn/go-isatty v0.0.12/go.mod h1:cbi8OIDigv2wuxKPP5vlRcQ1OAZbq2CE4Kysco4FUpU= 9 | golang.org/x/sys v0.0.0-20200116001909-b77594299b42/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 10 | golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 11 | golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U= 12 | golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 13 | gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= 14 | gopkg.in/yaml.v2 v2.2.7 h1:VUgggvou5XRW9mHwD/yXxIYSMtY0zoKQf/v226p2nyo= 15 | gopkg.in/yaml.v2 v2.2.7/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= 16 | sigs.k8s.io/yaml v1.1.0 h1:4A07+ZFc2wgJwo8YNlQpr1rVlgUDlxXHhPJciaPY5gs= 17 | sigs.k8s.io/yaml v1.1.0/go.mod h1:UJmg0vDUVViEyp3mgSv9WPwZCDxu4rQW1olrI1uml+o= 18 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/.bingo/variables.env: -------------------------------------------------------------------------------- 1 | # Auto generated binary variables helper managed by https://github.com/bwplotka/bingo v0.8. DO NOT EDIT. 2 | # All tools are designed to be build inside $GOBIN. 3 | # Those variables will work only until 'bingo get' was invoked, or if tools were installed via Makefile's Variables.mk. 4 | GOBIN=${GOBIN:=$(go env GOBIN)} 5 | 6 | if [ -z "$GOBIN" ]; then 7 | GOBIN="$(go env GOPATH)/bin" 8 | fi 9 | 10 | 11 | GOJSONTOYAML="${GOBIN}/gojsontoyaml-v0.1.0" 12 | 13 | JSONNET_LINT="${GOBIN}/jsonnet-lint-v0.20.0" 14 | 15 | JSONNET="${GOBIN}/jsonnet-v0.20.0" 16 | 17 | JSONNETFMT="${GOBIN}/jsonnetfmt-v0.20.0" 18 | 19 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM golang:1.20-alpine 2 | RUN apk update && apk add git 3 | 4 | WORKDIR /integration-tests 5 | 6 | COPY go.mod go.sum ./ 7 | RUN go mod download 8 | 9 | COPY . . 
10 | 11 | RUN go build ./cmd/rhobs-test 12 | ENTRYPOINT [ "./rhobs-test" ] 13 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/examples/manifests/dev/test-job.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: v1 2 | items: 3 | - apiVersion: batch/v1 4 | kind: Job 5 | metadata: 6 | labels: 7 | app.kubernetes.io/component: test 8 | app.kubernetes.io/instance: rhobs-test 9 | app.kubernetes.io/name: rhobs-test-job 10 | name: rhobs-test-job 11 | spec: 12 | template: 13 | metadata: 14 | labels: 15 | app.kubernetes.io/component: test 16 | app.kubernetes.io/instance: rhobs-test 17 | app.kubernetes.io/name: rhobs-test-job 18 | spec: 19 | containers: 20 | - args: 21 | - --namespaces=prometheus-example 22 | - --interval=5s 23 | - --timeout=60s 24 | image: localhost:5001/rhobs-test:latest 25 | name: rhobs-test-job 26 | resources: {} 27 | volumeMounts: [] 28 | initContainers: [] 29 | restartPolicy: OnFailure 30 | serviceAccountName: rhobs-test-job 31 | volumes: [] 32 | kind: List 33 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/examples/manifests/openshift/rhobs-test-job-template.yaml: -------------------------------------------------------------------------------- 1 | apiVersion: template.openshift.io/v1 2 | kind: Template 3 | metadata: 4 | name: rhobs-test-job 5 | objects: 6 | - apiVersion: batch/v1 7 | kind: Job 8 | metadata: 9 | labels: 10 | app.kubernetes.io/component: test 11 | app.kubernetes.io/instance: rhobs-test 12 | app.kubernetes.io/name: ${JOB_NAME} 13 | name: ${JOB_NAME} 14 | spec: 15 | template: 16 | metadata: 17 | labels: 18 | app.kubernetes.io/component: test 19 | app.kubernetes.io/instance: rhobs-test 20 | app.kubernetes.io/name: ${JOB_NAME} 21 | spec: 22 | containers: 23 | - args: 24 | - --namespaces=${JOB_NAMESPACES} 25 | - --interval=${JOB_INTERVAL} 26 | - --timeout=${JOB_TIMEOUT} 27 | image: ${JOB_IMAGE}:${JOB_IMAGE_TAG} 28 | name: ${JOB_NAME} 29 | resources: {} 30 | volumeMounts: [] 31 | initContainers: [] 32 | restartPolicy: OnFailure 33 | serviceAccountName: ${SERVICE_ACCOUNT_NAME} 34 | volumes: [] 35 | parameters: 36 | - name: JOB_NAMESPACES 37 | value: observatorium,observatorium-metrics,observatorium-logs,minio,dex,telemeter 38 | - name: JOB_NAME 39 | value: rhobs-test-job 40 | - name: JOB_INTERVAL 41 | value: 10s 42 | - name: JOB_TIMEOUT 43 | value: 1m 44 | - name: JOB_IMAGE 45 | value: quay.io/app-sre/rhobs-test 46 | - name: JOB_IMAGE_TAG 47 | value: latest 48 | - name: SERVICE_ACCOUNT_NAME 49 | value: rhobs-test-job 50 | -------------------------------------------------------------------------------- /tests/integration_tests/framework/integration-test.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rhobs/configuration/6d2b136e48c291c1a415394fed40219188620665/tests/integration_tests/framework/integration-test.png -------------------------------------------------------------------------------- /tests/integration_tests/framework/pkg/client/client.go: -------------------------------------------------------------------------------- 1 | package client 2 | 3 | import ( 4 | "github.com/rhobs/configuration/tests/integration_tests/framework/pkg/logger" 5 | 6 | "k8s.io/client-go/kubernetes" 7 | "k8s.io/client-go/rest" 8 | "k8s.io/client-go/tools/clientcmd" 9 | ) 10 | 11 | func client(kubeconfig string) 
*kubernetes.Clientset { 12 | var ( 13 | config *rest.Config 14 | err error 15 | ) 16 | // If no kubeconfig file specified, then use in-cluster config 17 | if kubeconfig == "" { 18 | config, err = rest.InClusterConfig() 19 | if err != nil { 20 | logger.AppLog.LogFatal("Error getting in-cluster config: %v\n", err) 21 | } 22 | } else { 23 | // If kubeconfig file is specified, then use it 24 | config, err = clientcmd.BuildConfigFromFlags("", kubeconfig) 25 | if err != nil { 26 | logger.AppLog.LogFatal("Error building kubeconfig from file %s: %v\n", kubeconfig, err) 27 | } 28 | } 29 | // Create Kubernetes clientset 30 | clientset, err := kubernetes.NewForConfig(config) 31 | if err != nil { 32 | logger.AppLog.LogFatal("Error creating Kubernetes clientset: %v\n", err) 33 | } 34 | return clientset 35 | } 36 | func GetClient(kubeconfig string) *kubernetes.Clientset { 37 | return client(kubeconfig) 38 | } 39 | --------------------------------------------------------------------------------
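For local debugging, the framework binary can be built and run outside the cluster, in which case `GetClient` falls back to the given kubeconfig instead of the in-cluster config. A minimal sketch, assuming a `--kubeconfig` flag is wired to `GetClient` (that flag is hypothetical here; `--namespaces`, `--interval`, and `--timeout` mirror `examples/manifests/dev/test-job.yaml`):

```sh
# Build the test binary the same way the framework Dockerfile does.
go build ./cmd/rhobs-test

# Run it against a local cluster using a kubeconfig file.
./rhobs-test \
  --kubeconfig="$HOME/.kube/config" \
  --namespaces=prometheus-example \
  --interval=5s \
  --timeout=60s
```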