├── exporters.md ├── runtimes └── jvm.md ├── orchestrators ├── linkerd.yml ├── consul.yml ├── nomad.yml ├── istio.yml ├── etcd.yml └── k8s.yml ├── brokers ├── kafka.md ├── zookeeper.md ├── elasticsearch.md └── rabbitmq.md ├── some ├── promtail.yml ├── thanos.yml ├── ssltls.yml ├── loki.yml ├── vmware.md ├── jenkins.yml └── node-exporter.md ├── storage ├── minio.yml └── ceph.md ├── proxy ├── nginx.md ├── traefik.md └── haproxy.md ├── databases ├── pgbouncer.md ├── mysql.md ├── redis.md ├── mongodb.md ├── cassandra.md └── postgresql.md ├── README.md ├── docker-containers.md └── prometheus.md /exporters.md: -------------------------------------------------------------------------------- 1 | #### [NGINX-to-Prometheus log file exporter](https://github.com/philyuchkoff/prometheus-nginxlog-exporter) 2 | 3 | Helper tool that continuously reads an Nginx log file (or any kind of similar log file) and exports metrics to [Prometheus](https://prometheus.io/). 4 | -------------------------------------------------------------------------------- /runtimes/jvm.md: -------------------------------------------------------------------------------- 1 | ### [Java Client](https://github.com/prometheus/client_java) 2 | 3 | ```` 4 | - name: JVM 5 | rules: 6 | - alert: JvmMemoryFillingUp 7 | expr: (sum by (instance)(jvm_memory_used_bytes{area="heap"}) / sum by (instance)(jvm_memory_max_bytes{area="heap"})) * 100 > 80 8 | for: 2m 9 | labels: 10 | severity: warning 11 | annotations: 12 | summary: "JVM memory filling up (instance {{ $labels.instance }})" 13 | description: "JVM memory is filling up (> 80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 14 | ```` 15 | -------------------------------------------------------------------------------- /orchestrators/linkerd.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | 3 | - name: EmbeddedExporter 4 | 5 | rules: 6 | 7 | - alert: LinkerdHighErrorRate 8 | expr: 
'sum(rate(request_errors_total[1m])) by (deployment, statefulset, daemonset) / sum(rate(request_total[1m])) by (deployment, statefulset, daemonset) * 100 > 10' 9 | for: 1m 10 | labels: 11 | severity: warning 12 | annotations: 13 | summary: Linkerd high error rate (instance {{ $labels.instance }}) 14 | description: "Linkerd error rate for {{ or $labels.deployment $labels.statefulset $labels.daemonset }} is over 10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 15 | -------------------------------------------------------------------------------- /brokers/kafka.md: -------------------------------------------------------------------------------- 1 | # Kafka [Kafka Exporter](https://github.com/danielqsj/kafka_exporter) 2 | 3 | ## Kafka topics replicas 4 | ##### Kafka topic has fewer than 3 in-sync replicas 5 | 6 | ```` 7 | - alert: KafkaTopicsReplicas 8 | expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3 9 | for: 5m 10 | labels: 11 | severity: critical 12 | annotations: 13 | summary: "Kafka topics replicas (instance {{ $labels.instance }})" 14 | description: "Kafka topic has fewer than 3 in-sync replicas\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 15 | ```` 16 | ## Kafka consumers group 17 | ##### Kafka consumer group lag is too high (> 50) 18 | 19 | ```` 20 | - alert: KafkaConsumersGroup 21 | expr: sum(kafka_consumergroup_lag) by (consumergroup) > 50 22 | for: 5m 23 | labels: 24 | severity: critical 25 | annotations: 26 | summary: "Kafka consumers group (instance {{ $labels.instance }})" 27 | description: "Kafka consumer group lag is too high (> 50)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 28 | ```` 29 | -------------------------------------------------------------------------------- /some/promtail.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | 3 | - name: EmbeddedExporter 4 | 5 | rules: 6 | 7 | - alert: PromtailRequestErrors 8 | expr: '100 * sum(rate(promtail_request_duration_seconds_count{status_code=~"5..|failed"}[1m])) by (namespace, job, route, instance) / 
sum(rate(promtail_request_duration_seconds_count[1m])) by (namespace, job, route, instance) > 10' 9 | for: 5m 10 | labels: 11 | severity: critical 12 | annotations: 13 | summary: Promtail request errors (instance {{ $labels.instance }}) 14 | description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}% errors.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 15 | 16 | - alert: PromtailRequestLatency 17 | expr: 'histogram_quantile(0.99, sum(rate(promtail_request_duration_seconds_bucket[5m])) by (le)) > 1' 18 | for: 5m 19 | labels: 20 | severity: critical 21 | annotations: 22 | summary: Promtail request latency (instance {{ $labels.instance }}) 23 | description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 24 | -------------------------------------------------------------------------------- /storage/minio.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | 3 | - name: EmbeddedExporter 4 | 5 | rules: 6 | 7 | - alert: MinioClusterDiskOffline 8 | expr: 'minio_cluster_disk_offline_total > 0' 9 | for: 0m 10 | labels: 11 | severity: critical 12 | annotations: 13 | summary: Minio cluster disk offline (instance {{ $labels.instance }}) 14 | description: "Minio cluster disk is offline\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 15 | 16 | - alert: MinioNodeOffline 17 | expr: 'minio_cluster_nodes_offline_total > 0' 18 | for: 0m 19 | labels: 20 | severity: critical 21 | annotations: 22 | summary: Minio node offline (instance {{ $labels.instance }}) 23 | description: "A Minio cluster node is offline\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 24 | 25 | - alert: MinioDiskSpaceUsage 26 | expr: 'disk_storage_available / disk_storage_total * 100 < 10' 27 | for: 0m 28 | labels: 29 | severity: warning 30 | annotations: 31 | summary: Minio disk space usage 
(instance {{ $labels.instance }}) 32 | description: "Minio available free space is low (< 10%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 33 | -------------------------------------------------------------------------------- /some/thanos.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | 3 | - name: EmbeddedExporter 4 | 5 | rules: 6 | 7 | - alert: ThanosCompactionHalted 8 | expr: 'thanos_compact_halted == 1' 9 | for: 0m 10 | labels: 11 | severity: critical 12 | annotations: 13 | summary: Thanos compaction halted (instance {{ $labels.instance }}) 14 | description: "Thanos compaction has failed to run and is now halted.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 15 | 16 | - alert: ThanosCompactBucketOperationFailure 17 | expr: 'rate(thanos_objstore_bucket_operation_failures_total[1m]) > 0' 18 | for: 0m 19 | labels: 20 | severity: critical 21 | annotations: 22 | summary: Thanos compact bucket operation failure (instance {{ $labels.instance }}) 23 | description: "Thanos compaction has failing storage operations\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 24 | 25 | - alert: ThanosCompactNotRun 26 | expr: '(time() - thanos_objstore_bucket_last_successful_upload_time) > 24*60*60' 27 | for: 0m 28 | labels: 29 | severity: critical 30 | annotations: 31 | summary: Thanos compact not run (instance {{ $labels.instance }}) 32 | description: "Thanos compaction has not run in 24 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 33 | -------------------------------------------------------------------------------- /orchestrators/consul.yml: -------------------------------------------------------------------------------- 1 | # prometheus/consul_exporter: https://github.com/prometheus/consul_exporter 2 | 3 | groups: 4 | 5 | - name: ConsulExporter 6 | 7 | rules: 8 | 9 | - alert: ConsulServiceHealthcheckFailed 10 | expr: 'consul_catalog_service_node_healthy == 0' 11 | for: 1m 12 | labels: 13 | severity: critical 14 
| annotations: 15 | summary: Consul service healthcheck failed (instance {{ $labels.instance }}) 16 | description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 17 | 18 | - alert: ConsulMissingMasterNode 19 | expr: 'consul_raft_peers < 3' 20 | for: 0m 21 | labels: 22 | severity: critical 23 | annotations: 24 | summary: Consul missing master node (instance {{ $labels.instance }}) 25 | description: "The number of Consul raft peers should be 3 in order to preserve quorum.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 26 | 27 | - alert: ConsulAgentUnhealthy 28 | expr: 'consul_health_node_status{status="critical"} == 1' 29 | for: 0m 30 | labels: 31 | severity: critical 32 | annotations: 33 | summary: Consul agent unhealthy (instance {{ $labels.instance }}) 34 | description: "A Consul agent is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 35 | -------------------------------------------------------------------------------- /orchestrators/nomad.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | 3 | - name: EmbeddedExporter 4 | 5 | rules: 6 | 7 | - alert: NomadJobFailed 8 | expr: 'nomad_nomad_job_summary_failed > 0' 9 | for: 0m 10 | labels: 11 | severity: warning 12 | annotations: 13 | summary: Nomad job failed (instance {{ $labels.instance }}) 14 | description: "Nomad job failed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 15 | 16 | - alert: NomadJobLost 17 | expr: 'nomad_nomad_job_summary_lost > 0' 18 | for: 0m 19 | labels: 20 | severity: warning 21 | annotations: 22 | summary: Nomad job lost (instance {{ $labels.instance }}) 23 | description: "Nomad job lost\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 24 | 25 | - alert: NomadJobQueued 26 | expr: 'nomad_nomad_job_summary_queued > 0' 27 | for: 2m 28 | labels: 29 | severity: warning 30 | annotations: 31 | summary: Nomad job queued (instance {{ $labels.instance }}) 32 | 
description: "Nomad job queued\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 33 | 34 | - alert: NomadBlockedEvaluation 35 | expr: 'nomad_nomad_blocked_evals_total_blocked > 0' 36 | for: 0m 37 | labels: 38 | severity: warning 39 | annotations: 40 | summary: Nomad blocked evaluation (instance {{ $labels.instance }}) 41 | description: "Nomad blocked evaluation\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 42 | -------------------------------------------------------------------------------- /brokers/zookeeper.md: -------------------------------------------------------------------------------- 1 | ## Zookeeper 2 | 3 | ### [ZooKeeper Exporter](https://github.com/dabealu/zookeeper-exporter) 4 | 5 | ```` 6 | - name: Zookeeper 7 | rules: 8 | - alert: ZookeeperDown 9 | expr: zk_up == 0 10 | for: 0m 11 | labels: 12 | severity: critical 13 | annotations: 14 | summary: "Zookeeper Down (instance {{ $labels.instance }})" 15 | description: "Zookeeper down on instance {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 16 | 17 | - alert: ZookeeperMissingLeader 18 | expr: sum(zk_server_leader) == 0 19 | for: 0m 20 | labels: 21 | severity: critical 22 | annotations: 23 | summary: "Zookeeper missing leader (instance {{ $labels.instance }})" 24 | description: "Zookeeper cluster has no node marked as leader\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 25 | 26 | - alert: ZookeeperTooManyLeaders 27 | expr: sum(zk_server_leader) > 1 28 | for: 0m 29 | labels: 30 | severity: critical 31 | annotations: 32 | summary: "Zookeeper Too Many Leaders (instance {{ $labels.instance }})" 33 | description: "Zookeeper cluster has too many nodes marked as leader\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 34 | 35 | - alert: ZookeeperNotOk 36 | expr: zk_ruok == 0 37 | for: 3m 38 | labels: 39 | severity: warning 40 | annotations: 41 | summary: "Zookeeper Not Ok (instance {{ $labels.instance }})" 42 | description: "Zookeeper instance is not ok\n VALUE = {{ $value }}\n LABELS: 
{{ $labels }}" 43 | ```` 44 | -------------------------------------------------------------------------------- /some/ssltls.yml: -------------------------------------------------------------------------------- 1 | # exporter: https://github.com/ribbybibby/ssl_exporter 2 | 3 | groups: 4 | 5 | - name: RibbybibbySslExporter 6 | 7 | rules: 8 | 9 | - alert: SslCertificateProbeFailed 10 | expr: 'ssl_probe_success == 0' 11 | for: 0m 12 | labels: 13 | severity: critical 14 | annotations: 15 | summary: SSL certificate probe failed (instance {{ $labels.instance }}) 16 | description: "Failed to fetch SSL information {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 17 | 18 | - alert: SslCertificateOcspStatusUnknown 19 | expr: 'ssl_ocsp_response_status == 2' 20 | for: 0m 21 | labels: 22 | severity: warning 23 | annotations: 24 | summary: SSL certificate OCSP status unknown (instance {{ $labels.instance }}) 25 | description: "Failed to get the OCSP status {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 26 | 27 | - alert: SslCertificateRevoked 28 | expr: 'ssl_ocsp_response_status == 1' 29 | for: 0m 30 | labels: 31 | severity: critical 32 | annotations: 33 | summary: SSL certificate revoked (instance {{ $labels.instance }}) 34 | description: "SSL certificate revoked {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 35 | 36 | - alert: SslCertificateExpiry 37 | expr: 'ssl_verified_cert_not_after{chain_no="0"} - time() < 86400 * 7' 38 | for: 0m 39 | labels: 40 | severity: warning 41 | annotations: 42 | summary: SSL certificate expiry (< 7 days) (instance {{ $labels.instance }}) 43 | description: "{{ $labels.instance }} certificate expires within 7 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 44 | -------------------------------------------------------------------------------- /proxy/nginx.md: -------------------------------------------------------------------------------- 1 | ## Nginx 2 | 3 | ### 
[Prometheus metric library for Nginx](https://github.com/knyar/nginx-lua-prometheus) 4 | 5 | - #### Nginx high HTTP 4xx error rate 6 | 7 | ##### Too many HTTP requests with status 4xx (> 5%) 8 | 9 | ```yaml 10 | - alert: NginxHighHttp4xxErrorRate 11 | expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 12 | for: 5m 13 | labels: 14 | severity: critical 15 | annotations: 16 | summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }}) 17 | description: Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }} 18 | ``` 19 | 20 | - #### Nginx high HTTP 5xx error rate 21 | 22 | ##### Too many HTTP requests with status 5xx (> 5%) 23 | 24 | ```yaml 25 | - alert: NginxHighHttp5xxErrorRate 26 | expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 27 | for: 5m 28 | labels: 29 | severity: critical 30 | annotations: 31 | summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }}) 32 | description: Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }} 33 | ``` 34 | 35 | - #### Nginx latency high 36 | 37 | ##### Nginx p99 latency is higher than 10 seconds 38 | 39 | ```yaml 40 | - alert: NginxLatencyHigh 41 | expr: histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[30m])) by (host, node)) > 10 42 | for: 5m 43 | labels: 44 | severity: warning 45 | annotations: 46 | summary: Nginx latency high (instance {{ $labels.instance }}) 47 | description: Nginx p99 latency is higher than 10 seconds\n VALUE = {{ $value }}\n LABELS: {{ $labels }} 48 | ``` 49 | -------------------------------------------------------------------------------- /databases/pgbouncer.md: -------------------------------------------------------------------------------- 1 | # PGBouncer [Prometheus exporter for 
PgBouncer](https://github.com/spreaker/prometheus-pgbouncer-exporter) 2 | 3 | ## PGBouncer active connections 4 | ##### PGBouncer pools are filling up 5 | 6 | ```` 7 | - alert: PgbouncerActiveConnections 8 | expr: pgbouncer_pools_server_active_connections > 200 9 | for: 5m 10 | labels: 11 | severity: warning 12 | annotations: 13 | summary: "PGBouncer active connections (instance {{ $labels.instance }})" 14 | description: "PGBouncer pools are filling up\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 15 | ```` 16 | 17 | ## PGBouncer errors 18 | ##### PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console. 19 | 20 | ```` 21 | - alert: PgbouncerErrors 22 | expr: increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[5m]) > 10 23 | for: 5m 24 | labels: 25 | severity: warning 26 | annotations: 27 | summary: "PGBouncer errors (instance {{ $labels.instance }})" 28 | description: "PGBouncer is logging errors. This may be due to a server restart or an admin typing commands at the pgbouncer console.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 29 | ```` 30 | 31 | ## PGBouncer max connections 32 | ##### The number of PGBouncer client connections has reached max_client_conn. 
33 | 34 | ```` 35 | - alert: PgbouncerMaxConnections 36 | expr: rate(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0 37 | for: 5m 38 | labels: 39 | severity: critical 40 | annotations: 41 | summary: "PGBouncer max connections (instance {{ $labels.instance }})" 42 | description: "The number of PGBouncer client connections has reached max_client_conn.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 43 | ```` 44 | -------------------------------------------------------------------------------- /proxy/traefik.md: -------------------------------------------------------------------------------- 1 | ## Traefik 2 | 3 | ### [Traefik Exporter](https://docs.traefik.io/observability/metrics/prometheus/) 4 | 5 | - #### Traefik backend down 6 | 7 | ##### All Traefik backends are down 8 | 9 | ```yaml 10 | - alert: TraefikBackendDown 11 | expr: count(traefik_backend_server_up) by (backend) == 0 12 | for: 5m 13 | labels: 14 | severity: critical 15 | annotations: 16 | summary: "Traefik backend down (instance {{ $labels.instance }})" 17 | description: "All Traefik backends are down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 18 | ``` 19 | 20 | 21 | 22 | - #### Traefik high HTTP 4xx error rate backend 23 | 24 | ##### Traefik backend 4xx error rate is above 5% 25 | 26 | ```yaml 27 | - alert: TraefikHighHttp4xxErrorRateBackend 28 | expr: sum(rate(traefik_backend_requests_total{code=~"4.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 29 | for: 5m 30 | labels: 31 | severity: critical 32 | annotations: 33 | summary: "Traefik high HTTP 4xx error rate backend (instance {{ $labels.instance }})" 34 | description: "Traefik backend 4xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 35 | ``` 36 | 37 | 38 | 39 | - #### Traefik high HTTP 5xx error rate backend 40 | 41 | ##### Traefik backend 5xx error rate is above 5% 42 | 43 | ```yaml 44 | - alert: TraefikHighHttp5xxErrorRateBackend 45 
| expr: sum(rate(traefik_backend_requests_total{code=~"5.*"}[3m])) by (backend) / sum(rate(traefik_backend_requests_total[3m])) by (backend) * 100 > 5 46 | for: 5m 47 | labels: 48 | severity: critical 49 | annotations: 50 | summary: "Traefik high HTTP 5xx error rate backend (instance {{ $labels.instance }})" 51 | description: "Traefik backend 5xx error rate is above 5%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 52 | ``` 53 | -------------------------------------------------------------------------------- /some/loki.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | 3 | - name: EmbeddedExporter 4 | 5 | rules: 6 | 7 | - alert: LokiProcessTooManyRestarts 8 | expr: 'changes(process_start_time_seconds{job=~"loki"}[15m]) > 2' 9 | for: 0m 10 | labels: 11 | severity: warning 12 | annotations: 13 | summary: Loki process too many restarts (instance {{ $labels.instance }}) 14 | description: "A loki process had too many restarts (target {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 15 | 16 | - alert: LokiRequestErrors 17 | expr: '100 * sum(rate(loki_request_duration_seconds_count{status_code=~"5.."}[1m])) by (namespace, job, route) / sum(rate(loki_request_duration_seconds_count[1m])) by (namespace, job, route) > 10' 18 | for: 15m 19 | labels: 20 | severity: critical 21 | annotations: 22 | summary: Loki request errors (instance {{ $labels.instance }}) 23 | description: "The {{ $labels.job }} and {{ $labels.route }} are experiencing errors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 24 | 25 | - alert: LokiRequestPanic 26 | expr: 'sum(increase(loki_panic_total[10m])) by (namespace, job) > 0' 27 | for: 5m 28 | labels: 29 | severity: critical 30 | annotations: 31 | summary: Loki request panic (instance {{ $labels.instance }}) 32 | description: "The {{ $labels.job }} is experiencing {{ printf \"%.2f\" $value }}% increase of panics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 33 | 34 | - 
alert: LokiRequestLatency 35 | expr: '(histogram_quantile(0.99, sum(rate(loki_request_duration_seconds_bucket{route!~"(?i).*tail.*"}[5m])) by (le))) > 1' 36 | for: 5m 37 | labels: 38 | severity: critical 39 | annotations: 40 | summary: Loki request latency (instance {{ $labels.instance }}) 41 | description: "The {{ $labels.job }} {{ $labels.route }} is experiencing {{ printf \"%.2f\" $value }}s 99th percentile latency\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 42 | -------------------------------------------------------------------------------- /some/vmware.md: -------------------------------------------------------------------------------- 1 | # VMware : [pryorda/vmware_exporter](https://github.com/pryorda/vmware_exporter) 2 | 3 | ## Virtual Machine Memory Warning 4 | #### High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}% 5 | ```` 6 | - alert: VirtualMachineMemoryWarning 7 | expr: vmware_vm_mem_usage_average / 100 >= 80 and vmware_vm_mem_usage_average / 100 < 90 8 | for: 5m 9 | labels: 10 | severity: warning 11 | annotations: 12 | summary: Virtual Machine Memory Warning (instance {{ $labels.instance }}) 13 | description: "High memory usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 14 | ```` 15 | 16 | ## Virtual Machine Memory Critical 17 | #### High memory usage on {{ $labels.instance }}: {{ $value | printf "%.2f"}}% 18 | ```` 19 | - alert: VirtualMachineMemoryCritical 20 | expr: vmware_vm_mem_usage_average / 100 >= 90 21 | for: 1m 22 | labels: 23 | severity: critical 24 | annotations: 25 | summary: Virtual Machine Memory Critical (instance {{ $labels.instance }}) 26 | description: "High memory usage on {{ $labels.instance }}: {{ $value | printf \"%.2f\"}}%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 27 | ```` 28 | 29 | ## High Number of Snapshots 30 | #### High snapshots number on {{ $labels.instance }}: {{ $value }} 31 | ```` 32 | - alert: HighNumberOfSnapshots 33 
| expr: vmware_vm_snapshots > 3 34 | for: 30m 35 | labels: 36 | severity: warning 37 | annotations: 38 | summary: High Number of Snapshots (instance {{ $labels.instance }}) 39 | description: "High snapshots number on {{ $labels.instance }}: {{ $value }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 40 | ```` 41 | 42 | ## Outdated Snapshots 43 | #### Outdated snapshots on {{ $labels.instance }}: {{ $value | printf "%.0f"}} days 44 | ```` 45 | - alert: OutdatedSnapshots 46 | expr: (time() - vmware_vm_snapshot_timestamp_seconds) / (60 * 60 * 24) >= 3 47 | for: 5m 48 | labels: 49 | severity: warning 50 | annotations: 51 | summary: Outdated Snapshots (instance {{ $labels.instance }}) 52 | description: "Outdated snapshots on {{ $labels.instance }}: {{ $value | printf \"%.0f\"}} days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 53 | ```` 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | - [Exporters which I use always](https://github.com/philyuchkoff/prometheus-alerts/blob/master/exporters.md) 3 | - [Prometheus documentation](https://prometheus.io/docs/introduction/overview/) 4 | 5 | ### Prometheus 6 | [Prometheus self-monitoring](https://github.com/philyuchkoff/prometheus-alerts/blob/master/prometheus.md) 7 | 8 | ### Docker Containers 9 | [cAdvisor](https://github.com/philyuchkoff/prometheus-alerts/blob/master/docker-containers.md) 10 | 11 | ### Databases 12 | 13 | - VictoriaMetrics 14 | - [MySQL](https://github.com/philyuchkoff/prometheus-alerts/blob/master/databases/mysql.md) 15 | - [PostgreSQL](https://github.com/philyuchkoff/prometheus-alerts/blob/master/databases/postgresql.md) 16 | - [PGBouncer](https://github.com/philyuchkoff/prometheus-alerts/blob/master/databases/pgbouncer.md) 17 | - [Redis](https://github.com/philyuchkoff/prometheus-alerts/blob/master/databases/redis.md) 18 | - 
[MongoDB](https://github.com/philyuchkoff/prometheus-alerts/blob/master/databases/mongodb.md) 19 | - [Cassandra](https://github.com/philyuchkoff/prometheus-alerts/blob/master/databases/cassandra.md) 20 | 21 | ### Brokers 22 | 23 | - [RabbitMQ](https://github.com/philyuchkoff/prometheus-alerts/blob/master/brokers/rabbitmq.md) 24 | - [Zookeeper](https://github.com/philyuchkoff/prometheus-alerts/blob/master/brokers/zookeeper.md) 25 | - [Kafka](https://github.com/philyuchkoff/prometheus-alerts/blob/master/brokers/kafka.md) 26 | - [Elasticsearch](https://github.com/philyuchkoff/prometheus-alerts/blob/master/brokers/elasticsearch.md) 27 | 28 | ### Proxies and load balancers 29 | 30 | - [Nginx](https://github.com/philyuchkoff/prometheus-alerts/blob/master/proxy/nginx.md) 31 | - [HAProxy v.2](https://github.com/philyuchkoff/prometheus-alerts/blob/master/proxy/haproxy.md) 32 | - [Traefik](https://github.com/philyuchkoff/prometheus-alerts/blob/master/proxy/traefik.md) 33 | 34 | ### Runtimes 35 | - PHP-FPM 36 | - [JVM](https://github.com/philyuchkoff/prometheus-alerts/blob/master/runtimes/jvm.md) 37 | - Sidekiq 38 | 39 | ### Orchestrators 40 | - [Kubernetes](orchestrators/k8s.yml) 41 | - [Nomad](orchestrators/nomad.yml) 42 | - [Consul](orchestrators/consul.yml) 43 | - [Etcd](orchestrators/etcd.yml) 44 | - [Linkerd](orchestrators/linkerd.yml) 45 | - [Istio](orchestrators/istio.yml) 46 | 47 | ### Network and storage 48 | - Ceph 49 | - ZFS 50 | - OpenEBS 51 | - [MinIO](storage/minio.yml) 52 | - Juniper 53 | - CoreDNS 54 | 55 | ### Some 56 | 57 | - [Thanos](some/thanos.yml) 58 | - [Loki](some/loki.yml) 59 | - [Promtail](some/promtail.yml) 60 | - [Jenkins](some/jenkins.yml) 61 | - [VMware](some/vmware.md) 62 | - [Node Exporter](https://github.com/philyuchkoff/prometheus-alerts/blob/master/some/node-exporter.md) 63 | - [SSL/TLS](some/ssltls.yml) 64 | -------------------------------------------------------------------------------- /some/jenkins.yml: 
-------------------------------------------------------------------------------- 1 | # https://plugins.jenkins.io/prometheus/ 2 | 3 | groups: 4 | 5 | - name: MetricPlugin 6 | 7 | rules: 8 | 9 | - alert: JenkinsOffline 10 | expr: 'jenkins_node_offline_value > 1' 11 | for: 0m 12 | labels: 13 | severity: critical 14 | annotations: 15 | summary: Jenkins offline (instance {{ $labels.instance }}) 16 | description: "Jenkins offline: `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 17 | 18 | - alert: JenkinsHealthcheck 19 | expr: 'jenkins_health_check_score < 1' 20 | for: 0m 21 | labels: 22 | severity: critical 23 | annotations: 24 | summary: Jenkins healthcheck (instance {{ $labels.instance }}) 25 | description: "Jenkins healthcheck score: {{$value}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 26 | 27 | - alert: JenkinsOutdatedPlugins 28 | expr: 'sum(jenkins_plugins_withUpdate) by (instance) > 3' 29 | for: 1d 30 | labels: 31 | severity: warning 32 | annotations: 33 | summary: Jenkins outdated plugins (instance {{ $labels.instance }}) 34 | description: "{{ $value }} plugins need update\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 35 | 36 | - alert: JenkinsBuildsHealthScore 37 | expr: 'default_jenkins_builds_health_score < 1' 38 | for: 0m 39 | labels: 40 | severity: critical 41 | annotations: 42 | summary: Jenkins builds health score (instance {{ $labels.instance }}) 43 | description: "Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 44 | 45 | - alert: JenkinsRunFailureTotal 46 | expr: 'delta(jenkins_runs_failure_total[1h]) > 100' 47 | for: 0m 48 | labels: 49 | severity: warning 50 | annotations: 51 | summary: Jenkins run failure total (instance {{ 
$labels.instance }}) 52 | description: "Job run failures: ({{$value}}) {{$labels.jenkins_job}}. Healthcheck failure for `{{$labels.instance}}` in realm {{$labels.realm}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 53 | 54 | - alert: JenkinsBuildTestsFailing 55 | expr: 'default_jenkins_builds_last_build_tests_failing > 0' 56 | for: 0m 57 | labels: 58 | severity: warning 59 | annotations: 60 | summary: Jenkins build tests failing (instance {{ $labels.instance }}) 61 | description: "Last build tests failed: {{$labels.jenkins_job}}. Failed build Tests for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 62 | 63 | - alert: JenkinsLastBuildFailed 64 | expr: 'default_jenkins_builds_last_build_result_ordinal == 2' 65 | for: 0m 66 | labels: 67 | severity: warning 68 | annotations: 69 | summary: Jenkins last build failed (instance {{ $labels.instance }}) 70 | description: "Last build failed: {{$labels.jenkins_job}}. 
Failed build for job `{{$labels.jenkins_job}}` on {{$labels.instance}}/{{$labels.env}} ({{$labels.region}})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 71 | -------------------------------------------------------------------------------- /docker-containers.md: -------------------------------------------------------------------------------- 1 | # [google/cAdvisor](https://github.com/google/cadvisor) 2 | 3 | ```` 4 | - name: Containers 5 | rules: 6 | ```` 7 | 8 | ## Container killed 9 | #### A container has disappeared 10 | 11 | ```` 12 | # In a dynamic infrastructure where containers are frequently started, stopped, or restarted, this alert can produce a lot of false positives 13 | - alert: ContainerKilled 14 | expr: time() - container_last_seen > 60 15 | for: 0m 16 | labels: 17 | severity: warning 18 | annotations: 19 | summary: Container killed (instance {{ $labels.instance }}) 20 | description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 21 | ```` 22 | 23 | ## Container absent 24 | #### A container is absent for 5 min 25 | ```` 26 | # In a dynamic infrastructure where containers are frequently started, stopped, or restarted, this alert can produce a lot of false positives 27 | - alert: ContainerAbsent 28 | expr: absent(container_last_seen) 29 | for: 5m 30 | labels: 31 | severity: warning 32 | annotations: 33 | summary: Container absent (instance {{ $labels.instance }}) 34 | description: "A container is absent for 5 min\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 35 | ```` 36 | ## Container CPU usage 37 | #### Container CPU usage is above 80% 38 | ```` 39 | # cAdvisor itself sometimes consumes a lot of CPU, so this alert can fire frequently. 
40 | # To keep the alert from firing for that reason, exclude the series with an empty name: container_cpu_usage_seconds_total{name!=""} 41 | - alert: ContainerCpuUsage 42 | expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80 43 | for: 2m 44 | labels: 45 | severity: warning 46 | annotations: 47 | summary: Container CPU usage (instance {{ $labels.instance }}) 48 | description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 49 | ```` 50 | ## Container Memory usage 51 | #### Container Memory usage is above 80% 52 | ```` 53 | # Read "How much is too much? The Linux OOMKiller and “used” memory" 54 | # https://medium.com/faun/how-much-is-too-much-the-linux-oomkiller-and-used-memory-d32186f29c9d 55 | - alert: ContainerMemoryUsage 56 | expr: (sum(container_memory_working_set_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80 57 | for: 2m 58 | labels: 59 | severity: warning 60 | annotations: 61 | summary: Container Memory usage (instance {{ $labels.instance }}) 62 | description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 63 | ```` 64 | ## Container Volume usage 65 | #### Container Volume usage is above 80% 66 | ```` 67 | - alert: ContainerVolumeUsage 68 | expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80 69 | for: 2m 70 | labels: 71 | severity: warning 72 | annotations: 73 | summary: Container Volume usage (instance {{ $labels.instance }}) 74 | description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 75 | ```` 76 | ## Container Volume IO usage 77 | #### Container Volume IO usage is above 80% 78 | ```` 79 | - alert: ContainerVolumeIoUsage 80 | expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80 81 | for: 2m 82 | labels: 83 | severity: warning 84 | annotations: 85 | summary: 
Container Volume IO usage (instance {{ $labels.instance }}) 86 | description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 87 | ```` 88 | 89 | ## Container high throttle rate 90 | #### Container is being throttled 91 | ```` 92 | # When a container exceeds its CPU limit, throttling kicks in and its CPU quota is cut. Read more about this, for example: 93 | # https://habr.com/ru/company/flant/blog/489668/ 94 | # https://www.itsumma.ru/knowledges/blog/CPUlimits 95 | # etc. 96 | - alert: ContainerHighThrottleRate 97 | expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1 98 | for: 2m 99 | labels: 100 | severity: warning 101 | annotations: 102 | summary: "Container high throttle rate (instance {{ $labels.instance }})" 103 | description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 104 | ```` 105 | -------------------------------------------------------------------------------- /orchestrators/istio.yml: -------------------------------------------------------------------------------- 1 | # Exporter: https://istio.io/latest/docs/tasks/observability/metrics/querying-metrics/ 2 | 3 | groups: 4 | 5 | - name: EmbeddedExporter 6 | 7 | rules: 8 | 9 | - alert: IstioKubernetesGatewayAvailabilityDrop 10 | expr: 'min(kube_deployment_status_replicas_available{deployment="istio-ingressgateway", namespace="istio-system"}) without (instance, pod) < 2' 11 | for: 1m 12 | labels: 13 | severity: warning 14 | annotations: 15 | summary: Istio Kubernetes gateway availability drop (instance {{ $labels.instance }}) 16 | description: "Gateway pods have dropped. 
Inbound traffic will likely be affected.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 17 | 18 | - alert: IstioPilotHighTotalRequestRate 19 | expr: 'sum(rate(pilot_xds_push_errors[1m])) / sum(rate(pilot_xds_pushes[1m])) * 100 > 5' 20 | for: 1m 21 | labels: 22 | severity: warning 23 | annotations: 24 | summary: Istio Pilot high total request rate (instance {{ $labels.instance }}) 25 | description: "Number of Istio Pilot push errors is too high (> 5%). Envoy sidecars might have outdated configuration.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 26 | 27 | - alert: IstioMixerPrometheusDispatchesLow 28 | expr: 'sum(rate(mixer_runtime_dispatches_total{adapter=~"prometheus"}[1m])) < 180' 29 | for: 1m 30 | labels: 31 | severity: warning 32 | annotations: 33 | summary: Istio Mixer Prometheus dispatches low (instance {{ $labels.instance }}) 34 | description: "Number of Mixer dispatches to Prometheus is too low. Istio metrics might not be exported properly.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 35 | 36 | - alert: IstioHighTotalRequestRate 37 | expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) > 1000' 38 | for: 2m 39 | labels: 40 | severity: warning 41 | annotations: 42 | summary: Istio high total request rate (instance {{ $labels.instance }}) 43 | description: "Global request rate in the service mesh is unusually high.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 44 | 45 | - alert: IstioLowTotalRequestRate 46 | expr: 'sum(rate(istio_requests_total{reporter="destination"}[5m])) < 100' 47 | for: 2m 48 | labels: 49 | severity: warning 50 | annotations: 51 | summary: Istio low total request rate (instance {{ $labels.instance }}) 52 | description: "Global request rate in the service mesh is unusually low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 53 | 54 | - alert: IstioHigh4xxErrorRate 55 | expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"4.*"}[5m])) / 
sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5' 56 | for: 1m 57 | labels: 58 | severity: warning 59 | annotations: 60 | summary: Istio high 4xx error rate (instance {{ $labels.instance }}) 61 | description: "High percentage of HTTP 4xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 62 | 63 | - alert: IstioHigh5xxErrorRate 64 | expr: 'sum(rate(istio_requests_total{reporter="destination", response_code=~"5.*"}[5m])) / sum(rate(istio_requests_total{reporter="destination"}[5m])) * 100 > 5' 65 | for: 1m 66 | labels: 67 | severity: warning 68 | annotations: 69 | summary: Istio high 5xx error rate (instance {{ $labels.instance }}) 70 | description: "High percentage of HTTP 5xx responses in Istio (> 5%).\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 71 | 72 | - alert: IstioHighRequestLatency 73 | expr: 'rate(istio_request_duration_milliseconds_sum{reporter="destination"}[1m]) / rate(istio_request_duration_milliseconds_count{reporter="destination"}[1m]) > 100' 74 | for: 1m 75 | labels: 76 | severity: warning 77 | annotations: 78 | summary: Istio high request latency (instance {{ $labels.instance }}) 79 | description: "Istio average request duration is over 100ms.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 80 | 81 | - alert: IstioLatency99Percentile 82 | expr: 'histogram_quantile(0.99, sum(rate(istio_request_duration_milliseconds_bucket[1m])) by (destination_canonical_service, destination_workload_namespace, source_canonical_service, source_workload_namespace, le)) > 1000' 83 | for: 1m 84 | labels: 85 | severity: warning 86 | annotations: 87 | summary: Istio latency 99 percentile (instance {{ $labels.instance }}) 88 | description: "The slowest 1% of Istio requests take longer than 1s.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 89 | 90 | - alert: IstioPilotDuplicateEntry 91 | expr: 'sum(rate(pilot_duplicate_envoy_clusters{}[5m])) > 0' 92 | for: 0m 93 | labels: 94 | severity: critical 95 | annotations: 96 | 
summary: Istio Pilot Duplicate Entry (instance {{ $labels.instance }}) 97 | description: "Istio pilot duplicate entry error.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 98 | -------------------------------------------------------------------------------- /databases/mysql.md: -------------------------------------------------------------------------------- 1 | ## MySQL 2 | 3 | ### [Prometheus MySQL exporter](https://github.com/prometheus/mysqld_exporter) 4 | 5 | - #### MySQL down 6 | ##### MySQL instance is down on {{ $labels.instance }} 7 | 8 | ```yaml 9 | - alert: MysqlDown 10 | expr: mysql_up == 0 11 | for: 5m 12 | labels: 13 | severity: critical 14 | annotations: 15 | summary: "MySQL down (instance {{ $labels.instance }})" 16 | description: "MySQL instance is down on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 17 | ``` 18 | - #### MySQL too many connections 19 | 20 | ##### More than 80% of MySQL connections are in use on {{ $labels.instance }} 21 | 22 | ```yaml 23 | - alert: MysqlTooManyConnections 24 | expr: avg by (instance) (max_over_time(mysql_global_status_threads_connected[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80 25 | for: 5m 26 | labels: 27 | severity: warning 28 | annotations: 29 | summary: "MySQL too many connections (instance {{ $labels.instance }})" 30 | description: "More than 80% of MySQL connections are in use on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 31 | ``` 32 | 33 | 34 | 35 | - #### MySQL high threads running 36 | 37 | ##### More than 60% of MySQL connections are in running state on {{ $labels.instance }} 38 | 39 | ```yaml 40 | - alert: MysqlHighThreadsRunning 41 | expr: avg by (instance) (max_over_time(mysql_global_status_threads_running[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60 42 | for: 5m 43 | labels: 44 | severity: warning 45 | annotations: 46 | summary: "MySQL high threads running (instance {{ 
$labels.instance }})" 47 | description: "More than 60% of MySQL connections are in running state on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 48 | ``` 49 | 50 | 51 | 52 | - #### MySQL Slave IO thread not running 53 | 54 | ##### MySQL Slave IO thread not running on {{ $labels.instance }} 55 | 56 | ```yaml 57 | - alert: MysqlSlaveIoThreadNotRunning 58 | expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0 59 | for: 5m 60 | labels: 61 | severity: critical 62 | annotations: 63 | summary: "MySQL Slave IO thread not running (instance {{ $labels.instance }})" 64 | description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 65 | ``` 66 | 67 | - #### MySQL Slave SQL thread not running 68 | 69 | ##### MySQL Slave SQL thread not running on {{ $labels.instance }} 70 | 71 | ```yaml 72 | - alert: MysqlSlaveSqlThreadNotRunning 73 | expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0 74 | for: 5m 75 | labels: 76 | severity: critical 77 | annotations: 78 | summary: "MySQL Slave SQL thread not running (instance {{ $labels.instance }})" 79 | description: "MySQL Slave SQL thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 80 | ``` 81 | 82 | 83 | 84 | - #### MySQL Slave replication lag 85 | 86 | ##### MySQL replication lag on {{ $labels.instance }} 87 | 88 | ```yaml 89 | - alert: MysqlSlaveReplicationLag 90 | expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 300 91 | for: 5m 92 | labels: 93 | severity: warning 94 | annotations: 95 | summary: "MySQL Slave replication lag (instance {{ $labels.instance }})" 96 | description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 97 | ``` 98 | 99 | 100 | 101 | - #### MySQL 
slow queries 102 | 103 | ##### MySQL server has some slow queries. 104 | 105 | ```yaml 106 | - alert: MysqlSlowQueries 107 | expr: mysql_global_status_slow_queries > 0 108 | for: 5m 109 | labels: 110 | severity: warning 111 | annotations: 112 | summary: "MySQL slow queries (instance {{ $labels.instance }})" 113 | description: "MySQL server has some slow queries.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 114 | ``` 115 | 116 | 117 | 118 | - #### MySQL restarted 119 | 120 | ##### MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}. 121 | 122 | ```yaml 123 | - alert: MysqlRestarted 124 | expr: mysql_global_status_uptime < 60 125 | for: 5m 126 | labels: 127 | severity: warning 128 | annotations: 129 | summary: "MySQL restarted (instance {{ $labels.instance }})" 130 | description: "MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 131 | ``` 132 | -------------------------------------------------------------------------------- /orchestrators/etcd.yml: -------------------------------------------------------------------------------- 1 | groups: 2 | 3 | - name: EmbeddedExporter 4 | 5 | rules: 6 | 7 | - alert: EtcdInsufficientMembers 8 | expr: 'count(etcd_server_id) % 2 == 0' 9 | for: 0m 10 | labels: 11 | severity: critical 12 | annotations: 13 | summary: Etcd insufficient Members (instance {{ $labels.instance }}) 14 | description: "Etcd cluster should have an odd number of members\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 15 | 16 | - alert: EtcdNoLeader 17 | expr: 'etcd_server_has_leader == 0' 18 | for: 0m 19 | labels: 20 | severity: critical 21 | annotations: 22 | summary: Etcd no Leader (instance {{ $labels.instance }}) 23 | description: "Etcd cluster has no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 24 | 25 | - alert: EtcdHighNumberOfLeaderChanges 26 | expr: 'increase(etcd_server_leader_changes_seen_total[10m]) > 2' 27 | 
for: 0m 28 | labels: 29 | severity: warning 30 | annotations: 31 | summary: Etcd high number of leader changes (instance {{ $labels.instance }}) 32 | description: "Etcd leader changed more than 2 times during 10 minutes\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 33 | 34 | - alert: EtcdHighNumberOfFailedGrpcRequests 35 | expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.01' 36 | for: 2m 37 | labels: 38 | severity: warning 39 | annotations: 40 | summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }}) 41 | description: "More than 1% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 42 | 43 | - alert: EtcdHighNumberOfFailedGrpcRequests 44 | expr: 'sum(rate(grpc_server_handled_total{grpc_code!="OK"}[1m])) BY (grpc_service, grpc_method) / sum(rate(grpc_server_handled_total[1m])) BY (grpc_service, grpc_method) > 0.05' 45 | for: 2m 46 | labels: 47 | severity: critical 48 | annotations: 49 | summary: Etcd high number of failed GRPC requests (instance {{ $labels.instance }}) 50 | description: "More than 5% GRPC request failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 51 | 52 | - alert: EtcdGrpcRequestsSlow 53 | expr: 'histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15' 54 | for: 2m 55 | labels: 56 | severity: warning 57 | annotations: 58 | summary: Etcd GRPC requests slow (instance {{ $labels.instance }}) 59 | description: "GRPC requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 60 | 61 | - alert: EtcdHighNumberOfFailedHttpRequests 62 | expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01' 63 | for: 2m 64 | labels: 65 | severity: warning 66 | annotations: 67 | 
summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }}) 68 | description: "More than 1% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 69 | 70 | - alert: EtcdHighNumberOfFailedHttpRequests 71 | expr: 'sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05' 72 | for: 2m 73 | labels: 74 | severity: critical 75 | annotations: 76 | summary: Etcd high number of failed HTTP requests (instance {{ $labels.instance }}) 77 | description: "More than 5% HTTP failure detected in Etcd\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 78 | 79 | - alert: EtcdHttpRequestsSlow 80 | expr: 'histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15' 81 | for: 2m 82 | labels: 83 | severity: warning 84 | annotations: 85 | summary: Etcd HTTP requests slow (instance {{ $labels.instance }}) 86 | description: "HTTP requests slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 87 | 88 | - alert: EtcdMemberCommunicationSlow 89 | expr: 'histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15' 90 | for: 2m 91 | labels: 92 | severity: warning 93 | annotations: 94 | summary: Etcd member communication slow (instance {{ $labels.instance }}) 95 | description: "Etcd member communication slowing down, 99th percentile is over 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 96 | 97 | - alert: EtcdHighNumberOfFailedProposals 98 | expr: 'increase(etcd_server_proposals_failed_total[1h]) > 5' 99 | for: 2m 100 | labels: 101 | severity: warning 102 | annotations: 103 | summary: Etcd high number of failed proposals (instance {{ $labels.instance }}) 104 | description: "Etcd server got more than 5 failed proposals past hour\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 105 | 106 | - alert: EtcdHighFsyncDurations 107 | expr: 'histogram_quantile(0.99, 
rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5' 108 | for: 2m 109 | labels: 110 | severity: warning 111 | annotations: 112 | summary: Etcd high fsync durations (instance {{ $labels.instance }}) 113 | description: "Etcd WAL fsync duration increasing, 99th percentile is over 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 114 | 115 | - alert: EtcdHighCommitDurations 116 | expr: 'histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25' 117 | for: 2m 118 | labels: 119 | severity: warning 120 | annotations: 121 | summary: Etcd high commit durations (instance {{ $labels.instance }}) 122 | description: "Etcd commit duration increasing, 99th percentile is over 0.25s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 123 | -------------------------------------------------------------------------------- /databases/redis.md: -------------------------------------------------------------------------------- 1 | ## Redis 2 | 3 | ### [Prometheus Redis Metrics Exporter](https://github.com/oliver006/redis_exporter) 4 | - #### Redis down 5 | 6 | ##### Redis instance is down 7 | ```yaml 8 | - alert: RedisDown 9 | expr: redis_up == 0 10 | for: 5m 11 | labels: 12 | severity: critical 13 | annotations: 14 | summary: "Redis down (instance {{ $labels.instance }})" 15 | description: "Redis instance is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 16 | ``` 17 | 18 | 19 | 20 | - #### Redis missing master 21 | 22 | ##### Redis cluster has no node marked as master 23 | 24 | ```yaml 25 | - alert: RedisMissingMaster 26 | expr: count(redis_instance_info{role="master"}) == 0 27 | for: 5m 28 | labels: 29 | severity: critical 30 | annotations: 31 | summary: "Redis missing master (instance {{ $labels.instance }})" 32 | description: "Redis cluster has no node marked as master.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 33 | ``` 34 | 35 | 36 | 37 | - #### Redis too many masters 38 | 39 | ##### Redis cluster has too many nodes marked as 
master 40 | 41 | ```yaml 42 | - alert: RedisTooManyMasters 43 | expr: count(redis_instance_info{role="master"}) > 1 44 | for: 5m 45 | labels: 46 | severity: critical 47 | annotations: 48 | summary: "Redis too many masters (instance {{ $labels.instance }})" 49 | description: "Redis cluster has too many nodes marked as master.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 50 | ``` 51 | 52 | 53 | 54 | - #### Redis disconnected slaves 55 | 56 | ##### Redis not replicating for all slaves. Consider reviewing the redis replication status 57 | ```yaml 58 | - alert: RedisDisconnectedSlaves 59 | expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1 60 | for: 5m 61 | labels: 62 | severity: critical 63 | annotations: 64 | summary: "Redis disconnected slaves (instance {{ $labels.instance }})" 65 | description: "Redis not replicating for all slaves. Consider reviewing the redis replication status.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 66 | ``` 67 | 68 | 69 | 70 | - #### Redis replication broken 71 | 72 | ##### Redis instance lost a slave 73 | ```yaml 74 | - alert: RedisReplicationBroken 75 | expr: delta(redis_connected_slaves[1m]) < 0 76 | for: 5m 77 | labels: 78 | severity: critical 79 | annotations: 80 | summary: "Redis replication broken (instance {{ $labels.instance }})" 81 | description: "Redis instance lost a slave\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 82 | ``` 83 | 84 | 85 | 86 | - #### Redis cluster flapping 87 | 88 | ##### Changes have been detected in Redis replica connection. 
This can occur when replica nodes lose connection to the master and reconnect (a.k.a. flapping) 89 | 90 | ```yaml 91 | - alert: RedisClusterFlapping 92 | expr: changes(redis_connected_slaves[5m]) > 2 93 | for: 5m 94 | labels: 95 | severity: critical 96 | annotations: 97 | summary: "Redis cluster flapping (instance {{ $labels.instance }})" 98 | description: "Changes have been detected in Redis replica connection. This can occur when replica nodes lose connection to the master and reconnect (a.k.a. flapping).\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 99 | ``` 100 | 101 | 102 | 103 | - #### Redis missing backup 104 | 105 | ##### Redis has not been backed up for 24 hours 106 | 107 | ```yaml 108 | - alert: RedisMissingBackup 109 | expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24 110 | for: 5m 111 | labels: 112 | severity: critical 113 | annotations: 114 | summary: "Redis missing backup (instance {{ $labels.instance }})" 115 | description: "Redis has not been backed up for 24 hours\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 116 | ``` 117 | 118 | 119 | 120 | - #### Redis out of memory 121 | 122 | ##### Redis is running out of memory (> 90%) 123 | 124 | ```yaml 125 | - alert: RedisOutOfMemory 126 | expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90 127 | for: 5m 128 | labels: 129 | severity: warning 130 | annotations: 131 | summary: "Redis out of memory (instance {{ $labels.instance }})" 132 | description: "Redis is running out of memory (> 90%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 133 | ``` 134 | 135 | 136 | 137 | - #### Redis too many connections 138 | 139 | ##### Redis instance has too many connections 140 | 141 | ```yaml 142 | - alert: RedisTooManyConnections 143 | expr: redis_connected_clients > 100 144 | for: 5m 145 | labels: 146 | severity: warning 147 | annotations: 148 | summary: "Redis too many connections (instance {{ $labels.instance }})" 149 | description: "Redis instance has too many connections\n 
VALUE = {{ $value }}\n LABELS: {{ $labels }}" 150 | ``` 151 | 152 | 153 | 154 | - #### Redis not enough connections 155 | 156 | ##### Redis instance should have more connections (> 5) 157 | 158 | ```yaml 159 | - alert: RedisNotEnoughConnections 160 | expr: redis_connected_clients < 5 161 | for: 5m 162 | labels: 163 | severity: warning 164 | annotations: 165 | summary: "Redis not enough connections (instance {{ $labels.instance }})" 166 | description: "Redis instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 167 | ``` 168 | 169 | 170 | 171 | - #### Redis rejected connections 172 | 173 | ##### Some connections to Redis have been rejected 174 | 175 | ```yaml 176 | - alert: RedisRejectedConnections 177 | expr: increase(redis_rejected_connections_total[1m]) > 0 178 | for: 5m 179 | labels: 180 | severity: critical 181 | annotations: 182 | summary: "Redis rejected connections (instance {{ $labels.instance }})" 183 | description: "Some connections to Redis have been rejected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 184 | ``` 185 | -------------------------------------------------------------------------------- /storage/ceph.md: -------------------------------------------------------------------------------- 1 | Embedded exporter https://docs.ceph.com/en/quincy/mgr/prometheus/ 2 | 3 | ### Ceph State 4 | #### Ceph instance unhealthy 5 | ```yaml 6 | - alert: CephState 7 | expr: ceph_health_status != 0 8 | for: 0m 9 | labels: 10 | severity: critical 11 | annotations: 12 | summary: Ceph State (instance {{ $labels.instance }}) 13 | description: "Ceph instance unhealthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 14 | ``` 15 | 16 | ### Ceph monitor clock skew 17 | #### Ceph monitor clock skew detected. 
Please check ntp and hardware clock settings 18 | ```yaml 19 | - alert: CephMonitorClockSkew 20 | expr: abs(ceph_monitor_clock_skew_seconds) > 0.2 21 | for: 2m 22 | labels: 23 | severity: warning 24 | annotations: 25 | summary: Ceph monitor clock skew (instance {{ $labels.instance }}) 26 | description: "Ceph monitor clock skew detected. Please check ntp and hardware clock settings\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 27 | ``` 28 | ### Ceph monitor low space 29 | #### Ceph monitor storage is low. 30 | ```yaml 31 | - alert: CephMonitorLowSpace 32 | expr: ceph_monitor_avail_percent < 10 33 | for: 2m 34 | labels: 35 | severity: warning 36 | annotations: 37 | summary: Ceph monitor low space (instance {{ $labels.instance }}) 38 | description: "Ceph monitor storage is low.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 39 | ``` 40 | ### Ceph OSD Down 41 | #### Ceph Object Storage Daemon Down 42 | ```yaml 43 | - alert: CephOsdDown 44 | expr: ceph_osd_up == 0 45 | for: 0m 46 | labels: 47 | severity: critical 48 | annotations: 49 | summary: Ceph OSD Down (instance {{ $labels.instance }}) 50 | description: "Ceph Object Storage Daemon Down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 51 | ``` 52 | ### Ceph high OSD latency 53 | #### Ceph Object Storage Daemon latency is high. Please check that it is not stuck in a weird state. 54 | ```yaml 55 | - alert: CephHighOsdLatency 56 | expr: ceph_osd_perf_apply_latency_seconds > 5 57 | for: 1m 58 | labels: 59 | severity: warning 60 | annotations: 61 | summary: Ceph high OSD latency (instance {{ $labels.instance }}) 62 | description: "Ceph Object Storage Daemon latency is high. Please check that it is not stuck in a weird state.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 63 | ``` 64 | ### Ceph OSD low space 65 | #### Ceph Object Storage Daemon is going out of space. 
Please add more disks. 66 | ```yaml 67 | - alert: CephOsdLowSpace 68 | expr: ceph_osd_utilization > 90 69 | for: 2m 70 | labels: 71 | severity: warning 72 | annotations: 73 | summary: Ceph OSD low space (instance {{ $labels.instance }}) 74 | description: "Ceph Object Storage Daemon is going out of space. Please add more disks.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 75 | ``` 76 | ### Ceph OSD reweighted 77 | #### Ceph Object Storage Daemon takes too much time to resize. 78 | ```yaml 79 | - alert: CephOsdReweighted 80 | expr: ceph_osd_weight < 1 81 | for: 2m 82 | labels: 83 | severity: warning 84 | annotations: 85 | summary: Ceph OSD reweighted (instance {{ $labels.instance }}) 86 | description: "Ceph Object Storage Daemon takes too much time to resize.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 87 | ``` 88 | ### Ceph PG down 89 | #### Some Ceph placement groups are down. Please ensure that all the data are available. 90 | ```yaml 91 | - alert: CephPgDown 92 | expr: ceph_pg_down > 0 93 | for: 0m 94 | labels: 95 | severity: critical 96 | annotations: 97 | summary: Ceph PG down (instance {{ $labels.instance }}) 98 | description: "Some Ceph placement groups are down. Please ensure that all the data are available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 99 | ``` 100 | ### Ceph PG incomplete 101 | #### Some Ceph placement groups are incomplete. Please ensure that all the data are available. 102 | ```yaml 103 | - alert: CephPgIncomplete 104 | expr: ceph_pg_incomplete > 0 105 | for: 0m 106 | labels: 107 | severity: critical 108 | annotations: 109 | summary: Ceph PG incomplete (instance {{ $labels.instance }}) 110 | description: "Some Ceph placement groups are incomplete. Please ensure that all the data are available.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 111 | ``` 112 | ### Ceph PG inconsistent 113 | #### Some Ceph placement groups are inconsistent. 
Data is available but inconsistent across nodes. 114 | ```yaml 115 | - alert: CephPgInconsistent 116 | expr: ceph_pg_inconsistent > 0 117 | for: 0m 118 | labels: 119 | severity: warning 120 | annotations: 121 | summary: Ceph PG inconsistent (instance {{ $labels.instance }}) 122 | description: "Some Ceph placement groups are inconsistent. Data is available but inconsistent across nodes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 123 | ``` 124 | ### Ceph PG activation long 125 | #### Some Ceph placement groups are taking too long to activate. 126 | ```yaml 127 | - alert: CephPgActivationLong 128 | expr: ceph_pg_activating > 0 129 | for: 2m 130 | labels: 131 | severity: warning 132 | annotations: 133 | summary: Ceph PG activation long (instance {{ $labels.instance }}) 134 | description: "Some Ceph placement groups are taking too long to activate.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 135 | ``` 136 | ### Ceph PG backfill full 137 | #### Some Ceph placement groups are located on a full Object Storage Daemon in the cluster. Those PGs can become unavailable shortly. Please check OSDs, change weight or reconfigure CRUSH rules. 138 | ```yaml 139 | - alert: CephPgBackfillFull 140 | expr: ceph_pg_backfill_toofull > 0 141 | for: 2m 142 | labels: 143 | severity: warning 144 | annotations: 145 | summary: Ceph PG backfill full (instance {{ $labels.instance }}) 146 | description: "Some Ceph placement groups are located on a full Object Storage Daemon in the cluster. Those PGs can become unavailable shortly. 
Please check OSDs, change weight or reconfigure CRUSH rules.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 147 | ``` 148 | ### Ceph PG unavailable 149 | #### Some Ceph placement groups are unavailable. 150 | ```yaml 151 | - alert: CephPgUnavailable 152 | expr: ceph_pg_total - ceph_pg_active > 0 153 | for: 0m 154 | labels: 155 | severity: critical 156 | annotations: 157 | summary: Ceph PG unavailable (instance {{ $labels.instance }}) 158 | description: "Some Ceph placement groups are unavailable.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 159 | ``` 160 | -------------------------------------------------------------------------------- /databases/mongodb.md: -------------------------------------------------------------------------------- 1 | ## MongoDB 2 | 3 | ### [MongoDB Exporter](https://github.com/dcu/mongodb_exporter) 4 | 5 | - #### MongoDB replication lag 6 | 7 | ##### MongoDB replication lag is more than 10s 8 | 9 | ```yaml 10 | - alert: MongodbReplicationLag 11 | expr: avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}) > 10 12 | for: 5m 13 | labels: 14 | severity: critical 15 | annotations: 16 | summary: "MongoDB replication lag (instance {{ $labels.instance }})" 17 | description: "MongoDB replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 18 | ``` 19 | 20 | 21 | 22 | - #### MongoDB replication headroom 23 | 24 | ##### MongoDB replication headroom is <= 0 25 | 26 | ```yaml 27 | - alert: MongodbReplicationHeadroom 28 | expr: (avg(mongodb_replset_oplog_tail_timestamp - mongodb_replset_oplog_head_timestamp) - (avg(mongodb_replset_member_optime_date{state="PRIMARY"}) - avg(mongodb_replset_member_optime_date{state="SECONDARY"}))) <= 0 29 | for: 5m 30 | labels: 31 | severity: critical 32 | annotations: 33 | summary: "MongoDB replication headroom (instance {{ $labels.instance }})" 34 | description: "MongoDB replication headroom is <= 0\n VALUE = {{ $value 
}}\n LABELS: {{ $labels }}" 35 | ``` 36 | 37 | 38 | 39 | - #### MongoDB replication Status 3 40 | 41 | ##### MongoDB Replication set member either performs startup self-checks or transitions from completing a rollback or resync 42 | 43 | ```yaml 44 | - alert: MongodbReplicationStatus3 45 | expr: mongodb_replset_member_state == 3 46 | for: 5m 47 | labels: 48 | severity: critical 49 | annotations: 50 | summary: "MongoDB replication Status 3 (instance {{ $labels.instance }})" 51 | description: "MongoDB Replication set member either performs startup self-checks or transitions from completing a rollback or resync\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 52 | ``` 53 | 54 | 55 | 56 | - #### MongoDB replication Status 6 57 | 58 | ##### MongoDB Replication set member, as seen from another member of the set, is not yet known 59 | 60 | ```yaml 61 | - alert: MongodbReplicationStatus6 62 | expr: mongodb_replset_member_state == 6 63 | for: 5m 64 | labels: 65 | severity: critical 66 | annotations: 67 | summary: "MongoDB replication Status 6 (instance {{ $labels.instance }})" 68 | description: "MongoDB Replication set member, as seen from another member of the set, is not yet known\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 69 | ``` 70 | 71 | 72 | 73 | - #### MongoDB replication Status 8 74 | 75 | ##### MongoDB Replication set member, as seen from another member of the set, is unreachable 76 | ```yaml 77 | - alert: MongodbReplicationStatus8 78 | expr: mongodb_replset_member_state == 8 79 | for: 5m 80 | labels: 81 | severity: critical 82 | annotations: 83 | summary: "MongoDB replication Status 8 (instance {{ $labels.instance }})" 84 | description: "MongoDB Replication set member, as seen from another member of the set, is unreachable\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 85 | ``` 86 | 87 | 88 | 89 | - #### MongoDB replication Status 9 90 | 91 | ##### MongoDB Replication set member is actively performing a rollback. 
Data is not available for reads 92 | 93 | ```yaml 94 | - alert: MongodbReplicationStatus9 95 | expr: mongodb_replset_member_state == 9 96 | for: 5m 97 | labels: 98 | severity: critical 99 | annotations: 100 | summary: "MongoDB replication Status 9 (instance {{ $labels.instance }})" 101 | description: "MongoDB Replication set member is actively performing a rollback. Data is not available for reads\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 102 | ``` 103 | 104 | 105 | 106 | - #### MongoDB replication Status 10 107 | 108 | ##### MongoDB Replication set member was once in a replica set but was subsequently removed 109 | 110 | ```yaml 111 | - alert: MongodbReplicationStatus10 112 | expr: mongodb_replset_member_state == 10 113 | for: 5m 114 | labels: 115 | severity: critical 116 | annotations: 117 | summary: "MongoDB replication Status 10 (instance {{ $labels.instance }})" 118 | description: "MongoDB Replication set member was once in a replica set but was subsequently removed\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 119 | ``` 120 | 121 | 122 | 123 | - #### MongoDB number cursors open 124 | 125 | ##### Too many cursors opened by MongoDB for clients (> 10k) 126 | 127 | ```yaml 128 | - alert: MongodbNumberCursorsOpen 129 | expr: mongodb_metrics_cursor_open{state="total_open"} > 10000 130 | for: 5m 131 | labels: 132 | severity: warning 133 | annotations: 134 | summary: "MongoDB number cursors open (instance {{ $labels.instance }})" 135 | description: "Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 136 | ``` 137 | 138 | 139 | 140 | - #### MongoDB cursors timeouts 141 | 142 | ##### Too many cursors are timing out 143 | 144 | ```yaml 145 | - alert: MongodbCursorsTimeouts 146 | expr: increase(mongodb_metrics_cursor_timed_out_total[10m]) > 100 147 | for: 5m 148 | labels: 149 | severity: warning 150 | annotations: 151 | summary: "MongoDB cursors timeouts (instance {{ $labels.instance }})" 152 | description: "Too 
many cursors are timing out\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 153 | ``` 154 | 155 | 156 | 157 | - #### MongoDB too many connections 158 | 159 | ##### Too many connections 160 | 161 | ```yaml 162 | - alert: MongodbTooManyConnections 163 | expr: mongodb_connections{state="current"} > 500 164 | for: 5m 165 | labels: 166 | severity: warning 167 | annotations: 168 | summary: "MongoDB too many connections (instance {{ $labels.instance }})" 169 | description: "Too many connections\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 170 | ``` 171 | 172 | 173 | 174 | - #### MongoDB virtual memory usage 175 | 176 | ##### High memory usage 177 | 178 | ```yaml 179 | - alert: MongodbVirtualMemoryUsage 180 | expr: (sum(mongodb_memory{type="virtual"}) BY (ip) / sum(mongodb_memory{type="mapped"}) BY (ip)) > 3 181 | for: 5m 182 | labels: 183 | severity: warning 184 | annotations: 185 | summary: "MongoDB virtual memory usage (instance {{ $labels.instance }})" 186 | description: "High memory usage\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 187 | ``` 188 | -------------------------------------------------------------------------------- /brokers/elasticsearch.md: -------------------------------------------------------------------------------- 1 | ## Elasticsearch 2 | 3 | ### [Elasticsearch Exporter](https://github.com/justwatchcom/elasticsearch_exporter) 4 | 5 | 6 | - #### Elasticsearch Heap Usage Too High 7 | 8 | ##### The heap usage is over 90% for 5m 9 | 10 | ```yaml 11 | - alert: ElasticsearchHeapUsageTooHigh 12 | expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90 13 | for: 5m 14 | labels: 15 | severity: critical 16 | annotations: 17 | summary: "Elasticsearch Heap Usage Too High (instance {{ $labels.instance }})" 18 | description: "The heap usage is over 90% for 5m\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 19 | ``` 20 | 21 | 22 | 23 | - #### Elasticsearch Heap Usage warning 24 | 25 | ##### The 
heap usage is over 80% for 5m 26 | 27 | ```yaml 28 | - alert: ElasticsearchHeapUsageWarning 29 | expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80 30 | for: 5m 31 | labels: 32 | severity: warning 33 | annotations: 34 | summary: "Elasticsearch Heap Usage warning (instance {{ $labels.instance }})" 35 | description: "The heap usage is over 80% for 5m\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 36 | ``` 37 | 38 | 39 | 40 | - #### Elasticsearch disk space low 41 | 42 | ##### The disk usage is over 80% 43 | 44 | ```yaml 45 | - alert: ElasticsearchDiskSpaceLow 46 | expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20 47 | for: 5m 48 | labels: 49 | severity: warning 50 | annotations: 51 | summary: "Elasticsearch disk space low (instance {{ $labels.instance }})" 52 | description: "The disk usage is over 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 53 | ``` 54 | 55 | 56 | 57 | - #### Elasticsearch disk out of space 58 | 59 | ##### The disk usage is over 90% 60 | 61 | ```yaml 62 | - alert: ElasticsearchDiskOutOfSpace 63 | expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10 64 | for: 5m 65 | labels: 66 | severity: critical 67 | annotations: 68 | summary: "Elasticsearch disk out of space (instance {{ $labels.instance }})" 69 | description: "The disk usage is over 90%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 70 | ``` 71 | 72 | 73 | 74 | - #### Elasticsearch Cluster Red 75 | 76 | ##### Elastic Cluster Red status 77 | 78 | ```yaml 79 | - alert: ElasticsearchClusterRed 80 | expr: elasticsearch_cluster_health_status{color="red"} == 1 81 | for: 5m 82 | labels: 83 | severity: critical 84 | annotations: 85 | summary: "Elasticsearch Cluster Red (instance {{ $labels.instance }})" 86 | description: "Elastic Cluster Red status\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 87 | ``` 88 | 89 | 90 | 91 
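These cluster-health rules can be sanity-checked offline with promtool's rule unit-test format. A minimal sketch, assuming the rules above are wrapped in a `groups:` block and saved as `elasticsearch.rules.yml` (both file names here are hypothetical):

```yaml
# tests/elasticsearch_tests.yml (run with: promtool test rules tests/elasticsearch_tests.yml)
rule_files:
  - elasticsearch.rules.yml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # Cluster reports red status for the whole test window
      - series: 'elasticsearch_cluster_health_status{color="red", instance="es-1"}'
        values: '1x10'
    alert_rule_test:
      # After the 5m "for" clause has elapsed, the alert should be firing
      - eval_time: 10m
        alertname: ElasticsearchClusterRed
        exp_alerts:
          - exp_labels:
              severity: critical
              color: red
              instance: es-1
```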
| - #### Elasticsearch Cluster Yellow 92 | 93 | ##### Elastic Cluster Yellow status 94 | 95 | ```yaml 96 | - alert: ElasticsearchClusterYellow 97 | expr: elasticsearch_cluster_health_status{color="yellow"} == 1 98 | for: 5m 99 | labels: 100 | severity: warning 101 | annotations: 102 | summary: "Elasticsearch Cluster Yellow (instance {{ $labels.instance }})" 103 | description: "Elastic Cluster Yellow status\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 104 | ``` 105 | 106 | 107 | 108 | - #### Elasticsearch Healthy Nodes 109 | 110 | ##### Number Healthy Nodes less than number_of_nodes 111 | 112 | ```yaml 113 | - alert: ElasticsearchHealthyNodes 114 | expr: elasticsearch_cluster_health_number_of_nodes < number_of_nodes 115 | for: 5m 116 | labels: 117 | severity: critical 118 | annotations: 119 | summary: "Elasticsearch Healthy Nodes (instance {{ $labels.instance }})" 120 | description: "Number Healthy Nodes less than number_of_nodes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 121 | ``` 122 | 123 | 124 | 125 | - #### Elasticsearch Healthy Data Nodes 126 | 127 | ##### Number Healthy Data Nodes less than number_of_data_nodes 128 | 129 | ```yaml 130 | - alert: ElasticsearchHealthyDataNodes 131 | expr: elasticsearch_cluster_health_number_of_data_nodes < number_of_data_nodes 132 | for: 5m 133 | labels: 134 | severity: critical 135 | annotations: 136 | summary: "Elasticsearch Healthy Data Nodes (instance {{ $labels.instance }})" 137 | description: "Number Healthy Data Nodes less than number_of_data_nodes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 138 | ``` 139 | 140 | 141 | 142 | - #### Elasticsearch relocation shards 143 | 144 | ##### Number of relocation shards for 20 min 145 | 146 | ```yaml 147 | - alert: ElasticsearchRelocationShards 148 | expr: elasticsearch_cluster_health_relocating_shards > 0 149 | for: 5m 150 | labels: 151 | severity: critical 152 | annotations: 153 | summary: "Elasticsearch relocation shards (instance {{ $labels.instance }})" 154 | 
description: "Number of relocation shards for 20 min\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 155 | ``` 156 | 157 | 158 | 159 | - #### Elasticsearch initializing shards 160 | 161 | ##### Number of initializing shards for 10 min 162 | 163 | ```yaml 164 | - alert: ElasticsearchInitializingShards 165 | expr: elasticsearch_cluster_health_initializing_shards > 0 166 | for: 5m 167 | labels: 168 | severity: warning 169 | annotations: 170 | summary: "Elasticsearch initializing shards (instance {{ $labels.instance }})" 171 | description: "Number of initializing shards for 10 min\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 172 | ``` 173 | 174 | 175 | 176 | - #### Elasticsearch unassigned shards 177 | 178 | ##### Number of unassigned shards for 2 min 179 | 180 | ```yaml 181 | - alert: ElasticsearchUnassignedShards 182 | expr: elasticsearch_cluster_health_unassigned_shards > 0 183 | for: 5m 184 | labels: 185 | severity: critical 186 | annotations: 187 | summary: "Elasticsearch unassigned shards (instance {{ $labels.instance }})" 188 | description: "Number of unassigned shards for 2 min\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 189 | ``` 190 | 191 | 192 | 193 | - #### Elasticsearch pending tasks 194 | 195 | ##### Number of pending tasks for 10 min. Cluster works slowly 196 | 197 | ```yaml 198 | - alert: ElasticsearchPendingTasks 199 | expr: elasticsearch_cluster_health_number_of_pending_tasks > 0 200 | for: 5m 201 | labels: 202 | severity: warning 203 | annotations: 204 | summary: "Elasticsearch pending tasks (instance {{ $labels.instance }})" 205 | description: "Number of pending tasks for 10 min. Cluster works slowly.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 206 | ``` 207 | 208 | 209 | 210 | - #### Elasticsearch no new documents 211 | 212 | ##### No new documents for 10 min! 
213 | 214 | ```yaml 215 | - alert: ElasticsearchNoNewDocuments 216 | expr: rate(elasticsearch_indices_docs{es_data_node="true"}[10m]) < 1 217 | for: 5m 218 | labels: 219 | severity: warning 220 | annotations: 221 | summary: "Elasticsearch no new documents (instance {{ $labels.instance }})" 222 | description: "No new documents for 10 min!\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 223 | ``` 224 | -------------------------------------------------------------------------------- /databases/cassandra.md: -------------------------------------------------------------------------------- 1 | ## Cassandra 2 | 3 | ### [Cassandra Exporter](https://github.com/criteo/cassandra_exporter) 4 | 5 | - #### Cassandra hints count 6 | 7 | ##### Cassandra hints count has changed on {{ $labels.instance }}; some nodes may go down 8 | 9 | ```yaml 10 | - alert: CassandraHintsCount 11 | expr: changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:totalhints:count"}[1m]) > 3 12 | for: 5m 13 | labels: 14 | severity: critical 15 | annotations: 16 | summary: "Cassandra hints count (instance {{ $labels.instance }})" 17 | description: "Cassandra hints count has changed on {{ $labels.instance }}; some nodes may go down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 18 | ``` 19 | 20 | 21 | 22 | - #### Cassandra compaction task pending 23 | 24 | ##### Many Cassandra compaction tasks are pending. You might need to increase I/O capacity by adding nodes to the cluster 25 | 26 | ```yaml 27 | - alert: CassandraCompactionTaskPending 28 | expr: avg_over_time(cassandra_stats{name="org:apache:cassandra:metrics:compaction:pendingtasks:value"}[30m]) > 100 29 | for: 5m 30 | labels: 31 | severity: warning 32 | annotations: 33 | summary: "Cassandra compaction task pending (instance {{ $labels.instance }})" 34 | description: "Many Cassandra compaction tasks are pending. 
You might need to increase I/O capacity by adding nodes to the cluster.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 35 | ``` 36 | 37 | 38 | 39 | - #### Cassandra viewwrite latency 40 | 41 | ##### High viewwrite latency on {{ $labels.instance }} cassandra node 42 | 43 | ```yaml 44 | - alert: CassandraViewwriteLatency 45 | expr: cassandra_stats{name="org:apache:cassandra:metrics:clientrequest:viewwrite:viewwritelatency:99thpercentile",service="cas"} > 100000 46 | for: 5m 47 | labels: 48 | severity: warning 49 | annotations: 50 | summary: "Cassandra viewwrite latency (instance {{ $labels.instance }})" 51 | description: "High viewwrite latency on {{ $labels.instance }} cassandra node\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 52 | ``` 53 | 54 | 55 | 56 | - #### Cassandra cool hacker 57 | 58 | ##### Increase of Cassandra authentication failures 59 | 60 | ```yaml 61 | - alert: CassandraCoolHacker 62 | expr: irate(cassandra_stats{name="org:apache:cassandra:metrics:client:authfailure:count"}[1m]) > 5 63 | for: 5m 64 | labels: 65 | severity: warning 66 | annotations: 67 | summary: "Cassandra cool hacker (instance {{ $labels.instance }})" 68 | description: "Increase of Cassandra authentication failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 69 | ``` 70 | 71 | 72 | 73 | - #### Cassandra node down 74 | 75 | ##### Cassandra node down 76 | 77 | ```yaml 78 | - alert: CassandraNodeDown 79 | expr: sum(cassandra_stats{name="org:apache:cassandra:net:failuredetector:downendpointcount"}) by (service,group,cluster,env) > 0 80 | for: 5m 81 | labels: 82 | severity: critical 83 | annotations: 84 | summary: "Cassandra node down (instance {{ $labels.instance }})" 85 | description: "Cassandra node down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 86 | ``` 87 | 88 | 89 | 90 | - #### Cassandra commitlog pending tasks 91 | 92 | ##### Unexpected number of Cassandra commitlog pending tasks 93 | 94 | ```yaml 95 | - alert: CassandraCommitlogPendingTasks 96 | expr: 
cassandra_stats{name="org:apache:cassandra:metrics:commitlog:pendingtasks:value"} > 15 97 | for: 5m 98 | labels: 99 | severity: warning 100 | annotations: 101 | summary: "Cassandra commitlog pending tasks (instance {{ $labels.instance }})" 102 | description: "Unexpected number of Cassandra commitlog pending tasks\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 103 | ``` 104 | 105 | 106 | 107 | - #### Cassandra compaction executor blocked tasks 108 | 109 | ##### Some Cassandra compaction executor tasks are blocked 110 | 111 | ```yaml 112 | - alert: CassandraCompactionExecutorBlockedTasks 113 | expr: cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:compactionexecutor:currentlyblockedtasks:count"} > 0 114 | for: 5m 115 | labels: 116 | severity: warning 117 | annotations: 118 | summary: "Cassandra compaction executor blocked tasks (instance {{ $labels.instance }})" 119 | description: "Some Cassandra compaction executor tasks are blocked\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 120 | ``` 121 | 122 | 123 | 124 | - #### Cassandra flush writer blocked tasks 125 | 126 | ##### Some Cassandra flush writer tasks are blocked 127 | 128 | ```yaml 129 | - alert: CassandraFlushWriterBlockedTasks 130 | expr: cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:memtableflushwriter:currentlyblockedtasks:count"} > 0 131 | for: 5m 132 | labels: 133 | severity: warning 134 | annotations: 135 | summary: "Cassandra flush writer blocked tasks (instance {{ $labels.instance }})" 136 | description: "Some Cassandra flush writer tasks are blocked\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 137 | ``` 138 | 139 | 140 | 141 | - #### Cassandra repair pending tasks 142 | 143 | ##### Some Cassandra repair tasks are pending 144 | 145 | ```yaml 146 | - alert: CassandraRepairPendingTasks 147 | expr: cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:pendingtasks:value"} > 2 148 | for: 5m 149 | labels: 150 | 
severity: warning 151 | annotations: 152 | summary: "Cassandra repair pending tasks (instance {{ $labels.instance }})" 153 | description: "Some Cassandra repair tasks are pending\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 154 | ``` 155 | 156 | 157 | 158 | - #### Cassandra repair blocked tasks 159 | 160 | ##### Some Cassandra repair tasks are blocked 161 | 162 | ```yaml 163 | - alert: CassandraRepairBlockedTasks 164 | expr: cassandra_stats{name="org:apache:cassandra:metrics:threadpools:internal:antientropystage:currentlyblockedtasks:count"} > 0 165 | for: 5m 166 | labels: 167 | severity: warning 168 | annotations: 169 | summary: "Cassandra repair blocked tasks (instance {{ $labels.instance }})" 170 | description: "Some Cassandra repair tasks are blocked\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 171 | ``` 172 | 173 | 174 | 175 | - #### Cassandra connection timeouts total 176 | 177 | ##### Some connection between nodes are ending in timeout 178 | 179 | ```yaml 180 | - alert: CassandraConnectionTimeoutsTotal 181 | expr: rate(cassandra_stats{name="org:apache:cassandra:metrics:connection:totaltimeouts:count"}[1m]) > 5 182 | for: 5m 183 | labels: 184 | severity: critical 185 | annotations: 186 | summary: "Cassandra connection timeouts total (instance {{ $labels.instance }})" 187 | description: "Some connection between nodes are ending in timeout\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 188 | ``` 189 | 190 | 191 | 192 | - #### Cassandra storage exceptions 193 | 194 | ##### Something is going wrong with cassandra storage 195 | 196 | ```yaml 197 | - alert: CassandraStorageExceptions 198 | expr: changes(cassandra_stats{name="org:apache:cassandra:metrics:storage:exceptions:count"}[1m]) > 1 199 | for: 5m 200 | labels: 201 | severity: critical 202 | annotations: 203 | summary: "Cassandra storage exceptions (instance {{ $labels.instance }})" 204 | description: "Something is going wrong with cassandra storage\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 205 | 
``` 206 | -------------------------------------------------------------------------------- /brokers/rabbitmq.md: -------------------------------------------------------------------------------- 1 | ## RabbitMQ 2 | 3 | ### [RabbitMQ Exporter](https://github.com/rabbitmq/rabbitmq-prometheus) 4 | 5 | - #### Rabbitmq down 6 | 7 | ##### RabbitMQ node down 8 | 9 | ```yaml 10 | - alert: RabbitmqDown 11 | expr: rabbitmq_up == 0 12 | for: 5m 13 | labels: 14 | severity: critical 15 | annotations: 16 | summary: "Rabbitmq down (instance {{ $labels.instance }})" 17 | description: "RabbitMQ node down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 18 | ``` 19 | 20 | - #### Rabbitmq cluster down 21 | 22 | ##### Less than 3 nodes running in RabbitMQ cluster 23 | 24 | ```yaml 25 | - alert: RabbitmqClusterDown 26 | expr: sum(rabbitmq_running) < 3 27 | for: 5m 28 | labels: 29 | severity: critical 30 | annotations: 31 | summary: "Rabbitmq cluster down (instance {{ $labels.instance }})" 32 | description: "Less than 3 nodes running in RabbitMQ cluster\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 33 | ``` 34 | 35 | 36 | 37 | - #### Rabbitmq cluster partition 38 | 39 | ##### Cluster partition 40 | 41 | ```yaml 42 | - alert: RabbitmqClusterPartition 43 | expr: rabbitmq_partitions > 0 44 | for: 5m 45 | labels: 46 | severity: critical 47 | annotations: 48 | summary: "Rabbitmq cluster partition (instance {{ $labels.instance }})" 49 | description: "Cluster partition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 50 | ``` 51 | 52 | 53 | 54 | - #### Rabbitmq out of memory 55 | 56 | ##### Memory available for RabbitMQ is low (< 10%) 57 | 58 | ```yaml 59 | - alert: RabbitmqOutOfMemory 60 | expr: rabbitmq_node_mem_used / rabbitmq_node_mem_limit * 100 > 90 61 | for: 5m 62 | labels: 63 | severity: warning 64 | annotations: 65 | summary: "Rabbitmq out of memory (instance {{ $labels.instance }})" 66 | description: "Memory available for RabbitMQ is low (< 10%)\n VALUE = {{ $value }}\n LABELS: {{ 
$labels }}" 67 | ``` 68 | 69 | - #### RabbitMQ memory high 70 | 71 | ##### A node uses more than 90% of allocated RAM 72 | ```yaml 73 | - alert: RabbitmqMemoryHigh 74 | expr: rabbitmq_process_resident_memory_bytes / rabbitmq_resident_memory_limit_bytes * 100 > 90 75 | for: 2m 76 | labels: 77 | severity: warning 78 | annotations: 79 | summary: Rabbitmq memory high (instance {{ $labels.instance }}) 80 | description: "A node uses more than 90% of allocated RAM\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 81 | ``` 82 | 83 | - #### Rabbitmq file descriptors usage 84 | 85 | ##### A node uses more than 90% of file descriptors 86 | ```yaml 87 | - alert: RabbitmqFileDescriptorsUsage 88 | expr: rabbitmq_process_open_fds / rabbitmq_process_max_fds * 100 > 90 89 | for: 2m 90 | labels: 91 | severity: warning 92 | annotations: 93 | summary: Rabbitmq file descriptors usage (instance {{ $labels.instance }}) 94 | description: "A node uses more than 90% of file descriptors\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 95 | ``` 96 | 97 | - #### RabbitMQ too many unack messages 98 | 99 | ##### Too many unacknowledged messages 100 | ```yaml 101 | - alert: RabbitmqTooManyUnackMessages 102 | expr: sum(rabbitmq_queue_messages_unacked) BY (queue) > 1000 103 | for: 1m 104 | labels: 105 | severity: warning 106 | annotations: 107 | summary: Rabbitmq too many unack messages (instance {{ $labels.instance }}) 108 | description: "Too many unacknowledged messages\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 109 | ``` 110 | 111 | - #### Rabbitmq too many connections 112 | 113 | ##### RabbitMQ instance has too many connections (> 1000) 114 | 115 | ```yaml 116 | - alert: RabbitmqTooManyConnections 117 | expr: rabbitmq_connectionsTotal > 1000 118 | for: 5m 119 | labels: 120 | severity: warning 121 | annotations: 122 | summary: "Rabbitmq too many connections (instance {{ $labels.instance }})" 123 | description: "RabbitMQ instance has too many connections (> 1000)\n VALUE = {{ $value }}\n LABELS: 
{{ $labels }}" 124 | ``` 125 | - #### RabbitMQ no queue consumer 126 | 127 | ##### A queue has less than 1 consumer 128 | 129 | ```yaml 130 | - alert: RabbitmqNoQueueConsumer 131 | expr: rabbitmq_queue_consumers < 1 132 | for: 1m 133 | labels: 134 | severity: warning 135 | annotations: 136 | summary: Rabbitmq no queue consumer (instance {{ $labels.instance }}) 137 | description: "A queue has less than 1 consumer\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 138 | ``` 139 | 140 | 141 | - #### Rabbitmq dead letter queue filling up 142 | 143 | ##### Dead letter queue is filling up (> 10 msgs) 144 | 145 | ```yaml 146 | - alert: RabbitmqDeadLetterQueueFillingUp 147 | expr: rabbitmq_queue_messages{queue="my-dead-letter-queue"} > 10 148 | for: 5m 149 | labels: 150 | severity: critical 151 | annotations: 152 | summary: "Rabbitmq dead letter queue filling up (instance {{ $labels.instance }})" 153 | description: "Dead letter queue is filling up (> 10 msgs)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 154 | ``` 155 | 156 | 157 | 158 | - #### Rabbitmq too many messages in queue 159 | 160 | ##### Queue is filling up (> 1000 msgs) 161 | 162 | ```yaml 163 | - alert: RabbitmqTooManyMessagesInQueue 164 | expr: rabbitmq_queue_messages_ready{queue="my-queue"} > 1000 165 | for: 5m 166 | labels: 167 | severity: warning 168 | annotations: 169 | summary: "Rabbitmq too many messages in queue (instance {{ $labels.instance }})" 170 | description: "Queue is filling up (> 1000 msgs)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 171 | ``` 172 | 173 | 174 | 175 | - #### Rabbitmq slow queue consuming 176 | 177 | ##### Queue messages are consumed slowly (> 60s) 178 | 179 | ```yaml 180 | - alert: RabbitmqSlowQueueConsuming 181 | expr: time() - rabbitmq_queue_head_message_timestamp{queue="my-queue"} > 60 182 | for: 5m 183 | labels: 184 | severity: warning 185 | annotations: 186 | summary: "Rabbitmq slow queue consuming (instance {{ $labels.instance }})" 187 | description: "Queue messages 
are consumed slowly (> 60s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 188 | ``` 189 | 190 | 191 | 192 | - #### Rabbitmq no consumer 193 | 194 | ##### Queue has no consumer 195 | 196 | ```yaml 197 | - alert: RabbitmqNoConsumer 198 | expr: rabbitmq_queue_consumers == 0 199 | for: 5m 200 | labels: 201 | severity: critical 202 | annotations: 203 | summary: "Rabbitmq no consumer (instance {{ $labels.instance }})" 204 | description: "Queue has no consumer\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 205 | ``` 206 | 207 | 208 | 209 | - #### Rabbitmq too many consumers 210 | 211 | ##### Queue should have only 1 consumer 212 | 213 | ```yaml 214 | - alert: RabbitmqTooManyConsumers 215 | expr: rabbitmq_queue_consumers > 1 216 | for: 5m 217 | labels: 218 | severity: critical 219 | annotations: 220 | summary: "Rabbitmq too many consumers (instance {{ $labels.instance }})" 221 | description: "Queue should have only 1 consumer\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 222 | ``` 223 | 224 | 225 | 226 | - #### Rabbitmq inactive exchange 227 | 228 | ##### Exchange receives less than 5 msgs per second 229 | 230 | ```yaml 231 | - alert: RabbitmqUnactiveExchange 232 | expr: rate(rabbitmq_exchange_messages_published_in_total{exchange="my-exchange"}[1m]) < 5 233 | for: 5m 234 | labels: 235 | severity: warning 236 | annotations: 237 | summary: "Rabbitmq inactive exchange (instance {{ $labels.instance }})" 238 | description: "Exchange receives less than 5 msgs per second\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 239 | ``` 240 | -------------------------------------------------------------------------------- /prometheus.md: -------------------------------------------------------------------------------- 1 | ```` 2 | - name: PrometheusSelf 3 | rules: 4 | - alert: PrometheusJobMissing 5 | expr: absent(up{job="prometheus"}) 6 | for: 0m 7 | labels: 8 | severity: warning 9 | annotations: 10 | summary: "Prometheus job missing (instance {{ $labels.instance }})" 11 | 
description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 12 | 13 | - alert: PrometheusAllTargetsMissing 14 | expr: count by (job) (up) == 0 15 | for: 0m 16 | labels: 17 | severity: critical 18 | annotations: 19 | summary: "Prometheus all targets missing (instance {{ $labels.instance }})" 20 | description: "A Prometheus job does not have living target anymore.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 21 | 22 | - alert: PrometheusTargetMissing 23 | expr: up == 0 24 | for: 0m 25 | labels: 26 | severity: critical 27 | annotations: 28 | summary: Prometheus target missing (instance {{ $labels.instance }}) 29 | description: "A Prometheus target has disappeared. An exporter might be crashed.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 30 | 31 | - alert: PrometheusJobMissing 32 | expr: absent(up{job="prometheus"}) 33 | for: 0m 34 | labels: 35 | severity: warning 36 | annotations: 37 | summary: Prometheus job missing (instance {{ $labels.instance }}) 38 | description: "A Prometheus job has disappeared\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 39 | 40 | - alert: PrometheusConfigurationReloadFailure 41 | expr: prometheus_config_last_reload_successful != 1 42 | for: 0m 43 | labels: 44 | severity: warning 45 | annotations: 46 | summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) 47 | description: "Prometheus configuration reload error\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 48 | 49 | - alert: PrometheusTooManyRestarts 50 | expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 51 | for: 0m 52 | labels: 53 | severity: warning 54 | annotations: 55 | summary: "Prometheus too many restarts (instance {{ $labels.instance }})" 56 | description: "Prometheus has restarted more than twice in the last 15 minutes. 
It might be crashlooping.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 57 | 58 | - alert: PrometheusAlertmanagerConfigurationReloadFailure 59 | expr: alertmanager_config_last_reload_successful != 1 60 | for: 0m 61 | labels: 62 | severity: warning 63 | annotations: 64 | summary: "Prometheus AlertManager configuration reload failure (instance {{ $labels.instance }})" 65 | description: "AlertManager configuration reload error\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 66 | 67 | - alert: PrometheusAlertmanagerConfigNotSynced 68 | expr: count(count_values("config_hash", alertmanager_config_hash)) > 1 69 | for: 0m 70 | labels: 71 | severity: warning 72 | annotations: 73 | summary: "Prometheus AlertManager config not synced (instance {{ $labels.instance }})" 74 | description: "Configurations of AlertManager cluster instances are out of sync\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 75 | 76 | - alert: PrometheusAlertmanagerE2eDeadManSwitch 77 | expr: vector(1) 78 | for: 0m 79 | labels: 80 | severity: critical 81 | annotations: 82 | summary: "Prometheus AlertManager E2E dead man switch (instance {{ $labels.instance }})" 83 | description: "Prometheus DeadManSwitch is an always-firing alert. 
It's used as an end-to-end test of Prometheus through the Alertmanager.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 84 | 85 | - alert: PrometheusNotConnectedToAlertmanager 86 | expr: prometheus_notifications_alertmanagers_discovered < 1 87 | for: 0m 88 | labels: 89 | severity: critical 90 | annotations: 91 | summary: "Prometheus not connected to alertmanager (instance {{ $labels.instance }})" 92 | description: "Prometheus cannot connect to the alertmanager\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 93 | 94 | - alert: PrometheusRuleEvaluationFailures 95 | expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 96 | for: 0m 97 | labels: 98 | severity: critical 99 | annotations: 100 | summary: "Prometheus rule evaluation failures (instance {{ $labels.instance }})" 101 | description: "Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 102 | 103 | - alert: PrometheusTemplateTextExpansionFailures 104 | expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 105 | for: 0m 106 | labels: 107 | severity: critical 108 | annotations: 109 | summary: "Prometheus template text expansion failures (instance {{ $labels.instance }})" 110 | description: "Prometheus encountered {{ $value }} template text expansion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 111 | 112 | - alert: PrometheusRuleEvaluationSlow 113 | expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds 114 | for: 5m 115 | labels: 116 | severity: warning 117 | annotations: 118 | summary: "Prometheus rule evaluation slow (instance {{ $labels.instance }})" 119 | description: "Prometheus rule evaluation took more time than the scheduled interval. 
This indicates slow storage backend access or an overly complex query.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 120 | 121 | - alert: PrometheusNotificationsBacklog 122 | expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 123 | for: 0m 124 | labels: 125 | severity: warning 126 | annotations: 127 | summary: "Prometheus notifications backlog (instance {{ $labels.instance }})" 128 | description: "The Prometheus notification queue has not been empty for 10 minutes\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 129 | 130 | - alert: PrometheusAlertmanagerNotificationFailing 131 | expr: rate(alertmanager_notifications_failed_total[1m]) > 0 132 | for: 0m 133 | labels: 134 | severity: critical 135 | annotations: 136 | summary: "Prometheus AlertManager notification failing (instance {{ $labels.instance }})" 137 | description: "Alertmanager is failing to send notifications\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 138 | 139 | - alert: PrometheusTargetEmpty 140 | expr: prometheus_sd_discovered_targets == 0 141 | for: 0m 142 | labels: 143 | severity: critical 144 | annotations: 145 | summary: "Prometheus target empty (instance {{ $labels.instance }})" 146 | description: "Prometheus has no target in service discovery\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 147 | 148 | - alert: PrometheusTargetScrapingSlow 149 | expr: prometheus_target_interval_length_seconds{quantile="0.9"} > 60 150 | for: 5m 151 | labels: 152 | severity: warning 153 | annotations: 154 | summary: "Prometheus target scraping slow (instance {{ $labels.instance }})" 155 | description: "Prometheus is scraping exporters slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 156 | 157 | - alert: PrometheusLargeScrape 158 | expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 159 | for: 5m 160 | labels: 161 | severity: warning 162 | annotations: 163 | summary: "Prometheus large scrape (instance {{ $labels.instance }})" 164 | description: "Prometheus has many 
scrapes that exceed the sample limit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 165 | 166 | - alert: PrometheusTargetScrapeDuplicate 167 | expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 168 | for: 0m 169 | labels: 170 | severity: warning 171 | annotations: 172 | summary: "Prometheus target scrape duplicate (instance {{ $labels.instance }})" 173 | description: "Prometheus has many samples rejected due to duplicate timestamps but different values\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 174 | 175 | - alert: PrometheusTsdbCheckpointCreationFailures 176 | expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 177 | for: 0m 178 | labels: 179 | severity: critical 180 | annotations: 181 | summary: "Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})" 182 | description: "Prometheus encountered {{ $value }} checkpoint creation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 183 | 184 | - alert: PrometheusTsdbCheckpointDeletionFailures 185 | expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 186 | for: 0m 187 | labels: 188 | severity: critical 189 | annotations: 190 | summary: "Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})" 191 | description: "Prometheus encountered {{ $value }} checkpoint deletion failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 192 | 193 | - alert: PrometheusTsdbCompactionsFailed 194 | expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 195 | for: 0m 196 | labels: 197 | severity: critical 198 | annotations: 199 | summary: "Prometheus TSDB compactions failed (instance {{ $labels.instance }})" 200 | description: "Prometheus encountered {{ $value }} TSDB compaction failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 201 | 202 | - alert: PrometheusTsdbHeadTruncationsFailed 203 | expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 204 | for: 0m 205 | labels:
206 | severity: critical 207 | annotations: 208 | summary: "Prometheus TSDB head truncations failed (instance {{ $labels.instance }})" 209 | description: "Prometheus encountered {{ $value }} TSDB head truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 210 | 211 | - alert: PrometheusTsdbReloadFailures 212 | expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 213 | for: 0m 214 | labels: 215 | severity: critical 216 | annotations: 217 | summary: "Prometheus TSDB reload failures (instance {{ $labels.instance }})" 218 | description: "Prometheus encountered {{ $value }} TSDB reload failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 219 | 220 | - alert: PrometheusTsdbWalCorruptions 221 | expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 222 | for: 0m 223 | labels: 224 | severity: critical 225 | annotations: 226 | summary: "Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})" 227 | description: "Prometheus encountered {{ $value }} TSDB WAL corruptions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 228 | 229 | - alert: PrometheusTsdbWalTruncationsFailed 230 | expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 231 | for: 0m 232 | labels: 233 | severity: critical 234 | annotations: 235 | summary: "Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})" 236 | description: "Prometheus encountered {{ $value }} TSDB WAL truncation failures\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 237 | ```` 238 | -------------------------------------------------------------------------------- /proxy/haproxy.md: 1 | # [HAProxy Exporter version > 2](https://github.com/haproxy/haproxy/tree/master/contrib/prometheus-exporter) 2 | 3 | ```` 4 | - name: HAProxy 5 | rules: 6 | ```` 7 | ```` 8 | - alert: HaproxyHighHttp4xxErrorRateBackend 9 | expr: ((sum by (proxy)
(rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 10 | for: 1m 11 | labels: 12 | severity: critical 13 | annotations: 14 | summary: "HAProxy high HTTP 4xx error rate backend (instance {{ $labels.instance }})" 15 | description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 16 | ```` 17 | ```` 18 | - alert: HaproxyHighHttp5xxErrorRateBackend 19 | expr: ((sum by (proxy) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (proxy) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 20 | for: 1m 21 | labels: 22 | severity: critical 23 | annotations: 24 | summary: "HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})" 25 | description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 26 | ```` 27 | ```` 28 | - alert: HaproxyHighHttp4xxErrorRateServer 29 | expr: ((sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 30 | for: 1m 31 | labels: 32 | severity: critical 33 | annotations: 34 | summary: "HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})" 35 | description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 36 | ```` 37 | ```` 38 | - alert: HaproxyHighHttp5xxErrorRateServer 39 | expr: ((sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100) > 5 40 | for: 1m 41 | labels: 42 | severity: critical 43 | annotations: 44 | summary: "HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})" 45 | description: "Too many HTTP requests with
status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 46 | ```` 47 | ```` 48 | - alert: HaproxyServerResponseErrors 49 | expr: (sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m]))) * 100 > 5 50 | for: 1m 51 | labels: 52 | severity: critical 53 | annotations: 54 | summary: "HAProxy server response errors (instance {{ $labels.instance }})" 55 | description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 56 | ```` 57 | ```` 58 | - alert: HaproxyBackendConnectionErrors 59 | expr: (sum by (proxy) (rate(haproxy_backend_connection_errors_total[1m]))) > 100 60 | for: 1m 61 | labels: 62 | severity: critical 63 | annotations: 64 | summary: "HAProxy backend connection errors (instance {{ $labels.instance }})" 65 | description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 66 | ```` 67 | ```` 68 | - alert: HaproxyServerConnectionErrors 69 | expr: (sum by (server) (rate(haproxy_server_connection_errors_total[1m]))) > 100 70 | for: 0m 71 | labels: 72 | severity: critical 73 | annotations: 74 | summary: "HAProxy server connection errors (instance {{ $labels.instance }})" 75 | description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s).
Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 76 | ```` 77 | ```` 78 | - alert: HaproxyBackendMaxActiveSession 79 | expr: avg_over_time(((sum by (proxy) (haproxy_server_max_sessions)) / (sum by (proxy) (haproxy_server_limit_sessions))) [2m:]) * 100 > 80 80 | for: 2m 81 | labels: 82 | severity: warning 83 | annotations: 84 | summary: "HAProxy backend max active session (instance {{ $labels.instance }})" 85 | description: "HAProxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 86 | ```` 87 | ```` 88 | - alert: HaproxyPendingRequests 89 | expr: sum by (proxy) (rate(haproxy_backend_current_queue[2m])) > 0 90 | for: 2m 91 | labels: 92 | severity: warning 93 | annotations: 94 | summary: "HAProxy pending requests (instance {{ $labels.instance }})" 95 | description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 96 | ```` 97 | ```` 98 | - alert: HaproxyHttpSlowingDown 99 | expr: avg by (proxy) (haproxy_backend_max_total_time_seconds) > 1 100 | for: 1m 101 | labels: 102 | severity: warning 103 | annotations: 104 | summary: "HAProxy HTTP slowing down (instance {{ $labels.instance }})" 105 | description: "Average request time is increasing\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 106 | ```` 107 | ```` 108 | - alert: HaproxyRetryHigh 109 | expr: sum by (proxy) (rate(haproxy_backend_retry_warnings_total[1m])) > 10 110 | for: 2m 111 | labels: 112 | severity: warning 113 | annotations: 114 | summary: "HAProxy retry high (instance {{ $labels.instance }})" 115 | description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 116 | ```` 117 | ```` 118 | - alert: HaproxyProxyDown 119 | expr: haproxy_backend_up == 0 120 | for: 0m 121 | labels: 122 | severity: critical 123 | annotations: 124 | summary:
"HAProxy proxy down (instance {{ $labels.instance }})" 125 | description: "HAProxy proxy is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 126 | ```` 127 | ```` 128 | - alert: HaproxyServerDown 129 | expr: haproxy_backend_active_servers == 0 130 | for: 0m 131 | labels: 132 | severity: critical 133 | annotations: 134 | summary: "HAProxy server down (instance {{ $labels.instance }})" 135 | description: "HAProxy backend is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 136 | ```` 137 | ```` 138 | - alert: HaproxyFrontendSecurityBlockedRequests 139 | expr: sum by (proxy) (rate(haproxy_frontend_denied_connections_total[2m])) > 10 140 | for: 2m 141 | labels: 142 | severity: warning 143 | annotations: 144 | summary: "HAProxy frontend security blocked requests (instance {{ $labels.instance }})" 145 | description: "HAProxy is blocking requests for security reasons\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 146 | ```` 147 | ```` 148 | - alert: HaproxyServerHealthcheckFailure 149 | expr: increase(haproxy_server_check_failures_total[1m]) > 0 150 | for: 1m 151 | labels: 152 | severity: warning 153 | annotations: 154 | summary: "HAProxy server healthcheck failure (instance {{ $labels.instance }})" 155 | description: "Some server healthchecks are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 156 | ```` 157 | ```` 158 | - alert: HaproxyDown 159 | expr: haproxy_up == 0 160 | for: 0m 161 | labels: 162 | severity: critical 163 | annotations: 164 | summary: "HAProxy down (instance {{ $labels.instance }})" 165 | description: "HAProxy down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 166 | ```` 167 | # [HAProxy Exporter version < 2](https://github.com/prometheus/haproxy_exporter) ```` 168 | - alert: HaproxyHighHttp4xxErrorRateBackend 169 | expr: sum by (backend) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 170 | for: 1m 171 | labels: 172 | severity: critical 173 | annotations: 174 | summary: "HAProxy high HTTP 4xx error rate
backend (instance {{ $labels.instance }})" 175 | description: "Too many HTTP requests with status 4xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 176 | ```` 177 | ```` 178 | - alert: HaproxyHighHttp5xxErrorRateBackend 179 | expr: sum by (backend) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (backend) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 180 | for: 1m 181 | labels: 182 | severity: critical 183 | annotations: 184 | summary: "HAProxy high HTTP 5xx error rate backend (instance {{ $labels.instance }})" 185 | description: "Too many HTTP requests with status 5xx (> 5%) on backend {{ $labels.fqdn }}/{{ $labels.backend }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 186 | ```` 187 | ```` 188 | - alert: HaproxyHighHttp4xxErrorRateServer 189 | expr: sum by (server) (rate(haproxy_server_http_responses_total{code="4xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 190 | for: 1m 191 | labels: 192 | severity: critical 193 | annotations: 194 | summary: "HAProxy high HTTP 4xx error rate server (instance {{ $labels.instance }})" 195 | description: "Too many HTTP requests with status 4xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 196 | ```` 197 | ```` 198 | - alert: HaproxyHighHttp5xxErrorRateServer 199 | expr: sum by (server) (rate(haproxy_server_http_responses_total{code="5xx"}[1m])) / sum by (server) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 200 | for: 1m 201 | labels: 202 | severity: critical 203 | annotations: 204 | summary: "HAProxy high HTTP 5xx error rate server (instance {{ $labels.instance }})" 205 | description: "Too many HTTP requests with status 5xx (> 5%) on server {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 206 | ```` 207 | ```` 208 | - alert: HaproxyServerResponseErrors 209 | expr: sum by (server) (rate(haproxy_server_response_errors_total[1m])) / sum
by (server) (rate(haproxy_server_http_responses_total[1m])) * 100 > 5 210 | for: 1m 211 | labels: 212 | severity: critical 213 | annotations: 214 | summary: "HAProxy server response errors (instance {{ $labels.instance }})" 215 | description: "Too many response errors to {{ $labels.server }} server (> 5%).\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 216 | ```` 217 | ```` 218 | - alert: HaproxyBackendConnectionErrors 219 | expr: sum by (backend) (rate(haproxy_backend_connection_errors_total[1m])) > 100 220 | for: 1m 221 | labels: 222 | severity: critical 223 | annotations: 224 | summary: "HAProxy backend connection errors (instance {{ $labels.instance }})" 225 | description: "Too many connection errors to {{ $labels.fqdn }}/{{ $labels.backend }} backend (> 100 req/s). Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 226 | ```` 227 | ```` 228 | - alert: HaproxyServerConnectionErrors 229 | expr: sum by (server) (rate(haproxy_server_connection_errors_total[1m])) > 100 230 | for: 0m 231 | labels: 232 | severity: critical 233 | annotations: 234 | summary: "HAProxy server connection errors (instance {{ $labels.instance }})" 235 | description: "Too many connection errors to {{ $labels.server }} server (> 100 req/s).
Request throughput may be too high.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 236 | ```` 237 | ```` 238 | - alert: HaproxyBackendMaxActiveSession 239 | expr: ((sum by (backend) (avg_over_time(haproxy_backend_max_sessions[2m])) / sum by (backend) (avg_over_time(haproxy_backend_limit_sessions[2m]))) * 100) > 80 240 | for: 2m 241 | labels: 242 | severity: warning 243 | annotations: 244 | summary: "HAProxy backend max active session (instance {{ $labels.instance }})" 245 | description: "HAProxy backend {{ $labels.fqdn }}/{{ $labels.backend }} is reaching session limit (> 80%).\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 246 | ```` 247 | ```` 248 | - alert: HaproxyPendingRequests 249 | expr: sum by (backend) (haproxy_backend_current_queue) > 0 250 | for: 2m 251 | labels: 252 | severity: warning 253 | annotations: 254 | summary: "HAProxy pending requests (instance {{ $labels.instance }})" 255 | description: "Some HAProxy requests are pending on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 256 | ```` 257 | ```` 258 | - alert: HaproxyHttpSlowingDown 259 | expr: avg by (backend) (haproxy_backend_http_total_time_average_seconds) > 1 260 | for: 1m 261 | labels: 262 | severity: warning 263 | annotations: 264 | summary: "HAProxy HTTP slowing down (instance {{ $labels.instance }})" 265 | description: "Average request time is increasing\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 266 | ```` 267 | ```` 268 | - alert: HaproxyRetryHigh 269 | expr: sum by (backend) (rate(haproxy_backend_retry_warnings_total[1m])) > 10 270 | for: 2m 271 | labels: 272 | severity: warning 273 | annotations: 274 | summary: "HAProxy retry high (instance {{ $labels.instance }})" 275 | description: "High rate of retry on {{ $labels.fqdn }}/{{ $labels.backend }} backend\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 276 | ```` 277 | ```` 278 | - alert: HaproxyBackendDown 279 | expr: haproxy_backend_up == 0 280 | for: 0m 281 | labels: 282 | severity:
critical 283 | annotations: 284 | summary: "HAProxy backend down (instance {{ $labels.instance }})" 285 | description: "HAProxy backend is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 286 | ```` 287 | ```` 288 | - alert: HaproxyServerDown 289 | expr: haproxy_server_up == 0 290 | for: 0m 291 | labels: 292 | severity: critical 293 | annotations: 294 | summary: "HAProxy server down (instance {{ $labels.instance }})" 295 | description: "HAProxy server is down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 296 | ```` 297 | ```` 298 | - alert: HaproxyFrontendSecurityBlockedRequests 299 | expr: sum by (frontend) (rate(haproxy_frontend_requests_denied_total[2m])) > 10 300 | for: 2m 301 | labels: 302 | severity: warning 303 | annotations: 304 | summary: "HAProxy frontend security blocked requests (instance {{ $labels.instance }})" 305 | description: "HAProxy is blocking requests for security reasons\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 306 | ```` 307 | ```` 308 | - alert: HaproxyServerHealthcheckFailure 309 | expr: increase(haproxy_server_check_failures_total[1m]) > 0 310 | for: 1m 311 | labels: 312 | severity: warning 313 | annotations: 314 | summary: "HAProxy server healthcheck failure (instance {{ $labels.instance }})" 315 | description: "Some server healthchecks are failing on {{ $labels.server }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 316 | ```` 317 | -------------------------------------------------------------------------------- /databases/postgresql.md: 1 | ## PostgreSQL 2 | 3 | ### [PostgreSQL Server Exporter](https://github.com/wrouesnel/postgres_exporter/) 4 | 5 | 6 | - #### Postgresql instance is down 7 | 8 | 9 | 10 | ```yaml 11 | - alert: PostgresqlDown 12 | expr: pg_up == 0 13 | for: 5m 14 | labels: 15 | severity: critical 16 | annotations: 17 | summary: "Postgresql down (instance {{ $labels.instance }})" 18 | description: "Postgresql instance is down\n VALUE =
{{ $value }}\n LABELS: {{ $labels }}" 19 | ``` 20 | 21 | 22 | 23 | - #### Postgresql restarted 24 | 25 | 26 | ```yaml 27 | - alert: PostgresqlRestarted 28 | expr: time() - pg_postmaster_start_time_seconds < 60 29 | for: 0m 30 | labels: 31 | severity: critical 32 | annotations: 33 | summary: "Postgresql restarted (instance {{ $labels.instance }})" 34 | description: "Postgresql restarted\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 35 | ``` 36 | 37 | 38 | 39 | - #### Postgresql exporter error 40 | 41 | ##### Postgresql exporter is showing errors. A query may be buggy in query.yaml 42 | 43 | ```yaml 44 | - alert: PostgresqlExporterError 45 | expr: pg_exporter_last_scrape_error > 0 46 | for: 5m 47 | labels: 48 | severity: warning 49 | annotations: 50 | summary: "Postgresql exporter error (instance {{ $labels.instance }})" 51 | description: "Postgresql exporter is showing errors. A query may be buggy in query.yaml\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 52 | ``` 53 | 54 | 55 | 56 | - #### Postgresql replication lag 57 | 58 | ##### PostgreSQL replication lag is going up (> 10s) 59 | 60 | ```yaml 61 | - alert: PostgresqlReplicationLag 62 | expr: (pg_replication_lag) > 10 and ON(instance) (pg_replication_is_replica == 1) 63 | for: 5m 64 | labels: 65 | severity: warning 66 | annotations: 67 | summary: "Postgresql replication lag (instance {{ $labels.instance }})" 68 | description: "PostgreSQL replication lag is going up (> 10s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 69 | ``` 70 | 71 | 72 | 73 | - #### Postgresql table not vacuumed 74 | 75 | ##### Table has not been vacuumed for 24 hours 76 | 77 | ```yaml 78 | - alert: PostgresqlTableNotVacuumed 79 | expr: time() - pg_stat_user_tables_last_autovacuum > 60 * 60 * 24 80 | for: 5m 81 | labels: 82 | severity: warning 83 | annotations: 84 | summary: "Postgresql table not vacuumed (instance {{ $labels.instance }})" 85 | description: "Table has not been vacuumed for 24 hours\n VALUE = {{ $value }}\n LABELS: {{
$labels }}" 86 | ``` 87 | 88 | 89 | 90 | - #### Postgresql table not analyzed 91 | 92 | ##### Table has not been analyzed for 24 hours 93 | 94 | ```yaml 95 | - alert: PostgresqlTableNotAnalyzed 96 | expr: time() - pg_stat_user_tables_last_autoanalyze > 60 * 60 * 24 97 | for: 5m 98 | labels: 99 | severity: warning 100 | annotations: 101 | summary: "Postgresql table not analyzed (instance {{ $labels.instance }})" 102 | description: "Table has not been analyzed for 24 hours\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 103 | ``` 104 | 105 | 106 | 107 | - #### Postgresql too many connections 108 | 109 | ##### PostgreSQL instance has too many connections 110 | 111 | ```yaml 112 | - alert: PostgresqlTooManyConnections 113 | expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) > pg_settings_max_connections * 0.9 114 | for: 5m 115 | labels: 116 | severity: warning 117 | annotations: 118 | summary: "Postgresql too many connections (instance {{ $labels.instance }})" 119 | description: "PostgreSQL instance has too many connections\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 120 | ``` 121 | 122 | 123 | 124 | - #### Postgresql not enough connections 125 | 126 | ##### PostgreSQL instance should have more connections (> 5) 127 | 128 | ```yaml 129 | - alert: PostgresqlNotEnoughConnections 130 | expr: sum by (datname) (pg_stat_activity_count{datname!~"template.*|postgres"}) < 5 131 | for: 5m 132 | labels: 133 | severity: warning 134 | annotations: 135 | summary: "Postgresql not enough connections (instance {{ $labels.instance }})" 136 | description: "PostgreSQL instance should have more connections (> 5)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 137 | ``` 138 | 139 | 140 | 141 | - #### Postgresql dead locks 142 | 143 | ##### PostgreSQL has dead-locks 144 | 145 | ```yaml 146 | - alert: PostgresqlDeadLocks 147 | expr: rate(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 0 148 | for: 5m 149 | labels: 150 | severity: warning 151 
| annotations: 152 | summary: "Postgresql dead locks (instance {{ $labels.instance }})" 153 | description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 154 | ``` 155 | 156 | 157 | 158 | - #### Postgresql slow queries 159 | 160 | ##### PostgreSQL executes slow queries 161 | 162 | ```yaml 163 | - alert: PostgresqlSlowQueries 164 | expr: pg_slow_queries > 0 165 | for: 5m 166 | labels: 167 | severity: warning 168 | annotations: 169 | summary: "Postgresql slow queries (instance {{ $labels.instance }})" 170 | description: "PostgreSQL executes slow queries\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 171 | ``` 172 | 173 | 174 | 175 | - #### Postgresql high rollback rate 176 | 177 | ##### Ratio of transactions being aborted compared to committed is > 2 % 178 | 179 | ```yaml 180 | - alert: PostgresqlHighRollbackRate 181 | expr: rate(pg_stat_database_xact_rollback{datname!~"template.*"}[3m]) / rate(pg_stat_database_xact_commit{datname!~"template.*"}[3m]) > 0.02 182 | for: 5m 183 | labels: 184 | severity: warning 185 | annotations: 186 | summary: "Postgresql high rollback rate (instance {{ $labels.instance }})" 187 | description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 188 | ``` 189 | 190 | 191 | 192 | - #### Postgresql commit rate low 193 | 194 | ##### Postgres seems to be processing very few transactions 195 | 196 | ```yaml 197 | - alert: PostgresqlCommitRateLow 198 | expr: rate(pg_stat_database_xact_commit[1m]) < 10 199 | for: 5m 200 | labels: 201 | severity: critical 202 | annotations: 203 | summary: "Postgresql commit rate low (instance {{ $labels.instance }})" 204 | description: "Postgres seems to be processing very few transactions\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 205 | ``` 206 | 207 | 208 | 209 | - #### Postgresql low XID consumption 210 | 211 | ##### Postgresql seems to be consuming transaction IDs very slowly 212 | 213 | ```yaml 214 | - alert: 
PostgresqlLowXidConsumption 215 | expr: rate(pg_txid_current[1m]) < 5 216 | for: 5m 217 | labels: 218 | severity: warning 219 | annotations: 220 | summary: "Postgresql low XID consumption (instance {{ $labels.instance }})" 221 | description: "Postgresql seems to be consuming transaction IDs very slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 222 | ``` 223 | 224 | 225 | 226 | - #### Postgresql low XLOG consumption 227 | 228 | ##### Postgres seems to be consuming XLOG very slowly 229 | 230 | ```yaml 231 | - alert: PostgresqlLowXlogConsumption 232 | expr: rate(pg_xlog_position_bytes[1m]) < 100 233 | for: 5m 234 | labels: 235 | severity: warning 236 | annotations: 237 | summary: "Postgresql low XLOG consumption (instance {{ $labels.instance }})" 238 | description: "Postgres seems to be consuming XLOG very slowly\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 239 | ``` 240 | 241 | 242 | 243 | - #### Postgresql WALE replication stopped 244 | 245 | ##### WAL-E replication seems to be stopped 246 | 247 | ```yaml 248 | - alert: PostgresqlWaleReplicationStopped 249 | expr: rate(pg_xlog_position_bytes[1m]) == 0 250 | for: 5m 251 | labels: 252 | severity: critical 253 | annotations: 254 | summary: "Postgresql WALE replication stopped (instance {{ $labels.instance }})" 255 | description: "WAL-E replication seems to be stopped\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 256 | ``` 257 | 258 | 259 | 260 | - #### Postgresql high rate statement timeout 261 | 262 | ##### Postgres transactions showing high rate of statement timeouts 263 | 264 | ```yaml 265 | - alert: PostgresqlHighRateStatementTimeout 266 | expr: rate(postgresql_errors_total{type="statement_timeout"}[5m]) > 3 267 | for: 5m 268 | labels: 269 | severity: critical 270 | annotations: 271 | summary: "Postgresql high rate statement timeout (instance {{ $labels.instance }})" 272 | description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 273 | ```
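All of the rule snippets in this file must ultimately sit under a `groups:` key in a Prometheus rules file, and `promtool check rules <file>` is the authoritative validator. As a lightweight pre-check — a sketch that assumes PyYAML is installed, with an illustrative group and rule — the two keys Prometheus requires on every alerting rule (`alert` and `expr`) can be verified in a few lines:

```python
import yaml  # PyYAML (pip install pyyaml); promtool remains the real checker

# A minimal rules file wrapping one of the alert blocks above in a group.
# The group name here is illustrative.
RULES_FILE = """
groups:
- name: PostgreSQL
  rules:
  - alert: PostgresqlDown
    expr: pg_up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Postgresql down (instance {{ $labels.instance }})"
"""

def missing_keys(doc: dict) -> list:
    """Return (alert-name, missing-keys) pairs; 'alert' and 'expr' are mandatory."""
    problems = []
    for group in doc.get("groups", []):
        for rule in group.get("rules", []):
            missing = [k for k in ("alert", "expr") if k not in rule]
            if missing:
                problems.append((rule.get("alert", "<unnamed>"), missing))
    return problems

print(missing_keys(yaml.safe_load(RULES_FILE)))  # [] when every rule is complete
```

This only catches structural omissions; `promtool test rules` can additionally unit-test the PromQL expressions themselves against synthetic series.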
274 | 275 | 276 | 277 | - #### Postgresql high rate deadlock 278 | 279 | ##### Postgres detected deadlocks 280 | 281 | ```yaml 282 | - alert: PostgresqlHighRateDeadlock 283 | expr: rate(postgresql_errors_total{type="deadlock_detected"}[1m]) * 60 > 1 284 | for: 5m 285 | labels: 286 | severity: critical 287 | annotations: 288 | summary: "Postgresql high rate deadlock (instance {{ $labels.instance }})" 289 | description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 290 | ``` 291 | 292 | 293 | 294 | - #### Postgresql replication lag bytes 295 | 296 | ##### Postgres Replication lag (in bytes) is high 297 | 298 | ```yaml 299 | - alert: PostgresqlReplicationLagBytes 300 | expr: (pg_xlog_position_bytes and pg_replication_is_replica == 0) - GROUP_RIGHT(instance) (pg_xlog_position_bytes and pg_replication_is_replica == 1) > 1e+09 301 | for: 5m 302 | labels: 303 | severity: critical 304 | annotations: 305 | summary: "Postgresql replication lag bytes (instance {{ $labels.instance }})" 306 | description: "Postgres Replication lag (in bytes) is high\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 307 | ``` 308 | 309 | 310 | 311 | - #### Postgresql unused replication slot 312 | 313 | ##### Unused Replication Slots 314 | 315 | ```yaml 316 | - alert: PostgresqlUnusedReplicationSlot 317 | expr: pg_replication_slots_active == 0 318 | for: 5m 319 | labels: 320 | severity: warning 321 | annotations: 322 | summary: "Postgresql unused replication slot (instance {{ $labels.instance }})" 323 | description: "Unused Replication Slots\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 324 | ``` 325 | 326 | 327 | 328 | - #### Postgresql too many dead tuples 329 | 330 | ##### PostgreSQL dead tuples is too large 331 | 332 | ```yaml 333 | - alert: PostgresqlTooManyDeadTuples 334 | expr: ((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1 unless ON(instance) (pg_replication_is_replica == 1) 335 | for: 5m
336 | labels: 337 | severity: warning 338 | annotations: 339 | summary: "Postgresql too many dead tuples (instance {{ $labels.instance }})" 340 | description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 341 | ``` 342 | 343 | 344 | 345 | - #### Postgresql split brain 346 | 347 | ##### Split Brain, too many primary Postgresql databases in read-write mode 348 | 349 | ```yaml 350 | - alert: PostgresqlSplitBrain 351 | expr: count(pg_replication_is_replica == 0) != 1 352 | for: 5m 353 | labels: 354 | severity: critical 355 | annotations: 356 | summary: "Postgresql split brain (instance {{ $labels.instance }})" 357 | description: "Split Brain, too many primary Postgresql databases in read-write mode\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 358 | ``` 359 | 360 | 361 | 362 | - #### Postgresql promoted node 363 | 364 | ##### Postgresql standby server has been promoted as primary node 365 | 366 | ```yaml 367 | - alert: PostgresqlPromotedNode 368 | expr: pg_replication_is_replica and changes(pg_replication_is_replica[1m]) > 0 369 | for: 5m 370 | labels: 371 | severity: warning 372 | annotations: 373 | summary: "Postgresql promoted node (instance {{ $labels.instance }})" 374 | description: "Postgresql standby server has been promoted as primary node\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 375 | ``` 376 | 377 | 378 | 379 | - #### Postgresql configuration changed 380 | 381 | ##### Postgres Database configuration change has occurred 382 | 383 | ```yaml 384 | - alert: PostgresqlConfigurationChanged 385 | expr: {__name__=~"pg_settings_.*"} != ON(__name__) {__name__=~"pg_settings_([^t]|t[^r]|tr[^a]|tra[^n]|tran[^s]|trans[^a]|transa[^c]|transac[^t]|transact[^i]|transacti[^o]|transactio[^n]|transaction[^_]|transaction_[^r]|transaction_r[^e]|transaction_re[^a]|transaction_rea[^d]|transaction_read[^_]|transaction_read_[^o]|transaction_read_o[^n]|transaction_read_on[^l]|transaction_read_onl[^y]).*"} OFFSET 5m 386 | for: 5m 387 | 
labels: 388 | severity: warning 389 | annotations: 390 | summary: "Postgresql configuration changed (instance {{ $labels.instance }})" 391 | description: "Postgres Database configuration change has occurred\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 392 | ``` 393 | 394 | 395 | 396 | - #### Postgresql SSL compression active 397 | 398 | ##### Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf` 399 | 400 | ```yaml 401 | - alert: PostgresqlSslCompressionActive 402 | expr: sum(pg_stat_ssl_compression) > 0 403 | for: 5m 404 | labels: 405 | severity: critical 406 | annotations: 407 | summary: "Postgresql SSL compression active (instance {{ $labels.instance }})" 408 | description: "Database connections with SSL compression enabled. This may add significant jitter in replication delay. Replicas should turn off SSL compression via `sslcompression=0` in `recovery.conf`.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 409 | ``` 410 | 411 | 412 | 413 | - #### Postgresql too many locks acquired 414 | 415 | ##### Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction. 416 | 417 | ```yaml 418 | - alert: PostgresqlTooManyLocksAcquired 419 | expr: ((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20 420 | for: 5m 421 | labels: 422 | severity: critical 423 | annotations: 424 | summary: "Postgresql too many locks acquired (instance {{ $labels.instance }})" 425 | description: "Too many locks acquired on the database. 
If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" 426 | ``` 427 | -------------------------------------------------------------------------------- /orchestrators/k8s.yml: -------------------------------------------------------------------------------- 1 | # kube-state-metrics: https://github.com/kubernetes/kube-state-metrics/tree/master/docs 2 | 3 | groups: 4 | 5 | - name: KubestateExporter 6 | 7 | rules: 8 | 9 | - alert: KubernetesNodeReady 10 | expr: 'kube_node_status_condition{condition="Ready",status="true"} == 0' 11 | for: 10m 12 | labels: 13 | severity: critical 14 | annotations: 15 | summary: Kubernetes Node ready (instance {{ $labels.instance }}) 16 | description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 17 | 18 | - alert: KubernetesMemoryPressure 19 | expr: 'kube_node_status_condition{condition="MemoryPressure",status="true"} == 1' 20 | for: 2m 21 | labels: 22 | severity: critical 23 | annotations: 24 | summary: Kubernetes memory pressure (instance {{ $labels.instance }}) 25 | description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 26 | 27 | - alert: KubernetesDiskPressure 28 | expr: 'kube_node_status_condition{condition="DiskPressure",status="true"} == 1' 29 | for: 2m 30 | labels: 31 | severity: critical 32 | annotations: 33 | summary: Kubernetes disk pressure (instance {{ $labels.instance }}) 34 | description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 35 | 36 | - alert: KubernetesNetworkUnavailable 37 | expr: 'kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1' 38 | for: 2m 39 | labels: 40 | severity: critical 41 | annotations: 42 | summary: Kubernetes network unavailable (instance {{ $labels.instance }}) 43 | description: "{{ $labels.node }} has 
NetworkUnavailable condition\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 44 | 45 | - alert: KubernetesOutOfCapacity 46 | expr: 'sum by (node) ((kube_pod_status_phase{phase="Running"} == 1) + on(uid) group_left(node) (0 * kube_pod_info{pod_template_hash=""})) / sum by (node) (kube_node_status_allocatable{resource="pods"}) * 100 > 90' 47 | for: 2m 48 | labels: 49 | severity: warning 50 | annotations: 51 | summary: Kubernetes out of capacity (instance {{ $labels.instance }}) 52 | description: "{{ $labels.node }} is out of capacity\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 53 | 54 | - alert: KubernetesContainerOomKiller 55 | expr: '(kube_pod_container_status_restarts_total - kube_pod_container_status_restarts_total offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1' 56 | for: 0m 57 | labels: 58 | severity: warning 59 | annotations: 60 | summary: Kubernetes container oom killer (instance {{ $labels.instance }}) 61 | description: "Container {{ $labels.container }} in pod {{ $labels.namespace }}/{{ $labels.pod }} has been OOMKilled {{ $value }} times in the last 10 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 62 | 63 | - alert: KubernetesJobFailed 64 | expr: 'kube_job_status_failed > 0' 65 | for: 0m 66 | labels: 67 | severity: warning 68 | annotations: 69 | summary: Kubernetes Job failed (instance {{ $labels.instance }}) 70 | description: "Job {{ $labels.namespace }}/{{ $labels.exported_job }} failed to complete\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 71 | 72 | - alert: KubernetesCronjobSuspended 73 | expr: 'kube_cronjob_spec_suspend != 0' 74 | for: 0m 75 | labels: 76 | severity: warning 77 | annotations: 78 | summary: Kubernetes CronJob suspended (instance {{ $labels.instance }}) 79 | description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 80 | 81 | - alert: 
KubernetesPersistentvolumeclaimPending 82 | expr: 'kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1' 83 | for: 2m 84 | labels: 85 | severity: warning 86 | annotations: 87 | summary: Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }}) 88 | description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 89 | 90 | - alert: KubernetesVolumeOutOfDiskSpace 91 | expr: 'kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10' 92 | for: 2m 93 | labels: 94 | severity: warning 95 | annotations: 96 | summary: Kubernetes Volume out of disk space (instance {{ $labels.instance }}) 97 | description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 98 | 99 | - alert: KubernetesVolumeFullInFourDays 100 | expr: 'predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0' 101 | for: 0m 102 | labels: 103 | severity: critical 104 | annotations: 105 | summary: Kubernetes Volume full in four days (instance {{ $labels.instance }}) 106 | description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. 
Predicted free space in four days: {{ $value | humanize }} bytes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 107 | 108 | - alert: KubernetesPersistentvolumeError 109 | expr: 'kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics"} > 0' 110 | for: 0m 111 | labels: 112 | severity: critical 113 | annotations: 114 | summary: Kubernetes PersistentVolume error (instance {{ $labels.instance }}) 115 | description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 116 | 117 | - alert: KubernetesStatefulsetDown 118 | expr: 'kube_statefulset_replicas != kube_statefulset_status_replicas_ready > 0' 119 | for: 1m 120 | labels: 121 | severity: critical 122 | annotations: 123 | summary: Kubernetes StatefulSet down (instance {{ $labels.instance }}) 124 | description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 125 | 126 | - alert: KubernetesHpaScalingAbility 127 | expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="AbleToScale"} == 1' 128 | for: 2m 129 | labels: 130 | severity: warning 131 | annotations: 132 | summary: Kubernetes HPA scaling ability (instance {{ $labels.instance }}) 133 | description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 134 | 135 | - alert: KubernetesHpaMetricAvailability 136 | expr: 'kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive"} == 1' 137 | for: 0m 138 | labels: 139 | severity: warning 140 | annotations: 141 | summary: Kubernetes HPA metric availability (instance {{ $labels.instance }}) 142 | description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 143 | 144 | - alert: KubernetesHpaScaleCapability 145 | expr: 'kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas' 146 | for: 2m 147 | labels: 148 | severity: info 149 | annotations: 150 | summary: Kubernetes HPA scale capability (instance 
{{ $labels.instance }}) 151 | description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 152 | 153 | - alert: KubernetesHpaUnderutilized 154 | expr: 'max(quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d]) == kube_horizontalpodautoscaler_spec_min_replicas) by (horizontalpodautoscaler) > 3' 155 | for: 0m 156 | labels: 157 | severity: info 158 | annotations: 159 | summary: Kubernetes HPA underutilized (instance {{ $labels.instance }}) 160 | description: "HPA has been at its minimum replica count for at least 50% of the time over the last day. Potential cost saving here.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 161 | 162 | - alert: KubernetesPodNotHealthy 163 | expr: 'sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"}) > 0' 164 | for: 15m 165 | labels: 166 | severity: critical 167 | annotations: 168 | summary: Kubernetes Pod not healthy (instance {{ $labels.instance }}) 169 | description: "Pod has been in a non-ready state for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 170 | 171 | - alert: KubernetesPodCrashLooping 172 | expr: 'increase(kube_pod_container_status_restarts_total[1m]) > 3' 173 | for: 2m 174 | labels: 175 | severity: warning 176 | annotations: 177 | summary: Kubernetes pod crash looping (instance {{ $labels.instance }}) 178 | description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 179 | 180 | - alert: KubernetesReplicasetMismatch 181 | expr: 'kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas' 182 | for: 10m 183 | labels: 184 | severity: warning 185 | annotations: 186 | summary: Kubernetes ReplicaSet mismatch (instance {{ $labels.instance }}) 187 | description: "ReplicaSet replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 188 | 189 | - alert: KubernetesDeploymentReplicasMismatch 190 | expr: 'kube_deployment_spec_replicas != 
kube_deployment_status_replicas_available' 191 | for: 10m 192 | labels: 193 | severity: warning 194 | annotations: 195 | summary: Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }}) 196 | description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 197 | 198 | - alert: KubernetesStatefulsetReplicasMismatch 199 | expr: 'kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas' 200 | for: 10m 201 | labels: 202 | severity: warning 203 | annotations: 204 | summary: Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }}) 205 | description: "A StatefulSet does not match the expected number of replicas.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 206 | 207 | - alert: KubernetesDeploymentGenerationMismatch 208 | expr: 'kube_deployment_status_observed_generation != kube_deployment_metadata_generation' 209 | for: 10m 210 | labels: 211 | severity: critical 212 | annotations: 213 | summary: Kubernetes Deployment generation mismatch (instance {{ $labels.instance }}) 214 | description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 215 | 216 | - alert: KubernetesStatefulsetGenerationMismatch 217 | expr: 'kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation' 218 | for: 10m 219 | labels: 220 | severity: critical 221 | annotations: 222 | summary: Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }}) 223 | description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 224 | 225 | - alert: KubernetesStatefulsetUpdateNotRolledOut 226 | expr: 'max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)' 227 | for: 10m 228 | labels: 229 | severity: warning 230 | annotations: 231 | summary: Kubernetes 
StatefulSet update not rolled out (instance {{ $labels.instance }}) 232 | description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 233 | 234 | - alert: KubernetesDaemonsetRolloutStuck 235 | expr: 'kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0' 236 | for: 10m 237 | labels: 238 | severity: warning 239 | annotations: 240 | summary: Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }}) 241 | description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 242 | 243 | - alert: KubernetesDaemonsetMisscheduled 244 | expr: 'kube_daemonset_status_number_misscheduled > 0' 245 | for: 1m 246 | labels: 247 | severity: critical 248 | annotations: 249 | summary: Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }}) 250 | description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 251 | 252 | - alert: KubernetesCronjobTooLong 253 | expr: 'time() - kube_cronjob_next_schedule_time > 3600' 254 | for: 0m 255 | labels: 256 | severity: warning 257 | annotations: 258 | summary: Kubernetes CronJob too long (instance {{ $labels.instance }}) 259 | description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 260 | 261 | - alert: KubernetesJobSlowCompletion 262 | expr: 'kube_job_spec_completions - kube_job_status_succeeded > 0' 263 | for: 12h 264 | labels: 265 | severity: critical 266 | annotations: 267 | summary: Kubernetes job slow completion (instance {{ $labels.instance }}) 268 | description: "Kubernetes Job {{ $labels.namespace }}/{{ $labels.job_name }} did not complete in time.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 269 | 270 | - alert: 
KubernetesApiServerErrors 271 | expr: 'sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[1m])) / sum(rate(apiserver_request_total{job="apiserver"}[1m])) * 100 > 3' 272 | for: 2m 273 | labels: 274 | severity: critical 275 | annotations: 276 | summary: Kubernetes API server errors (instance {{ $labels.instance }}) 277 | description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 278 | 279 | - alert: KubernetesApiClientErrors 280 | expr: '(sum(rate(rest_client_requests_total{code=~"(4|5).."}[1m])) by (instance, job) / sum(rate(rest_client_requests_total[1m])) by (instance, job)) * 100 > 1' 281 | for: 2m 282 | labels: 283 | severity: critical 284 | annotations: 285 | summary: Kubernetes API client errors (instance {{ $labels.instance }}) 286 | description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 287 | 288 | - alert: KubernetesClientCertificateExpiresNextWeek 289 | expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60' 290 | for: 0m 291 | labels: 292 | severity: warning 293 | annotations: 294 | summary: Kubernetes client certificate expires next week (instance {{ $labels.instance }}) 295 | description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 296 | 297 | - alert: KubernetesClientCertificateExpiresSoon 298 | expr: 'apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60' 299 | for: 0m 300 | labels: 301 | severity: critical 302 | annotations: 303 | summary: Kubernetes client certificate expires soon (instance {{ 
$labels.instance }}) 304 | description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 305 | 306 | - alert: KubernetesApiServerLatency 307 | expr: 'histogram_quantile(0.99, sum(rate(apiserver_request_latencies_bucket{subresource!="log",verb!~"^(?:CONNECT|WATCHLIST|WATCH|PROXY)$"} [10m])) WITHOUT (instance, resource)) / 1e+06 > 1' 308 | for: 2m 309 | labels: 310 | severity: warning 311 | annotations: 312 | summary: Kubernetes API server latency (instance {{ $labels.instance }}) 313 | description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 314 | -------------------------------------------------------------------------------- /some/node-exporter.md: -------------------------------------------------------------------------------- 1 | [node-exporter](https://github.com/prometheus/node_exporter) 2 | 3 | ## Host out of memory 4 | #### Node memory is filling up (< 10% left) 5 | ```` 6 | - alert: HostOutOfMemory 7 | expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 8 | for: 2m 9 | labels: 10 | severity: warning 11 | annotations: 12 | summary: Host out of memory (instance {{ $labels.instance }}) 13 | description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 14 | ```` 15 | 16 | ## Host memory under memory pressure 17 | #### The node is under heavy memory pressure. High rate of major page faults 18 | ```` 19 | - alert: HostMemoryUnderMemoryPressure 20 | expr: rate(node_vmstat_pgmajfault[1m]) > 1000 21 | for: 2m 22 | labels: 23 | severity: warning 24 | annotations: 25 | summary: Host memory under memory pressure (instance {{ $labels.instance }}) 26 | description: "The node is under heavy memory pressure. 
High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 27 | ```` 28 | 29 | ## Host unusual network throughput in 30 | #### Host network interfaces are probably receiving too much data (> 100 MB/s) 31 | ```` 32 | - alert: HostUnusualNetworkThroughputIn 33 | expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 34 | for: 5m 35 | labels: 36 | severity: warning 37 | annotations: 38 | summary: Host unusual network throughput in (instance {{ $labels.instance }}) 39 | description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 40 | ```` 41 | 42 | ## Host unusual network throughput out 43 | #### Host network interfaces are probably sending too much data (> 100 MB/s) 44 | ```` 45 | - alert: HostUnusualNetworkThroughputOut 46 | expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 47 | for: 5m 48 | labels: 49 | severity: warning 50 | annotations: 51 | summary: Host unusual network throughput out (instance {{ $labels.instance }}) 52 | description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 53 | ```` 54 | 55 | ## Host unusual disk read rate 56 | #### Disk is probably reading too much data (> 50 MB/s) 57 | ```` 58 | - alert: HostUnusualDiskReadRate 59 | expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 60 | for: 5m 61 | labels: 62 | severity: warning 63 | annotations: 64 | summary: Host unusual disk read rate (instance {{ $labels.instance }}) 65 | description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 66 | ```` 67 | 68 | ## Host unusual disk write rate 69 | #### Disk is probably writing too much data (> 50 MB/s) 70 | ```` 71 | - alert: HostUnusualDiskWriteRate 72 | expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 
1024 > 50 73 | for: 2m 74 | labels: 75 | severity: warning 76 | annotations: 77 | summary: Host unusual disk write rate (instance {{ $labels.instance }}) 78 | description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 79 | ```` 80 | 81 | ## Host out of disk space 82 | #### Disk is almost full (< 10% left) 83 | ```` 84 | # Please add ignored mountpoints in node_exporter parameters like 85 | # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". 86 | # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 87 | - alert: HostOutOfDiskSpace 88 | expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 89 | for: 2m 90 | labels: 91 | severity: warning 92 | annotations: 93 | summary: Host out of disk space (instance {{ $labels.instance }}) 94 | description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 95 | ```` 96 | 97 | ## Host disk will fill in 24 hours 98 | #### Filesystem is predicted to run out of space within the next 24 hours at current write rate 99 | ```` 100 | # Please add ignored mountpoints in node_exporter parameters like 101 | # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". 102 | # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 
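# Note on the rule below: predict_linear() fits a least-squares line through
# the last 1h of free-space samples and extrapolates 24h (24 * 3600 s) ahead;
# a negative result means the filesystem is projected to hit zero free bytes
# within a day. The "< 10" percent guard keeps a short write burst on a
# mostly-empty disk from firing this alert.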
103 | - alert: HostDiskWillFillIn24Hours 104 | expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 105 | for: 2m 106 | labels: 107 | severity: warning 108 | annotations: 109 | summary: Host disk will fill in 24 hours (instance {{ $labels.instance }}) 110 | description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 111 | ```` 112 | 113 | ## Host out of inodes 114 | #### Disk is almost running out of available inodes (< 10% left) 115 | ```` 116 | - alert: HostOutOfInodes 117 | expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 118 | for: 2m 119 | labels: 120 | severity: warning 121 | annotations: 122 | summary: Host out of inodes (instance {{ $labels.instance }}) 123 | description: "Disk is almost running out of available inodes (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 124 | ```` 125 | 126 | ## Host inodes will fill in 24 hours 127 | #### Filesystem is predicted to run out of inodes within the next 24 hours at current write rate 128 | ```` 129 | - alert: HostInodesWillFillIn24Hours 130 | expr: node_filesystem_files_free / node_filesystem_files * 100 < 10 and predict_linear(node_filesystem_files_free[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 131 | for: 2m 132 | labels: 133 | severity: warning 134 | annotations: 135 | summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) 136 | description: "Filesystem is predicted to run out of inodes within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 137 | ```` 138 | 139 | ## Host unusual disk read latency 140 | #### 
Disk latency is growing (read operations > 100ms) 141 | ```` 142 | - alert: HostUnusualDiskReadLatency 143 | expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0 144 | for: 2m 145 | labels: 146 | severity: warning 147 | annotations: 148 | summary: Host unusual disk read latency (instance {{ $labels.instance }}) 149 | description: "Disk latency is growing (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 150 | ```` 151 | 152 | ## Host unusual disk write latency 153 | #### Disk latency is growing (write operations > 100ms) 154 | ```` 155 | - alert: HostUnusualDiskWriteLatency 156 | expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0 157 | for: 2m 158 | labels: 159 | severity: warning 160 | annotations: 161 | summary: Host unusual disk write latency (instance {{ $labels.instance }}) 162 | description: "Disk latency is growing (write operations > 100ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 163 | ```` 164 | 165 | ## Host high CPU load 166 | #### CPU load is > 80% 167 | ```` 168 | - alert: HostHighCpuLoad 169 | expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80 170 | for: 0m 171 | labels: 172 | severity: warning 173 | annotations: 174 | summary: Host high CPU load (instance {{ $labels.instance }}) 175 | description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 176 | ```` 177 | 178 | ## Host CPU steal noisy neighbor 179 | #### CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit. 
180 | ```` 181 | - alert: HostCpuStealNoisyNeighbor 182 | expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 183 | for: 0m 184 | labels: 185 | severity: warning 186 | annotations: 187 | summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) 188 | description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 189 | ```` 190 | 191 | ## Host CPU high iowait 192 | #### CPU iowait > 5%. A high iowait means that you are disk or network bound. 193 | ```` 194 | - alert: HostCpuHighIowait 195 | expr: avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 5 196 | for: 0m 197 | labels: 198 | severity: warning 199 | annotations: 200 | summary: Host CPU high iowait (instance {{ $labels.instance }}) 201 | description: "CPU iowait > 5%. A high iowait means that you are disk or network bound.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 202 | ```` 203 | 204 | ## Host context switching 205 | #### Context switching is growing on node (> 1000 / s) 206 | ```` 207 | # 1000 context switches is an arbitrary number. 208 | # Alert threshold depends on nature of application. 
209 | # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 210 | - alert: HostContextSwitching 211 | expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 1000 212 | for: 0m 213 | labels: 214 | severity: warning 215 | annotations: 216 | summary: Host context switching (instance {{ $labels.instance }}) 217 | description: "Context switching is growing on node (> 1000 / s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 218 | ```` 219 | 220 | ## Host swap is filling up 221 | #### Swap is filling up (>80%) 222 | ```` 223 | - alert: HostSwapIsFillingUp 224 | expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80 225 | for: 2m 226 | labels: 227 | severity: warning 228 | annotations: 229 | summary: Host swap is filling up (instance {{ $labels.instance }}) 230 | description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 231 | ```` 232 | 233 | ## Host systemd service crashed 234 | #### systemd service crashed 235 | ```` 236 | - alert: HostSystemdServiceCrashed 237 | expr: node_systemd_unit_state{state="failed"} == 1 238 | for: 0m 239 | labels: 240 | severity: warning 241 | annotations: 242 | summary: Host systemd service crashed (instance {{ $labels.instance }}) 243 | description: "systemd service crashed\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 244 | ```` 245 | 246 | ## Host physical component too hot 247 | #### Physical hardware component too hot 248 | ```` 249 | - alert: HostPhysicalComponentTooHot 250 | expr: node_hwmon_temp_celsius > 75 251 | for: 5m 252 | labels: 253 | severity: warning 254 | annotations: 255 | summary: Host physical component too hot (instance {{ $labels.instance }}) 256 | description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 257 | ```` 258 | 259 | ## Host node overtemperature alarm 260 | #### Physical node temperature alarm triggered 261 | ```` 262 | - 
alert: HostNodeOvertemperatureAlarm 263 | expr: node_hwmon_temp_crit_alarm_celsius == 1 264 | for: 0m 265 | labels: 266 | severity: critical 267 | annotations: 268 | summary: Host node overtemperature alarm (instance {{ $labels.instance }}) 269 | description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 270 | ```` 271 | 272 | ## Host RAID array got inactive 273 | #### RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically. 274 | ```` 275 | - alert: HostRaidArrayGotInactive 276 | expr: node_md_state{state="inactive"} > 0 277 | for: 0m 278 | labels: 279 | severity: critical 280 | annotations: 281 | summary: Host RAID array got inactive (instance {{ $labels.instance }}) 282 | description: "RAID array {{ $labels.device }} is in a degraded state due to one or more disk failures. The number of spare drives is insufficient to fix the issue automatically.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 283 | ```` 284 | 285 | ## Host RAID disk failure 286 | #### At least one device in RAID array on {{ $labels.instance }} failed. Array {{ $labels.md_device }} needs attention and possibly a disk swap 287 | ```` 288 | - alert: HostRaidDiskFailure 289 | expr: node_md_disks{state="failed"} > 0 290 | for: 2m 291 | labels: 292 | severity: warning 293 | annotations: 294 | summary: Host RAID disk failure (instance {{ $labels.instance }}) 295 | description: "At least one device in RAID array on {{ $labels.instance }} failed. 
Array {{ $labels.md_device }} needs attention and possibly a disk swap\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 296 | ```` 297 | 298 | ## Host kernel version deviations 299 | #### Different kernel versions are running 300 | ```` 301 | - alert: HostKernelVersionDeviations 302 | expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1 303 | for: 6h 304 | labels: 305 | severity: warning 306 | annotations: 307 | summary: Host kernel version deviations (instance {{ $labels.instance }}) 308 | description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 309 | ```` 310 | 311 | ## Host OOM kill detected 312 | #### OOM kill detected 313 | ```` 314 | - alert: HostOomKillDetected 315 | expr: increase(node_vmstat_oom_kill[1m]) > 0 316 | for: 0m 317 | labels: 318 | severity: warning 319 | annotations: 320 | summary: Host OOM kill detected (instance {{ $labels.instance }}) 321 | description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 322 | ```` 323 | 324 | ## Host EDAC Correctable Errors detected 325 | #### Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute. 326 | ```` 327 | - alert: HostEdacCorrectableErrorsDetected 328 | expr: increase(node_edac_correctable_errors_total[1m]) > 0 329 | for: 0m 330 | labels: 331 | severity: info 332 | annotations: 333 | summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) 334 | description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last minute.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 335 | ```` 336 | 337 | ## Host EDAC Uncorrectable Errors detected 338 | #### Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC. 
339 | ```` 340 | - alert: HostEdacUncorrectableErrorsDetected 341 | expr: node_edac_uncorrectable_errors_total > 0 342 | for: 0m 343 | labels: 344 | severity: warning 345 | annotations: 346 | summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) 347 | description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 348 | ```` 349 | 350 | ## Host Network Receive Errors 351 | #### Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ printf "%.2f" $value }} over the last two minutes. 352 | ```` 353 | - alert: HostNetworkReceiveErrors 354 | expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 355 | for: 2m 356 | labels: 357 | severity: warning 358 | annotations: 359 | summary: Host Network Receive Errors (instance {{ $labels.instance }}) 360 | description: "Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ printf \"%.2f\" $value }} over the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 361 | ```` 362 | 363 | ## Host Network Transmit Errors 364 | #### Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ printf "%.2f" $value }} over the last two minutes. 
365 | ```` 366 | - alert: HostNetworkTransmitErrors 367 | expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 368 | for: 2m 369 | labels: 370 | severity: warning 371 | annotations: 372 | summary: Host Network Transmit Errors (instance {{ $labels.instance }}) 373 | description: "Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last two minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 374 | ```` 375 | 376 | ## Host Network Interface Saturated 377 | #### The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded. 378 | ```` 379 | - alert: HostNetworkInterfaceSaturated 380 | expr: (rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000 381 | for: 1m 382 | labels: 383 | severity: warning 384 | annotations: 385 | summary: Host Network Interface Saturated (instance {{ $labels.instance }}) 386 | description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 387 | ```` 388 | 389 | ## Host Network Bond Degraded 390 | #### Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}". 
391 | ```` 392 | - alert: HostNetworkBondDegraded 393 | expr: (node_bonding_active - node_bonding_slaves) != 0 394 | for: 2m 395 | labels: 396 | severity: warning 397 | annotations: 398 | summary: Host Network Bond Degraded (instance {{ $labels.instance }}) 399 | description: "Bond \"{{ $labels.device }}\" degraded on \"{{ $labels.instance }}\".\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 400 | ```` 401 | 402 | ## Host conntrack limit 403 | #### The number of conntrack entries is approaching the limit 404 | ```` 405 | - alert: HostConntrackLimit 406 | expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 407 | for: 5m 408 | labels: 409 | severity: warning 410 | annotations: 411 | summary: Host conntrack limit (instance {{ $labels.instance }}) 412 | description: "The number of conntrack entries is approaching the limit\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 413 | ```` 414 | 415 | ## Host clock skew 416 | #### Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host. 417 | ```` 418 | - alert: HostClockSkew 419 | expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) 420 | for: 2m 421 | labels: 422 | severity: warning 423 | annotations: 424 | summary: Host clock skew (instance {{ $labels.instance }}) 425 | description: "Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 426 | ```` 427 | 428 | ## Host clock not synchronising 429 | #### Clock not synchronising. Ensure NTP is configured on this host.
430 | ```` 431 | - alert: HostClockNotSynchronising 432 | expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16 433 | for: 2m 434 | labels: 435 | severity: warning 436 | annotations: 437 | summary: Host clock not synchronising (instance {{ $labels.instance }}) 438 | description: "Clock not synchronising. Ensure NTP is configured on this host.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 439 | ```` 440 | 441 | ## Host requires reboot 442 | #### {{ $labels.instance }} requires a reboot. 443 | ```` 444 | - alert: HostRequiresReboot 445 | expr: node_reboot_required > 0 446 | for: 4h 447 | labels: 448 | severity: info 449 | annotations: 450 | summary: Host requires reboot (instance {{ $labels.instance }}) 451 | description: "{{ $labels.instance }} requires a reboot.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 452 | ```` 453 | --------------------------------------------------------------------------------
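
Rules like the ones above can be exercised before deployment with `promtool test rules`. A minimal sketch for the HostOomKillDetected rule, assuming the alerts are wrapped in a standard `groups:`/`rules:` block and saved as `node-exporter.rules.yml` (the filename, instance label, and sample values here are hypothetical):

````
# Hypothetical promtool unit test: save as oom-test.yml and run
#   promtool test rules oom-test.yml
rule_files:
  - node-exporter.rules.yml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # The OOM-kill counter jumps from 0 to 2 between the 1m and 2m samples.
      - series: 'node_vmstat_oom_kill{instance="node1:9100"}'
        values: '0 0 2'
    alert_rule_test:
      - eval_time: 2m
        alertname: HostOomKillDetected
        exp_alerts:
          - exp_labels:
              severity: warning
              instance: node1:9100
````

promtool replays the input series against the rule file and asserts that the alert fires with the expected labels; since the rule uses `for: 0m`, a single counter jump inside the `[1m]` window is enough to make the test pass.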