├── README.md
└── ceph.yml

/README.md:
--------------------------------------------------------------------------------
# [CEPH](https://www.ceph.com) prometheus rules
## A set of production-grade rules used to monitor a CEPH cluster.

Since the Luminous release, CEPH exposes a [prometheus endpoint](http://docs.ceph.com/docs/luminous/mgr/prometheus/) that you can use for alerting.
This repository contains a set of basic rules that provide a good base for your CEPH monitoring.

--------------------------------------------------------------------------------
/ceph.yml:
--------------------------------------------------------------------------------
groups:
- name: ceph.rules
  rules:
  - alert: CephTargetDown
    expr: up{job="ceph"} == 0
    for: 10m
    labels:
      severity: critical
    annotations:
      description: CEPH target down for more than 10m, please check - it could be either an exporter crash or a whole cluster crash
      summary: CEPH exporter down
  - alert: CephErrorState
    expr: ceph_health_status > 1
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Ceph has been in error state for longer than 5m, please check the status of pools and OSDs
      summary: CEPH in ERROR
  - alert: CephWarnState
    expr: ceph_health_status == 1
    for: 30m
    labels:
      severity: warning
    annotations:
      description: Ceph has been in warning state for longer than 30m, please check the status of pools and OSDs
      summary: CEPH in WARN
  - alert: OsdDown
    expr: ceph_osd_up == 0
    for: 30m
    labels:
      severity: warning
    annotations:
      description: OSD has been down for longer than 30 min, please check its status
      summary: OSD down
  - alert: OsdApplyLatencyTooHigh
    expr: ceph_osd_perf_apply_latency_seconds > 10
    for: 90s
    labels:
      severity: warning
    annotations:
      description: OSD apply latency for {{ $labels.osd }} is too high. Please check that it is not stuck in an abnormal state
      summary: OSD latency too high {{ $labels.osd }}
  - alert: MonitorClockSkewTooHigh
    expr: abs(ceph_monitor_clock_skew_seconds) > 0.1
    for: 60s
    labels:
      severity: warning
    annotations:
      description: Monitor clock skew detected on {{ $labels.monitor }} - please check the NTP and hardware clock settings
      summary: Clock skew detected on {{ $labels.monitor }}
  - alert: MonitorAvailableStorage
    expr: ceph_monitor_avail_percent < 30
    for: 60s
    labels:
      severity: warning
    annotations:
      description: Available storage on monitor {{ $labels.monitor }} is less than 30% - please check why it is running low
      summary: Monitor storage for {{ $labels.monitor }} less than 30%
  - alert: MonitorAvailableStorage
    expr: ceph_monitor_avail_percent < 15
    for: 60s
    labels:
      severity: critical
    annotations:
      description: Available storage on monitor {{ $labels.monitor }} is less than 15% - please check why it is running low
      summary: Monitor storage for {{ $labels.monitor }} less than 15%
  - alert: CephOSDUtilization
    expr: ceph_osd_utilization > 90
    for: 60s
    labels:
      severity: critical
    annotations:
      description: OSD utilization for {{ $labels.osd }} is higher than 90%. Please check why it is so high, reweight the OSD or add storage
      summary: OSD {{ $labels.osd }} is running out of space
  - alert: CephPgDown
    expr: ceph_pg_down > 0
    for: 3m
    labels:
      severity: critical
    annotations:
      description: Some placement groups have been down (unavailable) for too long on {{ $labels.cluster }}. Please ensure that all data is available
      summary: PG DOWN [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephPgIncomplete
    expr: ceph_pg_incomplete > 0
    for: 2m
    labels:
      severity: critical
    annotations:
      description: Some placement groups have been incomplete (unavailable) for too long on {{ $labels.cluster }}. Please ensure that all data is available
      summary: PG INCOMPLETE [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephPgInconsistent
    expr: ceph_pg_inconsistent > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      description: Some placement groups have been inconsistent for too long on {{ $labels.cluster }}. Data is available but inconsistent across nodes
      summary: PG INCONSISTENT [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephPgActivating
    expr: ceph_pg_activating > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Some placement groups have been activating for too long on {{ $labels.cluster }}. Those PGs have been unavailable for too long!
      summary: PG ACTIVATING [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephPgBackfillTooFull
    expr: ceph_pg_backfill_toofull > 0
    for: 5m
    labels:
      severity: warning
    annotations:
      description: Some placement groups are located on full OSDs on cluster {{ $labels.cluster }}. Those PGs can become unavailable shortly. Please check the OSDs, change their weight or reconfigure CRUSH rules.
      summary: PG TOO FULL [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephPgUnavailable
    expr: ceph_pg_total - ceph_pg_active > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      description: Some placement groups are unavailable on {{ $labels.cluster }}. Please check their detailed status and the current configuration.
      summary: PG UNAVAILABLE [{{ $value }}] on {{ $labels.cluster }}
  - alert: CephOsdReweighted
    expr: ceph_osd_weight < 1
    for: 1h
    labels:
      severity: warning
    annotations:
      description: OSD {{ $labels.ceph_daemon }} on cluster {{ $labels.cluster }} has been reweighted for too long. Please either create a silence or fix the issue
      summary: OSD {{ $labels.ceph_daemon }} on {{ $labels.cluster }} reweighted - {{ $value }}
--------------------------------------------------------------------------------
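
To wire these rules into Prometheus, enable the ceph-mgr prometheus module (`ceph mgr module enable prometheus`), scrape it under a job named `ceph` (the name the `CephTargetDown` alert expects in `up{job="ceph"}`), and load `ceph.yml` via `rule_files`. The snippet below is a minimal sketch, not a definitive setup: the rules path, the hostnames and the default mgr port 9283 are assumptions to adapt to your environment.

```yaml
# prometheus.yml (fragment) - minimal sketch, adjust to your environment
rule_files:
  - /etc/prometheus/rules/ceph.yml      # assumed location of the rules from this repository

scrape_configs:
  - job_name: ceph                      # must stay "ceph" to match up{job="ceph"} in CephTargetDown
    static_configs:
      - targets:
          # hypothetical mgr hosts; the ceph-mgr prometheus module listens on 9283 by default
          - ceph-mgr-1.example.com:9283
          - ceph-mgr-2.example.com:9283
```

You can validate the rule file with `promtool check rules ceph.yml` before reloading Prometheus.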