├── alert.yml ├── etcd.txt ├── one-latest.yml └── readme.md /alert.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | prometheus: k8s 6 | role: alert-rules 7 | name: node-rules 8 | namespace: monitoring 9 | spec: 10 | groups: 11 | - name: external_node_alarm 12 | rules: 13 | - alert: node_host_lost 14 | expr: up{job="external-node"} == 0 15 | for: 1m 16 | labels: 17 | severity: critical 18 | annotations: 19 | summary: "{{$labels.instance}}:service is down" 20 | description: "{{$labels.instance}}: lost contact for 1 minutes" 21 | - alert: HostOutOfMemory10 22 | expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 23 | for: 1m 24 | labels: 25 | severity: critical 26 | annotations: 27 | summary: (instance {{ $labels.instance }})实例内存不足 28 | description: "节点内存已用完 (剩余小于10%)已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 29 | - alert: HostOutOfMemory2 30 | expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 2 31 | for: 1m 32 | labels: 33 | severity: critical 34 | annotations: 35 | summary: (instance {{ $labels.instance }})实例内存不足,可能会发生内存溢出 36 | description: "节点内存已用完 (剩余小于2%)已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 37 | - alert: HostMemoryUnderMemoryPressure 38 | expr: rate(node_vmstat_pgmajfault[1m]) > 1000 39 | for: 1m 40 | labels: 41 | severity: warning 42 | annotations: 43 | summary: (instance {{ $labels.instance }})实例内存压力很大 44 | description: "节点内存压力很大major page错误率高已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 45 | - alert: HostUnusualNetworkThroughputIn 46 | expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 47 | for: 2m 48 | labels: 49 | severity: critical 50 | annotations: 51 | summary: (instance {{ $labels.instance }})实例出现异常的input网络吞吐量 52 | description: "{{ $labels.instance }}网络接口接收超过100MB/s的数据已持续2分钟 \n VALUE = {{ $value 
}}\n LABELS = {{ $labels }}" 53 | - alert: HostUnusualNetworkThroughputOut 54 | expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 55 | for: 2m 56 | labels: 57 | severity: critical 58 | annotations: 59 | summary: (instance {{ $labels.instance }})实例出现异常的output网络吞吐量 60 | description: "{{ $labels.instance }}实例网络发送了超过100MB/s的数据已持续2分钟 (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 61 | - alert: HostUnusualDiskReadRatecritical 62 | expr: sum by (instance) (rate(node_disk_read_bytes_total[1m])) / 1024 / 1024 > 100 63 | for: 1m 64 | labels: 65 | severity: critical 66 | annotations: 67 | summary: (instance {{ $labels.instance }})实例磁盘读取过高 68 | description: "{{ $labels.instance }}实例磁盘读取超过>100MB/s avg [HDD]已持续1分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 69 | - alert: HostUnusualDiskReadRatewarning 70 | expr: sum by (instance) (rate(node_disk_read_bytes_total[1m])) / 1024 / 1024 > 60 71 | for: 1m 72 | labels: 73 | severity: warning 74 | annotations: 75 | summary: (instance {{ $labels.instance }})实例磁盘读取过高 76 | description: "{{ $labels.instance }}实例磁盘读取超过>60MB/s avg [HDD]已持续1分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 77 | - alert: HostUnusualDiskWriteRatewarning 78 | expr: sum by (instance) (rate(node_disk_written_bytes_total[1m])) / 1024 / 1024 > 60 79 | for: 1m 80 | labels: 81 | severity: warning 82 | annotations: 83 | summary: (instance {{ $labels.instance }})实例磁盘写入过高 84 | description: "{{ $labels.instance }}实例磁盘写入超过60MB/s avg [HDD]已持续1分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 85 | - alert: HostUnusualDiskWriteRatecritical 86 | expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 60 87 | for: 2m 88 | labels: 89 | severity: critical 90 | annotations: 91 | summary: (instance {{ $labels.instance }})实例磁盘写入过高 92 | description: "{{ $labels.instance }}实例磁盘写入超过60MB/s avg [HDD]已持续2分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 93 | # Please add ignored mountpoints in 
node_exporter parameters like 94 | # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". 95 | # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 96 | - alert: HostOutOfDiskSpacefree 97 | expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 98 | for: 2m 99 | labels: 100 | severity: critical 101 | annotations: 102 | summary: (instance {{ $labels.instance }})实例磁盘空间不足 103 | description: "{{ $labels.instance }}磁盘快满了,剩余不足10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 104 | # Please add ignored mountpoints in node_exporter parameters like 105 | # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". 106 | # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 107 | - alert: HostOutOfDiskSpacehigh 108 | expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 5 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 109 | for: 1m 110 | labels: 111 | severity: critical 112 | annotations: 113 | summary: (instance {{ $labels.instance }})实例磁盘空间不足 114 | description: "{{ $labels.instance }}磁盘快满了,剩余不足5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 115 | - alert: HostDiskWillFillIn24Hours 116 | expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 117 | for: 2m 118 | labels: 119 | severity: critical 120 | annotations: 121 | summary: (instance {{ $labels.instance }})在24小时内就会写满 122 | description: "{{ $labels.instance }}在当前的写入速率下,预计文件系统将会在24小时内耗尽磁盘空间\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 123 | - alert: HostOutOfInodes 124 | expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} 
* 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 125 | for: 2m 126 | labels: 127 | severity: critical 128 | annotations: 129 | summary: (instance {{ $labels.instance }})实例inodes不足 130 | description: "{{ $labels.instance }}磁盘几乎用完了inodes,剩余不足10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 131 | - alert: HostInodesWillFillIn24Hours 132 | expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 *3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 133 | for: 2m 134 | labels: 135 | severity: critical 136 | annotations: 137 | summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) 138 | description: "{{ $labels.instance }}以当前写入速率,文件系统预计将在未来 24小时内耗尽inode\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 139 | - alert: HostUnusualDiskReadLatency 140 | expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.2 and rate(node_disk_reads_completed_total[1m]) > 0 141 | for: 1m 142 | labels: 143 | severity: critical 144 | annotations: 145 | summary: (instance {{ $labels.instance }})实例磁盘读取延迟过高 146 | description: "磁盘读取延迟正在增长,当下大于200ms已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 147 | - alert: HostUnusualDiskWriteLatency 148 | expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0 149 | for: 1m 150 | labels: 151 | severity: warning 152 | annotations: 153 | summary: (instance {{ $labels.instance }})实例磁盘写入延迟过高 154 | description: "磁盘写入延迟正在增长,当下大于100ms已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 155 | - alert: HostHighCpuLoad85 156 | expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 85 157 | for: 1m 158 | labels: 159 | severity: critical 
160 | annotations: 161 | summary: (instance {{ $labels.instance }})实例CPU负载过高 162 | description: "当前CPU load高于85%持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 163 | - alert: HostHighCpuLoad75 164 | expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 75 165 | for: 1m 166 | labels: 167 | severity: critical 168 | annotations: 169 | summary: (instance {{ $labels.instance }})实例CPU负载过高 170 | description: "当前CPU load高于75%持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 171 | - alert: HostCpuStealNoisyNeighbor 172 | expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 173 | for: 5m 174 | labels: 175 | severity: warning 176 | annotations: 177 | summary: (instance {{ $labels.instance }})CPU steal time开始增长 178 | description: "CPU steal time大于10%已持续5分钟. 这意味着CPU资源不足或vm资源竞争。当前实例计算能力将开始下降.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 179 | # 1000 context switches is an arbitrary number. 180 | # Alert threshold depends on nature of application. 
181 | # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 182 | - alert: HostContextSwitching 183 | expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000 184 | for: 0m 185 | labels: 186 | severity: warning 187 | annotations: 188 | summary: (instance {{ $labels.instance }})主机上下文切换异常 189 | description: "实例上下文切换正在增长,当下超过10000/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 190 | - alert: HostSwapIsFillingUp 191 | expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80 192 | for: 2m 193 | labels: 194 | severity: warning 195 | annotations: 196 | summary: (instance {{ $labels.instance }})swap内存即将用完 197 | description: "Swap使用超过80% \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 198 | - alert: HostSystemdServiceCrashed 199 | expr: node_systemd_unit_state{state="failed"} == 1 200 | for: 0m 201 | labels: 202 | severity: critical 203 | annotations: 204 | summary: 请注意,(instance {{ $labels.instance }})当前实例systemd service服务已崩溃 205 | description: "当前实例systemd service崩溃\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 206 | - alert: HostKernelVersionDeviations 207 | expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1 208 | for: 6h 209 | labels: 210 | severity: warning 211 | annotations: 212 | summary: (instance {{ $labels.instance }})实例内核版本偏差 213 | description: "正在运行的内核版本:\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 214 | - alert: HostOomKillDetected 215 | expr: increase(node_vmstat_oom_kill[1m]) > 0 216 | for: 0m 217 | labels: 218 | severity: critical 219 | annotations: 220 | summary: (instance {{ $labels.instance }})实例发生OOM kill事件 221 | description: "检测到OOM kill\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 222 | - alert: HostNetworkReceiveErrors 223 | expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 224 | for: 2m 225 | labels: 
226 |         severity: warning 227 |       annotations: 228 |         summary: (instance {{ $labels.instance }})主机网络接收错误 229 |         description: "主机 {{ $labels.instance }} interface {{ $labels.device }} 在过去五分钟内遇到了 {{ printf \"%.0f\" $value }} 接收错误。\n  VALUE = {{ $value }}\n  标签 = {{ $labels }}" 230 |     - alert: HostNetworkTransmitErrors 231 |       expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 232 |       for: 2m 233 |       labels: 234 |         severity: warning 235 |       annotations: 236 |         summary: (instance {{ $labels.instance }})主机网络传输错误 237 |         description: "主机 {{ $labels.instance }} interface {{ $labels.device }} 在过去五分钟内遇到了 {{ printf \"%.0f\" $value }} 传输错误。\n  VALUE = {{ $value }}\n  标签 = {{ $labels }}" 238 |     - alert: HostNetworkInterfaceSaturated 239 |       expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 240 |       for: 1m 241 |       labels: 242 |         severity: critical 243 |       annotations: 244 |         summary: (instance {{ $labels.instance }})网络接口接近饱和 245 |         description: "\"{{ $labels.instance }}\" 上的网络接口 \"{{ $labels.device }}\" 正在过载。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 246 |     - alert: HostConntrackLimit 247 |       expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 248 |       for: 5m 249 |       labels: 250 |         severity: warning 251 |       annotations: 252 |         summary: 主机 conntrack 限制(instance {{ $labels.instance }}) 253 |         description: "conntrack 数量接近限制\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 254 |     - alert: HostClockSkew 255 |       expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) 256 |       for: 2m 257 |       labels: 258 |         severity: warning 259 |       annotations: 260 |         summary: 主机时钟偏差(instance {{ $labels.instance }}) 261 |         description: "检测到时钟偏差。时钟不同步。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 262 |     - alert: 
HostEdacCorrectableErrorsDetected 263 | expr: increase(node_edac_correctable_errors_total[1m]) > 0 264 | for: 0m 265 | labels: 266 | severity: info 267 | annotations: 268 | summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) 269 | description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 270 | - alert: HostEdacUncorrectableErrorsDetected 271 | expr: node_edac_uncorrectable_errors_total > 0 272 | for: 0m 273 | labels: 274 | severity: warning 275 | annotations: 276 | summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) 277 | description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 278 | - alert: node_disk_Utilization_high 279 | expr: rate(node_disk_io_time_seconds_total[5m]) * 100 > 90 280 | for: 1m 281 | labels: 282 | severity: critical 283 | annotations: 284 | summary: "{{$labels.instance}}: disk Utilization > {{ $value }} % " 285 | description: "{{$labels.instance}}: disk Utilization > {{ $value }}. this should attract attention . 
disk performance seems to be problematic .for 1 minutes" 286 | --- 287 | apiVersion: monitoring.coreos.com/v1 288 | kind: PrometheusRule 289 | metadata: 290 |   labels: 291 |     prometheus: k8s 292 |     role: alert-rules 293 |   name: redis-rules 294 |   namespace: monitoring 295 | spec: 296 |   groups: 297 |   - name: redisStatsAlert 298 |     rules: 299 |     - alert: redis_is_down 300 |       expr: up{job="external-redis"} == 0 301 |       for: 1m 302 |       labels: 303 |         severity: critical 304 |       annotations: 305 |         summary: "{{ $labels.instance }} redis is down" 306 |         description: "{{ $labels.instance }} redis is down " 307 | --- 308 | apiVersion: monitoring.coreos.com/v1 309 | kind: PrometheusRule 310 | metadata: 311 |   labels: 312 |     prometheus: k8s 313 |     role: alert-rules 314 |   name: rabbitmq-rules 315 |   namespace: monitoring 316 | spec: 317 |   groups: 318 |   - name: rabbitmqStatsAlert 319 |     rules: 320 |     - alert: rabbitmq_is_down 321 |       expr: up{job="external-rabbitmq"} == 0 322 |       for: 1m 323 |       labels: 324 |         severity: critical 325 |       annotations: 326 |         summary: "{{ $labels.instance }} rabbitmq is down" 327 |         description: "{{ $labels.instance }} rabbitmq is down " 328 | --- 329 | apiVersion: monitoring.coreos.com/v1 330 | kind: PrometheusRule 331 | metadata: 332 |   labels: 333 |     prometheus: k8s 334 |     role: alert-rules 335 |   name: mysql-rules 336 |   namespace: monitoring 337 | spec: 338 |   groups: 339 |   - name: MySQLStatsAlert 340 |     rules: 341 |     - alert: MySQL_is_down 342 |       expr: mysql_up == 0 343 |       for: 1m 344 |       labels: 345 |         severity: critical 346 |       annotations: 347 |         summary: "{{ $labels.instance }} MySQL is down" 348 |         description: "MySQL database is down. This requires immediate action!" 
349 |     - alert: MysqlTooManyConnections 350 |       expr: avg by (instance) (rate(mysql_global_status_threads_connected[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80 351 |       for: 2m 352 |       labels: 353 |         severity: warning 354 |       annotations: 355 |         summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }}) 356 |         description: "超过 80% 的 MySQL 连接在 {{ $labels.instance }} 上使用\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 357 |     - alert: MysqlHighThreadsRunning 358 |       expr: avg by (instance) (rate(mysql_global_status_threads_running[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60 359 |       for: 2m 360 |       labels: 361 |         severity: warning 362 |       annotations: 363 |         summary: MySQL high threads running (instance {{ $labels.instance }}) 364 |         description: "超过 60% 的 MySQL 连接在 {{ $labels.instance }} 上处于运行状态\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 365 |     - alert: MysqlRestarted 366 |       expr: mysql_global_status_uptime < 60 367 |       for: 0m 368 |       labels: 369 |         severity: critical 370 |       annotations: 371 |         summary: MySQL重新启动 (instance {{ $labels.instance }}) 372 |         description: "MySQL 刚刚重新启动,不到一分钟前在 {{ $labels.instance }} 上。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 373 | ## MySQL Slave IO 线程未运行 374 | #- alert: MysqlSlaveIoThreadNotRunning 375 | #  expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0 376 | #  for: 0m 377 | #  labels: 378 | #    severity: critical 379 | #  annotations: 380 | #    summary: MySQL Slave IO thread not running (instance {{ $labels.instance }}) 381 | #    description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 382 | ## MySQL Slave SQL 线程未运行 383 | #- alert: MysqlSlaveSqlThreadNotRunning 384 | #  expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0 385 | #  for: 0m 386 | #  labels: 387 | #    severity: critical 388 | #  annotations: 
389 | # summary: MySQL Slave SQL thread not running (instance {{ $labels.instance }}) 390 | # description: "MySQL Slave SQL thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 391 | ## MySQL Slave 复制滞后 392 | #- alert: MysqlSlaveReplicationLag 393 | # expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30 394 | # for: 1m 395 | # labels: 396 | # severity: critical 397 | # annotations: 398 | # summary: MySQL Slave replication lag (instance {{ $labels.instance }}) 399 | # description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 400 | ## MySQL 慢查询 401 | #- alert: MysqlSlowQueries 402 | # expr: increase(mysql_global_status_slow_queries[1m]) > 0 403 | # for: 2m 404 | # labels: 405 | # severity: warning 406 | # annotations: 407 | # summary: MySQL slow queries (instance {{ $labels.instance }}) 408 | # description: "MySQL server mysql has some new slow query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 409 | ## MySQL innodb 日志写入停滞 MySQL InnoDB 日志等待 410 | #- alert: MysqlInnodbLogWaits 411 | # expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10 412 | # for: 0m 413 | # labels: 414 | # severity: warning 415 | # annotations: 416 | # summary: MySQL InnoDB log waits (instance {{ $labels.instance }}) 417 | # description: "MySQL innodb log writes stalling\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 418 | --- 419 | apiVersion: monitoring.coreos.com/v1 420 | kind: PrometheusRule 421 | metadata: 422 | labels: 423 | prometheus: k8s 424 | role: alert-rules 425 | name: kafka-rules 426 | namespace: monitoring 427 | spec: 428 | groups: 429 | - name: external_kafka_alarm 430 | rules: 431 | - alert: KafkaTopicsReplicas 432 | expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3 433 | for: 0m 434 | labels: 435 | severity: critical 436 | annotations: 437 | summary: Kafka topics replicas 
(instance {{ $labels.instance }}) 438 | description: "Kafka 副本分区\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 439 | - alert: kafka_is_down 440 | expr: up{job="external-kafka"} == 0 441 | for: 0m 442 | labels: 443 | severity: critical 444 | annotations: 445 | summary: Kafka server is down (instance {{ $labels.instance }}) 446 | description: " {{ $labels.instance }} Kafka server is down now!" 447 | - alert: KafkaConsumersGroup 448 | expr: sum(kafka_consumergroup_lag) by (consumergroup) > 8888 449 | for: 1m 450 | labels: 451 | severity: critical 452 | annotations: 453 | summary: Kafka consumers group (instance {{ $labels.instance }}) 454 | description: "Kafka 消费者组\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 455 | - alert: KafkaTopicOffsetDecreased 456 | expr: delta(kafka_burrow_partition_current_offset[1m]) < 0 457 | for: 0m 458 | labels: 459 | severity: warning 460 | annotations: 461 | summary: Kafka topic offset decreased (instance {{ $labels.instance }}) 462 | description: "Kafka 主题偏移量已减少\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 463 | - alert: KafkaConsumerLag 464 | expr: 'kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) 465 | AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0' 466 | for: 15m 467 | labels: 468 | severity: critical 469 | annotations: 470 | summary: Kafka consumer lag (instance {{ $labels.instance }}) 471 | description: "卡夫卡消费者有 30 分钟的延迟并且越来越多\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 472 | --- 473 | apiVersion: monitoring.coreos.com/v1 474 | kind: PrometheusRule 475 | metadata: 476 | labels: 477 | prometheus: k8s 478 | role: alert-rules 479 | name: mongodb-rules 480 | namespace: monitoring 481 | spec: 482 | groups: 483 | - name: 
external_mongodb_alarm 484 | rules: 485 | - alert: MongodbDown 486 | expr: mongodb_up == 0 487 | for: 0m 488 | labels: 489 | severity: critical 490 | annotations: 491 | summary: MongoDB Down (instance {{ $labels.instance }}) 492 | description: "MongoDB 实例已关闭\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 493 | - alert: MongodbReplicationLag 494 | #expr: mongodb_mongod_replset_member_optime_date{state="PRIMARY"} - ON (set) mongodb_mongod_replset_member_optime_date{state="SECONDARY"} > 10 495 | expr: avg(mongodb_mongod_replset_member_optime_date{state="PRIMARY"})-avg(mongodb_mongod_replset_member_optime_date{state="SECONDARY"}) > 10 496 | for: 0m 497 | labels: 498 | severity: critical 499 | annotations: 500 | summary: MongoDB replication lag (instance {{ $labels.instance }}) 501 | description: "Mongodb 复制延迟超过 10 秒\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 502 | - alert: MongodbNumberCursorsOpen 503 | expr: mongodb_mongod_metrics_cursor_open{state="total"} > 10 * 1000 504 | for: 2m 505 | labels: 506 | severity: warning 507 | annotations: 508 | summary: MongoDB number cursors open (instance {{ $labels.instance }}) 509 | description: "MongoDB 为客户端打开了太多cursors (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 510 | - alert: MongodbCursorsTimeouts 511 | expr: increase(mongodb_mongod_metrics_cursor_timed_out_total[1m]) > 100 512 | for: 2m 513 | labels: 514 | severity: warning 515 | annotations: 516 | summary: MongoDB cursors timeouts (instance {{ $labels.instance }}) 517 | description: "太多的cursors 超时\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 518 | - alert: MongodbTooManyConnections 519 | expr: avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80 520 | for: 2m 521 | labels: 522 | severity: warning 523 | annotations: 524 | summary: MongoDB too many connections (instance {{ $labels.instance }}) 525 | description: "连接过多 (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ 
$labels }}" 526 | --- 527 | apiVersion: monitoring.coreos.com/v1 528 | kind: PrometheusRule 529 | metadata: 530 | labels: 531 | prometheus: k8s 532 | role: alert-rules 533 | name: etcd-rules 534 | namespace: monitoring 535 | spec: 536 | groups: 537 | - name: external_etcd_alarm 538 | rules: 539 | - alert: EtcdInsufficientMembers 540 | expr: count(etcd_server_id) % 2 == 0 541 | for: 0m 542 | labels: 543 | severity: critical 544 | annotations: 545 | summary: etcd cluster 已崩溃 (instance {{ $labels.instance }}) 546 | description: "Etcd 集群应该有奇数个成员, \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 547 | - alert: EtcdNoLeader 548 | expr: etcd_server_has_leader == 0 549 | for: 0m 550 | labels: 551 | severity: critical 552 | annotations: 553 | summary: Etcd 选举失败 (instance {{ $labels.instance }}) 554 | description: "Etcd cluster have no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 555 | - alert: EtcdHighNumberOfLeaderChanges 556 | expr: increase(etcd_server_leader_changes_seen_total[10m]) > 2 557 | for: 0m 558 | labels: 559 | severity: warning 560 | annotations: 561 | summary: Etcd leader 切换异常 (instance {{ $labels.instance }}) 562 | description: "10分钟内leader切换了两次\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 563 | #- alert: EtcdHighNumberOfFailedGrpcRequestswarning 564 | # expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))> 1 565 | # for: 10m 566 | # labels: 567 | # severity: warning 568 | # annotations: 569 | # summary: Etcd 大量失败的 GRPC 请求 (instance {{ $labels.instance }}) 570 | # description: "在 Etcd 中检测到超过 1% 的 GRPC 请求失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 571 | #- alert: EtcdHighNumberOfFailedGrpcRequestscritical 572 | # expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) / 
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 573 | # for: 5m 574 | # labels: 575 | # severity: critical 576 | # annotations: 577 | # summary: Etcd 大量失败的 GRPC 请求 (instance {{ $labels.instance }}) 578 | # description: "在 Etcd 中检测到超过 5% 的 GRPC 请求失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 579 | - alert: EtcdGrpcRequestsSlow 580 | expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15 581 | for: 2m 582 | labels: 583 | severity: critical 584 | annotations: 585 | summary: Etcd GRPC 请求缓慢(instance {{ $labels.instance }}) 586 | description: "GRPC 请求变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 587 | - alert: EtcdHighNumberOfFailedHttpRequestswarning 588 | expr: sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 589 | for: 2m 590 | labels: 591 | severity: warning 592 | annotations: 593 | summary: Etcd 大量失败的 HTTP 请求 (instance {{ $labels.instance }}) 594 | description: "在 Etcd 中检测到超过 1% 的 HTTP 失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 595 | - alert: EtcdHighNumberOfFailedHttpRequestscritical 596 | expr: sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 597 | for: 2m 598 | labels: 599 | severity: critical 600 | annotations: 601 | summary: Etcd 大量失败的 HTTP 请求 (instance {{ $labels.instance }}) 602 | description: "在 Etcd 中检测到超过 5% 的 HTTP 失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 603 | - alert: EtcdHttpRequestsSlow 604 | expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15 605 | for: 2m 606 | labels: 607 | severity: warning 608 | annotations: 609 | summary: Etcd HTTP 请求缓慢 (instance {{ $labels.instance }}) 610 | description: "Etcd HTTP 请求变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 611 | - alert: EtcdMemberCommunicationSlow 
612 | expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.2 613 | for: 2m 614 | labels: 615 | severity: critical 616 | annotations: 617 | summary: Etcd成员通讯缓慢 (instance {{ $labels.instance }}) 618 | description: "Etcd 成员通信变慢,99% 超过 0.2s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 619 | - alert: EtcdHighNumberOfFailedProposals 620 | expr: increase(etcd_server_proposals_failed_total[1h]) > 5 621 | for: 2m 622 | labels: 623 | severity: warning 624 | annotations: 625 | summary: Etcd 大量失败的proposals (instance {{ $labels.instance }}) 626 | description: "Etcd 服务器在过去一小时收到了超过 5 个失败的proposals\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 627 | - alert: EtcdHighFsyncDurations 628 | expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5 629 | for: 2m 630 | labels: 631 | severity: critical 632 | annotations: 633 | summary: Etcd fsync 持续时间变高 (instance {{ $labels.instance }}) 634 | description: "Etcd WAL fsync 持续时间增加,99% 超过 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 635 | - alert: EtcdHighCommitDurations 636 | expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25 637 | for: 2m 638 | labels: 639 | severity: warning 640 | annotations: 641 | summary: Etcd 提交持续时间较高 (instance {{ $labels.instance }}) 642 | description: "Etcd 提交持续时间增加,99% 超过 0.25s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" -------------------------------------------------------------------------------- /etcd.txt: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: external_etcd_alarm 3 | rules: 4 | - alert: EtcdInsufficientMembers 5 | expr: count(etcd_server_id) % 2 == 0 6 | for: 0m 7 | labels: 8 | severity: critical 9 | annotations: 10 | summary: etcd cluster 已崩溃 (instance {{ $labels.instance }}) 11 | description: "Etcd 集群应该有奇数个成员, \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 12 | - alert: EtcdNoLeader 13 | expr: 
etcd_server_has_leader == 0 14 | for: 0m 15 | labels: 16 | severity: critical 17 | annotations: 18 | summary: Etcd no Leader (instance {{ $labels.instance }}) 19 | description: "Etcd cluster have no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 20 | - alert: EtcdHighNumberOfLeaderChanges 21 | expr: increase(etcd_server_leader_changes_seen_total[10m]) > 2 22 | for: 0m 23 | labels: 24 | severity: critical 25 | annotations: 26 | summary: Etcd leader 切换异常 (instance {{ $labels.instance }}) 27 | description: "10分钟内leader切换了两次\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 28 | - alert: EtcdHighNumberOfFailedGrpcRequests 29 | expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 30 | for: 5m 31 | labels: 32 | severity: warning 33 | annotations: 34 | summary: Etcd 大量失败的 GRPC 请求 (instance {{ $labels.instance }}) 35 | description: "在 Etcd 中检测到超过 5% 的 GRPC 请求失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 36 | - alert: EtcdGrpcRequestsSlow 37 | expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15 38 | for: 2m 39 | labels: 40 | severity: critical 41 | annotations: 42 | summary: Etcd GRPC 请求缓慢(instance {{ $labels.instance }}) 43 | description: "GRPC 请求变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 44 | - alert: EtcdHighNumberOfFailedHttpRequests 45 | expr: sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 46 | for: 2m 47 | labels: 48 | severity: warning 49 | annotations: 50 | summary: Etcd 大量失败的 HTTP 请求 (instance {{ $labels.instance }}) 51 | description: "在 Etcd 中检测到超过 1% 的 HTTP 失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 52 | - alert: EtcdHighNumberOfFailedHttpRequests 53 | expr: sum(rate(etcd_http_failed_total[1m])) BY (method) / 
sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 54 | for: 2m 55 | labels: 56 | severity: critical 57 | annotations: 58 | summary: Etcd 大量失败的 HTTP 请求 (instance {{ $labels.instance }}) 59 | description: "在 Etcd 中检测到超过 5% 的 HTTP 失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 60 | - alert: EtcdHttpRequestsSlow 61 | expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15 62 | for: 2m 63 | labels: 64 | severity: critical 65 | annotations: 66 | summary: Etcd HTTP 请求缓慢 (instance {{ $labels.instance }}) 67 | description: "Etcd HTTP 请求变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 68 | - alert: EtcdMemberCommunicationSlow 69 | expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15 70 | for: 2m 71 | labels: 72 | severity: critical 73 | annotations: 74 | summary: Etcd成员通讯缓慢 (instance {{ $labels.instance }}) 75 | description: "Etcd 成员通信变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 76 | - alert: EtcdHighNumberOfFailedProposals 77 | expr: increase(etcd_server_proposals_failed_total[1h]) > 5 78 | for: 2m 79 | labels: 80 | severity: warning 81 | annotations: 82 | summary: Etcd 大量失败的proposals (instance {{ $labels.instance }}) 83 | description: "Etcd 服务器在过去一小时收到了超过 5 个失败的proposals\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 84 | - alert: EtcdHighFsyncDurations 85 | expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5 86 | for: 2m 87 | labels: 88 | severity: critical 89 | annotations: 90 | summary: Etcd fsync 持续时间变高 (instance {{ $labels.instance }}) 91 | description: "Etcd WAL fsync 持续时间增加,99% 超过 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 92 | - alert: EtcdHighCommitDurations 93 | expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25 94 | for: 2m 95 | labels: 96 | severity: critical 97 | annotations: 98 | summary: Etcd 提交持续时间较高 (instance {{ 
$labels.instance }})
      description: "Etcd 提交持续时间增加,99% 超过 0.25s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
--------------------------------------------------------------------------------
/one-latest.yml:
--------------------------------------------------------------------------------
apiVersion: v1
data: {}
kind: Secret
metadata:
  name: alertmanager-main
  namespace: monitoring
stringData:
  # Alertmanager main configuration: all alerts go to the DingTalk webhook
  # proxy; critical alerts to robot webhook2, warnings to robot webhook1.
  alertmanager.yaml: |-
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname']
      group_wait: 10s
      # NOTE(review): a 10s group_interval is unusually aggressive -- confirm
      # the notification volume is acceptable.
      group_interval: 10s
      repeat_interval: 12h
      receiver: 'webhookw'
      routes:
      - receiver: 'webhooke'
        match:
          severity: 'critical'
      # FIX: was `match: severity: '~(warning)$'`. `match` is an exact
      # equality test, so the regex-looking value could never equal a real
      # severity label; `match_re` performs the intended regex match.
      - receiver: 'webhookw'
        match_re:
          severity: '(warning)$'
    receivers:
    - name: 'webhookw'
      webhook_configs:
      - send_resolved: true
        url: 'http://webhook-dingtalk:8060/dingtalk/webhook1/send'
    - name: 'webhooke'
      webhook_configs:
      - send_resolved: true
        url: 'http://webhook-dingtalk:8060/dingtalk/webhook2/send'
    # A firing critical alert suppresses the matching warning alert for the
    # same alertname/dev/instance combination.
    inhibit_rules:
    - source_match:
        severity: 'critical'
      target_match:
        severity: 'warning'
      equal: ['alertname', 'dev', 'instance']
---
apiVersion: v1
data:
  config.yaml: |
    ## Request timeout
    # timeout: 5s
    ## Customizable templates path
    templates:
      - /config/template.tmpl

    ## You can also override default template using `default_message`
    ## The following example to use the 'legacy' template from v0.3.0
    # default_message:
    #   title: '{{ template "legacy.title" . }}'
    #   text: '{{ template "legacy.content" . }}'
    ## SECURITY(review): DingTalk access tokens and signing secrets are kept
    ## in a plain ConfigMap, readable by anyone with namespace access; they
    ## should be moved to a Kubernetes Secret.
    targets:
      webhook1:
        url: https://oapi.dingtalk.com/robot/send?access_token=ee2d09e90b0d12
        # secret for signature
        secret: SECd9242c03ffac2277f0c9
      webhook2:
        url: https://oapi.dingtalk.com/robot/send?access_token=0102d8ed764e49
        secret: SEC9f447197ae2795ff2bed7abd5b042e26e4ac51f
  template.tmpl: |
    {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
    {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}

    {{ define "__text_alert_list" }}{{ range . }}
    **Labels**
    {{ range .Labels.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    **Annotations**
    {{ range .Annotations.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    **Source:** [{{ .GeneratorURL }}]({{ .GeneratorURL }})
    {{ end }}{{ end }}

    {{ define "default.__text_alert_list" }}{{ range . }}

    ---

    **告警级别:** {{ .Labels.severity | upper }}

    **触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}

    **事件信息:**

    {{ range .Annotations.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}


    {{ end }}

    **事件标签:**

    {{ range .Labels.SortedPairs }}{{ if and (ne (.Name) "severity") (ne (.Name) "summary") (ne (.Name) "team") }}> - {{ .Name }}: {{ .Value | markdown | html }}

    {{ end }}{{ end }}
    {{ end }}
    {{ end }}
    {{ define "default.__text_alertresovle_list" }}{{ range . }}

    ---

    **告警级别:** {{ .Labels.severity | upper }}

    **触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}

    **结束时间:** {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}

    **事件信息:**

    {{ range .Annotations.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}


    {{ end }}

    **事件标签:**

    {{ range .Labels.SortedPairs }}{{ if and (ne (.Name) "severity") (ne (.Name) "summary") (ne (.Name) "team") }}> - {{ .Name }}: {{ .Value | markdown | html }}

    {{ end }}{{ end }}
    {{ end }}
    {{ end }}

    {{/* Default */}}
    {{ define "default.title" }}{{ template "__subject" . }}{{ end }}
    {{ define "default.content" }}#### \[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}\] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
    {{ if gt (len .Alerts.Firing) 0 -}}


    **====⚠️⚠️⚠️trigger alarm====**

    {{ template "default.__text_alert_list" .Alerts.Firing }}


    {{- end }}

    {{ if gt (len .Alerts.Resolved) 0 -}}

    **====[烟花]recover alarm====**

    {{ template "default.__text_alertresovle_list" .Alerts.Resolved }}


    {{- end }}
    {{- end }}

    {{/* Legacy */}}
    {{ define "legacy.title" }}{{ template "__subject" . }}{{ end }}
    {{ define "legacy.content" }}#### \[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}\] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
    {{ template "__text_alert_list" .Alerts.Firing }}
    {{- end }}

    {{/* Following names for compatibility */}}
    {{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
    {{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}
kind: ConfigMap
metadata:
  labels:
    app: webhook-dingtalk
  name: webhook-dingtalk
  namespace: monitoring
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: webhook-dingtalk
  name: webhook-dingtalk
  namespace: monitoring
spec:
  ports:
  - name: http
    port: 8060
    protocol: TCP
    targetPort: 8060
  selector:
    app: webhook-dingtalk
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: webhook-dingtalk
  name: webhook-dingtalk
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: webhook-dingtalk
  template:
    metadata:
      labels:
        app: webhook-dingtalk
    spec:
      containers:
      - args:
        - --web.listen-address=:8060
        - --config.file=/config/config.yaml
        image: marksugar/k8s-prometheus:prometheus_dingtalk_v2.0.0
        name: webhook-dingtalk
        ports:
        - containerPort: 8060
          name: http
        # Only limits are set; Kubernetes defaults requests to the limits.
        resources:
          limits:
            cpu: 100m
            memory: 100Mi
        volumeMounts:
        - mountPath: /config
          name: config
      volumes:
      - configMap:
          name: webhook-dingtalk
        name: config
--------------------------------------------------------------------------------