├── alert.yml ├── etcd.txt ├── one-latest.yml └── readme.md /alert.yml: -------------------------------------------------------------------------------- 1 | apiVersion: monitoring.coreos.com/v1 2 | kind: PrometheusRule 3 | metadata: 4 | labels: 5 | prometheus: k8s 6 | role: alert-rules 7 | name: node-rules 8 | namespace: monitoring 9 | spec: 10 | groups: 11 | - name: external_node_alarm 12 | rules: 13 | - alert: node_host_lost 14 | expr: up{job="external-node"} == 0 15 | for: 1m 16 | labels: 17 | severity: critical 18 | annotations: 19 | summary: "{{$labels.instance}}:service is down" 20 | description: "{{$labels.instance}}: lost contact for 1 minutes" 21 | - alert: HostOutOfMemory10 22 | expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 23 | for: 1m 24 | labels: 25 | severity: critical 26 | annotations: 27 | summary: (instance {{ $labels.instance }})实例内存不足 28 | description: "节点内存已用完 (剩余小于10%)已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 29 | - alert: HostOutOfMemory2 30 | expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 2 31 | for: 1m 32 | labels: 33 | severity: critical 34 | annotations: 35 | summary: (instance {{ $labels.instance }})实例内存不足,可能会发生内存溢出 36 | description: "节点内存已用完 (剩余小于2%)已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 37 | - alert: HostMemoryUnderMemoryPressure 38 | expr: rate(node_vmstat_pgmajfault[1m]) > 1000 39 | for: 1m 40 | labels: 41 | severity: warning 42 | annotations: 43 | summary: (instance {{ $labels.instance }})实例内存压力很大 44 | description: "节点内存压力很大major page错误率高已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 45 | - alert: HostUnusualNetworkThroughputIn 46 | expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 47 | for: 2m 48 | labels: 49 | severity: critical 50 | annotations: 51 | summary: (instance {{ $labels.instance }})实例出现异常的input网络吞吐量 52 | description: "{{ $labels.instance }}网络接口接收超过100MB/s的数据已持续2分钟 \n VALUE = {{ $value 
}}\n LABELS = {{ $labels }}" 53 | - alert: HostUnusualNetworkThroughputOut 54 | expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 55 | for: 2m 56 | labels: 57 | severity: critical 58 | annotations: 59 | summary: (instance {{ $labels.instance }})实例出现异常的output网络吞吐量 60 | description: "{{ $labels.instance }}实例网络发送了超过100MB/s的数据已持续2分钟 (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 61 | - alert: HostUnusualDiskReadRatecritical 62 | expr: sum by (instance) (rate(node_disk_read_bytes_total[1m])) / 1024 / 1024 > 100 63 | for: 1m 64 | labels: 65 | severity: critical 66 | annotations: 67 | summary: (instance {{ $labels.instance }})实例磁盘读取过高 68 | description: "{{ $labels.instance }}实例磁盘读取超过>100MB/s avg [HDD]已持续1分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 69 | - alert: HostUnusualDiskReadRatewarning 70 | expr: sum by (instance) (rate(node_disk_read_bytes_total[1m])) / 1024 / 1024 > 60 71 | for: 1m 72 | labels: 73 | severity: warning 74 | annotations: 75 | summary: (instance {{ $labels.instance }})实例磁盘读取过高 76 | description: "{{ $labels.instance }}实例磁盘读取超过>60MB/s avg [HDD]已持续1分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 77 | - alert: HostUnusualDiskWriteRatewarning 78 | expr: sum by (instance) (rate(node_disk_written_bytes_total[1m])) / 1024 / 1024 > 60 79 | for: 1m 80 | labels: 81 | severity: warning 82 | annotations: 83 | summary: (instance {{ $labels.instance }})实例磁盘写入过高 84 | description: "{{ $labels.instance }}实例磁盘写入超过60MB/s avg [HDD]已持续1分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 85 | - alert: HostUnusualDiskWriteRatecritical 86 | expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 60 87 | for: 2m 88 | labels: 89 | severity: critical 90 | annotations: 91 | summary: (instance {{ $labels.instance }})实例磁盘写入过高 92 | description: "{{ $labels.instance }}实例磁盘写入超过60MB/s avg [HDD]已持续2分钟 \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 93 | # Please add ignored mountpoints in 
node_exporter parameters like 94 | # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". 95 | # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 96 | - alert: HostOutOfDiskSpacefree 97 | expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 98 | for: 2m 99 | labels: 100 | severity: critical 101 | annotations: 102 | summary: (instance {{ $labels.instance }})实例磁盘空间不足 103 | description: "{{ $labels.instance }}磁盘快满了,剩余不足10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 104 | # Please add ignored mountpoints in node_exporter parameters like 105 | # "--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|run)($|/)". 106 | # Same rule using "node_filesystem_free_bytes" will fire when disk fills for non-root users. 107 | - alert: HostOutOfDiskSpacehigh 108 | expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 5 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 109 | for: 1m 110 | labels: 111 | severity: critical 112 | annotations: 113 | summary: (instance {{ $labels.instance }})实例磁盘空间不足 114 | description: "{{ $labels.instance }}磁盘快满了,剩余不足5%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 115 | - alert: HostDiskWillFillIn24Hours 116 | expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0 117 | for: 2m 118 | labels: 119 | severity: critical 120 | annotations: 121 | summary: (instance {{ $labels.instance }})在24小时内就会写满 122 | description: "{{ $labels.instance }}在当前的写入速率下,预计文件系统将会在24小时内耗尽磁盘空间\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 123 | - alert: HostOutOfInodes 124 | expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} 
* 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 125 | for: 2m 126 | labels: 127 | severity: critical 128 | annotations: 129 | summary: (instance {{ $labels.instance }})实例inodes不足 130 | description: "{{ $labels.instance }}磁盘几乎用完了inodes,剩余不足10%\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 131 | - alert: HostInodesWillFillIn24Hours 132 | expr: node_filesystem_files_free{mountpoint ="/rootfs"} / node_filesystem_files{mountpoint="/rootfs"} * 100 < 10 and predict_linear(node_filesystem_files_free{mountpoint="/rootfs"}[1h], 24 *3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/rootfs"} == 0 133 | for: 2m 134 | labels: 135 | severity: critical 136 | annotations: 137 | summary: Host inodes will fill in 24 hours (instance {{ $labels.instance }}) 138 | description: "{{ $labels.instance }}以当前写入速率,文件系统预计将在未来 24小时内耗尽inode\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 139 | - alert: HostUnusualDiskReadLatency 140 | expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.2 and rate(node_disk_reads_completed_total[1m]) > 0 141 | for: 1m 142 | labels: 143 | severity: critical 144 | annotations: 145 | summary: (instance {{ $labels.instance }})实例磁盘读取延迟过高 146 | description: "磁盘读取延迟正在增长,当下大于200ms已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 147 | - alert: HostUnusualDiskWriteLatency 148 | expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0 149 | for: 1m 150 | labels: 151 | severity: warning 152 | annotations: 153 | summary: (instance {{ $labels.instance }})实例磁盘写入延迟过高 154 | description: "磁盘写入延迟正在增长,当下大于100ms已持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 155 | - alert: HostHighCpuLoad85 156 | expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 85 157 | for: 1m 158 | labels: 159 | severity: critical 
160 | annotations: 161 | summary: (instance {{ $labels.instance }})实例CPU负载过高 162 | description: "当前CPU load高于85%持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 163 | - alert: HostHighCpuLoad75 164 | expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[1m])) * 100) > 75 165 | for: 1m 166 | labels: 167 | severity: critical 168 | annotations: 169 | summary: (instance {{ $labels.instance }})实例CPU负载过高 170 | description: "当前CPU load高于75%持续1分钟\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 171 | - alert: HostCpuStealNoisyNeighbor 172 | expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 173 | for: 5m 174 | labels: 175 | severity: warning 176 | annotations: 177 | summary: (instance {{ $labels.instance }})CPU steal time开始增长 178 | description: "CPU steal time大于10%已持续5分钟. 这意味着CPU资源不足或vm资源竞争。当前实例计算能力将开始下降.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 179 | # 1000 context switches is an arbitrary number. 180 | # Alert threshold depends on nature of application. 
181 | # Please read: https://github.com/samber/awesome-prometheus-alerts/issues/58 182 | - alert: HostContextSwitching 183 | expr: (rate(node_context_switches_total[5m])) / (count without(cpu, mode) (node_cpu_seconds_total{mode="idle"})) > 10000 184 | for: 0m 185 | labels: 186 | severity: warning 187 | annotations: 188 | summary: (instance {{ $labels.instance }})主机上下文切换异常 189 | description: "实例上下文切换正在增长,当下超过10000/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 190 | - alert: HostSwapIsFillingUp 191 | expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80 192 | for: 2m 193 | labels: 194 | severity: warning 195 | annotations: 196 | summary: (instance {{ $labels.instance }})swap内存即将用完 197 | description: "Swap使用超过80% \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 198 | - alert: HostSystemdServiceCrashed 199 | expr: node_systemd_unit_state{state="failed"} == 1 200 | for: 0m 201 | labels: 202 | severity: critical 203 | annotations: 204 | summary: 请注意,(instance {{ $labels.instance }})当前实例systemd service服务已崩溃 205 | description: "当前实例systemd service崩溃\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 206 | - alert: HostKernelVersionDeviations 207 | expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1 208 | for: 6h 209 | labels: 210 | severity: warning 211 | annotations: 212 | summary: (instance {{ $labels.instance }})实例内核版本偏差 213 | description: "正在运行的内核版本:\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 214 | - alert: HostOomKillDetected 215 | expr: increase(node_vmstat_oom_kill[1m]) > 0 216 | for: 0m 217 | labels: 218 | severity: critical 219 | annotations: 220 | summary: (instance {{ $labels.instance }})实例发生OOM kill事件 221 | description: "检测到OOM kill\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 222 | - alert: HostNetworkReceiveErrors 223 | expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01 224 | for: 2m 225 | labels: 
226 |         severity: warning 227 |       annotations: 228 |         summary: (instance {{ $labels.instance }})主机网络接收错误 229 |         description: "主机 {{ $labels.instance }} interface {{ $labels.device }} 在过去五分钟内遇到了 {{ printf \"%.0f\" $value }} 接收错误。\n  VALUE = {{ $value }}\n  标签 = {{ $labels }}" 230 |     - alert: HostNetworkTransmitErrors 231 |       expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01 232 |       for: 2m 233 |       labels: 234 |         severity: warning 235 |       annotations: 236 |         summary: (instance {{ $labels.instance }})主机网络传输错误 237 |         description: "主机 {{ $labels.instance }} interface {{ $labels.device }} 在过去五分钟内遇到了 {{ printf \"%.0f\" $value }} 传输错误。\n  VALUE = {{ $value }}\n  标签 = {{ $labels }}" 238 |     - alert: HostNetworkInterfaceSaturated 239 |       expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 240 |       for: 1m 241 |       labels: 242 |         severity: critical 243 |       annotations: 244 |         summary: (instance {{ $labels.instance }})网络接口接近饱和 245 |         description: "\"{{ $labels.instance }}\" 上的网络接口 \"{{ $labels.device }}\" 正在过载。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 246 |     - alert: HostConntrackLimit 247 |       expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8 248 |       for: 5m 249 |       labels: 250 |         severity: warning 251 |       annotations: 252 |         summary: 主机 conntrack 限制(instance {{ $labels.instance }}) 253 |         description: "conntrack 数量接近限制\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 254 |     - alert: HostClockSkew 255 |       expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0) 256 |       for: 2m 257 |       labels: 258 |         severity: warning 259 |       annotations: 260 |         summary: 主机时钟偏差(instance {{ $labels.instance }}) 261 |         description: "检测到时钟偏差。时钟不同步。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 262 |     - alert: 
HostEdacCorrectableErrorsDetected 263 | expr: increase(node_edac_correctable_errors_total[1m]) > 0 264 | for: 0m 265 | labels: 266 | severity: info 267 | annotations: 268 | summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) 269 | description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 270 | - alert: HostEdacUncorrectableErrorsDetected 271 | expr: node_edac_uncorrectable_errors_total > 0 272 | for: 0m 273 | labels: 274 | severity: warning 275 | annotations: 276 | summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) 277 | description: "Host {{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 278 | - alert: node_disk_Utilization_high 279 | expr: rate(node_disk_io_time_seconds_total[5m]) * 100 > 90 280 | for: 1m 281 | labels: 282 | severity: critical 283 | annotations: 284 | summary: "{{$labels.instance}}: disk Utilization > {{ $value }} % " 285 | description: "{{$labels.instance}}: disk Utilization > {{ $value }}. this should attract attention . 
disk performance seems to be problematic .for 1 minutes" 286 | --- 287 | apiVersion: monitoring.coreos.com/v1 288 | kind: PrometheusRule 289 | metadata: 290 |   labels: 291 |     prometheus: k8s 292 |     role: alert-rules 293 |   name: redis-rules 294 |   namespace: monitoring 295 | spec: 296 |   groups: 297 |   - name: redisStatsAlert 298 |     rules: 299 |     - alert: redis_is_down 300 |       expr: up{job="external-redis"} == 0 301 |       for: 1m 302 |       labels: 303 |         severity: critical 304 |       annotations: 305 |         summary: "{{ $labels.instance }} redis is down" 306 |         description: "{{ $labels.instance }} redis is down " 307 | --- 308 | apiVersion: monitoring.coreos.com/v1 309 | kind: PrometheusRule 310 | metadata: 311 |   labels: 312 |     prometheus: k8s 313 |     role: alert-rules 314 |   name: rabbitmq-rules 315 |   namespace: monitoring 316 | spec: 317 |   groups: 318 |   - name: rabbitmqStatsAlert 319 |     rules: 320 |     - alert: rabbitmq_is_down 321 |       expr: up{job="external-rabbitmq"} == 0 322 |       for: 1m 323 |       labels: 324 |         severity: critical 325 |       annotations: 326 |         summary: "{{ $labels.instance }} rabbitmq is down" 327 |         description: "{{ $labels.instance }} rabbitmq is down " 328 | --- 329 | apiVersion: monitoring.coreos.com/v1 330 | kind: PrometheusRule 331 | metadata: 332 |   labels: 333 |     prometheus: k8s 334 |     role: alert-rules 335 |   name: mysql-rules 336 |   namespace: monitoring 337 | spec: 338 |   groups: 339 |   - name: MySQLStatsAlert 340 |     rules: 341 |     - alert: MySQL_is_down 342 |       expr: mysql_up == 0 343 |       for: 1m 344 |       labels: 345 |         severity: critical 346 |       annotations: 347 |         summary: "{{ $labels.instance }} MySQL is down" 348 |         description: "MySQL database is down. This requires immediate action!" 
349 |     - alert: MysqlTooManyConnections 350 |       expr: avg by (instance) (rate(mysql_global_status_threads_connected[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80 351 |       for: 2m 352 |       labels: 353 |         severity: warning 354 |       annotations: 355 |         summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }}) 356 |         description: "超过 80% 的 MySQL 连接在 {{ $labels.instance }} 上使用\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 357 |     - alert: MysqlHighThreadsRunning 358 |       expr: avg by (instance) (rate(mysql_global_status_threads_running[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60 359 |       for: 2m 360 |       labels: 361 |         severity: warning 362 |       annotations: 363 |         summary: MySQL high threads running (instance {{ $labels.instance }}) 364 |         description: "超过 60% 的 MySQL 连接在 {{ $labels.instance }} 上处于运行状态\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 365 |     - alert: MysqlRestarted 366 |       expr: mysql_global_status_uptime < 60 367 |       for: 0m 368 |       labels: 369 |         severity: critical 370 |       annotations: 371 |         summary: MySQL重新启动 (instance {{ $labels.instance }}) 372 |         description: "MySQL 刚刚重新启动,不到一分钟前在 {{ $labels.instance }} 上。\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 373 | ## MySQL Slave IO 线程未运行 374 | #- alert: MysqlSlaveIoThreadNotRunning 375 | #  expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0 376 | #  for: 0m 377 | #  labels: 378 | #    severity: critical 379 | #  annotations: 380 | #    summary: MySQL Slave IO thread not running (instance {{ $labels.instance }}) 381 | #    description: "MySQL Slave IO thread not running on {{ $labels.instance }}\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}" 382 | ## MySQL Slave SQL 线程未运行 383 | #- alert: MysqlSlaveSqlThreadNotRunning 384 | #  expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0 385 | #  for: 0m 386 | #  labels: 387 | #    severity: critical 388 | #  annotations: 
389 | # summary: MySQL Slave SQL thread not running (instance {{ $labels.instance }}) 390 | # description: "MySQL Slave SQL thread not running on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 391 | ## MySQL Slave 复制滞后 392 | #- alert: MysqlSlaveReplicationLag 393 | # expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30 394 | # for: 1m 395 | # labels: 396 | # severity: critical 397 | # annotations: 398 | # summary: MySQL Slave replication lag (instance {{ $labels.instance }}) 399 | # description: "MySQL replication lag on {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 400 | ## MySQL 慢查询 401 | #- alert: MysqlSlowQueries 402 | # expr: increase(mysql_global_status_slow_queries[1m]) > 0 403 | # for: 2m 404 | # labels: 405 | # severity: warning 406 | # annotations: 407 | # summary: MySQL slow queries (instance {{ $labels.instance }}) 408 | # description: "MySQL server mysql has some new slow query.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 409 | ## MySQL innodb 日志写入停滞 MySQL InnoDB 日志等待 410 | #- alert: MysqlInnodbLogWaits 411 | # expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10 412 | # for: 0m 413 | # labels: 414 | # severity: warning 415 | # annotations: 416 | # summary: MySQL InnoDB log waits (instance {{ $labels.instance }}) 417 | # description: "MySQL innodb log writes stalling\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 418 | --- 419 | apiVersion: monitoring.coreos.com/v1 420 | kind: PrometheusRule 421 | metadata: 422 | labels: 423 | prometheus: k8s 424 | role: alert-rules 425 | name: kafka-rules 426 | namespace: monitoring 427 | spec: 428 | groups: 429 | - name: external_kafka_alarm 430 | rules: 431 | - alert: KafkaTopicsReplicas 432 | expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3 433 | for: 0m 434 | labels: 435 | severity: critical 436 | annotations: 437 | summary: Kafka topics replicas 
(instance {{ $labels.instance }}) 438 | description: "Kafka 副本分区\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 439 | - alert: kafka_is_down 440 | expr: up{job="external-kafka"} == 0 441 | for: 0m 442 | labels: 443 | severity: critical 444 | annotations: 445 | summary: Kafka server is down (instance {{ $labels.instance }}) 446 | description: " {{ $labels.instance }} Kafka server is down now!" 447 | - alert: KafkaConsumersGroup 448 | expr: sum(kafka_consumergroup_lag) by (consumergroup) > 8888 449 | for: 1m 450 | labels: 451 | severity: critical 452 | annotations: 453 | summary: Kafka consumers group (instance {{ $labels.instance }}) 454 | description: "Kafka 消费者组\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 455 | - alert: KafkaTopicOffsetDecreased 456 | expr: delta(kafka_burrow_partition_current_offset[1m]) < 0 457 | for: 0m 458 | labels: 459 | severity: warning 460 | annotations: 461 | summary: Kafka topic offset decreased (instance {{ $labels.instance }}) 462 | description: "Kafka 主题偏移量已减少\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 463 | - alert: KafkaConsumerLag 464 | expr: 'kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset >= (kafka_burrow_topic_partition_offset offset 15m - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset offset 15m) 465 | AND kafka_burrow_topic_partition_offset - on(partition, cluster, topic) group_right() kafka_burrow_partition_current_offset > 0' 466 | for: 15m 467 | labels: 468 | severity: critical 469 | annotations: 470 | summary: Kafka consumer lag (instance {{ $labels.instance }}) 471 | description: "卡夫卡消费者有 30 分钟的延迟并且越来越多\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 472 | --- 473 | apiVersion: monitoring.coreos.com/v1 474 | kind: PrometheusRule 475 | metadata: 476 | labels: 477 | prometheus: k8s 478 | role: alert-rules 479 | name: mongodb-rules 480 | namespace: monitoring 481 | spec: 482 | groups: 483 | - name: 
external_mongodb_alarm 484 | rules: 485 | - alert: MongodbDown 486 | expr: mongodb_up == 0 487 | for: 0m 488 | labels: 489 | severity: critical 490 | annotations: 491 | summary: MongoDB Down (instance {{ $labels.instance }}) 492 | description: "MongoDB 实例已关闭\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 493 | - alert: MongodbReplicationLag 494 | #expr: mongodb_mongod_replset_member_optime_date{state="PRIMARY"} - ON (set) mongodb_mongod_replset_member_optime_date{state="SECONDARY"} > 10 495 | expr: avg(mongodb_mongod_replset_member_optime_date{state="PRIMARY"})-avg(mongodb_mongod_replset_member_optime_date{state="SECONDARY"}) > 10 496 | for: 0m 497 | labels: 498 | severity: critical 499 | annotations: 500 | summary: MongoDB replication lag (instance {{ $labels.instance }}) 501 | description: "Mongodb 复制延迟超过 10 秒\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 502 | - alert: MongodbNumberCursorsOpen 503 | expr: mongodb_mongod_metrics_cursor_open{state="total"} > 10 * 1000 504 | for: 2m 505 | labels: 506 | severity: warning 507 | annotations: 508 | summary: MongoDB number cursors open (instance {{ $labels.instance }}) 509 | description: "MongoDB 为客户端打开了太多cursors (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 510 | - alert: MongodbCursorsTimeouts 511 | expr: increase(mongodb_mongod_metrics_cursor_timed_out_total[1m]) > 100 512 | for: 2m 513 | labels: 514 | severity: warning 515 | annotations: 516 | summary: MongoDB cursors timeouts (instance {{ $labels.instance }}) 517 | description: "太多的cursors 超时\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 518 | - alert: MongodbTooManyConnections 519 | expr: avg by(instance) (rate(mongodb_connections{state="current"}[1m])) / avg by(instance) (sum (mongodb_connections) by (instance)) * 100 > 80 520 | for: 2m 521 | labels: 522 | severity: warning 523 | annotations: 524 | summary: MongoDB too many connections (instance {{ $labels.instance }}) 525 | description: "连接过多 (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ 
$labels }}" 526 | --- 527 | apiVersion: monitoring.coreos.com/v1 528 | kind: PrometheusRule 529 | metadata: 530 | labels: 531 | prometheus: k8s 532 | role: alert-rules 533 | name: etcd-rules 534 | namespace: monitoring 535 | spec: 536 | groups: 537 | - name: external_etcd_alarm 538 | rules: 539 | - alert: EtcdInsufficientMembers 540 | expr: count(etcd_server_id) % 2 == 0 541 | for: 0m 542 | labels: 543 | severity: critical 544 | annotations: 545 | summary: etcd cluster 已崩溃 (instance {{ $labels.instance }}) 546 | description: "Etcd 集群应该有奇数个成员, \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 547 | - alert: EtcdNoLeader 548 | expr: etcd_server_has_leader == 0 549 | for: 0m 550 | labels: 551 | severity: critical 552 | annotations: 553 | summary: Etcd 选举失败 (instance {{ $labels.instance }}) 554 | description: "Etcd cluster have no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 555 | - alert: EtcdHighNumberOfLeaderChanges 556 | expr: increase(etcd_server_leader_changes_seen_total[10m]) > 2 557 | for: 0m 558 | labels: 559 | severity: warning 560 | annotations: 561 | summary: Etcd leader 切换异常 (instance {{ $labels.instance }}) 562 | description: "10分钟内leader切换了两次\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 563 | #- alert: EtcdHighNumberOfFailedGrpcRequestswarning 564 | # expr: 100 * sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{grpc_code!="OK",job=~".*etcd.*"}[5m])) / sum by(job, instance, grpc_service, grpc_method) (rate(grpc_server_handled_total{job=~".*etcd.*"}[5m]))> 1 565 | # for: 10m 566 | # labels: 567 | # severity: warning 568 | # annotations: 569 | # summary: Etcd 大量失败的 GRPC 请求 (instance {{ $labels.instance }}) 570 | # description: "在 Etcd 中检测到超过 1% 的 GRPC 请求失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 571 | #- alert: EtcdHighNumberOfFailedGrpcRequestscritical 572 | # expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) / 
sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 573 | # for: 5m 574 | # labels: 575 | # severity: critical 576 | # annotations: 577 | # summary: Etcd 大量失败的 GRPC 请求 (instance {{ $labels.instance }}) 578 | # description: "在 Etcd 中检测到超过 5% 的 GRPC 请求失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 579 | - alert: EtcdGrpcRequestsSlow 580 | expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15 581 | for: 2m 582 | labels: 583 | severity: critical 584 | annotations: 585 | summary: Etcd GRPC 请求缓慢(instance {{ $labels.instance }}) 586 | description: "GRPC 请求变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 587 | - alert: EtcdHighNumberOfFailedHttpRequestswarning 588 | expr: sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 589 | for: 2m 590 | labels: 591 | severity: warning 592 | annotations: 593 | summary: Etcd 大量失败的 HTTP 请求 (instance {{ $labels.instance }}) 594 | description: "在 Etcd 中检测到超过 1% 的 HTTP 失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 595 | - alert: EtcdHighNumberOfFailedHttpRequestscritical 596 | expr: sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 597 | for: 2m 598 | labels: 599 | severity: critical 600 | annotations: 601 | summary: Etcd 大量失败的 HTTP 请求 (instance {{ $labels.instance }}) 602 | description: "在 Etcd 中检测到超过 5% 的 HTTP 失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 603 | - alert: EtcdHttpRequestsSlow 604 | expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15 605 | for: 2m 606 | labels: 607 | severity: warning 608 | annotations: 609 | summary: Etcd HTTP 请求缓慢 (instance {{ $labels.instance }}) 610 | description: "Etcd HTTP 请求变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 611 | - alert: EtcdMemberCommunicationSlow 
612 | expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.2 613 | for: 2m 614 | labels: 615 | severity: critical 616 | annotations: 617 | summary: Etcd成员通讯缓慢 (instance {{ $labels.instance }}) 618 | description: "Etcd 成员通信变慢,99% 超过 0.2s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 619 | - alert: EtcdHighNumberOfFailedProposals 620 | expr: increase(etcd_server_proposals_failed_total[1h]) > 5 621 | for: 2m 622 | labels: 623 | severity: warning 624 | annotations: 625 | summary: Etcd 大量失败的proposals (instance {{ $labels.instance }}) 626 | description: "Etcd 服务器在过去一小时收到了超过 5 个失败的proposals\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 627 | - alert: EtcdHighFsyncDurations 628 | expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5 629 | for: 2m 630 | labels: 631 | severity: critical 632 | annotations: 633 | summary: Etcd fsync 持续时间变高 (instance {{ $labels.instance }}) 634 | description: "Etcd WAL fsync 持续时间增加,99% 超过 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 635 | - alert: EtcdHighCommitDurations 636 | expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25 637 | for: 2m 638 | labels: 639 | severity: warning 640 | annotations: 641 | summary: Etcd 提交持续时间较高 (instance {{ $labels.instance }}) 642 | description: "Etcd 提交持续时间增加,99% 超过 0.25s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" -------------------------------------------------------------------------------- /etcd.txt: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: external_etcd_alarm 3 | rules: 4 | - alert: EtcdInsufficientMembers 5 | expr: count(etcd_server_id) % 2 == 0 6 | for: 0m 7 | labels: 8 | severity: critical 9 | annotations: 10 | summary: etcd cluster 已崩溃 (instance {{ $labels.instance }}) 11 | description: "Etcd 集群应该有奇数个成员, \n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 12 | - alert: EtcdNoLeader 13 | expr: 
etcd_server_has_leader == 0 14 | for: 0m 15 | labels: 16 | severity: critical 17 | annotations: 18 | summary: Etcd no Leader (instance {{ $labels.instance }}) 19 | description: "Etcd cluster have no leader\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 20 | - alert: EtcdHighNumberOfLeaderChanges 21 | expr: increase(etcd_server_leader_changes_seen_total[10m]) > 2 22 | for: 0m 23 | labels: 24 | severity: critical 25 | annotations: 26 | summary: Etcd leader 切换异常 (instance {{ $labels.instance }}) 27 | description: "10分钟内leader切换了两次\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 28 | - alert: EtcdHighNumberOfFailedGrpcRequests 29 | expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code!="OK"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5 30 | for: 5m 31 | labels: 32 | severity: warning 33 | annotations: 34 | summary: Etcd 大量失败的 GRPC 请求 (instance {{ $labels.instance }}) 35 | description: "在 Etcd 中检测到超过 5% 的 GRPC 请求失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 36 | - alert: EtcdGrpcRequestsSlow 37 | expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{grpc_type="unary"}[1m])) by (grpc_service, grpc_method, le)) > 0.15 38 | for: 2m 39 | labels: 40 | severity: critical 41 | annotations: 42 | summary: Etcd GRPC 请求缓慢(instance {{ $labels.instance }}) 43 | description: "GRPC 请求变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 44 | - alert: EtcdHighNumberOfFailedHttpRequests 45 | expr: sum(rate(etcd_http_failed_total[1m])) BY (method) / sum(rate(etcd_http_received_total[1m])) BY (method) > 0.01 46 | for: 2m 47 | labels: 48 | severity: warning 49 | annotations: 50 | summary: Etcd 大量失败的 HTTP 请求 (instance {{ $labels.instance }}) 51 | description: "在 Etcd 中检测到超过 1% 的 HTTP 失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 52 | - alert: EtcdHighNumberOfFailedHttpRequests 53 | expr: sum(rate(etcd_http_failed_total[1m])) BY (method) / 
sum(rate(etcd_http_received_total[1m])) BY (method) > 0.05 54 | for: 2m 55 | labels: 56 | severity: critical 57 | annotations: 58 | summary: Etcd 大量失败的 HTTP 请求 (instance {{ $labels.instance }}) 59 | description: "在 Etcd 中检测到超过 5% 的 HTTP 失败\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 60 | - alert: EtcdHttpRequestsSlow 61 | expr: histogram_quantile(0.99, rate(etcd_http_successful_duration_seconds_bucket[1m])) > 0.15 62 | for: 2m 63 | labels: 64 | severity: critical 65 | annotations: 66 | summary: Etcd HTTP 请求缓慢 (instance {{ $labels.instance }}) 67 | description: "Etcd HTTP 请求变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 68 | - alert: EtcdMemberCommunicationSlow 69 | expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) > 0.15 70 | for: 2m 71 | labels: 72 | severity: critical 73 | annotations: 74 | summary: Etcd成员通讯缓慢 (instance {{ $labels.instance }}) 75 | description: "Etcd 成员通信变慢,99% 超过 0.15s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 76 | - alert: EtcdHighNumberOfFailedProposals 77 | expr: increase(etcd_server_proposals_failed_total[1h]) > 5 78 | for: 2m 79 | labels: 80 | severity: warning 81 | annotations: 82 | summary: Etcd 大量失败的proposals (instance {{ $labels.instance }}) 83 | description: "Etcd 服务器在过去一小时收到了超过 5 个失败的proposals\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 84 | - alert: EtcdHighFsyncDurations 85 | expr: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) > 0.5 86 | for: 2m 87 | labels: 88 | severity: critical 89 | annotations: 90 | summary: Etcd fsync 持续时间变高 (instance {{ $labels.instance }}) 91 | description: "Etcd WAL fsync 持续时间增加,99% 超过 0.5s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" 92 | - alert: EtcdHighCommitDurations 93 | expr: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[1m])) > 0.25 94 | for: 2m 95 | labels: 96 | severity: critical 97 | annotations: 98 | summary: Etcd 提交持续时间较高 (instance {{ 
$labels.instance }})
      description: "Etcd 提交持续时间增加,99% 超过 0.25s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
--------------------------------------------------------------------------------
/one-latest.yml:
--------------------------------------------------------------------------------
apiVersion: v1
data: {}
kind: Secret
metadata:
  name: alertmanager-main
  namespace: monitoring
stringData:
  # Alertmanager main configuration: all alerts go to the DingTalk webhook
  # proxy; critical alerts to robot webhook2, warnings to robot webhook1.
  alertmanager.yaml: |-
    global:
      resolve_timeout: 5m
    route:
      group_by: ['alertname']
      group_wait: 10s
      # NOTE(review): a 10s group_interval is unusually aggressive -- confirm
      # the notification volume is acceptable.
      group_interval: 10s
      repeat_interval: 12h
      receiver: 'webhookw'
      routes:
      - receiver: 'webhooke'
        match:
          severity: 'critical'
      # FIX: was `match: severity: '~(warning)$'`. `match` is an exact
      # equality test, so the regex-looking value could never equal a real
      # severity label; `match_re` performs the intended regex match.
      - receiver: 'webhookw'
        match_re:
          severity: '(warning)$'
    receivers:
    - name: 'webhookw'
      webhook_configs:
      - send_resolved: true
        url: 'http://webhook-dingtalk:8060/dingtalk/webhook1/send'
    - name: 'webhooke'
      webhook_configs:
      - send_resolved: true
        url: 'http://webhook-dingtalk:8060/dingtalk/webhook2/send'
    # A firing critical alert suppresses the matching warning alert for the
    # same alertname/dev/instance combination.
    inhibit_rules:
    - source_match:
        severity: 'critical'
      target_match:
        severity: 'warning'
      equal: ['alertname', 'dev', 'instance']
---
apiVersion: v1
data:
  config.yaml: |
    ## Request timeout
    # timeout: 5s
    ## Customizable templates path
    templates:
      - /config/template.tmpl

    ## You can also override default template using `default_message`
    ## The following example to use the 'legacy' template from v0.3.0
    # default_message:
    #   title: '{{ template "legacy.title" . }}'
    #   text: '{{ template "legacy.content" . }}'
    ## SECURITY(review): DingTalk access tokens and signing secrets are kept
    ## in a plain ConfigMap, readable by anyone with namespace access; they
    ## should be moved to a Kubernetes Secret.
    targets:
      webhook1:
        url: https://oapi.dingtalk.com/robot/send?access_token=ee2d09e90b0d12
        # secret for signature
        secret: SECd9242c03ffac2277f0c9
      webhook2:
        url: https://oapi.dingtalk.com/robot/send?access_token=0102d8ed764e49
        secret: SEC9f447197ae2795ff2bed7abd5b042e26e4ac51f
  template.tmpl: |
    {{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
    {{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}

    {{ define "__text_alert_list" }}{{ range . }}
    **Labels**
    {{ range .Labels.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    **Annotations**
    {{ range .Annotations.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}
    {{ end }}
    **Source:** [{{ .GeneratorURL }}]({{ .GeneratorURL }})
    {{ end }}{{ end }}

    {{ define "default.__text_alert_list" }}{{ range . }}

    ---

    **告警级别:** {{ .Labels.severity | upper }}

    **触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}

    **事件信息:**

    {{ range .Annotations.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}


    {{ end }}

    **事件标签:**

    {{ range .Labels.SortedPairs }}{{ if and (ne (.Name) "severity") (ne (.Name) "summary") (ne (.Name) "team") }}> - {{ .Name }}: {{ .Value | markdown | html }}

    {{ end }}{{ end }}
    {{ end }}
    {{ end }}
    {{ define "default.__text_alertresovle_list" }}{{ range . }}

    ---

    **告警级别:** {{ .Labels.severity | upper }}

    **触发时间:** {{ dateInZone "2006.01.02 15:04:05" (.StartsAt) "Asia/Shanghai" }}

    **结束时间:** {{ dateInZone "2006.01.02 15:04:05" (.EndsAt) "Asia/Shanghai" }}

    **事件信息:**

    {{ range .Annotations.SortedPairs }}> - {{ .Name }}: {{ .Value | markdown | html }}


    {{ end }}

    **事件标签:**

    {{ range .Labels.SortedPairs }}{{ if and (ne (.Name) "severity") (ne (.Name) "summary") (ne (.Name) "team") }}> - {{ .Name }}: {{ .Value | markdown | html }}

    {{ end }}{{ end }}
    {{ end }}
    {{ end }}

    {{/* Default */}}
    {{ define "default.title" }}{{ template "__subject" . }}{{ end }}
    {{ define "default.content" }}#### \[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}\] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
    {{ if gt (len .Alerts.Firing) 0 -}}


    **====⚠️⚠️⚠️trigger alarm====**

    {{ template "default.__text_alert_list" .Alerts.Firing }}


    {{- end }}

    {{ if gt (len .Alerts.Resolved) 0 -}}

    **====[烟花]recover alarm====**

    {{ template "default.__text_alertresovle_list" .Alerts.Resolved }}


    {{- end }}
    {{- end }}

    {{/* Legacy */}}
    {{ define "legacy.title" }}{{ template "__subject" . }}{{ end }}
    {{ define "legacy.content" }}#### \[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}\] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
    {{ template "__text_alert_list" .Alerts.Firing }}
    {{- end }}

    {{/* Following names for compatibility */}}
    {{ define "ding.link.title" }}{{ template "default.title" . }}{{ end }}
    {{ define "ding.link.content" }}{{ template "default.content" . }}{{ end }}
kind: ConfigMap
metadata:
  labels:
    app: webhook-dingtalk
  name: webhook-dingtalk
  namespace: monitoring
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: webhook-dingtalk
  name: webhook-dingtalk
  namespace: monitoring
spec:
  ports:
  - name: http
    port: 8060
    protocol: TCP
    targetPort: 8060
  selector:
    app: webhook-dingtalk
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: webhook-dingtalk
  name: webhook-dingtalk
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: webhook-dingtalk
  template:
    metadata:
      labels:
        app: webhook-dingtalk
    spec:
      containers:
      - args:
        - --web.listen-address=:8060
        - --config.file=/config/config.yaml
        image: marksugar/k8s-prometheus:prometheus_dingtalk_v2.0.0
        name: webhook-dingtalk
        ports:
        - containerPort: 8060
          name: http
        # Only limits are set; Kubernetes defaults requests to the limits.
        resources:
          limits:
            cpu: 100m
            memory: 100Mi
        volumeMounts:
        - mountPath: /config
          name: config
      volumes:
      - configMap:
          name: webhook-dingtalk
        name: config
--------------------------------------------------------------------------------