├── Prometheus-Server-Architecture-v2.png ├── Prometheus-Server-Architecture-v3.png ├── Prometheus-Server-Architecture.png ├── References.md ├── mtail ├── messages.mtail └── mysql-slow.mtail ├── oracle_collectors └── oracle.collector.yml └── rules ├── adhocteam-script.rules ├── alertmanager.rules ├── any.rules ├── baas-gateway.rules ├── blackbox.rules ├── card-server-proxy.rules ├── consul.rules ├── docker.rules ├── jvm.rules ├── kafka3node.rules ├── linux-bigdata.rules ├── linux.rules ├── messages.rules ├── mysql.rules ├── nginx.rules ├── oracle-h6.rules ├── oracle.rules ├── pgbouncer.rules ├── postgresql-bigdata.rules ├── postgresql.rules ├── process.rules ├── rabbitmq.rules └── windows.rules /Prometheus-Server-Architecture-v2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanghy8166/prometheus/44e7df52d6b702529219888bb001c188d6a1e948/Prometheus-Server-Architecture-v2.png -------------------------------------------------------------------------------- /Prometheus-Server-Architecture-v3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanghy8166/prometheus/44e7df52d6b702529219888bb001c188d6a1e948/Prometheus-Server-Architecture-v3.png -------------------------------------------------------------------------------- /Prometheus-Server-Architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/wanghy8166/prometheus/44e7df52d6b702529219888bb001c188d6a1e948/Prometheus-Server-Architecture.png -------------------------------------------------------------------------------- /References.md: -------------------------------------------------------------------------------- 1 | # rules 2 | https://awesome-prometheus-alerts.grep.to 3 | 4 | https://www.prometheus.wang/di-wu-zhang-jing-bao/chang-yong-cha-xun-alert-rules 5 | 6 | 
https://github.com/coreos/kube-prometheus/blob/master/manifests/prometheus-rules.yaml 7 | 8 | https://ranchermanager.docs.rancher.com/zh/integrations-in-rancher/monitoring-and-alerting/promql-expressions 9 | 10 | # dashboards 11 | https://grafana.com/orgs/wanghy8166/dashboards 12 | 13 | https://github.com/percona/grafana-dashboards/tree/master/dashboards 14 | 15 | https://github.com/grafana/kubernetes-app 16 | 17 | # 云厂商dashboards 18 | https://github.com/aliyun/aliyun-cms-grafana 19 | 20 | https://github.com/TencentCloud/tencentcloud-monitor-grafana-app 21 | 22 | https://github.com/monitoringartist/grafana-aws-cloudwatch-dashboards 23 | 24 | # 常用exporter 25 | https://prometheus.io/docs/instrumenting/exporters 26 | 27 | https://github.com/prometheus/prometheus/wiki/Default-port-allocations 28 | 29 | # 自定义exporter 30 | - 自己写 exporter 暴露指标 31 | - 自己写脚本发给 pushgateway 暴露指标 32 | - 通过 https://github.com/prometheus/client_python 写脚本暴露指标 33 | - 通过 https://github.com/prometheus/client_python 写脚本发给 pushgateway 暴露指标 34 | 35 | - 自己写脚本可以参考以下工具列表 36 | https://prometheus.io/docs/instrumenting/clientlibs/ 37 | 38 | - 此外,也有第三方写好的exporter,直接加脚本套用即可 39 | https://github.com/gree-gorey/bash-exporter 探测:脚本执行输出 40 | https://github.com/adhocteam/script_exporter 探测:脚本执行耗时、脚本执行是否成功 41 | https://github.com/ncabatoff/script-exporter 探测:脚本执行输出、脚本执行耗时、脚本是否在执行中、脚本执行次数 42 | https://github.com/ricoberger/script_exporter 探测:脚本执行输出、脚本执行耗时、脚本执行是否成功 43 | 44 | # 在业务中集成自定义Metrics,不需要exporter 45 | https://cloud.tencent.com/document/product/1416/56030 46 | http://ylzheng.com/2018/01/24/use-prometheus-monitor-your-spring-boot-application/ 47 | 48 | # 远端存储方案 49 | https://prometheus.io/docs/prometheus/latest/storage 50 | 51 | https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage 52 | 53 | # mtail日志分析 54 | 实时或近实时监视日志,以用于性能测量和告警,没有ELK的时候就用它吧。 55 | https://github.com/google/mtail 56 | 1. 在日志更新较快的时候,可以修改scrape_interval:5s让prometheus尽快抓取。 57 | 2. 
由于抓取始终有间隔,在高并发场景,可以记录"汇总执行时间"、"汇总执行次数",再换算得出所要的指标。 58 | 具体可参考 59 | https://github.com/google/mtail/blob/master/docs/Programming-Guide.md#storing-intermediate-state 60 | https://github.com/google/mtail/blob/master/docs/Programming-Guide.md#computing-moving-averages 61 | -------------------------------------------------------------------------------- /mtail/messages.mtail: -------------------------------------------------------------------------------- 1 | counter terminated 2 | counter ailed 3 | counter killed 4 | counter lines_total 5 | 6 | # To make ex_test.go happy 7 | strptime("2017-12-07T16:07:14Z", "2006-01-02T15:04:05Z07:00") 8 | 9 | /(.*)/ { 10 | $1 =~ /terminated/ { 11 | terminated++ 12 | } 13 | $1 =~ /ailed/ { 14 | ailed++ 15 | } 16 | $1 =~ /killed/ { 17 | killed++ 18 | } 19 | lines_total++ 20 | } -------------------------------------------------------------------------------- /mtail/mysql-slow.mtail: -------------------------------------------------------------------------------- 1 | # mysql-slow.mtail 取MySQL慢日志里面的查询时间 2 | gauge tmp_query_time 3 | gauge tmp_lock_time 4 | # Query_time: 3.000143 Lock_time: 0.000000 Rows_sent: 1 Rows_examined: 0 5 | const QUERY_TIME /^# Query_time: (\d+\.\d+)\s*Lock_time: (\d+\.\d+)/ 6 | 7 | // + QUERY_TIME { 8 | tmp_query_time = $1 9 | tmp_lock_time = $2 10 | } 11 | -------------------------------------------------------------------------------- /oracle_collectors/oracle.collector.yml: -------------------------------------------------------------------------------- 1 | # cd /usr/local/database_exporter/config/oracle_collectors 2 | # cat oracle.collector.yml 3 | 4 | # A collector defining standard metrics for Oracle Database Server. 5 | # 6 | # 7 | 8 | collector_name: oracle_metrics 9 | 10 | # Similar to global.min_interval, but applies to the queries defined by this collector only. 
11 | # min_interval: 0s 12 | 13 | metrics: 14 | # SELECT decode(DATABASE_ROLE, 'PRIMARY', '主库执行sql', '备库执行sql') alias FROM V$DATABASE 15 | 16 | # tablespace_size 17 | - metric_name: oracle_tablespace_size 18 | type: gauge 19 | help: 'Size of oracle db tablespaces.' 20 | values: 21 | - free_rate 22 | key_labels: 23 | - tablespace_name 24 | query: | 25 | SELECT a.tablespace_name "tablespace_name", 26 | round((nvl(b.free_mb, 0) + a.enextensible_mb) / a.maxtotal_mb * 100, 2) "free_rate" 27 | FROM (SELECT tablespace_name, 28 | SUM(decode(autoextensible, 'YES', maxbytes, bytes)) / 1024 / 1024 maxtotal_mb, 29 | SUM(decode(autoextensible, 'YES', (maxbytes - bytes), 0)) / 1024 / 1024 enextensible_mb 30 | FROM dba_data_files 31 | GROUP BY tablespace_name) a, 32 | (SELECT tablespace_name, SUM(bytes) / 1024 / 1024 free_mb 33 | FROM dba_free_space 34 | GROUP BY tablespace_name) b 35 | WHERE a.tablespace_name = b.tablespace_name(+) 36 | and a.TABLESPACE_NAME not like 'UNDOTBS%' 37 | and (nvl(b.free_mb, 0) + a.enextensible_mb) < 20 * 1024 38 | 39 | # asm_space 40 | - metric_name: oracle_asm_space 41 | type: gauge 42 | help: 'Size of oracle db asm_space.' 43 | values: 44 | - free_rate 45 | key_labels: 46 | - name 47 | query: | 48 | select name "name", round((free_mb / total_mb) * 100, 2) "free_rate" from v$ASM_DISKGROUP 49 | 50 | # 归档 51 | - metric_name: oracle_archive_size 52 | type: gauge 53 | help: 'Size of oracle db archive.' 54 | values: 55 | - value 56 | key_labels: 57 | - archive_mb 58 | query: | 59 | select 'archive_mb' "archive_mb",nvl(round(sum(blocks * block_size) / 1024 / 1024), 0) "value" 60 | from v$archived_log 61 | where thread# = (select thread# from v$instance) 62 | and FIRST_TIME >= (sysdate - 30 / 24 / 60) 63 | 64 | # dbtime_load 65 | - metric_name: oracle_dbtime_load 66 | type: gauge 67 | help: 'Size of oracle db dbtime_load.' 
68 | values: 69 | - value 70 | key_labels: 71 | - dbtime_load 72 | query: | 73 | SELECT 'dbtime_load' "dbtime_load",nvl(decode(DATABASE_ROLE, 'PRIMARY', ( 74 | select round(((real.VALUE - last.value) / 1000000 / 60) / ((sysdate + 1 / 24 / 60 / 60 - to_date(to_char(end_interval_time, 'yyyymmddHH24MISS'), 'yyyymmddHH24MISS')) * 24 * 60) * 100, 2) 75 | from (select * from (select t.end_interval_time, v.value 76 | FROM DBA_HIST_SYS_TIME_MODEL v, dba_hist_snapshot t 77 | WHERE v.STAT_NAME = 'DB time' 78 | and t.snap_id = v.snap_id 79 | and t.instance_number = v.instance_number 80 | and v.instance_number = (select instance_number from v$instance) 81 | and t.begin_interval_time >= sysdate - 140 / (24 * 60) 82 | order by t.snap_id desc) 83 | where rownum = 1) last, v$SYS_TIME_MODEL real 84 | where real.stat_name = 'DB time' 85 | ), 0),0) "value" FROM V$DATABASE 86 | 87 | # sessions 88 | - metric_name: oracle_session_active 89 | type: gauge 90 | help: 'Size of oracle db session_active.' 91 | values: 92 | - value 93 | key_labels: 94 | - session_active 95 | query: | 96 | select 'session_active' "session_active",count(1) "value" from v$session where TYPE='USER' and status='ACTIVE' 97 | 98 | # dg_archived_log 99 | - metric_name: oracle_dg_archived_log 100 | type: gauge 101 | help: 'Size of oracle db dg_archived_log.' 102 | values: 103 | - value 104 | key_labels: 105 | - dg_archived_log 106 | query: | 107 | SELECT 'dg_archived_log' "dg_archived_log",decode(DATABASE_ROLE, 108 | 'PRIMARY', 109 | 0, 110 | (select count(1) from v$archived_log where applied = 'NO')) "value" 111 | FROM V$DATABASE 112 | 113 | # checkBitAttack 114 | - metric_name: oracle_checkBitAttack 115 | type: gauge 116 | help: 'Size of oracle db checkBitAttack.' 
117 | values: 118 | - value 119 | key_labels: 120 | - checkBitAttack 121 | query: | 122 | Select 'checkBitAttack' "checkBitAttack",Count(1) "value" 123 | From (Select 1 124 | From Dba_Triggers 125 | Where Trigger_Name Like 'DBMS_%_INTERNAL%' 126 | Union All 127 | Select 1 128 | From Dba_Procedures 129 | Where Object_Name Like 'DBMS_%_INTERNAL%' 130 | Union All 131 | Select 1 132 | From Dba_Objects 133 | Where Object_Name Like 'DBMS_CORE_INTERNA%' 134 | Or Object_Name Like 'DBMS_SYSTEM_INTERNA%' 135 | Or Object_Name Like 'DBMS_SUPPORT_INTERNA%' 136 | Or Object_Name Like 'DBMS_STANDARD_FUN9%' 137 | Or Object_Name Like 'ORACHK%' 138 | Union All 139 | Select 1 140 | From Dba_Jobs 141 | Where What Like 'DBMS_STANDARD_FUN9%') 142 | 143 | # DBA_JOBS 144 | - metric_name: oracle_dba_jobs 145 | type: gauge 146 | help: 'FAILURES DBA_JOBS' 147 | values: 148 | - FAILURES 149 | key_labels: 150 | - JOB 151 | - WHAT 152 | query: | 153 | SELECT JOB, WHAT, FAILURES FROM DBA_JOBS WHERE FAILURES >= 3 154 | 155 | -------------------------------------------------------------------------------- /rules/adhocteam-script.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: adhocteam-script 3 | rules: 4 | - alert: "adhocteam-script" 5 | expr: script_success == 0 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "adhocteam-script,{{ $labels.script }}脚本执行失败" 12 | description: "adhocteam-script,{{ $labels.script }}脚本执行失败" 13 | -------------------------------------------------------------------------------- /rules/alertmanager.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: alertmanager 3 | rules: 4 | - alert: "level_error" 5 | expr: irate(level_error[5m]) > 0 6 | for: 10s 7 | labels: 8 | severity: "警告" 9 | annotations: 10 | value: "{{ $value }}%" 11 | summary: "日志匹配到关键字level=error,重启可临时解决,请排查" 12 | description: 
"日志匹配到关键字level=error,重启可临时解决,请排查" 13 | -------------------------------------------------------------------------------- /rules/any.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: any 3 | rules: 4 | - alert: "InstanceDown" 5 | expr: up{exported_job !~ "oracle"} == 0 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "{{ $labels.job }}实例Down" 12 | description: "{{ $labels.job }}实例Down" 13 | 14 | - alert: "InstanceDown-oracle" 15 | expr: up{exported_job =~ "oracle"} == 0 16 | for: 5m 17 | labels: 18 | severity: "严重" 19 | annotations: 20 | value: "{{ $value }}" 21 | summary: "{{ $labels.exported_instance }}实例Down" 22 | description: "{{ $labels.exported_instance }}实例Down" -------------------------------------------------------------------------------- /rules/baas-gateway.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: baas-gateway 3 | rules: 4 | - alert: "OutOfDirectMemoryError" 5 | expr: irate(OutOfDirectMemoryError[5m]) > 0 6 | for: 10s 7 | labels: 8 | severity: "警告" 9 | annotations: 10 | value: "{{ $value }}%" 11 | summary: "日志匹配到关键字OutOfDirectMemoryError,请排查" 12 | description: "日志匹配到关键字OutOfDirectMemoryError,请排查" 13 | -------------------------------------------------------------------------------- /rules/blackbox.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: blackbox 3 | rules: 4 | - alert: "blackbox_exporter_config_last_reload_successful" 5 | expr: blackbox_exporter_config_last_reload_successful == 0 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "blackbox_exporter配置载入失败" 12 | description: "blackbox_exporter配置载入失败" 13 | 14 | - alert: "probe_success" 15 | expr: probe_success == 0 16 | for: 5m 17 | labels: 18 | severity: "严重" 19 | annotations: 20 | value: "{{ $value }}" 21 | 
summary: "地址访问失败" 22 | description: "地址访问失败" -------------------------------------------------------------------------------- /rules/card-server-proxy.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: card-server-proxy 3 | rules: 4 | - alert: "success-false" 5 | expr: irate(success[5m]) == irate(false[5m]) and irate(false[5m]) > 0 6 | for: 10s 7 | labels: 8 | severity: "警告" 9 | annotations: 10 | value: "{{ $value }}%" 11 | summary: "接口服务card-server-proxy告警,日志匹配到关键字success:false,请排查" 12 | description: "接口服务card-server-proxy告警,日志匹配到关键字success:false,请排查" 13 | -------------------------------------------------------------------------------- /rules/consul.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: consul 3 | rules: 4 | - alert: "consul_up" 5 | expr: consul_up==0 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "consul Down" 12 | description: "consul Down" 13 | -------------------------------------------------------------------------------- /rules/docker.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: docker 3 | rules: 4 | - alert: "DockerInstanceDown" 5 | expr: rate(container_last_seen{image!=""}[15m])*100 <= 99 6 | for: 5m 7 | labels: 8 | severity: "警告" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "Docker容器{{ $labels.name }}异常:停了/删除/重建/网络" 12 | description: "Docker容器{{ $labels.name }}异常:停了/删除/重建/网络" 13 | -------------------------------------------------------------------------------- /rules/jvm.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: jvm 3 | rules: 4 | - alert: "jmx_scrape_error" 5 | expr: jmx_scrape_error == 1 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "jmx拉取数据异常" 12 | description: 
"jmx拉取数据异常" 13 | 14 | - alert: "heap_use" 15 | expr: jvm_memory_bytes_used{area="heap"} / jvm_memory_bytes_max{area="heap"} * 100 > 95 or java_lang_Memory_HeapMemoryUsage_used / java_lang_Memory_HeapMemoryUsage_max * 100 > 95 16 | for: 5m 17 | labels: 18 | severity: "警告" 19 | annotations: 20 | value: "{{ $value }}%" 21 | summary: "堆内存利用率,5分钟持续>95%" 22 | description: "堆内存利用率,5分钟持续>95%" 23 | 24 | - alert: "ThreadCount" 25 | expr: java_lang_Threading_ThreadCount>500 26 | for: 5m 27 | labels: 28 | severity: "严重" 29 | annotations: 30 | value: "{{ $value }}" 31 | summary: "ThreadCount>500" 32 | description: "ThreadCount>500" 33 | 34 | - alert: "GC-Throughput" 35 | expr: ( java_lang_Runtime_Uptime - ignoring(name) java_lang_GarbageCollector_CollectionTime{name=~"ParNew"} - ignoring(name) java_lang_GarbageCollector_CollectionTime{name=~"ConcurrentMarkSweep"} )/java_lang_Runtime_Uptime < 0.96 36 | for: 5m 37 | labels: 38 | severity: "严重" 39 | annotations: 40 | value: "{{ $value }}" 41 | summary: "GC总吞吐率<96%" 42 | description: "GC总吞吐率<96%" 43 | -------------------------------------------------------------------------------- /rules/kafka3node.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: kafka 3 | rules: 4 | - alert: "kafka_brokers" 5 | expr: kafka_brokers < 3 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "kafka_brokers < 3" 12 | description: "kafka_brokers < 3" 13 | -------------------------------------------------------------------------------- /rules/linux-bigdata.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: linux 3 | rules: 4 | - alert: "cpu_use" 5 | expr: 100 - (avg by (instance,server,project) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85 6 | for: 30m 7 | labels: 8 | severity: "警告" 9 | annotations: 10 | value: "{{ $value }}%" 11 | summary: "cpu高" 12 | description: 
"cpu利用率,30分钟平均值>85%" 13 | 14 | - alert: "cpu_iowait_use" 15 | expr: avg by (instance,server,project) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 35 16 | for: 30m 17 | labels: 18 | severity: "警告" 19 | annotations: 20 | value: "{{ $value }}%" 21 | summary: "cpu_iowait高" 22 | description: "cpu_iowait利用率,30分钟平均值>35%" 23 | 24 | - alert: "disk_use" 25 | expr: 100 - (node_filesystem_avail_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})*100 > 80 26 | for: 30m 27 | labels: 28 | severity: "灾难" 29 | annotations: 30 | value: "{{ $value }}%" 31 | summary: "磁盘快满了" 32 | description: "{{ $labels.mountpoint }}磁盘利用率,30分钟持续>80%" 33 | 34 | - alert: "disk_inode_use" 35 | expr: 100 - (node_filesystem_files_free{fstype=~"ext4|xfs"} / node_filesystem_files{fstype=~"ext4|xfs"})*100 > 80 36 | for: 10m 37 | labels: 38 | severity: "灾难" 39 | annotations: 40 | value: "{{ $value }}%" 41 | summary: "磁盘inode快满了" 42 | description: "{{ $labels.mountpoint }}磁盘inode利用率,10分钟持续>80%" 43 | -------------------------------------------------------------------------------- /rules/linux.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: linux 3 | rules: 4 | - alert: "cpu_use" 5 | expr: 100 - (avg by (instance,server,project) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85 6 | for: 5m 7 | labels: 8 | severity: "警告" 9 | annotations: 10 | value: "{{ $value }}%" 11 | summary: "cpu高" 12 | description: "cpu利用率,5分钟平均值>85%" 13 | 14 | - alert: "cpu_iowait_use" 15 | expr: avg by (instance,server,project) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 35 16 | for: 5m 17 | labels: 18 | severity: "警告" 19 | annotations: 20 | value: "{{ $value }}%" 21 | summary: "cpu_iowait高" 22 | description: "cpu_iowait利用率,5分钟平均值>35%" 23 | 24 | - alert: "disk_use" 25 | expr: 100 - (node_filesystem_avail_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})*100 > 95 26 | for: 5m 27 | labels: 
28 | severity: "灾难" 29 | annotations: 30 | value: "{{ $value }}%" 31 | summary: "磁盘快满了" 32 | description: "{{ $labels.mountpoint }}磁盘利用率,5分钟持续>95%" 33 | 34 | - alert: "disk_inode_use" 35 | expr: 100 - (node_filesystem_files_free{fstype=~"ext4|xfs"} / node_filesystem_files{fstype=~"ext4|xfs"})*100 > 95 36 | for: 5m 37 | labels: 38 | severity: "灾难" 39 | annotations: 40 | value: "{{ $value }}%" 41 | summary: "磁盘inode快满了" 42 | description: "{{ $labels.mountpoint }}磁盘inode利用率,5分钟持续>95%" 43 | 44 | - alert: "mem_use" 45 | expr: 100-(node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)*100>95 46 | for: 5m 47 | labels: 48 | severity: "灾难" 49 | annotations: 50 | value: "{{ $value }}%" 51 | summary: "内存利用率,5分钟持续>95%" 52 | description: "内存利用率,5分钟持续>95%" 53 | 54 | -------------------------------------------------------------------------------- /rules/messages.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: messages 3 | rules: 4 | - alert: "messages-killed" 5 | expr: irate(killed[5m]) > 0 6 | for: 10s 7 | labels: 8 | severity: "警告" 9 | annotations: 10 | value: "{{ $value }}%" 11 | summary: "有进程(pg)被杀了" 12 | description: "有进程(pg)被杀了" -------------------------------------------------------------------------------- /rules/mysql.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: mysql 3 | rules: 4 | - alert: "MySQLDown" 5 | expr: mysql_up == 0 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "MySQL Down" 12 | description: "MySQL Down" 13 | 14 | - alert: "slave_io_running" 15 | expr: mysql_slave_status_slave_io_running == 0 16 | for: 5m 17 | labels: 18 | severity: "严重" 19 | annotations: 20 | value: "{{ $value }}" 21 | summary: "MySQL slave_io_running Down" 22 | description: "MySQL slave_io_running Down" 23 | 24 | - alert: "slave_sql_running" 25 | expr: mysql_slave_status_slave_sql_running == 0 26 | for: 
5m 27 | labels: 28 | severity: "严重" 29 | annotations: 30 | value: "{{ $value }}" 31 | summary: "MySQL slave_sql_running Down" 32 | description: "MySQL slave_sql_running Down" 33 | 34 | - alert: "seconds_behind_master" 35 | expr: mysql_slave_status_seconds_behind_master > 1800 36 | for: 5m 37 | labels: 38 | severity: "警告" 39 | annotations: 40 | value: "{{ $value }}" 41 | summary: "MySQL从库复制延迟>30分钟" 42 | description: "MySQL从库复制延迟>30分钟" 43 | -------------------------------------------------------------------------------- /rules/nginx.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: nginx 3 | rules: 4 | - alert: "nginxDown" 5 | expr: nginx_up == 0 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "nginx Down" 12 | description: "nginx Down" 13 | -------------------------------------------------------------------------------- /rules/oracle-h6.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: oracle-h6 3 | rules: 4 | - alert: "HD_MONITORLOG" 5 | expr: HD_MONITORLOG == 1 6 | for: 5m 7 | labels: 8 | severity: "警告" 9 | annotations: 10 | value: "{{ $value }}%" 11 | summary: "oracle业务监控异常" 12 | description: "oracle业务监控异常: {{ $labels.CONTENT }} " 13 | 14 | - alert: "BUY1POOLS" 15 | expr: BUY1POOLS > 10000 16 | for: 5m 17 | labels: 18 | severity: "警告" 19 | annotations: 20 | value: "{{ $value }}%" 21 | summary: "BUY1POOLS > 1w" 22 | description: "BUY1POOLS > 1w" 23 | 24 | - alert: "V_CUSTOM_LOG" 25 | expr: V_CUSTOM_LOG == 1 26 | for: 5m 27 | labels: 28 | severity: "警告" 29 | annotations: 30 | value: "{{ $value }}%" 31 | summary: "自定义告警" 32 | description: "自定义告警: {{ $labels.CONTENT }} " 33 | 34 | 35 | -------------------------------------------------------------------------------- /rules/oracle.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: 
oracle 3 | rules: 4 | - alert: "oracle_tablespace_size" 5 | expr: oracle_tablespace_size{exported_instance=~".*",tablespace_name=~".*"} < 10 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}%" 11 | summary: "oracle表空间不足" 12 | description: "oracle {{ $labels.exported_instance }} 表空间{{ $labels.tablespace_name }} < 10% 且 < 20G" 13 | 14 | - alert: "oracle_asm_space" 15 | expr: oracle_asm_space{exported_instance=~".*",name=~".*"} < 10 16 | for: 5m 17 | labels: 18 | severity: "严重" 19 | annotations: 20 | value: "{{ $value }}%" 21 | summary: "oracle-asm磁盘空间不足" 22 | description: "oracle {{ $labels.exported_instance }} asm磁盘{{ $labels.name }} < 10%" 23 | 24 | - alert: "oracle_dbtime_load" 25 | expr: oracle_dbtime_load{exported_instance=~".*"} > 2000 26 | for: 5m 27 | labels: 28 | severity: "警告" 29 | annotations: 30 | value: "{{ $value }}%" 31 | summary: "oracle_dbtime_load负载高" 32 | description: "oracle {{ $labels.exported_instance }} dbtime_load负载 > 2000%" 33 | 34 | - alert: "oracle_dg_archived_log" 35 | expr: oracle_dg_archived_log{exported_instance=~".*"} > 2 36 | for: 30m 37 | labels: 38 | severity: "警告" 39 | annotations: 40 | value: "{{ $value }}%" 41 | summary: "ADG备库同步有延迟" 42 | description: "oracle {{ $labels.exported_instance }} ADG备库同步有延迟,dg_archived_log > 2" 43 | 44 | - alert: "oracle_checkBitAttack" 45 | expr: oracle_checkBitAttack{exported_instance=~".*"} > 0 46 | for: 5m 47 | labels: 48 | severity: "灾难" 49 | annotations: 50 | value: "{{ $value }}%" 51 | summary: "Oracle中有勒索脚本" 52 | description: "oracle {{ $labels.exported_instance }} 有勒索脚本,checkBitAttack > 0" 53 | 54 | - alert: "oracle_dba_jobs" 55 | expr: oracle_dba_jobs{exported_instance=~".*"} >= 3 56 | for: 5m 57 | labels: 58 | severity: "严重" 59 | annotations: 60 | value: "{{ $value }}" 61 | summary: "oracle {{ $labels.exported_instance }} 有job{{ $labels.JOB }}失败了{{ $value }}次:{{ $labels.WHAT }}" 62 | description: "oracle {{ $labels.exported_instance }} 有job{{ $labels.JOB 
}}失败了{{ $value }}次:{{ $labels.WHAT }}" 63 | 64 | 65 | -------------------------------------------------------------------------------- /rules/pgbouncer.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: pgbouncer 3 | rules: 4 | - alert: "PGbouncerDown" 5 | expr: pgbouncer_up == 0 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "PGbouncer {{ $labels.pool_id }} Down" 12 | description: "PGbouncer {{ $labels.pool_id }} Down" 13 | -------------------------------------------------------------------------------- /rules/postgresql-bigdata.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: postgresql-bigdata 3 | rules: 4 | - alert: "PostgreSQL standby dalay" 5 | expr: pg_standby_dalay_standby_dalay == 0 6 | for: 3m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "PostgreSQL standby 数据延迟超过1分钟" 12 | description: "PostgreSQL standby 数据延迟超过1分钟" 13 | - alert: "PostgreSQL lock wait" 14 | expr: pg_logc_wait_logc_wait_hour == 1 15 | for: 5m 16 | labels: 17 | severity: "严重" 18 | annotations: 19 | value: "{{ $value }}" 20 | summary: "PostgreSQL 锁等待超过1分钟" 21 | description: "PostgreSQL 锁等待超过1分钟" -------------------------------------------------------------------------------- /rules/postgresql.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: postgresql 3 | rules: 4 | - alert: "PostgreSQLDown" 5 | expr: pg_up == 0 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "PostgreSQL Down" 12 | description: "PostgreSQL Down" 13 | -------------------------------------------------------------------------------- /rules/process.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: process 3 | rules: 4 | - alert: 
"namedprocess_scrape_errors" 5 | expr: namedprocess_scrape_errors == 1 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "process拉取失败" 12 | description: "process拉取失败" -------------------------------------------------------------------------------- /rules/rabbitmq.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: rabbitmq 3 | rules: 4 | - alert: "rabbitmq_up" 5 | expr: rabbitmq_up{cluster != ''}==0 6 | for: 5m 7 | labels: 8 | severity: "严重" 9 | annotations: 10 | value: "{{ $value }}" 11 | summary: "RabbitMQ Down" 12 | description: "RabbitMQ Down" 13 | -------------------------------------------------------------------------------- /rules/windows.rules: -------------------------------------------------------------------------------- 1 | groups: 2 | - name: windows 3 | rules: 4 | - alert: "cpu_use" 5 | expr: 100 - (avg by (instance) (irate(wmi_cpu_time_total{mode="idle"}[5m])))*100 > 85 6 | for: 5m 7 | labels: 8 | severity: "警告" 9 | annotations: 10 | value: "{{ $value }}%" 11 | summary: "cpu高" 12 | description: "cpu利用率,5分钟平均值>85%" 13 | 14 | - alert: "disk_use" 15 | expr: 100 - (wmi_logical_disk_free_bytes / wmi_logical_disk_size_bytes)*100 > 95 16 | for: 5m 17 | labels: 18 | severity: "灾难" 19 | annotations: 20 | value: "{{ $value }}%" 21 | summary: "磁盘快满了" 22 | description: "{{ $labels.volume }}磁盘利用率,5分钟持续>95%" 23 | --------------------------------------------------------------------------------