├── Prometheus-Server-Architecture-v2.png
├── Prometheus-Server-Architecture-v3.png
├── Prometheus-Server-Architecture.png
├── References.md
├── mtail
    ├── messages.mtail
    └── mysql-slow.mtail
├── oracle_collectors
    └── oracle.collector.yml
└── rules
    ├── adhocteam-script.rules
    ├── alertmanager.rules
    ├── any.rules
    ├── baas-gateway.rules
    ├── blackbox.rules
    ├── card-server-proxy.rules
    ├── consul.rules
    ├── docker.rules
    ├── jvm.rules
    ├── kafka3node.rules
    ├── linux-bigdata.rules
    ├── linux.rules
    ├── messages.rules
    ├── mysql.rules
    ├── nginx.rules
    ├── oracle-h6.rules
    ├── oracle.rules
    ├── pgbouncer.rules
    ├── postgresql-bigdata.rules
    ├── postgresql.rules
    ├── process.rules
    ├── rabbitmq.rules
    └── windows.rules


/Prometheus-Server-Architecture-v2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanghy8166/prometheus/44e7df52d6b702529219888bb001c188d6a1e948/Prometheus-Server-Architecture-v2.png


--------------------------------------------------------------------------------
/Prometheus-Server-Architecture-v3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanghy8166/prometheus/44e7df52d6b702529219888bb001c188d6a1e948/Prometheus-Server-Architecture-v3.png


--------------------------------------------------------------------------------
/Prometheus-Server-Architecture.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/wanghy8166/prometheus/44e7df52d6b702529219888bb001c188d6a1e948/Prometheus-Server-Architecture.png


--------------------------------------------------------------------------------
/References.md:
--------------------------------------------------------------------------------
 1 | # rules  
 2 | https://awesome-prometheus-alerts.grep.to  
 3 | 
 4 | https://www.prometheus.wang/di-wu-zhang-jing-bao/chang-yong-cha-xun-alert-rules  
 5 | 
 6 | https://github.com/coreos/kube-prometheus/blob/master/manifests/prometheus-rules.yaml  
 7 | 
 8 | https://ranchermanager.docs.rancher.com/zh/integrations-in-rancher/monitoring-and-alerting/promql-expressions  
 9 | 
10 | # dashboards  
11 | https://grafana.com/orgs/wanghy8166/dashboards  
12 | 
13 | https://github.com/percona/grafana-dashboards/tree/master/dashboards  
14 | 
15 | https://github.com/grafana/kubernetes-app  
16 | 
17 | # 云厂商dashboards  
18 | https://github.com/aliyun/aliyun-cms-grafana  
19 | 
20 | https://github.com/TencentCloud/tencentcloud-monitor-grafana-app  
21 | 
22 | https://github.com/monitoringartist/grafana-aws-cloudwatch-dashboards  
23 | 
24 | # 常用exporter  
25 | https://prometheus.io/docs/instrumenting/exporters  
26 | 
27 | https://github.com/prometheus/prometheus/wiki/Default-port-allocations  
28 | 
29 | # 自定义exporter  
30 | - 自己写 exporter 暴露指标  
31 | - 自己写脚本发给 pushgateway 暴露指标  
32 | - 通过 https://github.com/prometheus/client_python 写脚本暴露指标  
33 | - 通过 https://github.com/prometheus/client_python 写脚本发给 pushgateway 暴露指标  
34 | 
35 | - 自己写脚本可以参考以下工具列表  
36 | https://prometheus.io/docs/instrumenting/clientlibs/  
37 | 
38 | - 此外，也有第三方写好的exporter，直接加脚本套用即可  
39 | https://github.com/gree-gorey/bash-exporter    探测:脚本执行输出  
40 | https://github.com/adhocteam/script_exporter   探测:脚本执行耗时、脚本执行是否成功  
41 | https://github.com/ncabatoff/script-exporter   探测:脚本执行输出、脚本执行耗时、脚本是否在执行中、脚本执行次数  
42 | https://github.com/ricoberger/script_exporter  探测:脚本执行输出、脚本执行耗时、脚本执行是否成功  
43 | 
44 | # 在业务中集成自定义Metrics,不需要exporter  
45 | https://cloud.tencent.com/document/product/1416/56030  
46 | http://ylzheng.com/2018/01/24/use-prometheus-monitor-your-spring-boot-application/  
47 | 
48 | # 远端存储方案  
49 | https://prometheus.io/docs/prometheus/latest/storage  
50 | 
51 | https://prometheus.io/docs/operating/integrations/#remote-endpoints-and-storage  
52 | 
53 | # mtail日志分析  
54 | 实时或近实时监视日志，以用于性能测量和告警，没有ELK的时候就用它吧。  
55 | https://github.com/google/mtail  
56 | 1. 在日志更新较快的时候，可以把抓取间隔调小（例如 `scrape_interval: 5s`），让prometheus尽快抓取。
57 | 2. 由于抓取始终有间隔，在高并发场景，可以记录"汇总执行时间"、"汇总执行次数"，再换算得出所要的指标。  
58 | 具体可参考  
59 | https://github.com/google/mtail/blob/master/docs/Programming-Guide.md#storing-intermediate-state  
60 | https://github.com/google/mtail/blob/master/docs/Programming-Guide.md#computing-moving-averages  
61 | 


--------------------------------------------------------------------------------
/mtail/messages.mtail:
--------------------------------------------------------------------------------
 1 | counter terminated  # number of lines containing "terminated"
 2 | counter ailed  # number of lines containing "ailed" (so both "Failed" and "failed" are counted)
 3 | counter killed  # number of lines containing "killed"
 4 | counter lines_total  # number of lines processed
 5 | 
 6 | # To make ex_test.go happy
 7 | strptime("2017-12-07T16:07:14Z", "2006-01-02T15:04:05Z07:00")
 8 | 
 9 | /(.*)/ {  # matches every line; $1 is the whole line
10 |   $1 =~ /terminated/ {
11 |     terminated++
12 |   }
13 |   $1 =~ /ailed/ {
14 |     ailed++
15 |   }
16 |   $1 =~ /killed/ {
17 |     killed++
18 |   }
19 |   lines_total++  # incremented for every line, whether or not a keyword matched
20 | }


--------------------------------------------------------------------------------
/mtail/mysql-slow.mtail:
--------------------------------------------------------------------------------
 1 | # mysql-slow.mtail: extracts the query time (and lock time) from the MySQL slow log
 2 | gauge tmp_query_time  # Query_time of the most recently matched line
 3 | gauge tmp_lock_time  # Lock_time of the most recently matched line
 4 | # Query_time: 3.000143  Lock_time: 0.000000 Rows_sent: 1  Rows_examined: 0
 5 | const QUERY_TIME /^# Query_time: (\d+\.\d+)\s*Lock_time: (\d+\.\d+)/
 6 | 
 7 | // + QUERY_TIME {  # empty pattern concatenated with the QUERY_TIME constant
 8 |   tmp_query_time = $1
 9 |   tmp_lock_time = $2
10 | }
11 | 


--------------------------------------------------------------------------------
/oracle_collectors/oracle.collector.yml:
--------------------------------------------------------------------------------
  1 | # cd /usr/local/database_exporter/config/oracle_collectors
  2 | # cat oracle.collector.yml
  3 | 
  4 | # A collector defining standard metrics for Oracle Database Server.
  5 | #
  6 | #
  7 | 
  8 | collector_name: oracle_metrics
  9 | 
 10 | # Similar to global.min_interval, but applies to the queries defined by this collector only.
 11 | # min_interval: 0s
 12 | 
 13 | metrics:
 14 | # SELECT decode(DATABASE_ROLE, 'PRIMARY', '主库执行sql', '备库执行sql') alias FROM V$DATABASE
 15 | 
 16 | # tablespace_size: percentage of space still available (free + auto-extensible) per tablespace, excluding UNDOTBS%, reported only when less than 20 GB remains. nvl() keeps tablespaces that have no rows in dba_free_space (no free extents); without it the outer join yields NULL and the fullest tablespaces are silently dropped.
 17 |   - metric_name: oracle_tablespace_size
 18 |     type: gauge
 19 |     help: 'Size of oracle db tablespaces.'
 20 |     values:
 21 |       - free_rate
 22 |     key_labels:
 23 |       - tablespace_name
 24 |     query: |
 25 |       SELECT a.tablespace_name "tablespace_name",
 26 |              round((nvl(b.free_mb, 0) + a.enextensible_mb) / a.maxtotal_mb * 100, 2) "free_rate"
 27 |         FROM (SELECT tablespace_name,
 28 |                      SUM(decode(autoextensible, 'YES', maxbytes, bytes)) / 1024 / 1024 maxtotal_mb,
 29 |                      SUM(decode(autoextensible, 'YES', (maxbytes - bytes), 0)) / 1024 / 1024 enextensible_mb
 30 |                 FROM dba_data_files
 31 |                GROUP BY tablespace_name) a,
 32 |              (SELECT tablespace_name, SUM(bytes) / 1024 / 1024 free_mb
 33 |                 FROM dba_free_space
 34 |                GROUP BY tablespace_name) b
 35 |        WHERE a.tablespace_name = b.tablespace_name(+)
 36 |          and a.TABLESPACE_NAME not like 'UNDOTBS%'
 37 |          and (nvl(b.free_mb, 0) + a.enextensible_mb) < 20 * 1024
 38 | 
 39 | # asm_space: free percentage (free_mb / total_mb * 100) of each disk group listed in v$ASM_DISKGROUP, labelled by disk group name
 40 |   - metric_name: oracle_asm_space
 41 |     type: gauge
 42 |     help: 'Size of oracle db asm_space.'
 43 |     values:
 44 |       - free_rate
 45 |     key_labels:
 46 |       - name
 47 |     query: |
 48 |       select name "name", round((free_mb / total_mb) * 100, 2) "free_rate" from v$ASM_DISKGROUP
 49 | 
 50 | # archive: MB of archived logs (sum of blocks * block_size) for this instance's thread with FIRST_TIME in the last 30 minutes; 0 when there are none
 51 |   - metric_name: oracle_archive_size
 52 |     type: gauge
 53 |     help: 'Size of oracle db archive.'
 54 |     values:
 55 |       - value
 56 |     key_labels:
 57 |       - archive_mb
 58 |     query: |
 59 |       select 'archive_mb' "archive_mb",nvl(round(sum(blocks * block_size) / 1024 / 1024), 0) "value"
 60 |         from v$archived_log
 61 |        where thread# = (select thread# from v$instance)
 62 |          and FIRST_TIME >= (sysdate - 30 / 24 / 60) 
 63 | 
 64 | # dbtime_load: on a PRIMARY database, growth of the 'DB time' statistic since the latest snapshot (begun within the last 140 minutes) divided by the minutes elapsed since that snapshot ended, times 100; 0 on any other role or when no value is available. NOTE(review): the / 1000000 / 60 conversion assumes DB time is in microseconds - confirm.
 65 |   - metric_name: oracle_dbtime_load
 66 |     type: gauge
 67 |     help: 'Size of oracle db dbtime_load.'
 68 |     values:
 69 |       - value
 70 |     key_labels:
 71 |       - dbtime_load
 72 |     query: |
 73 |       SELECT 'dbtime_load' "dbtime_load",nvl(decode(DATABASE_ROLE, 'PRIMARY', (
 74 |       select round(((real.VALUE - last.value) / 1000000 / 60) / ((sysdate + 1 / 24 / 60 / 60 - to_date(to_char(end_interval_time, 'yyyymmddHH24MISS'), 'yyyymmddHH24MISS')) * 24 * 60) * 100, 2)  
 75 |               from (select * from (select t.end_interval_time, v.value 
 76 |                               FROM DBA_HIST_SYS_TIME_MODEL v, dba_hist_snapshot t 
 77 |                              WHERE v.STAT_NAME = 'DB time' 
 78 |                                and t.snap_id = v.snap_id 
 79 |                                and t.instance_number = v.instance_number 
 80 |                                and v.instance_number = (select instance_number from v$instance) 
 81 |                                and t.begin_interval_time >= sysdate - 140 / (24 * 60) 
 82 |                              order by t.snap_id desc) 
 83 |                      where rownum = 1) last, v$SYS_TIME_MODEL real 
 84 |              where real.stat_name = 'DB time'
 85 |       ), 0),0) "value" FROM V$DATABASE 
 86 | 
 87 | # sessions: number of rows in v$session with TYPE='USER' and status='ACTIVE'
 88 |   - metric_name: oracle_session_active
 89 |     type: gauge
 90 |     help: 'Size of oracle db session_active.'
 91 |     values:
 92 |       - value
 93 |     key_labels:
 94 |       - session_active
 95 |     query: |
 96 |       select 'session_active' "session_active",count(1) "value" from v$session where TYPE='USER' and status='ACTIVE'
 97 | 
 98 | # dg_archived_log: always 0 when DATABASE_ROLE is PRIMARY; otherwise the number of v$archived_log rows with applied = 'NO'
 99 |   - metric_name: oracle_dg_archived_log
100 |     type: gauge
101 |     help: 'Size of oracle db dg_archived_log.'
102 |     values:
103 |       - value
104 |     key_labels:
105 |       - dg_archived_log
106 |     query: |
107 |       SELECT 'dg_archived_log' "dg_archived_log",decode(DATABASE_ROLE,
108 |                     'PRIMARY',
109 |                     0,
110 |                     (select count(1) from v$archived_log where applied = 'NO')) "value"
111 |         FROM V$DATABASE
112 | 
113 | # checkBitAttack: number of triggers, procedures, objects and jobs whose names match the listed suspicious patterns (0 means none found). The Dba_Procedures pattern must not end in a space, otherwise it only matches names that themselves end in a space.
114 |   - metric_name: oracle_checkBitAttack
115 |     type: gauge
116 |     help: 'Size of oracle db checkBitAttack.'
117 |     values:
118 |       - value
119 |     key_labels:
120 |       - checkBitAttack
121 |     query: |
122 |       Select 'checkBitAttack' "checkBitAttack",Count(1) "value"
123 |         From (Select 1
124 |                 From Dba_Triggers
125 |                Where Trigger_Name Like 'DBMS_%_INTERNAL%'
126 |               Union All
127 |               Select 1
128 |                 From Dba_Procedures
129 |                Where Object_Name Like 'DBMS_%_INTERNAL%'
130 |               Union All
131 |               Select 1
132 |                 From Dba_Objects
133 |                Where Object_Name Like 'DBMS_CORE_INTERNA%'
134 |                   Or Object_Name Like 'DBMS_SYSTEM_INTERNA%'
135 |                   Or Object_Name Like 'DBMS_SUPPORT_INTERNA%'
136 |                   Or Object_Name Like 'DBMS_STANDARD_FUN9%'
137 |                   Or Object_Name Like 'ORACHK%'
138 |               Union All
139 |               Select 1
140 |                 From Dba_Jobs
141 |                Where What Like 'DBMS_STANDARD_FUN9%')
142 | 
143 | # DBA_JOBS: one series per DBA_JOBS row with FAILURES >= 3, labelled by JOB and WHAT, with the failure count as the value
144 |   - metric_name: oracle_dba_jobs
145 |     type: gauge
146 |     help: 'FAILURES DBA_JOBS'
147 |     values:
148 |       - FAILURES
149 |     key_labels:
150 |       - JOB
151 |       - WHAT
152 |     query: |
153 |       SELECT JOB, WHAT, FAILURES FROM DBA_JOBS WHERE FAILURES >= 3
154 | 
155 | 


--------------------------------------------------------------------------------
/rules/adhocteam-script.rules:
--------------------------------------------------------------------------------
 1 | groups:  # fires when script_success stays at 0 for 5 minutes
 2 | - name: 'adhocteam-script'
 3 |   rules:
 4 |   - alert: 'adhocteam-script'
 5 |     expr: script_success == 0
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'adhocteam-script,{{ $labels.script }}脚本执行失败'
 9 |       description: 'adhocteam-script,{{ $labels.script }}脚本执行失败'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 


--------------------------------------------------------------------------------
/rules/alertmanager.rules:
--------------------------------------------------------------------------------
 1 | groups:  # log-keyword alert: fires when the level_error counter has increased
 2 | - name: alertmanager
 3 |   rules:
 4 |   - alert: "level_error"
 5 |     expr: irate(level_error[5m]) > 0
 6 |     for: 10s
 7 |     labels:
 8 |       severity: "警告"
 9 |     annotations:
10 |       value: "{{ $value }}"  # per-second rate from irate(), not a percentage
11 |       summary: "日志匹配到关键字level=error,重启可临时解决,请排查"
12 |       description: "日志匹配到关键字level=error,重启可临时解决,请排查"
13 | 


--------------------------------------------------------------------------------
/rules/any.rules:
--------------------------------------------------------------------------------
 1 | groups:  # target-down alerts based on the up metric, split by exported_job
 2 | - name: 'any'
 3 |   rules:
 4 |   - alert: 'InstanceDown'  # targets whose exported_job does not match "oracle"
 5 |     expr: up{exported_job !~ "oracle"} == 0
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: '{{ $labels.job }}实例Down'
 9 |       description: '{{ $labels.job }}实例Down'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 
14 |   - alert: 'InstanceDown-oracle'  # targets whose exported_job matches "oracle"; reported by exported_instance
15 |     expr: up{exported_job =~ "oracle"} == 0
16 |     for: 5m
17 |     annotations:
18 |       summary: '{{ $labels.exported_instance }}实例Down'
19 |       description: '{{ $labels.exported_instance }}实例Down'
20 |       value: '{{ $value }}'
21 |     labels:
22 |       severity: '严重'


--------------------------------------------------------------------------------
/rules/baas-gateway.rules:
--------------------------------------------------------------------------------
 1 | groups:  # log-keyword alert: fires when the OutOfDirectMemoryError counter has increased
 2 | - name: baas-gateway
 3 |   rules:
 4 |   - alert: "OutOfDirectMemoryError"
 5 |     expr: irate(OutOfDirectMemoryError[5m]) > 0
 6 |     for: 10s
 7 |     labels:
 8 |       severity: "警告"
 9 |     annotations:
10 |       value: "{{ $value }}"  # per-second rate from irate(), not a percentage
11 |       summary: "日志匹配到关键字OutOfDirectMemoryError,请排查"
12 |       description: "日志匹配到关键字OutOfDirectMemoryError,请排查"
13 | 


--------------------------------------------------------------------------------
/rules/blackbox.rules:
--------------------------------------------------------------------------------
 1 | groups:  # alerts on blackbox_exporter config reload failures and failed probes
 2 | - name: 'blackbox'
 3 |   rules:
 4 |   - alert: 'blackbox_exporter_config_last_reload_successful'
 5 |     expr: blackbox_exporter_config_last_reload_successful == 0
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'blackbox_exporter配置载入失败'
 9 |       description: 'blackbox_exporter配置载入失败'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 
14 |   - alert: 'probe_success'  # probe_success has been 0 for 5 minutes
15 |     expr: probe_success == 0
16 |     for: 5m
17 |     annotations:
18 |       summary: '地址访问失败'
19 |       description: '地址访问失败'
20 |       value: '{{ $value }}'
21 |     labels:
22 |       severity: '严重'


--------------------------------------------------------------------------------
/rules/card-server-proxy.rules:
--------------------------------------------------------------------------------
 1 | groups:  # fires when the "success" and "false" counters grow at the same non-zero rate
 2 | - name: card-server-proxy
 3 |   rules:
 4 |   - alert: "success-false"
 5 |     expr: irate(success[5m]) == irate(false[5m]) and irate(false[5m]) > 0
 6 |     for: 10s
 7 |     labels:
 8 |       severity: "警告"
 9 |     annotations:
10 |       value: "{{ $value }}"  # per-second rate from irate(), not a percentage
11 |       summary: "接口服务card-server-proxy告警,日志匹配到关键字success:false,请排查"
12 |       description: "接口服务card-server-proxy告警,日志匹配到关键字success:false,请排查"
13 | 


--------------------------------------------------------------------------------
/rules/consul.rules:
--------------------------------------------------------------------------------
 1 | groups:  # fires when consul_up stays at 0 for 5 minutes
 2 | - name: 'consul'
 3 |   rules:
 4 |   - alert: 'consul_up'
 5 |     expr: consul_up==0
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'consul Down'
 9 |       description: 'consul Down'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 


--------------------------------------------------------------------------------
/rules/docker.rules:
--------------------------------------------------------------------------------
 1 | groups:  # NOTE(review): appears to rely on rate() of container_last_seen dropping once a container is no longer seen - confirm
 2 | - name: 'docker'
 3 |   rules:
 4 |   - alert: 'DockerInstanceDown'
 5 |     expr: rate(container_last_seen{image!=""}[15m])*100 <= 99
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'Docker容器{{ $labels.name }}异常:停了/删除/重建/网络'
 9 |       description: 'Docker容器{{ $labels.name }}异常:停了/删除/重建/网络'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '警告'
13 | 


--------------------------------------------------------------------------------
/rules/jvm.rules:
--------------------------------------------------------------------------------
 1 | groups:  # JVM alerts: scrape errors, heap usage, thread count and GC throughput
 2 | - name: 'jvm'
 3 |   rules:
 4 |   - alert: 'jmx_scrape_error'
 5 |     expr: jmx_scrape_error == 1
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'jmx拉取数据异常'
 9 |       description: 'jmx拉取数据异常'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 
14 |   - alert: 'heap_use'  # heap used/max above 95%, from either the jvm_memory_* or the java_lang_Memory_* metrics
15 |     expr: jvm_memory_bytes_used{area="heap"} / jvm_memory_bytes_max{area="heap"} * 100 > 95 or java_lang_Memory_HeapMemoryUsage_used / java_lang_Memory_HeapMemoryUsage_max * 100 > 95
16 |     for: 5m
17 |     annotations:
18 |       summary: '堆内存利用率,5分钟持续>95%'
19 |       description: '堆内存利用率,5分钟持续>95%'
20 |       value: '{{ $value }}%'
21 |     labels:
22 |       severity: '警告'
23 | 
24 |   - alert: 'ThreadCount'
25 |     expr: java_lang_Threading_ThreadCount>500
26 |     for: 5m
27 |     annotations:
28 |       summary: 'ThreadCount>500'
29 |       description: 'ThreadCount>500'
30 |       value: '{{ $value }}'
31 |     labels:
32 |       severity: '严重'
33 | 
34 |   - alert: 'GC-Throughput'  # (uptime - ParNew time - ConcurrentMarkSweep time) / uptime below 0.96
35 |     expr: ( java_lang_Runtime_Uptime - ignoring(name) java_lang_GarbageCollector_CollectionTime{name=~"ParNew"} - ignoring(name) java_lang_GarbageCollector_CollectionTime{name=~"ConcurrentMarkSweep"} )/java_lang_Runtime_Uptime < 0.96
36 |     for: 5m
37 |     annotations:
38 |       summary: 'GC总吞吐率<96%'
39 |       description: 'GC总吞吐率<96%'
40 |       value: '{{ $value }}'
41 |     labels:
42 |       severity: '严重'
43 | 


--------------------------------------------------------------------------------
/rules/kafka3node.rules:
--------------------------------------------------------------------------------
 1 | groups:  # fires when kafka_brokers stays below 3 for 5 minutes
 2 | - name: 'kafka'
 3 |   rules:
 4 |   - alert: 'kafka_brokers'
 5 |     expr: kafka_brokers < 3
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'kafka_brokers < 3'
 9 |       description: 'kafka_brokers < 3'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 


--------------------------------------------------------------------------------
/rules/linux-bigdata.rules:
--------------------------------------------------------------------------------
 1 | groups:  # host alerts on node_* metrics: CPU, iowait, disk and inode usage
 2 | - name: 'linux'
 3 |   rules:
 4 |   - alert: 'cpu_use'  # 100 minus the per-host average idle percentage
 5 |     expr: 100 - (avg by (instance,server,project) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
 6 |     for: 30m
 7 |     annotations:
 8 |       summary: 'cpu高'
 9 |       description: 'cpu利用率,30分钟平均值>85%'
10 |       value: '{{ $value }}%'
11 |     labels:
12 |       severity: '警告'
13 | 
14 |   - alert: 'cpu_iowait_use'
15 |     expr: avg by (instance,server,project) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 35
16 |     for: 30m
17 |     annotations:
18 |       summary: 'cpu_iowait高'
19 |       description: 'cpu_iowait利用率,30分钟平均值>35%'
20 |       value: '{{ $value }}%'
21 |     labels:
22 |       severity: '警告'
23 | 
24 |   - alert: 'disk_use'  # ext4/xfs filesystems only
25 |     expr: 100 - (node_filesystem_avail_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})*100 > 80
26 |     for: 30m
27 |     annotations:
28 |       summary: '磁盘快满了'
29 |       description: '{{ $labels.mountpoint }}磁盘利用率,30分钟持续>80%'
30 |       value: '{{ $value }}%'
31 |     labels:
32 |       severity: '灾难'
33 | 
34 |   - alert: 'disk_inode_use'  # ext4/xfs filesystems only
35 |     expr: 100 - (node_filesystem_files_free{fstype=~"ext4|xfs"} / node_filesystem_files{fstype=~"ext4|xfs"})*100 > 80
36 |     for: 10m
37 |     annotations:
38 |       summary: '磁盘inode快满了'
39 |       description: '{{ $labels.mountpoint }}磁盘inode利用率,10分钟持续>80%'
40 |       value: '{{ $value }}%'
41 |     labels:
42 |       severity: '灾难'
43 | 


--------------------------------------------------------------------------------
/rules/linux.rules:
--------------------------------------------------------------------------------
 1 | groups:  # host alerts on node_* metrics: CPU, iowait, disk, inode and memory usage
 2 | - name: 'linux'
 3 |   rules:
 4 |   - alert: 'cpu_use'  # 100 minus the per-host average idle percentage
 5 |     expr: 100 - (avg by (instance,server,project) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'cpu高'
 9 |       description: 'cpu利用率,5分钟平均值>85%'
10 |       value: '{{ $value }}%'
11 |     labels:
12 |       severity: '警告'
13 | 
14 |   - alert: 'cpu_iowait_use'
15 |     expr: avg by (instance,server,project) (irate(node_cpu_seconds_total{mode="iowait"}[5m])) * 100 > 35
16 |     for: 5m
17 |     annotations:
18 |       summary: 'cpu_iowait高'
19 |       description: 'cpu_iowait利用率,5分钟平均值>35%'
20 |       value: '{{ $value }}%'
21 |     labels:
22 |       severity: '警告'
23 | 
24 |   - alert: 'disk_use'  # ext4/xfs filesystems only
25 |     expr: 100 - (node_filesystem_avail_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})*100 > 95
26 |     for: 5m
27 |     annotations:
28 |       summary: '磁盘快满了'
29 |       description: '{{ $labels.mountpoint }}磁盘利用率,5分钟持续>95%'
30 |       value: '{{ $value }}%'
31 |     labels:
32 |       severity: '灾难'
33 | 
34 |   - alert: 'disk_inode_use'  # ext4/xfs filesystems only
35 |     expr: 100 - (node_filesystem_files_free{fstype=~"ext4|xfs"} / node_filesystem_files{fstype=~"ext4|xfs"})*100 > 95
36 |     for: 5m
37 |     annotations:
38 |       summary: '磁盘inode快满了'
39 |       description: '{{ $labels.mountpoint }}磁盘inode利用率,5分钟持续>95%'
40 |       value: '{{ $value }}%'
41 |     labels:
42 |       severity: '灾难'
43 | 
44 |   - alert: 'mem_use'  # 100 minus MemAvailable as a percentage of MemTotal
45 |     expr: 100-(node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes)*100>95
46 |     for: 5m
47 |     annotations:
48 |       summary: '内存利用率,5分钟持续>95%'
49 |       description: '内存利用率,5分钟持续>95%'
50 |       value: '{{ $value }}%'
51 |     labels:
52 |       severity: '灾难'
53 | 
54 | 


--------------------------------------------------------------------------------
/rules/messages.rules:
--------------------------------------------------------------------------------
 1 | groups:  # log-keyword alert: fires when the killed counter has increased
 2 | - name: messages
 3 |   rules:
 4 |   - alert: "messages-killed"
 5 |     expr: irate(killed[5m]) > 0
 6 |     for: 10s
 7 |     labels:
 8 |       severity: "警告"
 9 |     annotations:
10 |       value: "{{ $value }}"  # per-second rate from irate(), not a percentage
11 |       summary: "有进程(pg)被杀了"
12 |       description: "有进程(pg)被杀了"


--------------------------------------------------------------------------------
/rules/mysql.rules:
--------------------------------------------------------------------------------
 1 | groups:  # MySQL availability and replication alerts
 2 | - name: 'mysql'
 3 |   rules:
 4 |   - alert: 'MySQLDown'
 5 |     expr: mysql_up == 0
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'MySQL Down'
 9 |       description: 'MySQL Down'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 
14 |   - alert: 'slave_io_running'
15 |     expr: mysql_slave_status_slave_io_running == 0
16 |     for: 5m
17 |     annotations:
18 |       summary: 'MySQL slave_io_running Down'
19 |       description: 'MySQL slave_io_running Down'
20 |       value: '{{ $value }}'
21 |     labels:
22 |       severity: '严重'
23 | 
24 |   - alert: 'slave_sql_running'
25 |     expr: mysql_slave_status_slave_sql_running == 0
26 |     for: 5m
27 |     annotations:
28 |       summary: 'MySQL slave_sql_running Down'
29 |       description: 'MySQL slave_sql_running Down'
30 |       value: '{{ $value }}'
31 |     labels:
32 |       severity: '严重'
33 | 
34 |   - alert: 'seconds_behind_master'  # 1800 seconds = 30 minutes
35 |     expr: mysql_slave_status_seconds_behind_master > 1800
36 |     for: 5m
37 |     annotations:
38 |       summary: 'MySQL从库复制延迟>30分钟'
39 |       description: 'MySQL从库复制延迟>30分钟'
40 |       value: '{{ $value }}'
41 |     labels:
42 |       severity: '警告'
43 | 


--------------------------------------------------------------------------------
/rules/nginx.rules:
--------------------------------------------------------------------------------
 1 | groups:  # fires when nginx_up stays at 0 for 5 minutes
 2 | - name: 'nginx'
 3 |   rules:
 4 |   - alert: 'nginxDown'
 5 |     expr: nginx_up == 0
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'nginx Down'
 9 |       description: 'nginx Down'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 


--------------------------------------------------------------------------------
/rules/oracle-h6.rules:
--------------------------------------------------------------------------------
 1 | groups:  # business alerts on the HD_MONITORLOG, BUY1POOLS and V_CUSTOM_LOG metrics
 2 | - name: oracle-h6
 3 |   rules:
 4 |   - alert: "HD_MONITORLOG"
 5 |     expr: HD_MONITORLOG == 1
 6 |     for: 5m
 7 |     labels:
 8 |       severity: "警告"
 9 |     annotations:
10 |       value: "{{ $value }}"  # flag value (1), not a percentage
11 |       summary: "oracle业务监控异常"
12 |       description: "oracle业务监控异常: {{ $labels.CONTENT }} "
13 | 
14 |   - alert: "BUY1POOLS"
15 |     expr: BUY1POOLS > 10000
16 |     for: 5m
17 |     labels:
18 |       severity: "警告"
19 |     annotations:
20 |       value: "{{ $value }}"  # raw metric value compared against 10000, not a percentage
21 |       summary: "BUY1POOLS > 1w"
22 |       description: "BUY1POOLS > 1w"
23 | 
24 |   - alert: "V_CUSTOM_LOG"
25 |     expr: V_CUSTOM_LOG == 1
26 |     for: 5m
27 |     labels:
28 |       severity: "警告"
29 |     annotations:
30 |       value: "{{ $value }}"  # flag value (1), not a percentage
31 |       summary: "自定义告警"
32 |       description: "自定义告警: {{ $labels.CONTENT }} "
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/rules/oracle.rules:
--------------------------------------------------------------------------------
 1 | groups:  # alerts on the oracle_* metrics (tablespace, ASM, DB time load, Data Guard lag, ransomware check, failed jobs)
 2 | - name: oracle
 3 |   rules:
 4 |   - alert: "oracle_tablespace_size"
 5 |     expr: oracle_tablespace_size{exported_instance=~".*",tablespace_name=~".*"} < 10
 6 |     for: 5m
 7 |     labels:
 8 |       severity: "严重"
 9 |     annotations:
10 |       value: "{{ $value }}%"
11 |       summary: "oracle表空间不足"
12 |       description: "oracle {{ $labels.exported_instance }} 表空间{{ $labels.tablespace_name }} < 10% 且 < 20G"
13 | 
14 |   - alert: "oracle_asm_space"
15 |     expr: oracle_asm_space{exported_instance=~".*",name=~".*"} < 10
16 |     for: 5m
17 |     labels:
18 |       severity: "严重"
19 |     annotations:
20 |       value: "{{ $value }}%"
21 |       summary: "oracle-asm磁盘空间不足"
22 |       description: "oracle {{ $labels.exported_instance }} asm磁盘{{ $labels.name }} < 10%"
23 | 
24 |   - alert: "oracle_dbtime_load"
25 |     expr: oracle_dbtime_load{exported_instance=~".*"} > 2000
26 |     for: 5m
27 |     labels:
28 |       severity: "警告"
29 |     annotations:
30 |       value: "{{ $value }}%"
31 |       summary: "oracle_dbtime_load负载高"
32 |       description: "oracle {{ $labels.exported_instance }} dbtime_load负载 > 2000%"
33 | 
34 |   - alert: "oracle_dg_archived_log"
35 |     expr: oracle_dg_archived_log{exported_instance=~".*"} > 2
36 |     for: 30m
37 |     labels:
38 |       severity: "警告"
39 |     annotations:
40 |       value: "{{ $value }}"  # count compared against 2, not a percentage
41 |       summary: "ADG备库同步有延迟"
42 |       description: "oracle {{ $labels.exported_instance }} ADG备库同步有延迟,dg_archived_log > 2"
43 | 
44 |   - alert: "oracle_checkBitAttack"
45 |     expr: oracle_checkBitAttack{exported_instance=~".*"} > 0
46 |     for: 5m
47 |     labels:
48 |       severity: "灾难"
49 |     annotations:
50 |       value: "{{ $value }}"  # count compared against 0, not a percentage
51 |       summary: "Oracle中有勒索脚本"
52 |       description: "oracle {{ $labels.exported_instance }} 有勒索脚本,checkBitAttack > 0"
53 | 
54 |   - alert: "oracle_dba_jobs"
55 |     expr: oracle_dba_jobs{exported_instance=~".*"} >= 3
56 |     for: 5m
57 |     labels:
58 |       severity: "严重"
59 |     annotations:
60 |       value: "{{ $value }}"
61 |       summary: "oracle {{ $labels.exported_instance }} 有job{{ $labels.JOB }}失败了{{ $value }}次:{{ $labels.WHAT }}"
62 |       description: "oracle {{ $labels.exported_instance }} 有job{{ $labels.JOB }}失败了{{ $value }}次:{{ $labels.WHAT }}"
63 | 
64 | 
65 | 


--------------------------------------------------------------------------------
/rules/pgbouncer.rules:
--------------------------------------------------------------------------------
 1 | groups:  # fires when pgbouncer_up stays at 0 for 5 minutes
 2 | - name: 'pgbouncer'
 3 |   rules:
 4 |   - alert: 'PGbouncerDown'
 5 |     expr: pgbouncer_up == 0
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'PGbouncer {{ $labels.pool_id }} Down'
 9 |       description: 'PGbouncer {{ $labels.pool_id }} Down'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 


--------------------------------------------------------------------------------
/rules/postgresql-bigdata.rules:
--------------------------------------------------------------------------------
 1 | groups:  # NOTE(review): the "dalay"/"logc" spellings are kept as-is because they are the names the expressions query - confirm against the exporter queries
 2 | - name: 'postgresql-bigdata'
 3 |   rules:
 4 |   - alert: 'PostgreSQL standby dalay'  # NOTE(review): fires on == 0 while the summary describes a delay over 1 minute - confirm the metric's meaning
 5 |     expr: pg_standby_dalay_standby_dalay == 0
 6 |     for: 3m
 7 |     annotations:
 8 |       summary: 'PostgreSQL standby 数据延迟超过1分钟'
 9 |       description: 'PostgreSQL standby 数据延迟超过1分钟'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 |   - alert: 'PostgreSQL lock wait'
14 |     expr: pg_logc_wait_logc_wait_hour == 1
15 |     for: 5m
16 |     annotations:
17 |       summary: 'PostgreSQL 锁等待超过1分钟'
18 |       description: 'PostgreSQL 锁等待超过1分钟'
19 |       value: '{{ $value }}'
20 |     labels:
21 |       severity: '严重'


--------------------------------------------------------------------------------
/rules/postgresql.rules:
--------------------------------------------------------------------------------
 1 | groups:  # fires when pg_up stays at 0 for 5 minutes
 2 | - name: 'postgresql'
 3 |   rules:
 4 |   - alert: 'PostgreSQLDown'
 5 |     expr: pg_up == 0
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'PostgreSQL Down'
 9 |       description: 'PostgreSQL Down'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 


--------------------------------------------------------------------------------
/rules/process.rules:
--------------------------------------------------------------------------------
 1 | groups:  # fires when namedprocess_scrape_errors stays at 1 for 5 minutes
 2 | - name: 'process'
 3 |   rules:
 4 |   - alert: 'namedprocess_scrape_errors'
 5 |     expr: namedprocess_scrape_errors == 1
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'process拉取失败'
 9 |       description: 'process拉取失败'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'


--------------------------------------------------------------------------------
/rules/rabbitmq.rules:
--------------------------------------------------------------------------------
 1 | groups:  # fires when rabbitmq_up (series with a non-empty cluster label) stays at 0 for 5 minutes
 2 | - name: 'rabbitmq'
 3 |   rules:
 4 |   - alert: 'rabbitmq_up'
 5 |     expr: rabbitmq_up{cluster != ''}==0
 6 |     for: 5m
 7 |     annotations:
 8 |       summary: 'RabbitMQ Down'
 9 |       description: 'RabbitMQ Down'
10 |       value: '{{ $value }}'
11 |     labels:
12 |       severity: '严重'
13 | 


--------------------------------------------------------------------------------
/rules/windows.rules:
--------------------------------------------------------------------------------
 1 | groups:  # Windows host alerts on wmi_* metrics: CPU and logical disk usage
 2 | - name: windows
 3 |   rules:
 4 |   - alert: "cpu_use"  # averaged per host (same grouping as linux.rules); a bare avg() would merge all hosts into one series without an instance label
 5 |     expr: 100 - (avg by (instance,server,project) (irate(wmi_cpu_time_total{mode="idle"}[5m])))*100 > 85
 6 |     for: 5m
 7 |     labels:
 8 |       severity: "警告"
 9 |     annotations:
10 |       value: "{{ $value }}%"
11 |       summary: "cpu高"
12 |       description: "cpu利用率,5分钟平均值>85%"
13 | 
14 |   - alert: "disk_use"  # used percentage of each logical disk
15 |     expr: 100 - (wmi_logical_disk_free_bytes / wmi_logical_disk_size_bytes)*100 > 95
16 |     for: 5m
17 |     labels:
18 |       severity: "灾难"
19 |     annotations:
20 |       value: "{{ $value }}%"
21 |       summary: "磁盘快满了"
22 |       description: "{{ $labels.volume }}磁盘利用率,5分钟持续>95%"
23 | 


--------------------------------------------------------------------------------